From 6c398387a3f8cb203b591ba479a6bf4bf99a2a87 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Fri, 8 Aug 2014 17:12:51 -0700 Subject: [PATCH 01/15] step 1 --- .../kafka/server/DelayedOperationPurgatory.scala | 285 ++++++++++++++++++ .../server/FetchDelayedOperationPurgatory.scala | 69 +++++ .../scala/kafka/server/FetchRequestPurgatory.scala | 69 ----- core/src/main/scala/kafka/server/KafkaApis.scala | 4 +- .../server/ProducerDelayedOperationPurgatory.scala | 69 +++++ .../kafka/server/ProducerRequestPurgatory.scala | 69 ----- .../main/scala/kafka/server/ReplicaManager.scala | 8 +- .../main/scala/kafka/server/RequestPurgatory.scala | 317 --------------------- core/src/main/scala/kafka/utils/DelayedItem.scala | 6 +- .../server/DelayedOperationPurgatoryTest.scala | 94 ++++++ .../unit/kafka/server/RequestPurgatoryTest.scala | 94 ------ 11 files changed, 526 insertions(+), 558 deletions(-) create mode 100644 core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala create mode 100644 core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala delete mode 100644 core/src/main/scala/kafka/server/FetchRequestPurgatory.scala create mode 100644 core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala delete mode 100644 core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala delete mode 100644 core/src/main/scala/kafka/server/RequestPurgatory.scala create mode 100644 core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala delete mode 100644 core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala diff --git a/core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala b/core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala new file mode 100644 index 0000000..3998b6d --- /dev/null +++ b/core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala @@ -0,0 +1,285 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.server + +import kafka.network._ +import kafka.utils._ +import kafka.metrics.KafkaMetricsGroup + +import java.util +import java.util.concurrent._ +import java.util.concurrent.atomic._ +import scala.collection._ + +import com.yammer.metrics.core.Gauge + + +/** + * An operation whose processing needs to be delayed for at most the given delayMs; + * upon complete, the given callback function will be triggered. For example a delayed + * message append operation could be waiting for specified number of acks; or a delayed + * message fetch operation could be waiting for a given number of bytes to accumulate. 
+ */ +abstract class DelayedOperation(delayMs: Long, onComplete: Boolean => Unit) extends DelayedItem(delayMs) { + val completed = new AtomicBoolean(false) + + /* + * Check if the delayed operation is already completed + * + * Note that concurrent threads can check if an operation can be completed or not, + * but only the first thread will succeed in completing the operation + */ + def tryComplete(): Boolean = completed.compareAndSet(false, true) + + /* + * When delayMs has elapsed, expire the delayed operation + */ + def onExpired() = onComplete(false) +} + +/** + * A helper purgatory class for bookkeeping delayed operations with a timeout, and expiring timed out operations. + * + */ +abstract class DelayedOperationPurgatory[T <: DelayedOperation](brokerId: Int = 0, purgeInterval: Int = 1000) + extends Logging with KafkaMetricsGroup { + + /* a list of requests watching each key */ + private val watchersForKey = new Pool[Any, Watchers](Some((key: Any) => new Watchers)) + + /* the number of requests being watched, duplicates added on different watchers are also counted */ + private val watched = new AtomicInteger(0) + + /* background thread expiring requests that have been waiting too long */ + private val expirationReaper = new ExpiredOperationReaper + + newGauge( + "PurgatorySize", + new Gauge[Int] { + def value = size + } + ) + + newGauge( + "NumDelayedOperations", + new Gauge[Int] { + def value = expirationReaper.numOperations + } + ) + + expirationThread.start() + + /** + * Check if the operation can be completed, if not watch it based on the given watch keys + * + * Note that a delayed operation can be watched on multiple keys, and hence due to concurrency may be + * found completed when trying to watch it on some later keys. In this case the operation is still + * treated as completed and hence no longer watched although it is still in the watch lists of + * the earlier keys. Those already watched elements will be later purged by the expire reaper. + * + * @param operation the delayed operation to be checked + * @param watchKeys keys for bookkeeping the operation + * @return true iff the delayed operations can be completed + */ + def tryCompleteElseWatch(operation: DelayedOperation, watchKeys: Seq[Any]): Boolean = { + for(key <- watchKeys) { + val watchers = watchersFor(key) + // if the operation is found completed, stop adding it to any further + // lists and return true immediately + if(!watchers.checkAndMaybeAdd(operation)) { + return true + } + } + + // if it is indeed watched, add to the expire queue also + watched.getAndIncrement() + expirationReaper.enqueue(operation) + + false + + } + + /** + * Return a list of completed operations with the given watch key. + */ + def getCompleted(key: Any): Seq[T] = { + val watchers = watchersForKey.get(key) + if(watchers == null) + Seq.empty + else + watchers.collectCompletedOperations() + } + + /* + * Return the watch list of the given key + */ + private def watchersFor(key: Any) = watchersForKey.getAndMaybePut(key) + + /* + * Return the size of the purgatory, which is size of watch lists plus the size of the expire reaper. 
+ * Since an operation may still be in the watch lists even when it has been completed, this number + * may be larger than the number of real operations watched + */ + protected def size() = watchersForKey.values.map(_.numRequests).sum + expirationReaper.numOperations + + /** + * Shutdown the expire reaper thread + */ + def shutdown() { + expirationReaper.shutdown() + } + + /** + * A linked list of watched delayed operations based on some key + */ + private class Watchers { + private val requests = new util.ArrayList[T] + + // potentially add the element to watch if it is not satisfied yet + def checkAndMaybeAdd(t: T): Boolean = { + synchronized { + // if it is already satisfied, return false + if (t.completed.get()) + return false + // if the operation can be completed, return false; otherwise add to watch list + if(t.tryComplete()) { + return false + } else { + requests.add(t) + return true + } + } + } + + // traverse the list and purge satisfied elements + def purgeSatisfied(): Int = { + synchronized { + val iter = requests.iterator() + var purged = 0 + while (iter.hasNext) { + val curr = iter.next + if(curr.completed.get()) { + iter.remove() + purged += 1 + } + } + purged + } + } + + // traverse the list and try to satisfy watched elements + def collectCompletedOperations(): Seq[T] = { + val response = new mutable.ArrayBuffer[T] + synchronized { + val iter = requests.iterator() + while(iter.hasNext) { + val curr = iter.next + if (curr.completed.get()) { + // another thread has completed this request, just remove it + iter.remove() + } else { + val completed = curr.tryComplete() + if(completed) { + iter.remove() + watched.getAndDecrement() + response += curr + expirationReaper.satisfyRequest() + } + } + } + } + response + } + } + + /** + * A background reaper to expire delayed operations that have timed out + */ + private class ExpiredOperationReaper extends ShutdownableThread( + "ExpirationReaper-%d".format(brokerId), + false) { + + /* The queue storing all delayed operations */ + private val delayed = new DelayQueue[T] + + /* + * Return the number of delayed operations kept by the reaper + */ + def numOperations = delayed.size() + + /* + * Add a operation to be expired + */ + def enqueue(t: T) { + delayed.add(t) + unsatisfied.incrementAndGet() + } + + /** + * Get the next expired event + */ + private def pollExpired(): T = { + while (true) { + val curr = delayed.poll(200L, TimeUnit.MILLISECONDS) + if (curr == null) + return null.asInstanceOf[T] + // try set the operation failed (and hence completed), if succeed return it; + // otherwise try to get the next expired operation since this one has been completed by others + if (curr.completed.compareAndSet(false, true)) { + return curr + } + } + throw new RuntimeException("This should not happen") + } + + /** + * Delete all satisfied events from the delay queue and the watcher lists + */ + private def purgeSatisfied(): Int = { + var purged = 0 + + // purge the delayed queue + val iter = delayed.iterator() + while (iter.hasNext) { + val curr = iter.next() + if (curr.completed.get()) { + iter.remove() + purged += 1 + } + } + + purged + } + + + override def doWork() { + val curr = pollExpired() + if (curr != null) { + curr.onExpired() + } + if (size >= purgeInterval) { // see if we need to force a full purge + debug("Beginning purgatory purge") + val purged = purgeSatisfied() + debug("Purged %d operations from delay queue.".format(purged)) + val numPurgedFromWatchers = watchersForKey.values.map(_.purgeSatisfied()).sum + debug("Purged %d 
operations from watch lists.".format(numPurgedFromWatchers)) + } + } + } + +} diff --git a/core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala b/core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala new file mode 100644 index 0000000..71c5920 --- /dev/null +++ b/core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.server + +import kafka.metrics.KafkaMetricsGroup +import kafka.network.RequestChannel +import kafka.api.FetchResponseSend + +import java.util.concurrent.TimeUnit + +/** + * The purgatory holding delayed fetch requests + */ +class FetchDelayedOperationPurgatory(replicaManager: ReplicaManager, requestChannel: RequestChannel) + extends DelayedOperationPurgatory[DelayedFetch](replicaManager.config.brokerId, replicaManager.config.fetchPurgatoryPurgeIntervalRequests) { + this.logIdent = "[FetchRequestPurgatory-%d] ".format(replicaManager.config.brokerId) + + private class DelayedFetchRequestMetrics(forFollower: Boolean) extends KafkaMetricsGroup { + private val metricPrefix = if (forFollower) "Follower" else "Consumer" + + val expiredRequestMeter = newMeter(metricPrefix + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) + } + + private val aggregateFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = true) + private val aggregateNonFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = false) + + private def recordDelayedFetchExpired(forFollower: Boolean) { + val metrics = if (forFollower) aggregateFollowerFetchRequestMetrics + else aggregateNonFollowerFetchRequestMetrics + + metrics.expiredRequestMeter.mark() + } + + /** + * Check if a specified delayed fetch request is satisfied + */ + def checkSatisfied(delayedFetch: DelayedFetch): Boolean = delayedFetch.isSatisfied(replicaManager) + + /** + * When a delayed fetch request expires just answer it with whatever data is present + */ + def expire(delayedFetch: DelayedFetch) { + debug("Expiring fetch request %s.".format(delayedFetch.fetch)) + val fromFollower = delayedFetch.fetch.isFromFollower + recordDelayedFetchExpired(fromFollower) + respond(delayedFetch) + } + + // TODO: purgatory should not be responsible for sending back the responses + def respond(delayedFetch: DelayedFetch) { + val response = delayedFetch.respond(replicaManager) + requestChannel.sendResponse(new RequestChannel.Response(delayedFetch.request, new FetchResponseSend(response))) + } +} \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala b/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala deleted file mode 100644 index ed13188..0000000 --- 
a/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import kafka.metrics.KafkaMetricsGroup -import kafka.network.RequestChannel -import kafka.api.FetchResponseSend - -import java.util.concurrent.TimeUnit - -/** - * The purgatory holding delayed fetch requests - */ -class FetchRequestPurgatory(replicaManager: ReplicaManager, requestChannel: RequestChannel) - extends RequestPurgatory[DelayedFetch](replicaManager.config.brokerId, replicaManager.config.fetchPurgatoryPurgeIntervalRequests) { - this.logIdent = "[FetchRequestPurgatory-%d] ".format(replicaManager.config.brokerId) - - private class DelayedFetchRequestMetrics(forFollower: Boolean) extends KafkaMetricsGroup { - private val metricPrefix = if (forFollower) "Follower" else "Consumer" - - val expiredRequestMeter = newMeter(metricPrefix + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) - } - - private val aggregateFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = true) - private val aggregateNonFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = false) - - private def recordDelayedFetchExpired(forFollower: Boolean) { - val metrics = if (forFollower) aggregateFollowerFetchRequestMetrics - else aggregateNonFollowerFetchRequestMetrics - - metrics.expiredRequestMeter.mark() - } - - /** - * Check if a specified delayed fetch request is satisfied - */ - def checkSatisfied(delayedFetch: DelayedFetch): Boolean = delayedFetch.isSatisfied(replicaManager) - - /** - * When a delayed fetch request expires just answer it with whatever data is present - */ - def expire(delayedFetch: DelayedFetch) { - debug("Expiring fetch request %s.".format(delayedFetch.fetch)) - val fromFollower = delayedFetch.fetch.isFromFollower - recordDelayedFetchExpired(fromFollower) - respond(delayedFetch) - } - - // TODO: purgatory should not be responsible for sending back the responses - def respond(delayedFetch: DelayedFetch) { - val response = delayedFetch.respond(replicaManager) - requestChannel.sendResponse(new RequestChannel.Response(delayedFetch.request, new FetchResponseSend(response))) - } -} \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index c584b55..ef64207 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -42,8 +42,8 @@ class KafkaApis(val requestChannel: RequestChannel, val config: KafkaConfig, val controller: KafkaController) extends Logging { - val producerRequestPurgatory = new ProducerRequestPurgatory(replicaManager, offsetManager, requestChannel) - val fetchRequestPurgatory = new 
FetchRequestPurgatory(replicaManager, requestChannel) + val producerRequestPurgatory = new ProducerDelayedOperationPurgatory(replicaManager, offsetManager, requestChannel) + val fetchRequestPurgatory = new FetchDelayedOperationPurgatory(replicaManager, requestChannel) // TODO: the following line will be removed in 0.9 replicaManager.initWithRequestPurgatory(producerRequestPurgatory, fetchRequestPurgatory) var metadataCache = new MetadataCache diff --git a/core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala b/core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala new file mode 100644 index 0000000..4b950e1 --- /dev/null +++ b/core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.server + +import kafka.metrics.KafkaMetricsGroup +import kafka.utils.Pool +import kafka.network.{BoundedByteBufferSend, RequestChannel} + +import java.util.concurrent.TimeUnit + +/** + * The purgatory holding delayed producer requests + */ +class ProducerDelayedOperationPurgatory(replicaManager: ReplicaManager, offsetManager: OffsetManager, requestChannel: RequestChannel) + extends DelayedOperationPurgatory[DelayedProduce](replicaManager.config.brokerId, replicaManager.config.producerPurgatoryPurgeIntervalRequests) { + this.logIdent = "[ProducerRequestPurgatory-%d] ".format(replicaManager.config.brokerId) + + private class DelayedProducerRequestMetrics(keyLabel: String = DelayedRequestKey.globalLabel) extends KafkaMetricsGroup { + val expiredRequestMeter = newMeter(keyLabel + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) + } + + private val producerRequestMetricsForKey = { + val valueFactory = (k: DelayedRequestKey) => new DelayedProducerRequestMetrics(k.keyLabel + "-") + new Pool[DelayedRequestKey, DelayedProducerRequestMetrics](Some(valueFactory)) + } + + private val aggregateProduceRequestMetrics = new DelayedProducerRequestMetrics + + private def recordDelayedProducerKeyExpired(key: DelayedRequestKey) { + val keyMetrics = producerRequestMetricsForKey.getAndMaybePut(key) + List(keyMetrics, aggregateProduceRequestMetrics).foreach(_.expiredRequestMeter.mark()) + } + + /** + * Check if a specified delayed fetch request is satisfied + */ + def checkSatisfied(delayedProduce: DelayedProduce) = delayedProduce.isSatisfied(replicaManager) + + /** + * When a delayed produce request expires answer it with possible time out error codes + */ + def expire(delayedProduce: DelayedProduce) { + debug("Expiring produce request %s.".format(delayedProduce.produce)) + for ((topicPartition, responseStatus) <- delayedProduce.partitionStatus if responseStatus.acksPending) + recordDelayedProducerKeyExpired(new 
TopicPartitionRequestKey(topicPartition)) + respond(delayedProduce) + } + + // TODO: purgatory should not be responsible for sending back the responses + def respond(delayedProduce: DelayedProduce) { + val response = delayedProduce.respond(offsetManager) + requestChannel.sendResponse(new RequestChannel.Response(delayedProduce.request, new BoundedByteBufferSend(response))) + } +} diff --git a/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala b/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala deleted file mode 100644 index d4a7d4a..0000000 --- a/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import kafka.metrics.KafkaMetricsGroup -import kafka.utils.Pool -import kafka.network.{BoundedByteBufferSend, RequestChannel} - -import java.util.concurrent.TimeUnit - -/** - * The purgatory holding delayed producer requests - */ -class ProducerRequestPurgatory(replicaManager: ReplicaManager, offsetManager: OffsetManager, requestChannel: RequestChannel) - extends RequestPurgatory[DelayedProduce](replicaManager.config.brokerId, replicaManager.config.producerPurgatoryPurgeIntervalRequests) { - this.logIdent = "[ProducerRequestPurgatory-%d] ".format(replicaManager.config.brokerId) - - private class DelayedProducerRequestMetrics(keyLabel: String = DelayedRequestKey.globalLabel) extends KafkaMetricsGroup { - val expiredRequestMeter = newMeter(keyLabel + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) - } - - private val producerRequestMetricsForKey = { - val valueFactory = (k: DelayedRequestKey) => new DelayedProducerRequestMetrics(k.keyLabel + "-") - new Pool[DelayedRequestKey, DelayedProducerRequestMetrics](Some(valueFactory)) - } - - private val aggregateProduceRequestMetrics = new DelayedProducerRequestMetrics - - private def recordDelayedProducerKeyExpired(key: DelayedRequestKey) { - val keyMetrics = producerRequestMetricsForKey.getAndMaybePut(key) - List(keyMetrics, aggregateProduceRequestMetrics).foreach(_.expiredRequestMeter.mark()) - } - - /** - * Check if a specified delayed fetch request is satisfied - */ - def checkSatisfied(delayedProduce: DelayedProduce) = delayedProduce.isSatisfied(replicaManager) - - /** - * When a delayed produce request expires answer it with possible time out error codes - */ - def expire(delayedProduce: DelayedProduce) { - debug("Expiring produce request %s.".format(delayedProduce.produce)) - for ((topicPartition, responseStatus) <- delayedProduce.partitionStatus if responseStatus.acksPending) - recordDelayedProducerKeyExpired(new TopicPartitionRequestKey(topicPartition)) - respond(delayedProduce) - } - - // TODO: purgatory should not be responsible for sending back the 
responses - def respond(delayedProduce: DelayedProduce) { - val response = delayedProduce.respond(offsetManager) - requestChannel.sendResponse(new RequestChannel.Response(delayedProduce.request, new BoundedByteBufferSend(response))) - } -} diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index 68758e3..d6a8356 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -46,7 +46,7 @@ object ReplicaManager { case class PartitionDataAndOffset(data: FetchResponsePartitionData, offset: LogOffsetMetadata) -class ReplicaManager(val config: KafkaConfig, +class ReplicaManager(config: KafkaConfig, time: Time, val zkClient: ZkClient, scheduler: Scheduler, @@ -64,8 +64,8 @@ class ReplicaManager(val config: KafkaConfig, this.logIdent = "[Replica Manager on Broker " + localBrokerId + "]: " val stateChangeLogger = KafkaController.stateChangeLogger - var producerRequestPurgatory: ProducerRequestPurgatory = null - var fetchRequestPurgatory: FetchRequestPurgatory = null + var producerRequestPurgatory: ProducerDelayedOperationPurgatory = null + var fetchRequestPurgatory: FetchDelayedOperationPurgatory = null newGauge( "LeaderCount", @@ -105,7 +105,7 @@ class ReplicaManager(val config: KafkaConfig, * TODO: will be removed in 0.9 where we refactor server structure */ - def initWithRequestPurgatory(producerRequestPurgatory: ProducerRequestPurgatory, fetchRequestPurgatory: FetchRequestPurgatory) { + def initWithRequestPurgatory(producerRequestPurgatory: ProducerDelayedOperationPurgatory, fetchRequestPurgatory: FetchDelayedOperationPurgatory) { this.producerRequestPurgatory = producerRequestPurgatory this.fetchRequestPurgatory = fetchRequestPurgatory } diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala deleted file mode 100644 index ce06d2c..0000000 --- a/core/src/main/scala/kafka/server/RequestPurgatory.scala +++ /dev/null @@ -1,317 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import kafka.network._ -import kafka.utils._ -import kafka.metrics.KafkaMetricsGroup - -import java.util -import java.util.concurrent._ -import java.util.concurrent.atomic._ -import scala.collection._ - -import com.yammer.metrics.core.Gauge - - -/** - * A request whose processing needs to be delayed for at most the given delayMs - * The associated keys are used for bookeeping, and represent the "trigger" that causes this request to check if it is satisfied, - * for example a key could be a (topic, partition) pair. 
- */ -class DelayedRequest(val keys: Seq[Any], val request: RequestChannel.Request, delayMs: Long) extends DelayedItem[RequestChannel.Request](request, delayMs) { - val satisfied = new AtomicBoolean(false) -} - -/** - * A helper class for dealing with asynchronous requests with a timeout. A DelayedRequest has a request to delay - * and also a list of keys that can trigger the action. Implementations can add customized logic to control what it means for a given - * request to be satisfied. For example it could be that we are waiting for user-specified number of acks on a given (topic, partition) - * to be able to respond to a request or it could be that we are waiting for a given number of bytes to accumulate on a given request - * to be able to respond to that request (in the simple case we might wait for at least one byte to avoid busy waiting). - * - * For us the key is generally a (topic, partition) pair. - * By calling - * val isSatisfiedByMe = checkAndMaybeWatch(delayedRequest) - * we will check if a request is satisfied already, and if not add the request for watch on all its keys. - * - * It is up to the user to then call - * val satisfied = update(key, request) - * when a request relevant to the given key occurs. This triggers bookeeping logic and returns back any requests satisfied by this - * new request. - * - * An implementation provides extends two helper functions - * def checkSatisfied(request: R, delayed: T): Boolean - * this function returns true if the given request (in combination with whatever previous requests have happened) satisfies the delayed - * request delayed. This method will likely also need to do whatever bookkeeping is necessary. - * - * The second function is - * def expire(delayed: T) - * this function handles delayed requests that have hit their time limit without being satisfied. - * - */ -abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: Int = 1000) - extends Logging with KafkaMetricsGroup { - - /* a list of requests watching each key */ - private val watchersForKey = new Pool[Any, Watchers](Some((key: Any) => new Watchers)) - - /* the number of requests being watched, duplicates added on different watchers are also counted */ - private val watched = new AtomicInteger(0) - - /* background thread expiring requests that have been waiting too long */ - private val expiredRequestReaper = new ExpiredRequestReaper - private val expirationThread = Utils.newThread(name="request-expiration-task", runnable=expiredRequestReaper, daemon=false) - - newGauge( - "PurgatorySize", - new Gauge[Int] { - def value = watched.get() + expiredRequestReaper.numRequests - } - ) - - newGauge( - "NumDelayedRequests", - new Gauge[Int] { - def value = expiredRequestReaper.unsatisfied.get() - } - ) - - expirationThread.start() - - /** - * Try to add the request for watch on all keys. Return true iff the request is - * satisfied and the satisfaction is done by the caller. - * - * Requests can be watched on only a few of the keys if it is found satisfied when - * trying to add it to each one of the keys. In this case the request is still treated as satisfied - * and hence no longer watched. Those already added elements will be later purged by the expire reaper. 
- */ - def checkAndMaybeWatch(delayedRequest: T): Boolean = { - for(key <- delayedRequest.keys) { - val lst = watchersFor(key) - if(!lst.checkAndMaybeAdd(delayedRequest)) { - if(delayedRequest.satisfied.compareAndSet(false, true)) - return true - else - return false - } - } - - // if it is indeed watched, add to the expire queue also - expiredRequestReaper.enqueue(delayedRequest) - - false - } - - /** - * Update any watchers and return a list of newly satisfied requests. - */ - def update(key: Any): Seq[T] = { - val w = watchersForKey.get(key) - if(w == null) - Seq.empty - else - w.collectSatisfiedRequests() - } - - private def watchersFor(key: Any) = watchersForKey.getAndMaybePut(key) - - /** - * Check if this delayed request is already satisfied - */ - protected def checkSatisfied(request: T): Boolean - - /** - * Handle an expired delayed request - */ - protected def expire(delayed: T) - - /** - * Shutdown the expire reaper thread - */ - def shutdown() { - expiredRequestReaper.shutdown() - } - - /** - * A linked list of DelayedRequests watching some key with some associated - * bookkeeping logic. - */ - private class Watchers { - private val requests = new util.ArrayList[T] - - // potentially add the element to watch if it is not satisfied yet - def checkAndMaybeAdd(t: T): Boolean = { - synchronized { - // if it is already satisfied, do not add to the watch list - if (t.satisfied.get) - return false - // synchronize on the delayed request to avoid any race condition - // with expire and update threads on client-side. - if(t synchronized checkSatisfied(t)) { - return false - } - requests.add(t) - watched.getAndIncrement() - return true - } - } - - // traverse the list and purge satisfied elements - def purgeSatisfied(): Int = { - synchronized { - val iter = requests.iterator() - var purged = 0 - while(iter.hasNext) { - val curr = iter.next - if(curr.satisfied.get()) { - iter.remove() - watched.getAndDecrement() - purged += 1 - } - } - purged - } - } - - // traverse the list and try to satisfy watched elements - def collectSatisfiedRequests(): Seq[T] = { - val response = new mutable.ArrayBuffer[T] - synchronized { - val iter = requests.iterator() - while(iter.hasNext) { - val curr = iter.next - if(curr.satisfied.get) { - // another thread has satisfied this request, remove it - iter.remove() - } else { - // synchronize on curr to avoid any race condition with expire - // on client-side. 
- val satisfied = curr synchronized checkSatisfied(curr) - if(satisfied) { - iter.remove() - watched.getAndDecrement() - val updated = curr.satisfied.compareAndSet(false, true) - if(updated == true) { - response += curr - expiredRequestReaper.satisfyRequest() - } - } - } - } - } - response - } - } - - /** - * Runnable to expire requests that have sat unfullfilled past their deadline - */ - private class ExpiredRequestReaper extends Runnable with Logging { - this.logIdent = "ExpiredRequestReaper-%d ".format(brokerId) - - private val delayed = new DelayQueue[T] - private val running = new AtomicBoolean(true) - private val shutdownLatch = new CountDownLatch(1) - - /* The count of elements in the delay queue that are unsatisfied */ - private [kafka] val unsatisfied = new AtomicInteger(0) - - def numRequests = delayed.size() - - /** Main loop for the expiry thread */ - def run() { - while(running.get) { - try { - val curr = pollExpired() - if (curr != null) { - curr synchronized { - expire(curr) - } - } - if (watched.get + numRequests >= purgeInterval) { // see if we need to force a full purge - debug("Beginning purgatory purge") - val purged = purgeSatisfied() - debug("Purged %d requests from delay queue.".format(purged)) - val numPurgedFromWatchers = watchersForKey.values.map(_.purgeSatisfied()).sum - debug("Purged %d requests from watch lists.".format(numPurgedFromWatchers)) - } - } catch { - case e: Exception => - error("Error in long poll expiry thread: ", e) - } - } - shutdownLatch.countDown() - } - - /** Add a request to be expired */ - def enqueue(t: T) { - delayed.add(t) - unsatisfied.incrementAndGet() - } - - /** Shutdown the expiry thread*/ - def shutdown() { - debug("Shutting down.") - running.set(false) - shutdownLatch.await() - debug("Shut down complete.") - } - - /** Record the fact that we satisfied a request in the stats for the expiry queue */ - def satisfyRequest(): Unit = unsatisfied.getAndDecrement() - - /** - * Get the next expired event - */ - private def pollExpired(): T = { - while(true) { - val curr = delayed.poll(200L, TimeUnit.MILLISECONDS) - if (curr == null) - return null.asInstanceOf[T] - val updated = curr.satisfied.compareAndSet(false, true) - if(updated) { - unsatisfied.getAndDecrement() - return curr - } - } - throw new RuntimeException("This should not happen") - } - - /** - * Delete all satisfied events from the delay queue and the watcher lists - */ - private def purgeSatisfied(): Int = { - var purged = 0 - - // purge the delayed queue - val iter = delayed.iterator() - while(iter.hasNext) { - val curr = iter.next() - if(curr.satisfied.get) { - iter.remove() - purged += 1 - } - } - - purged - } - } - -} diff --git a/core/src/main/scala/kafka/utils/DelayedItem.scala b/core/src/main/scala/kafka/utils/DelayedItem.scala index d727649..3d7df84 100644 --- a/core/src/main/scala/kafka/utils/DelayedItem.scala +++ b/core/src/main/scala/kafka/utils/DelayedItem.scala @@ -20,7 +20,7 @@ package kafka.utils import java.util.concurrent._ import scala.math._ -class DelayedItem[T](val item: T, delay: Long, unit: TimeUnit) extends Delayed with Logging { +class DelayedItem(delay: Long, unit: TimeUnit) extends Delayed with Logging { val createdMs = SystemTime.milliseconds val delayMs = { @@ -29,8 +29,8 @@ class DelayedItem[T](val item: T, delay: Long, unit: TimeUnit) extends Delayed w else given } - def this(item: T, delayMs: Long) = - this(item, delayMs, TimeUnit.MILLISECONDS) + def this(delayMs: Long) = + this(delayMs, TimeUnit.MILLISECONDS) /** * The remaining delay time diff 
--git a/core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala b/core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala new file mode 100644 index 0000000..c9a5f2e --- /dev/null +++ b/core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.server + +import scala.collection._ +import org.junit.Test +import junit.framework.Assert._ +import kafka.message._ +import kafka.api._ +import kafka.utils.TestUtils +import org.scalatest.junit.JUnit3Suite + + +class DelayedOperationPurgatoryTest extends JUnit3Suite { + + val producerRequest1 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello1".getBytes))) + val producerRequest2 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello2".getBytes))) + var purgatory: MockDelayedOperationPurgatory = null + + override def setUp() { + super.setUp() + purgatory = new MockDelayedOperationPurgatory() + } + + override def tearDown() { + purgatory.shutdown() + super.tearDown() + } + + @Test + def testRequestSatisfaction() { + val r1 = new DelayedRequest(Array("test1"), null, 100000L) + val r2 = new DelayedRequest(Array("test2"), null, 100000L) + assertEquals("With no waiting requests, nothing should be satisfied", 0, purgatory.update("test1").size) + assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) + assertEquals("Still nothing satisfied", 0, purgatory.update("test1").size) + assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) + assertEquals("Still nothing satisfied", 0, purgatory.update("test2").size) + purgatory.satisfied += r1 + assertEquals("r1 satisfied", mutable.ArrayBuffer(r1), purgatory.update("test1")) + assertEquals("Nothing satisfied", 0, purgatory.update("test1").size) + purgatory.satisfied += r2 + assertEquals("r2 satisfied", mutable.ArrayBuffer(r2), purgatory.update("test2")) + assertEquals("Nothing satisfied", 0, purgatory.update("test2").size) + } + + @Test + def testRequestExpiry() { + val expiration = 20L + val r1 = new DelayedRequest(Array("test1"), null, expiration) + val r2 = new DelayedRequest(Array("test1"), null, 200000L) + val start = System.currentTimeMillis + assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) + assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) + purgatory.awaitExpiration(r1) + val elapsed = System.currentTimeMillis - start + assertTrue("r1 expired", purgatory.expired.contains(r1)) + assertTrue("r2 hasn't expired", !purgatory.expired.contains(r2)) + assertTrue("Time for expiration %d should at least %d".format(elapsed, expiration), elapsed >= 
expiration) + } + + class MockDelayedOperationPurgatory extends DelayedOperationPurgatory[DelayedRequest] { + val satisfied = mutable.Set[DelayedRequest]() + val expired = mutable.Set[DelayedRequest]() + def awaitExpiration(delayed: DelayedRequest) = { + delayed synchronized { + delayed.wait() + } + } + def checkSatisfied(delayed: DelayedRequest): Boolean = satisfied.contains(delayed) + def expire(delayed: DelayedRequest) { + expired += delayed + delayed synchronized { + delayed.notify() + } + } + } + +} \ No newline at end of file diff --git a/core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala b/core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala deleted file mode 100644 index 168712d..0000000 --- a/core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import scala.collection._ -import org.junit.Test -import junit.framework.Assert._ -import kafka.message._ -import kafka.api._ -import kafka.utils.TestUtils -import org.scalatest.junit.JUnit3Suite - - -class RequestPurgatoryTest extends JUnit3Suite { - - val producerRequest1 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello1".getBytes))) - val producerRequest2 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello2".getBytes))) - var purgatory: MockRequestPurgatory = null - - override def setUp() { - super.setUp() - purgatory = new MockRequestPurgatory() - } - - override def tearDown() { - purgatory.shutdown() - super.tearDown() - } - - @Test - def testRequestSatisfaction() { - val r1 = new DelayedRequest(Array("test1"), null, 100000L) - val r2 = new DelayedRequest(Array("test2"), null, 100000L) - assertEquals("With no waiting requests, nothing should be satisfied", 0, purgatory.update("test1").size) - assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) - assertEquals("Still nothing satisfied", 0, purgatory.update("test1").size) - assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) - assertEquals("Still nothing satisfied", 0, purgatory.update("test2").size) - purgatory.satisfied += r1 - assertEquals("r1 satisfied", mutable.ArrayBuffer(r1), purgatory.update("test1")) - assertEquals("Nothing satisfied", 0, purgatory.update("test1").size) - purgatory.satisfied += r2 - assertEquals("r2 satisfied", mutable.ArrayBuffer(r2), purgatory.update("test2")) - assertEquals("Nothing satisfied", 0, purgatory.update("test2").size) - } - - @Test - def testRequestExpiry() { - val expiration = 20L - val r1 = new DelayedRequest(Array("test1"), null, expiration) - val r2 = new DelayedRequest(Array("test1"), null, 200000L) - val start = 
System.currentTimeMillis - assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) - assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) - purgatory.awaitExpiration(r1) - val elapsed = System.currentTimeMillis - start - assertTrue("r1 expired", purgatory.expired.contains(r1)) - assertTrue("r2 hasn't expired", !purgatory.expired.contains(r2)) - assertTrue("Time for expiration %d should at least %d".format(elapsed, expiration), elapsed >= expiration) - } - - class MockRequestPurgatory extends RequestPurgatory[DelayedRequest] { - val satisfied = mutable.Set[DelayedRequest]() - val expired = mutable.Set[DelayedRequest]() - def awaitExpiration(delayed: DelayedRequest) = { - delayed synchronized { - delayed.wait() - } - } - def checkSatisfied(delayed: DelayedRequest): Boolean = satisfied.contains(delayed) - def expire(delayed: DelayedRequest) { - expired += delayed - delayed synchronized { - delayed.notify() - } - } - } - -} \ No newline at end of file -- 1.7.12.4 From d34094a30e900ee1d67af1dcf2e0db5f21051fd7 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Tue, 12 Aug 2014 09:49:17 -0700 Subject: [PATCH 02/15] change name back --- .../kafka/server/DelayedOperationPurgatory.scala | 285 -------------------- .../server/FetchDelayedOperationPurgatory.scala | 69 ----- .../scala/kafka/server/FetchRequestPurgatory.scala | 69 +++++ core/src/main/scala/kafka/server/KafkaApis.scala | 4 +- .../server/ProducerDelayedOperationPurgatory.scala | 69 ----- .../kafka/server/ProducerRequestPurgatory.scala | 69 +++++ .../main/scala/kafka/server/ReplicaManager.scala | 6 +- .../main/scala/kafka/server/RequestPurgatory.scala | 286 +++++++++++++++++++++ .../server/DelayedOperationPurgatoryTest.scala | 94 ------- .../unit/kafka/server/RequestPurgatoryTest.scala | 94 +++++++ 10 files changed, 523 insertions(+), 522 deletions(-) delete mode 100644 core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala delete mode 100644 core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala create mode 100644 core/src/main/scala/kafka/server/FetchRequestPurgatory.scala delete mode 100644 core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala create mode 100644 core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala create mode 100644 core/src/main/scala/kafka/server/RequestPurgatory.scala delete mode 100644 core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala create mode 100644 core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala diff --git a/core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala b/core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala deleted file mode 100644 index 3998b6d..0000000 --- a/core/src/main/scala/kafka/server/DelayedOperationPurgatory.scala +++ /dev/null @@ -1,285 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import kafka.network._ -import kafka.utils._ -import kafka.metrics.KafkaMetricsGroup - -import java.util -import java.util.concurrent._ -import java.util.concurrent.atomic._ -import scala.collection._ - -import com.yammer.metrics.core.Gauge - - -/** - * An operation whose processing needs to be delayed for at most the given delayMs; - * upon complete, the given callback function will be triggered. For example a delayed - * message append operation could be waiting for specified number of acks; or a delayed - * message fetch operation could be waiting for a given number of bytes to accumulate. - */ -abstract class DelayedOperation(delayMs: Long, onComplete: Boolean => Unit) extends DelayedItem(delayMs) { - val completed = new AtomicBoolean(false) - - /* - * Check if the delayed operation is already completed - * - * Note that concurrent threads can check if an operation can be completed or not, - * but only the first thread will succeed in completing the operation - */ - def tryComplete(): Boolean = completed.compareAndSet(false, true) - - /* - * When delayMs has elapsed, expire the delayed operation - */ - def onExpired() = onComplete(false) -} - -/** - * A helper purgatory class for bookkeeping delayed operations with a timeout, and expiring timed out operations. - * - */ -abstract class DelayedOperationPurgatory[T <: DelayedOperation](brokerId: Int = 0, purgeInterval: Int = 1000) - extends Logging with KafkaMetricsGroup { - - /* a list of requests watching each key */ - private val watchersForKey = new Pool[Any, Watchers](Some((key: Any) => new Watchers)) - - /* the number of requests being watched, duplicates added on different watchers are also counted */ - private val watched = new AtomicInteger(0) - - /* background thread expiring requests that have been waiting too long */ - private val expirationReaper = new ExpiredOperationReaper - - newGauge( - "PurgatorySize", - new Gauge[Int] { - def value = size - } - ) - - newGauge( - "NumDelayedOperations", - new Gauge[Int] { - def value = expirationReaper.numOperations - } - ) - - expirationThread.start() - - /** - * Check if the operation can be completed, if not watch it based on the given watch keys - * - * Note that a delayed operation can be watched on multiple keys, and hence due to concurrency may be - * found completed when trying to watch it on some later keys. In this case the operation is still - * treated as completed and hence no longer watched although it is still in the watch lists of - * the earlier keys. Those already watched elements will be later purged by the expire reaper. 
- * - * @param operation the delayed operation to be checked - * @param watchKeys keys for bookkeeping the operation - * @return true iff the delayed operations can be completed - */ - def tryCompleteElseWatch(operation: DelayedOperation, watchKeys: Seq[Any]): Boolean = { - for(key <- watchKeys) { - val watchers = watchersFor(key) - // if the operation is found completed, stop adding it to any further - // lists and return true immediately - if(!watchers.checkAndMaybeAdd(operation)) { - return true - } - } - - // if it is indeed watched, add to the expire queue also - watched.getAndIncrement() - expirationReaper.enqueue(operation) - - false - - } - - /** - * Return a list of completed operations with the given watch key. - */ - def getCompleted(key: Any): Seq[T] = { - val watchers = watchersForKey.get(key) - if(watchers == null) - Seq.empty - else - watchers.collectCompletedOperations() - } - - /* - * Return the watch list of the given key - */ - private def watchersFor(key: Any) = watchersForKey.getAndMaybePut(key) - - /* - * Return the size of the purgatory, which is size of watch lists plus the size of the expire reaper. - * Since an operation may still be in the watch lists even when it has been completed, this number - * may be larger than the number of real operations watched - */ - protected def size() = watchersForKey.values.map(_.numRequests).sum + expirationReaper.numOperations - - /** - * Shutdown the expire reaper thread - */ - def shutdown() { - expirationReaper.shutdown() - } - - /** - * A linked list of watched delayed operations based on some key - */ - private class Watchers { - private val requests = new util.ArrayList[T] - - // potentially add the element to watch if it is not satisfied yet - def checkAndMaybeAdd(t: T): Boolean = { - synchronized { - // if it is already satisfied, return false - if (t.completed.get()) - return false - // if the operation can be completed, return false; otherwise add to watch list - if(t.tryComplete()) { - return false - } else { - requests.add(t) - return true - } - } - } - - // traverse the list and purge satisfied elements - def purgeSatisfied(): Int = { - synchronized { - val iter = requests.iterator() - var purged = 0 - while (iter.hasNext) { - val curr = iter.next - if(curr.completed.get()) { - iter.remove() - purged += 1 - } - } - purged - } - } - - // traverse the list and try to satisfy watched elements - def collectCompletedOperations(): Seq[T] = { - val response = new mutable.ArrayBuffer[T] - synchronized { - val iter = requests.iterator() - while(iter.hasNext) { - val curr = iter.next - if (curr.completed.get()) { - // another thread has completed this request, just remove it - iter.remove() - } else { - val completed = curr.tryComplete() - if(completed) { - iter.remove() - watched.getAndDecrement() - response += curr - expirationReaper.satisfyRequest() - } - } - } - } - response - } - } - - /** - * A background reaper to expire delayed operations that have timed out - */ - private class ExpiredOperationReaper extends ShutdownableThread( - "ExpirationReaper-%d".format(brokerId), - false) { - - /* The queue storing all delayed operations */ - private val delayed = new DelayQueue[T] - - /* - * Return the number of delayed operations kept by the reaper - */ - def numOperations = delayed.size() - - /* - * Add a operation to be expired - */ - def enqueue(t: T) { - delayed.add(t) - unsatisfied.incrementAndGet() - } - - /** - * Get the next expired event - */ - private def pollExpired(): T = { - while (true) { - val curr = 
delayed.poll(200L, TimeUnit.MILLISECONDS) - if (curr == null) - return null.asInstanceOf[T] - // try set the operation failed (and hence completed), if succeed return it; - // otherwise try to get the next expired operation since this one has been completed by others - if (curr.completed.compareAndSet(false, true)) { - return curr - } - } - throw new RuntimeException("This should not happen") - } - - /** - * Delete all satisfied events from the delay queue and the watcher lists - */ - private def purgeSatisfied(): Int = { - var purged = 0 - - // purge the delayed queue - val iter = delayed.iterator() - while (iter.hasNext) { - val curr = iter.next() - if (curr.completed.get()) { - iter.remove() - purged += 1 - } - } - - purged - } - - - override def doWork() { - val curr = pollExpired() - if (curr != null) { - curr.onExpired() - } - if (size >= purgeInterval) { // see if we need to force a full purge - debug("Beginning purgatory purge") - val purged = purgeSatisfied() - debug("Purged %d operations from delay queue.".format(purged)) - val numPurgedFromWatchers = watchersForKey.values.map(_.purgeSatisfied()).sum - debug("Purged %d operations from watch lists.".format(numPurgedFromWatchers)) - } - } - } - -} diff --git a/core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala b/core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala deleted file mode 100644 index 71c5920..0000000 --- a/core/src/main/scala/kafka/server/FetchDelayedOperationPurgatory.scala +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package kafka.server - -import kafka.metrics.KafkaMetricsGroup -import kafka.network.RequestChannel -import kafka.api.FetchResponseSend - -import java.util.concurrent.TimeUnit - -/** - * The purgatory holding delayed fetch requests - */ -class FetchDelayedOperationPurgatory(replicaManager: ReplicaManager, requestChannel: RequestChannel) - extends DelayedOperationPurgatory[DelayedFetch](replicaManager.config.brokerId, replicaManager.config.fetchPurgatoryPurgeIntervalRequests) { - this.logIdent = "[FetchRequestPurgatory-%d] ".format(replicaManager.config.brokerId) - - private class DelayedFetchRequestMetrics(forFollower: Boolean) extends KafkaMetricsGroup { - private val metricPrefix = if (forFollower) "Follower" else "Consumer" - - val expiredRequestMeter = newMeter(metricPrefix + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) - } - - private val aggregateFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = true) - private val aggregateNonFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = false) - - private def recordDelayedFetchExpired(forFollower: Boolean) { - val metrics = if (forFollower) aggregateFollowerFetchRequestMetrics - else aggregateNonFollowerFetchRequestMetrics - - metrics.expiredRequestMeter.mark() - } - - /** - * Check if a specified delayed fetch request is satisfied - */ - def checkSatisfied(delayedFetch: DelayedFetch): Boolean = delayedFetch.isSatisfied(replicaManager) - - /** - * When a delayed fetch request expires just answer it with whatever data is present - */ - def expire(delayedFetch: DelayedFetch) { - debug("Expiring fetch request %s.".format(delayedFetch.fetch)) - val fromFollower = delayedFetch.fetch.isFromFollower - recordDelayedFetchExpired(fromFollower) - respond(delayedFetch) - } - - // TODO: purgatory should not be responsible for sending back the responses - def respond(delayedFetch: DelayedFetch) { - val response = delayedFetch.respond(replicaManager) - requestChannel.sendResponse(new RequestChannel.Response(delayedFetch.request, new FetchResponseSend(response))) - } -} \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala b/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala new file mode 100644 index 0000000..ed13188 --- /dev/null +++ b/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package kafka.server + +import kafka.metrics.KafkaMetricsGroup +import kafka.network.RequestChannel +import kafka.api.FetchResponseSend + +import java.util.concurrent.TimeUnit + +/** + * The purgatory holding delayed fetch requests + */ +class FetchRequestPurgatory(replicaManager: ReplicaManager, requestChannel: RequestChannel) + extends RequestPurgatory[DelayedFetch](replicaManager.config.brokerId, replicaManager.config.fetchPurgatoryPurgeIntervalRequests) { + this.logIdent = "[FetchRequestPurgatory-%d] ".format(replicaManager.config.brokerId) + + private class DelayedFetchRequestMetrics(forFollower: Boolean) extends KafkaMetricsGroup { + private val metricPrefix = if (forFollower) "Follower" else "Consumer" + + val expiredRequestMeter = newMeter(metricPrefix + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) + } + + private val aggregateFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = true) + private val aggregateNonFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = false) + + private def recordDelayedFetchExpired(forFollower: Boolean) { + val metrics = if (forFollower) aggregateFollowerFetchRequestMetrics + else aggregateNonFollowerFetchRequestMetrics + + metrics.expiredRequestMeter.mark() + } + + /** + * Check if a specified delayed fetch request is satisfied + */ + def checkSatisfied(delayedFetch: DelayedFetch): Boolean = delayedFetch.isSatisfied(replicaManager) + + /** + * When a delayed fetch request expires just answer it with whatever data is present + */ + def expire(delayedFetch: DelayedFetch) { + debug("Expiring fetch request %s.".format(delayedFetch.fetch)) + val fromFollower = delayedFetch.fetch.isFromFollower + recordDelayedFetchExpired(fromFollower) + respond(delayedFetch) + } + + // TODO: purgatory should not be responsible for sending back the responses + def respond(delayedFetch: DelayedFetch) { + val response = delayedFetch.respond(replicaManager) + requestChannel.sendResponse(new RequestChannel.Response(delayedFetch.request, new FetchResponseSend(response))) + } +} \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index ef64207..c584b55 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -42,8 +42,8 @@ class KafkaApis(val requestChannel: RequestChannel, val config: KafkaConfig, val controller: KafkaController) extends Logging { - val producerRequestPurgatory = new ProducerDelayedOperationPurgatory(replicaManager, offsetManager, requestChannel) - val fetchRequestPurgatory = new FetchDelayedOperationPurgatory(replicaManager, requestChannel) + val producerRequestPurgatory = new ProducerRequestPurgatory(replicaManager, offsetManager, requestChannel) + val fetchRequestPurgatory = new FetchRequestPurgatory(replicaManager, requestChannel) // TODO: the following line will be removed in 0.9 replicaManager.initWithRequestPurgatory(producerRequestPurgatory, fetchRequestPurgatory) var metadataCache = new MetadataCache diff --git a/core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala b/core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala deleted file mode 100644 index 4b950e1..0000000 --- a/core/src/main/scala/kafka/server/ProducerDelayedOperationPurgatory.scala +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import kafka.metrics.KafkaMetricsGroup -import kafka.utils.Pool -import kafka.network.{BoundedByteBufferSend, RequestChannel} - -import java.util.concurrent.TimeUnit - -/** - * The purgatory holding delayed producer requests - */ -class ProducerDelayedOperationPurgatory(replicaManager: ReplicaManager, offsetManager: OffsetManager, requestChannel: RequestChannel) - extends DelayedOperationPurgatory[DelayedProduce](replicaManager.config.brokerId, replicaManager.config.producerPurgatoryPurgeIntervalRequests) { - this.logIdent = "[ProducerRequestPurgatory-%d] ".format(replicaManager.config.brokerId) - - private class DelayedProducerRequestMetrics(keyLabel: String = DelayedRequestKey.globalLabel) extends KafkaMetricsGroup { - val expiredRequestMeter = newMeter(keyLabel + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) - } - - private val producerRequestMetricsForKey = { - val valueFactory = (k: DelayedRequestKey) => new DelayedProducerRequestMetrics(k.keyLabel + "-") - new Pool[DelayedRequestKey, DelayedProducerRequestMetrics](Some(valueFactory)) - } - - private val aggregateProduceRequestMetrics = new DelayedProducerRequestMetrics - - private def recordDelayedProducerKeyExpired(key: DelayedRequestKey) { - val keyMetrics = producerRequestMetricsForKey.getAndMaybePut(key) - List(keyMetrics, aggregateProduceRequestMetrics).foreach(_.expiredRequestMeter.mark()) - } - - /** - * Check if a specified delayed fetch request is satisfied - */ - def checkSatisfied(delayedProduce: DelayedProduce) = delayedProduce.isSatisfied(replicaManager) - - /** - * When a delayed produce request expires answer it with possible time out error codes - */ - def expire(delayedProduce: DelayedProduce) { - debug("Expiring produce request %s.".format(delayedProduce.produce)) - for ((topicPartition, responseStatus) <- delayedProduce.partitionStatus if responseStatus.acksPending) - recordDelayedProducerKeyExpired(new TopicPartitionRequestKey(topicPartition)) - respond(delayedProduce) - } - - // TODO: purgatory should not be responsible for sending back the responses - def respond(delayedProduce: DelayedProduce) { - val response = delayedProduce.respond(offsetManager) - requestChannel.sendResponse(new RequestChannel.Response(delayedProduce.request, new BoundedByteBufferSend(response))) - } -} diff --git a/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala b/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala new file mode 100644 index 0000000..d4a7d4a --- /dev/null +++ b/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.server + +import kafka.metrics.KafkaMetricsGroup +import kafka.utils.Pool +import kafka.network.{BoundedByteBufferSend, RequestChannel} + +import java.util.concurrent.TimeUnit + +/** + * The purgatory holding delayed producer requests + */ +class ProducerRequestPurgatory(replicaManager: ReplicaManager, offsetManager: OffsetManager, requestChannel: RequestChannel) + extends RequestPurgatory[DelayedProduce](replicaManager.config.brokerId, replicaManager.config.producerPurgatoryPurgeIntervalRequests) { + this.logIdent = "[ProducerRequestPurgatory-%d] ".format(replicaManager.config.brokerId) + + private class DelayedProducerRequestMetrics(keyLabel: String = DelayedRequestKey.globalLabel) extends KafkaMetricsGroup { + val expiredRequestMeter = newMeter(keyLabel + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) + } + + private val producerRequestMetricsForKey = { + val valueFactory = (k: DelayedRequestKey) => new DelayedProducerRequestMetrics(k.keyLabel + "-") + new Pool[DelayedRequestKey, DelayedProducerRequestMetrics](Some(valueFactory)) + } + + private val aggregateProduceRequestMetrics = new DelayedProducerRequestMetrics + + private def recordDelayedProducerKeyExpired(key: DelayedRequestKey) { + val keyMetrics = producerRequestMetricsForKey.getAndMaybePut(key) + List(keyMetrics, aggregateProduceRequestMetrics).foreach(_.expiredRequestMeter.mark()) + } + + /** + * Check if a specified delayed fetch request is satisfied + */ + def checkSatisfied(delayedProduce: DelayedProduce) = delayedProduce.isSatisfied(replicaManager) + + /** + * When a delayed produce request expires answer it with possible time out error codes + */ + def expire(delayedProduce: DelayedProduce) { + debug("Expiring produce request %s.".format(delayedProduce.produce)) + for ((topicPartition, responseStatus) <- delayedProduce.partitionStatus if responseStatus.acksPending) + recordDelayedProducerKeyExpired(new TopicPartitionRequestKey(topicPartition)) + respond(delayedProduce) + } + + // TODO: purgatory should not be responsible for sending back the responses + def respond(delayedProduce: DelayedProduce) { + val response = delayedProduce.respond(offsetManager) + requestChannel.sendResponse(new RequestChannel.Response(delayedProduce.request, new BoundedByteBufferSend(response))) + } +} diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index d6a8356..06e7108 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -64,8 +64,8 @@ class ReplicaManager(config: KafkaConfig, this.logIdent = "[Replica Manager on Broker " + localBrokerId + "]: " val stateChangeLogger = KafkaController.stateChangeLogger - var producerRequestPurgatory: ProducerDelayedOperationPurgatory = null - var fetchRequestPurgatory: FetchDelayedOperationPurgatory = null + var producerRequestPurgatory: 
ProducerRequestPurgatory = null + var fetchRequestPurgatory: FetchRequestPurgatory = null newGauge( "LeaderCount", @@ -105,7 +105,7 @@ class ReplicaManager(config: KafkaConfig, * TODO: will be removed in 0.9 where we refactor server structure */ - def initWithRequestPurgatory(producerRequestPurgatory: ProducerDelayedOperationPurgatory, fetchRequestPurgatory: FetchDelayedOperationPurgatory) { + def initWithRequestPurgatory(producerRequestPurgatory: ProducerRequestPurgatory, fetchRequestPurgatory: FetchRequestPurgatory) { this.producerRequestPurgatory = producerRequestPurgatory this.fetchRequestPurgatory = fetchRequestPurgatory } diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala new file mode 100644 index 0000000..dc4ce54 --- /dev/null +++ b/core/src/main/scala/kafka/server/RequestPurgatory.scala @@ -0,0 +1,286 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.server + +import kafka.utils._ +import kafka.metrics.KafkaMetricsGroup + +import java.util +import java.util.concurrent._ +import java.util.concurrent.atomic._ +import scala.collection._ + +import com.yammer.metrics.core.Gauge + + +/** + * An operation whose processing needs to be delayed for at most the given delayMs; + * upon complete, the given callback function will be triggered. For example a delayed + * message append operation could be waiting for specified number of acks; or a delayed + * message fetch operation could be waiting for a given number of bytes to accumulate. + */ +abstract class DelayedRequest(delayMs: Long, onComplete: Boolean => Unit) extends DelayedItem(delayMs) { + val completed = new AtomicBoolean(false) + + /* + * Check if the delayed operation is already completed + * + * Note that concurrent threads can check if an operation can be completed or not, + * but only the first thread will succeed in completing the operation + */ + def tryComplete(): Boolean = completed.compareAndSet(false, true) + + /* + * When delayMs has elapsed, expire the delayed operation + */ + def onExpired() = onComplete(false) +} + +/** + * A helper purgatory class for bookkeeping delayed operations with a timeout, and expiring timed out operations. 
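(A minimal usage sketch, for orientation only; DelayedHeartbeat and the "group-1" key are illustrative names, and the sketch assumes the purgatory API declared just below.)

    // A trivial delayed operation: the callback receives false if the operation expires.
    class DelayedHeartbeat(delayMs: Long, onComplete: Boolean => Unit)
      extends DelayedRequest(delayMs, onComplete)

    val purgatory = new RequestPurgatory[DelayedHeartbeat](brokerId = 0) {}
    val op = new DelayedHeartbeat(5000L, succeeded => println("completed before expiration: " + succeeded))

    // Try to complete the operation immediately; if that fails it is watched under the given keys.
    if (!purgatory.tryCompleteElseWatch(op, Seq("group-1"))) {
      // Later, when the state keyed by "group-1" changes, collect the operations that have completed since.
      val completed: Seq[DelayedHeartbeat] = purgatory.getCompleted("group-1")
    }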
+ * + */ +abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: Int = 1000) + extends Logging with KafkaMetricsGroup { + + /* a list of requests watching each key */ + private val watchersForKey = new Pool[Any, Watchers](Some((key: Any) => new Watchers)) + + /* the number of requests being watched, duplicates added on different watchers are also counted */ + private val watched = new AtomicInteger(0) + + /* background thread expiring requests that have been waiting too long */ + private val expirationReaper = new ExpiredOperationReaper + + newGauge( + "PurgatorySize", + new Gauge[Int] { + def value = size() + } + ) + + newGauge( + "NumDelayedOperations", + new Gauge[Int] { + def value = expirationReaper.numOperations + } + ) + + expirationThread.start() + + /** + * Check if the operation can be completed, if not watch it based on the given watch keys + * + * Note that a delayed operation can be watched on multiple keys, and hence due to concurrency may be + * found completed when trying to watch it on some later keys. In this case the operation is still + * treated as completed and hence no longer watched although it is still in the watch lists of + * the earlier keys. Those already watched elements will be later purged by the expire reaper. + * + * @param operation the delayed operation to be checked + * @param watchKeys keys for bookkeeping the operation + * @return true iff the delayed operations can be completed + */ + def tryCompleteElseWatch(operation: DelayedRequest, watchKeys: Seq[Any]): Boolean = { + for(key <- watchKeys) { + val watchers = watchersFor(key) + // if the operation is found completed, stop adding it to any further + // lists and return true immediately + if(!watchers.checkAndMaybeAdd(operation)) { + return true + } + } + + // if it is indeed watched, add to the expire queue also + watched.getAndIncrement() + expirationReaper.enqueue(operation) + + false + + } + + /** + * Return a list of completed operations with the given watch key. + * + * @return the list of completed operations + */ + def getCompleted(key: Any): Seq[T] = { + val watchers = watchersForKey.get(key) + if(watchers == null) + Seq.empty + else + watchers.collectCompletedOperations() + } + + /* + * Return the watch list of the given key + */ + private def watchersFor(key: Any) = watchersForKey.getAndMaybePut(key) + + /* + * Return the size of the purgatory, which is size of watch lists plus the size of the expire reaper. 
+ * Since an operation may still be in the watch lists even when it has been completed, this number + * may be larger than the number of real operations watched + */ + protected def size() = watchersForKey.values.map(_.numRequests).sum + expirationReaper.numOperations + + /** + * Shutdown the expire reaper thread + */ + def shutdown() { + expirationReaper.shutdown() + } + + /** + * A linked list of watched delayed operations based on some key + */ + private class Watchers { + private val requests = new util.ArrayList[T] + + // potentially add the element to watch if it is not satisfied yet + def checkAndMaybeAdd(t: T): Boolean = { + synchronized { + // if it is already satisfied, return false + if (t.completed.get()) + return false + // if the operation can be completed, return false; otherwise add to watch list + if(t.tryComplete()) { + return false + } else { + requests.add(t) + return true + } + } + } + + // traverse the list and purge satisfied elements + def purgeSatisfied(): Int = { + synchronized { + val iter = requests.iterator() + var purged = 0 + while (iter.hasNext) { + val curr = iter.next + if(curr.completed.get()) { + iter.remove() + purged += 1 + } + } + purged + } + } + + // traverse the list and try to satisfy watched elements + def collectCompletedOperations(): Seq[T] = { + val response = new mutable.ArrayBuffer[T] + synchronized { + val iter = requests.iterator() + while(iter.hasNext) { + val curr = iter.next + if (curr.completed.get()) { + // another thread has completed this request, just remove it + iter.remove() + } else { + val completed = curr.tryComplete() + if(completed) { + iter.remove() + watched.getAndDecrement() + response += curr + expirationReaper.satisfyRequest() + } + } + } + } + response + } + } + + /** + * A background reaper to expire delayed operations that have timed out + */ + private class ExpiredOperationReaper extends ShutdownableThread( + "ExpirationReaper-%d".format(brokerId), + false) { + + /* The queue storing all delayed operations */ + private val delayed = new DelayQueue[T] + + /* + * Return the number of delayed operations kept by the reaper + */ + def numOperations = delayed.size() + + /* + * Add a operation to be expired + */ + def enqueue(t: T) { + delayed.add(t) + unsatisfied.incrementAndGet() + } + + /** + * Get the next expired event + */ + private def pollExpired(): T = { + while (true) { + val curr = delayed.poll(200L, TimeUnit.MILLISECONDS) + if (curr == null) + return null.asInstanceOf[T] + // try set the operation failed (and hence completed), if succeed return it; + // otherwise try to get the next expired operation since this one has been completed by others + if (curr.completed.compareAndSet(false, true)) { + return curr + } + } + throw new RuntimeException("This should not happen") + } + + /** + * Delete all satisfied events from the delay queue and the watcher lists + */ + private def purgeSatisfied(): Int = { + var purged = 0 + + // purge the delayed queue + val iter = delayed.iterator() + while (iter.hasNext) { + val curr = iter.next() + if (curr.completed.get()) { + iter.remove() + purged += 1 + } + } + + purged + } + + + override def doWork() { + val curr = pollExpired() + if (curr != null) { + curr.onExpired() + } + if (size() >= purgeInterval) { // see if we need to force a full purge + debug("Beginning purgatory purge") + val purged = purgeSatisfied() + debug("Purged %d operations from delay queue.".format(purged)) + val numPurgedFromWatchers = watchersForKey.values.map(_.purgeSatisfied()).sum + debug("Purged %d 
operations from watch lists.".format(numPurgedFromWatchers)) + } + } + } + +} diff --git a/core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala b/core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala deleted file mode 100644 index c9a5f2e..0000000 --- a/core/src/test/scala/unit/kafka/server/DelayedOperationPurgatoryTest.scala +++ /dev/null @@ -1,94 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import scala.collection._ -import org.junit.Test -import junit.framework.Assert._ -import kafka.message._ -import kafka.api._ -import kafka.utils.TestUtils -import org.scalatest.junit.JUnit3Suite - - -class DelayedOperationPurgatoryTest extends JUnit3Suite { - - val producerRequest1 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello1".getBytes))) - val producerRequest2 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello2".getBytes))) - var purgatory: MockDelayedOperationPurgatory = null - - override def setUp() { - super.setUp() - purgatory = new MockDelayedOperationPurgatory() - } - - override def tearDown() { - purgatory.shutdown() - super.tearDown() - } - - @Test - def testRequestSatisfaction() { - val r1 = new DelayedRequest(Array("test1"), null, 100000L) - val r2 = new DelayedRequest(Array("test2"), null, 100000L) - assertEquals("With no waiting requests, nothing should be satisfied", 0, purgatory.update("test1").size) - assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) - assertEquals("Still nothing satisfied", 0, purgatory.update("test1").size) - assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) - assertEquals("Still nothing satisfied", 0, purgatory.update("test2").size) - purgatory.satisfied += r1 - assertEquals("r1 satisfied", mutable.ArrayBuffer(r1), purgatory.update("test1")) - assertEquals("Nothing satisfied", 0, purgatory.update("test1").size) - purgatory.satisfied += r2 - assertEquals("r2 satisfied", mutable.ArrayBuffer(r2), purgatory.update("test2")) - assertEquals("Nothing satisfied", 0, purgatory.update("test2").size) - } - - @Test - def testRequestExpiry() { - val expiration = 20L - val r1 = new DelayedRequest(Array("test1"), null, expiration) - val r2 = new DelayedRequest(Array("test1"), null, 200000L) - val start = System.currentTimeMillis - assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) - assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) - purgatory.awaitExpiration(r1) - val elapsed = System.currentTimeMillis - start - assertTrue("r1 expired", purgatory.expired.contains(r1)) - assertTrue("r2 hasn't expired", !purgatory.expired.contains(r2)) - 
assertTrue("Time for expiration %d should at least %d".format(elapsed, expiration), elapsed >= expiration) - } - - class MockDelayedOperationPurgatory extends DelayedOperationPurgatory[DelayedRequest] { - val satisfied = mutable.Set[DelayedRequest]() - val expired = mutable.Set[DelayedRequest]() - def awaitExpiration(delayed: DelayedRequest) = { - delayed synchronized { - delayed.wait() - } - } - def checkSatisfied(delayed: DelayedRequest): Boolean = satisfied.contains(delayed) - def expire(delayed: DelayedRequest) { - expired += delayed - delayed synchronized { - delayed.notify() - } - } - } - -} \ No newline at end of file diff --git a/core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala b/core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala new file mode 100644 index 0000000..a4b7f5b --- /dev/null +++ b/core/src/test/scala/unit/kafka/server/RequestPurgatoryTest.scala @@ -0,0 +1,94 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package kafka.server + +import scala.collection._ +import org.junit.Test +import junit.framework.Assert._ +import kafka.message._ +import kafka.api._ +import kafka.utils.TestUtils +import org.scalatest.junit.JUnit3Suite + + +class RequestPurgatoryTest extends JUnit3Suite { + + val producerRequest1 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello1".getBytes))) + val producerRequest2 = TestUtils.produceRequest("test", 0, new ByteBufferMessageSet(new Message("hello2".getBytes))) + var purgatory: MockDelayedOperationPurgatory = null + + override def setUp() { + super.setUp() + purgatory = new MockDelayedOperationPurgatory() + } + + override def tearDown() { + purgatory.shutdown() + super.tearDown() + } + + @Test + def testRequestSatisfaction() { + val r1 = new DelayedRequest(Array("test1"), null, 100000L) + val r2 = new DelayedRequest(Array("test2"), null, 100000L) + assertEquals("With no waiting requests, nothing should be satisfied", 0, purgatory.update("test1").size) + assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) + assertEquals("Still nothing satisfied", 0, purgatory.update("test1").size) + assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) + assertEquals("Still nothing satisfied", 0, purgatory.update("test2").size) + purgatory.satisfied += r1 + assertEquals("r1 satisfied", mutable.ArrayBuffer(r1), purgatory.update("test1")) + assertEquals("Nothing satisfied", 0, purgatory.update("test1").size) + purgatory.satisfied += r2 + assertEquals("r2 satisfied", mutable.ArrayBuffer(r2), purgatory.update("test2")) + assertEquals("Nothing satisfied", 0, purgatory.update("test2").size) + } + + @Test + def testRequestExpiry() { + val expiration = 20L + val r1 = new DelayedRequest(Array("test1"), 
null, expiration) + val r2 = new DelayedRequest(Array("test1"), null, 200000L) + val start = System.currentTimeMillis + assertFalse("r1 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r1)) + assertFalse("r2 not satisfied and hence watched", purgatory.checkAndMaybeWatch(r2)) + purgatory.awaitExpiration(r1) + val elapsed = System.currentTimeMillis - start + assertTrue("r1 expired", purgatory.expired.contains(r1)) + assertTrue("r2 hasn't expired", !purgatory.expired.contains(r2)) + assertTrue("Time for expiration %d should at least %d".format(elapsed, expiration), elapsed >= expiration) + } + + class MockRequestPurgatory extends RequestPurgatory[DelayedRequest] { + val satisfied = mutable.Set[DelayedRequest]() + val expired = mutable.Set[DelayedRequest]() + def awaitExpiration(delayed: DelayedRequest) = { + delayed synchronized { + delayed.wait() + } + } + def checkSatisfied(delayed: DelayedRequest): Boolean = satisfied.contains(delayed) + def expire(delayed: DelayedRequest) { + expired += delayed + delayed synchronized { + delayed.notify() + } + } + } + +} \ No newline at end of file -- 1.7.12.4 From 472a37ff653e092e68b1a551782282ffc7a9ab5e Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Wed, 13 Aug 2014 09:13:06 -0700 Subject: [PATCH 03/15] dummy --- .../src/main/scala/kafka/server/DelayedFetch.scala | 39 ++++++++++++---------- core/src/main/scala/kafka/server/KafkaApis.scala | 16 +++++++-- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index e0f14e2..d367786 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -37,34 +37,39 @@ import scala.collection.Seq * - should return whatever data is available. 
*/ -class DelayedFetch(override val keys: Seq[TopicPartitionRequestKey], - override val request: RequestChannel.Request, - override val delayMs: Long, - val fetch: FetchRequest, - private val partitionFetchOffsets: Map[TopicAndPartition, LogOffsetMetadata]) - extends DelayedRequest(keys, request, delayMs) { +case class FetchInfo(fetchMinBytes: Int, + fetchOnlyCommitted: Boolean, + fetchStartOffsets: Map[TopicAndPartition, LogOffsetMetadata]) { - def isSatisfied(replicaManager: ReplicaManager) : Boolean = { + override def toString = "FetchInfo [minBytes: " + fetchMinBytes + "] : " + + "[committedOnly: " + fetchOnlyCommitted + "] : " + "[startOffsets: " + fetchStartOffsets + "]" +} + + +class DelayedFetch(delayMs: Long, onComplete: Boolean => Unit, fetchInfo: FetchInfo, replicaManager: ReplicaManager) + extends DelayedRequest(delayMs, onComplete) { + + override def tryComplete() : Boolean = { var accumulatedSize = 0 - val fromFollower = fetch.isFromFollower - partitionFetchOffsets.foreach { + fetchInfo.fetchStartOffsets.foreach { case (topicAndPartition, fetchOffset) => try { if (fetchOffset != LogOffsetMetadata.UnknownOffsetMetadata) { val replica = replicaManager.getLeaderReplicaIfLocal(topicAndPartition.topic, topicAndPartition.partition) val endOffset = - if (fromFollower) - replica.logEndOffset - else + if (fetchInfo.fetchOnlyCommitted) replica.highWatermark + else + replica.logEndOffset if (endOffset.offsetOnOlderSegment(fetchOffset)) { // Case C, this can happen when the new follower replica fetching on a truncated leader - debug("Satisfying fetch request %s since it is fetching later segments of partition %s.".format(fetch, topicAndPartition)) + debug("Satisfying %s since it is fetching later segments of partition %s.".format(fetchInfo, topicAndPartition)) return true } else if (fetchOffset.offsetOnOlderSegment(endOffset)) { // Case C, this can happen when the folloer replica is lagging too much - debug("Satisfying fetch request %s immediately since it is fetching older segments.".format(fetch)) + debug("Satisfying %s immediately since it is fetching older segments.".format(fetchInfo)) return true } else if (fetchOffset.precedes(endOffset)) { accumulatedSize += endOffset.positionDiff(fetchOffset) @@ -72,16 +77,16 @@ class DelayedFetch(override val keys: Seq[TopicPartitionRequestKey], } } catch { case utpe: UnknownTopicOrPartitionException => // Case A - debug("Broker no longer know of %s, satisfy %s immediately".format(topicAndPartition, fetch)) + debug("Broker no longer know of %s, satisfy %s immediately".format(topicAndPartition, fetchInfo)) return true case nle: NotLeaderForPartitionException => // Case B - debug("Broker is no longer the leader of %s, satisfy %s immediately".format(topicAndPartition, fetch)) + debug("Broker is no longer the leader of %s, satisfy %s immediately".format(topicAndPartition, fetchInfo)) return true } } // Case D - accumulatedSize >= fetch.minBytes + accumulatedSize >= fetchInfo.fetchMinBytes } def respond(replicaManager: ReplicaManager): FetchResponse = { diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index c584b55..b1c7bda 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -327,8 +327,20 @@ class KafkaApis(val requestChannel: RequestChannel, fetchRequest.clientId)) // create a list of (topic, partition) pairs to use as keys for this delayed request val delayedFetchKeys = fetchRequest.requestInfo.keys.toSeq.map(new 
TopicPartitionRequestKey(_)) - val delayedFetch = new DelayedFetch(delayedFetchKeys, request, fetchRequest.maxWait, fetchRequest, - dataRead.mapValues(_.offset)) + + //val delayedFetch = new DelayedFetch(delayedFetchKeys, request, fetchRequest.maxWait, fetchRequest, + // dataRead.mapValues(_.offset)) + + def callback(succeeded: Boolean) { + if (succeeded) requestChannel.sendResponse(new RequestChannel.Response(request, new FetchResponseSend(FetchResponse(fetch.correlationId, replicaManager.readMessageSets(fetchRequest).mapValues(_.data))))) + } + + val delayedFetch = new DelayedFetch( + fetchRequest.maxWait, + callback, + dataRead.mapValues(_.offset), + replicaManager) + // add the fetch request for watch if it's not satisfied, otherwise send the response back val satisfiedByMe = fetchRequestPurgatory.checkAndMaybeWatch(delayedFetch) -- 1.7.12.4 From 73bde4073e0bdce98f9303dd64b684231006502e Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Wed, 13 Aug 2014 15:48:13 -0700 Subject: [PATCH 04/15] wip version 1 --- .../main/scala/kafka/api/ProducerResponse.scala | 3 +- core/src/main/scala/kafka/log/Log.scala | 26 +- .../src/main/scala/kafka/server/DelayedFetch.scala | 25 +- .../main/scala/kafka/server/DelayedProduce.scala | 108 ++++---- core/src/main/scala/kafka/server/KafkaApis.scala | 230 +++------------- .../main/scala/kafka/server/ReplicaManager.scala | 291 ++++++++++++++++----- .../main/scala/kafka/server/RequestPurgatory.scala | 24 +- 7 files changed, 375 insertions(+), 332 deletions(-) diff --git a/core/src/main/scala/kafka/api/ProducerResponse.scala b/core/src/main/scala/kafka/api/ProducerResponse.scala index a286272..5d1fac4 100644 --- a/core/src/main/scala/kafka/api/ProducerResponse.scala +++ b/core/src/main/scala/kafka/api/ProducerResponse.scala @@ -43,8 +43,7 @@ object ProducerResponse { case class ProducerResponseStatus(var error: Short, offset: Long) -case class ProducerResponse(correlationId: Int, - status: Map[TopicAndPartition, ProducerResponseStatus]) +case class ProducerResponse(correlationId: Int, status: Map[TopicAndPartition, ProducerResponseStatus]) extends RequestOrResponse() { /** diff --git a/core/src/main/scala/kafka/log/Log.scala b/core/src/main/scala/kafka/log/Log.scala index 0ddf97b..002c902 100644 --- a/core/src/main/scala/kafka/log/Log.scala +++ b/core/src/main/scala/kafka/log/Log.scala @@ -31,6 +31,21 @@ import scala.collection.JavaConversions import com.yammer.metrics.core.Gauge +/** + * Struct to hold various quantities we compute about each message set before appending to the log + * @param firstOffset The first offset in the message set + * @param lastOffset The last offset in the message set + * @param shallowCount The number of shallow messages + * @param validBytes The number of valid bytes + * @param codec The codec used in the message set + * @param offsetsMonotonic Are the offsets in this message set monotonically increasing + */ +case class LogAppendInfo(var firstOffset: Long, var lastOffset: Long, codec: CompressionCodec, shallowCount: Int, validBytes: Int, offsetsMonotonic: Boolean) + +object LogAppendInfo { + val UnknownLogAppendInfo = LogAppendInfo(-1, -1, NoCompressionCodec, -1, -1, false) +} + /** * An append-only log for storing messages. 
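(Illustrative sketch only: with LogAppendInfo now a top-level case class, callers outside Log can work with append results directly; the helper below mirrors the message-count arithmetic that ReplicaManager.appendToLocalLog performs later in this series, and the sample values are made up.)

    import kafka.message.NoCompressionCodec

    // Number of messages covered by an append result; -1 offsets mean nothing was appended.
    def numAppendedMessages(info: LogAppendInfo): Long =
      if (info.firstOffset == -1L || info.lastOffset == -1L) 0
      else info.lastOffset - info.firstOffset + 1

    val info = LogAppendInfo(firstOffset = 100L, lastOffset = 104L, codec = NoCompressionCodec,
                             shallowCount = 5, validBytes = 512, offsetsMonotonic = true)
    assert(numAppendedMessages(info) == 5)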
@@ -302,17 +317,6 @@ class Log(val dir: File, } /** - * Struct to hold various quantities we compute about each message set before appending to the log - * @param firstOffset The first offset in the message set - * @param lastOffset The last offset in the message set - * @param shallowCount The number of shallow messages - * @param validBytes The number of valid bytes - * @param codec The codec used in the message set - * @param offsetsMonotonic Are the offsets in this message set monotonically increasing - */ - case class LogAppendInfo(var firstOffset: Long, var lastOffset: Long, codec: CompressionCodec, shallowCount: Int, validBytes: Int, offsetsMonotonic: Boolean) - - /** * Validate the following: *
    *
  1. each message matches its CRC diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index d367786..4b7542f 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -38,6 +38,7 @@ import scala.collection.Seq */ case class FetchInfo(fetchMinBytes: Int, + fetchOnlyLeader: Boolean, fetchOnlyCommitted: Boolean, fetchStartOffsets: Map[TopicAndPartition, LogOffsetMetadata]) { @@ -47,7 +48,10 @@ case class FetchInfo(fetchMinBytes: Int, } -class DelayedFetch(delayMs: Long, onComplete: Boolean => Unit, fetchInfo: FetchInfo, replicaManager: ReplicaManager) +class DelayedFetch(delayMs: Long, + fetchInfo: FetchInfo, + replicaManager: ReplicaManager, + onComplete: Map[TopicAndPartition, PartitionDataAndOffset] => Unit) extends DelayedRequest(delayMs, onComplete) { override def tryComplete() : Boolean = { @@ -66,11 +70,11 @@ class DelayedFetch(delayMs: Long, onComplete: Boolean => Unit, fetchInfo: FetchI if (endOffset.offsetOnOlderSegment(fetchOffset)) { // Case C, this can happen when the new follower replica fetching on a truncated leader debug("Satisfying %s since it is fetching later segments of partition %s.".format(fetchInfo, topicAndPartition)) - return true + return super.tryComplete() } else if (fetchOffset.offsetOnOlderSegment(endOffset)) { // Case C, this can happen when the folloer replica is lagging too much debug("Satisfying %s immediately since it is fetching older segments.".format(fetchInfo)) - return true + return super.tryComplete() } else if (fetchOffset.precedes(endOffset)) { accumulatedSize += endOffset.positionDiff(fetchOffset) } @@ -78,15 +82,24 @@ class DelayedFetch(delayMs: Long, onComplete: Boolean => Unit, fetchInfo: FetchI } catch { case utpe: UnknownTopicOrPartitionException => // Case A debug("Broker no longer know of %s, satisfy %s immediately".format(topicAndPartition, fetchInfo)) - return true + return super.tryComplete() case nle: NotLeaderForPartitionException => // Case B debug("Broker is no longer the leader of %s, satisfy %s immediately".format(topicAndPartition, fetchInfo)) - return true + return super.tryComplete() } } // Case D - accumulatedSize >= fetchInfo.fetchMinBytes + if (accumulatedSize >= fetchInfo.fetchMinBytes) + super.tryComplete() + else + false + } + + override def onExpired() { + // read whatever data is available and return + val readData = replicaManager.readMessageSets(fetch) + onComplete(readData) } def respond(replicaManager: ReplicaManager): FetchResponse = { diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala index 9481508..99bdf6f 100644 --- a/core/src/main/scala/kafka/server/DelayedProduce.scala +++ b/core/src/main/scala/kafka/server/DelayedProduce.scala @@ -35,81 +35,93 @@ import scala.collection.Seq * B.2 - else, at least requiredAcks replicas should be caught up to this request. 
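(For orientation, an illustrative sketch of how the reworked, callback-style DelayedProduce defined below is meant to be driven; it mirrors the ReplicaManager.appendMessages code later in this patch, and produceRequest, replicaManager and sendProducerResponse stand in for caller-side context that is not shown here.)

    // Per-partition bookkeeping: the offset followers must reach (last appended offset + 1)
    // and the response that will be returned if the operation times out.
    val status = Map(
      TopicAndPartition("test", 0) ->
        ProduceStatus(101L, ProducerResponseStatus(ErrorMapping.NoError, 100L)))

    val delayedProduce = new DelayedProduce(
      produceRequest.ackTimeoutMs.toLong,                     // how long to wait for the acks
      ProduceInfo(produceRequest.requiredAcks, status),       // acks required plus per-partition status
      replicaManager,
      responseStatus => sendProducerResponse(responseStatus)) // caller-supplied completion callback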
*/ -class DelayedProduce(override val keys: Seq[TopicPartitionRequestKey], - override val request: RequestChannel.Request, - override val delayMs: Long, - val produce: ProducerRequest, - val partitionStatus: Map[TopicAndPartition, DelayedProduceResponseStatus], - val offsetCommitRequestOpt: Option[OffsetCommitRequest] = None) - extends DelayedRequest(keys, request, delayMs) with Logging { +case class ProduceStatus(requiredOffset: Long, responseStatus: ProducerResponseStatus) { + @volatile var acksPending = false - // first update the acks pending variable according to the error code - partitionStatus foreach { case (topicAndPartition, delayedStatus) => - if (delayedStatus.responseStatus.error == ErrorMapping.NoError) { - // Timeout error state will be cleared when required acks are received - delayedStatus.acksPending = true - delayedStatus.responseStatus.error = ErrorMapping.RequestTimedOutCode - } else { - delayedStatus.acksPending = false - } + override def toString = "acksPending:%b, error: %d, startOffset: %d, requiredOffset: %d" + .format(acksPending, responseStatus.error, responseStatus.offset, requiredOffset) +} - trace("Initial partition status for %s is %s".format(topicAndPartition, delayedStatus)) - } +case class ProduceInfo(produceRequiredAcks: Short, + produceStatus: Map[TopicAndPartition, ProduceStatus]) { - def respond(offsetManager: OffsetManager): RequestOrResponse = { - val responseStatus = partitionStatus.mapValues(status => status.responseStatus) + override def toString = "ProduceInfo [requiredAcks: " + produceRequiredAcks + "] : " + + "[partitionStatus: " + produceStatus + "]" +} - val errorCode = responseStatus.find { case (_, status) => - status.error != ErrorMapping.NoError - }.map(_._2.error).getOrElse(ErrorMapping.NoError) +class DelayedProduce(delayMs: Long, + produceInfo: ProduceInfo, + replicaManager: ReplicaManager, + onComplete: Map[TopicAndPartition, ProducerResponseStatus] => Unit) + extends DelayedRequest(delayMs) with Logging { - if (errorCode == ErrorMapping.NoError) { - offsetCommitRequestOpt.foreach(ocr => offsetManager.putOffsets(ocr.groupId, ocr.requestInfo) ) + // first update the acks pending variable according to the error code + produceInfo.produceStatus foreach { case (topicAndPartition, status) => + if (status.responseStatus.error == ErrorMapping.NoError) { + // Timeout error state will be cleared when required acks are received + status.acksPending = true + status.responseStatus.error = ErrorMapping.RequestTimedOutCode + } else { + status.acksPending = false } - val response = offsetCommitRequestOpt.map(_.responseFor(errorCode, offsetManager.config.maxMetadataSize)) - .getOrElse(ProducerResponse(produce.correlationId, responseStatus)) - - response + trace("Initial partition status for %s is %s".format(topicAndPartition, status)) } - def isSatisfied(replicaManager: ReplicaManager) = { + override def tryComplete(): Boolean = { // check for each partition if it still has pending acks - partitionStatus.foreach { case (topicAndPartition, fetchPartitionStatus) => + produceInfo.produceStatus.foreach { case (topicAndPartition, status) => trace("Checking producer request satisfaction for %s, acksPending = %b" - .format(topicAndPartition, fetchPartitionStatus.acksPending)) + .format(topicAndPartition, status.acksPending)) // skip those partitions that have already been satisfied - if (fetchPartitionStatus.acksPending) { + if (status.acksPending) { val partitionOpt = replicaManager.getPartition(topicAndPartition.topic, topicAndPartition.partition) val (hasEnough, 
errorCode) = partitionOpt match { case Some(partition) => partition.checkEnoughReplicasReachOffset( - fetchPartitionStatus.requiredOffset, - produce.requiredAcks) + status.requiredOffset, + produceInfo.produceRequiredAcks) case None => (false, ErrorMapping.UnknownTopicOrPartitionCode) } if (errorCode != ErrorMapping.NoError) { - fetchPartitionStatus.acksPending = false - fetchPartitionStatus.responseStatus.error = errorCode + status.acksPending = false + status.responseStatus.error = errorCode } else if (hasEnough) { - fetchPartitionStatus.acksPending = false - fetchPartitionStatus.responseStatus.error = ErrorMapping.NoError + status.acksPending = false + status.responseStatus.error = ErrorMapping.NoError } } } // unblocked if there are no partitions with pending acks - val satisfied = ! partitionStatus.exists(p => p._2.acksPending) - satisfied + if (! produceInfo.produceStatus.values.exists(p => p.acksPending)) + super.tryComplete() + else + false } -} -case class DelayedProduceResponseStatus(val requiredOffset: Long, - val responseStatus: ProducerResponseStatus) { - @volatile var acksPending = false + override def onExpired() { + // return the current response status + val responseStatus = produceInfo.produceStatus.mapValues(status => status.responseStatus) + onComplete(responseStatus) + } + + def respond(offsetManager: OffsetManager): RequestOrResponse = { + val responseStatus = partitionStatus.mapValues(status => status.responseStatus) + + val errorCode = responseStatus.find { case (_, status) => + status.error != ErrorMapping.NoError + }.map(_._2.error).getOrElse(ErrorMapping.NoError) - override def toString = - "acksPending:%b, error: %d, startOffset: %d, requiredOffset: %d".format( - acksPending, responseStatus.error, responseStatus.offset, requiredOffset) + if (errorCode == ErrorMapping.NoError) { + offsetCommitRequestOpt.foreach(ocr => offsetManager.putOffsets(ocr.groupId, ocr.requestInfo) ) + } + + val response = offsetCommitRequestOpt.map(_.responseFor(errorCode, offsetManager.config.maxMetadataSize)) + .getOrElse(ProducerResponse(produce.correlationId, responseStatus)) + + response + } } + diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index b1c7bda..949f57e 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -151,215 +151,61 @@ class KafkaApis(val requestChannel: RequestChannel, * Handle a produce request or offset commit request (which is really a specialized producer request) */ def handleProducerOrOffsetCommitRequest(request: RequestChannel.Request) { - val (produceRequest, offsetCommitRequestOpt) = - if (request.requestId == RequestKeys.OffsetCommitKey) { - val offsetCommitRequest = request.requestObj.asInstanceOf[OffsetCommitRequest] - (producerRequestFromOffsetCommit(offsetCommitRequest), Some(offsetCommitRequest)) - } else { - (request.requestObj.asInstanceOf[ProducerRequest], None) - } - - val sTime = SystemTime.milliseconds - val localProduceResults = appendToLocalLog(produceRequest, offsetCommitRequestOpt.nonEmpty) - debug("Produce to local log in %d ms".format(SystemTime.milliseconds - sTime)) - - val firstErrorCode = localProduceResults.find(_.errorCode != ErrorMapping.NoError).map(_.errorCode).getOrElse(ErrorMapping.NoError) - - val numPartitionsInError = localProduceResults.count(_.error.isDefined) - if(produceRequest.requiredAcks == 0) { - // no operation needed if producer request.required.acks = 0; however, if there is any 
exception in handling the request, since - // no response is expected by the producer the handler will send a close connection response to the socket server - // to close the socket so that the producer client will know that some exception has happened and will refresh its metadata - if (numPartitionsInError != 0) { - info(("Send the close connection response due to error handling produce request " + - "[clientId = %s, correlationId = %s, topicAndPartition = %s] with Ack=0") - .format(produceRequest.clientId, produceRequest.correlationId, produceRequest.topicPartitionMessageSizeMap.keySet.mkString(","))) - requestChannel.closeConnection(request.processor, request) - } else { - - if (firstErrorCode == ErrorMapping.NoError) - offsetCommitRequestOpt.foreach(ocr => offsetManager.putOffsets(ocr.groupId, ocr.requestInfo)) - - if (offsetCommitRequestOpt.isDefined) { - val response = offsetCommitRequestOpt.get.responseFor(firstErrorCode, config.offsetMetadataMaxSize) - requestChannel.sendResponse(new RequestChannel.Response(request, new BoundedByteBufferSend(response))) - } else + val produceRequest = request.requestObj.asInstanceOf[ProducerRequest] + + // the callback for sending the response + def sendResponseCallback(responseStatus: Map[TopicAndPartition, ProducerResponseStatus]) { + val numPartitionsInError = responseStatus.values.count(_.error.isDefined) + + if(produceRequest.requiredAcks == 0) { + // no operation needed if producer request.required.acks = 0; however, if there is any exception in handling the request, since + // no response is expected by the producer the handler will send a close connection response to the socket server + // to close the socket so that the producer client will know that some exception has happened and will refresh its metadata + if (numPartitionsInError != 0) { + info(("Send the close connection response due to error handling produce request " + + "[clientId = %s, correlationId = %s, topicAndPartition = %s] with Ack=0") + .format(produceRequest.clientId, produceRequest.correlationId, produceRequest.topicPartitionMessageSizeMap.keySet.mkString(","))) + requestChannel.closeConnection(request.processor, request) + } else { requestChannel.noOperation(request.processor, request) + } + } else { + val response = ProducerResponse(produceRequest.correlationId, responseStatus) + requestChannel.sendResponse(new RequestChannel.Response(request, new BoundedByteBufferSend(response))) } - } else if (produceRequest.requiredAcks == 1 || - produceRequest.numPartitions <= 0 || - numPartitionsInError == produceRequest.numPartitions) { - - if (firstErrorCode == ErrorMapping.NoError) { - offsetCommitRequestOpt.foreach(ocr => offsetManager.putOffsets(ocr.groupId, ocr.requestInfo) ) - } - - val statuses = localProduceResults.map(r => r.key -> ProducerResponseStatus(r.errorCode, r.start)).toMap - val response = offsetCommitRequestOpt.map(_.responseFor(firstErrorCode, config.offsetMetadataMaxSize)) - .getOrElse(ProducerResponse(produceRequest.correlationId, statuses)) - - requestChannel.sendResponse(new RequestChannel.Response(request, new BoundedByteBufferSend(response))) - } else { - // create a list of (topic, partition) pairs to use as keys for this delayed request - val producerRequestKeys = produceRequest.data.keys.map( - topicAndPartition => new TopicPartitionRequestKey(topicAndPartition)).toSeq - val statuses = localProduceResults.map(r => - r.key -> DelayedProduceResponseStatus(r.end + 1, ProducerResponseStatus(r.errorCode, r.start))).toMap - val delayedRequest = new 
DelayedProduce( - producerRequestKeys, - request, - produceRequest.ackTimeoutMs.toLong, - produceRequest, - statuses, - offsetCommitRequestOpt) - - // add the produce request for watch if it's not satisfied, otherwise send the response back - val satisfiedByMe = producerRequestPurgatory.checkAndMaybeWatch(delayedRequest) - if (satisfiedByMe) - producerRequestPurgatory.respond(delayedRequest) } + // call the replica manager to append messages to the replicas + replicaManager.appendMessages( + produceRequest.ackTimeoutMs.toLong, + produceRequest.requiredAcks, + produceRequest.data, + sendResponseCallback) + // we do not need the data anymore produceRequest.emptyData() } - case class ProduceResult(key: TopicAndPartition, start: Long, end: Long, error: Option[Throwable] = None) { - def this(key: TopicAndPartition, throwable: Throwable) = - this(key, -1L, -1L, Some(throwable)) - - def errorCode = error match { - case None => ErrorMapping.NoError - case Some(error) => ErrorMapping.codeFor(error.getClass.asInstanceOf[Class[Throwable]]) - } - } - - /** - * Helper method for handling a parsed producer request - */ - private def appendToLocalLog(producerRequest: ProducerRequest, isOffsetCommit: Boolean): Iterable[ProduceResult] = { - val partitionAndData: Map[TopicAndPartition, MessageSet] = producerRequest.data - trace("Append [%s] to local log ".format(partitionAndData.toString)) - partitionAndData.map {case (topicAndPartition, messages) => - try { - if (Topic.InternalTopics.contains(topicAndPartition.topic) && - !(isOffsetCommit && topicAndPartition.topic == OffsetManager.OffsetsTopicName)) { - throw new InvalidTopicException("Cannot append to internal topic %s".format(topicAndPartition.topic)) - } - val partitionOpt = replicaManager.getPartition(topicAndPartition.topic, topicAndPartition.partition) - val info = partitionOpt match { - case Some(partition) => - partition.appendMessagesToLeader(messages.asInstanceOf[ByteBufferMessageSet]) - case None => throw new UnknownTopicOrPartitionException("Partition %s doesn't exist on %d" - .format(topicAndPartition, brokerId)) - } - - val numAppendedMessages = if (info.firstOffset == -1L || info.lastOffset == -1L) 0 else (info.lastOffset - info.firstOffset + 1) - - // update stats for successfully appended bytes and messages as bytesInRate and messageInRate - BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).bytesInRate.mark(messages.sizeInBytes) - BrokerTopicStats.getBrokerAllTopicsStats.bytesInRate.mark(messages.sizeInBytes) - BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).messagesInRate.mark(numAppendedMessages) - BrokerTopicStats.getBrokerAllTopicsStats.messagesInRate.mark(numAppendedMessages) - - trace("%d bytes written to log %s-%d beginning at offset %d and ending at offset %d" - .format(messages.size, topicAndPartition.topic, topicAndPartition.partition, info.firstOffset, info.lastOffset)) - ProduceResult(topicAndPartition, info.firstOffset, info.lastOffset) - } catch { - // NOTE: Failed produce requests is not incremented for UnknownTopicOrPartitionException and NotLeaderForPartitionException - // since failed produce requests metric is supposed to indicate failure of a broker in handling a produce request - // for a partition it is the leader for - case e: KafkaStorageException => - fatal("Halting due to unrecoverable I/O error while handling produce request: ", e) - Runtime.getRuntime.halt(1) - null - case ite: InvalidTopicException => - warn("Produce request with correlation id %d from client %s on partition %s failed 
due to %s".format( - producerRequest.correlationId, producerRequest.clientId, topicAndPartition, ite.getMessage)) - new ProduceResult(topicAndPartition, ite) - case utpe: UnknownTopicOrPartitionException => - warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( - producerRequest.correlationId, producerRequest.clientId, topicAndPartition, utpe.getMessage)) - new ProduceResult(topicAndPartition, utpe) - case nle: NotLeaderForPartitionException => - warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( - producerRequest.correlationId, producerRequest.clientId, topicAndPartition, nle.getMessage)) - new ProduceResult(topicAndPartition, nle) - case e: Throwable => - BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).failedProduceRequestRate.mark() - BrokerTopicStats.getBrokerAllTopicsStats.failedProduceRequestRate.mark() - error("Error processing ProducerRequest with correlation id %d from client %s on partition %s" - .format(producerRequest.correlationId, producerRequest.clientId, topicAndPartition), e) - new ProduceResult(topicAndPartition, e) - } - } - } - /** * Handle a fetch request */ def handleFetchRequest(request: RequestChannel.Request) { val fetchRequest = request.requestObj.asInstanceOf[FetchRequest] - val dataRead = replicaManager.readMessageSets(fetchRequest) - - // if the fetch request comes from the follower, - // update its corresponding log end offset - if(fetchRequest.isFromFollower) - recordFollowerLogEndOffsets(fetchRequest.replicaId, dataRead.mapValues(_.offset)) - - // check if this fetch request can be satisfied right away - val bytesReadable = dataRead.values.map(_.data.messages.sizeInBytes).sum - val errorReadingData = dataRead.values.foldLeft(false)((errorIncurred, dataAndOffset) => - errorIncurred || (dataAndOffset.data.error != ErrorMapping.NoError)) - // send the data immediately if 1) fetch request does not want to wait - // 2) fetch request does not require any data - // 3) has enough data to respond - // 4) some error happens while reading data - if(fetchRequest.maxWait <= 0 || - fetchRequest.numPartitions <= 0 || - bytesReadable >= fetchRequest.minBytes || - errorReadingData) { - debug("Returning fetch response %s for fetch request with correlation id %d to client %s" - .format(dataRead.values.map(_.data.error).mkString(","), fetchRequest.correlationId, fetchRequest.clientId)) - val response = new FetchResponse(fetchRequest.correlationId, dataRead.mapValues(_.data)) - requestChannel.sendResponse(new RequestChannel.Response(request, new FetchResponseSend(response))) - } else { - debug("Putting fetch request with correlation id %d from client %s into purgatory".format(fetchRequest.correlationId, - fetchRequest.clientId)) - // create a list of (topic, partition) pairs to use as keys for this delayed request - val delayedFetchKeys = fetchRequest.requestInfo.keys.toSeq.map(new TopicPartitionRequestKey(_)) - - //val delayedFetch = new DelayedFetch(delayedFetchKeys, request, fetchRequest.maxWait, fetchRequest, - // dataRead.mapValues(_.offset)) - - def callback(succeeded: Boolean) { - if (succeeded) requestChannel.sendResponse(new RequestChannel.Response(request, new FetchResponseSend(FetchResponse(fetch.correlationId, replicaManager.readMessageSets(fetchRequest).mapValues(_.data))))) - } - - val delayedFetch = new DelayedFetch( - fetchRequest.maxWait, - callback, - dataRead.mapValues(_.offset), - replicaManager) + // the callback for sending the response + def 
sendResponseCallback(responsePartitionData: Map[TopicAndPartition, FetchResponsePartitionData]) { - // add the fetch request for watch if it's not satisfied, otherwise send the response back - val satisfiedByMe = fetchRequestPurgatory.checkAndMaybeWatch(delayedFetch) - if (satisfiedByMe) - fetchRequestPurgatory.respond(delayedFetch) + val response = FetchResponse(fetchRequest.correlationId, responsePartitionData) + requestChannel.sendResponse(new RequestChannel.Response(request, new FetchResponseSend(response))) } - private def recordFollowerLogEndOffsets(replicaId: Int, offsets: Map[TopicAndPartition, LogOffsetMetadata]) { - debug("Record follower log end offsets: %s ".format(offsets)) - offsets.foreach { - case (topicAndPartition, offset) => - replicaManager.updateReplicaLEOAndPartitionHW(topicAndPartition.topic, - topicAndPartition.partition, replicaId, offset) - - // for producer requests with ack > 1, we need to check - // if they can be unblocked after some follower's log end offsets have moved - replicaManager.unblockDelayedProduceRequests(new TopicPartitionRequestKey(topicAndPartition)) - } + // call the replica manager to fetch messages from the replicas + replicaManager.fetchMessages( + fetchRequest.maxWait.toLong, + fetchRequest.replicaId, + fetchRequest.minBytes, + fetchRequest.requestInfo, + sendResponseCallback) } /** diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index 06e7108..eb25660 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -20,11 +20,11 @@ import kafka.api._ import kafka.common._ import kafka.utils._ import kafka.cluster.{Broker, Partition, Replica} -import kafka.log.LogManager +import kafka.log.{LogAppendInfo, LogManager} import kafka.metrics.KafkaMetricsGroup import kafka.controller.KafkaController import kafka.common.TopicAndPartition -import kafka.message.MessageSet +import kafka.message.{ByteBufferMessageSet, MessageSet} import java.util.concurrent.atomic.AtomicBoolean import java.io.{IOException, File} @@ -45,6 +45,19 @@ object ReplicaManager { case class PartitionDataAndOffset(data: FetchResponsePartitionData, offset: LogOffsetMetadata) +case class LogAppendResult(info: LogAppendInfo, error: Option[Throwable] = None) { + def errorCode = error match { + case None => ErrorMapping.NoError + case Some(e) => ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) + } +} + +case class LogReadResult(info: FetchDataInfo, hw: Long, error: Option[Throwable] = None) { + def errorCode = error match { + case None => ErrorMapping.NoError + case Some(e) => ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) + } +} class ReplicaManager(config: KafkaConfig, time: Time, @@ -237,74 +250,228 @@ class ReplicaManager(config: KafkaConfig, } /** - * Read from all the offset details given and return a map of - * (topic, partition) -> PartitionData + * Append messages to the leader replicas of the partitions, and wait for them to be replicated to other replicas; + * the callback function will be triggered either when the timeout has elapsed or the required acks are satisfied */ - def readMessageSets(fetchRequest: FetchRequest) = { - val isFetchFromFollower = fetchRequest.isFromFollower - fetchRequest.requestInfo.map - { - case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) => - val partitionDataAndOffsetInfo = - try { - val (fetchInfo, highWatermark) = readMessageSet(topic, partition, offset, fetchSize, 
fetchRequest.replicaId) - BrokerTopicStats.getBrokerTopicStats(topic).bytesOutRate.mark(fetchInfo.messageSet.sizeInBytes) - BrokerTopicStats.getBrokerAllTopicsStats.bytesOutRate.mark(fetchInfo.messageSet.sizeInBytes) - if (isFetchFromFollower) { - debug("Partition [%s,%d] received fetch request from follower %d" - .format(topic, partition, fetchRequest.replicaId)) - } - new PartitionDataAndOffset(new FetchResponsePartitionData(ErrorMapping.NoError, highWatermark, fetchInfo.messageSet), fetchInfo.fetchOffset) - } catch { - // NOTE: Failed fetch requests is not incremented for UnknownTopicOrPartitionException and NotLeaderForPartitionException - // since failed fetch requests metric is supposed to indicate failure of a broker in handling a fetch request - // for a partition it is the leader for - case utpe: UnknownTopicOrPartitionException => - warn("Fetch request with correlation id %d from client %s on partition [%s,%d] failed due to %s".format( - fetchRequest.correlationId, fetchRequest.clientId, topic, partition, utpe.getMessage)) - new PartitionDataAndOffset(new FetchResponsePartitionData(ErrorMapping.codeFor(utpe.getClass.asInstanceOf[Class[Throwable]]), -1L, MessageSet.Empty), LogOffsetMetadata.UnknownOffsetMetadata) - case nle: NotLeaderForPartitionException => - warn("Fetch request with correlation id %d from client %s on partition [%s,%d] failed due to %s".format( - fetchRequest.correlationId, fetchRequest.clientId, topic, partition, nle.getMessage)) - new PartitionDataAndOffset(new FetchResponsePartitionData(ErrorMapping.codeFor(nle.getClass.asInstanceOf[Class[Throwable]]), -1L, MessageSet.Empty), LogOffsetMetadata.UnknownOffsetMetadata) - case t: Throwable => - BrokerTopicStats.getBrokerTopicStats(topic).failedFetchRequestRate.mark() - BrokerTopicStats.getBrokerAllTopicsStats.failedFetchRequestRate.mark() - error("Error when processing fetch request for partition [%s,%d] offset %d from %s with correlation id %d. 
Possible cause: %s" - .format(topic, partition, offset, if (isFetchFromFollower) "follower" else "consumer", fetchRequest.correlationId, t.getMessage)) - new PartitionDataAndOffset(new FetchResponsePartitionData(ErrorMapping.codeFor(t.getClass.asInstanceOf[Class[Throwable]]), -1L, MessageSet.Empty), LogOffsetMetadata.UnknownOffsetMetadata) - } - (TopicAndPartition(topic, partition), partitionDataAndOffsetInfo) + def appendMessages(timeout: Long, + requiredAcks : Short, + messagesPerPartition: Map[TopicAndPartition, MessageSet], + callbackOnComplete: Map[TopicAndPartition, ProducerResponseStatus] => Unit) { + + val sTime = SystemTime.milliseconds + val localProduceResults = appendToLocalLog(messagesPerPartition) + debug("Produce to local log in %d ms".format(SystemTime.milliseconds - sTime)) + + val produceStatus = localProduceResults.mapValues(result => + ProduceStatus( + result.info.lastOffset + 1 // required offset + ProducerResponseStatus(result.errorCode, result.info.firstOffset)) // response status + ) + + if(requiredAcks == 0) { + // if required acks = 0 we can trigger complete immediately + val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) + callbackOnComplete(produceResponseStatus) + } else if (produceRequest.requiredAcks == 1 || + messagesPerPartition.size <= 0 || + localProduceResults.values.count(_.error.isDefined) == produceRequest.numPartitions) { + // if required acks = 1 or all partition appends have failed we can trigger complete immediately + val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) + callbackOnComplete(produceResponseStatus) + } else { + // create delayed produce operation and try to watch it in the purgatory + val delayedRequest = new DelayedProduce(timeout, ProduceInfo(requiredAcks, produceStatus), this, callbackOnComplete) + val producerRequestKeys = messagesPerPartition.keys.map(TopicPartitionRequestKey(_)).toSeq + + val completedByMe = producerRequestPurgatory.tryCompleteElseWatch(delayedRequest, producerRequestKeys) + if (completedByMe) { + val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) + callbackOnComplete(produceResponseStatus) + } + } + } + + /** + * Append the messages to the local replica logs + */ + private def appendToLocalLog(messagesPerPartition: Map[TopicAndPartition, MessageSet]): Map[TopicAndPartition, LogAppendResult] = { + trace("Append [%s] to local log ".format(messagesPerPartition)) + messagesPerPartition.map { case (topicAndPartition, messages) => + try { + val partitionOpt = getPartition(topicAndPartition.topic, topicAndPartition.partition) + val info = partitionOpt match { + case Some(partition) => + partition.appendMessagesToLeader(messages.asInstanceOf[ByteBufferMessageSet]) + case None => throw new UnknownTopicOrPartitionException("Partition %s doesn't exist on %d" + .format(topicAndPartition, brokerId)) + } + + val numAppendedMessages = + if (info.firstOffset == -1L || info.lastOffset == -1L) + 0 + else + info.lastOffset - info.firstOffset + 1 + + // update stats for successfully appended bytes and messages as bytesInRate and messageInRate + BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).bytesInRate.mark(messages.sizeInBytes) + BrokerTopicStats.getBrokerAllTopicsStats.bytesInRate.mark(messages.sizeInBytes) + BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).messagesInRate.mark(numAppendedMessages) + BrokerTopicStats.getBrokerAllTopicsStats.messagesInRate.mark(numAppendedMessages) + + trace("%d bytes 
written to log %s-%d beginning at offset %d and ending at offset %d" + .format(messages.size, topicAndPartition.topic, topicAndPartition.partition, info.firstOffset, info.lastOffset)) + (topicAndPartition, LogAppendResult(info)) + } catch { + // NOTE: Failed produce requests is not incremented for UnknownTopicOrPartitionException and NotLeaderForPartitionException + // since failed produce requests metric is supposed to indicate failure of a broker in handling a produce request + // for a partition it is the leader for + case e: KafkaStorageException => + fatal("Halting due to unrecoverable I/O error while handling produce request: ", e) + Runtime.getRuntime.halt(1) + (topicAndPartition, null) + case utpe: UnknownTopicOrPartitionException => // TODO + warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( + producerRequest.correlationId, producerRequest.clientId, topicAndPartition, utpe.getMessage)) + (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, utpe)) + case nle: NotLeaderForPartitionException => + warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( + producerRequest.correlationId, producerRequest.clientId, topicAndPartition, nle.getMessage)) + (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, nle)) + case e: Throwable => + BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).failedProduceRequestRate.mark() + BrokerTopicStats.getBrokerAllTopicsStats.failedProduceRequestRate.mark() + error("Error processing ProducerRequest with correlation id %d from client %s on partition %s" + .format(producerRequest.correlationId, producerRequest.clientId, topicAndPartition), e) + (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, e)) + } + } + } + + /** + * Fetch messages from the leader replica, + * the callback function will be triggered either when timeout or required fetch info is satisfied + */ + def fetchMessages(timeout: Long, + replicaId: Int, + fetchMinBytes: Int, + fetchInfo: Map[TopicAndPartition, PartitionFetchInfo], + callbackOnComplete: Map[TopicAndPartition, FetchResponsePartitionData] => Unit) { + + val fetchOnlyLeader: Boolean = replicaId != Request.DebuggingConsumerId + val fetchOnlyCommitted: Boolean = ! 
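A small illustration of the bookkeeping above (assuming the -1L sentinel used for unknown offsets): an append whose offsets are unknown contributes nothing to the messagesInRate metric.

object AppendStats {
  // zero messages counted when either offset is the unknown sentinel
  def numAppendedMessages(firstOffset: Long, lastOffset: Long): Long =
    if (firstOffset == -1L || lastOffset == -1L) 0L
    else lastOffset - firstOffset + 1
}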
Request.isValidBrokerId(fetchRequest.replicaId) + + // read from local logs + val fetchResults = readFromLocalLog(fetchOnlyLeader, fetchOnlyCommitted, fetchInfo) + + // if the fetch comes from the follower, + // update its corresponding log end offset + if(Request.isValidBrokerId(fetchRequest.replicaId)) + recordFollowerLogEndOffsets(replicaId, dataRead.mapValues(_.offset)) + + // check if this fetch request can be satisfied right away + val bytesReadable = fetchResults.values.map(_.info.messageSet.sizeInBytes).sum + val errorReadingData = fetchResults.values.foldLeft(false) ((errorIncurred, readResult) => + errorIncurred || (readResult.errorCode != ErrorMapping.NoError)) + // send the data immediately if 1) fetch request does not want to wait + // 2) fetch request does not require any data + // 3) has enough data to respond + // 4) some error happens while reading data + if(timeout <= 0 || + fetchInfo.size <= 0 || + bytesReadable >= fetchMinBytes || + errorReadingData) { + val fetchPartitionData = fetchResults.mapValues(result => FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) + callbackOnComplete(fetchPartitionData) + } else { + val fetchStartOffsets = fetchResults.mapValues(result => result.info.fetchOffset) + val delayedFetch = new DelayedFetch(time, FetchInfo(fetchMinBytes, fetchOnlyLeader, fetchOnlyCommitted, fetchStartOffsets), this, callbackOnComplete) + // create a list of (topic, partition) pairs to use as keys for this delayed request + val delayedFetchKeys = fetchInfo.keys.map(new TopicPartitionRequestKey(_)).toSeq + + // add the fetch request for watch if it's not satisfied, otherwise send the response back + val completedByMe = fetchRequestPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys) + if (completedByMe) { + // fetch again to get whatever is available + val fetchPartitionData = readFromLocalLog(fetchOnlyLeader, fetchOnlyCommitted, fetchInfo) + .mapValues(result => FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) + callbackOnComplete(fetchPartitionData) + } } } /** * Read from a single topic/partition at the given offset upto maxSize bytes */ - private def readMessageSet(topic: String, - partition: Int, - offset: Long, - maxSize: Int, - fromReplicaId: Int): (FetchDataInfo, Long) = { - // check if the current broker is the leader for the partitions - val localReplica = if(fromReplicaId == Request.DebuggingConsumerId) - getReplicaOrException(topic, partition) - else - getLeaderReplicaIfLocal(topic, partition) - trace("Fetching log segment for topic, partition, offset, size = " + (topic, partition, offset, maxSize)) - val maxOffsetOpt = - if (Request.isValidBrokerId(fromReplicaId)) - None - else - Some(localReplica.highWatermark.messageOffset) - val fetchInfo = localReplica.log match { - case Some(log) => - log.read(offset, maxSize, maxOffsetOpt) - case None => - error("Leader for partition [%s,%d] does not have a local log".format(topic, partition)) - FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty) + private def readFromLocalLog(readOnlyIfLeader: Boolean, + readOnlyCommitted: Boolean, + readInfo: Map[TopicAndPartition, PartitionFetchInfo]): Map[TopicAndPartition, LogReadResult] = { + + readInfo.map { case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) => + val partitionDataAndOffsetInfo = + try { + trace("Fetching log segment for topic %s, partition %d, offset %ld, size %d" + .format(topic, partition, offset, fetchSize)) + + // decide whether 
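The four early-return conditions above condense into a small predicate; the types below are simplified stand-ins (plain String keys, a ReadOutcome in place of LogReadResult), so this is a sketch of the decision rather than the real method.

case class ReadOutcome(bytes: Int, errorCode: Short)

object FetchFastPath {
  def respondImmediately(timeoutMs: Long, minBytes: Int,
                         results: Map[String, ReadOutcome]): Boolean =
    timeoutMs <= 0 ||                               // 1) the fetch does not want to wait
    results.isEmpty ||                              // 2) the fetch requests no partitions
    results.values.map(_.bytes).sum >= minBytes ||  // 3) enough bytes are already readable
    results.values.exists(_.errorCode != 0)         // 4) an error occurred while reading
  // when this is false, a DelayedFetch is created and watched on its
  // (topic, partition) request keys instead
}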
to only fetch from leader + val localReplica = if (readOnlyIfLeader) + getLeaderReplicaIfLocal(topic, partition) + else + getReplicaOrException(topic, partition) + + // decide whether to only fetch committed data (i.e. messages below high watermark) + val maxOffsetOpt = if (readOnlyCommitted) + None + else + Some(localReplica.highWatermark.messageOffset) + + // read on log + val logReadInfo = localReplica.log match { + case Some(log) => + log.read(offset, maxSize, maxOffsetOpt) + case None => + error("Leader for partition [%s,%d] does not have a local log".format(topic, partition)) + FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty) + } + + BrokerTopicStats.getBrokerTopicStats(topic).bytesOutRate.mark(logReadInfo.messageSet.sizeInBytes) + BrokerTopicStats.getBrokerAllTopicsStats.bytesOutRate.mark(logReadInfo.messageSet.sizeInBytes) + + LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, ErrorMapping.NoError) + } catch { + // NOTE: Failed fetch requests is not incremented for UnknownTopicOrPartitionException, NotLeaderForPartitionException + // and ReplicaNotAvailableException since failed fetch requests metric is supposed to indicate failure of a broker in handling a fetch request + // for a partition it is the leader for + case utpe: UnknownTopicOrPartitionException => // TODO + warn("Fetch request with correlation id %d from client %s on partition [%s,%d] failed due to %s".format( + fetchRequest.correlationId, fetchRequest.clientId, topic, partition, utpe.getMessage)) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, utpe) + case nle: NotLeaderForPartitionException => + warn("Fetch request with correlation id %d from client %s on partition [%s,%d] failed due to %s".format( + fetchRequest.correlationId, fetchRequest.clientId, topic, partition, nle.getMessage)) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, nle) + case rnae: ReplicaNotAvailableException => + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, rnae) + case t: Throwable => + BrokerTopicStats.getBrokerTopicStats(topic).failedFetchRequestRate.mark() + BrokerTopicStats.getBrokerAllTopicsStats.failedFetchRequestRate.mark() + error("Error when processing fetch request for partition [%s,%d] offset %d from %s with correlation id %d. 
Possible cause: %s" + .format(topic, partition, offset, if (isFetchFromFollower) "follower" else "consumer", fetchRequest.correlationId, t.getMessage)) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, t) + } + (TopicAndPartition(topic, partition), partitionDataAndOffsetInfo) + } + } + + private def recordFollowerLogEndOffsets(replicaId: Int, offsets: Map[TopicAndPartition, LogOffsetMetadata]) { + debug("Record follower log end offsets: %s ".format(offsets)) + offsets.foreach { + case (topicAndPartition, offset) => + updateReplicaLEOAndPartitionHW(topicAndPartition.topic, topicAndPartition.partition, replicaId, offset) + + // for producer requests with ack > 1, we need to check + // if they can be unblocked after some follower's log end offsets have moved + unblockDelayedProduceRequests(new TopicPartitionRequestKey(topicAndPartition)) } - (fetchInfo, localReplica.highWatermark.messageOffset) } def maybeUpdateMetadataCache(updateMetadataRequest: UpdateMetadataRequest, metadataCache: MetadataCache) { diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala index dc4ce54..cbf32be 100644 --- a/core/src/main/scala/kafka/server/RequestPurgatory.scala +++ b/core/src/main/scala/kafka/server/RequestPurgatory.scala @@ -34,7 +34,7 @@ import com.yammer.metrics.core.Gauge * message append operation could be waiting for specified number of acks; or a delayed * message fetch operation could be waiting for a given number of bytes to accumulate. */ -abstract class DelayedRequest(delayMs: Long, onComplete: Boolean => Unit) extends DelayedItem(delayMs) { +abstract class DelayedRequest(delayMs: Long, onComplete: Any => Unit) extends DelayedItem(delayMs) { val completed = new AtomicBoolean(false) /* @@ -48,7 +48,7 @@ abstract class DelayedRequest(delayMs: Long, onComplete: Boolean => Unit) extend /* * When delayMs has elapsed, expire the delayed operation */ - def onExpired() = onComplete(false) + def onExpired() = onComplete(null) } /** @@ -97,20 +97,25 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt */ def tryCompleteElseWatch(operation: DelayedRequest, watchKeys: Seq[Any]): Boolean = { for(key <- watchKeys) { + // if the operation is already completed, stopping adding it to + // any further lists and return false + if (operation.completed.get()) + return false val watchers = watchersFor(key) - // if the operation is found completed, stop adding it to any further - // lists and return true immediately - if(!watchers.checkAndMaybeAdd(operation)) { + // if the operation is completed by myself, stop adding it to + // any further lists and return true immediately + if(! watchers.checkAndMaybeAdd(operation)) { return true } } // if it is indeed watched, add to the expire queue also - watched.getAndIncrement() - expirationReaper.enqueue(operation) + if (! 
operation.completed.get()) {
+      watched.getAndIncrement()
+      expirationReaper.enqueue(operation)
+    }
     false
-
   }
   /**
@@ -154,9 +159,6 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt
     // potentially add the element to watch if it is not satisfied yet
     def checkAndMaybeAdd(t: T): Boolean = {
       synchronized {
-        // if it is already satisfied, return false
-        if (t.completed.get())
-          return false
         // if the operation can be completed, return false; otherwise add to watch list
         if(t.tryComplete()) {
           return false
-- 
1.7.12.4

From 0157beb183b7f407801670b55d3a926dbe0a91ce Mon Sep 17 00:00:00 2001
From: Guozhang Wang
Date: Wed, 13 Aug 2014 16:08:14 -0700
Subject: [PATCH 05/15] removal respond from delayed requests

---
 core/src/main/scala/kafka/server/DelayedFetch.scala   |  5 -----
 core/src/main/scala/kafka/server/DelayedProduce.scala | 17 -----------------
 2 files changed, 22 deletions(-)

diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala
index 4b7542f..40abebe 100644
--- a/core/src/main/scala/kafka/server/DelayedFetch.scala
+++ b/core/src/main/scala/kafka/server/DelayedFetch.scala
@@ -101,9 +101,4 @@ class DelayedFetch(delayMs: Long,
     val readData = replicaManager.readMessageSets(fetch)
     onComplete(readData)
   }
-
-  def respond(replicaManager: ReplicaManager): FetchResponse = {
-    val topicData = replicaManager.readMessageSets(fetch)
-    FetchResponse(fetch.correlationId, topicData.mapValues(_.data))
-  }
 }
\ No newline at end of file
diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala
index 99bdf6f..1b6ad16 100644
--- a/core/src/main/scala/kafka/server/DelayedProduce.scala
+++ b/core/src/main/scala/kafka/server/DelayedProduce.scala
@@ -106,22 +106,5 @@ class DelayedProduce(delayMs: Long,
     val responseStatus = produceInfo.produceStatus.mapValues(status => status.responseStatus)
     onComplete(responseStatus)
   }
-
-  def respond(offsetManager: OffsetManager): RequestOrResponse = {
-    val responseStatus = partitionStatus.mapValues(status => status.responseStatus)
-
-    val errorCode = responseStatus.find { case (_, status) =>
-      status.error != ErrorMapping.NoError
-    }.map(_._2.error).getOrElse(ErrorMapping.NoError)
-
-    if (errorCode == ErrorMapping.NoError) {
-      offsetCommitRequestOpt.foreach(ocr => offsetManager.putOffsets(ocr.groupId, ocr.requestInfo) )
-    }
-
-    val response = offsetCommitRequestOpt.map(_.responseFor(errorCode, offsetManager.config.maxMetadataSize))
-                                         .getOrElse(ProducerResponse(produce.correlationId, responseStatus))
-
-    response
-  }
 }
-- 
1.7.12.4

From 83a61d6d785abcc2087d58b0d2dd81e8ec12cccc Mon Sep 17 00:00:00 2001
From: Guozhang Wang
Date: Thu, 14 Aug 2014 16:36:35 -0700
Subject: [PATCH 06/15] move warning logs

---
 .../src/main/scala/kafka/common/ErrorMapping.scala |  2 ++
 core/src/main/scala/kafka/server/KafkaApis.scala   | 21 +++++++++++++--
 .../main/scala/kafka/server/ReplicaManager.scala   | 30 +++++++---------------
 3 files changed, 30 insertions(+), 23 deletions(-)

diff --git a/core/src/main/scala/kafka/common/ErrorMapping.scala b/core/src/main/scala/kafka/common/ErrorMapping.scala
index 3fae791..8232e35 100644
--- a/core/src/main/scala/kafka/common/ErrorMapping.scala
+++ b/core/src/main/scala/kafka/common/ErrorMapping.scala
@@ -79,4 +79,6 @@ object ErrorMapping {
     throw codeToException(code).newInstance()
 
   def exceptionFor(code: Short) : Throwable = codeToException(code).newInstance()
+
+  def exceptionNameFor(code: Short) : String =
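To make the tryCompleteElseWatch change above concrete, here is a minimal, self-contained sketch of the contract; a mutable buffer stands in for the per-key Watchers pool and the expiration reaper is elided, so this is illustrative only.

import java.util.concurrent.atomic.AtomicBoolean
import scala.collection.mutable

class MiniDelayedOp {
  private val completed = new AtomicBoolean(false)
  // real subclasses first check their completion criteria, then call this;
  // only the first caller wins the CAS and gets to run the completion logic
  def tryComplete(): Boolean = completed.compareAndSet(false, true)
  def isCompleted: Boolean = completed.get()
}

object MiniPurgatory {
  // Returns true iff the calling thread completed the operation itself. An
  // operation found already completed while being added to later watch lists
  // is simply not watched any further.
  def tryCompleteElseWatch(op: MiniDelayedOp,
                           watchLists: Seq[mutable.Buffer[MiniDelayedOp]]): Boolean = {
    for (list <- watchLists) {
      if (op.isCompleted) return false   // another thread finished it first
      if (op.tryComplete()) return true  // this thread finished it; stop watching
      list.synchronized { list += op }   // still pending: watch it on this key too
    }
    false                                // left pending; an expiration reaper would time it out
  }
}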
codeToException(code).getName() } diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 949f57e..373f3e5 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -155,13 +155,22 @@ class KafkaApis(val requestChannel: RequestChannel, // the callback for sending the response def sendResponseCallback(responseStatus: Map[TopicAndPartition, ProducerResponseStatus]) { - val numPartitionsInError = responseStatus.values.count(_.error.isDefined) + var errorInResponse = false + responseStatus.foreach { case (topicAndPartition, status) => + // Here we only print warnings for known errors; if it is unknown, it will cause + // an error message in the replica manager already and hence can be ignored here + if (status.error != ErrorMapping.NoError && status.error != ErrorMapping.UnknownCode) { + warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( + produceRequest.correlationId, produceRequest.clientId, topicAndPartition, ErrorMapping.exceptionNameFor(status.error))) + errorInResponse = true + } + } if(produceRequest.requiredAcks == 0) { // no operation needed if producer request.required.acks = 0; however, if there is any exception in handling the request, since // no response is expected by the producer the handler will send a close connection response to the socket server // to close the socket so that the producer client will know that some exception has happened and will refresh its metadata - if (numPartitionsInError != 0) { + if (errorInResponse) { info(("Send the close connection response due to error handling produce request " + "[clientId = %s, correlationId = %s, topicAndPartition = %s] with Ack=0") .format(produceRequest.clientId, produceRequest.correlationId, produceRequest.topicPartitionMessageSizeMap.keySet.mkString(","))) @@ -194,6 +203,14 @@ class KafkaApis(val requestChannel: RequestChannel, // the callback for sending the response def sendResponseCallback(responsePartitionData: Map[TopicAndPartition, FetchResponsePartitionData]) { + responsePartitionData.foreach { case (topicAndPartition, response) => + // Here we only print warnings for known errors; if it is unknown, it will cause + // an error message in the replica manager already and hence can be ignored here + if (status.error != ErrorMapping.NoError && status.error != ErrorMapping.UnknownCode) { + warn("Fetch request with correlation id %d from client %s on partition %s failed due to %s".format( + fetchRequest.correlationId, fetchRequest.clientId, topicAndPartition, ErrorMapping.exceptionNameFor(response.error))) + } + } val response = FetchResponse(fetchRequest.correlationId, responsePartitionData) requestChannel.sendResponse(new RequestChannel.Response(request, new BoundedByteBufferSend(response))) diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index eb25660..40bd3f4 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -264,7 +264,7 @@ class ReplicaManager(config: KafkaConfig, val produceStatus = localProduceResults.mapValues(result => ProduceStatus( - result.info.lastOffset + 1 // required offset + result.info.lastOffset + 1, // required offset ProducerResponseStatus(result.errorCode, result.info.firstOffset)) // response status ) @@ -322,26 +322,20 @@ class ReplicaManager(config: KafkaConfig, 
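The callback logging convention introduced above (warn only for known error codes, since unknown ones were already logged at error level by the replica manager) amounts to a simple filter; the two codes below are placeholders for the ErrorMapping constants.

object ResponseLogging {
  val NoError: Short = 0
  val UnknownCode: Short = -1

  // keep only partitions whose error is known and therefore worth a warning
  def partitionsWorthWarning(responseStatus: Map[String, Short]): Map[String, Short] =
    responseStatus.filter { case (_, error) => error != NoError && error != UnknownCode }
}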
.format(messages.size, topicAndPartition.topic, topicAndPartition.partition, info.firstOffset, info.lastOffset)) (topicAndPartition, LogAppendResult(info)) } catch { - // NOTE: Failed produce requests is not incremented for UnknownTopicOrPartitionException and NotLeaderForPartitionException - // since failed produce requests metric is supposed to indicate failure of a broker in handling a produce request - // for a partition it is the leader for + // NOTE: Failed produce requests metric is not incremented for known exceptions + // it is supposed to indicate un-expected failures of a broker in handling a produce request case e: KafkaStorageException => fatal("Halting due to unrecoverable I/O error while handling produce request: ", e) Runtime.getRuntime.halt(1) (topicAndPartition, null) case utpe: UnknownTopicOrPartitionException => // TODO - warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( - producerRequest.correlationId, producerRequest.clientId, topicAndPartition, utpe.getMessage)) (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, utpe)) case nle: NotLeaderForPartitionException => - warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( - producerRequest.correlationId, producerRequest.clientId, topicAndPartition, nle.getMessage)) (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, nle)) case e: Throwable => BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).failedProduceRequestRate.mark() BrokerTopicStats.getBrokerAllTopicsStats.failedProduceRequestRate.mark() - error("Error processing ProducerRequest with correlation id %d from client %s on partition %s" - .format(producerRequest.correlationId, producerRequest.clientId, topicAndPartition), e) + error("Error processing append operation on partition %s".format(topicAndPartition), e) (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, e)) } } @@ -409,7 +403,7 @@ class ReplicaManager(config: KafkaConfig, readInfo.map { case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) => val partitionDataAndOffsetInfo = try { - trace("Fetching log segment for topic %s, partition %d, offset %ld, size %d" + trace("Fetching log segment for topic %s, partition %d, offset %d, size %d" .format(topic, partition, offset, fetchSize)) // decide whether to only fetch from leader @@ -438,24 +432,18 @@ class ReplicaManager(config: KafkaConfig, LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, ErrorMapping.NoError) } catch { - // NOTE: Failed fetch requests is not incremented for UnknownTopicOrPartitionException, NotLeaderForPartitionException - // and ReplicaNotAvailableException since failed fetch requests metric is supposed to indicate failure of a broker in handling a fetch request - // for a partition it is the leader for + // NOTE: Failed fetch requests metric is not incremented for known exceptions since it + // is supposed to indicate un-expected failure of a broker in handling a fetch request case utpe: UnknownTopicOrPartitionException => // TODO - warn("Fetch request with correlation id %d from client %s on partition [%s,%d] failed due to %s".format( - fetchRequest.correlationId, fetchRequest.clientId, topic, partition, utpe.getMessage)) LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, utpe) case nle: NotLeaderForPartitionException => - warn("Fetch request with correlation id %d from client %s on 
partition [%s,%d] failed due to %s".format( - fetchRequest.correlationId, fetchRequest.clientId, topic, partition, nle.getMessage)) LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, nle) case rnae: ReplicaNotAvailableException => LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, rnae) - case t: Throwable => + case e: Throwable => BrokerTopicStats.getBrokerTopicStats(topic).failedFetchRequestRate.mark() BrokerTopicStats.getBrokerAllTopicsStats.failedFetchRequestRate.mark() - error("Error when processing fetch request for partition [%s,%d] offset %d from %s with correlation id %d. Possible cause: %s" - .format(topic, partition, offset, if (isFetchFromFollower) "follower" else "consumer", fetchRequest.correlationId, t.getMessage)) + error("Error processing fetch operation on partition [%s,%d] offset %d".format(topic, partition, offset)) LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, t) } (TopicAndPartition(topic, partition), partitionDataAndOffsetInfo) -- 1.7.12.4 From 21695aa95c89f40a9d5460a779fe41c67317a8b0 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Sat, 16 Aug 2014 11:13:34 -0700 Subject: [PATCH 07/15] dummy --- core/src/main/scala/kafka/server/KafkaApis.scala | 10 +++++++--- core/src/main/scala/kafka/server/ReplicaManager.scala | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 373f3e5..5aa4264 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -56,7 +56,7 @@ class KafkaApis(val requestChannel: RequestChannel, try{ trace("Handling request: " + request.requestObj + " from client: " + request.remoteAddress) request.requestId match { - case RequestKeys.ProduceKey => handleProducerOrOffsetCommitRequest(request) + case RequestKeys.ProduceKey => handleProducerRequest(request) case RequestKeys.FetchKey => handleFetchRequest(request) case RequestKeys.OffsetsKey => handleOffsetRequest(request) case RequestKeys.MetadataKey => handleTopicMetadataRequest(request) @@ -64,7 +64,7 @@ class KafkaApis(val requestChannel: RequestChannel, case RequestKeys.StopReplicaKey => handleStopReplicaRequest(request) case RequestKeys.UpdateMetadataKey => handleUpdateMetadataRequest(request) case RequestKeys.ControlledShutdownKey => handleControlledShutdownRequest(request) - case RequestKeys.OffsetCommitKey => handleProducerOrOffsetCommitRequest(request) + case RequestKeys.OffsetCommitKey => handleOffsetCommitRequest(request) case RequestKeys.OffsetFetchKey => handleOffsetFetchRequest(request) case RequestKeys.ConsumerMetadataKey => handleConsumerMetadataRequest(request) case requestId => throw new KafkaException("Unknown api code " + requestId) @@ -123,6 +123,10 @@ class KafkaApis(val requestChannel: RequestChannel, requestChannel.sendResponse(new Response(request, new BoundedByteBufferSend(controlledShutdownResponse))) } + def handleOffsetCommitRequest(request: RequestChannel.Request) { + + } + private def producerRequestFromOffsetCommit(offsetCommitRequest: OffsetCommitRequest) = { val msgs = offsetCommitRequest.filterLargeMetadata(config.offsetMetadataMaxSize).map { case (topicAndPartition, offset) => @@ -150,7 +154,7 @@ class KafkaApis(val requestChannel: RequestChannel, /** * Handle a produce request or offset commit request (which is really a specialized producer request) */ - def 
handleProducerOrOffsetCommitRequest(request: RequestChannel.Request) { + def handleProducerRequest(request: RequestChannel.Request) { val produceRequest = request.requestObj.asInstanceOf[ProducerRequest] // the callback for sending the response diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index 40bd3f4..871bfaf 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -221,7 +221,7 @@ class ReplicaManager(config: KafkaConfig, def getReplicaOrException(topic: String, partition: Int): Replica = { val replicaOpt = getReplica(topic, partition) if(replicaOpt.isDefined) - return replicaOpt.get + replicaOpt.get else throw new ReplicaNotAvailableException("Replica %d is not available for partition [%s,%d]".format(config.brokerId, topic, partition)) } @@ -434,7 +434,7 @@ class ReplicaManager(config: KafkaConfig, } catch { // NOTE: Failed fetch requests metric is not incremented for known exceptions since it // is supposed to indicate un-expected failure of a broker in handling a fetch request - case utpe: UnknownTopicOrPartitionException => // TODO + case utpe: UnknownTopicOrPartitionException => LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, utpe) case nle: NotLeaderForPartitionException => LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, nle) -- 1.7.12.4 From 42cdfb3a6ff093970054a38b79bda0ebac02a147 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Mon, 18 Aug 2014 10:40:04 -0700 Subject: [PATCH 08/15] dummy --- core/src/main/scala/kafka/api/FetchRequest.scala | 1 - .../src/main/scala/kafka/server/DelayedFetch.scala | 50 +++++++++------- .../main/scala/kafka/server/DelayedProduce.scala | 30 +++++----- core/src/main/scala/kafka/server/KafkaApis.scala | 39 +++++++++---- .../main/scala/kafka/server/ReplicaManager.scala | 66 +++++++++++----------- .../main/scala/kafka/server/RequestPurgatory.scala | 35 ++++++------ 6 files changed, 123 insertions(+), 98 deletions(-) diff --git a/core/src/main/scala/kafka/api/FetchRequest.scala b/core/src/main/scala/kafka/api/FetchRequest.scala index 51cdccf..56dab5f 100644 --- a/core/src/main/scala/kafka/api/FetchRequest.scala +++ b/core/src/main/scala/kafka/api/FetchRequest.scala @@ -30,7 +30,6 @@ import scala.collection.immutable.Map case class PartitionFetchInfo(offset: Long, fetchSize: Int) - object FetchRequest { val CurrentVersion = 0.shortValue val DefaultMaxWait = 0 diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index 40abebe..c80ed17 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -17,12 +17,10 @@ package kafka.server -import kafka.network.RequestChannel -import kafka.api.{FetchResponse, FetchRequest} +import kafka.api.{FetchResponsePartitionData, PartitionFetchInfo, FetchResponse, FetchRequest} import kafka.common.{UnknownTopicOrPartitionException, NotLeaderForPartitionException, TopicAndPartition} import scala.collection.immutable.Map -import scala.collection.Seq /** * A delayed fetch request, which is satisfied (or more @@ -37,43 +35,49 @@ import scala.collection.Seq * - should return whatever data is available. 
*/ -case class FetchInfo(fetchMinBytes: Int, - fetchOnlyLeader: Boolean, - fetchOnlyCommitted: Boolean, - fetchStartOffsets: Map[TopicAndPartition, LogOffsetMetadata]) { +case class FetchPartitionStatus(startOffsetMetadata: LogOffsetMetadata, fetchInfo: PartitionFetchInfo) { - override def toString = "FetchInfo [minBytes: " + fetchMinBytes + "] : " + - "[committedOnly: " + fetchOnlyCommitted + "] : " - "[startOffsets: " + fetchStartOffsets + "]" + override def toString = "[startOffset: " + startOffsetMetadata + ", " + + "fetchSize: " + fetchSize + "]" +} + +case class FetchMetadata(fetchMinBytes: Int, + fetchOnlyLeader: Boolean, + fetchOnlyCommitted: Boolean, + fetchPartitionStatus: Map[TopicAndPartition, FetchPartitionStatus]) { + + override def toString = "[minBytes: " + fetchMinBytes + ", " + + "committedOnly: " + fetchOnlyCommitted + ", " + "partitionStatus: " + fetchPartitionStatus + "]" } class DelayedFetch(delayMs: Long, - fetchInfo: FetchInfo, + fetchMetadata: FetchMetadata, replicaManager: ReplicaManager, onComplete: Map[TopicAndPartition, PartitionDataAndOffset] => Unit) extends DelayedRequest(delayMs, onComplete) { override def tryComplete() : Boolean = { var accumulatedSize = 0 - fetchInfo.fetchStartOffsets.foreach { + fetchMetadata.fetchPartitionStatus.foreach { case (topicAndPartition, fetchOffset) => try { if (fetchOffset != LogOffsetMetadata.UnknownOffsetMetadata) { val replica = replicaManager.getLeaderReplicaIfLocal(topicAndPartition.topic, topicAndPartition.partition) val endOffset = - if (fetchInfo.fetchOnlyCommitted) + if (fetchMetadata.fetchOnlyCommitted) replica.highWatermark else replica.logEndOffset if (endOffset.offsetOnOlderSegment(fetchOffset)) { // Case C, this can happen when the new follower replica fetching on a truncated leader - debug("Satisfying %s since it is fetching later segments of partition %s.".format(fetchInfo, topicAndPartition)) + debug("Satisfying fetch %s since it is fetching later segments of partition %s.".format(fetchMetadata, topicAndPartition)) return super.tryComplete() } else if (fetchOffset.offsetOnOlderSegment(endOffset)) { // Case C, this can happen when the folloer replica is lagging too much - debug("Satisfying %s immediately since it is fetching older segments.".format(fetchInfo)) + debug("Satisfying fetch %s immediately since it is fetching older segments.".format(fetchMetadata)) return super.tryComplete() } else if (fetchOffset.precedes(endOffset)) { accumulatedSize += endOffset.positionDiff(fetchOffset) @@ -81,24 +85,30 @@ class DelayedFetch(delayMs: Long, } } catch { case utpe: UnknownTopicOrPartitionException => // Case A - debug("Broker no longer know of %s, satisfy %s immediately".format(topicAndPartition, fetchInfo)) + debug("Broker no longer know of %s, satisfy %s immediately".format(topicAndPartition, fetchMetadata)) return super.tryComplete() case nle: NotLeaderForPartitionException => // Case B - debug("Broker is no longer the leader of %s, satisfy %s immediately".format(topicAndPartition, fetchInfo)) + debug("Broker is no longer the leader of %s, satisfy %s immediately".format(topicAndPartition, fetchMetadata)) return super.tryComplete() } } // Case D - if (accumulatedSize >= fetchInfo.fetchMinBytes) + if (accumulatedSize >= fetchMetadata.fetchMinBytes) super.tryComplete() else false } override def onExpired() { + debug("Expire fetch %s and return whatever fetch data is available".format(fetchMetadata)) + // read whatever data is available and return - val readData = replicaManager.readMessageSets(fetch) - 
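The case analysis in DelayedFetch.tryComplete above reduces to: any of cases A-C completes the fetch on its own, while case D completes it only once enough bytes have accumulated. A sketch with placeholder types (not the real replica or offset metadata):

object FetchCompletion {
  sealed trait Check
  case object UnknownPartition   extends Check     // Case A: broker no longer knows the partition
  case object NotLeaderAnymore   extends Check     // Case B: broker is no longer the leader
  case object FetchingOldSegment extends Check     // Case C: fetching an older or truncated segment
  case class  BytesAvailable(n: Int) extends Check // Case D: bytes readable beyond the fetch offset

  def shouldComplete(checks: Seq[Check], fetchMinBytes: Int): Boolean =
    checks.exists {
      case UnknownPartition | NotLeaderAnymore | FetchingOldSegment => true
      case _ => false
    } || checks.collect { case BytesAvailable(n) => n }.sum >= fetchMinBytes
}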
onComplete(readData) + val logReadResults = replicaManager.readFromLocalLog(fetchMetadata.fetchOnlyLeader, + fetchMetadata.fetchOnlyCommitted, + fetchMetadata.fetchPartitionStatus.mapValues(status => status.fetchInfo)) + val fetchPartitionData = logReadResults.mapValues(result => + FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) + onComplete(fetchPartitionData) } } \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala index 1b6ad16..0ec0055 100644 --- a/core/src/main/scala/kafka/server/DelayedProduce.scala +++ b/core/src/main/scala/kafka/server/DelayedProduce.scala @@ -21,11 +21,9 @@ import kafka.api._ import kafka.common.ErrorMapping import kafka.common.TopicAndPartition import kafka.utils.Logging -import kafka.network.RequestChannel import scala.Some import scala.collection.immutable.Map -import scala.collection.Seq /** A delayed produce request, which is satisfied (or more * accurately, unblocked) -- if for every partition it produce to: @@ -35,28 +33,28 @@ import scala.collection.Seq * B.2 - else, at least requiredAcks replicas should be caught up to this request. */ -case class ProduceStatus(requiredOffset: Long, responseStatus: ProducerResponseStatus) { +case class ProducePartitionStatus(requiredOffset: Long, responseStatus: ProducerResponseStatus) { @volatile var acksPending = false - override def toString = "acksPending:%b, error: %d, startOffset: %d, requiredOffset: %d" + override def toString = "[acksPending: %b, error: %d, startOffset: %d, requiredOffset: %d]" .format(acksPending, responseStatus.error, responseStatus.offset, requiredOffset) } -case class ProduceInfo(produceRequiredAcks: Short, - produceStatus: Map[TopicAndPartition, ProduceStatus]) { +case class ProduceMetadata(produceRequiredAcks: Short, + produceStatus: Map[TopicAndPartition, ProducePartitionStatus]) { - override def toString = "ProduceInfo [requiredBytes: " + fetchMinBytes + "] : " + - "[partitionStatus: " + produceStatus + "]" + override def toString = "[requiredAcks: %d, partitionStatus: %s]" + .format(produceRequiredAcks, produceStatus) } class DelayedProduce(delayMs: Long, - produceInfo: ProduceInfo, + produceMetadata: ProduceMetadata, replicaManager: ReplicaManager, onComplete: Map[TopicAndPartition, ProducerResponseStatus] => Unit) extends DelayedRequest(delayMs) with Logging { // first update the acks pending variable according to the error code - produceInfo.produceStatus foreach { case (topicAndPartition, status) => + produceMetadata.produceStatus foreach { case (topicAndPartition, status) => if (status.responseStatus.error == ErrorMapping.NoError) { // Timeout error state will be cleared when required acks are received status.acksPending = true @@ -70,8 +68,8 @@ class DelayedProduce(delayMs: Long, def tryComplete(): Boolean = { // check for each partition if it still has pending acks - produceInfo.produceStatus.foreach { case (topicAndPartition, status) => - trace("Checking producer request satisfaction for %s, acksPending = %b" + produceMetadata.produceStatus.foreach { case (topicAndPartition, status) => + trace("Checking produce satisfaction for %s, acksPending = %b" .format(topicAndPartition, status.acksPending)) // skip those partitions that have already been satisfied if (status.acksPending) { @@ -80,7 +78,7 @@ class DelayedProduce(delayMs: Long, case Some(partition) => partition.checkEnoughReplicasReachOffset( status.requiredOffset, - 
produceInfo.produceRequiredAcks) + produceMetadata.produceRequiredAcks) case None => (false, ErrorMapping.UnknownTopicOrPartitionCode) } @@ -95,15 +93,17 @@ class DelayedProduce(delayMs: Long, } // unblocked if there are no partitions with pending acks - if (! produceInfo.produceStatus.values.exists(p => p.acksPending)) + if (! produceMetadata.produceStatus.values.exists(p => p.acksPending)) super.tryComplete() else false } override def onExpired() { + debug("Expire produce %s and return the error codes".format(produceMetadata)) + // return the current response status - val responseStatus = produceInfo.produceStatus.mapValues(status => status.responseStatus) + val responseStatus = produceMetadata.produceStatus.mapValues(status => status.responseStatus) onComplete(responseStatus) } } diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 5aa4264..4c03d2c 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -124,7 +124,21 @@ class KafkaApis(val requestChannel: RequestChannel, } def handleOffsetCommitRequest(request: RequestChannel.Request) { + val offsetCommitRequest = request.requestObj.asInstanceOf[OffsetCommitRequest] + // the callback for sending the response + def sendResponseCallback(responseStatus: Map[TopicAndPartition, ProducerResponseStatus]) { + var errorInResponse = false + responseStatus.foreach { case (topicAndPartition, status) => + // Here we only print warnings for known errors; if it is unknown, it will cause + // an error message in the replica manager already and hence can be ignored here + if (status.error != ErrorMapping.NoError && status.error != ErrorMapping.UnknownCode) { + warn("Produce request with correlation id %d from client %s on partition %s failed due to %s" + .format(produceRequest.correlationId, produceRequest.clientId, + topicAndPartition, ErrorMapping.exceptionNameFor(status.error))) + errorInResponse = true + } + } } private def producerRequestFromOffsetCommit(offsetCommitRequest: OffsetCommitRequest) = { @@ -164,20 +178,20 @@ class KafkaApis(val requestChannel: RequestChannel, // Here we only print warnings for known errors; if it is unknown, it will cause // an error message in the replica manager already and hence can be ignored here if (status.error != ErrorMapping.NoError && status.error != ErrorMapping.UnknownCode) { - warn("Produce request with correlation id %d from client %s on partition %s failed due to %s".format( - produceRequest.correlationId, produceRequest.clientId, topicAndPartition, ErrorMapping.exceptionNameFor(status.error))) + warn("Produce request with correlation id %d from client %s on partition %s failed due to %s" + .format(produceRequest.correlationId, produceRequest.clientId, + topicAndPartition, ErrorMapping.exceptionNameFor(status.error))) errorInResponse = true } } if(produceRequest.requiredAcks == 0) { - // no operation needed if producer request.required.acks = 0; however, if there is any exception in handling the request, since - // no response is expected by the producer the handler will send a close connection response to the socket server - // to close the socket so that the producer client will know that some exception has happened and will refresh its metadata + // no operation needed if producer request.required.acks = 0; however, if there is any error in handling + // the request, since no response is expected by the producer, the server will close socket server so that + // the 
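The completion rule for DelayedProduce above can be sketched as follows; PartitionAck and the replicatedOffset function are simplified stand-ins for ProducePartitionStatus and Partition.checkEnoughReplicasReachOffset, and the error codes are placeholders.

object ProduceCompletion {
  final class PartitionAck(val requiredOffset: Long) {
    @volatile var acksPending: Boolean = true
    @volatile var errorCode: Short = 7 // stand-in for a timeout code until acked
  }

  def tryComplete(status: Map[String, PartitionAck],
                  replicatedOffset: String => Long): Boolean = {
    status.foreach { case (tp, ack) =>
      if (ack.acksPending && replicatedOffset(tp) >= ack.requiredOffset) {
        ack.acksPending = false
        ack.errorCode = 0.toShort // NoError once enough replicas have caught up
      }
    }
    // the produce completes once no partition is still waiting for acks
    !status.values.exists(_.acksPending)
  }
}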
producer client will know that some error has happened and will refresh its metadata if (errorInResponse) { - info(("Send the close connection response due to error handling produce request " + - "[clientId = %s, correlationId = %s, topicAndPartition = %s] with Ack=0") - .format(produceRequest.clientId, produceRequest.correlationId, produceRequest.topicPartitionMessageSizeMap.keySet.mkString(","))) + info("Close connection due to error handling produce request with correlation id %d from client id %s with ack=0" + .format(produceRequest.correlationId, produceRequest.clientId)) requestChannel.closeConnection(request.processor, request) } else { requestChannel.noOperation(request.processor, request) @@ -207,12 +221,13 @@ class KafkaApis(val requestChannel: RequestChannel, // the callback for sending the response def sendResponseCallback(responsePartitionData: Map[TopicAndPartition, FetchResponsePartitionData]) { - responsePartitionData.foreach { case (topicAndPartition, response) => + responsePartitionData.foreach { case (topicAndPartition, data) => // Here we only print warnings for known errors; if it is unknown, it will cause // an error message in the replica manager already and hence can be ignored here - if (status.error != ErrorMapping.NoError && status.error != ErrorMapping.UnknownCode) { - warn("Fetch request with correlation id %d from client %s on partition %s failed due to %s".format( - fetchRequest.correlationId, fetchRequest.clientId, topicAndPartition, ErrorMapping.exceptionNameFor(response.error))) + if (data.error != ErrorMapping.NoError && data.error != ErrorMapping.UnknownCode) { + warn("Fetch request with correlation id %d from client %s on partition %s failed due to %s" + .format(fetchRequest.correlationId, fetchRequest.clientId, + topicAndPartition, ErrorMapping.exceptionNameFor(data.error))) } } diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index 871bfaf..0eda9bd 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -52,7 +52,7 @@ case class LogAppendResult(info: LogAppendInfo, error: Option[Throwable] = None) } } -case class LogReadResult(info: FetchDataInfo, hw: Long, error: Option[Throwable] = None) { +case class LogReadResult(info: FetchDataInfo, hw: Long, readSize: Int, error: Option[Throwable] = None) { def errorCode = error match { case None => ErrorMapping.NoError case Some(e) => ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) @@ -254,7 +254,7 @@ class ReplicaManager(config: KafkaConfig, * the callback function will be triggered either when timeout or the required acks are satisfied */ def appendMessages(timeout: Long, - requiredAcks : Short, + requiredAcks: Short, messagesPerPartition: Map[TopicAndPartition, MessageSet], callbackOnComplete: Map[TopicAndPartition, ProducerResponseStatus] => Unit) { @@ -263,7 +263,7 @@ class ReplicaManager(config: KafkaConfig, debug("Produce to local log in %d ms".format(SystemTime.milliseconds - sTime)) val produceStatus = localProduceResults.mapValues(result => - ProduceStatus( + ProducePartitionStatus( result.info.lastOffset + 1, // required offset ProducerResponseStatus(result.errorCode, result.info.firstOffset)) // response status ) @@ -280,10 +280,11 @@ class ReplicaManager(config: KafkaConfig, callbackOnComplete(produceResponseStatus) } else { // create delayed produce operation and try to watch it in the purgatory - val delayedRequest = new 
DelayedProduce(timeout, ProduceInfo(requiredAcks, produceStatus), this, callbackOnComplete) + val produceMetadata = ProduceMetadata(requiredAcks, produceStatus) + val delayedProduce = new DelayedProduce(timeout, produceMetadata, this, callbackOnComplete) val producerRequestKeys = messagesPerPartition.keys.map(TopicPartitionRequestKey(_)).toSeq - val completedByMe = producerRequestPurgatory.tryCompleteElseWatch(delayedRequest, producerRequestKeys) + val completedByMe = producerRequestPurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys) if (completedByMe) { val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) callbackOnComplete(produceResponseStatus) @@ -294,7 +295,7 @@ class ReplicaManager(config: KafkaConfig, /** * Append the messages to the local replica logs */ - private def appendToLocalLog(messagesPerPartition: Map[TopicAndPartition, MessageSet]): Map[TopicAndPartition, LogAppendResult] = { + private def appendToLocalLog(messagesPerPartition: Map[TopicAndPartition, MessageSet]) = { trace("Append [%s] to local log ".format(messagesPerPartition)) messagesPerPartition.map { case (topicAndPartition, messages) => try { @@ -303,7 +304,7 @@ class ReplicaManager(config: KafkaConfig, case Some(partition) => partition.appendMessagesToLeader(messages.asInstanceOf[ByteBufferMessageSet]) case None => throw new UnknownTopicOrPartitionException("Partition %s doesn't exist on %d" - .format(topicAndPartition, brokerId)) + .format(topicAndPartition, localBrokerId)) } val numAppendedMessages = @@ -328,7 +329,7 @@ class ReplicaManager(config: KafkaConfig, fatal("Halting due to unrecoverable I/O error while handling produce request: ", e) Runtime.getRuntime.halt(1) (topicAndPartition, null) - case utpe: UnknownTopicOrPartitionException => // TODO + case utpe: UnknownTopicOrPartitionException => (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, utpe)) case nle: NotLeaderForPartitionException => (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, nle)) @@ -355,7 +356,7 @@ class ReplicaManager(config: KafkaConfig, val fetchOnlyCommitted: Boolean = ! 
Request.isValidBrokerId(fetchRequest.replicaId) // read from local logs - val fetchResults = readFromLocalLog(fetchOnlyLeader, fetchOnlyCommitted, fetchInfo) + val logReadResults = readFromLocalLog(fetchOnlyLeader, fetchOnlyCommitted, fetchInfo) // if the fetch comes from the follower, // update its corresponding log end offset @@ -363,24 +364,26 @@ class ReplicaManager(config: KafkaConfig, recordFollowerLogEndOffsets(replicaId, dataRead.mapValues(_.offset)) // check if this fetch request can be satisfied right away - val bytesReadable = fetchResults.values.map(_.info.messageSet.sizeInBytes).sum - val errorReadingData = fetchResults.values.foldLeft(false) ((errorIncurred, readResult) => + val bytesReadable = logReadResults.values.map(_.info.messageSet.sizeInBytes).sum + val errorReadingData = logReadResults.values.foldLeft(false) ((errorIncurred, readResult) => errorIncurred || (readResult.errorCode != ErrorMapping.NoError)) + // send the data immediately if 1) fetch request does not want to wait // 2) fetch request does not require any data // 3) has enough data to respond // 4) some error happens while reading data - if(timeout <= 0 || - fetchInfo.size <= 0 || - bytesReadable >= fetchMinBytes || - errorReadingData) { - val fetchPartitionData = fetchResults.mapValues(result => FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) + if(timeout <= 0 || fetchInfo.size <= 0 || bytesReadable >= fetchMinBytes || errorReadingData) { + val fetchPartitionData = logReadResults.mapValues(result => + FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) callbackOnComplete(fetchPartitionData) } else { - val fetchStartOffsets = fetchResults.mapValues(result => result.info.fetchOffset) - val delayedFetch = new DelayedFetch(time, FetchInfo(fetchMinBytes, fetchOnlyLeader, fetchOnlyCommitted, fetchStartOffsets), this, callbackOnComplete) + // construct the fetch results from the read results + val fetchPartitionStatus = logReadResults.mapValues(result => FetchPartitionStatus(result.info.fetchOffset, result.readSize)) + val fetchMetadata = FetchMetadata(fetchMinBytes, fetchOnlyLeader, fetchOnlyCommitted, fetchPartitionStatus) + val delayedFetch = new DelayedFetch(time, fetchMetadata, this, callbackOnComplete) + // create a list of (topic, partition) pairs to use as keys for this delayed request - val delayedFetchKeys = fetchInfo.keys.map(new TopicPartitionRequestKey(_)).toSeq + val delayedFetchKeys = fetchPartitionStatus.keys.map(new TopicPartitionRequestKey(_)).toSeq // add the fetch request for watch if it's not satisfied, otherwise send the response back val completedByMe = fetchRequestPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys) @@ -396,15 +399,14 @@ class ReplicaManager(config: KafkaConfig, /** * Read from a single topic/partition at the given offset upto maxSize bytes */ - private def readFromLocalLog(readOnlyIfLeader: Boolean, - readOnlyCommitted: Boolean, - readInfo: Map[TopicAndPartition, PartitionFetchInfo]): Map[TopicAndPartition, LogReadResult] = { + def readFromLocalLog(readOnlyIfLeader: Boolean, + readOnlyCommitted: Boolean, + readPartitionInfo: Map[TopicAndPartition, PartitionFetchInfo]) = { - readInfo.map { case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) => + readPartitionInfo.map { case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) => val partitionDataAndOffsetInfo = try { - trace("Fetching log segment for topic %s, partition %d, offset %d, size %d" - 
.format(topic, partition, offset, fetchSize)) + trace("Fetching log segment for topic %s, partition %d, offset %d, size %d".format(topic, partition, offset, fetchSize)) // decide whether to only fetch from leader val localReplica = if (readOnlyIfLeader) @@ -428,23 +430,23 @@ class ReplicaManager(config: KafkaConfig, } BrokerTopicStats.getBrokerTopicStats(topic).bytesOutRate.mark(logReadInfo.messageSet.sizeInBytes) - BrokerTopicStats.getBrokerAllTopicsStats.bytesOutRate.mark(logReadInfo.messageSet.sizeInBytes) + BrokerTopicStats.getBrokerAllTopicsStats().bytesOutRate.mark(logReadInfo.messageSet.sizeInBytes) - LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, ErrorMapping.NoError) + LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, fetchSize, ErrorMapping.NoError) } catch { // NOTE: Failed fetch requests metric is not incremented for known exceptions since it // is supposed to indicate un-expected failure of a broker in handling a fetch request case utpe: UnknownTopicOrPartitionException => - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, utpe) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, utpe) case nle: NotLeaderForPartitionException => - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, nle) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, nle) case rnae: ReplicaNotAvailableException => - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, rnae) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, rnae) case e: Throwable => BrokerTopicStats.getBrokerTopicStats(topic).failedFetchRequestRate.mark() - BrokerTopicStats.getBrokerAllTopicsStats.failedFetchRequestRate.mark() + BrokerTopicStats.getBrokerAllTopicsStats().failedFetchRequestRate.mark() error("Error processing fetch operation on partition [%s,%d] offset %d".format(topic, partition, offset)) - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, t) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, t) } (TopicAndPartition(topic, partition), partitionDataAndOffsetInfo) } diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala index cbf32be..e459e55 100644 --- a/core/src/main/scala/kafka/server/RequestPurgatory.scala +++ b/core/src/main/scala/kafka/server/RequestPurgatory.scala @@ -38,17 +38,24 @@ abstract class DelayedRequest(delayMs: Long, onComplete: Any => Unit) extends De val completed = new AtomicBoolean(false) /* - * Check if the delayed operation is already completed + * Check if the delayed operation can be completed * * Note that concurrent threads can check if an operation can be completed or not, - * but only the first thread will succeed in completing the operation + * but only the first thread will succeed in completing the operation and return + * true, others will still return false */ def tryComplete(): Boolean = completed.compareAndSet(false, true) + /** + * Check if the delayed operation is already completed + */ + def isCompleted(): Boolean = completed.get() + /* * When delayMs has elapsed, expire the delayed operation */ - def onExpired() = onComplete(null) + def onExpired(): Unit + } /** @@ -61,9 +68,6 @@ abstract class 
RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt /* a list of requests watching each key */ private val watchersForKey = new Pool[Any, Watchers](Some((key: Any) => new Watchers)) - /* the number of requests being watched, duplicates added on different watchers are also counted */ - private val watched = new AtomicInteger(0) - /* background thread expiring requests that have been waiting too long */ private val expirationReaper = new ExpiredOperationReaper @@ -99,7 +103,7 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt for(key <- watchKeys) { // if the operation is already completed, stopping adding it to // any further lists and return false - if (operation.completed.get()) + if (operation.isCompleted()) return false val watchers = watchersFor(key) // if the operation is completed by myself, stop adding it to @@ -110,8 +114,7 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt } // if it is indeed watched, add to the expire queue also - if (! operation.completed.get()) { - watched.getAndIncrement() + if (! operation.isCompleted()) { expirationReaper.enqueue(operation) } @@ -176,7 +179,7 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt var purged = 0 while (iter.hasNext) { val curr = iter.next - if(curr.completed.get()) { + if(curr.isCompleted()) { iter.remove() purged += 1 } @@ -192,16 +195,14 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt val iter = requests.iterator() while(iter.hasNext) { val curr = iter.next - if (curr.completed.get()) { + if (curr.isCompleted()) { // another thread has completed this request, just remove it iter.remove() } else { val completed = curr.tryComplete() if(completed) { iter.remove() - watched.getAndDecrement() response += curr - expirationReaper.satisfyRequest() } } } @@ -230,7 +231,6 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt */ def enqueue(t: T) { delayed.add(t) - unsatisfied.incrementAndGet() } /** @@ -239,7 +239,7 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt private def pollExpired(): T = { while (true) { val curr = delayed.poll(200L, TimeUnit.MILLISECONDS) - if (curr == null) + if (curr == null.asInstanceOf[T]) return null.asInstanceOf[T] // try set the operation failed (and hence completed), if succeed return it; // otherwise try to get the next expired operation since this one has been completed by others @@ -260,7 +260,7 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt val iter = delayed.iterator() while (iter.hasNext) { val curr = iter.next() - if (curr.completed.get()) { + if (curr.isCompleted()) { iter.remove() purged += 1 } @@ -272,7 +272,7 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt override def doWork() { val curr = pollExpired() - if (curr != null) { + if (curr != null.asInstanceOf[T]) { curr.onExpired() } if (size() >= purgeInterval) { // see if we need to force a full purge @@ -284,5 +284,4 @@ abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInt } } } - } -- 1.7.12.4 From c591ccbf9a354dd7efcd3f8e2a925d70f7626ae2 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Mon, 18 Aug 2014 14:57:03 -0700 Subject: [PATCH 09/15] tons of compilation errors --- .../main/scala/kafka/api/OffsetCommitRequest.scala | 24 ++---- .../src/main/scala/kafka/server/DelayedFetch.scala | 5 ++ 
.../main/scala/kafka/server/DelayedProduce.scala | 5 +- core/src/main/scala/kafka/server/KafkaApis.scala | 54 +++++-------- .../main/scala/kafka/server/OffsetManager.scala | 90 ++++++++++++++++++++-- .../main/scala/kafka/server/ReplicaManager.scala | 80 ++++++++----------- .../main/scala/kafka/server/RequestPurgatory.scala | 2 +- .../scala/unit/kafka/server/SimpleFetchTest.scala | 4 +- 8 files changed, 150 insertions(+), 114 deletions(-) diff --git a/core/src/main/scala/kafka/api/OffsetCommitRequest.scala b/core/src/main/scala/kafka/api/OffsetCommitRequest.scala index 861a6cf..050615c 100644 --- a/core/src/main/scala/kafka/api/OffsetCommitRequest.scala +++ b/core/src/main/scala/kafka/api/OffsetCommitRequest.scala @@ -78,28 +78,12 @@ case class OffsetCommitRequest(groupId: String, groupGenerationId: Int = org.apache.kafka.common.requests.OffsetCommitRequest.DEFAULT_GENERATION_ID, consumerId: String = org.apache.kafka.common.requests.OffsetCommitRequest.DEFAULT_CONSUMER_ID) extends RequestOrResponse(Some(RequestKeys.OffsetCommitKey)) { + assert(versionId == 0 || versionId == 1, "Version " + versionId + " is invalid for OffsetCommitRequest. Valid versions are 0 or 1.") lazy val requestInfoGroupedByTopic = requestInfo.groupBy(_._1.topic) - def filterLargeMetadata(maxMetadataSize: Int) = - requestInfo.filter(info => info._2.metadata == null || info._2.metadata.length <= maxMetadataSize) - - def responseFor(errorCode: Short, offsetMetadataMaxSize: Int) = { - val commitStatus = requestInfo.map {info => - (info._1, if (info._2.metadata != null && info._2.metadata.length > offsetMetadataMaxSize) - ErrorMapping.OffsetMetadataTooLargeCode - else if (errorCode == ErrorMapping.UnknownTopicOrPartitionCode) - ErrorMapping.ConsumerCoordinatorNotAvailableCode - else if (errorCode == ErrorMapping.NotLeaderForPartitionCode) - ErrorMapping.NotCoordinatorForConsumerCode - else - errorCode) - }.toMap - OffsetCommitResponse(commitStatus, correlationId) - } - def writeTo(buffer: ByteBuffer) { // Write envelope buffer.putShort(versionId) @@ -150,8 +134,10 @@ case class OffsetCommitRequest(groupId: String, override def handleError(e: Throwable, requestChannel: RequestChannel, request: RequestChannel.Request): Unit = { val errorCode = ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) - val errorResponse = responseFor(errorCode, Int.MaxValue) - requestChannel.sendResponse(new Response(request, new BoundedByteBufferSend(errorResponse))) + val commitStatus = requestInfo.mapValues(_ => errorCode) + val commitResponse = OffsetCommitResponse(commitStatus, correlationId) + + requestChannel.sendResponse(new Response(request, new BoundedByteBufferSend(commitResponse))) } override def describe(details: Boolean): String = { diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index c80ed17..5cff718 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -102,13 +102,18 @@ class DelayedFetch(delayMs: Long, override def onExpired() { debug("Expire fetch %s and return whatever fetch data is available".format(fetchMetadata)) + completeFetch() + } + def completeFetch() { // read whatever data is available and return val logReadResults = replicaManager.readFromLocalLog(fetchMetadata.fetchOnlyLeader, fetchMetadata.fetchOnlyCommitted, fetchMetadata.fetchPartitionStatus.mapValues(status => status.fetchInfo)) + val fetchPartitionData = logReadResults.mapValues(result => 
FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) + onComplete(fetchPartitionData) } } \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala index 0ec0055..bbb176c 100644 --- a/core/src/main/scala/kafka/server/DelayedProduce.scala +++ b/core/src/main/scala/kafka/server/DelayedProduce.scala @@ -66,7 +66,7 @@ class DelayedProduce(delayMs: Long, trace("Initial partition status for %s is %s".format(topicAndPartition, status)) } - def tryComplete(): Boolean = { + override def tryComplete(): Boolean = { // check for each partition if it still has pending acks produceMetadata.produceStatus.foreach { case (topicAndPartition, status) => trace("Checking produce satisfaction for %s, acksPending = %b" @@ -101,7 +101,10 @@ class DelayedProduce(delayMs: Long, override def onExpired() { debug("Expire produce %s and return the error codes".format(produceMetadata)) + completeProduce() + } + def completeProduce() { // return the current response status val responseStatus = produceMetadata.produceStatus.mapValues(status => status.responseStatus) onComplete(responseStatus) diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 4c03d2c..d6dfcee 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -42,12 +42,8 @@ class KafkaApis(val requestChannel: RequestChannel, val config: KafkaConfig, val controller: KafkaController) extends Logging { - val producerRequestPurgatory = new ProducerRequestPurgatory(replicaManager, offsetManager, requestChannel) - val fetchRequestPurgatory = new FetchRequestPurgatory(replicaManager, requestChannel) - // TODO: the following line will be removed in 0.9 - replicaManager.initWithRequestPurgatory(producerRequestPurgatory, fetchRequestPurgatory) - var metadataCache = new MetadataCache this.logIdent = "[KafkaApi-%d] ".format(brokerId) + val metadataCache = new MetadataCache /** * Top-level method that handles all requests and multiplexes to the right api @@ -127,42 +123,28 @@ class KafkaApis(val requestChannel: RequestChannel, val offsetCommitRequest = request.requestObj.asInstanceOf[OffsetCommitRequest] // the callback for sending the response - def sendResponseCallback(responseStatus: Map[TopicAndPartition, ProducerResponseStatus]) { - var errorInResponse = false - responseStatus.foreach { case (topicAndPartition, status) => + def sendResponseCallback(responseStatus: Map[TopicAndPartition, Short]) { + responseStatus.foreach { case (topicAndPartition, errorCode) => // Here we only print warnings for known errors; if it is unknown, it will cause // an error message in the replica manager already and hence can be ignored here - if (status.error != ErrorMapping.NoError && status.error != ErrorMapping.UnknownCode) { - warn("Produce request with correlation id %d from client %s on partition %s failed due to %s" - .format(produceRequest.correlationId, produceRequest.clientId, - topicAndPartition, ErrorMapping.exceptionNameFor(status.error))) - errorInResponse = true + if (errorCode != ErrorMapping.NoError && errorCode != ErrorMapping.UnknownCode) { + warn("Offset commit request with correlation id %d from client %s on partition %s failed due to %s" + .format(offsetCommitRequest.correlationId, offsetCommitRequest.clientId, + topicAndPartition, ErrorMapping.exceptionNameFor(errorCode))) } } - } - private def 
producerRequestFromOffsetCommit(offsetCommitRequest: OffsetCommitRequest) = { - val msgs = offsetCommitRequest.filterLargeMetadata(config.offsetMetadataMaxSize).map { - case (topicAndPartition, offset) => - new Message( - bytes = OffsetManager.offsetCommitValue(offset), - key = OffsetManager.offsetCommitKey(offsetCommitRequest.groupId, topicAndPartition.topic, topicAndPartition.partition) - ) - }.toSeq - - val producerData = mutable.Map( - TopicAndPartition(OffsetManager.OffsetsTopicName, offsetManager.partitionFor(offsetCommitRequest.groupId)) -> - new ByteBufferMessageSet(config.offsetsTopicCompressionCodec, msgs:_*) - ) - - val request = ProducerRequest( - correlationId = offsetCommitRequest.correlationId, - clientId = offsetCommitRequest.clientId, - requiredAcks = config.offsetCommitRequiredAcks, - ackTimeoutMs = config.offsetCommitTimeoutMs, - data = producerData) - trace("Created producer request %s for offset commit request %s.".format(request, offsetCommitRequest)) - request + val response = OffsetCommitResponse(commitStatus, offsetCommitRequest.correlationId) + requestChannel.sendResponse(new RequestChannel.Response(request, new BoundedByteBufferSend(response))) + } + + // call offset manager to store offsets + offsetManager.storeOffsets( + offsetCommitRequest.groupId, + offsetCommitRequest.consumerId, + offsetCommitRequest.groupGenerationId, + offsetCommitRequest.requestInfo, + sendResponseCallback) } /** diff --git a/core/src/main/scala/kafka/server/OffsetManager.scala b/core/src/main/scala/kafka/server/OffsetManager.scala index 43eb2a3..c4d6747 100644 --- a/core/src/main/scala/kafka/server/OffsetManager.scala +++ b/core/src/main/scala/kafka/server/OffsetManager.scala @@ -35,11 +35,13 @@ import scala.collection._ import java.io.PrintStream import java.util.concurrent.atomic.AtomicBoolean import java.nio.ByteBuffer -import java.util.Properties +import java.util.{Collections, Properties} import java.util.concurrent.TimeUnit import com.yammer.metrics.core.Gauge import org.I0Itec.zkclient.ZkClient +import kafka.api.{ProducerResponse, ProducerResponseStatus} +import kafka.network.{BoundedByteBufferSend, RequestChannel} /** @@ -192,13 +194,87 @@ class OffsetManager(val config: OffsetManagerConfig, offsetsCache.put(key, offsetAndMetadata) } - def putOffsets(group: String, offsets: Map[TopicAndPartition, OffsetAndMetadata]) { - // this method is called _after_ the offsets have been durably appended to the commit log, so there is no need to - // check for current leadership as we do for the offset fetch - trace("Putting offsets %s for group %s in offsets partition %d.".format(offsets, group, partitionFor(group))) - offsets.foreach { case (topicAndPartition, offsetAndMetadata) => - putOffset(GroupTopicPartition(group, topicAndPartition), offsetAndMetadata) + /** + * Store offsets by appending it to the replicated log and then inserting to cache + */ + def storeOffsets(groupName: String, + consumerId: String, + generationId: Int, + offsetMetadata: Map[TopicAndPartition, OffsetAndMetadata], + callbackOnComplete: Map[TopicAndPartition, Short] => Unit) { + // TODO: generation id and consumer id is needed by coordinator to do consumer checking + + // first filter out partitions with offset metadata size exceeding limit + // TODO: in the future we may want to only support atomic commit and hence fail the whole commit when this happens + var commitStatus = offsetMetadata.mapValues { offsetAndMetadata => + if (offsetAndMetadata.metadata != null && offsetAndMetadata.metadata.lengh > 
maxMetadataSize) + ErrorMapping.OffsetMetadataTooLargeCode + else + ErrorMapping.NoError + } + + val filteredOffsetMetadata = offsetMetadata.filter { case (topicAndPartition, offsetAndMetadata) => + commitStatus.get(TopicAndPartition).get == ErrorMapping.NoError + } + + // construct the message set to append + val messages = filteredOffsetMetadata.map { case (topicAndPartition, offsetAndMetadata) => + new Message( + key = OffsetManager.offsetCommitKey(groupName, topicAndPartition.topic, topicAndPartition.partition), + bytes = OffsetManager.offsetCommitValue(offsetAndMetadata) + ) + }.toSeq + + val offsetTopicPartition = TopicAndPartition(OffsetsTopicName, partitionFor(groupName)) + + val messageSet = Collections.singletonMap(offsetTopicPartition, + new ByteBufferMessageSet(config.offsetsTopicCompressionCodec, messages:_*)) + + // set the callback function to insert offsets into cache after log append completed + def putCacheCallback(responseStatus: Map[TopicAndPartition, ProducerResponseStatus]) { + // the append response should only contain the topics partition + if (responseStatus.size != 1 || ! responseStatus.contains(offsetTopicPartition)) + throw new IllegalStateException("Append status %s should only have one partition %s" + .format(responseStatus, offsetTopicPartition)) + + // construct the commit response status and insert + // the offset and metadata to cache iff the append status has no error + val status = responseStatus.get(offsetTopicPartition).get + + if (status.error == ErrorMapping.NoError) { + filteredOffsetMetadata.foreach { case (topicAndPartition, offsetAndMetadata) => + putOffset(GroupTopicPartition(group, topicAndPartition), offsetAndMetadata) + } + } else { + debug("Offset commit %s from group %s consumer %s with generation %d failed when appending to log due to %s" + .format(filteredOffsetMetadata, groupName, consumerId, generationId, ErrorMapping.exceptionNameFor(status.error))) + + // update the commit status error code with the corresponding log append error code + val commitErrorCode = + if (status.error == ErrorMapping.UnknownTopicOrPartitionCode) + ErrorMapping.ConsumerCoordinatorNotAvailableCode + else if (status.error == ErrorMapping.NotLeaderForPartitionCode) + ErrorMapping.NotCoordinatorForConsumerCode + else + status.error + + commitStatus = commitStatus.mapValues { case errorCode => + if (errorCode == ErrorMapping.NoError) + commitErrorCode + else + errorCode + } + } + + callbackOnComplete(commitStatus) } + + // call replica manager to append the offset messages + replicaManager.appendMessages( + config.offsetCommitTimeoutMs.toLong, + config.offsetCommitRequiredAcks, + messageSet, + putCacheCallback) } /** diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index 0eda9bd..c542ded 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -77,8 +77,9 @@ class ReplicaManager(config: KafkaConfig, this.logIdent = "[Replica Manager on Broker " + localBrokerId + "]: " val stateChangeLogger = KafkaController.stateChangeLogger - var producerRequestPurgatory: ProducerRequestPurgatory = null - var fetchRequestPurgatory: FetchRequestPurgatory = null + val producerRequestPurgatory = new RequestPurgatory[DelayedProduce](brokerId, config.producerPurgatoryPurgeIntervalRequests) + val fetchRequestPurgatory = new RequestPurgatory[DelayedFetch](brokerId, config.fetchPurgatoryPurgeIntervalRequests) + newGauge( "LeaderCount", @@ 
-113,37 +114,26 @@ class ReplicaManager(config: KafkaConfig, } /** - * Initialize the replica manager with the request purgatory - * - * TODO: will be removed in 0.9 where we refactor server structure - */ - - def initWithRequestPurgatory(producerRequestPurgatory: ProducerRequestPurgatory, fetchRequestPurgatory: FetchRequestPurgatory) { - this.producerRequestPurgatory = producerRequestPurgatory - this.fetchRequestPurgatory = fetchRequestPurgatory - } - - /** * Unblock some delayed produce requests with the request key */ def unblockDelayedProduceRequests(key: DelayedRequestKey) { - val satisfied = producerRequestPurgatory.update(key) + val satisfied = producerRequestPurgatory.getCompleted(key) debug("Request key %s unblocked %d producer requests." .format(key.keyLabel, satisfied.size)) - // send any newly unblocked responses - satisfied.foreach(producerRequestPurgatory.respond(_)) + // complete the produce operation + satisfied.foreach(_.completeProduce()) } /** * Unblock some delayed fetch requests with the request key */ def unblockDelayedFetchRequests(key: DelayedRequestKey) { - val satisfied = fetchRequestPurgatory.update(key) + val satisfied = fetchRequestPurgatory.getCompleted(key) debug("Request key %s unblocked %d fetch requests.".format(key.keyLabel, satisfied.size)) - // send any newly unblocked responses - satisfied.foreach(fetchRequestPurgatory.respond(_)) + // complete the fetch operation + satisfied.foreach(_.completeFetch()) } def startup() { @@ -285,10 +275,8 @@ class ReplicaManager(config: KafkaConfig, val producerRequestKeys = messagesPerPartition.keys.map(TopicPartitionRequestKey(_)).toSeq val completedByMe = producerRequestPurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys) - if (completedByMe) { - val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) - callbackOnComplete(produceResponseStatus) - } + if (completedByMe) + delayedProduce.completeProduce() } } @@ -361,7 +349,7 @@ class ReplicaManager(config: KafkaConfig, // if the fetch comes from the follower, // update its corresponding log end offset if(Request.isValidBrokerId(fetchRequest.replicaId)) - recordFollowerLogEndOffsets(replicaId, dataRead.mapValues(_.offset)) + updateFollowerLEOs(replicaId, dataRead.mapValues(_.offset)) // check if this fetch request can be satisfied right away val bytesReadable = logReadResults.values.map(_.info.messageSet.sizeInBytes).sum @@ -387,12 +375,8 @@ class ReplicaManager(config: KafkaConfig, // add the fetch request for watch if it's not satisfied, otherwise send the response back val completedByMe = fetchRequestPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys) - if (completedByMe) { - // fetch again to get whatever is available - val fetchPartitionData = readFromLocalLog(fetchOnlyLeader, fetchOnlyCommitted, fetchInfo) - .mapValues(result => FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) - callbackOnComplete(fetchPartitionData) - } + if (completedByMe) + delayedFetch.completeFetch() } } @@ -452,18 +436,6 @@ class ReplicaManager(config: KafkaConfig, } } - private def recordFollowerLogEndOffsets(replicaId: Int, offsets: Map[TopicAndPartition, LogOffsetMetadata]) { - debug("Record follower log end offsets: %s ".format(offsets)) - offsets.foreach { - case (topicAndPartition, offset) => - updateReplicaLEOAndPartitionHW(topicAndPartition.topic, topicAndPartition.partition, replicaId, offset) - - // for producer requests with ack > 1, we need to check - // if they can be unblocked after some 
follower's log end offsets have moved - unblockDelayedProduceRequests(new TopicPartitionRequestKey(topicAndPartition)) - } - } - def maybeUpdateMetadataCache(updateMetadataRequest: UpdateMetadataRequest, metadataCache: MetadataCache) { replicaStateChangeLock synchronized { if(updateMetadataRequest.controllerEpoch < controllerEpoch) { @@ -714,15 +686,29 @@ class ReplicaManager(config: KafkaConfig, allPartitions.values.foreach(partition => partition.maybeShrinkIsr(config.replicaLagTimeMaxMs, config.replicaLagMaxMessages)) } - def updateReplicaLEOAndPartitionHW(topic: String, partitionId: Int, replicaId: Int, offset: LogOffsetMetadata) = { - getPartition(topic, partitionId) match { + private def updateFollowerLEOs(replicaId: Int, offsets: Map[TopicAndPartition, LogOffsetMetadata]) { + debug("Recording follower broker %d log end offsets: %s ".format(replicaId, offsets)) + offsets.foreach { + case (topicAndPartition, offset) => + updateReplicaLEO(topicAndPartition, replicaId, offset) + + // for producer requests with ack > 1, we need to check + // if they can be unblocked after some follower's log end offsets have moved + unblockDelayedProduceRequests(new TopicPartitionRequestKey(topicAndPartition)) + } + } + + private def updateReplicaLEO(topicAndPartition: TopicAndPartition, replicaId: Int, offset: LogOffsetMetadata) = { + getPartition(topicAndPartition.topic, topicAndPartition.partition) match { case Some(partition) => partition.getReplica(replicaId) match { case Some(replica) => replica.logEndOffset = offset - // check if we need to update HW and expand Isr + + // check if we need to update HW and expand Isr after some of its replica's LEOs have changed partition.updateLeaderHWAndMaybeExpandIsr(replicaId) - debug("Recorded follower %d position %d for partition [%s,%d].".format(replicaId, offset.messageOffset, topic, partitionId)) + + debug("Recorded replica %d LEO position %d for partition %s.".format(replicaId, offset.messageOffset, topicAndPartition)) case None => throw new NotAssignedReplicaException(("Leader %d failed to record follower %d's position %d since the replica" + " is not recognized to be one of the assigned replicas %s for partition [%s,%d]").format(localBrokerId, replicaId, @@ -730,7 +716,7 @@ class ReplicaManager(config: KafkaConfig, } case None => - warn("While recording the follower position, the partition [%s,%d] hasn't been created, skip updating leader HW".format(topic, partitionId)) + warn("While recording the replica LEO, the partition %s hasn't been created.".format(topicAndPartition)) } } diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala index e459e55..c99602f 100644 --- a/core/src/main/scala/kafka/server/RequestPurgatory.scala +++ b/core/src/main/scala/kafka/server/RequestPurgatory.scala @@ -62,7 +62,7 @@ abstract class DelayedRequest(delayMs: Long, onComplete: Any => Unit) extends De * A helper purgatory class for bookkeeping delayed operations with a timeout, and expiring timed out operations. 
* */ -abstract class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: Int = 1000) +class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: Int = 1000) extends Logging with KafkaMetricsGroup { /* a list of requests watching each key */ diff --git a/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala b/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala index 09ed8f5..5ebd585 100644 --- a/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala +++ b/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala @@ -95,7 +95,6 @@ class SimpleFetchTest extends JUnit3Suite { EasyMock.reset(replicaManager) EasyMock.expect(replicaManager.config).andReturn(configs.head).anyTimes() EasyMock.expect(replicaManager.getLeaderReplicaIfLocal(topic, partitionId)).andReturn(partition.leaderReplicaIfLocal().get).anyTimes() - EasyMock.expect(replicaManager.initWithRequestPurgatory(EasyMock.anyObject(), EasyMock.anyObject())) EasyMock.expect(replicaManager.readMessageSets(EasyMock.anyObject())).andReturn({ val fetchInfo = log.read(0, fetchSize, Some(hw)) val partitionData = new FetchResponsePartitionData(ErrorMapping.NoError, hw.toLong, fetchInfo.messageSet) @@ -184,10 +183,9 @@ class SimpleFetchTest extends JUnit3Suite { EasyMock.reset(replicaManager) EasyMock.expect(replicaManager.config).andReturn(configs.head).anyTimes() - EasyMock.expect(replicaManager.updateReplicaLEOAndPartitionHW(topic, partitionId, followerReplicaId, new LogOffsetMetadata(followerLEO.asInstanceOf[Long], 0L, followerLEO))) + EasyMock.expect(replicaManager.updateReplicaLEO(TopicAndPartition(topic, partitionId), followerReplicaId, new LogOffsetMetadata(followerLEO.asInstanceOf[Long], 0L, followerLEO))) EasyMock.expect(replicaManager.getReplica(topic, partitionId, followerReplicaId)).andReturn(partition.inSyncReplicas.find(_.brokerId == configs(1).brokerId)) EasyMock.expect(replicaManager.getLeaderReplicaIfLocal(topic, partitionId)).andReturn(partition.leaderReplicaIfLocal().get).anyTimes() - EasyMock.expect(replicaManager.initWithRequestPurgatory(EasyMock.anyObject(), EasyMock.anyObject())) EasyMock.expect(replicaManager.readMessageSets(EasyMock.anyObject())).andReturn({ val fetchInfo = log.read(followerLEO, Integer.MAX_VALUE, None) val partitionData = new FetchResponsePartitionData(ErrorMapping.NoError, hw.toLong, fetchInfo.messageSet) -- 1.7.12.4 From 4419d8391bf0eb633e41862b10d9e2c1ee819007 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Mon, 18 Aug 2014 15:08:43 -0700 Subject: [PATCH 10/15] fix 1 --- core/src/main/scala/kafka/server/DelayedFetch.scala | 7 ++++--- core/src/main/scala/kafka/server/ReplicaManager.scala | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index 5cff718..0cba36c 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -37,8 +37,8 @@ import scala.collection.immutable.Map case class FetchPartitionStatus(startOffsetMetadata: LogOffsetMetadata, fetchInfo: PartitionFetchInfo) { - override def toString = "[startOffset: " + startOffsetMetadata + ", " + - "fetchSize: " + fetchSize + "]" + override def toString = "[startOffsetMetadata: " + startOffsetMetadata + ", " + + "fetchInfo: " + fetchInfo + "]" } case class FetchMetadata(fetchMinBytes: Int, @@ -61,7 +61,8 @@ class DelayedFetch(delayMs: Long, override def tryComplete() : Boolean = { var accumulatedSize = 0 
fetchMetadata.fetchPartitionStatus.foreach { - case (topicAndPartition, fetchOffset) => + case (topicAndPartition, fetchStatus) => + val fetchOffset = fetchStatus.startOffsetMetadata try { if (fetchOffset != LogOffsetMetadata.UnknownOffsetMetadata) { val replica = replicaManager.getLeaderReplicaIfLocal(topicAndPartition.topic, topicAndPartition.partition) diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index c542ded..ce80369 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -59,7 +59,7 @@ case class LogReadResult(info: FetchDataInfo, hw: Long, readSize: Int, error: Op } } -class ReplicaManager(config: KafkaConfig, +class ReplicaManager(val config: KafkaConfig, time: Time, val zkClient: ZkClient, scheduler: Scheduler, -- 1.7.12.4 From 3d83ee90cf88e394232fc34f50c1e85530b36496 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Mon, 18 Aug 2014 17:35:21 -0700 Subject: [PATCH 11/15] fix 2 --- .../src/main/scala/kafka/server/DelayedFetch.scala | 5 +- .../main/scala/kafka/server/DelayedProduce.scala | 2 +- .../scala/kafka/server/FetchRequestPurgatory.scala | 69 ---------------------- core/src/main/scala/kafka/server/KafkaApis.scala | 5 +- .../kafka/server/ProducerRequestPurgatory.scala | 69 ---------------------- .../main/scala/kafka/server/ReplicaManager.scala | 18 +++--- 6 files changed, 14 insertions(+), 154 deletions(-) delete mode 100644 core/src/main/scala/kafka/server/FetchRequestPurgatory.scala delete mode 100644 core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index 0cba36c..19dbcc4 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -47,7 +47,8 @@ case class FetchMetadata(fetchMinBytes: Int, fetchPartitionStatus: Map[TopicAndPartition, FetchPartitionStatus]) { override def toString = "[minBytes: " + fetchMinBytes + ", " + - "committedOnly: " + fetchOnlyCommitted + ", " + "onlyLeader:" + fetchOnlyLeader + ", " + "onlyCommitted: " + fetchOnlyCommitted + ", " "partitionStatus: " + fetchPartitionStatus + "]" } @@ -55,7 +56,7 @@ case class FetchMetadata(fetchMinBytes: Int, class DelayedFetch(delayMs: Long, fetchMetadata: FetchMetadata, replicaManager: ReplicaManager, - onComplete: Map[TopicAndPartition, PartitionDataAndOffset] => Unit) + onComplete: Map[TopicAndPartition, FetchResponsePartitionData] => Unit) extends DelayedRequest(delayMs, onComplete) { override def tryComplete() : Boolean = { diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala index bbb176c..ee66abe 100644 --- a/core/src/main/scala/kafka/server/DelayedProduce.scala +++ b/core/src/main/scala/kafka/server/DelayedProduce.scala @@ -51,7 +51,7 @@ class DelayedProduce(delayMs: Long, produceMetadata: ProduceMetadata, replicaManager: ReplicaManager, onComplete: Map[TopicAndPartition, ProducerResponseStatus] => Unit) - extends DelayedRequest(delayMs) with Logging { + extends DelayedRequest(delayMs, onComplete) with Logging { // first update the acks pending variable according to the error code produceMetadata.produceStatus foreach { case (topicAndPartition, status) => diff --git a/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala b/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala 
deleted file mode 100644 index ed13188..0000000 --- a/core/src/main/scala/kafka/server/FetchRequestPurgatory.scala +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package kafka.server - -import kafka.metrics.KafkaMetricsGroup -import kafka.network.RequestChannel -import kafka.api.FetchResponseSend - -import java.util.concurrent.TimeUnit - -/** - * The purgatory holding delayed fetch requests - */ -class FetchRequestPurgatory(replicaManager: ReplicaManager, requestChannel: RequestChannel) - extends RequestPurgatory[DelayedFetch](replicaManager.config.brokerId, replicaManager.config.fetchPurgatoryPurgeIntervalRequests) { - this.logIdent = "[FetchRequestPurgatory-%d] ".format(replicaManager.config.brokerId) - - private class DelayedFetchRequestMetrics(forFollower: Boolean) extends KafkaMetricsGroup { - private val metricPrefix = if (forFollower) "Follower" else "Consumer" - - val expiredRequestMeter = newMeter(metricPrefix + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) - } - - private val aggregateFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = true) - private val aggregateNonFollowerFetchRequestMetrics = new DelayedFetchRequestMetrics(forFollower = false) - - private def recordDelayedFetchExpired(forFollower: Boolean) { - val metrics = if (forFollower) aggregateFollowerFetchRequestMetrics - else aggregateNonFollowerFetchRequestMetrics - - metrics.expiredRequestMeter.mark() - } - - /** - * Check if a specified delayed fetch request is satisfied - */ - def checkSatisfied(delayedFetch: DelayedFetch): Boolean = delayedFetch.isSatisfied(replicaManager) - - /** - * When a delayed fetch request expires just answer it with whatever data is present - */ - def expire(delayedFetch: DelayedFetch) { - debug("Expiring fetch request %s.".format(delayedFetch.fetch)) - val fromFollower = delayedFetch.fetch.isFromFollower - recordDelayedFetchExpired(fromFollower) - respond(delayedFetch) - } - - // TODO: purgatory should not be responsible for sending back the responses - def respond(delayedFetch: DelayedFetch) { - val response = delayedFetch.respond(replicaManager) - requestChannel.sendResponse(new RequestChannel.Response(delayedFetch.request, new FetchResponseSend(response))) - } -} \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index d6dfcee..7ec3f57 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -20,7 +20,6 @@ package kafka.server import kafka.api._ import kafka.common._ import kafka.log._ -import kafka.message._ import kafka.network._ import kafka.admin.AdminUtils import kafka.network.RequestChannel.Response @@ -123,8 +122,8 
@@ class KafkaApis(val requestChannel: RequestChannel, val offsetCommitRequest = request.requestObj.asInstanceOf[OffsetCommitRequest] // the callback for sending the response - def sendResponseCallback(responseStatus: Map[TopicAndPartition, Short]) { - responseStatus.foreach { case (topicAndPartition, errorCode) => + def sendResponseCallback(commitStatus: Map[TopicAndPartition, Short]) { + commitStatus.foreach { case (topicAndPartition, errorCode) => // Here we only print warnings for known errors; if it is unknown, it will cause // an error message in the replica manager already and hence can be ignored here if (errorCode != ErrorMapping.NoError && errorCode != ErrorMapping.UnknownCode) { diff --git a/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala b/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala deleted file mode 100644 index d4a7d4a..0000000 --- a/core/src/main/scala/kafka/server/ProducerRequestPurgatory.scala +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package kafka.server - -import kafka.metrics.KafkaMetricsGroup -import kafka.utils.Pool -import kafka.network.{BoundedByteBufferSend, RequestChannel} - -import java.util.concurrent.TimeUnit - -/** - * The purgatory holding delayed producer requests - */ -class ProducerRequestPurgatory(replicaManager: ReplicaManager, offsetManager: OffsetManager, requestChannel: RequestChannel) - extends RequestPurgatory[DelayedProduce](replicaManager.config.brokerId, replicaManager.config.producerPurgatoryPurgeIntervalRequests) { - this.logIdent = "[ProducerRequestPurgatory-%d] ".format(replicaManager.config.brokerId) - - private class DelayedProducerRequestMetrics(keyLabel: String = DelayedRequestKey.globalLabel) extends KafkaMetricsGroup { - val expiredRequestMeter = newMeter(keyLabel + "ExpiresPerSecond", "requests", TimeUnit.SECONDS) - } - - private val producerRequestMetricsForKey = { - val valueFactory = (k: DelayedRequestKey) => new DelayedProducerRequestMetrics(k.keyLabel + "-") - new Pool[DelayedRequestKey, DelayedProducerRequestMetrics](Some(valueFactory)) - } - - private val aggregateProduceRequestMetrics = new DelayedProducerRequestMetrics - - private def recordDelayedProducerKeyExpired(key: DelayedRequestKey) { - val keyMetrics = producerRequestMetricsForKey.getAndMaybePut(key) - List(keyMetrics, aggregateProduceRequestMetrics).foreach(_.expiredRequestMeter.mark()) - } - - /** - * Check if a specified delayed fetch request is satisfied - */ - def checkSatisfied(delayedProduce: DelayedProduce) = delayedProduce.isSatisfied(replicaManager) - - /** - * When a delayed produce request expires answer it with possible time out error codes - */ - def expire(delayedProduce: DelayedProduce) { - debug("Expiring produce request %s.".format(delayedProduce.produce)) - for ((topicPartition, responseStatus) <- delayedProduce.partitionStatus if responseStatus.acksPending) - recordDelayedProducerKeyExpired(new TopicPartitionRequestKey(topicPartition)) - respond(delayedProduce) - } - - // TODO: purgatory should not be responsible for sending back the responses - def respond(delayedProduce: DelayedProduce) { - val response = delayedProduce.respond(offsetManager) - requestChannel.sendResponse(new RequestChannel.Response(delayedProduce.request, new BoundedByteBufferSend(response))) - } -} diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index ce80369..a395402 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -43,8 +43,6 @@ object ReplicaManager { val HighWatermarkFilename = "replication-offset-checkpoint" } -case class PartitionDataAndOffset(data: FetchResponsePartitionData, offset: LogOffsetMetadata) - case class LogAppendResult(info: LogAppendInfo, error: Option[Throwable] = None) { def errorCode = error match { case None => ErrorMapping.NoError @@ -283,7 +281,7 @@ class ReplicaManager(val config: KafkaConfig, /** * Append the messages to the local replica logs */ - private def appendToLocalLog(messagesPerPartition: Map[TopicAndPartition, MessageSet]) = { + private def appendToLocalLog(messagesPerPartition: Map[TopicAndPartition, MessageSet]): immutable.Map[TopicAndPartition, LogAppendResult] = { trace("Append [%s] to local log ".format(messagesPerPartition)) messagesPerPartition.map { case (topicAndPartition, messages) => try { @@ -385,7 +383,7 @@ class ReplicaManager(val config: KafkaConfig, */ def readFromLocalLog(readOnlyIfLeader: Boolean, 
readOnlyCommitted: Boolean, - readPartitionInfo: Map[TopicAndPartition, PartitionFetchInfo]) = { + readPartitionInfo: Map[TopicAndPartition, PartitionFetchInfo]): immutable.Map[TopicAndPartition, LogReadResult] = { readPartitionInfo.map { case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) => val partitionDataAndOffsetInfo = @@ -407,7 +405,7 @@ class ReplicaManager(val config: KafkaConfig, // read on log val logReadInfo = localReplica.log match { case Some(log) => - log.read(offset, maxSize, maxOffsetOpt) + log.read(offset, fetchSize, maxOffsetOpt) case None => error("Leader for partition [%s,%d] does not have a local log".format(topic, partition)) FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty) @@ -416,21 +414,21 @@ class ReplicaManager(val config: KafkaConfig, BrokerTopicStats.getBrokerTopicStats(topic).bytesOutRate.mark(logReadInfo.messageSet.sizeInBytes) BrokerTopicStats.getBrokerAllTopicsStats().bytesOutRate.mark(logReadInfo.messageSet.sizeInBytes) - LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, fetchSize, ErrorMapping.NoError) + LogReadResult(logReadInfo, localReplica.highWatermark.messageOffset, fetchSize, None) } catch { // NOTE: Failed fetch requests metric is not incremented for known exceptions since it // is supposed to indicate un-expected failure of a broker in handling a fetch request case utpe: UnknownTopicOrPartitionException => - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, utpe) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, Some(utpe)) case nle: NotLeaderForPartitionException => - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, nle) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, Some(nle)) case rnae: ReplicaNotAvailableException => - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, rnae) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, Some(rnae)) case e: Throwable => BrokerTopicStats.getBrokerTopicStats(topic).failedFetchRequestRate.mark() BrokerTopicStats.getBrokerAllTopicsStats().failedFetchRequestRate.mark() error("Error processing fetch operation on partition [%s,%d] offset %d".format(topic, partition, offset)) - LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, t) + LogReadResult(FetchDataInfo(LogOffsetMetadata.UnknownOffsetMetadata, MessageSet.Empty), -1L, fetchSize, Some(e)) } (TopicAndPartition(topic, partition), partitionDataAndOffsetInfo) } -- 1.7.12.4 From 51e8b946e76fef8f0fe04148a4f7d2625ef46f18 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Mon, 18 Aug 2014 17:44:09 -0700 Subject: [PATCH 12/15] fix 3 --- core/src/main/scala/kafka/server/KafkaApis.scala | 7 ++----- core/src/main/scala/kafka/server/OffsetManager.scala | 4 ++-- core/src/main/scala/kafka/server/ReplicaManager.scala | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 7ec3f57..40ef625 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -122,7 +122,7 @@ class KafkaApis(val requestChannel: RequestChannel, val offsetCommitRequest = 
request.requestObj.asInstanceOf[OffsetCommitRequest] // the callback for sending the response - def sendResponseCallback(commitStatus: Map[TopicAndPartition, Short]) { + def sendResponseCallback(commitStatus: immutable.Map[TopicAndPartition, Short]) { commitStatus.foreach { case (topicAndPartition, errorCode) => // Here we only print warnings for known errors; if it is unknown, it will cause // an error message in the replica manager already and hence can be ignored here @@ -201,7 +201,7 @@ class KafkaApis(val requestChannel: RequestChannel, val fetchRequest = request.requestObj.asInstanceOf[FetchRequest] // the callback for sending the response - def sendResponseCallback(responsePartitionData: Map[TopicAndPartition, FetchResponsePartitionData]) { + def sendResponseCallback(responsePartitionData: immutable.Map[TopicAndPartition, FetchResponsePartitionData]) { responsePartitionData.foreach { case (topicAndPartition, data) => // Here we only print warnings for known errors; if it is unknown, it will cause // an error message in the replica manager already and hence can be ignored here @@ -411,9 +411,6 @@ class KafkaApis(val requestChannel: RequestChannel, } def close() { - debug("Shutting down.") - fetchRequestPurgatory.shutdown() - producerRequestPurgatory.shutdown() debug("Shut down complete.") } } diff --git a/core/src/main/scala/kafka/server/OffsetManager.scala b/core/src/main/scala/kafka/server/OffsetManager.scala index c4d6747..1bbfd14 100644 --- a/core/src/main/scala/kafka/server/OffsetManager.scala +++ b/core/src/main/scala/kafka/server/OffsetManager.scala @@ -200,8 +200,8 @@ class OffsetManager(val config: OffsetManagerConfig, def storeOffsets(groupName: String, consumerId: String, generationId: Int, - offsetMetadata: Map[TopicAndPartition, OffsetAndMetadata], - callbackOnComplete: Map[TopicAndPartition, Short] => Unit) { + offsetMetadata: immutable.Map[TopicAndPartition, OffsetAndMetadata], + callbackOnComplete: immutable.Map[TopicAndPartition, Short] => Unit) { // TODO: generation id and consumer id is needed by coordinator to do consumer checking // first filter out partitions with offset metadata size exceeding limit diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index a395402..b891ca4 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -336,7 +336,7 @@ class ReplicaManager(val config: KafkaConfig, replicaId: Int, fetchMinBytes: Int, fetchInfo: Map[TopicAndPartition, PartitionFetchInfo], - callbackOnComplete: Map[TopicAndPartition, FetchResponsePartitionData] => Unit) { + callbackOnComplete: immutable.Map[TopicAndPartition, FetchResponsePartitionData] => Unit) { val fetchOnlyLeader: Boolean = replicaId != Request.DebuggingConsumerId val fetchOnlyCommitted: Boolean = ! 
Request.isValidBrokerId(fetchRequest.replicaId) -- 1.7.12.4 From c48fa9249270781181d1442bd2f28039483e47ab Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Mon, 18 Aug 2014 19:21:11 -0700 Subject: [PATCH 13/15] fix 4 --- core/src/main/scala/kafka/log/Log.scala | 13 ++++++ .../main/scala/kafka/server/OffsetManager.scala | 16 ++++---- .../main/scala/kafka/server/ReplicaManager.scala | 48 ++++++++-------------- .../main/scala/kafka/server/RequestPurgatory.scala | 12 +++--- core/src/main/scala/kafka/utils/DelayedItem.scala | 2 +- 5 files changed, 48 insertions(+), 43 deletions(-) diff --git a/core/src/main/scala/kafka/log/Log.scala b/core/src/main/scala/kafka/log/Log.scala index 002c902..f5203f0 100644 --- a/core/src/main/scala/kafka/log/Log.scala +++ b/core/src/main/scala/kafka/log/Log.scala @@ -46,6 +46,19 @@ object LogAppendInfo { val UnknownLogAppendInfo = LogAppendInfo(-1, -1, NoCompressionCodec, -1, -1, false) } +case class LogAppendResult(info: LogAppendInfo, error: Option[Throwable] = None) { + def errorCode = error match { + case None => ErrorMapping.NoError + case Some(e) => ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) + } +} + +case class LogReadResult(info: FetchDataInfo, hw: Long, readSize: Int, error: Option[Throwable] = None) { + def errorCode = error match { + case None => ErrorMapping.NoError + case Some(e) => ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) + } +} /** * An append-only log for storing messages. diff --git a/core/src/main/scala/kafka/server/OffsetManager.scala b/core/src/main/scala/kafka/server/OffsetManager.scala index 1bbfd14..14e0cc6 100644 --- a/core/src/main/scala/kafka/server/OffsetManager.scala +++ b/core/src/main/scala/kafka/server/OffsetManager.scala @@ -40,8 +40,10 @@ import java.util.concurrent.TimeUnit import com.yammer.metrics.core.Gauge import org.I0Itec.zkclient.ZkClient -import kafka.api.{ProducerResponse, ProducerResponseStatus} -import kafka.network.{BoundedByteBufferSend, RequestChannel} +import kafka.api.ProducerResponseStatus + +import collection.JavaConversions._ + /** @@ -207,14 +209,14 @@ class OffsetManager(val config: OffsetManagerConfig, // first filter out partitions with offset metadata size exceeding limit // TODO: in the future we may want to only support atomic commit and hence fail the whole commit when this happens var commitStatus = offsetMetadata.mapValues { offsetAndMetadata => - if (offsetAndMetadata.metadata != null && offsetAndMetadata.metadata.lengh > maxMetadataSize) + if (offsetAndMetadata.metadata != null && offsetAndMetadata.metadata.length() > config.maxMetadataSize) ErrorMapping.OffsetMetadataTooLargeCode else ErrorMapping.NoError } val filteredOffsetMetadata = offsetMetadata.filter { case (topicAndPartition, offsetAndMetadata) => - commitStatus.get(TopicAndPartition).get == ErrorMapping.NoError + commitStatus.get(topicAndPartition).get == ErrorMapping.NoError } // construct the message set to append @@ -225,10 +227,10 @@ class OffsetManager(val config: OffsetManagerConfig, ) }.toSeq - val offsetTopicPartition = TopicAndPartition(OffsetsTopicName, partitionFor(groupName)) + val offsetTopicPartition = TopicAndPartition(OffsetManager.OffsetsTopicName, partitionFor(groupName)) val messageSet = Collections.singletonMap(offsetTopicPartition, - new ByteBufferMessageSet(config.offsetsTopicCompressionCodec, messages:_*)) + new ByteBufferMessageSet(config.offsetsTopicCompressionCodec, messages:_*)).toMap // set the callback function to insert offsets into cache after log append completed 
def putCacheCallback(responseStatus: Map[TopicAndPartition, ProducerResponseStatus]) { @@ -243,7 +245,7 @@ class OffsetManager(val config: OffsetManagerConfig, if (status.error == ErrorMapping.NoError) { filteredOffsetMetadata.foreach { case (topicAndPartition, offsetAndMetadata) => - putOffset(GroupTopicPartition(group, topicAndPartition), offsetAndMetadata) + putOffset(GroupTopicPartition(groupName, topicAndPartition), offsetAndMetadata) } } else { debug("Offset commit %s from group %s consumer %s with generation %d failed when appending to log due to %s" diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index b891ca4..c52bc47 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -20,7 +20,7 @@ import kafka.api._ import kafka.common._ import kafka.utils._ import kafka.cluster.{Broker, Partition, Replica} -import kafka.log.{LogAppendInfo, LogManager} +import kafka.log.{LogReadResult, LogAppendResult, LogAppendInfo, LogManager} import kafka.metrics.KafkaMetricsGroup import kafka.controller.KafkaController import kafka.common.TopicAndPartition @@ -43,20 +43,6 @@ object ReplicaManager { val HighWatermarkFilename = "replication-offset-checkpoint" } -case class LogAppendResult(info: LogAppendInfo, error: Option[Throwable] = None) { - def errorCode = error match { - case None => ErrorMapping.NoError - case Some(e) => ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) - } -} - -case class LogReadResult(info: FetchDataInfo, hw: Long, readSize: Int, error: Option[Throwable] = None) { - def errorCode = error match { - case None => ErrorMapping.NoError - case Some(e) => ErrorMapping.codeFor(e.getClass.asInstanceOf[Class[Throwable]]) - } -} - class ReplicaManager(val config: KafkaConfig, time: Time, val zkClient: ZkClient, @@ -75,8 +61,8 @@ class ReplicaManager(val config: KafkaConfig, this.logIdent = "[Replica Manager on Broker " + localBrokerId + "]: " val stateChangeLogger = KafkaController.stateChangeLogger - val producerRequestPurgatory = new RequestPurgatory[DelayedProduce](brokerId, config.producerPurgatoryPurgeIntervalRequests) - val fetchRequestPurgatory = new RequestPurgatory[DelayedFetch](brokerId, config.fetchPurgatoryPurgeIntervalRequests) + val producerRequestPurgatory = new RequestPurgatory[DelayedProduce](config.brokerId, config.producerPurgatoryPurgeIntervalRequests) + val fetchRequestPurgatory = new RequestPurgatory[DelayedFetch](config.brokerId, config.fetchPurgatoryPurgeIntervalRequests) newGauge( @@ -260,9 +246,9 @@ class ReplicaManager(val config: KafkaConfig, // if required acks = 0 we can trigger complete immediately val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) callbackOnComplete(produceResponseStatus) - } else if (produceRequest.requiredAcks == 1 || + } else if (requiredAcks == 1 || messagesPerPartition.size <= 0 || - localProduceResults.values.count(_.error.isDefined) == produceRequest.numPartitions) { + localProduceResults.values.count(_.error.isDefined) == messagesPerPartition.size) { // if required acks = 1 or all partition appends have failed we can trigger complete immediately val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) callbackOnComplete(produceResponseStatus) @@ -270,7 +256,7 @@ class ReplicaManager(val config: KafkaConfig, // create delayed produce operation and try to watch it in the purgatory val produceMetadata = 
ProduceMetadata(requiredAcks, produceStatus) val delayedProduce = new DelayedProduce(timeout, produceMetadata, this, callbackOnComplete) - val producerRequestKeys = messagesPerPartition.keys.map(TopicPartitionRequestKey(_)).toSeq + val producerRequestKeys = messagesPerPartition.keys.map(new TopicPartitionRequestKey(_)).toSeq val completedByMe = producerRequestPurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys) if (completedByMe) @@ -316,14 +302,14 @@ class ReplicaManager(val config: KafkaConfig, Runtime.getRuntime.halt(1) (topicAndPartition, null) case utpe: UnknownTopicOrPartitionException => - (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, utpe)) + (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(utpe))) case nle: NotLeaderForPartitionException => - (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, nle)) + (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(nle))) case e: Throwable => BrokerTopicStats.getBrokerTopicStats(topicAndPartition.topic).failedProduceRequestRate.mark() BrokerTopicStats.getBrokerAllTopicsStats.failedProduceRequestRate.mark() error("Error processing append operation on partition %s".format(topicAndPartition), e) - (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, e)) + (topicAndPartition, LogAppendResult(LogAppendInfo.UnknownLogAppendInfo, Some(e))) } } } @@ -339,15 +325,15 @@ class ReplicaManager(val config: KafkaConfig, callbackOnComplete: immutable.Map[TopicAndPartition, FetchResponsePartitionData] => Unit) { val fetchOnlyLeader: Boolean = replicaId != Request.DebuggingConsumerId - val fetchOnlyCommitted: Boolean = ! Request.isValidBrokerId(fetchRequest.replicaId) + val fetchOnlyCommitted: Boolean = ! 
Request.isValidBrokerId(replicaId) // read from local logs val logReadResults = readFromLocalLog(fetchOnlyLeader, fetchOnlyCommitted, fetchInfo) // if the fetch comes from the follower, // update its corresponding log end offset - if(Request.isValidBrokerId(fetchRequest.replicaId)) - updateFollowerLEOs(replicaId, dataRead.mapValues(_.offset)) + if(Request.isValidBrokerId(replicaId)) + updateFollowerLEOs(replicaId, logReadResults.mapValues(_.info.fetchOffset)) // check if this fetch request can be satisfied right away val bytesReadable = logReadResults.values.map(_.info.messageSet.sizeInBytes).sum @@ -364,9 +350,11 @@ class ReplicaManager(val config: KafkaConfig, callbackOnComplete(fetchPartitionData) } else { // construct the fetch results from the read results - val fetchPartitionStatus = logReadResults.mapValues(result => FetchPartitionStatus(result.info.fetchOffset, result.readSize)) + val fetchPartitionStatus = logReadResults.map { case (topicAndPartition, result) => + (topicAndPartition, FetchPartitionStatus(result.info.fetchOffset, fetchInfo.get(topicAndPartition).get)) + } val fetchMetadata = FetchMetadata(fetchMinBytes, fetchOnlyLeader, fetchOnlyCommitted, fetchPartitionStatus) - val delayedFetch = new DelayedFetch(time, fetchMetadata, this, callbackOnComplete) + val delayedFetch = new DelayedFetch(timeout, fetchMetadata, this, callbackOnComplete) // create a list of (topic, partition) pairs to use as keys for this delayed request val delayedFetchKeys = fetchPartitionStatus.keys.map(new TopicPartitionRequestKey(_)).toSeq @@ -709,8 +697,8 @@ class ReplicaManager(val config: KafkaConfig, debug("Recorded replica %d LEO position %d for partition %s.".format(replicaId, offset.messageOffset, topicAndPartition)) case None => throw new NotAssignedReplicaException(("Leader %d failed to record follower %d's position %d since the replica" + - " is not recognized to be one of the assigned replicas %s for partition [%s,%d]").format(localBrokerId, replicaId, - offset.messageOffset, partition.assignedReplicas().map(_.brokerId).mkString(","), topic, partitionId)) + " is not recognized to be one of the assigned replicas %s for partition %s").format(localBrokerId, replicaId, + offset.messageOffset, partition.assignedReplicas().map(_.brokerId).mkString(","), topicAndPartition)) } case None => diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala index c99602f..b2b78e2 100644 --- a/core/src/main/scala/kafka/server/RequestPurgatory.scala +++ b/core/src/main/scala/kafka/server/RequestPurgatory.scala @@ -81,11 +81,11 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In newGauge( "NumDelayedOperations", new Gauge[Int] { - def value = expirationReaper.numOperations + def value = expirationReaper.enqueued } ) - expirationThread.start() + expirationReaper.start() /** * Check if the operation can be completed, if not watch it based on the given watch keys @@ -99,7 +99,7 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In * @param watchKeys keys for bookkeeping the operation * @return true iff the delayed operations can be completed */ - def tryCompleteElseWatch(operation: DelayedRequest, watchKeys: Seq[Any]): Boolean = { + def tryCompleteElseWatch(operation: T, watchKeys: Seq[Any]): Boolean = { for(key <- watchKeys) { // if the operation is already completed, stopping adding it to // any further lists and return false @@ -144,7 +144,7 @@ class RequestPurgatory[T <: 
DelayedRequest](brokerId: Int = 0, purgeInterval: In * Since an operation may still be in the watch lists even when it has been completed, this number * may be larger than the number of real operations watched */ - protected def size() = watchersForKey.values.map(_.numRequests).sum + expirationReaper.numOperations + protected def size() = watchersForKey.values.map(_.watched).sum + expirationReaper.enqueued /** * Shutdown the expire reaper thread @@ -159,6 +159,8 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In private class Watchers { private val requests = new util.ArrayList[T] + def watched = requests.size() + // potentially add the element to watch if it is not satisfied yet def checkAndMaybeAdd(t: T): Boolean = { synchronized { @@ -224,7 +226,7 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In /* * Return the number of delayed operations kept by the reaper */ - def numOperations = delayed.size() + def enqueued = delayed.size() /* * Add a operation to be expired diff --git a/core/src/main/scala/kafka/utils/DelayedItem.scala b/core/src/main/scala/kafka/utils/DelayedItem.scala index 3d7df84..a4e0dab 100644 --- a/core/src/main/scala/kafka/utils/DelayedItem.scala +++ b/core/src/main/scala/kafka/utils/DelayedItem.scala @@ -41,7 +41,7 @@ class DelayedItem(delay: Long, unit: TimeUnit) extends Delayed with Logging { } def compareTo(d: Delayed): Int = { - val delayed = d.asInstanceOf[DelayedItem[T]] + val delayed = d.asInstanceOf[DelayedItem] val myEnd = createdMs + delayMs val yourEnd = delayed.createdMs + delayed.delayMs -- 1.7.12.4 From f85184b5e5d0748f57c987af285b29e6c43b4ee9 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Tue, 19 Aug 2014 16:31:19 -0700 Subject: [PATCH 14/15] compilation passed --- core/src/main/scala/kafka/api/FetchResponse.scala | 10 +++++++--- core/src/main/scala/kafka/log/Log.scala | 3 ++- core/src/main/scala/kafka/network/BoundedByteBufferSend.scala | 4 ++-- core/src/main/scala/kafka/server/DelayedFetch.scala | 8 ++++---- core/src/main/scala/kafka/server/DelayedProduce.scala | 8 ++++---- core/src/main/scala/kafka/server/KafkaApis.scala | 4 ++-- core/src/main/scala/kafka/server/ReplicaManager.scala | 6 +++--- core/src/main/scala/kafka/server/RequestPurgatory.scala | 2 +- 8 files changed, 25 insertions(+), 20 deletions(-) diff --git a/core/src/main/scala/kafka/api/FetchResponse.scala b/core/src/main/scala/kafka/api/FetchResponse.scala index af93087..c2b84e6 100644 --- a/core/src/main/scala/kafka/api/FetchResponse.scala +++ b/core/src/main/scala/kafka/api/FetchResponse.scala @@ -25,6 +25,8 @@ import kafka.message.{MessageSet, ByteBufferMessageSet} import kafka.network.{MultiSend, Send} import kafka.api.ApiUtils._ +import scala.collection._ + object FetchResponsePartitionData { def readFrom(buffer: ByteBuffer): FetchResponsePartitionData = { val error = buffer.getShort @@ -150,9 +152,11 @@ object FetchResponse { } } - -case class FetchResponse(correlationId: Int, - data: Map[TopicAndPartition, FetchResponsePartitionData]) { +/* + * Note that FetchResponse does not extend from RequestOrResponse as other responses does since it will + * be sent through the FetchResponseSend instead of the BoundedByteBufferSend. + */ +case class FetchResponse(correlationId: Int, data: Map[TopicAndPartition, FetchResponsePartitionData]) { /** * Partitions the data into a map of maps (one for each topic). 
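The DelayedItem hunk above also shows why delayed operations can sit in a plain DelayQueue: compareTo orders items by their absolute expiration time (createdMs + delayMs). The snippet below is a minimal, self-contained sketch of that ordering only, not the actual kafka.utils.DelayedItem; the class name SketchDelayedItem and the use of System.currentTimeMillis are illustrative assumptions.

    import java.util.concurrent.{Delayed, TimeUnit}

    // Sketch: a delay-queue element ordered by absolute expiration time.
    class SketchDelayedItem(val delayMs: Long) extends Delayed {
      val createdMs: Long = System.currentTimeMillis()

      // Remaining delay in the requested unit; <= 0 once the item has expired.
      def getDelay(unit: TimeUnit): Long =
        unit.convert(createdMs + delayMs - System.currentTimeMillis(), TimeUnit.MILLISECONDS)

      // Order by (createdMs + delayMs), i.e. by when the item times out.
      def compareTo(d: Delayed): Int = {
        val other = d.asInstanceOf[SketchDelayedItem]
        java.lang.Long.compare(createdMs + delayMs, other.createdMs + other.delayMs)
      }
    }

A java.util.concurrent.DelayQueue of such items hands elements back only after their delay has elapsed, which is the property the purgatory's expiration reaper relies on when polling for timed-out operations.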
diff --git a/core/src/main/scala/kafka/log/Log.scala b/core/src/main/scala/kafka/log/Log.scala index f5203f0..c7d6e3c 100644 --- a/core/src/main/scala/kafka/log/Log.scala +++ b/core/src/main/scala/kafka/log/Log.scala @@ -40,12 +40,13 @@ import com.yammer.metrics.core.Gauge * @param codec The codec used in the message set * @param offsetsMonotonic Are the offsets in this message set monotonically increasing */ -case class LogAppendInfo(var firstOffset: Long, var lastOffset: Long, codec: CompressionCodec, shallowCount: Int, validBytes: Int, offsetsMonotonic: Boolean) object LogAppendInfo { val UnknownLogAppendInfo = LogAppendInfo(-1, -1, NoCompressionCodec, -1, -1, false) } +case class LogAppendInfo(var firstOffset: Long, var lastOffset: Long, codec: CompressionCodec, shallowCount: Int, validBytes: Int, offsetsMonotonic: Boolean) + case class LogAppendResult(info: LogAppendInfo, error: Option[Throwable] = None) { def errorCode = error match { case None => ErrorMapping.NoError diff --git a/core/src/main/scala/kafka/network/BoundedByteBufferSend.scala b/core/src/main/scala/kafka/network/BoundedByteBufferSend.scala index a624359..55ecac2 100644 --- a/core/src/main/scala/kafka/network/BoundedByteBufferSend.scala +++ b/core/src/main/scala/kafka/network/BoundedByteBufferSend.scala @@ -25,7 +25,7 @@ import kafka.api.RequestOrResponse @nonthreadsafe private[kafka] class BoundedByteBufferSend(val buffer: ByteBuffer) extends Send { - private var sizeBuffer = ByteBuffer.allocate(4) + private val sizeBuffer = ByteBuffer.allocate(4) // Avoid possibility of overflow for 2GB-4 byte buffer if(buffer.remaining > Int.MaxValue - sizeBuffer.limit) @@ -53,7 +53,7 @@ private[kafka] class BoundedByteBufferSend(val buffer: ByteBuffer) extends Send def writeTo(channel: GatheringByteChannel): Int = { expectIncomplete() - var written = channel.write(Array(sizeBuffer, buffer)) + val written = channel.write(Array(sizeBuffer, buffer)) // if we are done, mark it off if(!buffer.hasRemaining) complete = true diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index 19dbcc4..6e17027 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -20,7 +20,7 @@ package kafka.server import kafka.api.{FetchResponsePartitionData, PartitionFetchInfo, FetchResponse, FetchRequest} import kafka.common.{UnknownTopicOrPartitionException, NotLeaderForPartitionException, TopicAndPartition} -import scala.collection.immutable.Map +import scala.collection._ /** * A delayed fetch request, which is satisfied (or more @@ -56,8 +56,8 @@ case class FetchMetadata(fetchMinBytes: Int, class DelayedFetch(delayMs: Long, fetchMetadata: FetchMetadata, replicaManager: ReplicaManager, - onComplete: Map[TopicAndPartition, FetchResponsePartitionData] => Unit) - extends DelayedRequest(delayMs, onComplete) { + callbackOnComplete: Map[TopicAndPartition, FetchResponsePartitionData] => Unit) + extends DelayedRequest(delayMs) { override def tryComplete() : Boolean = { var accumulatedSize = 0 @@ -116,6 +116,6 @@ class DelayedFetch(delayMs: Long, val fetchPartitionData = logReadResults.mapValues(result => FetchResponsePartitionData(result.errorCode, result.hw, result.info.messageSet)) - onComplete(fetchPartitionData) + callbackOnComplete(fetchPartitionData) } } \ No newline at end of file diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala index ee66abe..846fc97 
100644 --- a/core/src/main/scala/kafka/server/DelayedProduce.scala +++ b/core/src/main/scala/kafka/server/DelayedProduce.scala @@ -23,7 +23,7 @@ import kafka.common.TopicAndPartition import kafka.utils.Logging import scala.Some -import scala.collection.immutable.Map +import scala.collection._ /** A delayed produce request, which is satisfied (or more * accurately, unblocked) -- if for every partition it produce to: @@ -50,8 +50,8 @@ case class ProduceMetadata(produceRequiredAcks: Short, class DelayedProduce(delayMs: Long, produceMetadata: ProduceMetadata, replicaManager: ReplicaManager, - onComplete: Map[TopicAndPartition, ProducerResponseStatus] => Unit) - extends DelayedRequest(delayMs, onComplete) with Logging { + callbackOnComplete: Map[TopicAndPartition, ProducerResponseStatus] => Unit) + extends DelayedRequest(delayMs) { // first update the acks pending variable according to the error code produceMetadata.produceStatus foreach { case (topicAndPartition, status) => @@ -107,7 +107,7 @@ class DelayedProduce(delayMs: Long, def completeProduce() { // return the current response status val responseStatus = produceMetadata.produceStatus.mapValues(status => status.responseStatus) - onComplete(responseStatus) + callbackOnComplete(responseStatus) } } diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 40ef625..321cdbf 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -201,7 +201,7 @@ class KafkaApis(val requestChannel: RequestChannel, val fetchRequest = request.requestObj.asInstanceOf[FetchRequest] // the callback for sending the response - def sendResponseCallback(responsePartitionData: immutable.Map[TopicAndPartition, FetchResponsePartitionData]) { + def sendResponseCallback(responsePartitionData: Map[TopicAndPartition, FetchResponsePartitionData]) { responsePartitionData.foreach { case (topicAndPartition, data) => // Here we only print warnings for known errors; if it is unknown, it will cause // an error message in the replica manager already and hence can be ignored here @@ -213,7 +213,7 @@ class KafkaApis(val requestChannel: RequestChannel, } val response = FetchResponse(fetchRequest.correlationId, responsePartitionData) - requestChannel.sendResponse(new RequestChannel.Response(request, new BoundedByteBufferSend(response))) + requestChannel.sendResponse(new RequestChannel.Response(request, new FetchResponseSend(response))) } // call the replica manager to append messages to the replicas diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index c52bc47..0b39902 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -267,7 +267,7 @@ class ReplicaManager(val config: KafkaConfig, /** * Append the messages to the local replica logs */ - private def appendToLocalLog(messagesPerPartition: Map[TopicAndPartition, MessageSet]): immutable.Map[TopicAndPartition, LogAppendResult] = { + private def appendToLocalLog(messagesPerPartition: Map[TopicAndPartition, MessageSet]): Map[TopicAndPartition, LogAppendResult] = { trace("Append [%s] to local log ".format(messagesPerPartition)) messagesPerPartition.map { case (topicAndPartition, messages) => try { @@ -322,7 +322,7 @@ class ReplicaManager(val config: KafkaConfig, replicaId: Int, fetchMinBytes: Int, fetchInfo: Map[TopicAndPartition, PartitionFetchInfo], - callbackOnComplete: 
immutable.Map[TopicAndPartition, FetchResponsePartitionData] => Unit) { + callbackOnComplete: Map[TopicAndPartition, FetchResponsePartitionData] => Unit) { val fetchOnlyLeader: Boolean = replicaId != Request.DebuggingConsumerId val fetchOnlyCommitted: Boolean = ! Request.isValidBrokerId(replicaId) @@ -371,7 +371,7 @@ class ReplicaManager(val config: KafkaConfig, */ def readFromLocalLog(readOnlyIfLeader: Boolean, readOnlyCommitted: Boolean, - readPartitionInfo: Map[TopicAndPartition, PartitionFetchInfo]): immutable.Map[TopicAndPartition, LogReadResult] = { + readPartitionInfo: Map[TopicAndPartition, PartitionFetchInfo]): Map[TopicAndPartition, LogReadResult] = { readPartitionInfo.map { case (TopicAndPartition(topic, partition), PartitionFetchInfo(offset, fetchSize)) => val partitionDataAndOffsetInfo = diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala index b2b78e2..8461458 100644 --- a/core/src/main/scala/kafka/server/RequestPurgatory.scala +++ b/core/src/main/scala/kafka/server/RequestPurgatory.scala @@ -34,7 +34,7 @@ import com.yammer.metrics.core.Gauge * message append operation could be waiting for specified number of acks; or a delayed * message fetch operation could be waiting for a given number of bytes to accumulate. */ -abstract class DelayedRequest(delayMs: Long, onComplete: Any => Unit) extends DelayedItem(delayMs) { +abstract class DelayedRequest(delayMs: Long) extends DelayedItem(delayMs) { val completed = new AtomicBoolean(false) /* -- 1.7.12.4 From 0906d1835384bd62f48b8f80641906b3b6794c90 Mon Sep 17 00:00:00 2001 From: Guozhang Wang Date: Wed, 20 Aug 2014 13:53:47 -0700 Subject: [PATCH 15/15] comments --- .../src/main/scala/kafka/api/ProducerRequest.scala | 5 - core/src/main/scala/kafka/cluster/Partition.scala | 39 +++++++- .../src/main/scala/kafka/server/DelayedFetch.scala | 54 ++++++----- .../main/scala/kafka/server/DelayedProduce.scala | 43 ++++++--- core/src/main/scala/kafka/server/KafkaApis.scala | 13 +-- .../main/scala/kafka/server/OffsetManager.scala | 8 +- .../main/scala/kafka/server/ReplicaManager.scala | 96 ++++++++---------- .../main/scala/kafka/server/RequestPurgatory.scala | 107 +++++++++++---------- .../scala/unit/kafka/server/SimpleFetchTest.scala | 2 +- 9 files changed, 206 insertions(+), 161 deletions(-) diff --git a/core/src/main/scala/kafka/api/ProducerRequest.scala b/core/src/main/scala/kafka/api/ProducerRequest.scala index b2366e7..b062406 100644 --- a/core/src/main/scala/kafka/api/ProducerRequest.scala +++ b/core/src/main/scala/kafka/api/ProducerRequest.scala @@ -152,10 +152,5 @@ case class ProducerRequest(versionId: Short = ProducerRequest.CurrentVersion, producerRequest.append("; TopicAndPartition: " + topicPartitionMessageSizeMap.mkString(",")) producerRequest.toString() } - - - def emptyData(){ - data.clear() - } } diff --git a/core/src/main/scala/kafka/cluster/Partition.scala b/core/src/main/scala/kafka/cluster/Partition.scala index ff106b4..308f4e6 100644 --- a/core/src/main/scala/kafka/cluster/Partition.scala +++ b/core/src/main/scala/kafka/cluster/Partition.scala @@ -230,7 +230,31 @@ class Partition(val topic: String, } } - def updateLeaderHWAndMaybeExpandIsr(replicaId: Int) { + /** + * Update the log end offset of a replica of the partition + */ + def updateReplicaLEO(replicaId: Int, offset: LogOffsetMetadata) = { + getReplica(replicaId) match { + case Some(replica) => + replica.logEndOffset = offset + + // check if we need to expand Isr to include this replica after 
its LEO has advanced + maybeExpandIsr(replicaId) + + debug("Recorded replica %d LEO position %d for partition [%s,%d]." + .format(replicaId, offset.messageOffset, topic, partitionId)) + case None => + throw new NotAssignedReplicaException(("Leader %d failed to record follower %d's position %d since the replica" + + " is not recognized to be one of the assigned replicas %s for partition [%s,%d]").format(localBrokerId, replicaId, + offset.messageOffset, assignedReplicas().map(_.brokerId).mkString(","), topic, partitionId)) + } + } + + /** + * Check and maybe expand the ISR of the partition; + * this can happen when a non-ISR replica's LEO has incremented + */ + def maybeExpandIsr(replicaId: Int) { inWriteLock(leaderIsrUpdateLock) { // check if this replica needs to be added to the ISR leaderReplicaIfLocal() match { @@ -252,6 +276,7 @@ class Partition(val topic: String, updateIsr(newInSyncReplicas) replicaManager.isrExpandRate.mark() } + // after the isr change, check if the HW of the partition can now be incremented maybeIncrementLeaderHW(leaderReplica) case None => // nothing to do if no longer leader } @@ -286,8 +311,14 @@ class Partition(val topic: String, } /** - * There is no need to acquire the leaderIsrUpdate lock here since all callers of this private API acquire that lock - * @param leaderReplica + * Check and maybe increment the high watermark of the partition; + * this can happen when + * + * 1. Partition ISR changed + * 2. Leader LEO changed and the ISR is down to 1 + * + * Note There is no need to acquire the leaderIsrUpdate lock here + * since all callers of this private API acquire that lock */ private def maybeIncrementLeaderHW(leaderReplica: Replica) { val allLogEndOffsets = inSyncReplicas.map(_.logEndOffset) @@ -299,7 +330,7 @@ class Partition(val topic: String, // some delayed requests may be unblocked after HW changed val requestKey = new TopicPartitionRequestKey(this.topic, this.partitionId) replicaManager.unblockDelayedFetchRequests(requestKey) - replicaManager.unblockDelayedProduceRequests(requestKey) + replicaManager.tryCompleteDelayedProduce(requestKey) } else { debug("Skipping update high watermark since Old hw %s is larger than new hw %s for partition [%s,%d]. All leo's are %s" .format(oldHighWatermark, newHighWatermark, topic, partitionId, allLogEndOffsets.mkString(","))) diff --git a/core/src/main/scala/kafka/server/DelayedFetch.scala b/core/src/main/scala/kafka/server/DelayedFetch.scala index 6e17027..812b445 100644 --- a/core/src/main/scala/kafka/server/DelayedFetch.scala +++ b/core/src/main/scala/kafka/server/DelayedFetch.scala @@ -17,30 +17,23 @@ package kafka.server -import kafka.api.{FetchResponsePartitionData, PartitionFetchInfo, FetchResponse, FetchRequest} -import kafka.common.{UnknownTopicOrPartitionException, NotLeaderForPartitionException, TopicAndPartition} +import kafka.api.FetchResponsePartitionData +import kafka.api.PartitionFetchInfo +import kafka.common.UnknownTopicOrPartitionException +import kafka.common.NotLeaderForPartitionException +import kafka.common.TopicAndPartition import scala.collection._ -/** - * A delayed fetch request, which is satisfied (or more - * accurately, unblocked) -- if: - * Case A: This broker is no longer the leader for some partitions it tries to fetch - * - should return whatever data is available for the rest partitions. - * Case B: This broker is does not know of some partitions it tries to fetch - * - should return whatever data is available for the rest partitions. 
- * Case C: The fetch offset locates not on the last segment of the log - should return all the data on that segment. - * Case D: The accumulated bytes from all the fetching partitions exceeds the minimum bytes - should return whatever data is available. - */ - case class FetchPartitionStatus(startOffsetMetadata: LogOffsetMetadata, fetchInfo: PartitionFetchInfo) { override def toString = "[startOffsetMetadata: " + startOffsetMetadata + ", " + "fetchInfo: " + fetchInfo + "]" } +/** + * The fetch metadata maintained by the delayed fetch request + */ case class FetchMetadata(fetchMinBytes: Int, fetchOnlyLeader: Boolean, fetchOnlyCommitted: Boolean, @@ -52,13 +45,24 @@ case class FetchMetadata(fetchMinBytes: Int, "partitionStatus: " + fetchPartitionStatus + "]" } - +/** + * A delayed fetch request that can be created by the replica manager and watched + * in the fetch request purgatory + */ class DelayedFetch(delayMs: Long, fetchMetadata: FetchMetadata, replicaManager: ReplicaManager, callbackOnComplete: Map[TopicAndPartition, FetchResponsePartitionData] => Unit) extends DelayedRequest(delayMs) { + /** + * The request can be completed if: + * Case A: This broker is no longer the leader for some partitions it tries to fetch + * Case B: This broker does not know of some partitions it tries to fetch + * Case C: The fetch offset is not on the last segment of the log + * Case D: The accumulated bytes from all the fetching partitions exceed the minimum bytes + * + */ override def tryComplete() : Boolean = { var accumulatedSize = 0 fetchMetadata.fetchPartitionStatus.foreach { @@ -78,11 +82,12 @@ class DelayedFetch(delayMs: Long, debug("Satisfying fetch %s since it is fetching later segments of partition %s.".format(fetchMetadata, topicAndPartition)) return super.tryComplete() } else if (fetchOffset.offsetOnOlderSegment(endOffset)) { - // Case C, this can happen when the folloer replica is lagging too much + // Case C, this can happen when the follower replica is lagging too much debug("Satisfying fetch %s immediately since it is fetching older segments.".format(fetchMetadata)) return super.tryComplete() } else if (fetchOffset.precedes(endOffset)) { - accumulatedSize += endOffset.positionDiff(fetchOffset) + // we need to take the partition fetch size as an upper bound when accumulating the bytes + accumulatedSize += math.min(endOffset.positionDiff(fetchOffset), fetchStatus.fetchInfo.fetchSize) } } } catch { @@ -102,13 +107,18 @@ class DelayedFetch(delayMs: Long, false } - override def onExpired() { + /** + * Upon expire, complete the fetch request and return + */ + override def expire() { debug("Expire fetch %s and return whatever fetch data is available".format(fetchMetadata)) - completeFetch() + complete() } - def completeFetch() { - // read whatever data is available and return + /** + * Upon completion, read whatever data is available and pass to the complete callback + */ + override def complete() { val logReadResults = replicaManager.readFromLocalLog(fetchMetadata.fetchOnlyLeader, fetchMetadata.fetchOnlyCommitted, fetchMetadata.fetchPartitionStatus.mapValues(status => status.fetchInfo)) diff --git a/core/src/main/scala/kafka/server/DelayedProduce.scala b/core/src/main/scala/kafka/server/DelayedProduce.scala index 846fc97..bd4d5ad 100644 --- a/core/src/main/scala/kafka/server/DelayedProduce.scala +++ b/core/src/main/scala/kafka/server/DelayedProduce.scala @@ -17,22 +17,14 @@ package kafka.server -import kafka.api._ + +import kafka.api.ProducerResponseStatus import 
kafka.common.ErrorMapping import kafka.common.TopicAndPartition -import kafka.utils.Logging import scala.Some import scala.collection._ -/** A delayed produce request, which is satisfied (or more - * accurately, unblocked) -- if for every partition it produce to: - * Case A: This broker is not the leader: unblock - should return error. - * Case B: This broker is the leader: - * B.1 - If there was a localError (when writing to the local log): unblock - should return error - * B.2 - else, at least requiredAcks replicas should be caught up to this request. - */ - case class ProducePartitionStatus(requiredOffset: Long, responseStatus: ProducerResponseStatus) { @volatile var acksPending = false @@ -40,6 +32,9 @@ case class ProducePartitionStatus(requiredOffset: Long, responseStatus: Producer .format(acksPending, responseStatus.error, responseStatus.offset, requiredOffset) } +/** + * The produce metadata maintained by the delayed produce request + */ case class ProduceMetadata(produceRequiredAcks: Short, produceStatus: Map[TopicAndPartition, ProducePartitionStatus]) { @@ -47,6 +42,10 @@ case class ProduceMetadata(produceRequiredAcks: Short, .format(produceRequiredAcks, produceStatus) } +/** + * A delayed produce request that can be created by the replica manager and watched + * in the produce request purgatory + */ class DelayedProduce(delayMs: Long, produceMetadata: ProduceMetadata, replicaManager: ReplicaManager, @@ -66,6 +65,13 @@ class DelayedProduce(delayMs: Long, trace("Initial partition status for %s is %s".format(topicAndPartition, status)) } + /** + * The delayed produce request can be completed if for every partition it produces to: + * Case A: This broker is no longer the leader: should return error + * Case B: This broker is the leader: + * B.1 - If there was a localError (when writing to the local log): should return error + * B.2 - else, at least requiredAcks replicas should be caught up to this request. + */ override def tryComplete(): Boolean = { // check for each partition if it still has pending acks produceMetadata.produceStatus.foreach { case (topicAndPartition, status) => @@ -80,9 +86,11 @@ class DelayedProduce(delayMs: Long, status.requiredOffset, produceMetadata.produceRequiredAcks) case None => + // Case A (false, ErrorMapping.UnknownTopicOrPartitionCode) } if (errorCode != ErrorMapping.NoError) { + // Case B.1 status.acksPending = false status.responseStatus.error = errorCode } else if (hasEnough) { @@ -92,20 +100,25 @@ class DelayedProduce(delayMs: Long, } } - // unblocked if there are no partitions with pending acks + // Case B.2 if (! 
produceMetadata.produceStatus.values.exists(p => p.acksPending)) super.tryComplete() else false } - override def onExpired() { + /** + * Upon expire, complete the produce request and return + */ + override def expire() { debug("Expire produce %s and return the error codes".format(produceMetadata)) - completeProduce() + complete() } - def completeProduce() { - // return the current response status + /** + * Upon completion, return the current response status along with the error code per partition + */ + def complete() { val responseStatus = produceMetadata.produceStatus.mapValues(status => status.responseStatus) callbackOnComplete(responseStatus) } diff --git a/core/src/main/scala/kafka/server/KafkaApis.scala b/core/src/main/scala/kafka/server/KafkaApis.scala index 321cdbf..f5f2b32 100644 --- a/core/src/main/scala/kafka/server/KafkaApis.scala +++ b/core/src/main/scala/kafka/server/KafkaApis.scala @@ -118,6 +118,10 @@ class KafkaApis(val requestChannel: RequestChannel, requestChannel.sendResponse(new Response(request, new BoundedByteBufferSend(controlledShutdownResponse))) } + + /** + * Handle an offset commit request + */ def handleOffsetCommitRequest(request: RequestChannel.Request) { val offsetCommitRequest = request.requestObj.asInstanceOf[OffsetCommitRequest] @@ -147,7 +151,7 @@ class KafkaApis(val requestChannel: RequestChannel, } /** - * Handle a produce request or offset commit request (which is really a specialized producer request) + * Handle a produce request */ def handleProducerRequest(request: RequestChannel.Request) { val produceRequest = request.requestObj.asInstanceOf[ProducerRequest] @@ -157,7 +161,7 @@ class KafkaApis(val requestChannel: RequestChannel, var errorInResponse = false responseStatus.foreach { case (topicAndPartition, status) => // Here we only print warnings for known errors; if it is unknown, it will cause - // an error message in the replica manager already and hence can be ignored here + // an error message in the replica manager if (status.error != ErrorMapping.NoError && status.error != ErrorMapping.UnknownCode) { warn("Produce request with correlation id %d from client %s on partition %s failed due to %s" .format(produceRequest.correlationId, produceRequest.clientId, @@ -166,7 +170,7 @@ class KafkaApis(val requestChannel: RequestChannel, } } - if(produceRequest.requiredAcks == 0) { + if (produceRequest.requiredAcks == 0) { // no operation needed if producer request.required.acks = 0; however, if there is any error in handling // the request, since no response is expected by the producer, the server will close socket server so that // the producer client will know that some error has happened and will refresh its metadata @@ -189,9 +193,6 @@ class KafkaApis(val requestChannel: RequestChannel, produceRequest.requiredAcks, produceRequest.data, sendResponseCallback) - - // we do not need the data anymore - produceRequest.emptyData() } /** diff --git a/core/src/main/scala/kafka/server/OffsetManager.scala b/core/src/main/scala/kafka/server/OffsetManager.scala index 14e0cc6..875ce8a 100644 --- a/core/src/main/scala/kafka/server/OffsetManager.scala +++ b/core/src/main/scala/kafka/server/OffsetManager.scala @@ -199,15 +199,15 @@ class OffsetManager(val config: OffsetManagerConfig, /** * Store offsets by appending it to the replicated log and then inserting to cache */ + // TODO: generation id and consumer id is needed by coordinator to do consumer checking in the future def storeOffsets(groupName: String, consumerId: String, generationId: Int, offsetMetadata: 
immutable.Map[TopicAndPartition, OffsetAndMetadata], callbackOnComplete: immutable.Map[TopicAndPartition, Short] => Unit) { - // TODO: generation id and consumer id is needed by coordinator to do consumer checking // first filter out partitions with offset metadata size exceeding limit - // TODO: in the future we may want to only support atomic commit and hence fail the whole commit when this happens + // TODO: in the future we may want to only support atomic commit and hence fail the whole commit var commitStatus = offsetMetadata.mapValues { offsetAndMetadata => if (offsetAndMetadata.metadata != null && offsetAndMetadata.metadata.length() > config.maxMetadataSize) ErrorMapping.OffsetMetadataTooLargeCode @@ -229,7 +229,7 @@ class OffsetManager(val config: OffsetManagerConfig, val offsetTopicPartition = TopicAndPartition(OffsetManager.OffsetsTopicName, partitionFor(groupName)) - val messageSet = Collections.singletonMap(offsetTopicPartition, + val offsetsAndMetadataMessageSet = Collections.singletonMap(offsetTopicPartition, new ByteBufferMessageSet(config.offsetsTopicCompressionCodec, messages:_*)).toMap // set the callback function to insert offsets into cache after log append completed @@ -275,7 +275,7 @@ class OffsetManager(val config: OffsetManagerConfig, replicaManager.appendMessages( config.offsetCommitTimeoutMs.toLong, config.offsetCommitRequiredAcks, - messageSet, + offsetsAndMetadataMessageSet, putCacheCallback) } diff --git a/core/src/main/scala/kafka/server/ReplicaManager.scala b/core/src/main/scala/kafka/server/ReplicaManager.scala index 0b39902..56b8430 100644 --- a/core/src/main/scala/kafka/server/ReplicaManager.scala +++ b/core/src/main/scala/kafka/server/ReplicaManager.scala @@ -98,26 +98,27 @@ class ReplicaManager(val config: KafkaConfig, } /** - * Unblock some delayed produce requests with the request key + * Try to complete some delayed produce requests with the request key; + * this can be triggered when: + * + * 1. The partition HW has changed (for acks = -1). + * 2. A follower replica's fetch operation is received (for acks > 1) */ - def unblockDelayedProduceRequests(key: DelayedRequestKey) { - val satisfied = producerRequestPurgatory.getCompleted(key) - debug("Request key %s unblocked %d producer requests." - .format(key.keyLabel, satisfied.size)) - - // complete the produce operation - satisfied.foreach(_.completeProduce()) + def tryCompleteDelayedProduce(key: DelayedRequestKey) { + val completed = producerRequestPurgatory.checkAndComplete(key) + debug("Request key %s unblocked %d producer requests.".format(key.keyLabel, completed)) } /** - * Unblock some delayed fetch requests with the request key + * Try to complete some delayed fetch requests with the request key; + * this can be triggered when: + * + * 1. The partition HW has changed; + * 2. 
A new message set is appended to the local log (for follower fetch) */ def unblockDelayedFetchRequests(key: DelayedRequestKey) { - val satisfied = fetchRequestPurgatory.getCompleted(key) - debug("Request key %s unblocked %d fetch requests.".format(key.keyLabel, satisfied.size)) - - // complete the fetch operation - satisfied.foreach(_.completeFetch()) + val completed = fetchRequestPurgatory.checkAndComplete(key) + debug("Request key %s unblocked %d fetch requests.".format(key.keyLabel, completed)) } def startup() { @@ -224,7 +225,7 @@ class ReplicaManager(val config: KafkaConfig, } /** - * Append messages to leader replicas of the partition, and wait for replicated to other replicas, + * Append messages to leader replicas of the partition, and wait for them to be replicated to other replicas; * the callback function will be triggered either when timeout or the required acks are satisfied */ def appendMessages(timeout: Long, @@ -253,14 +254,17 @@ class ReplicaManager(val config: KafkaConfig, val produceResponseStatus = produceStatus.mapValues(status => status.responseStatus) callbackOnComplete(produceResponseStatus) } else { - // create delayed produce operation and try to watch it in the purgatory + // create delayed produce operation val produceMetadata = ProduceMetadata(requiredAcks, produceStatus) val delayedProduce = new DelayedProduce(timeout, produceMetadata, this, callbackOnComplete) + + // create a list of (topic, partition) pairs to use as keys for this delayed request val producerRequestKeys = messagesPerPartition.keys.map(new TopicPartitionRequestKey(_)).toSeq - val completedByMe = producerRequestPurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys) - if (completedByMe) - delayedProduce.completeProduce() + // try to complete the request immediately, otherwise put it into the purgatory + // this is because while the delayed request is being created, new requests may + // arrive which can make this request completable. + producerRequestPurgatory.tryCompleteElseWatch(delayedProduce, producerRequestKeys) } } @@ -315,7 +319,7 @@ class ReplicaManager(val config: KafkaConfig, } /** - * Fetch messages from the leader replica, + * Fetch messages from the leader replica, and wait until enough data can be fetched and return; * the callback function will be triggered either when timeout or required fetch info is satisfied */ def fetchMessages(timeout: Long, @@ -359,10 +363,10 @@ class ReplicaManager(val config: KafkaConfig, // create a list of (topic, partition) pairs to use as keys for this delayed request val delayedFetchKeys = fetchPartitionStatus.keys.map(new TopicPartitionRequestKey(_)).toSeq - // add the fetch request for watch if it's not satisfied, otherwise send the response back - val completedByMe = fetchRequestPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys) - if (completedByMe) - delayedFetch.completeFetch() + // try to complete the request immediately, otherwise put it into the purgatory; + // this is because while the delayed request is being created, new requests may + // arrive which can make this request completable. 
+ fetchRequestPurgatory.tryCompleteElseWatch(delayedFetch, delayedFetchKeys) } } @@ -674,35 +678,17 @@ class ReplicaManager(val config: KafkaConfig, private def updateFollowerLEOs(replicaId: Int, offsets: Map[TopicAndPartition, LogOffsetMetadata]) { debug("Recording follower broker %d log end offsets: %s ".format(replicaId, offsets)) - offsets.foreach { - case (topicAndPartition, offset) => - updateReplicaLEO(topicAndPartition, replicaId, offset) - - // for producer requests with ack > 1, we need to check - // if they can be unblocked after some follower's log end offsets have moved - unblockDelayedProduceRequests(new TopicPartitionRequestKey(topicAndPartition)) - } - } - - private def updateReplicaLEO(topicAndPartition: TopicAndPartition, replicaId: Int, offset: LogOffsetMetadata) = { - getPartition(topicAndPartition.topic, topicAndPartition.partition) match { - case Some(partition) => - partition.getReplica(replicaId) match { - case Some(replica) => - replica.logEndOffset = offset - - // check if we need to update HW and expand Isr after some of its replica's LEOs have changed - partition.updateLeaderHWAndMaybeExpandIsr(replicaId) - - debug("Recorded replica %d LEO position %d for partition %s.".format(replicaId, offset.messageOffset, topicAndPartition)) - case None => - throw new NotAssignedReplicaException(("Leader %d failed to record follower %d's position %d since the replica" + - " is not recognized to be one of the assigned replicas %s for partition %s").format(localBrokerId, replicaId, - offset.messageOffset, partition.assignedReplicas().map(_.brokerId).mkString(","), topicAndPartition)) - - } - case None => - warn("While recording the replica LEO, the partition %s hasn't been created.".format(topicAndPartition)) + offsets.foreach { case (topicAndPartition, offset) => + getPartition(topicAndPartition.topic, topicAndPartition.partition) match { + case Some(partition) => + partition.updateReplicaLEO(replicaId, offset) + + // for producer requests with ack > 1, we need to check + // if they can be unblocked after some follower's log end offsets have moved + tryCompleteDelayedProduce(new TopicPartitionRequestKey(topicAndPartition)) + case None => + warn("While recording the replica LEO, the partition %s hasn't been created.".format(topicAndPartition)) + } } } @@ -728,8 +714,10 @@ class ReplicaManager(val config: KafkaConfig, } def shutdown() { - info("Shut down") + info("Shutting down") replicaFetcherManager.shutdown() + fetchRequestPurgatory.shutdown() + producerRequestPurgatory.shutdown() checkpointHighWatermarks() info("Shut down completely") } diff --git a/core/src/main/scala/kafka/server/RequestPurgatory.scala b/core/src/main/scala/kafka/server/RequestPurgatory.scala index 8461458..8d55a1d 100644 --- a/core/src/main/scala/kafka/server/RequestPurgatory.scala +++ b/core/src/main/scala/kafka/server/RequestPurgatory.scala @@ -29,22 +29,28 @@ import com.yammer.metrics.core.Gauge /** - * An operation whose processing needs to be delayed for at most the given delayMs; - * upon complete, the given callback function will be triggered. For example a delayed - * message append operation could be waiting for specified number of acks; or a delayed - * message fetch operation could be waiting for a given number of bytes to accumulate. + * An operation whose processing needs to be delayed for at most the given delayMs. 
For example + * a delayed produce operation could be waiting for specified number of acks; or + * a delayed fetch operation could be waiting for a given number of bytes to accumulate. */ abstract class DelayedRequest(delayMs: Long) extends DelayedItem(delayMs) { val completed = new AtomicBoolean(false) /* - * Check if the delayed operation can be completed + * Check if the delayed operation can be completed by the caller * * Note that concurrent threads can check if an operation can be completed or not, * but only the first thread will succeed in completing the operation and return * true, others will still return false */ - def tryComplete(): Boolean = completed.compareAndSet(false, true) + def tryComplete(): Boolean = { + if (completed.compareAndSet(false, true)) { + complete() + true + } else { + false + } + } /** * Check if the delayed operation is already completed @@ -52,15 +58,18 @@ abstract class DelayedRequest(delayMs: Long) extends DelayedItem(delayMs) { def isCompleted(): Boolean = completed.get() /* - * When delayMs has elapsed, expire the delayed operation + * Process for expiring a timed out request */ - def onExpired(): Unit + def expire(): Unit = complete + /** + * Process for completing a request + */ + def complete(): Unit } /** * A helper purgatory class for bookkeeping delayed operations with a timeout, and expiring timed out operations. - * */ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: Int = 1000) extends Logging with KafkaMetricsGroup { @@ -106,14 +115,16 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In if (operation.isCompleted()) return false val watchers = watchersFor(key) - // if the operation is completed by myself, stop adding it to + // if the operation can by completed by myself, stop adding it to // any further lists and return true immediately - if(! watchers.checkAndMaybeAdd(operation)) { + if(operation synchronized operation.tryComplete()) { return true + } else { + watchers.watch(operation) } } - // if it is indeed watched, add to the expire queue also + // if it cannot be completed by now and hence is watched, add to the expire queue also if (! operation.isCompleted()) { expirationReaper.enqueue(operation) } @@ -122,16 +133,17 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In } /** - * Return a list of completed operations with the given watch key. + * Check if some some delayed requests can be completed with the given watch key, + * and if yes complete them. 
* - * @return the list of completed operations + * @return the number of completed requests during this process */ - def getCompleted(key: Any): Seq[T] = { + def checkAndComplete(key: Any): Int = { val watchers = watchersForKey.get(key) if(watchers == null) - Seq.empty + 0 else - watchers.collectCompletedOperations() + watchers.completeWatched() } /* @@ -145,7 +157,7 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In * may be larger than the number of real operations watched */ protected def size() = watchersForKey.values.map(_.watched).sum + expirationReaper.enqueued - + /** * Shutdown the expire reaper thread */ @@ -161,55 +173,48 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In def watched = requests.size() - // potentially add the element to watch if it is not satisfied yet - def checkAndMaybeAdd(t: T): Boolean = { + // add the element to watch + def watch(t: T) { synchronized { - // if the operation can be completed, return false; otherwise add to watch list - if(t.tryComplete()) { - return false - } else { - requests.add(t) - return true - } + requests.add(t) } } - // traverse the list and purge satisfied elements - def purgeSatisfied(): Int = { + // traverse the list and try to complete some watched elements + def completeWatched(): Int = { + var completed = 0 synchronized { val iter = requests.iterator() - var purged = 0 - while (iter.hasNext) { + while(iter.hasNext) { val curr = iter.next - if(curr.isCompleted()) { + if (curr.isCompleted()) { + // another thread has completed this request, just remove it iter.remove() - purged += 1 + } else { + if(curr.tryComplete()) { + iter.remove() + completed += 1 + } } } - purged } + completed } - // traverse the list and try to satisfy watched elements - def collectCompletedOperations(): Seq[T] = { - val response = new mutable.ArrayBuffer[T] + // traverse the list and purge elements that are already completed by others + private def purgeCompleted(): Int = { + var purged = 0 synchronized { val iter = requests.iterator() - while(iter.hasNext) { + while (iter.hasNext) { val curr = iter.next - if (curr.isCompleted()) { - // another thread has completed this request, just remove it + if(curr.isCompleted()) { iter.remove() - } else { - val completed = curr.tryComplete() - if(completed) { - iter.remove() - response += curr - } + purged += 1 } } } - response + purged } } @@ -273,15 +278,17 @@ class RequestPurgatory[T <: DelayedRequest](brokerId: Int = 0, purgeInterval: In override def doWork() { + // try to get the next expired operation and trigger its expiration process val curr = pollExpired() if (curr != null.asInstanceOf[T]) { - curr.onExpired() + curr.expire() } - if (size() >= purgeInterval) { // see if we need to force a full purge + // see if we need to force a full purge + if (size() >= purgeInterval) { debug("Beginning purgatory purge") val purged = purgeSatisfied() debug("Purged %d operations from delay queue.".format(purged)) - val numPurgedFromWatchers = watchersForKey.values.map(_.purgeSatisfied()).sum + val numPurgedFromWatchers = watchersForKey.values.map(_.purgeCompleted()).sum debug("Purged %d operations from watch lists.".format(numPurgedFromWatchers)) } } diff --git a/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala b/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala index 5ebd585..ad77ae8 100644 --- a/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala +++ b/core/src/test/scala/unit/kafka/server/SimpleFetchTest.scala @@ -191,7 
+191,7 @@ class SimpleFetchTest extends JUnit3Suite { val partitionData = new FetchResponsePartitionData(ErrorMapping.NoError, hw.toLong, fetchInfo.messageSet) Map(TopicAndPartition(topic, partitionId) -> new PartitionDataAndOffset(partitionData, fetchInfo.fetchOffset)) }).anyTimes() - EasyMock.expect(replicaManager.unblockDelayedProduceRequests(EasyMock.anyObject())).anyTimes() + EasyMock.expect(replicaManager.tryCompleteDelayedProduce(EasyMock.anyObject())).anyTimes() EasyMock.replay(replicaManager) val offsetManager = EasyMock.createMock(classOf[kafka.server.OffsetManager]) -- 1.7.12.4
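Taken together, the last patch replaces the old "collect completed operations, then complete them at the call site" flow with tryCompleteElseWatch / checkAndComplete. The sketch below condenses that flow; the Mini* names are illustrative stand-ins rather than the real kafka.server classes, and the real DelayedFetch / DelayedProduce first evaluate their own completion criteria (cases A-D above) before falling through to the compare-and-set.

    import java.util.concurrent.atomic.AtomicBoolean
    import scala.collection.mutable

    // Stand-in for DelayedRequest: the CAS guarantees complete() runs exactly once,
    // however many threads race through tryComplete() or expire().
    abstract class MiniDelayedRequest {
      private val completed = new AtomicBoolean(false)
      def complete(): Unit
      def isCompleted: Boolean = completed.get()
      def tryComplete(): Boolean =
        if (completed.compareAndSet(false, true)) { complete(); true } else false
      // In this sketch expiration simply reuses tryComplete so completion stays exactly-once.
      def expire(): Unit = tryComplete()
    }

    // Stand-in for the purgatory: try to complete now, otherwise watch the operation on each key.
    class MiniPurgatory[T <: MiniDelayedRequest] {
      private val watchersForKey = mutable.Map.empty[Any, mutable.ArrayBuffer[T]]

      def tryCompleteElseWatch(op: T, keys: Seq[Any]): Boolean = synchronized {
        val done = op.tryComplete()
        if (!done)
          keys.foreach(k => watchersForKey.getOrElseUpdate(k, mutable.ArrayBuffer.empty[T]) += op)
        done
      }

      // Called when state changes for a key (e.g. the HW moved or a follower LEO advanced).
      def checkAndComplete(key: Any): Int = synchronized {
        watchersForKey.get(key) match {
          case None => 0
          case Some(watched) =>
            val completedNow = watched.count(_.tryComplete())
            watched --= watched.filter(_.isCompleted) // purge entries completed here or elsewhere
            completedNow
        }
      }
    }

This mirrors the rewritten call sites above: ReplicaManager.appendMessages and fetchMessages hand DelayedProduce / DelayedFetch to tryCompleteElseWatch, while Partition.maybeIncrementLeaderHW and updateFollowerLEOs drive completion through tryCompleteDelayedProduce / unblockDelayedFetchRequests, which call checkAndComplete with a TopicPartitionRequestKey.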