Index: oak-core/src/main/java/org/apache/jackrabbit/oak/api/jmx/IndexStatsMBean.java
===================================================================
--- oak-core/src/main/java/org/apache/jackrabbit/oak/api/jmx/IndexStatsMBean.java (revision 1710226)
+++ oak-core/src/main/java/org/apache/jackrabbit/oak/api/jmx/IndexStatsMBean.java (working copy)
@@ -134,6 +134,13 @@
     CompositeData getExecutionTime();
 
     /**
+     * Returns the number of indexed nodes as a {@link org.apache.jackrabbit.api.stats.TimeSeries}.
+     *
+     * @return the indexed nodes time series
+     */
+    CompositeData getExecutionNodesCount();
+
+    /**
      * Returns the consolidated execution stats since last reset
      * @return consolidated execution stats
      */
@@ -162,4 +169,34 @@
      */
     void registerAsyncIndexer(@Name("name") String name,
             @Name("delayInSeconds") long delayInSeconds);
+
+    /**
+     * @return true if the indexing job is failing
+     */
+    boolean isFailing();
+
+    /**
+     * @return The time the indexing job started failing, or {@code ""} if the
+     *         job is not currently failing.
+     */
+    String getFailingSince();
+
+    /**
+     * @return the number of consecutive failed executions or {@code 0} if the
+     *         job is not currently failing.
+     */
+    long getConsecutiveFailedExecutions();
+
+    /**
+     * @return the latest indexing error seen, will not be reset once the job
+     *         starts working again
+     */
+    String getLatestError();
+
+    /**
+     * @return the time when the latest indexing error has been seen, will not
+     *         be reset once the job starts working again
+     */
+    String getLatestErrorTime();
+
 }
Index: oak-core/src/main/java/org/apache/jackrabbit/oak/api/jmx/package-info.java
===================================================================
--- oak-core/src/main/java/org/apache/jackrabbit/oak/api/jmx/package-info.java (revision 1710226)
+++ oak-core/src/main/java/org/apache/jackrabbit/oak/api/jmx/package-info.java (working copy)
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-@Version("2.0.0")
+@Version("3.0.0")
 @Export(optional = "provide:=true")
 package org.apache.jackrabbit.oak.api.jmx;
 
Index: oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/AsyncIndexUpdate.java
===================================================================
--- oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/AsyncIndexUpdate.java (revision 1710226)
+++ oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/AsyncIndexUpdate.java (working copy)
@@ -26,6 +26,8 @@
 import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.REINDEX_PROPERTY_NAME;
 import static org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState.MISSING_NODE;
 
+import java.io.PrintWriter;
+import java.io.StringWriter;
 import java.util.Calendar;
 import java.util.HashSet;
 import java.util.Set;
@@ -121,9 +123,6 @@
 
     private final long lifetime = DEFAULT_LIFETIME; // TODO: make configurable
 
-    /** Flag to avoid repeatedly logging failure warnings */
-    private boolean failing = false;
-
     private final AsyncIndexStats indexStats = new AsyncIndexStats();
 
     /** Flag to switch to synchronous updates once the index caught up to the repo */
@@ -148,6 +147,14 @@
 
     private IndexMBeanRegistration mbeanRegistration;
 
+    /**
+     * Controls the length of the interval (configured in minutes, kept in
+     * milliseconds) at which an indexing error is logged as 'warning'. For the
+     * rest of the indexing cycles errors will be logged at 'debug' level.
+     */
+    private static final long ERROR_WARN_INTERVAL = TimeUnit.MINUTES.toMillis(Integer
+            .getInteger("oak.async.warn.interval", 30));
+
     public AsyncIndexUpdate(@Nonnull String name, @Nonnull NodeStore store,
             @Nonnull IndexEditorProvider provider, boolean switchOnSync) {
         this.name = checkNotNull(name);
@@ -334,9 +341,8 @@
                     after, afterCheckpoint, afterTime);
 
             // the update succeeded, i.e. it no longer fails
-            if (failing) {
-                log.info("[{}] Index update no longer fails", name);
-                failing = false;
+            if (indexStats.isFailing()) {
+                indexStats.fixed();
             }
 
             // the update succeeded, so we can release the earlier checkpoint
@@ -348,14 +354,11 @@
             indexStats.releaseTempCheckpoint(afterCheckpoint);
 
         } catch (CommitFailedException e) {
             if (e == CONCURRENT_UPDATE) {
                 log.debug("[{}] Concurrent update detected in the index update", name);
-            } else if (failing) {
-                log.debug("[{}] The index update is still failing", name, e);
             } else {
-                log.warn("[{}] The index update failed", name, e);
-                failing = true;
+                indexStats.failed(e);
             }
 
         } finally {
             if (threadNameChanged) {
@@ -533,6 +536,15 @@
         private final Stopwatch watch = Stopwatch.createUnstarted();
         private final ExecutionStats execStats = new ExecutionStats();
 
+        /** Flag to avoid repeatedly logging failure warnings */
+        private boolean failing = false;
+        private long latestErrorWarn = 0;
+
+        private String failingSince = "";
+        private String latestError = null;
+        private String latestErrorTime = "";
+        private long consecutiveFailures = 0;
+
         public void start(String now) {
             status = STATUS_RUNNING;
             start = now;
@@ -555,6 +567,51 @@
             watch.reset();
         }
 
+        /**
+         * Records a failed indexing cycle. The first failure and, afterwards,
+         * at most one failure per {@code ERROR_WARN_INTERVAL} is logged as
+         * 'warning'; the remaining ones are logged at 'debug' level.
+         */
+        public void failed(CommitFailedException e) {
+            latestError = format(e);
+            latestErrorTime = now();
+            consecutiveFailures++;
+            if (!failing) {
+                // first occurrence of a failure
+                failing = true;
+                // reusing value so value display is consistent
+                failingSince = latestErrorTime;
+                latestErrorWarn = System.currentTimeMillis();
+                log.warn("[{}] The index update failed", name, e);
+            } else {
+                // subsequent occurrences
+                long currentTime = System.currentTimeMillis();
+                if (currentTime - latestErrorWarn > ERROR_WARN_INTERVAL) {
+                    // restart the interval so the next warning is throttled again
+                    latestErrorWarn = currentTime;
+                    log.warn("[{}] The index update is still failing", name, e);
+                } else {
+                    log.debug("[{}] The index update is still failing", name, e);
+                }
+            }
+        }
+
+        /**
+         * Marks the indexing job as working again. The latest error and its
+         * time are kept, only the failure state is reset.
+         */
+        public void fixed() {
+            log.info("[{}] Index update no longer fails", name);
+            failing = false;
+            failingSince = "";
+            consecutiveFailures = 0;
+            latestErrorWarn = 0;
+        }
+
+        public boolean isFailing() {
+            return failing;
+        }
+
         @Override
         public String getStart() {
             return start;
@@ -644,6 +701,11 @@
         }
 
         @Override
+        public CompositeData getExecutionNodesCount() {
+            return execStats.getExecutionNodesCount();
+        }
+
+        @Override
         public CompositeData getConsolidatedExecutionStats() {
             return execStats.getConsolidatedStats();
         }
@@ -657,9 +719,12 @@
         public String toString() {
             return "AsyncIndexStats [start=" + start + ", done=" + done
                     + ", status=" + status + ", paused=" + isPaused
+                    + ", failing=" + failing + ", failingSince=" + failingSince
+                    + ", consecutiveFailures=" + consecutiveFailures
                     + ", updates=" + updates + ", referenceCheckpoint=" + referenceCp
                     + ", processedCheckpoint=" + processedCp
-                    + " ,tempCheckpoints=" + tempCps + " ]";
+                    + " ,tempCheckpoints=" + tempCps + ", latestErrorTime="
+                    + latestErrorTime + ", latestError=" + latestError + " ]";
         }
 
         @Override
@@ -670,6 +735,7 @@
         private class ExecutionStats {
             private final TimeSeriesRecorder execCounter;
             private final TimeSeriesRecorder execTimer;
+            private final TimeSeriesRecorder execNodesCounter;
 
             /**
              * Captures consolidated execution stats since last reset
@@ -683,6 +749,7 @@
             private ExecutionStats() {
                 execCounter = new TimeSeriesRecorder(true);
                 execTimer = new TimeSeriesRecorder(true);
+                execNodesCounter = new TimeSeriesRecorder(true);
 
                 try {
                     consolidatedType = new CompositeType("ConsolidatedStats",
@@ -701,6 +768,7 @@
 
             private void recordExecution(long time, long updates) {
                 execTimer.getCounter().addAndGet(time);
+                execNodesCounter.getCounter().addAndGet(updates);
                 consolidatedExecTime.addAndGet(time);
                 consolidatedNodes.addAndGet(updates);
             }
@@ -713,6 +781,10 @@
                 return TimeSeriesStatsUtil.asCompositeData(execTimer, "ExecutionTime");
             }
 
+            private CompositeData getExecutionNodesCount() {
+                return TimeSeriesStatsUtil.asCompositeData(execNodesCounter, "ExecutionNodesCount");
+            }
+
             private CompositeData getConsolidatedStats() {
                 try {
                     Long[] values = new Long[]{consolidatedExecRuns.longValue(),
@@ -733,6 +805,7 @@
             private void recordTick() {
                 execCounter.recordOneSecond();
                 execTimer.recordOneSecond();
+                execNodesCounter.recordOneSecond();
             }
         }
 
@@ -751,6 +824,26 @@
         public void registerAsyncIndexer(String name, long delayInSeconds) {
             taskSplitter.registerAsyncIndexer(name, delayInSeconds);
         }
+
+        @Override
+        public String getFailingSince() {
+            return failingSince;
+        }
+
+        @Override
+        public long getConsecutiveFailedExecutions() {
+            return consecutiveFailures;
+        }
+
+        @Override
+        public String getLatestError() {
+            return latestError;
+        }
+
+        @Override
+        public String getLatestErrorTime() {
+            return latestErrorTime;
+        }
     }
 
     /**
@@ -808,7 +901,7 @@
     }
 
     public boolean isFailing() {
-        return failing;
+        return indexStats.isFailing();
     }
 
     class IndexTaskSpliter {
@@ -919,4 +1012,14 @@
         return name;
     }
 
+    private static String format(Exception exception) {
+        if (exception == null) {
+            return "";
+        } else {
+            StringWriter writer = new StringWriter();
+            exception.printStackTrace(new PrintWriter(writer));
+            return writer.toString();
+        }
+    }
+
 }
Index: oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/index/AsyncIndexUpdateTest.java
===================================================================
--- oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/index/AsyncIndexUpdateTest.java (revision 1710226)
+++ oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/index/AsyncIndexUpdateTest.java (working copy)
@@ -25,6 +25,7 @@
 import static org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider.TYPE;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
@@ -40,11 +41,11 @@
 import javax.annotation.Nullable;
 import javax.management.openmbean.CompositeData;
 
-import ch.qos.logback.classic.Level;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.junit.LogCustomizer;
+import org.apache.jackrabbit.oak.plugins.index.AsyncIndexUpdate.AsyncIndexStats;
 import org.apache.jackrabbit.oak.plugins.index.AsyncIndexUpdate.IndexTaskSpliter;
 import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexLookup;
@@ -61,6 +62,8 @@
 import org.apache.jackrabbit.oak.spi.state.NodeStore;
 import org.junit.Test;
 
+import ch.qos.logback.classic.Level;
+
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
@@ -559,6 +562,24 @@
                 provider.isFailed());
         assertTrue("Expecting no checkpoints",
                 store.listCheckpoints().size() == 0);
+
+        // OAK-3054 failure reports
+        AsyncIndexStats stats = async.getIndexStats();
+        String since = stats.getFailingSince();
+        assertTrue(stats.isFailing());
+        assertEquals(1, stats.getConsecutiveFailedExecutions());
+        assertEquals(since, stats.getLatestErrorTime());
+
+        async.run();
+        assertTrue(stats.isFailing());
+        assertEquals(2, stats.getConsecutiveFailedExecutions());
+        assertEquals(since, stats.getFailingSince());
+        assertNotEquals(since, stats.getLatestErrorTime());
+
+        stats.fixed();
+        assertFalse(stats.isFailing());
+        assertEquals(0, stats.getConsecutiveFailedExecutions());
+        assertEquals("", stats.getFailingSince());
     }
 
     /**