diff --git core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java index e2c562a..bb55f1e 100644 --- core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java +++ core/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java @@ -270,11 +270,18 @@ extends InputFormat { */ @Override public List getSplits(JobContext context) throws IOException { - Pair keys = table.getStartEndKeys(); - if (keys == null || keys.getFirst() == null || - keys.getFirst().length == 0) { + Pair unprunedKeys = table.getStartEndKeys(); + if (unprunedKeys == null || unprunedKeys.getFirst() == null || + unprunedKeys.getFirst().length == 0) { throw new IOException("Expecting at least one region."); } + Pair keys = this.getPrunedKeys(unprunedKeys); + if (keys == null || keys.getFirst() == null || + keys.getFirst().length == 0) { + throw new IOException("(Post-Pruning) Expecting at least one region."); + } + if (LOG.isDebugEnabled()) + LOG.debug("Keys (unpruned): " + unprunedKeys.getFirst().length + " --- (pruned)--> " + keys.getFirst().length); if (table == null) { throw new IOException("No table was provided."); } @@ -306,7 +313,29 @@ extends InputFormat { } return splits; } - + + /** + * Returns the pruned region keys applicable to this M-R job; pruning cuts down on the number of InputSplits. + *

+ * This optimization is effective when there is a specific reason to exclude an entire region from the M-R job + * (so that it contributes no InputSplit), given the start and end keys of that region.
+ * Useful when we need to remember the last-processed top record and continuously revisit the [last, current) interval for M-R processing. + * In addition to reducing InputSplits, this reduces the load on the region server as well, due to the ordering of the keys. + *
+ *
+ * Note: It is possible that unprunedKeys.getSecond()[i].length == 0 for the last (most recent) region. + *
+ * Override this method if you want to exclude entire regions from M-R in bulk. By default, no region is excluded (i.e. all regions are included). + * + * + * @param unprunedKeys Boundaries of all the regions. + * @return Subset of the region boundaries, for the given table, to which M-R is applicable. By default, the input pair is returned unchanged, indicating + * that all regions participate in the M-R pipeline. + */ + protected Pair getPrunedKeys(final Pair unprunedKeys) { + return unprunedKeys; + } + /** * Allows subclasses to get the {@link HTable}. */