Index: src/java/org/apache/nutch/tools/SegmentMergeTool.java =================================================================== --- src/java/org/apache/nutch/tools/SegmentMergeTool.java (revision 161419) +++ src/java/org/apache/nutch/tools/SegmentMergeTool.java (working copy) @@ -50,7 +50,7 @@ /** * This class cleans up accumulated segments data, and merges them into a single * (or optionally multiple) segment(s), with no duplicates in it. - * + * *
* There are no prerequisites for its correct * operation except for a set of already fetched segments (they don't have to @@ -60,7 +60,7 @@ * most recent versions of pages for every unique value of url or hash. *
*If some of the input segments are corrupted, this tool will attempt to - * repair them, using + * repair them, using * {@link org.apache.nutch.segment.SegmentReader#fixSegment(NutchFileSystem, File, boolean, boolean, boolean, boolean)} method.
*Output segment can be optionally split on the fly into several segments of fixed * length.
@@ -79,7 +79,7 @@ * with all options turned on, i.e. to merge segments into the output segment(s), * index it, and then delete the original segments data. * - * + * * @author Andrzej Bialecki <ab@getopt.org> */ public class SegmentMergeTool implements Runnable { @@ -94,7 +94,7 @@ public static int INDEX_SIZE = 250000; public static int INDEX_MERGE_FACTOR = 30; public static int INDEX_MIN_MERGE_DOCS = 100; - + private NutchFileSystem nfs = null; private File[] segments = null; private int stage = SegmentMergeStatus.STAGE_OPENING; @@ -156,9 +156,9 @@ public long startTime, curTime; public long totalRecords; public long processedRecords; - + public SegmentMergeStatus() {}; - + public SegmentMergeStatus(int stage, File[] inputSegments, long startTime, long totalRecords, long processedRecords) { this.stage = stage; @@ -167,15 +167,15 @@ this.curTime = System.currentTimeMillis(); this.totalRecords = totalRecords; this.processedRecords = processedRecords; - } + } } - + public SegmentMergeStatus getStatus() { SegmentMergeStatus status = new SegmentMergeStatus(stage, segments, start, totalRecords, processedRecords); return status; } - + /** Run the tool, periodically reporting progress. */ public void run() { start = System.currentTimeMillis(); @@ -193,6 +193,7 @@ sr = new SegmentReader(nfs, dir, true); } catch (Exception e) { // this segment is hosed beyond repair, don't use it + LOG.warning("* Segment " + dir.getName() + " is corrupt beyond repair; skipping it."); continue; } segdirs.add(dir); @@ -337,7 +338,7 @@ // // keep the IndexReader open... // - + LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms"); stage = SegmentMergeStatus.STAGE_WRITING; processedRecords = 0L;