Add an optional "-solr url" argument to the Crawl command-line driver.

What this patch does (as visible in the hunks below):
  * Extends the usage message with "[-solr url]" and parses the new option
    into a local String solrUrl (null when the option is absent).
  * When solrUrl is set, the "indexes" Path is null; the deletion of the old
    "indexes"/"index" directories and the dedup + merge steps are skipped.
  * indexer.index(...) is always called and now also receives solrUrl, with
    the segment paths wrapped via Arrays.asList(...).
  * -topN is parsed with Long.parseLong, because topN is declared as a long
    (default Long.MAX_VALUE); Integer.parseInt rejected values above
    Integer.MAX_VALUE.

Review notes:
  * NOTE(review): the new code uses Arrays.asList but no import hunk is part
    of this patch - confirm java.util.Arrays is already imported in Crawl.java.
  * NOTE(review): presumably indexer.index(...) sends documents to the given
    Solr URL when it is non-null; that code is not part of this patch.
  * NOTE(review): leading whitespace of context and removed lines was
    reconstructed (two-space indentation assumed). If a hunk is rejected,
    retry with "patch -l" (ignore whitespace).

Index: Crawl.java
===================================================================
--- Crawl.java	(revision 652713)
+++ Crawl.java	(working copy)
@@ -49,7 +49,7 @@
   public static void main(String args[]) throws Exception {
     if (args.length < 1) {
       System.out.println
-        ("Usage: Crawl [-dir d] [-threads n] [-depth i] [-topN N]");
+        ("Usage: Crawl [-dir d] [-threads n] [-depth i] [-topN N] [-solr url]");
       return;
     }
 
@@ -62,7 +62,7 @@
     int threads = job.getInt("fetcher.threads.fetch", 10);
     int depth = 5;
     long topN = Long.MAX_VALUE;
-    
+    String solrUrl = null;
     for (int i = 0; i < args.length; i++) {
       if ("-dir".equals(args[i])) {
         dir = new Path(args[i+1]);
@@ -74,8 +74,11 @@
         depth = Integer.parseInt(args[i+1]);
         i++;
       } else if ("-topN".equals(args[i])) {
-        topN = Integer.parseInt(args[i+1]);
+        topN = Long.parseLong(args[i+1]);
         i++;
+      } else if ("-solr".equals(args[i])) {
+        solrUrl = args[i+1];
+        i++;
       } else if (args[i] != null) {
         rootUrlDir = new Path(args[i]);
       }
@@ -95,7 +98,7 @@
     Path crawlDb = new Path(dir + "/crawldb");
     Path linkDb = new Path(dir + "/linkdb");
     Path segments = new Path(dir + "/segments");
-    Path indexes = new Path(dir + "/indexes");
+    Path indexes = (solrUrl == null) ? new Path(dir + "/indexes") : null;
     Path index = new Path(dir + "/index");
 
     Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
@@ -128,22 +131,26 @@
     if (i > 0) {
       linkDbTool.invert(linkDb, segments, true, true, false); // invert links
 
-      // Delete old indexes
-      if (fs.exists(indexes)) {
-        LOG.info("Deleting old indexes: " + indexes);
-        fs.delete(indexes);
+      if (indexes != null) {
+        // Delete old indexes
+        if (fs.exists(indexes)) {
+          LOG.info("Deleting old indexes: " + indexes);
+          fs.delete(indexes);
+        }
+
+        // Delete old index
+        if (fs.exists(index)) {
+          LOG.info("Deleting old merged index: " + index);
+          fs.delete(index);
+        }
       }
-
-      // Delete old index
-      if (fs.exists(index)) {
-        LOG.info("Deleting old merged index: " + index);
-        fs.delete(index);
+      // index, dedup & merge
+      indexer.index(indexes, solrUrl, crawlDb, linkDb,
+          Arrays.asList(fs.listPaths(segments, HadoopFSUtil.getPassAllFilter())));
+      if (indexes != null) {
+        dedup.dedup(new Path[] { indexes });
+        merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()), index, tmpDir);
       }
-
-      // index, dedup & merge
-      indexer.index(indexes, crawlDb, linkDb, fs.listPaths(segments, HadoopFSUtil.getPassAllFilter()));
-      dedup.dedup(new Path[] { indexes });
-      merger.merge(fs.listPaths(indexes, HadoopFSUtil.getPassAllFilter()), index, tmpDir);
     } else {
       LOG.warn("No URLs to fetch - check your seed list and URL filters.");
     }