--- src/java/org/apache/nutch/parse/ParseOutputFormat.java.orig 2006-05-20 02:20:51.000000000 +0200 +++ src/java/org/apache/nutch/parse/ParseOutputFormat.java 2006-05-20 19:40:05.000000000 +0200 @@ -27,6 +27,7 @@ import org.apache.nutch.net.*; import java.io.*; +import java.net.*; import java.util.ArrayList; import java.util.logging.Logger; @@ -54,6 +55,7 @@ final float interval = job.getFloat("db.default.fetch.interval", 30f); final float extscore = job.getFloat("db.score.link.external", 1.0f); final boolean countFiltered = job.getBoolean("db.score.count.filtered", false); + final boolean ignoreExternalLinks = job.getBoolean("crawl.ignore.external.links", false); Path text = new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name); @@ -78,7 +80,10 @@ throws IOException { Parse parse = (Parse)value; - + String fromUrl = key.toString(); + String fromHost = null; + String toHost = null; + textOut.append(key, new ParseText(parse.getText())); ParseData parseData = parse.getData(); @@ -97,6 +102,16 @@ // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); + if (ignoreExternalLinks) { + try { + fromHost = new URL(fromUrl).getHost().toLowerCase(); + } catch (MalformedURLException e) { + fromHost = null; + } + } else { + fromHost = null; + } + String[] toUrls = new String[links.length]; int validCount = 0; for (int i = 0; i < links.length; i++) { @@ -114,6 +129,18 @@ // compute score contributions and adjustment to the original score for (int i = 0; i < toUrls.length; i++) { if (toUrls[i] == null) continue; + + if (ignoreExternalLinks) { + try { + toHost = new URL(toUrls[i]).getHost().toLowerCase(); + } catch (MalformedURLException e) { + toHost = null; + } + if (toHost == null || !toHost.equals(fromHost)) { // external links + continue; // skip it + } + } + CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); UTF8 targetUrl = new UTF8(toUrls[i]); adjust = null; --- conf/nutch-default.xml.orig 2006-05-20 
02:20:50.000000000 +0200 +++ conf/nutch-default.xml 2006-05-20 19:46:22.000000000 +0200 @@ -330,6 +330,13 @@ required that a local caching DNS be used. + + crawl.ignore.external.links + false + If true, all links whose hostname differs between the page and + the link target are ignored. + +