--- src/java/org/apache/nutch/parse/ParseOutputFormat.java.orig 2006-05-20 02:20:51.000000000 +0200
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java 2006-05-20 19:40:05.000000000 +0200
@@ -27,6 +27,7 @@
import org.apache.nutch.net.*;
import java.io.*;
+import java.net.*;
import java.util.ArrayList;
import java.util.logging.Logger;
@@ -54,6 +55,7 @@
final float interval = job.getFloat("db.default.fetch.interval", 30f);
final float extscore = job.getFloat("db.score.link.external", 1.0f);
final boolean countFiltered = job.getBoolean("db.score.count.filtered", false);
+ final boolean ignoreExternalLinks = job.getBoolean("crawl.ignore.external.links", false); // defaults to false; when true, outlinks whose host differs from the page's host are skipped
Path text =
new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name);
@@ -78,7 +80,10 @@
throws IOException {
Parse parse = (Parse)value;
-
+ String fromUrl = key.toString(); // page URL, taken from the record key
+ String fromHost = null; // lower-cased host of fromUrl; only set when external links are ignored
+ String toHost = null; // lower-cased host of the outlink currently being examined
+
textOut.append(key, new ParseText(parse.getText()));
ParseData parseData = parse.getData();
@@ -97,6 +102,16 @@
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
+ // Host of the source page, lower-cased with a fixed locale so default-locale
+ // case rules (e.g. Turkish dotless i) cannot alter it; stays null when unused.
+ if (ignoreExternalLinks) {
+   try {
+     fromHost = new URL(fromUrl).getHost().toLowerCase(java.util.Locale.ENGLISH);
+   } catch (MalformedURLException e) {
+     fromHost = null; // unparsable page URL: every outlink is skipped below
+   }
+ }
+
String[] toUrls = new String[links.length];
int validCount = 0;
for (int i = 0; i < links.length; i++) {
@@ -114,6 +129,18 @@
// compute score contributions and adjustment to the original score
for (int i = 0; i < toUrls.length; i++) {
if (toUrls[i] == null) continue;
+
+ // With crawl.ignore.external.links set, keep only outlinks whose host
+ // equals the source page's host; both hosts are compared lower-cased.
+ if (ignoreExternalLinks) {
+   try {
+     toHost = new URL(toUrls[i]).getHost().toLowerCase(java.util.Locale.ENGLISH);
+   } catch (MalformedURLException e) {
+     toHost = null; // unparsable target URL: treated as external
+   }
+   if (toHost == null || !toHost.equals(fromHost)) continue; // external link
+ }
+
CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
UTF8 targetUrl = new UTF8(toUrls[i]);
adjust = null;
--- conf/nutch-default.xml.orig 2006-05-20 02:20:50.000000000 +0200
+++ conf/nutch-default.xml 2006-05-20 19:46:22.000000000 +0200
@@ -330,6 +330,13 @@
required that a local caching DNS be used.
+<property>
+  <name>crawl.ignore.external.links</name>
+  <value>false</value>
+  <description>If true, outlinks whose hostname differs from that of the
+  page they were found on are ignored.</description>
+</property>
+