Array of
+ * {@link URLNormalizer}s for.
+ * @return An Array of {@link URLNormalizer}s for the given scope.
+ * @throws PluginRuntimeException
+ */
+ private URLNormalizer[] getURLNormalizers(String scope) {
+   List extensions = getExtensions(scope);
+   // Reference comparison on purpose: the shared sentinel means "none".
+   if (extensions == EMPTY_EXTENSION_LIST) {
+     return EMPTY_NORMALIZERS;
+   }
+
+   // Method-local list, so the synchronized Vector is not needed.
+   List normalizers = new ArrayList(extensions.size());
+   Iterator it = extensions.iterator();
+   while (it.hasNext()) {
+     Extension ext = (Extension) it.next();
+     try {
+       // check to see if we've cached this URLNormalizer instance yet
+       URLNormalizer normalizer =
+           (URLNormalizer) this.conf.getObject(ext.getId());
+       if (normalizer == null) {
+         // go ahead and instantiate it and then cache it
+         normalizer = (URLNormalizer) ext.getExtensionInstance();
+         this.conf.setObject(ext.getId(), normalizer);
+       }
+       normalizers.add(normalizer);
+     } catch (PluginRuntimeException e) {
+       // Pass the exception to the logger (no stderr dump), then continue.
+       LOG.warn("URLNormalizers:PluginRuntimeException when "
+           + "initializing url normalizer plugin "
+           + ext.getDescriptor().getPluginId()
+           + " instance in getURLNormalizers "
+           + "function: attempting to continue instantiating plugins", e);
+     }
+   }
+   return (URLNormalizer[]) normalizers.toArray(
+       new URLNormalizer[normalizers.size()]);
+ }
+
+ /**
+  * Returns the extensions to use for a scope. The list is cached on conf,
+  * so an earlier result is reused instead of searching again.
+  *
+  * @param scope Scope for which we seek normalizer extensions.
+  * @return a list of extensions to be used for this scope. If none is
+  *         known, returns the shared empty list.
+  */
+ private List getExtensions(String scope) {
+   // One cache entry per scope, stored on conf under this key.
+   String cacheKey = URLNormalizer.X_POINT_ID + "_x_" + scope;
+   List cached = (List) this.conf.getObject(cacheKey);
+
+   // Just compare the reference:
+   // if this is the empty list, we know we will find no extension.
+   if (cached == EMPTY_EXTENSION_LIST) {
+     return EMPTY_EXTENSION_LIST;
+   }
+   if (cached != null) {
+     return cached;
+   }
+
+   // Nothing cached yet: search, then remember the outcome.
+   List found = findExtensions(scope);
+   if (found == null) {
+     // Cache the empty list to remember we know no related extension.
+     found = EMPTY_EXTENSION_LIST;
+   }
+
+   this.conf.setObject(cacheKey, found);
+   return found;
+ }
+
+ /**
+  * Collects the url normalizer extensions that apply to the given scope and
+  * arranges them in the configured order.
+  *
+  * @param scope Scope for which we seek url normalizer plugins.
+  * @return List of extensions to be used for this scope; an empty list (this
+  *         method never returns null) when nothing matches.
+  */
+ private List findExtensions(String scope) {
+   // Ordering: the scope-specific property wins over the global one.
+   String orderSpec = conf.get("urlnormalizer.order." + scope);
+   if (orderSpec == null) {
+     orderSpec = conf.get("urlnormalizer.order");
+   }
+   String[] orders = null;
+   if (orderSpec != null && orderSpec.trim().length() > 0) {
+     orders = orderSpec.split("\\s+");
+   }
+
+   // Optional set of names, matched against getClazz(), for this scope.
+   String scopeSpec = conf.get("urlnormalizer.scope." + scope);
+   Set allowed = null;
+   if (scopeSpec != null && scopeSpec.trim().length() > 0) {
+     allowed = new HashSet(Arrays.asList(scopeSpec.split("\\s+")));
+   }
+
+   // Index the admitted extensions by their getClazz() value.
+   Extension[] all = this.extensionPoint.getExtensions();
+   HashMap byClass = new HashMap();
+   for (int i = 0; i < all.length; i++) {
+     if (allowed == null || allowed.contains(all[i].getClazz())) {
+       byClass.put(all[i].getClazz(), all[i]);
+     }
+   }
+
+   List res = new ArrayList();
+   // first add those explicitly named in correct order
+   for (int i = 0; orders != null && i < orders.length; i++) {
+     Object named = byClass.remove(orders[i]);
+     if (named != null) {
+       res.add(named);
+     }
+   }
+   // then add all others in random order
+   res.addAll(byClass.values());
+   return res;
+ }
+
+ /** Runs urlString through every normalizer in turn; null stops the chain. */
+ public String normalize(String urlString, String scope)
+     throws MalformedURLException {
+   int idx = 0;
+   while (idx < this.normalizers.length && urlString != null) {
+     urlString = this.normalizers[idx++].normalize(urlString, scope);
+   }
+   return urlString;
+ }
+}
Property changes on: src/java/org/apache/nutch/net/URLNormalizers.java
___________________________________________________________________
Name: svn:eol-style
+ native
Index: src/java/org/apache/nutch/net/UrlNormalizerFactory.java
===================================================================
--- src/java/org/apache/nutch/net/UrlNormalizerFactory.java (revision 440417)
+++ src/java/org/apache/nutch/net/UrlNormalizerFactory.java (working copy)
@@ -1,57 +0,0 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.net;
-
-// Commons Logging imports
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.conf.*;
-
-
-/** Factory to create a UrlNormalizer from "urlnormalizer.class" config property. */
-public class UrlNormalizerFactory {
- private static final Log LOG = LogFactory.getLog(UrlNormalizerFactory.class);
-
- private Configuration conf;
-
- public UrlNormalizerFactory(Configuration conf) {
- this.conf = conf;
- }
-
- /** Return the default UrlNormalizer implementation. */
- public UrlNormalizer getNormalizer() {
- String urlNormalizer = null;
- UrlNormalizer normalizer = (UrlNormalizer) this.conf
- .getObject(UrlNormalizer.class.getName());
- if (normalizer == null) {
- try {
- urlNormalizer = this.conf.get("urlnormalizer.class");
- if (LOG.isInfoEnabled()) {
- LOG.info("Using URL normalizer: " + urlNormalizer);
- }
- Class normalizerClass = Class.forName(urlNormalizer);
- normalizer = (UrlNormalizer) normalizerClass.newInstance();
- normalizer.setConf(this.conf);
- this.conf.setObject(UrlNormalizer.class.getName(), normalizer);
- } catch (Exception e) {
- throw new RuntimeException("Couldn't create " + urlNormalizer, e);
- }
- }
- return normalizer;
- }
-}
Index: src/java/org/apache/nutch/crawl/LinkDb.java
===================================================================
--- src/java/org/apache/nutch/crawl/LinkDb.java (revision 440417)
+++ src/java/org/apache/nutch/crawl/LinkDb.java (working copy)
@@ -32,8 +32,7 @@
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.net.URLFilters;
-import org.apache.nutch.net.UrlNormalizer;
-import org.apache.nutch.net.UrlNormalizerFactory;
+import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.*;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -50,7 +49,7 @@
private int maxInlinks;
private boolean ignoreInternalLinks;
private URLFilters urlFilters;
- private UrlNormalizer urlNormalizer;
+ private URLNormalizers urlNormalizers;
public static class Merger extends MapReduceBase implements Reducer {
private int _maxInlinks;
@@ -98,7 +97,7 @@
urlFilters = new URLFilters(job);
}
if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
- urlNormalizer = new UrlNormalizerFactory(job).getNormalizer();
+ urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
}
}
@@ -109,9 +108,9 @@
throws IOException {
String fromUrl = key.toString();
String fromHost = getHost(fromUrl);
- if (urlNormalizer != null) {
+ if (urlNormalizers != null) {
try {
- fromUrl = urlNormalizer.normalize(fromUrl); // normalize the url
+ fromUrl = urlNormalizers.normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
} catch (Exception e) {
LOG.warn("Skipping " + fromUrl + ":" + e);
fromUrl = null;
@@ -139,9 +138,9 @@
continue; // skip it
}
}
- if (urlNormalizer != null) {
+ if (urlNormalizers != null) {
try {
- toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+ toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url
} catch (Exception e) {
LOG.warn("Skipping " + toUrl + ":" + e);
toUrl = null;
Index: src/java/org/apache/nutch/crawl/LinkDbFilter.java
===================================================================
--- src/java/org/apache/nutch/crawl/LinkDbFilter.java (revision 440417)
+++ src/java/org/apache/nutch/crawl/LinkDbFilter.java (working copy)
@@ -28,8 +28,7 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.net.URLFilters;
-import org.apache.nutch.net.UrlNormalizer;
-import org.apache.nutch.net.UrlNormalizerFactory;
+import org.apache.nutch.net.URLNormalizers;
/**
* This class provides a way to separate the URL normalization
@@ -42,16 +41,20 @@
public static final String URL_NORMALIZING = "linkdb.url.normalizer";
+ public static final String URL_NORMALIZING_SCOPE = "linkdb.url.normalizer.scope";
+
private boolean filter;
private boolean normalize;
private URLFilters filters;
- private UrlNormalizer normalizer;
+ private URLNormalizers normalizers;
private JobConf jobConf;
+ private String scope;
+
public static final Log LOG = LogFactory.getLog(LinkDbFilter.class);
public void configure(JobConf job) {
@@ -62,7 +65,8 @@
filters = new URLFilters(job);
}
if (normalize) {
- normalizer = new UrlNormalizerFactory(job).getNormalizer();
+ scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
+ normalizers = new URLNormalizers(job, scope);
}
}
@@ -72,7 +76,7 @@
String url = key.toString();
if (normalize) {
try {
- url = normalizer.normalize(url); // normalize the url
+ url = normalizers.normalize(url, scope); // normalize the url
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
url = null;
@@ -95,7 +99,7 @@
fromUrl = inlink.getFromUrl();
if (normalize) {
try {
- fromUrl = normalizer.normalize(fromUrl); // normalize the url
+ fromUrl = normalizers.normalize(fromUrl, scope); // normalize the url
} catch (Exception e) {
LOG.warn("Skipping " + fromUrl + ":" + e);
fromUrl = null;
Index: src/java/org/apache/nutch/crawl/Injector.java
===================================================================
--- src/java/org/apache/nutch/crawl/Injector.java (revision 440417)
+++ src/java/org/apache/nutch/crawl/Injector.java (working copy)
@@ -44,16 +44,16 @@
/** Normalize and filter injected urls. */
public static class InjectMapper implements Mapper {
- private UrlNormalizer urlNormalizer;
+ private URLNormalizers urlNormalizers;
private float interval;
private float scoreInjected;
private JobConf jobConf;
private URLFilters filters;
- private ScoringFilters scfilters;
+ private ScoringFilters scfilters;
public void configure(JobConf job) {
this.jobConf = job;
- urlNormalizer = new UrlNormalizerFactory(jobConf).getNormalizer();
+ urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
interval = jobConf.getFloat("db.default.fetch.interval", 30f);
filters = new URLFilters(jobConf);
scfilters = new ScoringFilters(jobConf);
@@ -69,7 +69,7 @@
String url = value.toString(); // value is line of text
// System.out.println("url: " +url);
try {
- url = urlNormalizer.normalize(url); // normalize the url
+ url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); // normalize the url
url = filters.filter(url); // filter the url
} catch (Exception e) {
if (LOG.isWarnEnabled()) { LOG.warn("Skipping " +url+":"+e); }
Index: src/java/org/apache/nutch/crawl/CrawlDbFilter.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbFilter.java (revision 440417)
+++ src/java/org/apache/nutch/crawl/CrawlDbFilter.java (working copy)
@@ -28,8 +28,7 @@
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.net.URLFilters;
-import org.apache.nutch.net.UrlNormalizer;
-import org.apache.nutch.net.UrlNormalizerFactory;
+import org.apache.nutch.net.URLNormalizers;
/**
* This class provides a way to separate the URL normalization
@@ -40,29 +39,34 @@
public class CrawlDbFilter implements Mapper {
public static final String URL_FILTERING = "crawldb.url.filters";
- public static final String URL_NORMALIZING = "crawldb.url.normalizer";
+ public static final String URL_NORMALIZING = "crawldb.url.normalizers";
+ public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
+
private boolean urlFiltering;
- private boolean urlNormalizer;
+ private boolean urlNormalizers;
private URLFilters filters;
- private UrlNormalizer normalizer;
+ private URLNormalizers normalizers;
private JobConf jobConf;
+
+ private String scope;
public static final Log LOG = LogFactory.getLog(CrawlDbFilter.class);
public void configure(JobConf job) {
this.jobConf = job;
urlFiltering = job.getBoolean(URL_FILTERING, false);
- urlNormalizer = job.getBoolean(URL_NORMALIZING, false);
+ urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
if (urlFiltering) {
filters = new URLFilters(job);
}
- if (urlNormalizer) {
- normalizer = new UrlNormalizerFactory(job).getNormalizer();
+ if (urlNormalizers) {
+ scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
+ normalizers = new URLNormalizers(job, scope);
}
}
@@ -71,9 +75,9 @@
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
String url = key.toString();
- if (urlNormalizer) {
+ if (urlNormalizers) {
try {
- url = normalizer.normalize(url); // normalize the url
+ url = normalizers.normalize(url, scope); // normalize the url
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
url = null;
Index: src/java/org/apache/nutch/crawl/PartitionUrlByHost.java
===================================================================
--- src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (revision 440417)
+++ src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (working copy)
@@ -19,15 +19,22 @@
import java.net.URL;
import java.net.MalformedURLException;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
+import org.apache.nutch.net.URLNormalizers;
/** Partition urls by hostname. */
public class PartitionUrlByHost implements Partitioner {
+ private static final Log LOG = LogFactory.getLog(PartitionUrlByHost.class);
+
private int seed;
+ private URLNormalizers normalizers;
public void configure(JobConf job) {
seed = job.getInt("partition.url.by.host.seed", 0);
+ normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
}
public void close() {}
@@ -36,10 +43,16 @@
public int getPartition(WritableComparable key, Writable value,
int numReduceTasks) {
String urlString = ((UTF8)key).toString();
+ try {
+ urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
+ } catch (Exception e) {
+ LOG.warn("Malformed URL: '" + urlString + "'");
+ }
URL url = null;
try {
url = new URL(urlString);
} catch (MalformedURLException e) {
+ LOG.warn("Malformed URL: '" + urlString + "'");
}
int hashCode = (url==null ? urlString : url.getHost()).hashCode();
Index: src/java/org/apache/nutch/crawl/Generator.java
===================================================================
--- src/java/org/apache/nutch/crawl/Generator.java (revision 440417)
+++ src/java/org/apache/nutch/crawl/Generator.java (working copy)
@@ -33,6 +33,7 @@
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
@@ -77,6 +78,7 @@
private int maxPerHost;
private Partitioner hostPartitioner = new PartitionUrlByHost();
private URLFilters filters;
+ private URLNormalizers normalizers;
private ScoringFilters scfilters;
private SelectorEntry entry = new SelectorEntry();
private FloatWritable sortValue = new FloatWritable();
@@ -89,7 +91,9 @@
maxPerHost = job.getInt("generate.max.per.host", -1);
byIP = job.getBoolean("generate.max.per.host.by.ip", false);
filters = new URLFilters(job);
+ normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
scfilters = new ScoringFilters(job);
+ hostPartitioner.configure(job);
}
public void close() {}
@@ -170,6 +174,12 @@
continue;
}
}
+ try {
+ host = normalizers.normalize(host, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
+ host = new URL(host).getHost().toLowerCase();
+ } catch (Exception e) {
+ LOG.warn("Malformed URL: '" + host + "', skipping");
+ }
IntWritable hostCount = (IntWritable)hostCounts.get(host);
if (hostCount == null) {
hostCount = new IntWritable();
Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java (revision 440417)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java (working copy)
@@ -40,7 +40,7 @@
public class ParseOutputFormat implements OutputFormat {
private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class);
- private UrlNormalizer urlNormalizer;
+ private URLNormalizers urlNormalizers;
private URLFilters filters;
private ScoringFilters scfilters;
@@ -52,7 +52,7 @@
public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
String name, Progressable progress) throws IOException {
- this.urlNormalizer = new UrlNormalizerFactory(job).getNormalizer();
+ this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
this.filters = new URLFilters(job);
this.scfilters = new ScoringFilters(job);
final float interval = job.getFloat("db.default.fetch.interval", 30f);
@@ -116,7 +116,7 @@
for (int i = 0; i < links.length; i++) {
String toUrl = links[i].getToUrl();
try {
- toUrl = urlNormalizer.normalize(toUrl); // normalize the url
+ toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); // normalize the url
toUrl = filters.filter(toUrl); // filter the url
} catch (Exception e) {
toUrl = null;
Index: src/java/org/apache/nutch/parse/Outlink.java
===================================================================
--- src/java/org/apache/nutch/parse/Outlink.java (revision 440417)
+++ src/java/org/apache/nutch/parse/Outlink.java (working copy)
@@ -20,7 +20,7 @@
import java.net.MalformedURLException;
import org.apache.hadoop.io.*;
-import org.apache.nutch.net.UrlNormalizerFactory;
+import org.apache.nutch.net.URLNormalizers;
import org.apache.hadoop.conf.Configuration;
/* An outgoing link from a page. */
@@ -32,7 +32,7 @@
public Outlink() {}
public Outlink(String toUrl, String anchor, Configuration conf) throws MalformedURLException {
- this.toUrl = new UrlNormalizerFactory(conf).getNormalizer().normalize(toUrl);
+ this.toUrl = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK).normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
this.anchor = anchor;
}
Index: src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
===================================================================
--- src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (revision 0)
+++ src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (revision 0)
@@ -0,0 +1,104 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for BasicURLNormalizer. Every case pairs a raw URL with the
+ * string that normalize() must return for it under the default scope.
+ */
+public class TestBasicURLNormalizer extends TestCase {
+
+  /** Rows of { raw url, expected normalized url }, checked in order. */
+  private static final String[][] CASES = {
+    // leading and trailing spaces are removed
+    { " http://foo.com/ ", "http://foo.com/" },
+    // protocol is lower cased
+    { "HTTP://foo.com/", "http://foo.com/" },
+    // host is lower cased
+    { "http://Foo.Com/index.html", "http://foo.com/index.html" },
+    // port number is normalized
+    { "http://foo.com:80/index.html", "http://foo.com/index.html" },
+    { "http://foo.com:81/", "http://foo.com:81/" },
+    // null path is normalized
+    { "http://foo.com", "http://foo.com/" },
+    // references are removed
+    { "http://foo.com/foo.html#ref", "http://foo.com/foo.html" },
+    // encoding normalization was disabled in the original test:
+    // { "http://foo.com/%66oo.html", "http://foo.com/foo.html" },
+    // unnecessary "../" are removed
+    { "http://foo.com/aa/../", "http://foo.com/" },
+    { "http://foo.com/aa/bb/../", "http://foo.com/aa/" },
+    { "http://foo.com/aa/..", "http://foo.com/aa/.." },
+    { "http://foo.com/aa/bb/cc/../../foo.html",
+      "http://foo.com/aa/foo.html" },
+    { "http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
+      "http://foo.com/aa/cc/ee/foo.html" },
+    { "http://foo.com/../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/../../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/../aa/../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/aa/../../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/aa/../bb/../foo.html/../../", "http://foo.com/" },
+    { "http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html" },
+    { "http://foo.com/a..a/foo.html", "http://foo.com/a..a/foo.html" },
+    { "http://foo.com/a..a/../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/foo.foo/../foo.html", "http://foo.com/foo.html" },
+  };
+
+  /** Normalizer under test. */
+  private BasicURLNormalizer normalizer;
+  /** Configuration handed to the normalizer. */
+  private Configuration conf;
+
+  /**
+   * Builds the normalizer and hands it NutchConfiguration.create().
+   *
+   * @param name test name, passed on to TestCase
+   */
+  public TestBasicURLNormalizer(String name) {
+    super(name);
+    normalizer = new BasicURLNormalizer();
+    conf = NutchConfiguration.create();
+    normalizer.setConf(conf);
+  }
+
+  /** Checks every row of CASES, stopping at the first mismatch. */
+  public void testNormalizer() throws Exception {
+    for (int i = 0; i < CASES.length; i++) {
+      normalizeTest(CASES[i][0], CASES[i][1]);
+    }
+  }
+
+  /**
+   * Asserts that normalizing weird under the default scope yields normal.
+   */
+  private void normalizeTest(String weird, String normal) throws Exception {
+    assertEquals(normal,
+        normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
+  }
+
+  /** Runs the test without a JUnit runner. */
+  public static void main(String[] args) throws Exception {
+    new TestBasicURLNormalizer("test").testNormalizer();
+  }
+
+}
Property changes on: src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
___________________________________________________________________
Name: svn:eol-style
+ native
Index: src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
===================================================================
--- src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (revision 0)
+++ src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (revision 0)
@@ -0,0 +1,191 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.basic;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+// Nutch imports
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.util.LogUtil;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.*;
+
+/**
+ * Converts URLs to a normal form. For http and ftp URLs this lower-cases the
+ * host, drops an explicit default port, supplies "/" for an empty path,
+ * removes the reference part and collapses unnecessary "/../" segments.
+ */
+public class BasicURLNormalizer implements URLNormalizer {
+  public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class);
+
+  private Perl5Compiler compiler = new Perl5Compiler();
+
+  /** Supplies a Perl5Matcher through ThreadLocal.initialValue(). */
+  private ThreadLocal matchers = new ThreadLocal() {
+    protected synchronized Object initialValue() {
+      return new Perl5Matcher();
+    }
+  };
+
+  /** Replaces a "/xx/../" spot in a path by "/". */
+  private Rule relativePathRule = null;
+  /** Replaces leading "/../" segments of a path by "/". */
+  private Rule leadingRelativePathRule = null;
+
+  private Configuration conf;
+
+  /**
+   * Compiles the two path rewriting rules.
+   *
+   * @throws RuntimeException wrapping the MalformedPatternException if a
+   *         pattern does not compile
+   */
+  public BasicURLNormalizer() {
+    try {
+      // this pattern tries to find spots like "/xx/../" in the url, which
+      // could be replaced by "/" xx consists of chars, different then "/"
+      // (slash) and needs to have at least one char different from "."
+      relativePathRule =
+          slashRule("(/[^/]*[^/.]{1}[^/]*/\\.\\./)");
+
+      // this pattern tries to find spots like leading "/../" in the url,
+      // which could be replaced by "/"
+      leadingRelativePathRule = slashRule("^(/\\.\\./)+");
+    } catch (MalformedPatternException e) {
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Builds a rule that substitutes "/" for matches of the given regex. */
+  private Rule slashRule(String regex) throws MalformedPatternException {
+    Rule rule = new Rule();
+    rule.pattern = (Perl5Pattern)
+        compiler.compile(regex, Perl5Compiler.READ_ONLY_MASK);
+    rule.substitution = new Perl5Substitution("/");
+    return rule;
+  }
+
+  /**
+   * Normalizes a single URL.
+   *
+   * @param urlString the URL to normalize; the empty string is returned as is
+   * @param scope normalization scope; not consulted by this implementation
+   * @return the trimmed URL, rebuilt from its parts if anything was changed
+   * @throws MalformedURLException if the trimmed string is not a valid URL
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    if ("".equals(urlString)) {                   // permit empty
+      return urlString;
+    }
+
+    urlString = urlString.trim();                 // remove extra spaces
+    URL url = new URL(urlString);
+
+    String protocol = url.getProtocol();
+    String host = url.getHost();
+    int port = url.getPort();
+    String file = url.getFile();
+    // a prefix mismatch means the protocol was lowercased
+    boolean changed = !urlString.startsWith(protocol);
+
+    if ("http".equals(protocol) || "ftp".equals(protocol)) {
+      if (host != null) {                         // lowercase host
+        String lowerHost = host.toLowerCase();
+        changed |= !host.equals(lowerHost);
+        host = lowerHost;
+      }
+
+      if (port == url.getDefaultPort()) {         // uses default port
+        port = -1;                                // so don't specify it
+        changed = true;
+      }
+
+      if (file == null || file.length() == 0) {   // add a slash
+        file = "/";
+        changed = true;
+      }
+
+      changed |= url.getRef() != null;            // remove the ref
+
+      // check for unnecessary use of "/../"
+      String cleaned = substituteUnnecessaryRelativePaths(file);
+      changed |= !file.equals(cleaned);
+      file = cleaned;
+    }
+
+    return changed ? new URL(protocol, host, port, file).toString() : urlString;
+  }
+
+  /**
+   * Removes unnecessary relative path segments from the file part of a URL.
+   *
+   * All substitutions will be done step by step, to ensure that certain
+   * constellations will be normalized, too. For example
+   * "/aa/bb/../../cc/../foo.html" is normalized in the following manner:
+   *   "/aa/bb/../../cc/../foo.html"
+   *   "/aa/../cc/../foo.html"
+   *   "/cc/../foo.html"
+   *   "/foo.html"
+   *
+   * Leading "/../" is replaced by "/" as well, because it is rather a sign
+   * of bad webserver configuration than of a wanted link.
+   */
+  private String substituteUnnecessaryRelativePaths(String file) {
+    Perl5Matcher matcher = (Perl5Matcher) matchers.get();
+    String path = file;
+    int lengthBefore;
+
+    // repeat until a full pass leaves the length of the path unchanged
+    do {
+      lengthBefore = path.length();
+
+      // substitute first occurrence of "/xx/../" by "/"
+      path = Util.substitute(matcher, relativePathRule.pattern,
+          relativePathRule.substitution, path, 1);
+
+      // remove leading "/../"
+      path = Util.substitute(matcher, leadingRelativePathRule.pattern,
+          leadingRelativePathRule.substitution, path, 1);
+    } while (lengthBefore != path.length());
+
+    return path;
+  }
+
+  /** Holds a compiled pattern and its corresponding substitution. */
+  private static class Rule {
+    public Perl5Pattern pattern;
+    public Perl5Substitution substitution;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}
+
Property changes on: src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
___________________________________________________________________
Name: svn:eol-style
+ native
Index: src/plugin/urlnormalizer-basic/plugin.xml
===================================================================
--- src/plugin/urlnormalizer-basic/plugin.xml (revision 0)
+++ src/plugin/urlnormalizer-basic/plugin.xml (revision 0)
@@ -0,0 +1,26 @@
+
+
++ * This class must be specified as the URL normalizer to be used in + * nutch-site.xml or nutch-default.xml. To do this + * specify the urlnormalizer.class property to have the value: + * org.apache.nutch.net.RegexUrlNormalizer. The + * urlnormalizer.regex.file property should also be set to the file + * name of an xml file which should contain the patterns and substitutions to be + * done on encountered URLs. + *
+ * + * @author Luke Baker + */ +public class RegexURLNormalizer extends Configured implements URLNormalizer { + + private static final Log LOG = LogFactory.getLog(RegexURLNormalizer.class); + + /** + * Class which holds a compiled pattern and its corresponding substition + * string. + */ + private static class Rule { + public Perl5Pattern pattern; + + public String substitution; + } + + private HashMap scopedRules; + + private static final List EMPTY_RULES = Collections.EMPTY_LIST; + + private PatternMatcher matcher = new Perl5Matcher(); + + /** + * The default constructor which is called from UrlNormalizerFactory + * (normalizerClass.newInstance()) in method: getNormalizer()* + */ + public RegexURLNormalizer() { + super(null); + } + + public RegexURLNormalizer(Configuration conf) { + super(conf); + } + + /** + * Constructor which can be passed the file name, so it doesn't look in the + * configuration files for it. + */ + public RegexURLNormalizer(Configuration conf, String filename) + throws IOException, MalformedPatternException { + super(conf); + List rules = readConfigurationFile(filename); + if (rules != null) + scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules); + } + + public void setConf(Configuration conf) { + super.setConf(conf); + if (conf == null) return; + // the default constructor was called + if (this.scopedRules == null) { + String filename = getConf().get("urlnormalizer.regex.file"); + scopedRules = new HashMap(); + URL url = getConf().getResource(filename); + List rules = null; + if (url == null) { + LOG.warn("Can't load the default config file! " + filename); + rules = EMPTY_RULES; + } else { + try { + rules = readConfiguration(url.openStream()); + } catch (Exception e) { + LOG.warn("Couldn't read default config from '" + url + "': " + e); + rules = EMPTY_RULES; + } + } + scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules); + } + } + + // used in JUnit test. 
+ void setConfiguration(InputStream is, String scope) { + List rules = readConfiguration(is); + scopedRules.put(scope, rules); + LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules."); + } + + /** + * This function does the replacements by iterating through all the regex + * patterns. It accepts a string url as input and returns the altered string. + */ + public synchronized String regexNormalize(String urlString, String scope) { + List curRules = (List)scopedRules.get(scope); + if (curRules == null) { + // try to populate + String configFile = getConf().get("urlnormalizer.regex.file." + scope); + if (configFile != null) { + URL resource = getConf().getResource(configFile); + LOG.debug("resource for scope '" + scope + "': " + resource); + if (resource == null) { + LOG.warn("Can't load resource for config file: " + configFile); + } else { + try { + InputStream is = resource.openStream(); + curRules = readConfiguration(resource.openStream()); + scopedRules.put(scope, curRules); + } catch (Exception e) { + LOG.warn("Couldn't load resource '" + resource + "': " + e); + } + } + } else { + LOG.warn("can't load rule file for scope '" + scope + "': " + configFile); + } + } + if (curRules == EMPTY_RULES || curRules == null) { + LOG.warn("can't find rules for scope '" + scope + "', using default"); + // use global rules + curRules = (List)scopedRules.get(URLNormalizers.SCOPE_DEFAULT); + } + Iterator i = curRules.iterator(); + while (i.hasNext()) { + Rule r = (Rule) i.next(); + urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution( + r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual + // substitution + } + return urlString; + } + + public synchronized String normalize(String urlString, String scope) + throws MalformedURLException { + return regexNormalize(urlString, scope); + } + + /** Reads the configuration file and populates a List of Rules. 
*/ + private List readConfigurationFile(String filename) { + if (LOG.isInfoEnabled()) { + LOG.info("loading " + filename); + } + try { + FileInputStream fis = new FileInputStream(filename); + return readConfiguration(fis); + } catch (Exception e) { + LOG.fatal("Error loading rules from '" + filename + "': " + e); + return EMPTY_RULES; + } + } + + private List readConfiguration(InputStream is) { + Perl5Compiler compiler = new Perl5Compiler(); + List rules = new ArrayList(); + try { + + // borrowed heavily from code in Configuration.java + Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() + .parse(is); + Element root = doc.getDocumentElement(); + if ((!"regex-normalize".equals(root.getTagName())) + && (LOG.isFatalEnabled())) { + LOG.fatal("bad conf file: top-level element not