Index: conf/nutch-default.xml =================================================================== --- conf/nutch-default.xml (revision 440417) +++ conf/nutch-default.xml (working copy) @@ -622,12 +622,6 @@ - urlnormalizer.class - org.apache.nutch.net.BasicUrlNormalizer - Name of the class used to normalize URLs. - - - urlnormalizer.regex.file regex-normalize.xml Name of the config file used by the RegexUrlNormalizer class. @@ -669,7 +663,7 @@ plugin.includes - protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic + protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic) Regular expression naming plugin directory names to include. Any plugin not matching this expression is excluded. In any case you need at least include the nutch-extensionpoints plugin. By Index: src/test/org/apache/nutch/net/TestRegexUrlNormalizer.java =================================================================== --- src/test/org/apache/nutch/net/TestRegexUrlNormalizer.java (revision 440417) +++ src/test/org/apache/nutch/net/TestRegexUrlNormalizer.java (working copy) @@ -1,50 +0,0 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.net; - -import org.apache.nutch.net.RegexUrlNormalizer; - -/** Unit tests for RegexUrlNormalizer. 
*/ -public class TestRegexUrlNormalizer extends TestBasicUrlNormalizer { - public TestRegexUrlNormalizer(String name) { super(name); } - - public void testNormalizer() throws Exception { - normalizeTest("http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03", - "http://foo.com/foo.php?f=2"); - normalizeTest("http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3", - "http://foo.com/foo.php?f=2&q=3"); - normalizeTest("http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2", - "http://foo.com/foo.php?f=2"); - normalizeTest("http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03", - "http://foo.com/foo.php"); - } - - private void normalizeTest(String weird, String normal) throws Exception { - String testSrcDir = System.getProperty("test.src.dir"); - String path = testSrcDir + "/org/apache/nutch/net/test-regex-normalize.xml"; - RegexUrlNormalizer normalizer = new RegexUrlNormalizer(path); - assertEquals(normal, normalizer.normalize(weird)); - } - - public static void main(String[] args) throws Exception { - new TestRegexUrlNormalizer("test").testNormalizer(); - new TestBasicUrlNormalizer("test").testNormalizer(); // need to make sure it passes this test too - } - - - -} Index: src/test/org/apache/nutch/net/TestBasicUrlNormalizer.java =================================================================== --- src/test/org/apache/nutch/net/TestBasicUrlNormalizer.java (revision 440417) +++ src/test/org/apache/nutch/net/TestBasicUrlNormalizer.java (working copy) @@ -1,94 +0,0 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.net; - -import org.apache.nutch.util.NutchConfiguration; - -import junit.framework.TestCase; - -/** Unit tests for BasicUrlNormalizer. */ -public class TestBasicUrlNormalizer extends TestCase { - public TestBasicUrlNormalizer(String name) { super(name); } - - public void testNormalizer() throws Exception { - // check that leading and trailing spaces are removed - normalizeTest(" http://foo.com/ ", "http://foo.com/"); - - // check that protocol is lower cased - normalizeTest("HTTP://foo.com/", "http://foo.com/"); - - // check that host is lower cased - normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); - normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html"); - - // check that port number is normalized - normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html"); - normalizeTest("http://foo.com:81/", "http://foo.com:81/"); - - // check that null path is normalized - normalizeTest("http://foo.com", "http://foo.com/"); - - // check that references are removed - normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html"); - - // // check that encoding is normalized - // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html"); - - // check that unnecessary "../" are removed - normalizeTest("http://foo.com/aa/../", - "http://foo.com/" ); - normalizeTest("http://foo.com/aa/bb/../", - "http://foo.com/aa/"); - normalizeTest("http://foo.com/aa/..", - "http://foo.com/aa/.."); - normalizeTest("http://foo.com/aa/bb/cc/../../foo.html", - 
"http://foo.com/aa/foo.html"); - normalizeTest("http://foo.com/aa/bb/../cc/dd/../ee/foo.html", - "http://foo.com/aa/cc/ee/foo.html"); - normalizeTest("http://foo.com/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/../../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/../aa/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/aa/../../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/aa/../bb/../foo.html/../../", - "http://foo.com/" ); - normalizeTest("http://foo.com/../aa/foo.html", - "http://foo.com/aa/foo.html" ); - normalizeTest("http://foo.com/../aa/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/a..a/foo.html", - "http://foo.com/a..a/foo.html" ); - normalizeTest("http://foo.com/a..a/../foo.html", - "http://foo.com/foo.html" ); - normalizeTest("http://foo.com/foo.foo/../foo.html", - "http://foo.com/foo.html" ); - } - - private void normalizeTest(String weird, String normal) throws Exception { - assertEquals(normal, new UrlNormalizerFactory(NutchConfiguration.create()).getNormalizer().normalize(weird)); - } - - public static void main(String[] args) throws Exception { - new TestBasicUrlNormalizer("test").testNormalizer(); - } - - - -} Index: src/test/org/apache/nutch/net/TestURLNormalizers.java =================================================================== --- src/test/org/apache/nutch/net/TestURLNormalizers.java (revision 0) +++ src/test/org/apache/nutch/net/TestURLNormalizers.java (revision 0) @@ -0,0 +1,39 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestURLNormalizers extends TestCase { + + public void testURLNormalizers() { + Configuration conf = NutchConfiguration.create(); + + URLNormalizers normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT); + + assertNotNull(normalizers); + try { + normalizers.normalize("http://www.example.com/", URLNormalizers.SCOPE_DEFAULT); + } catch (MalformedURLException mue) { + fail(mue.toString()); + } + } +} Property changes on: src/test/org/apache/nutch/net/TestURLNormalizers.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/nutch/fetcher/Fetcher.java =================================================================== --- src/java/org/apache/nutch/fetcher/Fetcher.java (revision 440417) +++ src/java/org/apache/nutch/fetcher/Fetcher.java (working copy) @@ -83,7 +83,7 @@ private URLFilters urlFilters; private ScoringFilters scfilters; private ParseUtil parseUtil; - private UrlNormalizer normalizer; + private URLNormalizers normalizers; private ProtocolFactory protocolFactory; public FetcherThread(Configuration conf) { @@ -94,7 +94,7 @@ this.scfilters = new ScoringFilters(conf); this.parseUtil = new ParseUtil(conf); this.protocolFactory = new ProtocolFactory(conf); - this.normalizer = new UrlNormalizerFactory(conf).getNormalizer(); + 
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER); } public void run() { @@ -155,7 +155,7 @@ if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) { String newUrl = pstatus.getMessage(); - newUrl = normalizer.normalize(newUrl); + newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = this.urlFilters.filter(newUrl); if (newUrl != null && !newUrl.equals(url.toString())) { url = new UTF8(newUrl); @@ -174,7 +174,7 @@ case ProtocolStatus.MOVED: // redirect case ProtocolStatus.TEMP_MOVED: String newUrl = status.getMessage(); - newUrl = normalizer.normalize(newUrl); + newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); newUrl = this.urlFilters.filter(newUrl); if (newUrl != null && !newUrl.equals(url.toString())) { url = new UTF8(newUrl); Index: src/java/org/apache/nutch/net/UrlNormalizer.java =================================================================== --- src/java/org/apache/nutch/net/UrlNormalizer.java (revision 440417) +++ src/java/org/apache/nutch/net/UrlNormalizer.java (working copy) @@ -1,29 +0,0 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.nutch.net; - -import java.net.MalformedURLException; - -import org.apache.hadoop.conf.Configurable; - -/** Interface used to convert URLs to normal form and optionally do regex substitutions */ -public interface UrlNormalizer extends Configurable { - - /* Interface for URL normalization */ - public String normalize(String urlString) throws MalformedURLException; - -} Index: src/java/org/apache/nutch/net/URLNormalizer.java =================================================================== --- src/java/org/apache/nutch/net/URLNormalizer.java (revision 0) +++ src/java/org/apache/nutch/net/URLNormalizer.java (revision 0) @@ -0,0 +1,32 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.net; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configurable; + +/** Interface used to convert URLs to normal form and optionally perform substitutions */ +public interface URLNormalizer extends Configurable { + + /* Extension ID */ + public static final String X_POINT_ID = URLNormalizer.class.getName(); + + /* Interface for URL normalization */ + public String normalize(String urlString, String scope) throws MalformedURLException; + +} Property changes on: src/java/org/apache/nutch/net/URLNormalizer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/nutch/net/URLNormalizers.java =================================================================== --- src/java/org/apache/nutch/net/URLNormalizers.java (revision 0) +++ src/java/org/apache/nutch/net/URLNormalizers.java (revision 0) @@ -0,0 +1,219 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.net; + +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.Vector; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.plugin.Extension; +import org.apache.nutch.plugin.ExtensionPoint; +import org.apache.nutch.plugin.PluginRepository; +import org.apache.nutch.plugin.PluginRuntimeException; + +public final class URLNormalizers { + + public static final String SCOPE_DEFAULT = "default"; + public static final String SCOPE_PARTITION = "partition"; + public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count"; + public static final String SCOPE_FETCHER = "fetcher"; + public static final String SCOPE_CRAWLDB = "crawldb"; + public static final String SCOPE_LINKDB = "linkdb"; + public static final String SCOPE_INJECT = "inject"; + public static final String SCOPE_OUTLINK = "outlink"; + + + public static final Log LOG = LogFactory.getLog(URLNormalizers.class); + + /** Empty extension list for caching purposes. 
*/ + private final List EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST; + + private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0]; + + private Configuration conf; + + private ExtensionPoint extensionPoint; + + private URLNormalizer[] normalizers; + + public URLNormalizers(Configuration conf, String scope) { + this.conf = conf; + this.extensionPoint = PluginRepository.get(conf).getExtensionPoint( + URLNormalizer.X_POINT_ID); + + if (this.extensionPoint == null) { + throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID + + " not found."); + } + + normalizers = (URLNormalizer[])conf.getObject(URLNormalizer.X_POINT_ID + "_" + scope); + if (normalizers == null) { + normalizers = getURLNormalizers(scope); + } + if (normalizers == EMPTY_NORMALIZERS) { + normalizers = (URLNormalizer[])conf.getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT); + if (normalizers == null) { + normalizers = getURLNormalizers(SCOPE_DEFAULT); + } + } + } + + /** + * Function returns an array of {@link URLNormalizer}s for a given scope, + * with a specified order + * + * @param scope The scope to return the Array of + * {@link URLNormalizer}s for. + * @return An Array of {@link URLNormalizer}s for the given scope. 
+ * @throws PluginRuntimeException + */ + private URLNormalizer[] getURLNormalizers(String scope) { + List extensions = getExtensions(scope); + + if (extensions == EMPTY_EXTENSION_LIST) { + return EMPTY_NORMALIZERS; + } + + List normalizers = new Vector(extensions.size()); + + Iterator it = extensions.iterator(); + while (it.hasNext()) { + Extension ext = (Extension) it.next(); + URLNormalizer normalizer = null; + try { + // check to see if we've cached this URLNormalizer instance yet + normalizer = (URLNormalizer) this.conf.getObject(ext.getId()); + if (normalizer == null) { + // go ahead and instantiate it and then cache it + normalizer = (URLNormalizer) ext.getExtensionInstance(); + this.conf.setObject(ext.getId(), normalizer); + } + normalizers.add(normalizer); + } catch (PluginRuntimeException e) { + e.printStackTrace(); + LOG.warn("URLNormalizers:PluginRuntimeException when " + + "initializing url normalizer plugin " + + ext.getDescriptor().getPluginId() + + " instance in getURLNormalizers " + + "function: attempting to continue instantiating plugins"); + } + } + return (URLNormalizer[]) normalizers.toArray(new URLNormalizer[normalizers + .size()]); + } + + /** + * Finds the best-suited normalizer plugin for a given scope. + * + * @param scope Scope for which we seek a normalizer plugin. + * @return a list of extensions to be used for this scope. If none, returns + * empty list. + * @throws PluginRuntimeException + */ + private List getExtensions(String scope) { + + List extensions = (List) this.conf.getObject(URLNormalizer.X_POINT_ID + "_x_" + + scope); + + // Just compare the reference: + // if this is the empty list, we know we will find no extension. 
+ if (extensions == EMPTY_EXTENSION_LIST) { + return EMPTY_EXTENSION_LIST; + } + + if (extensions == null) { + extensions = findExtensions(scope); + if (extensions != null) { + this.conf.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, extensions); + } else { + // Put the empty extension list into cache + // to remember we don't know any related extension. + this.conf.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, EMPTY_EXTENSION_LIST); + extensions = EMPTY_EXTENSION_LIST; + } + } + return extensions; + } + + /** + * searches a list of suitable url normalizer plugins for the given scope. + * + * @param scope Scope for which we seek a url normalizer plugin. + * @return List - List of extensions to be used for this scope. If none, + * returns null. + * @throws PluginRuntimeException + */ + private List findExtensions(String scope) { + + String[] orders = null; + String orderlist = conf.get("urlnormalizer.order." + scope); + if (orderlist == null) orderlist = conf.get("urlnormalizer.order"); + if (orderlist != null && !orderlist.trim().equals("")) { + orders = orderlist.split("\\s+"); + } + String scopelist = conf.get("urlnormalizer.scope." 
+ scope); + Set impls = null; + if (scopelist != null && !scopelist.trim().equals("")) { + String[] names = scopelist.split("\\s+"); + impls = new HashSet(Arrays.asList(names)); + } + Extension[] extensions = this.extensionPoint.getExtensions(); + HashMap normalizerExtensions = new HashMap(); + for (int i = 0; i < extensions.length; i++) { + Extension extension = extensions[i]; + if (impls != null && !impls.contains(extension.getClazz())) + continue; + normalizerExtensions.put(extension.getClazz(), extension); + } + List res = new ArrayList(); + if (orders == null) { + res.addAll(normalizerExtensions.values()); + } else { + // first add those explicitly named in correct order + for (int i = 0; i < orders.length; i++) { + Extension e = (Extension)normalizerExtensions.get(orders[i]); + if (e != null) { + res.add(e); + normalizerExtensions.remove(orders[i]); + } + } + // then add all others in random order + res.addAll(normalizerExtensions.values()); + } + return res; + } + + public String normalize(String urlString, String scope) + throws MalformedURLException { + for (int i = 0; i < this.normalizers.length; i++) { + if (urlString == null) + return null; + urlString = this.normalizers[i].normalize(urlString, scope); + } + return urlString; + } +} Property changes on: src/java/org/apache/nutch/net/URLNormalizers.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/java/org/apache/nutch/net/UrlNormalizerFactory.java =================================================================== --- src/java/org/apache/nutch/net/UrlNormalizerFactory.java (revision 440417) +++ src/java/org/apache/nutch/net/UrlNormalizerFactory.java (working copy) @@ -1,57 +0,0 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.nutch.net; - -// Commons Logging imports -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import org.apache.hadoop.conf.*; - - -/** Factory to create a UrlNormalizer from "urlnormalizer.class" config property. */ -public class UrlNormalizerFactory { - private static final Log LOG = LogFactory.getLog(UrlNormalizerFactory.class); - - private Configuration conf; - - public UrlNormalizerFactory(Configuration conf) { - this.conf = conf; - } - - /** Return the default UrlNormalizer implementation. */ - public UrlNormalizer getNormalizer() { - String urlNormalizer = null; - UrlNormalizer normalizer = (UrlNormalizer) this.conf - .getObject(UrlNormalizer.class.getName()); - if (normalizer == null) { - try { - urlNormalizer = this.conf.get("urlnormalizer.class"); - if (LOG.isInfoEnabled()) { - LOG.info("Using URL normalizer: " + urlNormalizer); - } - Class normalizerClass = Class.forName(urlNormalizer); - normalizer = (UrlNormalizer) normalizerClass.newInstance(); - normalizer.setConf(this.conf); - this.conf.setObject(UrlNormalizer.class.getName(), normalizer); - } catch (Exception e) { - throw new RuntimeException("Couldn't create " + urlNormalizer, e); - } - } - return normalizer; - } -} Index: src/java/org/apache/nutch/crawl/LinkDb.java =================================================================== --- src/java/org/apache/nutch/crawl/LinkDb.java (revision 440417) +++ src/java/org/apache/nutch/crawl/LinkDb.java (working copy) @@ -32,8 +32,7 @@ import 
org.apache.hadoop.util.StringUtils; import org.apache.nutch.net.URLFilters; -import org.apache.nutch.net.UrlNormalizer; -import org.apache.nutch.net.UrlNormalizerFactory; +import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.parse.*; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; @@ -50,7 +49,7 @@ private int maxInlinks; private boolean ignoreInternalLinks; private URLFilters urlFilters; - private UrlNormalizer urlNormalizer; + private URLNormalizers urlNormalizers; public static class Merger extends MapReduceBase implements Reducer { private int _maxInlinks; @@ -98,7 +97,7 @@ urlFilters = new URLFilters(job); } if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) { - urlNormalizer = new UrlNormalizerFactory(job).getNormalizer(); + urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB); } } @@ -109,9 +108,9 @@ throws IOException { String fromUrl = key.toString(); String fromHost = getHost(fromUrl); - if (urlNormalizer != null) { + if (urlNormalizers != null) { try { - fromUrl = urlNormalizer.normalize(fromUrl); // normalize the url + fromUrl = urlNormalizers.normalize(fromUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url } catch (Exception e) { LOG.warn("Skipping " + fromUrl + ":" + e); fromUrl = null; @@ -139,9 +138,9 @@ continue; // skip it } } - if (urlNormalizer != null) { + if (urlNormalizers != null) { try { - toUrl = urlNormalizer.normalize(toUrl); // normalize the url + toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_LINKDB); // normalize the url } catch (Exception e) { LOG.warn("Skipping " + toUrl + ":" + e); toUrl = null; Index: src/java/org/apache/nutch/crawl/LinkDbFilter.java =================================================================== --- src/java/org/apache/nutch/crawl/LinkDbFilter.java (revision 440417) +++ src/java/org/apache/nutch/crawl/LinkDbFilter.java (working copy) @@ -28,8 +28,7 @@ import org.apache.hadoop.mapred.OutputCollector; import 
org.apache.hadoop.mapred.Reporter; import org.apache.nutch.net.URLFilters; -import org.apache.nutch.net.UrlNormalizer; -import org.apache.nutch.net.UrlNormalizerFactory; +import org.apache.nutch.net.URLNormalizers; /** * This class provides a way to separate the URL normalization @@ -42,16 +41,20 @@ public static final String URL_NORMALIZING = "linkdb.url.normalizer"; + public static final String URL_NORMALIZING_SCOPE = "linkdb.url.normalizer.scope"; + private boolean filter; private boolean normalize; private URLFilters filters; - private UrlNormalizer normalizer; + private URLNormalizers normalizers; private JobConf jobConf; + private String scope; + public static final Log LOG = LogFactory.getLog(LinkDbFilter.class); public void configure(JobConf job) { @@ -62,7 +65,8 @@ filters = new URLFilters(job); } if (normalize) { - normalizer = new UrlNormalizerFactory(job).getNormalizer(); + scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB); + normalizers = new URLNormalizers(job, scope); } } @@ -72,7 +76,7 @@ String url = key.toString(); if (normalize) { try { - url = normalizer.normalize(url); // normalize the url + url = normalizers.normalize(url, scope); // normalize the url } catch (Exception e) { LOG.warn("Skipping " + url + ":" + e); url = null; @@ -95,7 +99,7 @@ fromUrl = inlink.getFromUrl(); if (normalize) { try { - fromUrl = normalizer.normalize(fromUrl); // normalize the url + fromUrl = normalizers.normalize(fromUrl, scope); // normalize the url } catch (Exception e) { LOG.warn("Skipping " + fromUrl + ":" + e); fromUrl = null; Index: src/java/org/apache/nutch/crawl/Injector.java =================================================================== --- src/java/org/apache/nutch/crawl/Injector.java (revision 440417) +++ src/java/org/apache/nutch/crawl/Injector.java (working copy) @@ -44,16 +44,16 @@ /** Normalize and filter injected urls. 
*/ public static class InjectMapper implements Mapper { - private UrlNormalizer urlNormalizer; + private URLNormalizers urlNormalizers; private float interval; private float scoreInjected; private JobConf jobConf; private URLFilters filters; - private ScoringFilters scfilters; + private ScoringFilters scfilters; public void configure(JobConf job) { this.jobConf = job; - urlNormalizer = new UrlNormalizerFactory(jobConf).getNormalizer(); + urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT); interval = jobConf.getFloat("db.default.fetch.interval", 30f); filters = new URLFilters(jobConf); scfilters = new ScoringFilters(jobConf); @@ -69,7 +69,7 @@ String url = value.toString(); // value is line of text // System.out.println("url: " +url); try { - url = urlNormalizer.normalize(url); // normalize the url + url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT); // normalize the url url = filters.filter(url); // filter the url } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Skipping " +url+":"+e); } Index: src/java/org/apache/nutch/crawl/CrawlDbFilter.java =================================================================== --- src/java/org/apache/nutch/crawl/CrawlDbFilter.java (revision 440417) +++ src/java/org/apache/nutch/crawl/CrawlDbFilter.java (working copy) @@ -28,8 +28,7 @@ import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.nutch.net.URLFilters; -import org.apache.nutch.net.UrlNormalizer; -import org.apache.nutch.net.UrlNormalizerFactory; +import org.apache.nutch.net.URLNormalizers; /** * This class provides a way to separate the URL normalization @@ -40,29 +39,34 @@ public class CrawlDbFilter implements Mapper { public static final String URL_FILTERING = "crawldb.url.filters"; - public static final String URL_NORMALIZING = "crawldb.url.normalizer"; + public static final String URL_NORMALIZING = "crawldb.url.normalizers"; + public static final String 
URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope"; + private boolean urlFiltering; - private boolean urlNormalizer; + private boolean urlNormalizers; private URLFilters filters; - private UrlNormalizer normalizer; + private URLNormalizers normalizers; private JobConf jobConf; + + private String scope; public static final Log LOG = LogFactory.getLog(CrawlDbFilter.class); public void configure(JobConf job) { this.jobConf = job; urlFiltering = job.getBoolean(URL_FILTERING, false); - urlNormalizer = job.getBoolean(URL_NORMALIZING, false); + urlNormalizers = job.getBoolean(URL_NORMALIZING, false); if (urlFiltering) { filters = new URLFilters(job); } - if (urlNormalizer) { - normalizer = new UrlNormalizerFactory(job).getNormalizer(); + if (urlNormalizers) { + scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB); + normalizers = new URLNormalizers(job, scope); } } @@ -71,9 +75,9 @@ public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException { String url = key.toString(); - if (urlNormalizer) { + if (urlNormalizers) { try { - url = normalizer.normalize(url); // normalize the url + url = normalizers.normalize(url, scope); // normalize the url } catch (Exception e) { LOG.warn("Skipping " + url + ":" + e); url = null; Index: src/java/org/apache/nutch/crawl/PartitionUrlByHost.java =================================================================== --- src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (revision 440417) +++ src/java/org/apache/nutch/crawl/PartitionUrlByHost.java (working copy) @@ -19,15 +19,22 @@ import java.net.URL; import java.net.MalformedURLException; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.*; import org.apache.hadoop.mapred.*; +import org.apache.nutch.net.URLNormalizers; /** Partition urls by hostname. 
*/ public class PartitionUrlByHost implements Partitioner { + private static final Log LOG = LogFactory.getLog(PartitionUrlByHost.class); + private int seed; + private URLNormalizers normalizers; public void configure(JobConf job) { seed = job.getInt("partition.url.by.host.seed", 0); + normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION); } public void close() {} @@ -36,10 +43,16 @@ public int getPartition(WritableComparable key, Writable value, int numReduceTasks) { String urlString = ((UTF8)key).toString(); + try { + urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION); + } catch (Exception e) { + LOG.warn("Malformed URL: '" + urlString + "'"); + } URL url = null; try { url = new URL(urlString); } catch (MalformedURLException e) { + LOG.warn("Malformed URL: '" + urlString + "'"); } int hashCode = (url==null ? urlString : url.getHost()).hashCode(); Index: src/java/org/apache/nutch/crawl/Generator.java =================================================================== --- src/java/org/apache/nutch/crawl/Generator.java (revision 440417) +++ src/java/org/apache/nutch/crawl/Generator.java (working copy) @@ -33,6 +33,7 @@ import org.apache.nutch.net.URLFilterException; import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLNormalizers; import org.apache.nutch.scoring.ScoringFilterException; import org.apache.nutch.scoring.ScoringFilters; import org.apache.nutch.util.NutchConfiguration; @@ -77,6 +78,7 @@ private int maxPerHost; private Partitioner hostPartitioner = new PartitionUrlByHost(); private URLFilters filters; + private URLNormalizers normalizers; private ScoringFilters scfilters; private SelectorEntry entry = new SelectorEntry(); private FloatWritable sortValue = new FloatWritable(); @@ -89,7 +91,9 @@ maxPerHost = job.getInt("generate.max.per.host", -1); byIP = job.getBoolean("generate.max.per.host.by.ip", false); filters = new URLFilters(job); + normalizers = new URLNormalizers(job, 
URLNormalizers.SCOPE_GENERATE_HOST_COUNT); scfilters = new ScoringFilters(job); + hostPartitioner.configure(job); } public void close() {} @@ -170,6 +174,12 @@ continue; } } + try { + host = normalizers.normalize(host, URLNormalizers.SCOPE_GENERATE_HOST_COUNT); + host = new URL(host).getHost().toLowerCase(); + } catch (Exception e) { + LOG.warn("Malformed URL: '" + host + "', skipping"); + } IntWritable hostCount = (IntWritable)hostCounts.get(host); if (hostCount == null) { hostCount = new IntWritable(); Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java =================================================================== --- src/java/org/apache/nutch/parse/ParseOutputFormat.java (revision 440417) +++ src/java/org/apache/nutch/parse/ParseOutputFormat.java (working copy) @@ -40,7 +40,7 @@ public class ParseOutputFormat implements OutputFormat { private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class); - private UrlNormalizer urlNormalizer; + private URLNormalizers urlNormalizers; private URLFilters filters; private ScoringFilters scfilters; @@ -52,7 +52,7 @@ public RecordWriter getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress) throws IOException { - this.urlNormalizer = new UrlNormalizerFactory(job).getNormalizer(); + this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK); this.filters = new URLFilters(job); this.scfilters = new ScoringFilters(job); final float interval = job.getFloat("db.default.fetch.interval", 30f); @@ -116,7 +116,7 @@ for (int i = 0; i < links.length; i++) { String toUrl = links[i].getToUrl(); try { - toUrl = urlNormalizer.normalize(toUrl); // normalize the url + toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); // normalize the url toUrl = filters.filter(toUrl); // filter the url } catch (Exception e) { toUrl = null; Index: src/java/org/apache/nutch/parse/Outlink.java =================================================================== --- 
src/java/org/apache/nutch/parse/Outlink.java (revision 440417) +++ src/java/org/apache/nutch/parse/Outlink.java (working copy) @@ -20,7 +20,7 @@ import java.net.MalformedURLException; import org.apache.hadoop.io.*; -import org.apache.nutch.net.UrlNormalizerFactory; +import org.apache.nutch.net.URLNormalizers; import org.apache.hadoop.conf.Configuration; /* An outgoing link from a page. */ @@ -32,7 +32,7 @@ public Outlink() {} public Outlink(String toUrl, String anchor, Configuration conf) throws MalformedURLException { - this.toUrl = new UrlNormalizerFactory(conf).getNormalizer().normalize(toUrl); + this.toUrl = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK).normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); this.anchor = anchor; } Index: src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java =================================================================== --- src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (revision 0) +++ src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (revision 0) @@ -0,0 +1,104 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.nutch.net.urlnormalizer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * Runs BasicURLNormalizer over a fixed table of URLs, using the default
+ * scope, and compares every result with the expected normal form.
+ */
+public class TestBasicURLNormalizer extends TestCase {
+
+  /**
+   * Test table. Every row is { url to normalize, expected result }; rows are
+   * checked in the order in which they are listed.
+   */
+  private static final String[][] CASES = {
+    // leading and trailing spaces are removed
+    { " http://foo.com/ ", "http://foo.com/" },
+
+    // protocol is lower cased
+    { "HTTP://foo.com/", "http://foo.com/" },
+
+    // host is lower cased
+    { "http://Foo.Com/index.html", "http://foo.com/index.html" },
+    { "http://Foo.Com/index.html", "http://foo.com/index.html" },
+
+    // port number is normalized
+    { "http://foo.com:80/index.html", "http://foo.com/index.html" },
+    { "http://foo.com:81/", "http://foo.com:81/" },
+
+    // null path is normalized
+    { "http://foo.com", "http://foo.com/" },
+
+    // references are removed
+    { "http://foo.com/foo.html#ref", "http://foo.com/foo.html" },
+
+    // // encoding is normalized (disabled in the original test as well)
+    // { "http://foo.com/%66oo.html", "http://foo.com/foo.html" },
+
+    // unnecessary "../" are removed
+    { "http://foo.com/aa/../", "http://foo.com/" },
+    { "http://foo.com/aa/bb/../", "http://foo.com/aa/" },
+    { "http://foo.com/aa/..", "http://foo.com/aa/.." },
+    { "http://foo.com/aa/bb/cc/../../foo.html", "http://foo.com/aa/foo.html" },
+    { "http://foo.com/aa/bb/../cc/dd/../ee/foo.html",
+      "http://foo.com/aa/cc/ee/foo.html" },
+    { "http://foo.com/../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/../../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/../aa/../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/aa/../../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/aa/../bb/../foo.html/../../", "http://foo.com/" },
+    { "http://foo.com/../aa/foo.html", "http://foo.com/aa/foo.html" },
+    { "http://foo.com/../aa/../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/a..a/foo.html", "http://foo.com/a..a/foo.html" },
+    { "http://foo.com/a..a/../foo.html", "http://foo.com/foo.html" },
+    { "http://foo.com/foo.foo/../foo.html", "http://foo.com/foo.html" },
+  };
+
+  private BasicURLNormalizer normalizer;
+  private Configuration conf;
+
+  public TestBasicURLNormalizer(String name) {
+    super(name);
+    normalizer = new BasicURLNormalizer();
+    conf = NutchConfiguration.create();
+    normalizer.setConf(conf);
+  }
+
+  /** Checks every row of the table; the first mismatch fails the test. */
+  public void testNormalizer() throws Exception {
+    for (int row = 0; row < CASES.length; row++) {
+      normalizeTest(CASES[row][0], CASES[row][1]);
+    }
+  }
+
+  /** Normalizes <code>weird</code> and asserts it equals <code>normal</code>. */
+  private void normalizeTest(String weird, String normal) throws Exception {
+    String actual = normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT);
+    assertEquals(normal, actual);
+  }
+
+  /** Allows running the table without a JUnit runner. */
+  public static void main(String[] args) throws Exception {
+    TestBasicURLNormalizer standalone = new TestBasicURLNormalizer("test");
+    standalone.testNormalizer();
+  }
+
+}

Property changes on: src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
===================================================================
--- src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (revision 0)
+++ src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java (revision 0)
@@ -0,0 +1,191 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.basic;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+// Nutch imports
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.util.LogUtil;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.*;
+
+/**
+ * Converts URLs to a normal form.
+ *
+ * <p>Surrounding whitespace is always trimmed. For <tt>http</tt> and
+ * <tt>ftp</tt> URLs the host is lower-cased, a port equal to the protocol's
+ * default port is dropped, an empty path becomes "/", the reference (the part
+ * after '#') is removed and unnecessary "/xx/../" and leading "/../" path
+ * segments are collapsed. The <code>scope</code> argument is ignored: the
+ * same rules are applied in every scope.</p>
+ */
+public class BasicURLNormalizer implements URLNormalizer {
+  public static final Log LOG = LogFactory.getLog(BasicURLNormalizer.class);
+
+  private Perl5Compiler compiler = new Perl5Compiler();
+
+  /** One matcher per thread, so a single instance can be shared by threads. */
+  private ThreadLocal matchers = new ThreadLocal() {
+      protected synchronized Object initialValue() {
+        return new Perl5Matcher();
+      }
+    };
+  private Rule relativePathRule = null;
+  private Rule leadingRelativePathRule = null;
+
+  private Configuration conf;
+
+  /**
+   * Compiles the two path-cleaning patterns.
+   *
+   * @throws RuntimeException if one of the built-in patterns does not compile
+   */
+  public BasicURLNormalizer() {
+    try {
+      // this pattern tries to find spots like "/xx/../" in the url, which
+      // could be replaced by "/". xx consists of chars different from "/"
+      // (slash) and needs to have at least one char different from "."
+      relativePathRule = new Rule();
+      relativePathRule.pattern = (Perl5Pattern)
+        compiler.compile("(/[^/]*[^/.]{1}[^/]*/\\.\\./)",
+                         Perl5Compiler.READ_ONLY_MASK);
+      relativePathRule.substitution = new Perl5Substitution("/");
+
+      // this pattern tries to find spots like leading "/../" in the url,
+      // which could be replaced by "/"
+      leadingRelativePathRule = new Rule();
+      leadingRelativePathRule.pattern = (Perl5Pattern)
+        compiler.compile("^(/\\.\\./)+", Perl5Compiler.READ_ONLY_MASK);
+      leadingRelativePathRule.substitution = new Perl5Substitution("/");
+
+    } catch (MalformedPatternException e) {
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Returns the normal form of <code>urlString</code>.
+   *
+   * @param urlString the URL to normalize; the empty string is returned as is
+   * @param scope     ignored by this normalizer
+   * @return the normalized URL, or the trimmed input when nothing had to be
+   *         changed
+   * @throws MalformedURLException if the trimmed input is not a valid URL
+   */
+  public String normalize(String urlString, String scope)
+          throws MalformedURLException {
+    if ("".equals(urlString))                     // permit empty
+      return urlString;
+
+    urlString = urlString.trim();                 // remove extra spaces
+
+    URL url = new URL(urlString);
+
+    String protocol = url.getProtocol();
+    String host = url.getHost();
+    int port = url.getPort();
+    String file = url.getFile();
+
+    boolean changed = false;
+
+    if (!urlString.startsWith(protocol))        // protocol was lowercased
+      changed = true;
+
+    if ("http".equals(protocol) || "ftp".equals(protocol)) {
+
+      if (host != null) {
+        // Lower-case with a fixed locale. With the platform default locale
+        // (Turkish, for instance) "I" is not mapped to "i", so the same host
+        // would normalize differently depending on where the crawler runs.
+        String newHost = host.toLowerCase(java.util.Locale.ENGLISH);
+        if (!host.equals(newHost)) {
+          host = newHost;
+          changed = true;
+        }
+      }
+
+      if (port == url.getDefaultPort()) {       // uses default port
+        port = -1;                              // so don't specify it
+        changed = true;
+      }
+
+      if (file == null || "".equals(file)) {    // add a slash
+        file = "/";
+        changed = true;
+      }
+
+      // getFile() does not contain the ref, so rebuilding the URL drops it
+      if (url.getRef() != null) {                 // remove the ref
+        changed = true;
+      }
+
+      // check for unnecessary use of "/../"
+      String file2 = substituteUnnecessaryRelativePaths(file);
+
+      if (!file.equals(file2)) {
+        changed = true;
+        file = file2;
+      }
+
+    }
+
+    if (changed)
+      urlString = new URL(protocol, host, port, file).toString();
+
+    return urlString;
+  }
+
+  /**
+   * Removes "/xx/../" sequences and leading "/../" from <code>file</code>,
+   * one substitution at a time, until the length no longer changes.
+   */
+  private String substituteUnnecessaryRelativePaths(String file) {
+    String fileWorkCopy = file;
+    int oldLen = file.length();
+    int newLen = oldLen - 1;                    // forces the first iteration
+
+    // All substitutions will be done step by step, to ensure that certain
+    // constellations will be normalized, too
+    //
+    // For example: "/aa/bb/../../cc/../foo.html" will be normalized in the
+    // following manner:
+    //   "/aa/bb/../../cc/../foo.html"
+    //   "/aa/../cc/../foo.html"
+    //   "/cc/../foo.html"
+    //   "/foo.html"
+    //
+    // The normalization also takes care of leading "/../", which will be
+    // replaced by "/", because this is rather a sign of bad webserver
+    // configuration than of a wanted link. For example, urls like
+    // "http://www.foo.com/../" should return a http 404 error instead of
+    // redirecting to "http://www.foo.com".
+    //
+    Perl5Matcher matcher = (Perl5Matcher)matchers.get();
+
+    while (oldLen != newLen) {
+      // substitute first occurrence of "/xx/../" by "/"
+      oldLen = fileWorkCopy.length();
+      fileWorkCopy = Util.substitute
+        (matcher, relativePathRule.pattern,
+         relativePathRule.substitution, fileWorkCopy, 1);
+
+      // remove leading "/../"
+      fileWorkCopy = Util.substitute
+        (matcher, leadingRelativePathRule.pattern,
+         leadingRelativePathRule.substitution, fileWorkCopy, 1);
+      newLen = fileWorkCopy.length();
+    }
+
+    return fileWorkCopy;
+  }
+
+
+  /**
+   * Class which holds a compiled pattern and its corresponding substitution
+   * string.
+   */
+  private static class Rule {
+    public Perl5Pattern pattern;
+    public Perl5Substitution substitution;
+  }
+
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}
+

Property changes on: src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/plugin/urlnormalizer-basic/plugin.xml
===================================================================
--- src/plugin/urlnormalizer-basic/plugin.xml (revision 0)
+++ src/plugin/urlnormalizer-basic/plugin.xml (revision 0)
@@ -0,0 +1,26 @@
+ + + + + + + + + + + + + + + + + + +

Property changes on: src/plugin/urlnormalizer-basic/plugin.xml
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/plugin/urlnormalizer-basic/build.xml
===================================================================
--- src/plugin/urlnormalizer-basic/build.xml (revision 0)
+++ src/plugin/urlnormalizer-basic/build.xml (revision 0)
@@ -0,0 +1,7 @@
+ + + + + + +

Property changes on: src/plugin/urlnormalizer-basic/build.xml
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/plugin/build.xml
===================================================================
--- src/plugin/build.xml (revision 440417)
+++ src/plugin/build.xml (working copy)
@@ -58,6 +58,9 @@
+ + +
@@ -85,6 +88,9 @@
+ + +
@@ -141,5 +147,8 @@
+ + +
Index: src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.xml
===================================================================
--- src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.xml (revision 0)
+++ src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.xml (revision 0)
@@ -0,0 +1,21 @@
+ + + + + + + +
+ (^[a-z]{3,5}://)([\w]+\.)*?(\w+\.\w+)[/$].*
+ $1$3/
+ + +
Property changes on: src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.xml ___________________________________________________________________ Name: svn:eol-style + native Index: src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test =================================================================== --- src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (revision 0) +++ src/plugin/urlnormalizer-regex/sample/regex-normalize-default.test (revision 0) @@ -0,0 +1,11 @@ +# test simple removal of session id +http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php + +# test removal of session id, and keep parameters before +http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://foo.com/foo.php?f=2 + +# test removal of session id, and keep parameters after +http://foo.com/foo.php?f=2&PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&q=3 http://foo.com/foo.php?f=2&q=3 + +# test removal of session id, and keep parameters after +http://foo.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03&f=2 http://foo.com/foo.php?f=2 Index: src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml =================================================================== --- src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (revision 0) +++ src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml (revision 0) @@ -0,0 +1,23 @@ + + + + + + + + + (\?|\&|\&amp;)PHPSESSID=[a-zA-Z0-9]{32}$ + + + + (\?|\&|\&amp;)PHPSESSID=[a-zA-Z0-9]{32}(\&|\&amp;)(.*) + $1$3 + + + Property changes on: src/plugin/urlnormalizer-regex/sample/regex-normalize-default.xml ___________________________________________________________________ Name: svn:eol-style + native Index: src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test =================================================================== --- src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test (revision 0) +++ 
src/plugin/urlnormalizer-regex/sample/regex-normalize-scope1.test (revision 0) @@ -0,0 +1,8 @@ +# test removal of subdomains +http://www.foo.bar.com/ http://bar.com/ + +# test removal of url path +http://www.foo.bar.com/foo.php?PHPSESSID=cdc993a493e899bed04f4d0c8a462a03 http://bar.com/ + +# test removal of urls in arguments +https://www.foo.bar.com/foo.php?url=http://www.example.com/test.php https://bar.com/ Index: src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java =================================================================== --- src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (revision 0) +++ src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (revision 0) @@ -0,0 +1,176 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.nutch.net.urlnormalizer.regex;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for RegexURLNormalizer.
+ *
+ * <p>Every <tt>regex-normalize-&lt;scope&gt;.xml</tt> file found in the sample
+ * directory is loaded as the rule set of <tt>&lt;scope&gt;</tt>; the matching
+ * <tt>regex-normalize-&lt;scope&gt;.test</tt> file supplies lines of the form
+ * "url expectedUrl" that are checked against that rule set.</p>
+ */
+public class TestRegexURLNormalizer extends TestCase {
+  private static final Log LOG = LogFactory.getLog(TestRegexURLNormalizer.class);
+
+  /** File name layout of the samples: PREFIX + scope + SUFFIX. */
+  private static final String CONFIG_PREFIX = "regex-normalize-";
+  private static final String CONFIG_SUFFIX = ".xml";
+  private static final String TEST_SUFFIX = ".test";
+
+  private RegexURLNormalizer normalizer;
+  private Configuration conf;
+
+  /** scope name -> NormalizedURL[] read from the scope's .test file. */
+  private HashMap testData = new HashMap();
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation.
+
+  /**
+   * Loads every sample rule file, and its test data, from the sample
+   * directory. A sample that cannot be loaded is logged and skipped.
+   */
+  public TestRegexURLNormalizer(String name) throws IOException {
+    super(name);
+    normalizer = new RegexURLNormalizer();
+    conf = NutchConfiguration.create();
+    normalizer.setConf(conf);
+    File[] configs = new File(sampleDir).listFiles(new FileFilter() {
+      public boolean accept(File f) {
+        String fileName = f.getName();
+        return fileName.startsWith(CONFIG_PREFIX)
+            && fileName.endsWith(CONFIG_SUFFIX);
+      }
+    });
+    if (configs == null) {
+      // listFiles() returns null when the path is not a readable directory
+      LOG.warn("Can't list sample configs in '" + sampleDir + "'");
+      return;
+    }
+    for (int i = 0; i < configs.length; i++) {
+      FileInputStream fis = null;
+      try {
+        fis = new FileInputStream(configs[i]);
+        String cname = configs[i].getName();
+        // strip prefix and suffix, what is left is the scope name
+        cname = cname.substring(CONFIG_PREFIX.length(),
+                                cname.length() - CONFIG_SUFFIX.length());
+        normalizer.setConfiguration(fis, cname);
+        NormalizedURL[] urls = readTestFile(cname);
+        testData.put(cname, urls);
+      } catch (Exception e) {
+        LOG.warn("Couldn't load config from '" + configs[i] + "': " + e.toString());
+      } finally {
+        if (fis != null) {
+          try {
+            fis.close();
+          } catch (IOException ignored) {
+            // the rules are already parsed; nothing useful to do here
+          }
+        }
+      }
+    }
+  }
+
+  public void testNormalizerDefault() throws Exception {
+    NormalizedURL[] urls =
+      (NormalizedURL[])testData.get(URLNormalizers.SCOPE_DEFAULT);
+    // fail with a readable message instead of a NullPointerException
+    assertNotNull("No test data for scope '" + URLNormalizers.SCOPE_DEFAULT
+        + "' in '" + sampleDir + "'", urls);
+    normalizeTest(urls, URLNormalizers.SCOPE_DEFAULT);
+  }
+
+  public void testNormalizerScope() throws Exception {
+    Iterator it = testData.keySet().iterator();
+    while (it.hasNext()) {
+      String scope = (String)it.next();
+      normalizeTest((NormalizedURL[])testData.get(scope), scope);
+    }
+  }
+
+  /** Asserts that every url normalizes to its expected form in the scope. */
+  private void normalizeTest(NormalizedURL[] urls, String scope) throws Exception {
+    for (int i = 0; i < urls.length; i++) {
+      assertEquals(urls[i].expectedURL,
+                   normalizer.normalize(urls[i].url, scope));
+    }
+  }
+
+  /** Repeats the scope's checks <code>loops</code> times and logs the time. */
+  private void bench(int loops, String scope) {
+    long start = System.currentTimeMillis();
+    try {
+      NormalizedURL[] expected = (NormalizedURL[])testData.get(scope);
+      if (expected == null) return;
+      for (int i = 0; i < loops; i++) {
+        normalizeTest(expected, scope);
+      }
+    } catch (Exception e) {
+      fail(e.toString());
+    }
+    LOG.info("bench time (" + loops + ") " +
+             (System.currentTimeMillis() - start) + "ms");
+  }
+
+  /** One line of a .test file: the input url and its expected normal form. */
+  private static class NormalizedURL {
+    String url;
+    String expectedURL;
+
+    /**
+     * @throws IllegalArgumentException if the line does not hold two
+     *         whitespace-separated fields
+     */
+    public NormalizedURL(String line) {
+      String[] fields = line.split("\\s+");
+      if (fields.length < 2) {
+        throw new IllegalArgumentException(
+            "Expected 'url expectedUrl' but got: '" + line + "'");
+      }
+      url = fields[0];
+      expectedURL = fields[1];
+    }
+  }
+
+  /**
+   * Reads the .test file of <code>scope</code>. Blank lines and lines that
+   * start with '#' or a space are skipped.
+   */
+  private NormalizedURL[] readTestFile(String scope) throws IOException {
+    File f = new File(sampleDir, CONFIG_PREFIX + scope + TEST_SUFFIX);
+    BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
+    List list = new ArrayList();
+    try {
+      String line;
+      while((line = in.readLine()) != null) {
+        if (  line.trim().length() == 0 ||
+              line.startsWith("#") ||
+              line.startsWith(" ")) continue;
+        list.add(new NormalizedURL(line));
+      }
+    } finally {
+      in.close();
+    }
+    return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]);
+  }
+
+  public static void main(String[] args) throws Exception {
+    if (args.length == 0) {
+      System.err.println("TestRegexURLNormalizer [-bench <iter>] <scope>");
+      System.exit(-1);
+    }
+    boolean bench = false;
+    int iter = -1;
+    String scope = null;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-bench")) {
+        if (i + 1 >= args.length) {
+          System.err.println("Missing number of iterations after -bench.");
+          System.exit(-1);
+        }
+        bench = true;
+        iter = Integer.parseInt(args[++i]);
+      } else scope = args[i];
+    }
+    if (scope == null) {
+      System.err.println("Missing required scope name.");
+      System.exit(-1);
+    }
+    if (bench && iter < 0) {
+      System.err.println("Invalid number of iterations: " + iter);
+      System.exit(-1);
+    }
+    TestRegexURLNormalizer test = new TestRegexURLNormalizer("test");
+    NormalizedURL[] urls = (NormalizedURL[])test.testData.get(scope);
+    if (urls == null) {
+      LOG.warn("Missing test data for scope '" + scope + "', using default scope.");
+      scope = URLNormalizers.SCOPE_DEFAULT;
+      urls = (NormalizedURL[])test.testData.get(scope);
+    }
+    if (urls == null) {
+      System.err.println("No test data found in '" + test.sampleDir + "'.");
+      System.exit(-1);
+    }
+    if (bench) {
+      test.bench(iter, scope);
+    } else {
+      test.normalizeTest(urls, scope);
+    }
+  }
+
+}

Property changes on: src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index:
src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
===================================================================
--- src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (revision 0)
+++ src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (revision 0)
@@ -0,0 +1,291 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.regex;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.util.NutchConfiguration;
+
+import javax.xml.parsers.*;
+import org.w3c.dom.*;
+import org.apache.oro.text.regex.*;
+
+/**
+ * Allows users to do regex substitutions on all/any URLs that are encountered,
+ * which is useful for stripping session IDs from URLs.
+ *
+ * <p>Rules are kept per scope. The rules of the default scope are read from
+ * the resource named by the <tt>urlnormalizer.regex.file</tt> property. When a
+ * URL is normalized for another scope, the resource named by
+ * <tt>urlnormalizer.regex.file.&lt;scope&gt;</tt> is loaded on first use; if
+ * that property is not set, or the resource cannot be read, the rules of the
+ * default scope are applied instead.</p>
+ *
+ * <p>A rule file is an XML document whose root element is
+ * <tt>&lt;regex-normalize&gt;</tt>. Each <tt>&lt;regex&gt;</tt> child holds a
+ * <tt>&lt;pattern&gt;</tt> and a <tt>&lt;substitution&gt;</tt>; an empty
+ * substitution deletes the matched text.</p>
+ *
+ * @author Luke Baker
+ */
+public class RegexURLNormalizer extends Configured implements URLNormalizer {
+
+  private static final Log LOG = LogFactory.getLog(RegexURLNormalizer.class);
+
+  /**
+   * Class which holds a compiled pattern and its corresponding substitution
+   * string.
+   */
+  private static class Rule {
+    public Perl5Pattern pattern;
+
+    public String substitution;
+  }
+
+  /**
+   * Rule lists keyed by scope name; created by setConf() or on first use.
+   * NOTE(review): keep this field WITHOUT an initializer. Presumably the
+   * Configured constructor calls setConf(); if so, an initializer would run
+   * afterwards and discard the rules loaded during super(conf) - confirm
+   * before changing.
+   */
+  private HashMap scopedRules;
+
+  /**
+   * Marker stored for a scope that has no usable rules of its own; it is
+   * compared by identity and means "apply the default-scope rules".
+   */
+  private static final List EMPTY_RULES = Collections.EMPTY_LIST;
+
+  private PatternMatcher matcher = new Perl5Matcher();
+
+  /**
+   * Creates an unconfigured normalizer. The default rules are loaded when
+   * {@link #setConf(Configuration)} is later called with a non-null
+   * configuration.
+   */
+  public RegexURLNormalizer() {
+    super(null);
+  }
+
+  public RegexURLNormalizer(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * Constructor which can be passed the file name, so it doesn't look in the
+   * configuration files for it. The rules read from <code>filename</code>
+   * become the rules of the default scope.
+   */
+  public RegexURLNormalizer(Configuration conf, String filename)
+      throws IOException, MalformedPatternException {
+    super(conf);
+    List rules = readConfigurationFile(filename);
+    if (rules != null)
+      // rulesByScope() also covers conf == null, where setConf() created no map
+      rulesByScope().put(URLNormalizers.SCOPE_DEFAULT, rules);
+  }
+
+  /**
+   * Stores the configuration and, the first time a non-null configuration is
+   * seen, loads the default-scope rules from the resource named by
+   * <tt>urlnormalizer.regex.file</tt>.
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+    if (conf == null) return;
+    // rules not loaded yet, e.g. the default constructor was called
+    if (this.scopedRules == null) {
+      String filename = getConf().get("urlnormalizer.regex.file");
+      scopedRules = new HashMap();
+      // an unset property is handled like a resource that cannot be found
+      URL url = (filename == null) ? null : getConf().getResource(filename);
+      List rules = null;
+      if (url == null) {
+        LOG.warn("Can't load the default config file! " + filename);
+        rules = EMPTY_RULES;
+      } else {
+        try {
+          rules = readConfigurationResource(url);
+        } catch (Exception e) {
+          LOG.warn("Couldn't read default config from '" + url + "': " + e);
+          rules = EMPTY_RULES;
+        }
+      }
+      scopedRules.put(URLNormalizers.SCOPE_DEFAULT, rules);
+    }
+  }
+
+  /**
+   * Used in JUnit test: installs the rules read from <code>is</code> for
+   * <code>scope</code>. The stream is not closed here; it stays owned by the
+   * caller.
+   */
+  void setConfiguration(InputStream is, String scope) {
+    List rules = readConfiguration(is);
+    rulesByScope().put(scope, rules);
+    LOG.debug("Set config for scope '" + scope + "': " + rules.size() + " rules.");
+  }
+
+  /** Returns the scope map, creating it when setConf() has not done so. */
+  private HashMap rulesByScope() {
+    if (scopedRules == null) {
+      scopedRules = new HashMap();
+    }
+    return scopedRules;
+  }
+
+  /**
+   * This function does the replacements by iterating through all the regex
+   * patterns. It accepts a string url as input and returns the altered string.
+   *
+   * <p>The rules of <code>scope</code> are resolved once and cached. A scope
+   * without usable rules is cached as well, so the lookup, and its warning,
+   * are not repeated for every URL; such a scope uses the default rules.</p>
+   */
+  public synchronized String regexNormalize(String urlString, String scope) {
+    HashMap allRules = rulesByScope();
+    List curRules = (List)allRules.get(scope);
+    if (curRules == null) {
+      // first URL seen for this scope: try to populate
+      curRules = loadScopedRules(scope);
+      allRules.put(scope, curRules);
+    }
+    if (curRules == EMPTY_RULES) {
+      // use global rules
+      curRules = (List)allRules.get(URLNormalizers.SCOPE_DEFAULT);
+      if (curRules == null) curRules = EMPTY_RULES;
+    }
+    Iterator i = curRules.iterator();
+    while (i.hasNext()) {
+      Rule r = (Rule) i.next();
+      urlString = Util.substitute(matcher, r.pattern, new Perl5Substitution(
+          r.substitution), urlString, Util.SUBSTITUTE_ALL); // actual
+                                                            // substitution
+    }
+    return urlString;
+  }
+
+  /**
+   * Reads the rules configured for <code>scope</code> through the
+   * <tt>urlnormalizer.regex.file.&lt;scope&gt;</tt> property.
+   *
+   * @return the rules, or EMPTY_RULES (after logging why) when the scope has
+   *         no usable rule file; never null
+   */
+  private List loadScopedRules(String scope) {
+    List rules = null;
+    Configuration conf = getConf();
+    String configFile = (conf == null)
+        ? null : conf.get("urlnormalizer.regex.file." + scope);
+    if (configFile != null) {
+      URL resource = conf.getResource(configFile);
+      LOG.debug("resource for scope '" + scope + "': " + resource);
+      if (resource == null) {
+        LOG.warn("Can't load resource for config file: " + configFile);
+      } else {
+        try {
+          rules = readConfigurationResource(resource);
+        } catch (Exception e) {
+          LOG.warn("Couldn't load resource '" + resource + "': " + e);
+        }
+      }
+    } else {
+      LOG.warn("can't load rule file for scope '" + scope + "': " + configFile);
+    }
+    if (rules == EMPTY_RULES || rules == null) {
+      LOG.warn("can't find rules for scope '" + scope + "', using default");
+      return EMPTY_RULES;
+    }
+    return rules;
+  }
+
+  public synchronized String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    return regexNormalize(urlString, scope);
+  }
+
+  /** Reads the configuration file and populates a List of Rules. */
+  private List readConfigurationFile(String filename) {
+    if (LOG.isInfoEnabled()) {
+      LOG.info("loading " + filename);
+    }
+    FileInputStream fis = null;
+    try {
+      fis = new FileInputStream(filename);
+      return readConfiguration(fis);
+    } catch (Exception e) {
+      LOG.fatal("Error loading rules from '" + filename + "': " + e);
+      return EMPTY_RULES;
+    } finally {
+      closeQuietly(fis);
+    }
+  }
+
+  /** Opens <code>resource</code> exactly once, parses it and closes it. */
+  private List readConfigurationResource(URL resource) throws IOException {
+    InputStream is = resource.openStream();
+    try {
+      return readConfiguration(is);
+    } finally {
+      closeQuietly(is);
+    }
+  }
+
+  /** Closes <code>is</code> if it is not null; a failing close is only logged. */
+  private static void closeQuietly(InputStream is) {
+    if (is == null) return;
+    try {
+      is.close();
+    } catch (IOException e) {
+      LOG.debug("Couldn't close rule stream: " + e);
+    }
+  }
+
+  /**
+   * Parses a rule document from <code>is</code>.
+   *
+   * @return the rules in document order, or EMPTY_RULES when the document
+   *         cannot be parsed or one of its patterns does not compile
+   */
+  private List readConfiguration(InputStream is) {
+    Perl5Compiler compiler = new Perl5Compiler();
+    List rules = new ArrayList();
+    try {
+
+      // borrowed heavily from code in Configuration.java
+      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
+          .parse(is);
+      Element root = doc.getDocumentElement();
+      if ((!"regex-normalize".equals(root.getTagName()))
+          && (LOG.isFatalEnabled())) {
+        LOG.fatal("bad conf file: top-level element not <regex-normalize>");
+      }
+      NodeList regexes = root.getChildNodes();
+      for (int i = 0; i < regexes.getLength(); i++) {
+        Node regexNode = regexes.item(i);
+        if (!(regexNode instanceof Element))
+          continue;
+        Element regex = (Element) regexNode;
+        if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) {
+          LOG.warn("bad conf file: element not <regex>");
+        }
+        NodeList fields = regex.getChildNodes();
+        String patternValue = null;
+        String subValue = null;
+        for (int j = 0; j < fields.getLength(); j++) {
+          Node fieldNode = fields.item(j);
+          if (!(fieldNode instanceof Element))
+            continue;
+          Element field = (Element) fieldNode;
+          String tag = field.getTagName();
+          if ("pattern".equals(tag)) {
+            if (field.hasChildNodes())
+              patternValue = ((Text) field.getFirstChild()).getData();
+          } else if ("substitution".equals(tag)) {
+            // only an empty <substitution> means "replace with nothing"; an
+            // unrelated empty element must not wipe a value already read
+            subValue = field.hasChildNodes()
+                ? ((Text) field.getFirstChild()).getData() : "";
+          }
+        }
+        if (patternValue != null && subValue != null) {
+          Rule rule = new Rule();
+          rule.pattern = (Perl5Pattern) compiler.compile(patternValue);
+          rule.substitution = subValue;
+          rules.add(rule);
+        }
+      }
+    } catch (Exception e) {
+      if (LOG.isFatalEnabled()) {
+        LOG.fatal("error parsing conf file: " + e);
+      }
+      return EMPTY_RULES;
+    }
+    return rules;
+  }
+
+  /**
+   * Spits out patterns and substitutions that are in the configuration file.
+   * With arguments "url [scope]" the url is also normalized and printed.
+   */
+  public static void main(String args[]) throws MalformedPatternException,
+      IOException {
+    RegexURLNormalizer normalizer = new RegexURLNormalizer();
+    normalizer.setConf(NutchConfiguration.create());
+    Iterator i = ((List)normalizer.scopedRules.get(URLNormalizers.SCOPE_DEFAULT)).iterator();
+    System.out.println("* Rules for 'DEFAULT' scope:");
+    while (i.hasNext()) {
+      Rule r = (Rule) i.next();
+      System.out.print("  " + r.pattern.getPattern() + " -> ");
+      System.out.println(r.substitution);
+    }
+    // load the scope
+    if (args.length > 1) {
+      normalizer.normalize("http://test.com", args[1]);
+    }
+    if (normalizer.scopedRules.size() > 1) {
+      Iterator it = normalizer.scopedRules.keySet().iterator();
+      while (it.hasNext()) {
+        String scope = (String)it.next();
+        if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) continue;
+        System.out.println("* Rules for '" + scope + "' scope:");
+        i = ((List)normalizer.scopedRules.get(scope)).iterator();
+        while (i.hasNext()) {
+          Rule r = (Rule) i.next();
+          System.out.print("  " + r.pattern.getPattern() + " -> ");
+          System.out.println(r.substitution);
+        }
+      }
+    }
+    if (args.length > 0) {
+      System.out.println("\n---------- Normalizer test -----------");
+      String scope = URLNormalizers.SCOPE_DEFAULT;
+      if (args.length > 1) scope = args[1];
+      System.out.println("Scope: " + scope);
+      System.out.println("Input url:  '" + args[0] + "'");
+      System.out.println("Output url: '" + normalizer.normalize(args[0], scope) + "'");
+    }
+    System.exit(0);
+  }
+
+}

Property changes on: src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
___________________________________________________________________
Name: svn:eol-style
   + native

Index: src/plugin/urlnormalizer-regex/plugin.xml
=================================================================== --- src/plugin/urlnormalizer-regex/plugin.xml (revision 0) +++ src/plugin/urlnormalizer-regex/plugin.xml (revision 0) @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + Property changes on: src/plugin/urlnormalizer-regex/plugin.xml ___________________________________________________________________ Name: svn:eol-style + native Index: src/plugin/urlnormalizer-regex/build.xml =================================================================== --- src/plugin/urlnormalizer-regex/build.xml (revision 0) +++ src/plugin/urlnormalizer-regex/build.xml (revision 0) @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + Property changes on: src/plugin/urlnormalizer-regex/build.xml ___________________________________________________________________ Name: svn:eol-style + native Index: src/plugin/nutch-extensionpoints/plugin.xml =================================================================== --- src/plugin/nutch-extensionpoints/plugin.xml (revision 440417) +++ src/plugin/nutch-extensionpoints/plugin.xml (working copy) @@ -42,6 +42,10 @@ name="Nutch URL Filter"/> + + Index: src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java =================================================================== --- src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java (revision 0) +++ src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java (revision 0) @@ -0,0 +1,44 @@ +/* + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.net.urlnormalizer.pass; + + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.util.NutchConfiguration; + +import junit.framework.TestCase; + +public class TestPassURLNormalizer extends TestCase { + + public void testPassURLNormalizer() { + Configuration conf = NutchConfiguration.create(); + + PassURLNormalizer normalizer = new PassURLNormalizer(); + normalizer.setConf(conf); + String url = "http://www.example.com/test/..//"; + String result = null; + try { + result = normalizer.normalize(url, URLNormalizers.SCOPE_DEFAULT); + } catch (MalformedURLException mue) { + fail(mue.toString()); + } + + assertEquals(url, result); + } +} Property changes on: src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java =================================================================== --- src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java (revision 0) +++ src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java (revision 0) @@ -0,0 +1,40 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not 
use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.net.urlnormalizer.pass; + +import java.net.MalformedURLException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.net.URLNormalizer; + +public class PassURLNormalizer implements URLNormalizer { + + private Configuration conf; + + public String normalize(String urlString, String scope) throws MalformedURLException { + return urlString; + } + + public Configuration getConf() { + return conf; + } + + public void setConf(Configuration conf) { + this.conf = conf; + } + +} Property changes on: src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java ___________________________________________________________________ Name: svn:eol-style + native Index: src/plugin/urlnormalizer-pass/plugin.xml =================================================================== --- src/plugin/urlnormalizer-pass/plugin.xml (revision 0) +++ src/plugin/urlnormalizer-pass/plugin.xml (revision 0) @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + Property changes on: src/plugin/urlnormalizer-pass/plugin.xml ___________________________________________________________________ Name: svn:eol-style + native Index: src/plugin/urlnormalizer-pass/build.xml =================================================================== --- src/plugin/urlnormalizer-pass/build.xml (revision 0) +++ src/plugin/urlnormalizer-pass/build.xml (revision 0) @@ -0,0 +1,7 @@ + + + + + + + Property changes on: src/plugin/urlnormalizer-pass/build.xml 
___________________________________________________________________ Name: svn:eol-style + native