--- src.orig/conf/regex-normalize.xml 2006-05-21 17:05:23.000000000 +0200 +++ src.new/conf/regex-normalize.xml 2006-05-22 15:04:17.000000000 +0200 @@ -19,4 +19,27 @@ (\?|\&|\&)PHPSESSID=[a-zA-Z0-9]{32}(\&|\&)(.*) $1$3 + + + + (;jsessionid=[a-zA-Z0-9]{32}) + + + + + + + (\/sid\/[a-z0-9]{32}\/) + / + + + (\/-S:[a-zA-Z0-9]{6}:[\-\.\,a-zA-Z0-9]{20}\/) + / + + diff -u -r -N src.orig/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java src.new/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java --- src.orig/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java 1970-01-01 01:00:00.000000000 +0100 +++ src.new/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java 2006-05-22 15:03:52.000000000 +0200 @@ -0,0 +1,75 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.nutch.net;
+
+import java.io.IOException;
+import org.apache.oro.text.regex.*;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.util.logging.Logger;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Command-line check of RegexUrlNormalizer: reads URLs from standard input,
+ * one per line, and prints each normalized URL to standard output.
+ *
+ * Based on URLFilterChecker by John Xing
+ *
+ * @author Stefan Neufeind
+ */
+public class RegexUrlNormalizerChecker {
+
+  public static final Logger LOG =
+    LogFormatter.getLogger(RegexUrlNormalizerChecker.class.getName());
+  private final Configuration conf;
+
+  /** Creates a checker whose normalizer is configured from {@code conf}. */
+  public RegexUrlNormalizerChecker(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /** Normalizes every line read from standard input and prints the result. */
+  private void normalize()
+    throws MalformedPatternException, IOException {
+    LOG.info("Normalizing with regex-url-normalizer");
+
+    // Same configuration for every URL, so build the normalizer once
+    // instead of re-creating and re-configuring it per input line.
+    RegexUrlNormalizer normalizer = new RegexUrlNormalizer();
+    normalizer.setConf(this.conf);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      System.out.println(normalizer.normalize(line));
+    }
+  }
+
+  /** Runs the checker with NutchConfiguration.create(), then exits with status 0. */
+  public static void main(String[] args)
+    throws MalformedPatternException, IOException {
+    RegexUrlNormalizerChecker checker = new RegexUrlNormalizerChecker(NutchConfiguration.create());
+    checker.normalize();
+    System.exit(0);
+  }
+}