--- src.orig/conf/regex-normalize.xml 2006-05-21 17:05:23.000000000 +0200
+++ src.new/conf/regex-normalize.xml 2006-05-22 15:04:17.000000000 +0200
@@ -19,4 +19,27 @@
(\?|\&|\&)PHPSESSID=[a-zA-Z0-9]{32}(\&|\&)(.*)
$1$3
+
+
+
+ (;jsessionid=[a-zA-Z0-9]{32})
+
+
+
+
+
+
+ (\/sid\/[a-z0-9]{32}\/)
+ /
+
+
+ (\/-S:[a-zA-Z0-9]{6}:[\-\.\,a-zA-Z0-9]{20}\/)
+ /
+
+
diff -u -r -N src.orig/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java src.new/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java
--- src.orig/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java 1970-01-01 01:00:00.000000000 +0100
+++ src.new/java/org/apache/nutch/net/RegexUrlNormalizerChecker.java 2006-05-22 15:03:52.000000000 +0200
@@ -0,0 +1,75 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import java.io.IOException;
+import org.apache.oro.text.regex.*;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.util.logging.Logger;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Check RegexUrlNormalizer
+ *
+ * Based on URLFilterChecker by John Xing
+ *
+ * @author Stefan Neufeind
+ */
+
+public class RegexUrlNormalizerChecker {
+  /** Logger named after this class. */
+  public static final Logger LOG =
+    LogFormatter.getLogger(RegexUrlNormalizerChecker.class.getName());
+  /** Configuration handed to the normalizer through setConf. */
+  private final Configuration conf;
+  /** Creates a checker that normalizes using the given configuration. */
+  public RegexUrlNormalizerChecker(Configuration conf) {
+    this.conf = conf;
+  }
+  /** Reads lines from stdin until EOF and prints each normalized line to stdout. */
+  private void normalize()
+    throws MalformedPatternException, IOException {
+    LOG.info("Normalizing with regex-url-normalizer");
+    // Build and configure the normalizer once: it does not depend on the input line.
+    RegexUrlNormalizer normalizer = new RegexUrlNormalizer();
+    normalizer.setConf(this.conf);
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      System.out.println(normalizer.normalize(line));
+    }
+  }
+  /** Entry point: normalizes stdin using the result of NutchConfiguration.create(). */
+  public static void main(String args[])
+    throws MalformedPatternException, IOException {
+    RegexUrlNormalizerChecker checker = new RegexUrlNormalizerChecker(NutchConfiguration.create());
+    checker.normalize();
+    // Exit explicitly with status 0 once stdin has been fully consumed.
+    System.exit(0);
+  }
+}