Index: src/test/org/apache/nutch/metadata/TestMetadata.java
===================================================================
--- src/test/org/apache/nutch/metadata/TestMetadata.java (revision 467333)
+++ src/test/org/apache/nutch/metadata/TestMetadata.java (working copy)
@@ -1,267 +0,0 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.metadata;
-
-// JDK imports
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.Properties;
-import junit.framework.Test;
-
-// JUnit imports
-import junit.framework.TestCase;
-import junit.framework.TestSuite;
-import junit.textui.TestRunner;
-
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-
-
-/**
- * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}.
- *
- * @author Chris Mattmann
- * @author Jérôme Charron
- */
-public class TestMetadata extends TestCase {
-
-
- public TestMetadata(String testName) {
- super(testName);
- }
-
- public static Test suite() {
- return new TestSuite(TestMetadata.class);
- }
-
- public static void main(String[] args) {
- TestRunner.run(suite());
- }
-
-
- /** Test for the getNormalizedName(String) method. */
- public void testGetNormalizedName() {
- assertEquals("Content-Type", Metadata.getNormalizedName("Content-Type"));
- assertEquals("Content-Type", Metadata.getNormalizedName("ContentType"));
- assertEquals("Content-Type", Metadata.getNormalizedName("Content-type"));
- assertEquals("Content-Type", Metadata.getNormalizedName("contenttype"));
- assertEquals("Content-Type", Metadata.getNormalizedName("contentype"));
- assertEquals("Content-Type", Metadata.getNormalizedName("contntype"));
- }
-
- /** Test for the add(String, String) method. */
- public void testAdd() {
- String[] values = null;
- Metadata meta = new Metadata();
-
- values = meta.getValues("contentype");
- assertEquals(0, values.length);
-
- meta.add("contentype", "value1");
- values = meta.getValues("contentype");
- assertEquals(1, values.length);
- assertEquals("value1", values[0]);
-
- meta.add("Content-Type", "value2");
- values = meta.getValues("contentype");
- assertEquals(2, values.length);
- assertEquals("value1", values[0]);
- assertEquals("value2", values[1]);
-
- // NOTE : For now, the same value can be added many times.
- // Should it be changed?
- meta.add("ContentType", "value1");
- values = meta.getValues("Content-Type");
- assertEquals(3, values.length);
- assertEquals("value1", values[0]);
- assertEquals("value2", values[1]);
- assertEquals("value1", values[2]);
- }
-
- /** Test for the set(String, String) method. */
- public void testSet() {
- String[] values = null;
- Metadata meta = new Metadata();
-
- values = meta.getValues("contentype");
- assertEquals(0, values.length);
-
- meta.set("contentype", "value1");
- values = meta.getValues("contentype");
- assertEquals(1, values.length);
- assertEquals("value1", values[0]);
-
- meta.set("Content-Type", "value2");
- values = meta.getValues("contentype");
- assertEquals(1, values.length);
- assertEquals("value2", values[0]);
-
- meta.set("contenttype", "new value 1");
- meta.add("contenttype", "new value 2");
- values = meta.getValues("contentype");
- assertEquals(2, values.length);
- assertEquals("new value 1", values[0]);
- assertEquals("new value 2", values[1]);
- }
-
- /** Test for setAll(Properties) method */
- public void testSetProperties() {
- String[] values = null;
- Metadata meta = new Metadata();
- Properties props = new Properties();
-
- meta.setAll(props);
- assertEquals(0, meta.size());
-
- props.setProperty("name-one", "value1.1");
- meta.setAll(props);
- assertEquals(1, meta.size());
- values = meta.getValues("name-one");
- assertEquals(1, values.length);
- assertEquals("value1.1", values[0]);
-
- props.setProperty("name-two", "value2.1");
- meta.setAll(props);
- assertEquals(2, meta.size());
- values = meta.getValues("name-one");
- assertEquals(1, values.length);
- assertEquals("value1.1", values[0]);
- values = meta.getValues("name-two");
- assertEquals(1, values.length);
- assertEquals("value2.1", values[0]);
- }
-
- /** Test for get(String) method */
- public void testGet() {
- Metadata meta = new Metadata();
- assertNull(meta.get("a-name"));
-
- meta.add("a-name", "value-1");
- assertEquals("value-1", meta.get("a-name"));
- meta.add("a-name", "value-2");
- assertEquals("value-1", meta.get("a-name"));
- }
-
- /** Test for isMultiValued() method */
- public void testIsMultiValued() {
- Metadata meta = new Metadata();
- assertFalse(meta.isMultiValued("key"));
- meta.add("key", "value1");
- assertFalse(meta.isMultiValued("key"));
- meta.add("key", "value2");
- assertTrue(meta.isMultiValued("key"));
- }
-
- /** Test for names method */
- public void testNames() {
- String[] names = null;
- Metadata meta = new Metadata();
- names = meta.names();
- assertEquals(0, names.length);
-
- meta.add("name-one", "value");
- names = meta.names();
- assertEquals(1, names.length);
- assertEquals("name-one", names[0]);
- meta.add("name-two", "value");
- names = meta.names();
- assertEquals(2, names.length);
- }
-
- /** Test for remove(String) method */
- public void testRemove() {
- Metadata meta = new Metadata();
- meta.remove("name-one");
- assertEquals(0, meta.size());
- meta.add("name-one", "value-1.1");
- meta.add("name-one", "value-1.2");
- meta.add("name-two", "value-2.2");
- assertEquals(2, meta.size());
- assertNotNull(meta.get("name-one"));
- assertNotNull(meta.get("name-two"));
- meta.remove("name-one");
- assertEquals(1, meta.size());
- assertNull(meta.get("name-one"));
- assertNotNull(meta.get("name-two"));
- meta.remove("name-two");
- assertEquals(0, meta.size());
- assertNull(meta.get("name-one"));
- assertNull(meta.get("name-two"));
- }
-
- /** Test for equals(Object) method */
- public void testObject() {
- Metadata meta1 = new Metadata();
- Metadata meta2 = new Metadata();
- assertFalse(meta1.equals(null));
- assertFalse(meta1.equals("String"));
- assertTrue(meta1.equals(meta2));
- meta1.add("name-one", "value-1.1");
- assertFalse(meta1.equals(meta2));
- meta2.add("name-one", "value-1.1");
- assertTrue(meta1.equals(meta2));
- meta1.add("name-one", "value-1.2");
- assertFalse(meta1.equals(meta2));
- meta2.add("name-one", "value-1.2");
- assertTrue(meta1.equals(meta2));
- meta1.add("name-two", "value-2.1");
- assertFalse(meta1.equals(meta2));
- meta2.add("name-two", "value-2.1");
- assertTrue(meta1.equals(meta2));
- meta1.add("name-two", "value-2.2");
- assertFalse(meta1.equals(meta2));
- meta2.add("name-two", "value-2.x");
- assertFalse(meta1.equals(meta2));
- }
-
- /** Test for Writable implementation */
- public void testWritable() {
- Metadata result = null;
- Metadata meta = new Metadata();
- result = writeRead(meta);
- assertEquals(0, result.size());
- meta.add("name-one", "value-1.1");
- result = writeRead(meta);
- assertEquals(1, result.size());
- assertEquals(1, result.getValues("name-one").length);
- assertEquals("value-1.1", result.get("name-one"));
- meta.add("name-two", "value-2.1");
- meta.add("name-two", "value-2.2");
- result = writeRead(meta);
- assertEquals(2, result.size());
- assertEquals(1, result.getValues("name-one").length);
- assertEquals("value-1.1", result.getValues("name-one")[0]);
- assertEquals(2, result.getValues("name-two").length);
- assertEquals("value-2.1", result.getValues("name-two")[0]);
- assertEquals("value-2.2", result.getValues("name-two")[1]);
- }
-
- private Metadata writeRead(Metadata meta) {
- Metadata readed = new Metadata();
- try {
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- meta.write(new DataOutputStream(out));
- readed.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray())));
- } catch (IOException ioe) {
- fail(ioe.toString());
- }
- return readed;
- }
-
-}
Index: src/test/org/apache/nutch/protocol/TestContent.java
===================================================================
--- src/test/org/apache/nutch/protocol/TestContent.java (revision 467333)
+++ src/test/org/apache/nutch/protocol/TestContent.java (working copy)
@@ -16,7 +16,7 @@
package org.apache.nutch.protocol;
-import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.PlainMetadata;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.WritableTestUtils;
@@ -38,7 +38,7 @@
String url = "http://www.foo.com/";
- Metadata metaData = new Metadata();
+ PlainMetadata metaData = new PlainMetadata();
metaData.add("Host", "www.foo.com");
metaData.add("Content-Type", "text/html");
@@ -54,7 +54,7 @@
/** Unit tests for getContentType(String, String, byte[]) method. */
public void testGetContentType() throws Exception {
Content c = null;
- Metadata p = new Metadata();
+ PlainMetadata p = new PlainMetadata();
c = new Content("http://www.foo.com/",
"http://www.foo.com/",
Index: src/test/org/apache/nutch/parse/TestParseData.java
===================================================================
--- src/test/org/apache/nutch/parse/TestParseData.java (revision 467333)
+++ src/test/org/apache/nutch/parse/TestParseData.java (working copy)
@@ -16,11 +16,22 @@
package org.apache.nutch.parse;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.WritableTestUtils;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.PlainMetadata;
+import org.apache.nutch.protocol.Content;
import junit.framework.TestCase;
@@ -28,6 +39,7 @@
public class TestParseData extends TestCase {
+ private static final int ITERATIONS = 1000;
private Configuration conf = NutchConfiguration.create();
public TestParseData(String name) { super(name); }
@@ -41,7 +53,7 @@
new Outlink("http://bar.com/", "Bar", conf)
};
- Metadata metaData = new Metadata();
+ PlainMetadata metaData = new PlainMetadata();
metaData.add("Language", "en/us");
metaData.add("Charset", "UTF-8");
@@ -59,7 +71,7 @@
ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS,
"Max Outlinks Title",
outlinks,
- new Metadata());
+ new PlainMetadata());
Configuration conf = NutchConfiguration.create();
// No Outlinks
conf.setInt("db.max.outlinks.per.page", 0);
@@ -78,4 +90,69 @@
data = (ParseData) WritableTestUtils.writeRead(original, conf);
assertEquals(outlinks.length, data.getOutlinks().length);
}
+
+
+ public void testIOPerformance() throws FileNotFoundException, MalformedURLException{
+ Configuration conf=NutchConfiguration.create();
+ String url="http://www.apache.org/";
+ String base="http://www.apache.org/";
+ String contentType="text/html";
+
+
+ File f=new File("build/test/parsedataio");
+ FileOutputStream fos=new FileOutputStream(f);
+ DataOutputStream dos=new DataOutputStream(fos);
+
+ long time=System.currentTimeMillis();
+
+ Outlink [] links=new Outlink[30];
+
+ for(int i=0;i *
* -------------------------- */
-
+
+ public static PlainMetadata convert(Metadata s){
+ String[] names=s.names();
+ PlainMetadata dest=new PlainMetadata();
+ for(int i=0;i 1;
+ }
+
+ /**
+ * Returns an array of the names contained in the metadata.
+ */
+ public String[] names() {
+ return (String[]) metadata.keySet().toArray(new String[metadata.size()]);
+ }
+
+ /**
+ * Get the value associated to a metadata name. If many values are associated
+ * to the specified name, then the first one is returned.
+ *
+ * @param name
+ * of the metadata.
+ * @return the value associated to the specified metadata name.
+ */
+ public String get(String name) {
+ Object values = metadata.get(name);
+ if ((values != null) && (values instanceof List)) {
+ return (String) ((List) values).get(0);
+ } else {
+ return (String) values;
+ }
+ }
+
+ /**
+ * Get the values associated to a metadata name.
+ *
+ * @param name
+ * of the metadata.
+ * @return the values associated to a metadata name.
+ */
+ public String[] getValues(String name) {
+ Object values = metadata.get(name);
+ if (values != null) {
+ if (values instanceof List) {
+ List list = (List) values;
+ return (String[]) list.toArray(new String[list.size()]);
+ } else {
+ return new String[] { (String) values };
+ }
+ }
+ return new String[0];
+ }
+
+ /**
+ * Add a metadata name/value mapping. Add the specified value to the list of
+ * values associated to the specified metadata name.
+ *
+ * @param name
+ * the metadata name.
+ * @param value
+ * the metadata value.
+ */
+ public void add(String name, String value) {
+ Object values = metadata.get(name);
+ if (values != null) {
+ if (values instanceof String) {
+ List list = new ArrayList();
+ list.add(values);
+ list.add(value);
+ metadata.put(name, list);
+ } else if (values instanceof List) {
+ ((List) values).add(value);
+ }
+ } else {
+ metadata.put(name, value);
+ }
+ }
+
+ public void setAll(Properties properties) {
+ Enumeration names = properties.propertyNames();
+ while (names.hasMoreElements()) {
+ String name = (String) names.nextElement();
+ set(name, properties.getProperty(name));
+ }
+ }
+
+ /**
+ * Set metadata name/value. Associate the specified value to the specified
+ * metadata name. If some previous values were associated to this name, they
+ * are removed.
+ *
+ * @param name
+ * the metadata name.
+ * @param value
+ * the metadata value.
+ */
+ public void set(String name, String value) {
+ remove(name);
+ add(name, value);
+ }
+
+ /**
+ * Remove a metadata and all its associated values.
+ */
+ public void remove(String name) {
+ metadata.remove(name);
+ }
+
+ /**
+ * Returns the number of metadata names in this metadata.
+ */
+ public int size() {
+ return metadata.size();
+ }
+
+ /**
+ * Compares the number of names and, for every name of this instance, the
+ * associated values in order.
+ *
+ * @param o the object to compare with; may be null.
+ * @return true if o is a PlainMetadata holding the same names and values.
+ */
+ public boolean equals(Object o) {
+ // instanceof is false for null, so one test rejects null and foreign types
+ if (!(o instanceof PlainMetadata)) {
+ return false;
+ }
+ PlainMetadata other = (PlainMetadata) o;
+ if (other.size() != size()) {
+ return false;
+ }
+ String[] names = names();
+ for (int i = 0; i < names.length; i++) {
+ String[] otherValues = other.getValues(names[i]);
+ String[] thisValues = getValues(names[i]);
+ if (otherValues.length != thisValues.length) {
+ return false;
+ }
+ for (int j = 0; j < otherValues.length; j++) {
+ String a = otherValues[j];
+ String b = thisValues[j];
+ if (a == null ? b != null : !a.equals(b)) { // null-safe comparison
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+
+ public String toString() {
+ StringBuffer buf = new StringBuffer();
+ String[] names = names();
+ for (int i = 0; i < names.length; i++) {
+ String[] values = getValues(names[i]);
+ for (int j = 0; j < values.length; j++) {
+ buf.append(names[i]).append("=").append(values[j]).append(" ");
+ }
+ }
+ return buf.toString();
+ }
+
+ public final void write(DataOutput out) throws IOException {
+ String[] values = null;
+ String[] names = names();
+ WritableUtils.writeStringArray(out, names);
+ for (int i = 0; i < names.length; i++) {
+ values = getValues(names[i]);
+ WritableUtils.writeStringArray(out, values);
+ }
+ }
+
+ public final void readFields(DataInput in) throws IOException {
+ String keys[] = WritableUtils.readStringArray(in);
+ for (int i = 0; i < keys.length; i++) {
+ String values[] = WritableUtils.readStringArray(in);
+ for (int j = 0; j < values.length; j++) {
+ add(keys[i], values[j]);
+ }
+ }
+ }
+}
Index: src/java/org/apache/nutch/metadata/MetadataSpellChecker.java
===================================================================
--- src/java/org/apache/nutch/metadata/MetadataSpellChecker.java (revision 0)
+++ src/java/org/apache/nutch/metadata/MetadataSpellChecker.java (revision 0)
@@ -0,0 +1,112 @@
+package org.apache.nutch.metadata;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * This decorator brings spell checking capabilities to metadata, if
+ * none is required use {Metadata} instead
+ *
+ * All the static String fields declared by this class are used as reference
+ * names for syntax correction on meta-data naming.
+ */
+public class MetadataSpellChecker extends PlainMetadata implements CreativeCommons,
+ DublinCore, HttpHeaders, Nutch, Office {
+
+ private final static Map NAMES_IDX = new HashMap();
+
+ private static String[] normalized = null;
+
+ // Uses self introspection to fill the metanames index and the
+ // metanames list.
+ static {
+ Field[] fields = MetadataSpellChecker.class.getFields(); // own class: also sees the fields of the constant interfaces it implements
+ for (int i=0; i
+ * content-type gives Content-Type
+ * CoNtEntType gives Content-Type
+ * ConTnTtYpe gives Content-Type
+ *
+ * If no matching with a well-known metadata name is found, then the original
+ * name is returned.
+ */
+ public static String getNormalizedName(String name) {
+ String searched = normalize(name);
+ String value = (String) NAMES_IDX.get(searched);
+
+ if ((value == null) && (normalized != null)) {
+ int threshold = searched.length() / 3;
+ for (int i=0; i VERSION)
throw new VersionMismatchException(VERSION, version);
- url = UTF8.readString(in); // read url
- base = UTF8.readString(in); // read base
+ if (version == 2) {
+ //handle version 2 format
+ String sData[]=WritableUtils.readStringArray(in);
+ url = sData[0];
+ base = sData[1];
+ contentType = sData[2];
- content = new byte[in.readInt()]; // read content
- in.readFully(content);
+ metadata = new PlainMetadata();
+ metadata.readFields(in);
- contentType = UTF8.readString(in); // read contentType
+ content = new byte[in.readInt()]; // read content
+ in.readFully(content);
+ } else if (version == 1) {
+ //handle version 1 format
- metadata = new Metadata();
- metadata.readFields(in); // read meta data
+ url = UTF8.readString(in); // read url
+ base = UTF8.readString(in); // read base
+
+ content = new byte[in.readInt()]; // read content
+ in.readFully(content);
+
+ contentType = UTF8.readString(in); // read contentType
+
+ Metadata metadata = new Metadata();
+ metadata.readFields(in); // read meta data
+ this.metadata = Metadata.convert(metadata);
+ }
}
protected final void writeCompressed(DataOutput out) throws IOException {
out.writeByte(version);
- UTF8.writeString(out, url); // write url
- UTF8.writeString(out, base); // write base
-
- out.writeInt(content.length); // write content
+ //collect all strings to array
+ String sData[]=new String[]{
+ url,base,contentType
+ };
+
+ WritableUtils.writeStringArray(out,sData); //write Strings
+
+ metadata.write(out); // write metadata
+
+ out.writeInt(content.length); // write content
out.write(content);
-
- UTF8.writeString(out, contentType); // write contentType
-
- metadata.write(out); // write metadata
}
public static Content read(DataInput in) throws IOException {
@@ -99,18 +157,15 @@
return content;
}
- //
- // Accessor methods
- //
-
/** The url fetched. */
public String getUrl() {
ensureInflated();
return url;
}
- /** The base url for relative links contained in the content.
- * Maybe be different from url if the request redirected.
+ /**
+ * The base url for relative links contained in the content. May be
+ * different from url if the request redirected.
*/
public String getBaseUrl() {
ensureInflated();
@@ -122,12 +177,15 @@
ensureInflated();
return content;
}
+
public void setContent(byte[] content) {
ensureInflated();
this.content = content;
}
- /** The media type of the retrieved content.
+ /**
+ * The media type of the retrieved content.
+ *
* @see
* http://www.iana.org/assignments/media-types/
*/
@@ -135,48 +193,47 @@
ensureInflated();
return contentType;
}
+
public void setContentType(String contentType) {
ensureInflated();
this.contentType = contentType;
}
/** Other protocol-specific data. */
- public Metadata getMetadata() {
+ public PlainMetadata getMetadata() {
ensureInflated();
return metadata;
}
/** Other protocol-specific data. */
- public void setMetadata(Metadata metadata) {
+ public void setMetadata(PlainMetadata metadata) {
ensureInflated();
this.metadata = metadata;
}
public boolean equals(Object o) {
ensureInflated();
- if (!(o instanceof Content)){
+ if (!(o instanceof Content)) {
return false;
}
- Content that = (Content)o;
+ Content that = (Content) o;
that.ensureInflated();
- return
- this.url.equals(that.url) &&
- this.base.equals(that.base) &&
- Arrays.equals(this.getContent(), that.getContent()) &&
- this.contentType.equals(that.contentType) &&
- this.metadata.equals(that.metadata);
+ return this.url.equals(that.url) && this.base.equals(that.base)
+ && Arrays.equals(this.getContent(), that.getContent())
+ && this.contentType.equals(that.contentType)
+ && this.metadata.equals(that.metadata);
}
public String toString() {
ensureInflated();
StringBuffer buffer = new StringBuffer();
- buffer.append("url: " + url + "\n" );
- buffer.append("base: " + base + "\n" );
- buffer.append("contentType: " + contentType + "\n" );
- buffer.append("metadata: " + metadata + "\n" );
+ buffer.append("url: " + url + "\n");
+ buffer.append("base: " + base + "\n");
+ buffer.append("contentType: " + contentType + "\n");
+ buffer.append("metadata: " + metadata + "\n");
buffer.append("Content:\n");
- buffer.append(new String(content)); // try default encoding
+ buffer.append(new String(content)); // try default encoding
return buffer.toString();
@@ -185,7 +242,7 @@
public static void main(String argv[]) throws Exception {
String usage = "Content (-local | -dfs ) recno segment";
-
+
if (argv.length < 3) {
System.out.println("usage:" + usage);
return;
@@ -199,7 +256,8 @@
Path file = new Path(segment, DIR_NAME);
System.out.println("Reading from file: " + file);
- ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf);
+ ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
+ conf);
Content content = new Content();
contents.get(recno, content);
@@ -216,10 +274,10 @@
private String getContentType(String typeName, String url, byte[] data) {
MimeType type = null;
try {
- typeName = MimeType.clean(typeName);
- type = typeName == null ? null : this.mimeTypes.forName(typeName);
+ typeName = MimeType.clean(typeName);
+ type = typeName == null ? null : this.mimeTypes.forName(typeName);
} catch (MimeTypeException mte) {
- // Seems to be a malformed mime type name...
+ // Seems to be a malformed mime type name...
}
if (typeName == null || type == null || !type.matches(url)) {
@@ -229,8 +287,8 @@
type = this.mimeTypes.getMimeType(url);
typeName = type == null ? typeName : type.getName();
}
- if (typeName == null || type == null ||
- (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
+ if (typeName == null || type == null
+ || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
// If no mime-type already found, or the one found doesn't match
// the magic bytes it should be, then, guess a mime-type from the
// document content (magic bytes)
Index: src/java/org/apache/nutch/segment/SegmentMerger.java
===================================================================
--- src/java/org/apache/nutch/segment/SegmentMerger.java (revision 467333)
+++ src/java/org/apache/nutch/segment/SegmentMerger.java (working copy)
@@ -32,7 +32,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.fetcher.Fetcher;
-import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.PlainMetadata;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
@@ -144,12 +144,12 @@
((CrawlDatum)o).getMetaData().put(SEGMENT_NAME_KEY, new UTF8(segment));
} else if (o instanceof Content) {
if (((Content)o).getMetadata() == null) {
- ((Content)o).setMetadata(new Metadata());
+ ((Content)o).setMetadata(new PlainMetadata());
}
((Content)o).getMetadata().set(SEGMENT_NAME_KEY.toString(), segment);
} else if (o instanceof ParseData) {
if (((ParseData)o).getParseMeta() == null) {
- ((ParseData)o).setParseMeta(new Metadata());
+ ((ParseData)o).setParseMeta(new PlainMetadata());
}
((ParseData)o).getParseMeta().set(SEGMENT_NAME_KEY.toString(), segment);
} else if (o instanceof ParseText) {
Index: src/java/org/apache/nutch/crawl/CrawlDbMerger.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbMerger.java (revision 467333)
+++ src/java/org/apache/nutch/crawl/CrawlDbMerger.java (working copy)
@@ -28,6 +28,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.net.URLFilters;
@@ -51,30 +52,104 @@
*/
public class CrawlDbMerger extends Configured {
private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
+
+
+ public static class TimeoutWatchDog implements Runnable{
- public static class Merger extends MapReduceBase implements Reducer {
+ // Written by the watched thread (on/off), read by the watchdog thread (run).
+ volatile Thread toNotify=null;
+ int timeout;
+ volatile long lastPing=0;
+ public TimeoutWatchDog(int timeoutInSeconds){
+ this.timeout=timeoutInSeconds;
+ }
+
+ public void on(){
+ lastPing=System.currentTimeMillis(); // refresh the ping before publishing the thread
+ if(toNotify==null){
+ toNotify=Thread.currentThread();
+ }
+ }
+
+ public void off(){
+ toNotify=null;
+ }
+
+ /** Polls once per second; interrupts the watched thread while it is overdue. */
+ public void run(){
+ try{
+ while(true){
+ Thread.sleep(1000);
+ Thread t=toNotify; // snapshot: off() may clear the field concurrently
+ if(t!=null && System.currentTimeMillis()-lastPing>1000L*timeout){
+ LOG.info("seems like thread is in trouble!");
+ t.interrupt(); // NOTE(review): the flag stays set if the filter ignores it - confirm callers cope
+ }
+ }
+ } catch (InterruptedException e){
+ // interrupting the watchdog thread itself is the signal to stop polling
+ }
+ }
+ }
+
+ public static class MyMapper extends org.apache.hadoop.mapred.lib.IdentityMapper {
private URLFilters filters = null;
- MapWritable meta = new MapWritable();
+ int index=0;
+ private Thread watchDogThread;
+ private TimeoutWatchDog watchDog;
- public void close() throws IOException {}
-
- public void configure(JobConf conf) {
- if (conf.getBoolean("crawldb.merger.urlfilters", false))
+ public void configure(JobConf conf) {
+ watchDog=new TimeoutWatchDog(10);
+ watchDogThread=new Thread(watchDog);
+ watchDogThread.setDaemon(true); // the poller must never keep the JVM alive
+ watchDogThread.start();
+ super.configure(conf);
+ if (conf.getBoolean("crawldb.merger.urlfilters", false)) {
+ LOG.info("Filtering urls");
filters = new URLFilters(conf);
+ }
}
- public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
- throws IOException {
+ public void map(WritableComparable key, Writable val, OutputCollector output, Reporter reporter) throws IOException {
+ if(index++%10000==0){
+ if(LOG.isInfoEnabled()){
+ LOG.info("Processing entry #" + index);
+ }
+ }
if (filters != null) {
try {
- if (filters.filter(((UTF8) key).toString()) == null)
+ watchDog.on();
+ if (filters.filter(((UTF8) key).toString()) == null) {
+ if(LOG.isDebugEnabled()) {
+ LOG.debug("Filtering out: " + key.toString());
+ }
return;
+ }
} catch (Exception e) {
if (LOG.isDebugEnabled()) {
LOG.debug("Can't filter " + key + ": " + e);
}
}
+ finally { watchDog.off(); } // also covers the filtered-out return above
}
+ super.map(key, val, output, reporter); // emit even when no filters are configured
+ }
+ }
+
+ public static class Merger extends MapReduceBase implements Reducer {
+ MapWritable meta = new MapWritable();
+ int index=0;
+
+ public void close() throws IOException {}
+
+ public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
+ throws IOException {
+ if(index++%10000==0){
+ if(LOG.isInfoEnabled()){
+ LOG.info("Processing entry #" + index);
+ }
+ }
+
CrawlDatum res = null;
long resTime = 0L;
meta.clear();
@@ -130,6 +205,7 @@
job.setInputKeyClass(UTF8.class);
job.setInputValueClass(CrawlDatum.class);
+ job.setMapperClass(MyMapper.class);
job.setReducerClass(Merger.class);
job.setOutputPath(newCrawlDb);
@@ -165,4 +241,5 @@
CrawlDbMerger merger = new CrawlDbMerger(conf);
merger.merge(output, (Path[]) dbs.toArray(new Path[dbs.size()]), filter);
}
+
}
Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java (revision 467333)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java (working copy)
@@ -23,10 +23,12 @@
import org.apache.hadoop.io.*;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.fetcher.Fetcher;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapred.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.net.*;
@@ -38,6 +40,9 @@
/* Parse content in a segment. */
public class ParseOutputFormat implements OutputFormat {
+
+ static Configuration conf=NutchConfiguration.create();
+
private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class);
private UrlNormalizer urlNormalizer;
@@ -80,10 +85,12 @@
public void write(WritableComparable key, Writable value)
throws IOException {
- Parse parse = (Parse)value;
+ Parse parse = (Parse) value; // only Parse interface methods are used; do not require ParseImpl
+
String fromUrl = key.toString();
String fromHost = null;
- String toHost = null;
+ String toHost = null;
+ //LOG.info("to: appending text");
textOut.append(key, new ParseText(parse.getText()));
ParseData parseData = parse.getData();
@@ -95,6 +102,7 @@
// append a CrawlDatum with a signature
CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
d.setSignature(signature);
+ // LOG.info("co: appending signature");
crawlOut.append(key, d);
}
}
@@ -153,9 +161,14 @@
}
continue;
}
+ //LOG.info("co: appending target");
crawlOut.append(targetUrl, target);
- if (adjust != null) crawlOut.append(key, adjust);
+ if (adjust != null) {
+ //LOG.info("co: appending adjust");
+ crawlOut.append(key, adjust);
+ }
}
+ //LOG.info("appending parseData");
dataOut.append(key, parseData);
}
Index: src/java/org/apache/nutch/parse/ParseData.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseData.java (revision 467333)
+++ src/java/org/apache/nutch/parse/ParseData.java (working copy)
@@ -25,76 +25,111 @@
import org.apache.hadoop.conf.Configurable;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.PlainMetadata;
import org.apache.nutch.util.NutchConfiguration;
-
-/** Data extracted from a page's content.
+/**
+ * Data extracted from a page's content.
+ *
* @see Parse#getData()
*/
public final class ParseData extends VersionedWritable implements Configurable {
public static final String DIR_NAME = "parse_data";
- private final static byte VERSION = 4;
+ private final static byte VERSION = 5;
+ private int maxOutlinks = Integer.MAX_VALUE;
private String title;
+
private Outlink[] outlinks;
- private Metadata contentMeta;
- private Metadata parseMeta;
+
+ private PlainMetadata contentMeta;
+
+ private PlainMetadata parseMeta;
+
private ParseStatus status;
+
private Configuration conf;
-
- // TODO mb@media-style.com: should we really implement Configurable or should we add the
- // parameter Configuration to the default-constructor. NOTE: The test
- // TestWriteable instantiates ParseData with Class.newInstance() -> the default
- // constructor is called -> conf is null. The programmer which use this object may not forget to set the conf.
- public ParseData() {}
+ public ParseData() {
+ }
+
+ /**
+ * @deprecated use {@link #ParseData(ParseStatus, String, Outlink[], PlainMetadata)}
+ * instead
+ */
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
- Metadata contentMeta) {
+ Metadata contentMeta) {
this(status, title, outlinks, contentMeta, new Metadata());
}
+
+ public ParseData(ParseStatus status, String title, Outlink[] outlinks,
+ PlainMetadata contentMeta) {
+ this(status, title, outlinks, contentMeta, new PlainMetadata());
+ }
+
+ /**
+ * @deprecated use {@link #ParseData(ParseStatus, String, Outlink[], PlainMetadata, PlainMetadata)}
+ * instead
+ */
+ public ParseData(ParseStatus status, String title, Outlink[] outlinks,
+ Metadata contentMeta, Metadata parseMeta) {
+ this.status = status;
+ this.title = title;
+ this.outlinks = outlinks;
+ //convert metadata
+ this.contentMeta = Metadata.convert(contentMeta);
+ this.parseMeta = Metadata.convert(parseMeta);
+ }
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
- Metadata contentMeta, Metadata parseMeta) {
+ PlainMetadata contentMeta, PlainMetadata parseMeta) {
this.status = status;
this.title = title;
this.outlinks = outlinks;
+ // arguments are already PlainMetadata: store as-is, no conversion
this.contentMeta = contentMeta;
this.parseMeta = parseMeta;
}
- //
- // Accessor methods
- //
+ /** The status of parsing the page. */
+ public ParseStatus getStatus() {
+ return status;
+ }
- /** The status of parsing the page. */
- public ParseStatus getStatus() { return status; }
-
/** The title of the page. */
- public String getTitle() { return title; }
+ public String getTitle() {
+ return title;
+ }
/** The outlinks of the page. */
- public Outlink[] getOutlinks() { return outlinks; }
+ public Outlink[] getOutlinks() {
+ return outlinks;
+ }
/** The original Metadata retrieved from content */
- public Metadata getContentMeta() { return contentMeta; }
+ public PlainMetadata getContentMeta() {
+ return contentMeta;
+ }
/**
- * Other content properties.
- * This is the place to find format-specific properties.
- * Different parser implementations for different content types will populate
- * this differently.
+ * Other content properties. This is the place to find format-specific
+ * properties. Different parser implementations for different content types
+ * will populate this differently.
*/
- public Metadata getParseMeta() { return parseMeta; }
-
- public void setParseMeta(Metadata parseMeta) {
+ public PlainMetadata getParseMeta() {
+ return parseMeta;
+ }
+
+ public void setParseMeta(PlainMetadata parseMeta) {
this.parseMeta = parseMeta;
}
-
+
/**
- * Get a metadata single value.
- * This method first looks for the metadata value in the parse metadata. If no
- * value is found it the looks for the metadata in the content metadata.
+ * Get a metadata single value. This method first looks for the metadata value
+ * in the parse metadata. If no value is found it then looks for the metadata
+ * in the content metadata.
+ *
* @see #getContentMeta()
* @see #getParseMeta()
*/
@@ -105,62 +140,92 @@
}
return value;
}
-
- //
- // Writable methods
- //
- public byte getVersion() { return VERSION; }
+ public byte getVersion() {
+ return VERSION;
+ }
public final void readFields(DataInput in) throws IOException {
byte version = in.readByte();
- if (version > 1)
+
+ if (version == 5) {
+ //handle version 5 here
status = ParseStatus.read(in);
- else
- status = ParseStatus.STATUS_SUCCESS;
- title = UTF8.readString(in); // read title
+ title = UTF8.readString(in); // read title
+ parseMeta=new PlainMetadata();
+
+ String[] anchors=WritableUtils.readStringArray(in);
+ String[] urls=WritableUtils.readStringArray(in);
+
+ outlinks=new Outlink[anchors.length];
+ for(int i=0;i= 0) {
- outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
- }
- outlinks = new Outlink[outlinksToRead];
- for (int i = 0; i < outlinksToRead; i++) {
- outlinks[i] = Outlink.read(in);
- }
- for (int i = outlinksToRead; i < totalOutlinks; i++) {
- Outlink.skip(in);
- }
-
- if (version < 3) {
- int propertyCount = in.readInt(); // read metadata
- contentMeta = new Metadata();
- for (int i = 0; i < propertyCount; i++) {
- contentMeta.add(UTF8.readString(in), UTF8.readString(in));
- }
- } else {
- contentMeta = new Metadata();
+ contentMeta = new PlainMetadata();
contentMeta.readFields(in);
- }
- if (version > 3) {
- parseMeta = new Metadata();
+ parseMeta = new PlainMetadata();
parseMeta.readFields(in);
+ } else {
+ //handle other versions here
+ if (version > 1)
+ status = ParseStatus.read(in);
+ else
+ status = ParseStatus.STATUS_SUCCESS;
+ title = UTF8.readString(in); // read title
+
+ int totalOutlinks = in.readInt(); // read outlinks
+ int maxOutlinksPerPage = this.conf
+ .getInt("db.max.outlinks.per.page", 100);
+ int outlinksToRead = totalOutlinks;
+ if (maxOutlinksPerPage >= 0) {
+ outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
+ }
+ outlinks = new Outlink[outlinksToRead];
+ for (int i = 0; i < outlinksToRead; i++) {
+ outlinks[i] = Outlink.read(in);
+ }
+ for (int i = outlinksToRead; i < totalOutlinks; i++) {
+ Outlink.skip(in);
+ }
+
+ if (version < 3) {
+ int propertyCount = in.readInt(); // read metadata
+ contentMeta = new PlainMetadata();
+ for (int i = 0; i < propertyCount; i++) {
+ contentMeta.add(UTF8.readString(in), UTF8.readString(in));
+ }
+ } else {
+ contentMeta = new PlainMetadata();
+ contentMeta.readFields(in);
+ }
+ if (version > 3) {
+ parseMeta = new PlainMetadata();
+ parseMeta.readFields(in);
+ }
}
}
public final void write(DataOutput out) throws IOException {
- out.writeByte(VERSION); // write version
- status.write(out); // write status
- UTF8.writeString(out, title); // write title
+ out.writeByte(VERSION); // write version
+ status.write(out); // write status
+ UTF8.writeString(out, title); // write title
- out.writeInt(outlinks.length); // write outlinks
- for (int i = 0; i < outlinks.length; i++) {
- outlinks[i].write(out);
+ String[] anchors=new String[outlinks.length];
+ String[] urls=new String[outlinks.length];
+
+ for(int i=0;i