Index: src/test/org/apache/nutch/metadata/TestMetadata.java =================================================================== --- src/test/org/apache/nutch/metadata/TestMetadata.java (revision 467333) +++ src/test/org/apache/nutch/metadata/TestMetadata.java (working copy) @@ -1,267 +0,0 @@ -/** - * Copyright 2005 The Apache Software Foundation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.metadata; - -// JDK imports -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.Properties; -import junit.framework.Test; - -// JUnit imports -import junit.framework.TestCase; -import junit.framework.TestSuite; -import junit.textui.TestRunner; - -// Nutch imports -import org.apache.nutch.metadata.Metadata; - - -/** - * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}. - * - * @author Chris Mattmann - * @author Jérôme Charron - */ -public class TestMetadata extends TestCase { - - - public TestMetadata(String testName) { - super(testName); - } - - public static Test suite() { - return new TestSuite(TestMetadata.class); - } - - public static void main(String[] args) { - TestRunner.run(suite()); - } - - - /** Test for the getNormalizedName(String) method. 
*/ - public void testGetNormalizedName() { - assertEquals("Content-Type", Metadata.getNormalizedName("Content-Type")); - assertEquals("Content-Type", Metadata.getNormalizedName("ContentType")); - assertEquals("Content-Type", Metadata.getNormalizedName("Content-type")); - assertEquals("Content-Type", Metadata.getNormalizedName("contenttype")); - assertEquals("Content-Type", Metadata.getNormalizedName("contentype")); - assertEquals("Content-Type", Metadata.getNormalizedName("contntype")); - } - - /** Test for the add(String, String) method. */ - public void testAdd() { - String[] values = null; - Metadata meta = new Metadata(); - - values = meta.getValues("contentype"); - assertEquals(0, values.length); - - meta.add("contentype", "value1"); - values = meta.getValues("contentype"); - assertEquals(1, values.length); - assertEquals("value1", values[0]); - - meta.add("Content-Type", "value2"); - values = meta.getValues("contentype"); - assertEquals(2, values.length); - assertEquals("value1", values[0]); - assertEquals("value2", values[1]); - - // NOTE : For now, the same value can be added many times. - // Should it be changed? - meta.add("ContentType", "value1"); - values = meta.getValues("Content-Type"); - assertEquals(3, values.length); - assertEquals("value1", values[0]); - assertEquals("value2", values[1]); - assertEquals("value1", values[2]); - } - - /** Test for the set(String, String) method. 
*/ - public void testSet() { - String[] values = null; - Metadata meta = new Metadata(); - - values = meta.getValues("contentype"); - assertEquals(0, values.length); - - meta.set("contentype", "value1"); - values = meta.getValues("contentype"); - assertEquals(1, values.length); - assertEquals("value1", values[0]); - - meta.set("Content-Type", "value2"); - values = meta.getValues("contentype"); - assertEquals(1, values.length); - assertEquals("value2", values[0]); - - meta.set("contenttype", "new value 1"); - meta.add("contenttype", "new value 2"); - values = meta.getValues("contentype"); - assertEquals(2, values.length); - assertEquals("new value 1", values[0]); - assertEquals("new value 2", values[1]); - } - - /** Test for setAll(Properties) method */ - public void testSetProperties() { - String[] values = null; - Metadata meta = new Metadata(); - Properties props = new Properties(); - - meta.setAll(props); - assertEquals(0, meta.size()); - - props.setProperty("name-one", "value1.1"); - meta.setAll(props); - assertEquals(1, meta.size()); - values = meta.getValues("name-one"); - assertEquals(1, values.length); - assertEquals("value1.1", values[0]); - - props.setProperty("name-two", "value2.1"); - meta.setAll(props); - assertEquals(2, meta.size()); - values = meta.getValues("name-one"); - assertEquals(1, values.length); - assertEquals("value1.1", values[0]); - values = meta.getValues("name-two"); - assertEquals(1, values.length); - assertEquals("value2.1", values[0]); - } - - /** Test for get(String) method */ - public void testGet() { - Metadata meta = new Metadata(); - assertNull(meta.get("a-name")); - - meta.add("a-name", "value-1"); - assertEquals("value-1", meta.get("a-name")); - meta.add("a-name", "value-2"); - assertEquals("value-1", meta.get("a-name")); - } - - /** Test for isMultiValued() method */ - public void testIsMultiValued() { - Metadata meta = new Metadata(); - assertFalse(meta.isMultiValued("key")); - meta.add("key", "value1"); - 
assertFalse(meta.isMultiValued("key")); - meta.add("key", "value2"); - assertTrue(meta.isMultiValued("key")); - } - - /** Test for names method */ - public void testNames() { - String[] names = null; - Metadata meta = new Metadata(); - names = meta.names(); - assertEquals(0, names.length); - - meta.add("name-one", "value"); - names = meta.names(); - assertEquals(1, names.length); - assertEquals("name-one", names[0]); - meta.add("name-two", "value"); - names = meta.names(); - assertEquals(2, names.length); - } - - /** Test for remove(String) method */ - public void testRemove() { - Metadata meta = new Metadata(); - meta.remove("name-one"); - assertEquals(0, meta.size()); - meta.add("name-one", "value-1.1"); - meta.add("name-one", "value-1.2"); - meta.add("name-two", "value-2.2"); - assertEquals(2, meta.size()); - assertNotNull(meta.get("name-one")); - assertNotNull(meta.get("name-two")); - meta.remove("name-one"); - assertEquals(1, meta.size()); - assertNull(meta.get("name-one")); - assertNotNull(meta.get("name-two")); - meta.remove("name-two"); - assertEquals(0, meta.size()); - assertNull(meta.get("name-one")); - assertNull(meta.get("name-two")); - } - - /** Test for equals(Object) method */ - public void testObject() { - Metadata meta1 = new Metadata(); - Metadata meta2 = new Metadata(); - assertFalse(meta1.equals(null)); - assertFalse(meta1.equals("String")); - assertTrue(meta1.equals(meta2)); - meta1.add("name-one", "value-1.1"); - assertFalse(meta1.equals(meta2)); - meta2.add("name-one", "value-1.1"); - assertTrue(meta1.equals(meta2)); - meta1.add("name-one", "value-1.2"); - assertFalse(meta1.equals(meta2)); - meta2.add("name-one", "value-1.2"); - assertTrue(meta1.equals(meta2)); - meta1.add("name-two", "value-2.1"); - assertFalse(meta1.equals(meta2)); - meta2.add("name-two", "value-2.1"); - assertTrue(meta1.equals(meta2)); - meta1.add("name-two", "value-2.2"); - assertFalse(meta1.equals(meta2)); - meta2.add("name-two", "value-2.x"); - 
assertFalse(meta1.equals(meta2)); - } - - /** Test for Writable implementation */ - public void testWritable() { - Metadata result = null; - Metadata meta = new Metadata(); - result = writeRead(meta); - assertEquals(0, result.size()); - meta.add("name-one", "value-1.1"); - result = writeRead(meta); - assertEquals(1, result.size()); - assertEquals(1, result.getValues("name-one").length); - assertEquals("value-1.1", result.get("name-one")); - meta.add("name-two", "value-2.1"); - meta.add("name-two", "value-2.2"); - result = writeRead(meta); - assertEquals(2, result.size()); - assertEquals(1, result.getValues("name-one").length); - assertEquals("value-1.1", result.getValues("name-one")[0]); - assertEquals(2, result.getValues("name-two").length); - assertEquals("value-2.1", result.getValues("name-two")[0]); - assertEquals("value-2.2", result.getValues("name-two")[1]); - } - - private Metadata writeRead(Metadata meta) { - Metadata readed = new Metadata(); - try { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - meta.write(new DataOutputStream(out)); - readed.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray()))); - } catch (IOException ioe) { - fail(ioe.toString()); - } - return readed; - } - -} Index: src/test/org/apache/nutch/protocol/TestContent.java =================================================================== --- src/test/org/apache/nutch/protocol/TestContent.java (revision 467333) +++ src/test/org/apache/nutch/protocol/TestContent.java (working copy) @@ -16,7 +16,7 @@ package org.apache.nutch.protocol; -import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.PlainMetadata; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.WritableTestUtils; @@ -38,7 +38,7 @@ String url = "http://www.foo.com/"; - Metadata metaData = new Metadata(); + PlainMetadata metaData = new PlainMetadata(); metaData.add("Host", "www.foo.com"); 
metaData.add("Content-Type", "text/html"); @@ -54,7 +54,7 @@ /** Unit tests for getContentType(String, String, byte[]) method. */ public void testGetContentType() throws Exception { Content c = null; - Metadata p = new Metadata(); + PlainMetadata p = new PlainMetadata(); c = new Content("http://www.foo.com/", "http://www.foo.com/", Index: src/test/org/apache/nutch/parse/TestParseData.java =================================================================== --- src/test/org/apache/nutch/parse/TestParseData.java (revision 467333) +++ src/test/org/apache/nutch/parse/TestParseData.java (working copy) @@ -16,11 +16,22 @@ package org.apache.nutch.parse; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; + import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.WritableTestUtils; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.PlainMetadata; +import org.apache.nutch.protocol.Content; import junit.framework.TestCase; @@ -28,6 +39,7 @@ public class TestParseData extends TestCase { + private static final int ITERATIONS = 1000; private Configuration conf = NutchConfiguration.create(); public TestParseData(String name) { super(name); } @@ -41,7 +53,7 @@ new Outlink("http://bar.com/", "Bar", conf) }; - Metadata metaData = new Metadata(); + PlainMetadata metaData = new PlainMetadata(); metaData.add("Language", "en/us"); metaData.add("Charset", "UTF-8"); @@ -59,7 +71,7 @@ ParseData original = new ParseData(ParseStatus.STATUS_SUCCESS, "Max Outlinks Title", outlinks, - new Metadata()); + new PlainMetadata()); Configuration conf = NutchConfiguration.create(); // No Outlinks conf.setInt("db.max.outlinks.per.page", 0); @@ -78,4 +90,69 @@ data = (ParseData) 
WritableTestUtils.writeRead(original, conf); assertEquals(outlinks.length, data.getOutlinks().length); } + + + public void testIOPerformance() throws FileNotFoundException, MalformedURLException{ + Configuration conf=NutchConfiguration.create(); + String url="http://www.apache.org/"; + String base="http://www.apache.org/"; + String contentType="text/html"; + + + File f=new File("build/test/parsedataio"); + FileOutputStream fos=new FileOutputStream(f); + DataOutputStream dos=new DataOutputStream(fos); + + long time=System.currentTimeMillis(); + + Outlink [] links=new Outlink[30]; + + for(int i=0;i * * -------------------------- */ - + + public static PlainMetadata convert(Metadata s){ + String[] names=s.names(); + PlainMetadata dest=new PlainMetadata(); + for(int i=0;i 1; + } + + /** + * Returns an array of the names contained in the metadata. + */ + public String[] names() { + return (String[]) metadata.keySet().toArray(new String[metadata.size()]); + } + + /** + * Get the value associated to a metadata name. If many values are assiociated + * to the specified name, then the first one is returned. + * + * @param name + * of the metadata. + * @return the value associated to the specified metadata name. + */ + public String get(String name) { + Object values = metadata.get(name); + if ((values != null) && (values instanceof List)) { + return (String) ((List) values).get(0); + } else { + return (String) values; + } + } + + /** + * Get the values associated to a metadata name. + * + * @param name + * of the metadata. + * @return the values associated to a metadata name. + */ + public String[] getValues(String name) { + Object values = metadata.get(name); + if (values != null) { + if (values instanceof List) { + List list = (List) values; + return (String[]) list.toArray(new String[list.size()]); + } else { + return new String[] { (String) values }; + } + } + return new String[0]; + } + + /** + * Add a metadata name/value mapping. 
Add the specified value to the list of
+   * values associated to the specified metadata name.
+   *
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void add(String name, String value) {
+    Object values = metadata.get(name);
+    if (values != null) {
+      if (values instanceof String) {
+        // second value for this name: promote the single String to a List
+        List list = new ArrayList();
+        list.add(values);
+        list.add(value);
+        metadata.put(name, list);
+      } else if (values instanceof List) {
+        ((List) values).add(value);
+      }
+    } else {
+      metadata.put(name, value);
+    }
+  }
+
+  /**
+   * Set every property of the specified Properties as a single-valued
+   * metadata, replacing any values previously associated to the same names.
+   *
+   * @param properties
+   *          the name/value pairs to copy.
+   */
+  public void setAll(Properties properties) {
+    Enumeration names = properties.propertyNames();
+    while (names.hasMoreElements()) {
+      String name = (String) names.nextElement();
+      set(name, properties.getProperty(name));
+    }
+  }
+
+  /**
+   * Set metadata name/value. Associate the specified value to the specified
+   * metadata name. If some previous values were associated to this name, they
+   * are removed.
+   *
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
+   */
+  public void set(String name, String value) {
+    remove(name);
+    add(name, value);
+  }
+
+  /**
+   * Remove a metadata and all its associated values.
+   */
+  public void remove(String name) {
+    metadata.remove(name);
+  }
+
+  /**
+   * Returns the number of metadata names in this metadata.
+   */
+  public int size() {
+    return metadata.size();
+  }
+
+  /**
+   * Two instances are equal when they hold the same number of names and, for
+   * every name of this instance, the same values in the same order.
+   */
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    // instanceof also rejects null; no exception needed for the type test
+    if (!(o instanceof PlainMetadata)) {
+      return false;
+    }
+    PlainMetadata other = (PlainMetadata) o;
+
+    if (other.size() != size()) {
+      return false;
+    }
+
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      String[] otherValues = other.getValues(names[i]);
+      String[] thisValues = getValues(names[i]);
+      if (otherValues.length != thisValues.length) {
+        return false;
+      }
+      for (int j = 0; j < otherValues.length; j++) {
+        String otherValue = otherValues[j];
+        String thisValue = thisValues[j];
+        // add() accepts null values, so compare null-safely
+        if (otherValue == null ? thisValue != null
+            : !otherValue.equals(thisValue)) {
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Hash code built from the same name/value pairs that
+   * {@link #equals(Object)} compares: independent of the order of the names,
+   * dependent on the order of the values of each name.
+   */
+  public int hashCode() {
+    int hash = 0;
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      String[] values = getValues(names[i]);
+      if (values.length == 0) {
+        // a name without values is indistinguishable from a missing name
+        // in equals(), so it must not contribute here either
+        continue;
+      }
+      int valuesHash = 1;
+      for (int j = 0; j < values.length; j++) {
+        valuesHash = 31 * valuesHash
+            + (values[j] == null ? 0 : values[j].hashCode());
+      }
+      hash += (names[i] == null ? 0 : names[i].hashCode()) ^ valuesHash;
+    }
+    return hash;
+  }
+
+  /**
+   * Returns every name/value pair as <code>name=value</code>, each followed
+   * by a space.
+   */
+  public String toString() {
+    StringBuffer buf = new StringBuffer();
+    String[] names = names();
+    for (int i = 0; i < names.length; i++) {
+      String[] values = getValues(names[i]);
+      for (int j = 0; j < values.length; j++) {
+        buf.append(names[i]).append("=").append(values[j]).append(" ");
+      }
+    }
+    return buf.toString();
+  }
+
+  /**
+   * Writes the array of names, then one array of values per name, in the
+   * order of the names array.
+   */
+  public final void write(DataOutput out) throws IOException {
+    String[] values = null;
+    String[] names = names();
+    WritableUtils.writeStringArray(out, names);
+    for (int i = 0; i < names.length; i++) {
+      values = getValues(names[i]);
+      WritableUtils.writeStringArray(out, values);
+    }
+  }
+
+  /**
+   * Replaces the content of this instance with the name/value pairs read
+   * from the stream, in the layout produced by {@link #write(DataOutput)}.
+   */
+  public final void readFields(DataInput in) throws IOException {
+    // Writable instances get reused: without this, values left over from the
+    // previous record would be merged into the one being read
+    metadata.clear();
+    String keys[] = WritableUtils.readStringArray(in);
+    for (int i = 0; i < keys.length; i++) {
+      String values[] = WritableUtils.readStringArray(in);
+      for (int j = 0; j < values.length; j++) {
+        add(keys[i], values[j]);
+      }
+    }
+  }
+}
Index: src/java/org/apache/nutch/metadata/MetadataSpellChecker.java
===================================================================
--- src/java/org/apache/nutch/metadata/MetadataSpellChecker.java (revision 0)
+++ src/java/org/apache/nutch/metadata/MetadataSpellChecker.java (revision 0)
@@ -0,0 +1,112 @@
+package org.apache.nutch.metadata;
+
+import
java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; + +/** + * This decorator brings spell checking capabilities to metadata, if + * none is required use {Metadata} instead + * + * All the static String fields declared by this class are used as reference + * names for syntax correction on meta-data naming. + */ +public class MetadataSpellChecker extends PlainMetadata implements CreativeCommons, + DublinCore, HttpHeaders, Nutch, Office { + + private final static Map NAMES_IDX = new HashMap(); + + private static String[] normalized = null; + + // Uses self introspection to fill the metanames index and the + // metanames list. + static { + Field[] fields = PlainMetadata.class.getFields(); + for (int i=0; i + *
<li>content-type gives Content-Type</li>
+   * <li>CoNtEntType gives Content-Type</li>
+   * <li>ConTnTtYpe gives Content-Type</li>
  • + * + * If no matching with a well-known metadata name is found, then the original + * name is returned. + */ + public static String getNormalizedName(String name) { + String searched = normalize(name); + String value = (String) NAMES_IDX.get(searched); + + if ((value == null) && (normalized != null)) { + int threshold = searched.length() / 3; + for (int i=0; i VERSION) throw new VersionMismatchException(VERSION, version); - url = UTF8.readString(in); // read url - base = UTF8.readString(in); // read base + if (version == 2) { + //handle version 2 format + String sData[]=WritableUtils.readStringArray(in); + url = sData[0]; + base = sData[1]; + contentType = sData[2]; - content = new byte[in.readInt()]; // read content - in.readFully(content); + metadata = new PlainMetadata(); + metadata.readFields(in); - contentType = UTF8.readString(in); // read contentType + content = new byte[in.readInt()]; // read content + in.readFully(content); + } else if (version == 1) { + //handle verion 1 format - metadata = new Metadata(); - metadata.readFields(in); // read meta data + url = UTF8.readString(in); // read url + base = UTF8.readString(in); // read base + + content = new byte[in.readInt()]; // read content + in.readFully(content); + + contentType = UTF8.readString(in); // read contentType + + Metadata metadata = new Metadata(); + metadata.readFields(in); // read meta data + this.metadata = Metadata.convert(metadata); + } } protected final void writeCompressed(DataOutput out) throws IOException { out.writeByte(version); - UTF8.writeString(out, url); // write url - UTF8.writeString(out, base); // write base - - out.writeInt(content.length); // write content + //collect all strings to array + String sData[]=new String[]{ + url,base,contentType + }; + + WritableUtils.writeStringArray(out,sData); //write Strings + + metadata.write(out); // write metadata + + out.writeInt(content.length); // write content out.write(content); - - UTF8.writeString(out, contentType); // write 
contentType - - metadata.write(out); // write metadata } public static Content read(DataInput in) throws IOException { @@ -99,18 +157,15 @@ return content; } - // - // Accessor methods - // - /** The url fetched. */ public String getUrl() { ensureInflated(); return url; } - /** The base url for relative links contained in the content. - * Maybe be different from url if the request redirected. + /** + * The base url for relative links contained in the content. Maybe be + * different from url if the request redirected. */ public String getBaseUrl() { ensureInflated(); @@ -122,12 +177,15 @@ ensureInflated(); return content; } + public void setContent(byte[] content) { ensureInflated(); this.content = content; } - /** The media type of the retrieved content. + /** + * The media type of the retrieved content. + * * @see * http://www.iana.org/assignments/media-types/ */ @@ -135,48 +193,47 @@ ensureInflated(); return contentType; } + public void setContentType(String contentType) { ensureInflated(); this.contentType = contentType; } /** Other protocol-specific data. */ - public Metadata getMetadata() { + public PlainMetadata getMetadata() { ensureInflated(); return metadata; } /** Other protocol-specific data. 
*/ - public void setMetadata(Metadata metadata) { + public void setMetadata(PlainMetadata metadata) { ensureInflated(); this.metadata = metadata; } public boolean equals(Object o) { ensureInflated(); - if (!(o instanceof Content)){ + if (!(o instanceof Content)) { return false; } - Content that = (Content)o; + Content that = (Content) o; that.ensureInflated(); - return - this.url.equals(that.url) && - this.base.equals(that.base) && - Arrays.equals(this.getContent(), that.getContent()) && - this.contentType.equals(that.contentType) && - this.metadata.equals(that.metadata); + return this.url.equals(that.url) && this.base.equals(that.base) + && Arrays.equals(this.getContent(), that.getContent()) + && this.contentType.equals(that.contentType) + && this.metadata.equals(that.metadata); } public String toString() { ensureInflated(); StringBuffer buffer = new StringBuffer(); - buffer.append("url: " + url + "\n" ); - buffer.append("base: " + base + "\n" ); - buffer.append("contentType: " + contentType + "\n" ); - buffer.append("metadata: " + metadata + "\n" ); + buffer.append("url: " + url + "\n"); + buffer.append("base: " + base + "\n"); + buffer.append("contentType: " + contentType + "\n"); + buffer.append("metadata: " + metadata + "\n"); buffer.append("Content:\n"); - buffer.append(new String(content)); // try default encoding + buffer.append(new String(content)); // try default encoding return buffer.toString(); @@ -185,7 +242,7 @@ public static void main(String argv[]) throws Exception { String usage = "Content (-local | -dfs ) recno segment"; - + if (argv.length < 3) { System.out.println("usage:" + usage); return; @@ -199,7 +256,8 @@ Path file = new Path(segment, DIR_NAME); System.out.println("Reading from file: " + file); - ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf); + ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), + conf); Content content = new Content(); contents.get(recno, content); @@ -216,10 +274,10 @@ 
private String getContentType(String typeName, String url, byte[] data) { MimeType type = null; try { - typeName = MimeType.clean(typeName); - type = typeName == null ? null : this.mimeTypes.forName(typeName); + typeName = MimeType.clean(typeName); + type = typeName == null ? null : this.mimeTypes.forName(typeName); } catch (MimeTypeException mte) { - // Seems to be a malformed mime type name... + // Seems to be a malformed mime type name... } if (typeName == null || type == null || !type.matches(url)) { @@ -229,8 +287,8 @@ type = this.mimeTypes.getMimeType(url); typeName = type == null ? typeName : type.getName(); } - if (typeName == null || type == null || - (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) { + if (typeName == null || type == null + || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) { // If no mime-type already found, or the one found doesn't match // the magic bytes it should be, then, guess a mime-type from the // document content (magic bytes) Index: src/java/org/apache/nutch/segment/SegmentMerger.java =================================================================== --- src/java/org/apache/nutch/segment/SegmentMerger.java (revision 467333) +++ src/java/org/apache/nutch/segment/SegmentMerger.java (working copy) @@ -32,7 +32,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.Generator; import org.apache.nutch.fetcher.Fetcher; -import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.PlainMetadata; import org.apache.nutch.net.URLFilters; import org.apache.nutch.parse.ParseData; import org.apache.nutch.parse.ParseText; @@ -144,12 +144,12 @@ ((CrawlDatum)o).getMetaData().put(SEGMENT_NAME_KEY, new UTF8(segment)); } else if (o instanceof Content) { if (((Content)o).getMetadata() == null) { - ((Content)o).setMetadata(new Metadata()); + ((Content)o).setMetadata(new PlainMetadata()); } ((Content)o).getMetadata().set(SEGMENT_NAME_KEY.toString(), segment); } else if (o 
instanceof ParseData) {
         if (((ParseData)o).getParseMeta() == null) {
-          ((ParseData)o).setParseMeta(new Metadata());
+          ((ParseData)o).setParseMeta(new PlainMetadata());
         }
         ((ParseData)o).getParseMeta().set(SEGMENT_NAME_KEY.toString(), segment);
       } else if (o instanceof ParseText) {
Index: src/java/org/apache/nutch/crawl/CrawlDbMerger.java
===================================================================
--- src/java/org/apache/nutch/crawl/CrawlDbMerger.java (revision 467333)
+++ src/java/org/apache/nutch/crawl/CrawlDbMerger.java (working copy)
@@ -28,6 +28,7 @@
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.*;
 import org.apache.nutch.net.URLFilters;
@@ -51,30 +52,134 @@
  */
 public class CrawlDbMerger extends Configured {
   private static final Log LOG = LogFactory.getLog(CrawlDbMerger.class);
+
+
+  /**
+   * Watchdog that interrupts a registered thread when it has not reported
+   * progress for longer than the configured timeout (in seconds).
+   */
+  public static class TimeoutWatchDog implements Runnable{
 
-  public static class Merger extends MapReduceBase implements Reducer {
+    // written by the watched thread, read by the watchdog thread
+    volatile Thread toNotify=null;
+    int timeout;
+    volatile long lastPing=0;
+
+    public TimeoutWatchDog(int timeoutInSeconds){
+      this.timeout=timeoutInSeconds;
+    }
+
+    /** Starts watching the calling thread and records the current time. */
+    public void on(){
+      if(toNotify==null){
+        toNotify=Thread.currentThread();
+      }
+      lastPing=System.currentTimeMillis();
+    }
+
+    /** Stops watching. */
+    public void off(){
+      toNotify=null;
+    }
+
+    /**
+     * Polls once per second until this thread is interrupted. The timeout
+     * check has to run after every normal wake-up; running it only when the
+     * sleep is interrupted means it never runs at all.
+     */
+    public void run(){
+      while(!Thread.currentThread().isInterrupted()){
+        try{
+          Thread.sleep(1000);
+        } catch (InterruptedException e){
+          // asked to stop: keep the interrupt status and leave
+          Thread.currentThread().interrupt();
+          return;
+        }
+        Thread target=toNotify;
+        if(target!=null && System.currentTimeMillis()-lastPing>1000L*timeout){
+          LOG.info("seems like thread is in trouble!");
+          // one interrupt per on(); the next on() re-arms the watchdog
+          toNotify=null;
+          target.interrupt();
+        }
+      }
+    }
+  }
+
+
+  public static class MyMapper extends org.apache.hadoop.mapred.lib.IdentityMapper {
+
     private URLFilters filters = null;
-    MapWritable meta = new MapWritable();
+    int index=0;
+    private Thread watchDogThread;
+    private TimeoutWatchDog watchDog;
 
-    public void close() throws IOException {}
-
-    public void configure(JobConf conf) {
-      if (conf.getBoolean("crawldb.merger.urlfilters", false))
+    public void configure(JobConf conf) {
+      watchDog=new TimeoutWatchDog(10);
+      watchDogThread=new Thread(watchDog);
+      // daemon: the polling loop must not keep the JVM alive
+      watchDogThread.setDaemon(true);
+      watchDogThread.start();
+
+      super.configure(conf);
+      if (conf.getBoolean("crawldb.merger.urlfilters", false)) {
+        LOG.info("Filtering urls");
         filters = new URLFilters(conf);
+      }
     }
 
-    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
-            throws IOException {
+    /** Stops the watchdog thread started in configure(). */
+    public void close() throws IOException {
+      if (watchDogThread != null) {
+        watchDogThread.interrupt();
+      }
+      super.close();
+    }
+
+    public void map(WritableComparable key, Writable val, OutputCollector output, Reporter reporter) throws IOException {
+      if(index++%10000==0){
+        if(LOG.isInfoEnabled()){
+          LOG.info("Processing entry #" + index);
+        }
+      }
       if (filters != null) {
         try {
-          if (filters.filter(((UTF8) key).toString()) == null)
+          watchDog.on();
+          if (filters.filter(((UTF8) key).toString()) == null) {
+            if(LOG.isDebugEnabled()) {
+              LOG.debug("Filtering out: " + key.toString());
+            }
             return;
+          }
         } catch (Exception e) {
           if (LOG.isDebugEnabled()) {
             LOG.debug("Can't filter " + key + ": " + e);
           }
+        } finally {
+          // disarm on every exit path, including the early return above
+          watchDog.off();
         }
       }
+      // forward the record whether or not url filtering is enabled
+      super.map(key, val, output, reporter);
+    }
+  }
+
+  public static class Merger extends MapReduceBase implements Reducer {
+    MapWritable meta = new MapWritable();
+    int index=0;
+
+    public void close() throws IOException {}
+
+    public void reduce(WritableComparable key, Iterator values, OutputCollector output, Reporter reporter)
+            throws IOException {
+      if(index++%10000==0){
+        if(LOG.isInfoEnabled()){
+          LOG.info("Processing entry #" + index);
+        }
+      }
+
       CrawlDatum res = null;
       long resTime = 0L;
       meta.clear();
@@ -130,6 +235,7 @@
     job.setInputKeyClass(UTF8.class);
     job.setInputValueClass(CrawlDatum.class);
 
+    job.setMapperClass(MyMapper.class);
     job.setReducerClass(Merger.class);
 
     job.setOutputPath(newCrawlDb);
@@ -165,4 +271,5 @@
     CrawlDbMerger merger = new CrawlDbMerger(conf);
     merger.merge(output, (Path[]) dbs.toArray(new Path[dbs.size()]), filter);
   }
+
 }
Index: src/java/org/apache/nutch/parse/ParseOutputFormat.java
===================================================================
--- src/java/org/apache/nutch/parse/ParseOutputFormat.java (revision 467333)
+++ src/java/org/apache/nutch/parse/ParseOutputFormat.java (working copy)
@@ -153,9 +153,11 @@
             }
             continue;
           }
           crawlOut.append(targetUrl, target);
-          if (adjust != null) crawlOut.append(key, adjust);
+          if (adjust != null) {
+            crawlOut.append(key, adjust);
+          }
         }
         dataOut.append(key, parseData);
       }
 
Index: 
src/java/org/apache/nutch/parse/ParseData.java =================================================================== --- src/java/org/apache/nutch/parse/ParseData.java (revision 467333) +++ src/java/org/apache/nutch/parse/ParseData.java (working copy) @@ -25,76 +25,111 @@ import org.apache.hadoop.conf.Configurable; import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.PlainMetadata; import org.apache.nutch.util.NutchConfiguration; - -/** Data extracted from a page's content. +/** + * Data extracted from a page's content. + * * @see Parse#getData() */ public final class ParseData extends VersionedWritable implements Configurable { public static final String DIR_NAME = "parse_data"; - private final static byte VERSION = 4; + private final static byte VERSION = 5; + private int maxOutlinks = Integer.MAX_VALUE; private String title; + private Outlink[] outlinks; - private Metadata contentMeta; - private Metadata parseMeta; + + private PlainMetadata contentMeta; + + private PlainMetadata parseMeta; + private ParseStatus status; + private Configuration conf; - - // TODO mb@media-style.com: should we really implement Configurable or should we add the - // parameter Configuration to the default-constructor. NOTE: The test - // TestWriteable instantiates ParseData with Class.newInstance() -> the default - // constructor is called -> conf is null. The programmer which use this object may not forget to set the conf. 
- public ParseData() {} + public ParseData() { + } + + /** + * @deprecated use {@link #ParseData(ParseStatus, String, Outlink[], PlainMetadata)} + * instead + */ public ParseData(ParseStatus status, String title, Outlink[] outlinks, - Metadata contentMeta) { + Metadata contentMeta) { this(status, title, outlinks, contentMeta, new Metadata()); } + + public ParseData(ParseStatus status, String title, Outlink[] outlinks, + PlainMetadata contentMeta) { + this(status, title, outlinks, contentMeta, new PlainMetadata()); + } + + /** + * @deprecated use {@link #ParseData(ParseStatus, String, Outlink[], PlainMetadata, PlainMetadata)} + * instead + */ + public ParseData(ParseStatus status, String title, Outlink[] outlinks, + Metadata contentMeta, Metadata parseMeta) { + this.status = status; + this.title = title; + this.outlinks = outlinks; + //convert metadata + this.contentMeta = Metadata.convert(contentMeta); + this.parseMeta = Metadata.convert(parseMeta); + } public ParseData(ParseStatus status, String title, Outlink[] outlinks, - Metadata contentMeta, Metadata parseMeta) { + PlainMetadata contentMeta, PlainMetadata parseMeta) { this.status = status; this.title = title; this.outlinks = outlinks; + //convert metadata this.contentMeta = contentMeta; this.parseMeta = parseMeta; } - // - // Accessor methods - // + /** The status of parsing the page. */ + public ParseStatus getStatus() { + return status; + } - /** The status of parsing the page. */ - public ParseStatus getStatus() { return status; } - /** The title of the page. */ - public String getTitle() { return title; } + public String getTitle() { + return title; + } /** The outlinks of the page. */ - public Outlink[] getOutlinks() { return outlinks; } + public Outlink[] getOutlinks() { + return outlinks; + } /** The original Metadata retrieved from content */ - public Metadata getContentMeta() { return contentMeta; } + public PlainMetadata getContentMeta() { + return contentMeta; + } /** - * Other content properties. 
- * This is the place to find format-specific properties. - * Different parser implementations for different content types will populate - * this differently. + * Other content properties. This is the place to find format-specific + * properties. Different parser implementations for different content types + * will populate this differently. */ - public Metadata getParseMeta() { return parseMeta; } - - public void setParseMeta(Metadata parseMeta) { + public PlainMetadata getParseMeta() { + return parseMeta; + } + + public void setParseMeta(PlainMetadata parseMeta) { this.parseMeta = parseMeta; } - + /** - * Get a metadata single value. - * This method first looks for the metadata value in the parse metadata. If no - * value is found it the looks for the metadata in the content metadata. + * Get a metadata single value. This method first looks for the metadata value + * in the parse metadata. If no value is found it the looks for the metadata + * in the content metadata. + * * @see #getContentMeta() * @see #getParseMeta() */ @@ -105,62 +140,92 @@ } return value; } - - // - // Writable methods - // - public byte getVersion() { return VERSION; } + public byte getVersion() { + return VERSION; + } public final void readFields(DataInput in) throws IOException { byte version = in.readByte(); - if (version > 1) + + if (version == 5) { + //handle version 5 here status = ParseStatus.read(in); - else - status = ParseStatus.STATUS_SUCCESS; - title = UTF8.readString(in); // read title + title = UTF8.readString(in); // read title + parseMeta=new PlainMetadata(); + + String[] anchors=WritableUtils.readStringArray(in); + String[] urls=WritableUtils.readStringArray(in); + + outlinks=new Outlink[anchors.length]; + for(int i=0;i= 0) { - outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks); - } - outlinks = new Outlink[outlinksToRead]; - for (int i = 0; i < outlinksToRead; i++) { - outlinks[i] = Outlink.read(in); - } - for (int i = outlinksToRead; i < totalOutlinks; i++) { - 
Outlink.skip(in); - } - - if (version < 3) { - int propertyCount = in.readInt(); // read metadata - contentMeta = new Metadata(); - for (int i = 0; i < propertyCount; i++) { - contentMeta.add(UTF8.readString(in), UTF8.readString(in)); - } - } else { - contentMeta = new Metadata(); + contentMeta = new PlainMetadata(); contentMeta.readFields(in); - } - if (version > 3) { - parseMeta = new Metadata(); + parseMeta = new PlainMetadata(); parseMeta.readFields(in); + } else { + //handle other versions here + if (version > 1) + status = ParseStatus.read(in); + else + status = ParseStatus.STATUS_SUCCESS; + title = UTF8.readString(in); // read title + + int totalOutlinks = in.readInt(); // read outlinks + int maxOutlinksPerPage = this.conf + .getInt("db.max.outlinks.per.page", 100); + int outlinksToRead = totalOutlinks; + if (maxOutlinksPerPage >= 0) { + outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks); + } + outlinks = new Outlink[outlinksToRead]; + for (int i = 0; i < outlinksToRead; i++) { + outlinks[i] = Outlink.read(in); + } + for (int i = outlinksToRead; i < totalOutlinks; i++) { + Outlink.skip(in); + } + + if (version < 3) { + int propertyCount = in.readInt(); // read metadata + contentMeta = new PlainMetadata(); + for (int i = 0; i < propertyCount; i++) { + contentMeta.add(UTF8.readString(in), UTF8.readString(in)); + } + } else { + contentMeta = new PlainMetadata(); + contentMeta.readFields(in); + } + if (version > 3) { + parseMeta = new PlainMetadata(); + parseMeta.readFields(in); + } } } public final void write(DataOutput out) throws IOException { - out.writeByte(VERSION); // write version - status.write(out); // write status - UTF8.writeString(out, title); // write title + out.writeByte(VERSION); // write version + status.write(out); // write status + UTF8.writeString(out, title); // write title - out.writeInt(outlinks.length); // write outlinks - for (int i = 0; i < outlinks.length; i++) { - outlinks[i].write(out); + String[] anchors=new 
String[outlinks.length]; + String[] urls=new String[outlinks.length]; + + for(int i=0;i