Files ../protocol-smb/plugins/protocol-smb/jcifs-1.2.13.jar and ./plugins/protocol-smb/jcifs-1.2.13.jar differ Files ../protocol-smb/plugins/protocol-smb/jcifs-1.3.0.jar and ./plugins/protocol-smb/jcifs-1.3.0.jar differ diff -urN ../protocol-smb/plugins/protocol-smb/plugin.xml ./plugins/protocol-smb/plugin.xml --- ../protocol-smb/plugins/protocol-smb/plugin.xml 2007-05-25 22:31:38.000000000 -0400 +++ ./plugins/protocol-smb/plugin.xml 2008-11-07 14:28:38.184000000 -0500 @@ -13,7 +13,7 @@ - + @@ -23,4 +23,4 @@ - \ No newline at end of file + Files ../protocol-smb/plugins/protocol-smb/protocol-smb.jar and ./plugins/protocol-smb/protocol-smb.jar differ Files ../protocol-smb/src/plugin/protocol-smb/lib/jcifs-1.2.13.jar and ./src/plugin/protocol-smb/lib/jcifs-1.2.13.jar differ Files ../protocol-smb/src/plugin/protocol-smb/lib/jcifs-1.3.0.jar and ./src/plugin/protocol-smb/lib/jcifs-1.3.0.jar differ diff -urN ../protocol-smb/src/plugin/protocol-smb/plugin.xml ./src/plugin/protocol-smb/plugin.xml --- ../protocol-smb/src/plugin/protocol-smb/plugin.xml 2007-05-25 22:31:38.000000000 -0400 +++ ./src/plugin/protocol-smb/plugin.xml 2008-11-07 14:28:38.184388000 -0500 @@ -13,7 +13,7 @@ - + @@ -23,4 +23,4 @@ - \ No newline at end of file + diff -urN ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java --- ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java 2007-05-25 22:28:20.000000000 -0400 +++ ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java 2008-11-07 23:35:29.843225000 -0500 @@ -51,9 +51,9 @@ */ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) { String urlString = url.toString(); - System.out.println("SMB Protocol: Now using the SMB protocol"); - System.out.println("SMB Protocol will interrogate the following URL: "+urlString); - try{ + // System.out.println("SMB Protocol: Now using the SMB protocol"); + // System.out.println("SMB Protocol will interrogate the following URL: "+urlString); + try { URL u = new URL(urlString); int redirects = 0; if(!setJCifsProp()) @@ -64,21 +64,22 @@ response = new SMBResponse(u, datum, this, getConf()); // make a request int code = response.getCode(); - if(code == 200){ + if (code == 200){ return new ProtocolOutput(response.toContent()); } else if (code >= 300 && code < 400){ if(redirects == MAX_REDIRECTS) throw new SMBException("too many redirects: "+url); u = new URL(response.getHeader("Location")); redirects++; - if(LOG.isTraceEnabled()) + if (LOG.isTraceEnabled()) { LOG.trace("redirect to "+u); - else - throw new SMBError(code); + } + // continue the while loop to retry + } else { + throw new SMBError(code); } } - }catch (Exception ex) - { + } catch (Exception ex) { return new ProtocolOutput(null, new ProtocolStatus(ex)); } } diff -urN ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java --- ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java 2007-01-05 10:34:38.000000000 -0500 +++ ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java 2008-11-07 23:36:57.307444000 -0500 @@ -9,12 +9,21 @@ import java.io.File; import java.io.IOException; +import java.io.UnsupportedEncodingException; import java.net.URL; +import java.util.BitSet; + import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; + +import org.apache.commons.codec.net.URLCodec; +import org.apache.commons.codec.DecoderException; +import org.apache.commons.lang.StringEscapeUtils; + import org.apache.hadoop.conf.Configuration; import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.metadata.Metadata; @@ -33,24 +42,50 @@ private int code; private Metadata headers = new Metadata(); - private final SMB file; + private final SMB protocol; private Configuration conf; + + private static final URLCodec urlCodec = new URLCodec(); + private static final BitSet safeUrlChars = new BitSet(256); + + static { + // alpha characters + for (int i = 'a'; i <= 'z'; i++) { + safeUrlChars.set(i); + } + for (int i = 'A'; i <= 'Z'; i++) { + safeUrlChars.set(i); + } + // numeric characters + for (int i = '0'; i <= '9'; i++) { + safeUrlChars.set(i); + } + // special chars + safeUrlChars.set('-'); + safeUrlChars.set('_'); + safeUrlChars.set('.'); + safeUrlChars.set('*'); + } + public static final Log LOG = LogFactory.getLog(SMB.class); /** Creates a new instance of SMBResponse * @param the url, the crawlDatum, the file, configuration for Nutch * @throws SMBException, IOException */ - public SMBResponse(URL url, CrawlDatum datum, SMB file, Configuration conf) - throws SMBException, IOException { - this.orig = url.toString(); - this.base = url.toString(); - this.file = file; + public SMBResponse(URL url, CrawlDatum datum, SMB protocol, Configuration conf) + throws DecoderException, SMBException, IOException, UnsupportedEncodingException { + String urlString = url.toString(); + String decodedUrlString = new String(URLCodec.decodeUrl(urlString.getBytes("UTF-8")), "UTF-8"); + url = new URL(decodedUrlString); + this.orig = decodedUrlString; + this.base = decodedUrlString; + this.protocol = protocol; this.conf = conf; // check if protocol is a window file share protocol if(!"smb".equals(url.getProtocol())) - throw new SMBException("Not an SMB url: "+url); + throw new SMBException("Not an SMB url: " + url); try { this.content = null; @@ -135,9 +170,15 @@ * http response code with a value of 200 * @param file - the SmbFile to retrieve content */ - private void getDirAsHttpResponse(SmbFile f) throws SmbException { + private void getDirAsHttpResponse(SmbFile f) throws SmbException, UnsupportedEncodingException { String path = f.getName(); - this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true); + SmbFile[] fileList = {}; + try { + fileList = f.listFiles(); + } catch (SmbException e) { + // leave fileList empty + } + this.content = list2html(fileList, path, false); // set headers headers.set(Response.CONTENT_TYPE, "text/html"); @@ -155,24 +196,24 @@ */ private void getFileAsHttpResponse(SmbFile f) throws SmbException, SMBException, IOException { long size = f.length(); - if(size > Integer.MAX_VALUE) + if (size > Integer.MAX_VALUE) throw new SMBException("file is too large, size: "+size); // capture content int len = (int)size; - byte[] bytes = new byte[len]; - java.io.InputStream is = new SmbFileInputStream(f); + this.content = new byte[len]; + java.io.InputStream is = f.getInputStream(); int offset = 0; int n = 0; - while(offset < size && (n = is.read(this.content, offset, len-offset)) >= 0){ + while (offset < size && (n = is.read(this.content, offset, len-offset)) >= 0){ offset += n; + LOG.debug("Read " + n + " bytes from " + f.getName()); } - if(offset < len){ + if (offset < len){ if(LOG.isWarnEnabled()) LOG.warn("not enough bytes read from file: "+f.getCanonicalPath()); } - is.close(); // set headers @@ -189,10 +230,12 @@ * @param the smbFile, path, boolean value to include dotdot in the file path * @return html content as byte */ - private byte[] list2html(SmbFile[] smbFile, String path, boolean includeDotDot) throws SmbException { + private byte[] list2html(SmbFile[] smbFile, String path, boolean includeDotDot) + throws SmbException, UnsupportedEncodingException { + String htmlPath = StringEscapeUtils.escapeHtml(path); StringBuffer x = new StringBuffer(""); - x.append("Index of "+path+"\n"); - x.append("

Index of "+path+"

"); + x.append("Index of " + htmlPath + "\n"); + x.append("

Index of " + htmlPath + "

");
         
         if(includeDotDot){
             x.append("../\t-\t-\n");
@@ -201,12 +244,26 @@
         SmbFile f;
         for(int i=0; i"+name+"/\t");
+            if (f.isDirectory() || f.isFile()) {
+                x.append("" + htmlName + "\t");
                 x.append(time+"\t-\n");
-            } else{} // ignore any other
+            } else {
+                // ignore any other
+            }
         }
         x.append("
\n"); return new String(x).getBytes(); diff -urN ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/test/SMBTest.java ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/test/SMBTest.java --- ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/test/SMBTest.java 2007-01-05 15:07:44.000000000 -0500 +++ ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/test/SMBTest.java 2008-11-07 13:57:25.937700000 -0500 @@ -13,6 +13,10 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Properties; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import jcifs.smb.SmbException; import jcifs.smb.SmbFile; @@ -26,16 +30,22 @@ public SMBTest() { } + public static String USAGE = "Usage: SMBTest smb.properties \"smb://\" SERVER \"/\" SHARE \"/\" (FILE OUTPUT_FILE|DIR)"; + /** * @param args the command line arguments */ public static void main(String[] args) throws MalformedURLException, SmbException, FileNotFoundException, IOException { + if (args.length < 2) { + System.out.println(USAGE); + System.exit(1); + } Properties prop = new Properties(); - prop.load(new FileInputStream("smb.properties")); + prop.load(new FileInputStream(args[0])); jcifs.Config.setProperties(prop); // URL url = new URL("smb://xxxxxxxx"); - SmbFile file = new SmbFile("smb://xxxxxxx"); + SmbFile file = new SmbFile(args[1]); System.out.println("Does the file exist on the share: "+file.exists()); // System.out.println("here is the protocol used: "+url.getProtocol()); // System.out.println("here is the host used: "+url.getHost()); @@ -46,7 +56,26 @@ System.out.println(file.getName()); } }else if(file.isFile()){ - System.out.println("Here is the name of the file: "+file.getName()); + if (args.length != 3) { + System.err.println(USAGE); + System.exit(1); + } + System.out.println("Name " + file.getName() + ", length " + file.getContentLength()); + System.out.println("Saving to " + args[2] + "..."); + java.io.InputStream is = file.getInputStream(); + java.io.FileOutputStream fos = new java.io.FileOutputStream(args[2]); + int numread; + byte b[] = new byte[4096]; + do { + numread = is.read(b); + if (numread < 0) { + break; + } + fos.write(b, 0, numread); + } while(true); + is.close(); + fos.close(); + System.out.println("Done."); } } diff -urN ../protocol-smb/src/plugin/protocol-smb/src/log4j.properties ./src/plugin/protocol-smb/src/log4j.properties --- ../protocol-smb/src/plugin/protocol-smb/src/log4j.properties 1969-12-31 19:00:00.000000000 -0500 +++ ./src/plugin/protocol-smb/src/log4j.properties 2008-11-07 13:53:14.549708000 -0500 @@ -0,0 +1,33 @@ +# RootLogger - DailyRollingFileAppender +# log4j.rootLogger=DEBUG,DRFA +log4j.rootLogger=DEBUG,stdout + +# Logging Threshold +log4j.threshhold=ALL + +# +# stdout +# Add *stdout* to rootlogger above if you want to use this +# + +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + +# +# plain layout used for commandline tools to output to console +# +log4j.appender.cmdstdout=org.apache.log4j.ConsoleAppender +log4j.appender.cmdstdout.layout=org.apache.log4j.PatternLayout +log4j.appender.cmdstdout.layout.ConversionPattern=%m%n + + +# Rolling file +log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender +log4j.appender.DRFA.File=log.txt +log4j.appender.DRFA.DatePattern=.yyyy-MM-dd +log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout +log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n + + + diff -urN ../protocol-smb/src/plugin/protocol-smb/src/test.sh ./src/plugin/protocol-smb/src/test.sh --- ../protocol-smb/src/plugin/protocol-smb/src/test.sh 1969-12-31 19:00:00.000000000 -0500 +++ ./src/plugin/protocol-smb/src/test.sh 2008-11-07 20:32:08.480629000 -0500 @@ -0,0 +1,10 @@ +#! /bin/sh -ex +export PATH=$PATH:/c/Program\ Files/Java/jdk1.6.0_07/bin +libdir="../../../../lib/" +CP="../lib/jcifs-1.3.0.jar;${libdir}/log4j-1.2.15.jar;${libdir}/commons-logging-1.0.4.jar;${libdir}/commons-logging-api-1.0.4.jar;java;." +DEFS=("-Dorg.apache.commons.logging.Log=org.apache.commons.logging.impl.Log4JCategoryLog" "-Dlog4j.configuration=log4j.properties") +# DEFS=() +MAIN="org/apache/nutch/protocol/smb/test/SMBTest" +javac -cp "$CP" "java/${MAIN}.java" +java -cp "$CP" "${DEFS[@]}" "$MAIN" "${libdir}../conf/smb.properties" "smb://SERVER/SHARE/DIR/FILE" OUTPUT_FILE +