@@ -23,4 +23,4 @@
-
\ No newline at end of file
+
diff -urN ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java
--- ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java 2007-05-25 22:28:20.000000000 -0400
+++ ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMB.java 2008-11-07 23:35:29.843225000 -0500
@@ -51,9 +51,9 @@
*/
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
- System.out.println("SMB Protocol: Now using the SMB protocol");
- System.out.println("SMB Protocol will interrogate the following URL: "+urlString);
- try{
+ // System.out.println("SMB Protocol: Now using the SMB protocol");
+ // System.out.println("SMB Protocol will interrogate the following URL: "+urlString);
+ try {
URL u = new URL(urlString);
int redirects = 0;
if(!setJCifsProp())
@@ -64,21 +64,22 @@
response = new SMBResponse(u, datum, this, getConf()); // make a request
int code = response.getCode();
- if(code == 200){
+ if (code == 200){
return new ProtocolOutput(response.toContent());
} else if (code >= 300 && code < 400){
if(redirects == MAX_REDIRECTS)
throw new SMBException("too many redirects: "+url);
u = new URL(response.getHeader("Location"));
redirects++;
- if(LOG.isTraceEnabled())
+ if (LOG.isTraceEnabled()) {
LOG.trace("redirect to "+u);
- else
- throw new SMBError(code);
+ }
+ // no return here: the enclosing while loop retries the request using the redirect target now held in u
+ } else {
+ throw new SMBError(code);
}
}
- }catch (Exception ex)
- {
+ } catch (Exception ex) {
return new ProtocolOutput(null, new ProtocolStatus(ex));
}
}
diff -urN ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java
--- ../protocol-smb/src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java 2007-01-05 10:34:38.000000000 -0500
+++ ./src/plugin/protocol-smb/src/java/org/apache/nutch/protocol/smb/SMBResponse.java 2008-11-07 23:36:57.307444000 -0500
@@ -9,12 +9,21 @@
import java.io.File;
import java.io.IOException;
+import java.io.UnsupportedEncodingException;
import java.net.URL;
+import java.util.BitSet;
+
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileInputStream;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+
+import org.apache.commons.codec.net.URLCodec;
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.lang.StringEscapeUtils;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
@@ -33,24 +42,50 @@
private int code;
private Metadata headers = new Metadata();
- private final SMB file;
+ private final SMB protocol;
private Configuration conf;
+
+ private static final URLCodec urlCodec = new URLCodec();
+ private static final BitSet safeUrlChars = new BitSet(256);
+
+  static {
+    // Populate safeUrlChars once, when the class is loaded. Afterwards the
+    // set holds exactly the ASCII letters, the ASCII digits and the four
+    // punctuation characters '-', '_', '.' and '*'; every other bit stays
+    // clear.
+    final char[][] inclusiveRanges = {{'a', 'z'}, {'A', 'Z'}, {'0', '9'}};
+    for (char[] range : inclusiveRanges) {
+      for (int ch = range[0]; ch <= range[1]; ch++) {
+        safeUrlChars.set(ch);
+      }
+    }
+
+    // Punctuation is listed one by one because it does not form a range.
+    final char[] punctuation = {'-', '_', '.', '*'};
+    for (char mark : punctuation) {
+      safeUrlChars.set(mark);
+    }
+  }
+
public static final Log LOG = LogFactory.getLog(SMB.class);
/** Creates a new instance of SMBResponse
* @param the url, the crawlDatum, the file, configuration for Nutch
* @throws SMBException, IOException
*/
- public SMBResponse(URL url, CrawlDatum datum, SMB file, Configuration conf)
- throws SMBException, IOException {
- this.orig = url.toString();
- this.base = url.toString();
- this.file = file;
+ public SMBResponse(URL url, CrawlDatum datum, SMB protocol, Configuration conf)
+ throws DecoderException, SMBException, IOException, UnsupportedEncodingException {
+ String urlString = url.toString();
+ String decodedUrlString = new String(URLCodec.decodeUrl(urlString.getBytes("UTF-8")), "UTF-8");
+ url = new URL(decodedUrlString);
+ this.orig = decodedUrlString;
+ this.base = decodedUrlString;
+ this.protocol = protocol;
this.conf = conf;
// check if protocol is a window file share protocol
if(!"smb".equals(url.getProtocol()))
- throw new SMBException("Not an SMB url: "+url);
+ throw new SMBException("Not an SMB url: " + url);
try {
this.content = null;
@@ -135,9 +170,15 @@
* http response code with a value of 200
* @param file - the SmbFile to retrieve content
*/
- private void getDirAsHttpResponse(SmbFile f) throws SmbException {
+ private void getDirAsHttpResponse(SmbFile f) throws SmbException, UnsupportedEncodingException {
String path = f.getName();
- this.content = list2html(f.listFiles(), path, "/".equals(path) ? false : true);
+ SmbFile[] fileList = {};
+ try {
+ fileList = f.listFiles();
+ } catch (SmbException e) {
+ // listing failed: keep fileList empty so an empty index is rendered instead of propagating the SmbException
+ }
+ this.content = list2html(fileList, path, false);
// set headers
headers.set(Response.CONTENT_TYPE, "text/html");
@@ -155,24 +196,24 @@
*/
private void getFileAsHttpResponse(SmbFile f) throws SmbException, SMBException, IOException {
long size = f.length();
- if(size > Integer.MAX_VALUE)
+ if (size > Integer.MAX_VALUE)
throw new SMBException("file is too large, size: "+size);
// capture content
int len = (int)size;
- byte[] bytes = new byte[len];
- java.io.InputStream is = new SmbFileInputStream(f);
+ this.content = new byte[len];
+ java.io.InputStream is = f.getInputStream();
int offset = 0;
int n = 0;
- while(offset < size && (n = is.read(this.content, offset, len-offset)) >= 0){
+ while (offset < size && (n = is.read(this.content, offset, len-offset)) >= 0){
offset += n;
+ LOG.debug("Read " + n + " bytes from " + f.getName());
}
- if(offset < len){
+ if (offset < len){
if(LOG.isWarnEnabled())
LOG.warn("not enough bytes read from file: "+f.getCanonicalPath());
}
-
is.close();
// set headers
@@ -189,10 +230,12 @@
* @param the smbFile, path, boolean value to include dotdot in the file path
* @return html content as byte
*/
- private byte[] list2html(SmbFile[] smbFile, String path, boolean includeDotDot) throws SmbException {
+ private byte[] list2html(SmbFile[] smbFile, String path, boolean includeDotDot)
+ throws SmbException, UnsupportedEncodingException {
+ String htmlPath = StringEscapeUtils.escapeHtml(path);
StringBuffer x = new StringBuffer("");
- x.append("Index of "+path+"\n");
- x.append("Index of "+path+"
");
+ x.append("Index of " + htmlPath + "\n");
+ x.append("Index of " + htmlPath + "
");
if(includeDotDot){
x.append("../\t-\t-\n");
@@ -201,12 +244,26 @@
SmbFile f;
for(int i=0; i"+name+"/\t");
+ if (f.isDirectory() || f.isFile()) {
+ x.append("" + htmlName + "\t");
x.append(time+"\t-\n");
- } else{} // ignore any other
+ } else {
+ // ignore any other
+ }
}
x.append("