Index: src/java/org/apache/nutch/util/DeflateUtils.java
===================================================================
--- src/java/org/apache/nutch/util/DeflateUtils.java	(revision 0)
+++ src/java/org/apache/nutch/util/DeflateUtils.java	(revision 0)
@@ -0,0 +1,180 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.util;
+
+import java.io.ByteArrayOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.zip.Inflater;
+import java.util.zip.InflaterInputStream;
+import java.util.zip.DeflaterOutputStream;
+
+// Commons Logging imports
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A collection of utility methods for working on deflated data.
+ *
+ * <p>{@link #inflateBestEffort(byte[], int)} reads a raw deflate stream
+ * (no zlib header or checksum), whereas {@link #inflate(byte[])} and
+ * {@link #deflate(byte[])} work on zlib-wrapped data.  The output of
+ * <code>deflate</code> is therefore meant to be read back with
+ * <code>inflate</code>, not with <code>inflateBestEffort</code>.</p>
+ */
+public class DeflateUtils {
+
+  private static final Log LOG = LogFactory.getLog(DeflateUtils.class);
+  private static final int EXPECTED_COMPRESSION_RATIO = 5;
+  private static final int BUF_SIZE = 4096;
+
+  /**
+   * Returns an inflated copy of the input array.  If the deflated
+   * input has been truncated or corrupted, a best-effort attempt is
+   * made to inflate as much as possible.  If no data can be extracted
+   * an empty array is returned.
+   *
+   * @param in raw deflate data (no zlib header)
+   * @return the inflated bytes; never <code>null</code>
+   */
+  public static final byte[] inflateBestEffort(byte[] in) {
+    return inflateBestEffort(in, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Returns an inflated copy of the input array, truncated to
+   * <code>sizeLimit</code> bytes, if necessary.  If the deflated input
+   * has been truncated or corrupted, a best-effort attempt is made to
+   * inflate as much as possible.  If no data can be extracted
+   * an empty array is returned.
+   *
+   * @param in raw deflate data (no zlib header)
+   * @param sizeLimit maximum number of bytes to return; a negative
+   *        value is treated as "no limit"
+   * @return the inflated bytes; never <code>null</code>
+   */
+  public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
+    // A negative limit used to make the first write fail, which was
+    // swallowed and produced empty output; treat it as unlimited.
+    int limit = (sizeLimit < 0) ? Integer.MAX_VALUE : sizeLimit;
+
+    ByteArrayOutputStream outStream =
+      new ByteArrayOutputStream(initialCapacity(in.length, limit));
+
+    // "true" selects raw deflate: no zlib header or checksum expected
+    Inflater inflater = new Inflater(true);
+    InflaterInputStream inStream =
+      new InflaterInputStream(new ByteArrayInputStream(in), inflater);
+
+    byte[] buf = new byte[BUF_SIZE];
+    int written = 0;
+    try {
+      while (written < limit) {
+        int size = inStream.read(buf);
+        if (size <= 0) {
+          break;
+        }
+        // clamp against the remaining room rather than testing
+        // written + size, which could overflow
+        size = Math.min(size, limit - written);
+        outStream.write(buf, 0, size);
+        written += size;
+      }
+    } catch (IOException e) {
+      // truncated or corrupt input: keep what was inflated so far
+      LOG.info( "Caught Exception in inflateBestEffort" );
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+    } finally {
+      // the stream does not own a caller-supplied Inflater, so its
+      // native resources have to be released explicitly
+      inflater.end();
+    }
+
+    return outStream.toByteArray();
+  }
+
+  /**
+   * Returns an inflated copy of the input array.
+   *
+   * @param in zlib-wrapped deflate data
+   * @return the inflated bytes
+   * @throws IOException if the input cannot be properly decompressed
+   */
+  public static final byte[] inflate(byte[] in) throws IOException {
+    ByteArrayOutputStream outStream =
+      new ByteArrayOutputStream(
+        initialCapacity(in.length, Integer.MAX_VALUE));
+
+    InflaterInputStream inStream =
+      new InflaterInputStream(new ByteArrayInputStream(in));
+
+    try {
+      byte[] buf = new byte[BUF_SIZE];
+      while (true) {
+        int size = inStream.read(buf);
+        if (size <= 0) {
+          break;
+        }
+        outStream.write(buf, 0, size);
+      }
+    } finally {
+      // closing the stream releases the resources it holds
+      inStream.close();
+    }
+
+    return outStream.toByteArray();
+  }
+
+  /**
+   * Returns a deflated (zlib-wrapped) copy of the input array.
+   *
+   * @param in the bytes to compress
+   * @return the compressed bytes
+   */
+  public static final byte[] deflate(byte[] in) {
+    // compress using DeflaterOutputStream
+    ByteArrayOutputStream byteOut =
+      new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+
+    DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut);
+
+    try {
+      outStream.write(in);
+    } catch (IOException e) {
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+    }
+
+    try {
+      // close() writes the remaining compressed data to byteOut
+      outStream.close();
+    } catch (IOException e) {
+      e.printStackTrace(LogUtil.getWarnStream(LOG));
+    }
+
+    return byteOut.toByteArray();
+  }
+
+  /**
+   * Returns an initial buffer size based on the expected compression
+   * ratio, computed without int overflow and capped at limit.
+   */
+  private static int initialCapacity(int compressedLength, int limit) {
+    long expected = (long) EXPECTED_COMPRESSION_RATIO * compressedLength;
+    return (int) Math.min(expected, (long) limit);
+  }
+
+}

Property changes on: src/java/org/apache/nutch/util/DeflateUtils.java
___________________________________________________________________
Name: svn:executable
   + *

Index: src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
===================================================================
--- src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(revision 429630)
+++ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java	(working copy)
@@ -36,6 +36,7 @@
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
 import org.apache.nutch.util.GZIPUtils;
+import org.apache.nutch.util.DeflateUtils;
 import org.apache.nutch.util.LogUtil;
 
 // Hadoop imports
@@ -461,7 +462,35 @@
     }
     return content;
   }
-  
+
+  /**
+   * Inflates a response body received with
+   * <code>Content-Encoding: deflate</code>, keeping at most
+   * <code>getMaxContent()</code> bytes of the inflated data.
+   *
+   * @param compressed the deflate-encoded response body
+   * @param url the URL the content was fetched from (used for logging)
+   * @return the inflated content
+   * @throws IOException if inflating yields no result (<code>null</code>)
+   */
+  public byte[] processDeflateEncoded(byte[] compressed, URL url) throws IOException {
+
+    if (LOGGER.isTraceEnabled()) { LOGGER.trace("inflating...."); }
+
+    byte[] content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
+
+    if (content == null) {
+      throw new IOException("inflateBestEffort returned null");
+    }
+
+    if (LOGGER.isTraceEnabled()) {
+      LOGGER.trace("fetched " + compressed.length
+                   + " bytes of compressed content (expanded to "
+                   + content.length + " bytes) from " + url);
+    }
+    return content;
+  }
+
   protected static void main(HttpBase http, String[] args) throws Exception {
     boolean verbose = false;
     String url = null;
Index: src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
===================================================================
--- src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(revision 429630)
+++ src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java	(working copy)
@@ -110,7 +110,7 @@
       reqStr.append(portString);
       reqStr.append("\r\n");
 
-      reqStr.append("Accept-Encoding: x-gzip, gzip\r\n");
+      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
 
       String userAgent = http.getUserAgent();
       if ((userAgent == null) || (userAgent.length() == 0)) {
@@ -148,6 +148,8 @@
       String contentEncoding = getHeader(Response.CONTENT_ENCODING);
       if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
         content = http.processGzipEncoded(content, url);
+      } else if ("deflate".equals(contentEncoding)) {
+        content = http.processDeflateEncoded(content, url);
       } else {
         if (Http.LOG.isTraceEnabled()) {
           Http.LOG.trace("fetched " + content.length + " bytes from " + url);