diff -urN --exclude=.svn james-mime4j-trunk/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java james-mime4j/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java
--- james-mime4j-trunk/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java 2011-03-31 09:57:51.000000000 +0100
+++ james-mime4j/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java 2011-03-31 10:01:43.000000000 +0100
@@ -257,4 +257,132 @@
 
         return sb.toString();
     }
+
+    /**
+     * A struct to hold an encoded value. A parsed encoded value is stored as
+     * both the decoded value and the original encoded value (so that toString
+     * will produce the same result). An encoded value that is set explicitly is
+     * stored as the original value and the encoded value, to ensure that get
+     * will return the same value that was set.
+     */
+    public static class RFC2231Value {
+        /** Decoded text; the raw text until (or unless) decoding succeeds. */
+        public String value;
+        /** Charset name parsed from the encoded text; null if none was parsed. */
+        public String charset;
+        /** The text as it was before decoding. */
+        public String encodedValue;
+    }
+
+    /**
+     * Decode an RFC 2231 extended parameter value of the form
+     * <code>charset'language'percent-encoded-text</code>.
+     * <p>
+     * If a quote delimiter is missing, or the charset part is empty, the value
+     * is returned undecoded. A decoding failure is passed to the monitor
+     * helper and also leaves the value undecoded.
+     *
+     * @param value the raw parameter value
+     * @param monitor decode monitor passed to the monitor helper on failure
+     * @return a holder with the original text, the charset (once parsed) and
+     *         the decoded text
+     */
+    public static RFC2231Value decodeRFC2231Value(String value,
+            DecodeMonitor monitor) {
+        @SuppressWarnings("unused") String lang = null;
+        String charset = null;
+
+        RFC2231Value v = new RFC2231Value();
+        v.encodedValue = value;
+        v.value = value; // in case we fail to decode it
+
+        try {
+            int charsetDelimiter = value.indexOf('\'');
+            if (charsetDelimiter <= 0) {
+                return v; // not encoded correctly? return as is.
+            }
+
+            charset = value.substring(0, charsetDelimiter);
+            int langDelimiter = value.indexOf('\'', charsetDelimiter + 1);
+            if (langDelimiter < 0) {
+                return v; // not encoded correctly? return as is.
+            }
+
+            lang = value.substring(charsetDelimiter + 1, langDelimiter);
+            value = value.substring(langDelimiter + 1);
+            v.charset = charset;
+            v.value = decodeRFC2231Bytes(value, charset);
+        } catch (UnsupportedEncodingException e) {
+            // decodeRFC2231Bytes could not use this charset; keep the raw value
+            monitor(monitor, charset, "RFC2231", value, "leaving word encoded",
+                    "Unsupported encoding (", e.getMessage(),
+                    ") in encoded word");
+        } catch (RuntimeException e) {
+            monitor(monitor, charset, "RFC2231", value, "leaving word encoded",
+                    "Could not decode (", e.getMessage(), ") encoded word");
+        }
+
+        return v;
+    }
+
+    /**
+     * Decode a percent-encoded RFC 2231 parameter value that has no charset
+     * prefix of its own, using the given charset.
+     *
+     * @param value the percent-encoded text
+     * @param charset name of the charset to decode the bytes with
+     * @param monitor decode monitor passed to the monitor helper on failure
+     * @return the decoded text, or value unchanged if decoding failed
+     */
+    public static String decodeRFC2231Value(String value, String charset,
+            DecodeMonitor monitor) {
+        try {
+            value = decodeRFC2231Bytes(value, charset);
+        } catch (UnsupportedEncodingException e) {
+            // decodeRFC2231Bytes could not use this charset; keep the raw value
+            monitor(monitor, charset, "RFC2231", value, "leaving word encoded",
+                    "Unsupported encoding (", e.getMessage(),
+                    ") in encoded word");
+        } catch (RuntimeException e) {
+            monitor(monitor, charset, "RFC2231", value, "leaving word encoded",
+                    "Could not decode (", e.getMessage(), ") encoded word");
+        }
+
+        return value;
+    }
+
+    /**
+     * Decode the encoded bytes in RFC2231 value using the specified charset.
+     *
+     * @throws RuntimeException if a '%' is not followed by two hex digits
+     */
+    private static String decodeRFC2231Bytes(String value, final String charset)
+            throws UnsupportedEncodingException {
+        /*
+         * Decode the ASCII characters in value into an array of bytes, and then
+         * convert the bytes to a String using the specified charset. We'll
+         * never need more bytes than encoded characters, so use that to size
+         * the array.
+         */
+        byte[] b = new byte[value.length()];
+        int i, bi;
+        for (i = 0, bi = 0; i < value.length(); i++) {
+            char c = value.charAt(i);
+            if (c == '%') {
+                // Character.digit rejects a sign character, which
+                // Integer.parseInt would accept as part of the escape.
+                int hi = Character.digit(value.charAt(i + 1), 16);
+                int lo = Character.digit(value.charAt(i + 2), 16);
+                if (hi < 0 || lo < 0) {
+                    throw new IllegalArgumentException(
+                            "Invalid escape sequence at index " + i);
+                }
+                c = (char) ((hi << 4) | lo);
+                i += 2;
+            }
+            b[bi++] = (byte) c;
+        }
+
+        return new String(b, 0, bi, CharsetUtil.toJavaCharset(charset));
+    }
 }
diff -urN --exclude=.svn james-mime4j-trunk/dom/src/main/java/org/apache/james/mime4j/field/ContentDispositionFieldImpl.java james-mime4j/dom/src/main/java/org/apache/james/mime4j/field/ContentDispositionFieldImpl.java
--- james-mime4j-trunk/dom/src/main/java/org/apache/james/mime4j/field/ContentDispositionFieldImpl.java 2011-03-31 09:57:48.000000000 +0100
+++ james-mime4j/dom/src/main/java/org/apache/james/mime4j/field/ContentDispositionFieldImpl.java 2011-03-31 11:32:03.000000000 +0100
@@ -23,11 +23,15 @@
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.james.mime4j.codec.DecodeMonitor;
+import org.apache.james.mime4j.codec.DecoderUtil;
+import org.apache.james.mime4j.codec.DecoderUtil.RFC2231Value;
 import org.apache.james.mime4j.field.contentdisposition.parser.ContentDispositionParser;
 import org.apache.james.mime4j.field.contentdisposition.parser.ParseException;
 import org.apache.james.mime4j.field.contentdisposition.parser.TokenMgrError;
@@ -39,6 +43,41 @@
  */
 public class ContentDispositionFieldImpl extends AbstractField implements org.apache.james.mime4j.dom.field.ContentDispositionField {
 
+    /**
+     * A set of names for multi-segment parameters that we haven't processed
+     * yet. The names are accumulated by putParameter during the initial parse
+     * and processed by combineMultisegmentParameters at the end of the parse.
+     *
+     * A multi-segment parameter is defined by RFC 2231. For example,
+     * "title*0=part1; title*1=part2", which represents a parameter named
+     * "title" with value "part1part2".
+     *
+     * Note also that each segment of the value might or might not be encoded,
+     * indicated by a trailing "*" on the parameter name. If any segment is
+     * encoded, the first segment must be encoded. Only the first segment
+     * contains the charset and language information needed to decode any
+     * encoded segments.
+     *
+     * RFC 2231 introduces many possible failure modes, which we try to handle
+     * as gracefully as possible. Generally, a failure to decode a parameter
+     * value causes the non-decoded parameter value to be used instead. A
+     * missing segment ends the value and all later segments are discarded.
+     * For example, "title*0=part1; title*1=part2; title*3=part4" appears as
+     * one parameter named "title" with value "part1part2".
+     */
+    private Set<String> multisegmentNames = new HashSet<String>();
+
+    /**
+     * A map containing the segments for all not-yet-processed multi-segment
+     * parameters. The map is indexed by "name*seg". The value object is either
+     * a String or an RFC2231Value object. The RFC2231Value object is not
+     * decoded during the initial parse because the segments may appear in any
+     * order and until the first segment appears we don't know what charset to
+     * use to decode any encoded segments. The segments are decoded in order in
+     * the combineMultisegmentParameters method.
+     */
+    private Map<String, Object> segmentList = new HashMap<String, Object>();
+
     private boolean parsed = false;
 
     private String dispositionType = "";
@@ -236,14 +275,114 @@
                 for (int i = 0; i < len; i++) {
                     String paramName = paramNames.get(i).toLowerCase(Locale.US);
                     String paramValue = paramValues.get(i);
-                    parameters.put(paramName, paramValue);
+                    putParameter(paramName, paramValue);
                 }
+                combineMultisegmentParameters();
             }
         }
 
         parsed = true;
     }
 
+    /**
+     * Store one parsed parameter. If the name is an encoded or multi-segment
+     * name (or both) handle it appropriately: a single encoded value
+     * ("name*") is decoded and stored at once, while each segment of a
+     * multi-segment parameter ("name*N" or "name*N*") is saved in segmentList
+     * as a String or an undecoded RFC2231Value. Multi-segment names are stored
+     * in the main parameter map as an empty string as a placeholder, replaced
+     * later in combineMultisegmentParameters with the combined value.
+     */
+    private void putParameter(String name, String value) {
+        int star = name.indexOf('*');
+        if (star < 0) {
+            // single parameter, unencoded value
+            parameters.put(name, value);
+        } else if (star == name.length() - 1) {
+            // single parameter, encoded value
+            name = name.substring(0, star);
+            RFC2231Value v = DecoderUtil.decodeRFC2231Value(value, monitor);
+            parameters.put(name, v.value);
+        } else {
+            // multiple segments
+            String paramName = name.substring(0, star);
+            multisegmentNames.add(paramName);
+            parameters.put(paramName, "");
+
+            if (name.endsWith("*")) {
+                // encoded value
+                RFC2231Value valObject = new RFC2231Value();
+                valObject.encodedValue = value;
+                valObject.value = value; // default; decoded later
+
+                String segmentName = name.substring(0, name.length() - 1);
+                segmentList.put(segmentName, valObject);
+            } else {
+                // plain value
+                segmentList.put(name, value);
+            }
+        }
+    }
+
+    /**
+     * Iterate through the saved set of names of multi-segment parameters, for
+     * each parameter find all segments stored in the segmentList map, decode
+     * each segment as needed, and combine the segments together into a single
+     * decoded value that replaces the placeholder stored by putParameter. The
+     * first missing segment number ends the value. Both collections are
+     * cleared afterwards.
+     */
+    private void combineMultisegmentParameters() {
+        for (String name : multisegmentNames) {
+            StringBuilder paramValue = new StringBuilder();
+            String charset = null;
+
+            // find and decode each segment
+            int segment;
+            for (segment = 0;; segment++) {
+                Object v = segmentList.get(name + "*" + segment);
+                if (v == null) {
+                    // out of segments
+                    break;
+                }
+
+                String segmentValue;
+                if (!(v instanceof RFC2231Value)) {
+                    segmentValue = (String) v;
+                } else if (segment == 0) {
+                    // the first segment specifies charset for all other encoded segments
+                    RFC2231Value first = DecoderUtil.decodeRFC2231Value(
+                            ((RFC2231Value) v).encodedValue, monitor);
+                    charset = first.charset;
+                    segmentValue = first.value;
+                } else if (charset == null) {
+                    // Encoded segment but segment 0 supplied no charset
+                    // (malformed input). Keep what was combined so far. The
+                    // name set must not be modified here: removing from it
+                    // while iterating makes the iterator throw
+                    // ConcurrentModificationException. It is cleared below.
+                    break;
+                } else {
+                    segmentValue = DecoderUtil.decodeRFC2231Value(
+                            ((RFC2231Value) v).encodedValue, charset, monitor);
+                }
+
+                paramValue.append(segmentValue);
+            }
+
+            if (segment == 0) {
+                // didn't find any segments at all
+                parameters.remove(name);
+            } else {
+                parameters.put(name, paramValue.toString());
+            }
+        }
+
+        // clear out the set of names and segments
+        multisegmentNames.clear();
+        segmentList.clear();
+    }
+
     static final FieldParser PARSER = new FieldParser() {
         public ContentDispositionFieldImpl parse(final String name, final String body,
                 final ByteSequence raw, DecodeMonitor monitor) {
diff -urN --exclude=.svn james-mime4j-trunk/dom/src/test/java/org/apache/james/mime4j/field/ContentDispositionFieldTest.java james-mime4j/dom/src/test/java/org/apache/james/mime4j/field/ContentDispositionFieldTest.java
--- james-mime4j-trunk/dom/src/test/java/org/apache/james/mime4j/field/ContentDispositionFieldTest.java 2011-03-31 09:57:48.000000000 +0100
+++ james-mime4j/dom/src/test/java/org/apache/james/mime4j/field/ContentDispositionFieldTest.java 2011-03-31 11:22:57.000000000 +0100
@@ -201,4 +201,20 @@
         assertEquals(12, f.getSize());
     }
 
+    public void testGetRFC2231Filename() throws Exception {
+        ContentDispositionField f = null;
+
+        f = (ContentDispositionField) DefaultFieldParser
+                .parse("Content-Disposition: attachment; \r\n"
+                        + " filename*0*=\"UTF-8''%C4%B0nformasiyanin%20M%C9%99xfiliyi.r\";\r\n"
+                        + " filename*1*=\"ar\"\r\n");
+        assertEquals("\u0130nformasiyanin M\u0259xfiliyi.rar", f.getFilename());
+
+        f = (ContentDispositionField) DefaultFieldParser
+                .parse("Content-Disposition: attachment; \r\n"
+                        + " filename*0*=us-ascii'en'This%20is%20even%20more%20;\r\n"
+                        + " filename*2=\"isn't it.txt\";\r\n"
+                        + " filename*1*=%3D%3D%3Dmad%3D%3D%3D%20\r\n;");
+        assertEquals("This is even more ===mad=== isn't it.txt", f.getFilename());
+    }
 }