Bug 663057 - support RFC2231/5987 encoding for title parameter in HTTP link header fields. r=hsivonen

This commit is contained in:
julian.reschke@gmx.de 2012-05-21 06:31:00 -07:00
parent b2e906975f
commit 7d5a2dd5d4
7 changed files with 302 additions and 20 deletions

View File

@ -416,6 +416,48 @@ nsContentSink::LinkContextIsOurDocument(const nsSubstring& aAnchor)
return same;
}
// Decodes, in place, a parameter value written in the RFC 5987 notation:
//
//   charset "'" [ language ] "'" value-chars
//
// On success aEncoded is replaced by the decoded text and true is returned.
// False is returned, with aEncoded left untouched, when the MIME header
// parameter service cannot be obtained, when the input contains a code unit
// outside the range 1..127, or when the service rejects the value.
bool
nsContentSink::Decode5987Format(nsAString& aEncoded) {

  nsresult rv;
  nsCOMPtr<nsIMIMEHeaderParam> headerParamService =
    do_GetService(NS_MIMEHEADERPARAM_CONTRACTID, &rv);
  if (NS_FAILED(rv)) {
    return false;
  }

  // The decoder takes a narrow string, so copy the input one code unit at a
  // time and give up on the first unit that is not a non-NUL ASCII character.
  nsCAutoString narrowed;
  const PRUnichar* const stop = aEncoded.EndReading();
  for (const PRUnichar* cur = aEncoded.BeginReading(); cur != stop; ++cur) {
    const PRUnichar unit = *cur;
    if (!(unit > 0 && unit < 128)) {
      return false;
    }
    narrowed.Append(static_cast<char>(unit));
  }

  nsAutoString decodedValue;
  nsCAutoString ignoredLanguage; // required out-param; not used by this caller
  rv = headerParamService->DecodeRFC5987Param(narrowed, ignoredLanguage,
                                              decodedValue);
  if (NS_FAILED(rv)) {
    return false;
  }

  aEncoded = decodedValue;
  return true;
}
nsresult
nsContentSink::ProcessLinkHeader(nsIContent* aElement,
const nsAString& aLinkData)
@ -429,6 +471,7 @@ nsContentSink::ProcessLinkHeader(nsIContent* aElement,
nsAutoString href;
nsAutoString rel;
nsAutoString title;
nsAutoString titleStar;
nsAutoString type;
nsAutoString media;
nsAutoString anchor;
@ -453,7 +496,7 @@ nsContentSink::ProcessLinkHeader(nsIContent* aElement,
end = start;
last = end - 1;
bool needsUnescape = false;
bool wasQuotedString = false;
// look for semicolon or comma
while (*end != kNullCh && *end != kSemicolon && *end != kComma) {
@ -467,14 +510,14 @@ nsContentSink::ProcessLinkHeader(nsIContent* aElement,
quote = kGreaterThan;
}
needsUnescape = (ch == kQuote);
wasQuotedString = (ch == kQuote);
PRUnichar* closeQuote = (end + 1);
// seek closing quote
while (*closeQuote != kNullCh && quote != *closeQuote) {
// in quoted-string, "\" is an escape character
if (needsUnescape && *closeQuote == kBackSlash && *(closeQuote + 1) != kNullCh) {
if (wasQuotedString && *closeQuote == kBackSlash && *(closeQuote + 1) != kNullCh) {
++closeQuote;
}
@ -549,7 +592,7 @@ nsContentSink::ProcessLinkHeader(nsIContent* aElement,
value++;
}
if (needsUnescape) {
if (wasQuotedString) {
// unescape in-place
PRUnichar* unescaped = value;
PRUnichar *src = value;
@ -574,6 +617,20 @@ nsContentSink::ProcessLinkHeader(nsIContent* aElement,
title = value;
title.CompressWhitespace();
}
} else if (attr.LowerCaseEqualsLiteral("title*")) {
if (titleStar.IsEmpty() && !wasQuotedString) {
// RFC 5987 encoding; uses token format only, so skip if we get
// here with a quoted-string
nsAutoString tmp;
tmp = value;
if (Decode5987Format(tmp)) {
titleStar = tmp;
titleStar.CompressWhitespace();
} else {
// header value did not parse, throw it away
titleStar.Truncate();
}
}
} else if (attr.LowerCaseEqualsLiteral("type")) {
if (type.IsEmpty()) {
type = value;
@ -602,7 +659,10 @@ nsContentSink::ProcessLinkHeader(nsIContent* aElement,
href.Trim(" \t\n\r\f"); // trim HTML5 whitespace
if (!href.IsEmpty() && !rel.IsEmpty()) {
rv = ProcessLink(aElement, anchor, href, rel, title, type, media);
rv = ProcessLink(aElement, anchor, href, rel,
// prefer RFC 5987 variant over non-I18zed version
titleStar.IsEmpty() ? title : titleStar,
type, media);
}
href.Truncate();
@ -620,7 +680,10 @@ nsContentSink::ProcessLinkHeader(nsIContent* aElement,
href.Trim(" \t\n\r\f"); // trim HTML5 whitespace
if (!href.IsEmpty() && !rel.IsEmpty()) {
rv = ProcessLink(aElement, anchor, href, rel, title, type, media);
rv = ProcessLink(aElement, anchor, href, rel,
// prefer RFC 5987 variant over non-I18zed version
titleStar.IsEmpty() ? title : titleStar,
type, media);
}
return rv;

View File

@ -117,6 +117,7 @@ class nsContentSink : public nsICSSLoaderObserver,
bool IsTimeToNotify();
bool LinkContextIsOurDocument(const nsSubstring& aAnchor);
bool Decode5987Format(nsAString& aEncoded);
static void InitializeStatics();

View File

@ -25,6 +25,8 @@ interface nsIUTF8ConverterService : nsISupports
* The most common case is the input is in 7bit non-ASCII charsets
* like ISO-2022-JP, HZ or UTF-7 (in its original form or
* a modified form used in IMAP folder names).
* @param aAllowSubstitution when true, allow the decoder to substitute
* invalid input sequences by replacement characters
* @return the converted string in UTF-8.
* @throws NS_ERROR_UCONV_NOCONV when there is no decoder for aCharset
* or error code of nsIUnicodeDecoder in case of conversion failure
@ -32,7 +34,8 @@ interface nsIUTF8ConverterService : nsISupports
AUTF8String convertStringToUTF8(in ACString aString,
in string aCharset,
in boolean aSkipCheck);
in boolean aSkipCheck,
in boolean aAllowSubstitution);
/* XXX : To-be-added. convertStringFromUTF8 */

View File

@ -17,7 +17,8 @@
NS_IMPL_ISUPPORTS1(nsUTF8ConverterService, nsIUTF8ConverterService)
static nsresult
ToUTF8(const nsACString &aString, const char *aCharset, nsACString &aResult)
ToUTF8(const nsACString &aString, const char *aCharset,
bool aAllowSubstitution, nsACString &aResult)
{
nsresult rv;
if (!aCharset || !*aCharset)
@ -33,6 +34,9 @@ ToUTF8(const nsACString &aString, const char *aCharset, nsACString &aResult)
getter_AddRefs(unicodeDecoder));
NS_ENSURE_SUCCESS(rv, rv);
if (!aAllowSubstitution)
unicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
PRInt32 srcLen = aString.Length();
PRInt32 dstLen;
const nsAFlatCString& inStr = PromiseFlatCString(aString);
@ -54,6 +58,7 @@ NS_IMETHODIMP
nsUTF8ConverterService::ConvertStringToUTF8(const nsACString &aString,
const char *aCharset,
bool aSkipCheck,
bool aAllowSubstitution,
nsACString &aUTF8String)
{
// return if ASCII only or valid UTF-8 providing that the ASCII/UTF-8
@ -67,7 +72,7 @@ nsUTF8ConverterService::ConvertStringToUTF8(const nsACString &aString,
aUTF8String.Truncate();
nsresult rv = ToUTF8(aString, aCharset, aUTF8String);
nsresult rv = ToUTF8(aString, aCharset, aAllowSubstitution, aUTF8String);
// additional protection for cases where check is skipped and the input
// is actually in UTF-8 as opposed to aCharset. (i.e. caller's hunch
@ -111,6 +116,6 @@ nsUTF8ConverterService::ConvertURISpecToUTF8(const nsACString &aSpec,
return NS_OK;
}
return ToUTF8(unescapedSpec, aCharset, aUTF8Spec);
return ToUTF8(unescapedSpec, aCharset, true, aUTF8Spec);
}

View File

@ -77,6 +77,30 @@ interface nsIMIMEHeaderParam : nsISupports {
in boolean aTryLocaleCharset,
out string aLang);
/**
* Given the value of a header field parameter using the encoding
* defined in RFC 5987, decode the value into a Unicode string, and extract
* the optional language parameter.
*
* <p>
* This function is purposefully picky; it will abort for all (most?)
* invalid inputs. This is by design. In particular, it does not support
* any character encodings other than UTF-8, in order not to promote
* non-interoperable usage.
*
* <p>
* This code is currently not used inside nsMIMEHeaderParamImpl, but
* might be in the future. New code that needs RFC2231/5987
* encoding should use this one.
*
* @param aParamVal a header field parameter to decode.
* @param aLang will be set to the language part (possibly
* empty).
* @return the decoded parameter value.
*/
AString decodeRFC5987Param(in ACString aParamVal,
out ACString aLang);
/**
* Given the value of a single header field (such as
* Content-Disposition and Content-Type) and the name of a parameter

View File

@ -21,6 +21,7 @@
#include "nsReadableUtils.h"
#include "nsNativeCharsetUtils.h"
#include "nsNetError.h"
#include "nsIUnicodeDecoder.h"
// static functions declared below are moved from mailnews/mime/src/comi18n.cpp
@ -96,7 +97,8 @@ nsMIMEHeaderParamImpl::DoGetParameter(const nsACString& aHeaderVal,
cvtUTF8(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID));
if (cvtUTF8 &&
NS_SUCCEEDED(cvtUTF8->ConvertStringToUTF8(str1,
PromiseFlatCString(aFallbackCharset).get(), false, str2))) {
PromiseFlatCString(aFallbackCharset).get(), false, true,
str2))) {
CopyUTF8toUTF16(str2, aResult);
return NS_OK;
}
@ -279,7 +281,7 @@ bool IsValidOctetSequenceForCharset(nsACString& aCharset, const char *aOctets)
nsresult rv = cvtUTF8->ConvertStringToUTF8(tmpRaw,
PromiseFlatCString(aCharset).get(),
true, tmpDecoded);
false, false, tmpDecoded);
if (rv != NS_OK) {
// we can't decode; charset may be unsupported, or the octet sequence
@ -709,6 +711,135 @@ nsMIMEHeaderParamImpl::DecodeRFC2047Header(const char* aHeaderVal,
return NS_OK;
}
// Reports whether aChar may appear unescaped in an RFC 5987 value
// (the "attr-char" production of RFC 5987, Section 3.2.1): an ASCII letter,
// an ASCII digit, or one of  ! # $ & + - . ^ _ ` | ~
bool IsRFC5987AttrChar(char aChar)
{
  if (aChar >= 'a' && aChar <= 'z') {
    return true;
  }
  if (aChar >= 'A' && aChar <= 'Z') {
    return true;
  }
  if (aChar >= '0' && aChar <= '9') {
    return true;
  }

  switch (aChar) {
    case '!': case '#': case '$': case '&':
    case '+': case '-': case '.': case '^':
    case '_': case '`': case '|': case '~':
      return true;
    default:
      return false;
  }
}
// Reports whether aChar is an ASCII hexadecimal digit (0-9, a-f or A-F).
bool IsHexDigit(char aChar)
{
  if (aChar >= '0' && aChar <= '9') {
    return true;
  }
  if (aChar >= 'a' && aChar <= 'f') {
    return true;
  }
  return aChar >= 'A' && aChar <= 'F';
}
// Percent-decodes aValue, replacing its contents with the decoded form.
//
// The value is copied into a scratch buffer, run through nsUnescape() and
// assigned back as a NUL-terminated C string.
// Returns false only when the scratch buffer cannot be allocated (aValue is
// then left untouched); returns true otherwise.
// NOTE(review): because the copy and the re-assignment are NUL-terminated,
// a decoded NUL octet presumably truncates the result -- confirm against
// nsUnescape() if that matters to a caller.
bool PercentDecode(nsACString& aValue)
{
  // one extra byte for the terminating NUL
  char *scratch = static_cast<char *>(nsMemory::Alloc(aValue.Length() + 1));
  if (scratch) {
    strcpy(scratch, PromiseFlatCString(aValue).get());
    nsUnescape(scratch);
    aValue.Assign(scratch);
    nsMemory::Free(scratch);
    return true;
  }

  return false;
}
// Decode a parameter value using the encoding defined in RFC 5987
//
//   charset "'" [ language ] "'" value-chars
//
// aParamVal  the raw parameter value; must be pure US-ASCII
// aLang      receives the language component (possibly empty)
// aResult    receives the decoded value as UTF-16
//
// Returns NS_ERROR_INVALID_ARG when the input contains a non-ASCII octet,
// a character that is neither an attr-char nor part of a complete "%XX"
// escape, anything other than exactly two "'" delimiters, or a charset
// other than UTF-8 (compared case-insensitively). Returns
// NS_ERROR_OUT_OF_MEMORY when percent-decoding cannot allocate. Failures of
// the UTF-8 converter service are passed through unchanged.
NS_IMETHODIMP
nsMIMEHeaderParamImpl::DecodeRFC5987Param(const nsACString& aParamVal,
                                          nsACString& aLang,
                                          nsAString& aResult)
{
  nsCAutoString charset;
  nsCAutoString language;
  nsCAutoString value;

  PRUint32 delimiters = 0;

  // Keep the flattened string alive through a named reference. Calling
  // .get() directly on the PromiseFlatCString temporary would leave the
  // pointer dangling once that statement ends whenever flattening had to
  // make a copy.
  const nsAFlatCString& encoded = PromiseFlatCString(aParamVal);
  const char *c = encoded.get();

  while (*c) {
    char tc = *c++;

    if (tc == '\'') {
      // single quote
      delimiters++;
    } else if (static_cast<unsigned char>(tc) >= 128) {
      // fail early, not ASCII
      // (compare as unsigned: where plain char is signed, octets with the
      // high bit set are negative and "tc >= 128" could never be true)
      NS_WARNING("non-US-ASCII character in RFC5987-encoded param");
      return NS_ERROR_INVALID_ARG;
    } else {
      if (delimiters == 0) {
        // valid characters are checked later implicitly
        charset.Append(tc);
      } else if (delimiters == 1) {
        // no value checking for now
        language.Append(tc);
      } else if (delimiters == 2) {
        if (IsRFC5987AttrChar(tc)) {
          value.Append(tc);
        } else if (tc == '%') {
          // The && / || short-circuit keeps us from reading past the
          // terminating NUL when the escape is cut short.
          if (!IsHexDigit(c[0]) || !IsHexDigit(c[1])) {
            // we expect two more characters
            NS_WARNING("broken %-escape in RFC5987-encoded param");
            return NS_ERROR_INVALID_ARG;
          }
          value.Append(tc);
          // we consume two more
          value.Append(*c++);
          value.Append(*c++);
        } else {
          // character not allowed here
          NS_WARNING("invalid character in RFC5987-encoded param");
          return NS_ERROR_INVALID_ARG;
        }
      }
      // With more than two delimiters the remaining characters are skipped;
      // the delimiter count check below rejects the input.
    }
  }

  if (delimiters != 2) {
    NS_WARNING("missing delimiters in RFC5987-encoded param");
    return NS_ERROR_INVALID_ARG;
  }

  // abort early for unsupported encodings
  if (!charset.LowerCaseEqualsLiteral("utf-8")) {
    NS_WARNING("unsupported charset in RFC5987-encoded param");
    return NS_ERROR_INVALID_ARG;
  }

  // percent-decode
  if (!PercentDecode(value)) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  // return the language
  aLang.Assign(language);

  // finally convert octet sequence to UTF-8 and be done; substitution is
  // disabled so that malformed octet sequences are reported as errors
  nsresult rv = NS_OK;
  nsCOMPtr<nsIUTF8ConverterService> cvtUTF8 =
    do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID, &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  nsCAutoString utf8;
  rv = cvtUTF8->ConvertStringToUTF8(value, charset.get(), true, false, utf8);
  NS_ENSURE_SUCCESS(rv, rv);

  CopyUTF8toUTF16(utf8, aResult);
  return NS_OK;
}
NS_IMETHODIMP
nsMIMEHeaderParamImpl::DecodeParameter(const nsACString& aParamValue,
const char* aCharset,
@ -724,7 +855,7 @@ nsMIMEHeaderParamImpl::DecodeParameter(const nsACString& aParamValue,
nsCOMPtr<nsIUTF8ConverterService> cvtUTF8(do_GetService(NS_UTF8CONVERTERSERVICE_CONTRACTID));
if (cvtUTF8)
return cvtUTF8->ConvertStringToUTF8(aParamValue, aCharset,
true, aResult);
true, true, aResult);
}
const nsAFlatCString& param = PromiseFlatCString(aParamValue);
@ -904,7 +1035,7 @@ void CopyRawHeader(const char *aInput, PRUint32 aLen,
if (cvtUTF8 &&
NS_SUCCEEDED(
cvtUTF8->ConvertStringToUTF8(Substring(aInput, aInput + aLen),
aDefaultCharset, skipCheck, utf8Text))) {
aDefaultCharset, skipCheck, true, utf8Text))) {
aOutput.Append(utf8Text);
} else { // replace each octet with Unicode replacement char in UTF-8.
for (PRUint32 i = 0; i < aLen; i++) {
@ -1037,7 +1168,7 @@ nsresult DecodeRFC2047Str(const char *aHeader, const char *aDefaultCharset,
if (cvtUTF8 &&
NS_SUCCEEDED(
cvtUTF8->ConvertStringToUTF8(nsDependentCString(decodedText),
charset, IS_7BIT_NON_ASCII_CHARSET(charset), utf8Text))) {
charset, IS_7BIT_NON_ASCII_CHARSET(charset), true, utf8Text))) {
aResult.Append(utf8Text);
} else {
aResult.Append(REPLACEMENT_CHAR);

View File

@ -391,6 +391,35 @@ var tests = [
"attachment", "foo"],
];
// Test vectors for nsIMIMEHeaderParam.decodeRFC5987Param, consumed by
// test_decode5987Param(). Each entry is a four-element array:
//   [0] the encoded parameter value handed to decodeRFC5987Param
//   [1] the expected decoded value (only checked when [3] is Cr.NS_OK)
//   [2] the expected language (only checked when [3] is Cr.NS_OK)
//   [3] the expected result: Cr.NS_OK, or the error code the call is
//       expected to throw
var rfc5987paramtests = [
[ // basic test
"UTF-8'language'value", "value", "language", Cr.NS_OK ],
[ // percent decoding
"UTF-8''1%202", "1 2", "", Cr.NS_OK ],
[ // UTF-8
"UTF-8''%c2%a3%20and%20%e2%82%ac%20rates", "\u00a3 and \u20ac rates", "", Cr.NS_OK ],
[ // missing charset
"''abc", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // ISO-8859-1: unsupported
"ISO-8859-1''%A3%20rates", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // unknown charset
"foo''abc", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // missing component
"abc", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // missing component
"'abc", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // illegal chars
"UTF-8''a b", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // broken % escapes
"UTF-8''a%zz", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // broken % escapes
"UTF-8''a%b", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // broken % escapes
"UTF-8''a%", "", "", Cr.NS_ERROR_INVALID_ARG ],
[ // broken UTF-8
"UTF-8''%A3%20rates", "", "", 0x8050000E /* NS_ERROR_UDEC_ILLEGALINPUT */ ],
];
function do_tests(whichRFC)
{
var mhp = Components.classes["@mozilla.org/network/mime-hdrparam;1"]
@ -451,12 +480,38 @@ function do_tests(whichRFC)
}
}
function run_test() {
// Feeds every entry of rfc5987paramtests to
// nsIMIMEHeaderParam.decodeRFC5987Param and checks either the decoded value
// and language (entries expecting Cr.NS_OK) or the thrown result code
// (entries expecting an error).
// NOTE(review): the do_tests(0) / do_tests(1) calls and their comments below
// look like they belong to run_test() rather than to this function -- confirm.
function test_decode5987Param() {
var mhp = Components.classes["@mozilla.org/network/mime-hdrparam;1"]
.getService(Components.interfaces.nsIMIMEHeaderParam);
// Test RFC 2231
do_tests(0);
for (var i = 0; i < rfc5987paramtests.length; ++i) {
dump("Testing #" + i + ": " + rfc5987paramtests[i] + "\n");
// Test RFC 5987
do_tests(1);
// holder for the out parameter; the language is read back from lang.value
var lang = {};
try {
var decoded = mhp.decodeRFC5987Param(rfc5987paramtests[i][0], lang);
if (rfc5987paramtests[i][3] == Cr.NS_OK) {
do_check_eq(rfc5987paramtests[i][1], decoded);
do_check_eq(rfc5987paramtests[i][2], lang.value);
}
else {
// the call was expected to throw but returned: compare the expected error
// code against a message string so the mismatch reports the decoded value
do_check_eq(rfc5987paramtests[i][3], "instead got: " + decoded);
}
}
catch (e) {
// an exception was raised: its result code must match the expected one
do_check_eq(rfc5987paramtests[i][3], e.result);
}
}
}
// Test driver: runs the complete-header-field tests in both modes, then the
// stand-alone RFC 5987 parameter decoding tests.
function run_test() {
  // Complete header field values: do_tests(0) is the RFC 2231 run,
  // do_tests(1) the RFC 5987 run.
  for (var whichRFC = 0; whichRFC <= 1; ++whichRFC) {
    do_tests(whichRFC);
  }

  // tests for RFC5987 parameter parsing
  test_decode5987Param();
}