Sniff "text/plain; charset=UTF-8" for possibly being binary. Bug 394647, r+sr=biesi

This commit is contained in:
bzbarsky@mit.edu 2007-12-03 10:37:54 -08:00
parent e628936671
commit a051d516ec
2 changed files with 186 additions and 4 deletions

View File

@ -686,13 +686,15 @@ nsBinaryDetector::DetermineContentType(nsIRequest* aRequest)
// Make sure to do a case-sensitive exact match comparison here. Apache
// 1.x just sends text/plain for "unknown", while Apache 2.x sends
// text/plain with a ISO-8859-1 charset. Debian's Apache version, just to
// be different, sends text/plain with iso-8859-1 charset. Don't do
// general case-insensitive comparison, since we really want to apply this
// crap as rarely as we can.
// be different, sends text/plain with iso-8859-1 charset. For extra fun,
// FC7, RHEL4, and Ubuntu Feisty send charset=UTF-8. Don't do general
// case-insensitive comparison, since we really want to apply this crap as
// rarely as we can.
if (!contentType.EqualsLiteral("text/plain") ||
(!contentTypeHdr.EqualsLiteral("text/plain") &&
!contentTypeHdr.EqualsLiteral("text/plain; charset=ISO-8859-1") &&
!contentTypeHdr.EqualsLiteral("text/plain; charset=iso-8859-1"))) {
!contentTypeHdr.EqualsLiteral("text/plain; charset=iso-8859-1") &&
!contentTypeHdr.EqualsLiteral("text/plain; charset=UTF-8"))) {
return;
}

View File

@ -0,0 +1,180 @@
// Test the plaintext-or-binary sniffer
do_import_script("netwerk/test/httpserver/httpd.js");
// Content-Type headers to exercise.  Every entry of the resulting list is a
// two-element array: [ header string, whether sniffing is allowed for it ].
// The headers are grouped by expectation; the resulting order (allowed
// headers first, then disallowed ones) determines each header's index.
var contentTypeHeaderList = (function buildHeaderList() {
  // Exact header strings for which sniffing is expected to be allowed.
  var sniffed = [
    "text/plain",
    "text/plain; charset=ISO-8859-1",
    "text/plain; charset=iso-8859-1",
    "text/plain; charset=UTF-8"
  ];
  // Unknown charsets, extra parameters and spelling/case variants: sniffing
  // is expected to be refused for all of these.
  var notSniffed = [
    "text/plain; charset=unknown",
    "text/plain; param",
    "text/plain; charset=ISO-8859-1; param",
    "text/plain; charset=iso-8859-1; param",
    "text/plain; charset=UTF-8; param",
    "text/plain; charset=utf-8",
    "text/plain; charset=utf8",
    "text/plain; charset=UTF8",
    "text/plain; charset=iSo-8859-1"
  ];
  var list = [];
  var k;
  for (k = 0; k < sniffed.length; ++k) {
    list.push([ sniffed[k], true ]);
  }
  for (k = 0; k < notSniffed.length; ++k) {
    list.push([ notSniffed[k], false ]);
  }
  return list;
})();
// Response bodies to exercise.  Every entry is a two-element array:
// [ body string, whether that body is expected to sniff as binary ].
var bodyList = [];
// Seed entry: an ordinary ASCII body, which is expected not to sniff as
// binary.
bodyList.push([ "Plaintext", false ]);
// Byte-order marks that get prepended to test bodies.  The order of the
// list is significant because bodies are generated by walking it in order.
var BOMList = (function buildBOMList() {
  var utf16BE = "\xFE\xFF";
  var utf16LE = "\xFF\xFE";
  var utf8 = "\xEF\xBB\xBF";
  var ucs4BE = "\x00\x00\xFE\xFF";
  // NOTE(review): the standard UTF-32LE signature is FF FE 00 00; the byte
  // order used here (00 00 FF FE) is kept exactly as this test has always
  // had it -- confirm it matches what the sniffer actually checks for.
  var ucs4LE = "\x00\x00\xFF\xFE";
  return [ utf16BE, utf16LE, utf8, ucs4BE, ucs4LE ];
})();
// Helper used while building up bodyList: tells whether a character code is
// one the sniffer is expected to treat as binary.  Those are the control
// codes 0-8, 14-26 and 28-31, i.e. the whole 0-31 range except tab (9),
// newline (10), vertical tab (11), form feed (12), carriage return (13) and
// ESC (27).  The original author notes ESC is apparently used by Shift_JIS
// text (NOTE(review): ISO-2022-JP is the encoding more commonly associated
// with ESC sequences -- confirm the rationale).
//
// ch is a numeric character code; the result is a boolean.
function isBinaryChar(ch) {
  // Inclusive [low, high] ranges of codes that count as binary.
  var binaryRanges = [ [ 0, 8 ], [ 14, 26 ], [ 28, 31 ] ];
  for (var k = 0; k < binaryRanges.length; ++k) {
    if (binaryRanges[k][0] <= ch && ch <= binaryRanges[k][1]) {
      return true;
    }
  }
  return false;
}
// Generated test bodies are appended to bodyList in three passes.  The
// order of the passes (and of the pushes inside them) fixes each body's
// index in bodyList.  i and j are kept as file-level declarations since
// code elsewhere in the file may refer to them.
var i;
var j;

// Appends one [ body string, expected-to-sniff-as-binary ] pair to bodyList.
function addBodyCase(body, expectBinary) {
  bodyList.push([ body, expectBinary ]);
}

// Pass 1: every 7-bit character code as a one-character body; whether it
// should sniff as binary is decided by isBinaryChar alone.
for (i = 0; i < 128; ++i) {
  addBodyCase(String.fromCharCode(i), isBinaryChar(i));
}

// Pass 2: each BOM followed by the character twice.  Having a BOM is
// expected to prevent the body from sniffing as binary.
for (i = 0; i < 128; ++i) {
  for (j = 0; j < BOMList.length; ++j) {
    addBodyCase(BOMList[j] + String.fromCharCode(i, i), false);
  }
}

// Pass 3: each BOM followed by a single character.  The BOM only kicks in
// once the body has at least 4 chars, so with a two-char BOM (three chars
// in total) the trailing character still decides the outcome; the longer
// BOMs reach four chars and are expected to suppress binary sniffing.
for (i = 0; i < 128; ++i) {
  for (j = 0; j < BOMList.length; ++j) {
    addBodyCase(BOMList[j] + String.fromCharCode(i),
                BOMList[j].length == 2 && isBinaryChar(i));
  }
}
// Creates the HTTP channel for one test case.  The case is addressed by
// URL path: http://localhost:4444/<headerIdx>/<bodyIdx>.  The returned
// channel has LOAD_CALL_CONTENT_SNIFFERS added to its load flags.
function makeChan(headerIdx, bodyIdx) {
  var Cc = Components.classes;
  var Ci = Components.interfaces;

  var ioService = Cc["@mozilla.org/network/io-service;1"]
                    .getService(Ci.nsIIOService);
  var url = "http://localhost:4444/" + headerIdx + "/" + bodyIdx;
  var httpChan = ioService.newChannel(url, null, null)
                          .QueryInterface(Ci.nsIHttpChannel);

  // Add the flag on top of whatever load flags the channel already has.
  httpChan.loadFlags =
    httpChan.loadFlags | Ci.nsIChannel.LOAD_CALL_CONTENT_SNIFFERS;
  return httpChan;
}
// Builds the stream listener that checks one (header, body) test case and
// then kicks off the next one.
//
// headerIdx indexes contentTypeHeaderList and bodyIdx indexes bodyList.
// The returned object has onStartRequest, onDataAvailable and onStopRequest
// methods.  Note that onStopRequest advances the captured indices in place.
function makeListener(headerIdx, bodyIdx) {
  return {
    // Compares the channel's content type with the expected one.  The
    // binary-guess type is expected only when the header allows sniffing
    // AND the body should sniff as binary; otherwise text/plain.
    onStartRequest : function test_onStartR(request, ctx) {
      try {
        var channel =
          request.QueryInterface(Components.interfaces.nsIChannel);
        var sniffedType = channel.contentType;

        var headerCase = contentTypeHeaderList[headerIdx];
        var expectedType = "text/plain";
        if (headerCase[1] && bodyList[bodyIdx][1]) {
          expectedType = "application/x-vnd.mozilla.guess-from-ext";
        }

        if (sniffedType != expectedType) {
          var bodyCase = bodyList[bodyIdx];
          var message =
            "Unexpected sniffed type '" + sniffedType + "'. " +
            "Should be '" + expectedType + "'. " +
            "Header is ['" +
            headerCase[0] + "', " +
            headerCase[1] + "]. " +
            "Body is ['" +
            bodyCase[0].toSource() + "', " +
            bodyCase[1] +
            "].";
          do_throw(message);
        }

        do_check_eq(expectedType, sniffedType);
      } catch (e) {
        // Anything thrown above (including by do_throw itself) is reported
        // as an unexpected exception.
        do_throw("Unexpected exception: " + e);
      }

      // Only the content type matters, so bail out of the request here;
      // onDataAvailable below treats any delivered data as a failure.
      throw Components.results.NS_ERROR_ABORT;
    },

    onDataAvailable: function test_ODA() {
      do_throw("Should not get any data!");
    },

    // Moves on to the next case: next header for the same body, wrapping
    // around to the first header of the next body.  When the bodies run
    // out the HTTP server is stopped instead of starting another case.
    onStopRequest: function test_onStopR(request, ctx, status) {
      headerIdx++;
      if (headerIdx == contentTypeHeaderList.length) {
        headerIdx = 0;
        bodyIdx++;
      }

      if (bodyIdx != bodyList.length) {
        doTest(headerIdx, bodyIdx);
      } else {
        httpserv.stop();
      }

      do_test_finished();
    }
  };
}
// Starts the asynchronous load for one (header, body) test case and tells
// the harness that one more asynchronous test is pending.
function doTest(headerIdx, bodyIdx) {
  makeChan(headerIdx, bodyIdx)
    .asyncOpen(makeListener(headerIdx, bodyIdx), null);
  do_test_pending();
}
// Fills in the HTTP response for one test case: sets the Content-Type
// header from contentTypeHeaderList[headerIdx] and writes the body string
// from bodyList[bodyIdx] to the response's output stream.  metadata is not
// used.
function createResponse(headerIdx, bodyIdx, metadata, response) {
  var headerValue = contentTypeHeaderList[headerIdx][0];
  response.setHeader("Content-Type", headerValue);

  var body = bodyList[bodyIdx][0];
  response.bodyOutputStream.write(body, body.length);
}
// Returns a path handler bound to one (header, body) test case.  The
// handler takes (metadata, response) and forwards them, together with the
// bound indices, to createResponse, returning whatever that returns.
function makeHandler(headerIdx, bodyIdx) {
  return function handlerClosure(metadata, response) {
    return createResponse(headerIdx, bodyIdx, metadata, response);
  };
}
// The HTTP server used by the test.  It is file-level because the stream
// listener stops it once the last test case has finished.
var httpserv;

// Test entry point: creates the server, registers one path handler per
// (header, body) combination under "/<headerIdx>/<bodyIdx>", starts the
// server on port 4444 and kicks off the first test case.  Later cases are
// chained from the listener's onStopRequest.
function run_test() {
  // Use function-local loop counters rather than relying on (and
  // clobbering) loop variables declared elsewhere at file level.
  var headerIdx;
  var bodyIdx;

  httpserv = new nsHttpServer();

  for (headerIdx = 0; headerIdx < contentTypeHeaderList.length; ++headerIdx) {
    for (bodyIdx = 0; bodyIdx < bodyList.length; ++bodyIdx) {
      httpserv.registerPathHandler("/" + headerIdx + "/" + bodyIdx,
                                   makeHandler(headerIdx, bodyIdx));
    }
  }

  httpserv.start(4444);
  doTest(0, 0);
}