/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=2 sw=2 et tw=80: */ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is mozilla.org HTML Sanitizer code. * * The Initial Developer of the Original Code is * Ben Bucksch . * Portions created by the Initial Developer are Copyright (C) 2002 * the Initial Developer. All Rights Reserved. * * Contributor(s): * Netscape * * Alternatively, the contents of this file may be used under the terms of * either of the GNU General Public License Version 2 or later (the "GPL"), * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ /* * A serializer and content sink that removes potentially insecure or * otherwise dangerous or offending HTML (eg for display of HTML * e-mail attachments or something). */ /* I used nsPlaintextSerializer as base for this class. I don't understand all of the functions in the beginning. Possible that I fail to do something or do something useless. I am not proud about the implementation here at all. Feel free to fix it :-). */ #include "mozSanitizingSerializer.h" #include "nsIServiceManager.h" #include "nsIDOMText.h" #include "nsIDOMElement.h" #include "nsTextFragment.h" #include "nsContentUtils.h" #include "nsReadableUtils.h" #include "plstr.h" #include "nsIProperties.h" #include "nsUnicharUtils.h" #include "nsIURI.h" #include "nsNetUtil.h" #include "nsEscape.h" //#define DEBUG_BenB static inline PRUnichar* escape(const nsString& source) { return nsEscapeHTML2(source.get(), source.Length()); } /* XXX: |printf|s in some error conditions. They are intended as information for the user, because they complain about malformed pref values. Not sure, if popping up dialog boxes is the right thing for such code (and if so, how to do it). */ #define TEXT_REMOVED "<Text removed>" #define TEXT_BREAKER "|" nsresult NS_NewSanitizingHTMLSerializer(nsIContentSerializer** aSerializer) { mozSanitizingHTMLSerializer* it = new mozSanitizingHTMLSerializer(); if (!it) { return NS_ERROR_OUT_OF_MEMORY; } NS_ADDREF(it); *aSerializer = it; return NS_OK; } mozSanitizingHTMLSerializer::mozSanitizingHTMLSerializer() : mSkipLevel(0), mAllowedTags(30) // Just some initial buffer size { mOutputString = nsnull; } mozSanitizingHTMLSerializer::~mozSanitizingHTMLSerializer() { #ifdef DEBUG_BenB printf("Output:\n%s\n", NS_LossyConvertUTF16toASCII(*mOutputString).get()); #endif mAllowedTags.Enumerate(ReleaseProperties); } // PRBool mozSanitizingHTMLSerializer::ReleaseProperties(nsHashKey* key, void* data, void* closure) { nsIProperties* prop = (nsIProperties*)data; NS_IF_RELEASE(prop); return PR_TRUE; } // NS_IMPL_ISUPPORTS4(mozSanitizingHTMLSerializer, nsIContentSerializer, nsIContentSink, nsIHTMLContentSink, mozISanitizingHTMLSerializer) NS_IMETHODIMP mozSanitizingHTMLSerializer::Init(PRUint32 aFlags, PRUint32 dummy, const char* aCharSet, PRBool aIsCopying, PRBool aIsWholeDocument) { NS_ENSURE_TRUE(nsContentUtils::GetParserService(), NS_ERROR_UNEXPECTED); return NS_OK; } NS_IMETHODIMP mozSanitizingHTMLSerializer::Initialize(nsAString* aOutString, PRUint32 aFlags, const nsAString& allowedTags) { nsresult rv = Init(aFlags, 0, nsnull, PR_FALSE, PR_FALSE); NS_ENSURE_SUCCESS(rv, rv); // XXX This is wrong. It violates XPCOM string ownership rules. // We're only getting away with this because instances of this // class are restricted to single function scope. // (Comment copied from nsPlaintextSerializer) mOutputString = aOutString; ParsePrefs(allowedTags); return NS_OK; } // This is not used within the class, but maybe called from somewhere else? NS_IMETHODIMP mozSanitizingHTMLSerializer::Flush(nsAString& aStr) { #ifdef DEBUG_BenB printf("Flush: -%s-", NS_LossyConvertUTF16toASCII(aStr).get()); #endif Write(aStr); return NS_OK; } NS_IMETHODIMP mozSanitizingHTMLSerializer::AppendDocumentStart(nsIDOMDocument *aDocument, nsAString& aStr) { return NS_OK; } void mozSanitizingHTMLSerializer::Write(const nsAString& aString) { mOutputString->Append(aString); } NS_IMETHODIMP mozSanitizingHTMLSerializer::IsEnabled(PRInt32 aTag, PRBool* aReturn) { *aReturn = PR_FALSE; return NS_OK; } /** * Returns true, if the id represents a container */ PRBool mozSanitizingHTMLSerializer::IsContainer(PRInt32 aId) { PRBool isContainer = PR_FALSE; nsIParserService* parserService = nsContentUtils::GetParserService(); if (parserService) { parserService->IsContainer(aId, isContainer); } return isContainer; } /* XXX I don't really know, what these functions do, but they seem to be needed ;-). Mostly copied from nsPlaintextSerializer. */ /* akk says: "I wonder if the sanitizing class could inherit from nsHTMLSerializer, so that at least these methods that none of us understand only have to be written once?" */ // static PRInt32 mozSanitizingHTMLSerializer::GetIdForContent(nsIContent* aContent) { if (!aContent->IsHTML()) { return eHTMLTag_unknown; } nsIParserService* parserService = nsContentUtils::GetParserService(); return parserService ? parserService->HTMLAtomTagToId(aContent->Tag()) : eHTMLTag_unknown; } NS_IMETHODIMP mozSanitizingHTMLSerializer::AppendText(nsIDOMText* aText, PRInt32 aStartOffset, PRInt32 aEndOffset, nsAString& aStr) { nsresult rv = NS_OK; mOutputString = &aStr; nsAutoString linebuffer; rv = DoAddLeaf(eHTMLTag_text, linebuffer); return rv; } NS_IMETHODIMP mozSanitizingHTMLSerializer::AppendElementStart(nsIDOMElement *aElement, nsIDOMElement *aOriginalElement, nsAString& aStr) { NS_ENSURE_ARG(aElement); mContent = do_QueryInterface(aElement); NS_ENSURE_TRUE(mContent, NS_ERROR_FAILURE); mOutputString = &aStr; PRInt32 id = GetIdForContent(mContent); PRBool isContainer = IsContainer(id); nsresult rv; if (isContainer) { rv = DoOpenContainer(id); } else { rv = DoAddLeaf(id, EmptyString()); } mContent = 0; mOutputString = nsnull; return rv; } NS_IMETHODIMP mozSanitizingHTMLSerializer::AppendElementEnd(nsIDOMElement *aElement, nsAString& aStr) { NS_ENSURE_ARG(aElement); mContent = do_QueryInterface(aElement); NS_ENSURE_TRUE(mContent, NS_ERROR_FAILURE); mOutputString = &aStr; nsresult rv = NS_OK; PRInt32 id = GetIdForContent(mContent); PRBool isContainer = IsContainer(id); if (isContainer) { rv = DoCloseContainer(id); } mContent = 0; mOutputString = nsnull; return rv; } NS_IMETHODIMP mozSanitizingHTMLSerializer::OpenContainer(const nsIParserNode& aNode) { PRInt32 type = aNode.GetNodeType(); mParserNode = const_cast(&aNode); return DoOpenContainer(type); } NS_IMETHODIMP mozSanitizingHTMLSerializer::CloseContainer(const nsHTMLTag aTag) { return DoCloseContainer(aTag); } NS_IMETHODIMP mozSanitizingHTMLSerializer::AddLeaf(const nsIParserNode& aNode) { eHTMLTags type = (eHTMLTags)aNode.GetNodeType(); const nsAString& text = aNode.GetText(); mParserNode = const_cast(&aNode); return DoAddLeaf(type, text); } NS_IMETHODIMP mozSanitizingHTMLSerializer::AddDocTypeDecl(const nsIParserNode& aNode) { return NS_OK; } NS_IMETHODIMP mozSanitizingHTMLSerializer::SetDocumentCharset(nsACString& aCharset) { // No idea, if this works - it isn't invoked by |TestOutput|. Write(NS_LITERAL_STRING("\n\n")); return NS_OK; } NS_IMETHODIMP mozSanitizingHTMLSerializer::OpenHead() { // XXX We don't have a parser node here, is it okay to ignore this? // return OpenContainer(aNode); return NS_OK; } // Here comes the actual code... nsresult mozSanitizingHTMLSerializer::DoOpenContainer(PRInt32 aTag) { eHTMLTags type = (eHTMLTags)aTag; if (mSkipLevel == 0 && IsAllowedTag(type)) { nsIParserService* parserService = nsContentUtils::GetParserService(); if (!parserService) return NS_ERROR_OUT_OF_MEMORY; const PRUnichar* tag_name = parserService->HTMLIdToStringTag(aTag); NS_ENSURE_TRUE(tag_name, NS_ERROR_INVALID_POINTER); Write(NS_LITERAL_STRING("<") + nsDependentString(tag_name)); // Attributes if (mParserNode) { PRInt32 count = mParserNode->GetAttributeCount(); for (PRInt32 i = 0; i < count; i++) { const nsAString& key = mParserNode->GetKeyAt(i); if(IsAllowedAttribute(type, key)) { // Ensure basic sanity of value nsAutoString value(mParserNode->GetValueAt(i)); // SanitizeAttrValue() modifies |value| if (NS_SUCCEEDED(SanitizeAttrValue(type, key, value))) { // Write out Write(NS_LITERAL_STRING(" ")); Write(key); // I get an infinive loop with | + key + | !!! Write(NS_LITERAL_STRING("=\"") + value + NS_LITERAL_STRING("\"")); } } } } Write(NS_LITERAL_STRING(">")); } else if (mSkipLevel != 0 || type == eHTMLTag_script || type == eHTMLTag_style) ++mSkipLevel; else Write(NS_LITERAL_STRING(" ")); return NS_OK; } nsresult mozSanitizingHTMLSerializer::DoCloseContainer(PRInt32 aTag) { eHTMLTags type = (eHTMLTags)aTag; if (mSkipLevel == 0 && IsAllowedTag(type)) { nsIParserService* parserService = nsContentUtils::GetParserService(); if (!parserService) return NS_ERROR_OUT_OF_MEMORY; const PRUnichar* tag_name = parserService->HTMLIdToStringTag(aTag); NS_ENSURE_TRUE(tag_name, NS_ERROR_INVALID_POINTER); Write(NS_LITERAL_STRING("")); } else if (mSkipLevel == 0) Write(NS_LITERAL_STRING(" ")); else --mSkipLevel; return NS_OK; } nsresult mozSanitizingHTMLSerializer::DoAddLeaf(PRInt32 aTag, const nsAString& aText) { if (mSkipLevel != 0) return NS_OK; eHTMLTags type = (eHTMLTags)aTag; nsresult rv = NS_OK; if (type == eHTMLTag_whitespace || type == eHTMLTag_newline) { Write(aText); // sure to be safe? } else if (type == eHTMLTag_text) { nsAutoString text(aText); if(NS_SUCCEEDED(SanitizeTextNode(text))) Write(text); else Write(NS_LITERAL_STRING(TEXT_REMOVED)); // Does not happen (yet) NS_ENSURE_SUCCESS(rv, rv); } else if (type == eHTMLTag_entity) { Write(NS_LITERAL_STRING("&")); Write(aText); // sure to be safe? // using + operator here might give an infinitive loop, see above. // not adding ";", because Gecko delivers that as part of |aText| (freaky) } else { DoOpenContainer(type); } return rv; } /** Similar to SanitizeAttrValue. */ nsresult mozSanitizingHTMLSerializer::SanitizeTextNode(nsString& aText /*inout*/) { aText.Adopt(escape(aText)); return NS_OK; } /** Ensures basic sanity of attribute value. This function also (tries to :-( ) makes sure, that no unwanted / dangerous URLs appear in the document (like javascript: and data:). Pass the value as |aValue| arg. It will be modified in-place. If the value is not allowed at all, we return with NS_ERROR_ILLEGAL_VALUE. In that case, do not use the |aValue|, but output nothing. */ nsresult mozSanitizingHTMLSerializer::SanitizeAttrValue(nsHTMLTag aTag, const nsAString& anAttrName, nsString& aValue /*inout*/) { /* First, cut the attribute to 1000 chars. Attributes with values longer than 1000 chars seem bogus, considering that we don't support any JS. The longest attributes I can think of are URLs, and URLs with 1000 chars are likely to be bogus, too. */ aValue = Substring(aValue, 0, 1000); //aValue.Truncate(1000); //-- this cuts half of the document !!?!! aValue.Adopt(escape(aValue)); /* Check some known bad stuff. Add more! I don't care too much, if it happens to trigger in some innocent cases (like Statistical data: Mortage rates and newspapers) - security first. */ if (aValue.Find("javascript:") != kNotFound || aValue.Find("data:") != kNotFound || aValue.Find("base64") != kNotFound) return NS_ERROR_ILLEGAL_VALUE; // Check img src scheme if (aTag == eHTMLTag_img && anAttrName.LowerCaseEqualsLiteral("src")) { nsresult rv; nsCOMPtr ioService = do_GetIOService(&rv); NS_ENSURE_SUCCESS(rv, rv); nsCAutoString scheme; rv = ioService->ExtractScheme(NS_LossyConvertUTF16toASCII(aValue), scheme); NS_ENSURE_SUCCESS(rv, rv); if (!scheme.Equals("cid", nsCaseInsensitiveCStringComparator())) return NS_ERROR_ILLEGAL_VALUE; } #ifdef DEBUG_BenB printf("attribute value for %s: -%s-\n", NS_LossyConvertUTF16toASCII(anAttrName).get(), NS_LossyConvertUTF16toASCII(aValue).get()); #endif return NS_OK; } /** */ PRBool mozSanitizingHTMLSerializer::IsAllowedTag(nsHTMLTag aTag) { nsPRUint32Key tag_key(aTag); #ifdef DEBUG_BenB printf("IsAllowedTag %d: %s\n", aTag, mAllowedTags.Exists(&tag_key)?"yes":"no"); #endif return mAllowedTags.Exists(&tag_key); } /** */ PRBool mozSanitizingHTMLSerializer::IsAllowedAttribute(nsHTMLTag aTag, const nsAString& anAttributeName) { #ifdef DEBUG_BenB printf("IsAllowedAttribute %d, -%s-\n", aTag, NS_LossyConvertUTF16toASCII(anAttributeName).get()); #endif nsresult rv; nsPRUint32Key tag_key(aTag); nsIProperties* attr_bag = (nsIProperties*)mAllowedTags.Get(&tag_key); NS_ENSURE_TRUE(attr_bag, PR_FALSE); PRBool allowed; nsAutoString attr(anAttributeName); ToLowerCase(attr); rv = attr_bag->Has(NS_LossyConvertUTF16toASCII(attr).get(), &allowed); if (NS_FAILED(rv)) return PR_FALSE; #ifdef DEBUG_BenB printf(" Allowed: %s\n", allowed?"yes":"no"); #endif return allowed; } /** aPref is a long string, which holds an exhaustive list of allowed tags and attributes. All other tags and attributes will be removed. aPref has the format "html head body ul ol li a(href,name,title) img(src,alt,title) #text" i.e. - tags are separated by whitespace - the attribute list follows the tag directly in brackets - the attributes are separated by commas. There is no way to express further restrictions, like "no text inside the element". This is so to considerably reduce the complexity of the pref and this implementation. Update: Akk told me that I might be able to use DTD classes. Later(TM)... */ nsresult mozSanitizingHTMLSerializer::ParsePrefs(const nsAString& aPref) { char* pref = ToNewCString(aPref); char* tags_lasts; for (char* iTag = PL_strtok_r(pref, " ", &tags_lasts); iTag; iTag = PL_strtok_r(NULL, " ", &tags_lasts)) { ParseTagPref(nsCAutoString(iTag)); } delete[] pref; return NS_OK; } /** Parses e.g. "a(href,title)" (but not several tags at once). */ nsresult mozSanitizingHTMLSerializer::ParseTagPref(const nsCAutoString& tagpref) { nsIParserService* parserService = nsContentUtils::GetParserService(); if (!parserService) return NS_ERROR_OUT_OF_MEMORY; // Parsing tag PRInt32 bracket = tagpref.FindChar('('); if (bracket == 0) { printf(" malformed pref: %s\n", tagpref.get()); return NS_ERROR_CANNOT_CONVERT_DATA; } nsAutoString tag; CopyUTF8toUTF16(StringHead(tagpref, bracket), tag); // Create key PRInt32 tag_id = parserService->HTMLStringTagToId(tag); if (tag_id == eHTMLTag_userdefined) { printf(" unknown tag <%s>, won't add.\n", NS_ConvertUTF16toUTF8(tag).get()); return NS_ERROR_CANNOT_CONVERT_DATA; } nsPRUint32Key tag_key(tag_id); if (mAllowedTags.Exists(&tag_key)) { printf(" duplicate tag: %s\n", NS_ConvertUTF16toUTF8(tag).get()); return NS_ERROR_CANNOT_CONVERT_DATA; } if (bracket == kNotFound) /* There are no attributes in the pref. So, allow none; only the tag itself */ { mAllowedTags.Put(&tag_key, 0); } else { // Attributes // where is the macro for non-fatal errors in opt builds? if(tagpref[tagpref.Length() - 1] != ')' || tagpref.Length() < PRUint32(bracket) + 3) { printf(" malformed pref: %s\n", tagpref.get()); return NS_ERROR_CANNOT_CONVERT_DATA; } nsCOMPtr attr_bag = do_CreateInstance(NS_PROPERTIES_CONTRACTID); NS_ENSURE_TRUE(attr_bag, NS_ERROR_INVALID_POINTER); nsCAutoString attrList; attrList.Append(Substring(tagpref, bracket + 1, tagpref.Length() - 2 - bracket)); char* attrs_lasts; for (char* iAttr = PL_strtok_r(attrList.BeginWriting(), ",", &attrs_lasts); iAttr; iAttr = PL_strtok_r(NULL, ",", &attrs_lasts)) { attr_bag->Set(iAttr, 0); } nsIProperties* attr_bag_raw = attr_bag; NS_ADDREF(attr_bag_raw); mAllowedTags.Put(&tag_key, attr_bag_raw); } return NS_OK; } /* might be useful: htmlparser/public/nsHTMLTokens.h for tag categories */