gecko/netwerk/streamconv/converters/nsUnknownDecoder.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnknownDecoder.h"
#include "nsIServiceManager.h"
#include "nsIStreamConverterService.h"

#include "nsIPipe.h"
#include "nsIInputStream.h"
#include "nsIOutputStream.h"
#include "nsMimeTypes.h"
#include "netCore.h"
#include "nsXPIDLString.h"
#include "nsIPrefService.h"
#include "nsIPrefBranch.h"
#include "nsICategoryManager.h"
#include "nsISupportsPrimitives.h"
#include "nsIContentSniffer.h"

#include "nsCRT.h"

#include "nsIMIMEService.h"

#include "nsIViewSourceChannel.h"
#include "nsIHttpChannel.h"
#include "nsNetCID.h"


#define MAX_BUFFER_SIZE 512

nsUnknownDecoder::nsUnknownDecoder()
  : mBuffer(nsnull)
  , mBufferLen(0)
  , mRequireHTMLsuffix(false)
{
  nsCOMPtr<nsIPrefBranch> prefs = do_GetService(NS_PREFSERVICE_CONTRACTID);
  if (prefs) {
    bool val;
    if (NS_SUCCEEDED(prefs->GetBoolPref("security.requireHTMLsuffix", &val)))
      mRequireHTMLsuffix = val;
  }
}

nsUnknownDecoder::~nsUnknownDecoder()
{
  if (mBuffer) {
    delete [] mBuffer;
    mBuffer = nsnull;
  }
}

// ----
//
// nsISupports implementation...
//
// ----

NS_IMPL_ADDREF(nsUnknownDecoder)
NS_IMPL_RELEASE(nsUnknownDecoder)

NS_INTERFACE_MAP_BEGIN(nsUnknownDecoder)
   NS_INTERFACE_MAP_ENTRY(nsIStreamConverter)
   NS_INTERFACE_MAP_ENTRY(nsIStreamListener)
   NS_INTERFACE_MAP_ENTRY(nsIRequestObserver)
   NS_INTERFACE_MAP_ENTRY(nsIContentSniffer)
   NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIStreamListener)
NS_INTERFACE_MAP_END


// ----
//
// nsIStreamConverter methods...
//
// ----

NS_IMETHODIMP
nsUnknownDecoder::Convert(nsIInputStream *aFromStream,
                          const char *aFromType,
                          const char *aToType,
                          nsISupports *aCtxt,
                          nsIInputStream **aResultStream)
{
  return NS_ERROR_NOT_IMPLEMENTED;
}

NS_IMETHODIMP
nsUnknownDecoder::AsyncConvertData(const char *aFromType,
                                   const char *aToType,
                                   nsIStreamListener *aListener,
                                   nsISupports *aCtxt)
{
  NS_ASSERTION(aListener && aFromType && aToType,
               "null pointer passed into multi mixed converter");
  // hook up our final listener. this guy gets the various On*() calls we want to throw
  // at him.
  //
  mNextListener = aListener;
  return (aListener) ? NS_OK : NS_ERROR_FAILURE;
}

// ----
//
// nsIStreamListener methods...
//
// ----

NS_IMETHODIMP
nsUnknownDecoder::OnDataAvailable(nsIRequest* request,
                                  nsISupports *aCtxt,
                                  nsIInputStream *aStream,
                                  PRUint32 aSourceOffset,
                                  PRUint32 aCount)
{
  nsresult rv = NS_OK;

  if (!mNextListener) return NS_ERROR_FAILURE;

  if (mContentType.IsEmpty()) {
    PRUint32 count, len;

    // If the buffer has not been allocated by now, just fail...
    if (!mBuffer) return NS_ERROR_OUT_OF_MEMORY;

    //
    // Determine how much of the stream should be read to fill up the
    // sniffer buffer...
    //
    if (mBufferLen + aCount >= MAX_BUFFER_SIZE) {
      count = MAX_BUFFER_SIZE-mBufferLen;
    } else {
      count = aCount;
    }

    // Read the data into the buffer...
    rv = aStream->Read((mBuffer+mBufferLen), count, &len);
    if (NS_FAILED(rv)) return rv;

    mBufferLen += len;
    aCount     -= len;

    if (aCount) {
      //
      // Adjust the source offset...  The call to FireListenerNotifications(...)
      // will make the first OnDataAvailable(...) call with an offset of 0.
      // So, this offset needs to be adjusted to reflect that...
      //
      aSourceOffset += mBufferLen;

      DetermineContentType(request);

      rv = FireListenerNotifications(request, aCtxt);
    }
  }

  // Must not fire ODA again if it failed once
  if (aCount && NS_SUCCEEDED(rv)) {
    NS_ASSERTION(!mContentType.IsEmpty(),
                 "Content type should be known by now.");

    rv = mNextListener->OnDataAvailable(request, aCtxt, aStream,
                                        aSourceOffset, aCount);
  }

  return rv;
}

// ----
//
// nsIRequestObserver methods...
//
// ----

NS_IMETHODIMP
nsUnknownDecoder::OnStartRequest(nsIRequest* request, nsISupports *aCtxt)
{
  nsresult rv = NS_OK;

  if (!mNextListener) return NS_ERROR_FAILURE;

  // Allocate the sniffer buffer...
  if (NS_SUCCEEDED(rv) && !mBuffer) {
    mBuffer = new char[MAX_BUFFER_SIZE];

    if (!mBuffer) {
      rv = NS_ERROR_OUT_OF_MEMORY;
    }
  }

  // Do not pass the OnStartRequest on to the next listener (yet)...
  return rv;
}

NS_IMETHODIMP
nsUnknownDecoder::OnStopRequest(nsIRequest* request, nsISupports *aCtxt,
                                nsresult aStatus)
{
  nsresult rv = NS_OK;

  if (!mNextListener) return NS_ERROR_FAILURE;

  //
  // The total amount of data is less than the size of the sniffer buffer.
  // Analyze the buffer now...
  //
  if (mContentType.IsEmpty()) {
    DetermineContentType(request);

    rv = FireListenerNotifications(request, aCtxt);

    if (NS_FAILED(rv)) {
      aStatus = rv;
    }
  }

  rv = mNextListener->OnStopRequest(request, aCtxt, aStatus);
  mNextListener = 0;

  return rv;
}

// ----
//
// nsIContentSniffer methods...
//
// ----
NS_IMETHODIMP
nsUnknownDecoder::GetMIMETypeFromContent(nsIRequest* aRequest,
                                         const PRUint8* aData,
                                         PRUint32 aLength,
                                         nsACString& type)
{
  mBuffer = const_cast<char*>(reinterpret_cast<const char*>(aData));
  mBufferLen = aLength;
  DetermineContentType(aRequest);
  mBuffer = nsnull;
  mBufferLen = 0;
  type.Assign(mContentType);
  mContentType.Truncate();
  return type.IsEmpty() ? NS_ERROR_NOT_AVAILABLE : NS_OK;
}


// Actual sniffing code

bool nsUnknownDecoder::AllowSniffing(nsIRequest* aRequest)
{
  if (!mRequireHTMLsuffix) {
    return true;
  }

  nsCOMPtr<nsIChannel> channel = do_QueryInterface(aRequest);
  if (!channel) {
    NS_ERROR("QI failed");
    return false;
  }

  nsCOMPtr<nsIURI> uri;
  if (NS_FAILED(channel->GetURI(getter_AddRefs(uri))) || !uri) {
    return false;
  }

  bool isLocalFile = false;
  if (NS_FAILED(uri->SchemeIs("file", &isLocalFile)) || isLocalFile) {
    return false;
  }

  return true;
}

/**
 * This is the array of sniffer entries that depend on "magic numbers"
 * in the file.  Each entry has either a type associated with it (set
 * these with the SNIFFER_ENTRY macro) or a function to be executed
 * (set these with the SNIFFER_ENTRY_WITH_FUNC macro).  The function
 * should take a single nsIRequest* and returns bool -- true if
 * it sets mContentType, false otherwise
 */
nsUnknownDecoder::nsSnifferEntry nsUnknownDecoder::sSnifferEntries[] = {
  SNIFFER_ENTRY("%PDF-", APPLICATION_PDF),

  SNIFFER_ENTRY("%!PS-Adobe-", APPLICATION_POSTSCRIPT),

  // Files that start with mailbox delimiters let's provisionally call
  // text/plain
  SNIFFER_ENTRY("From", TEXT_PLAIN),
  SNIFFER_ENTRY(">From", TEXT_PLAIN),

  // If the buffer begins with "#!" or "%!" then it is a script of
  // some sort...  "Scripts" can include arbitrary data to be passed
  // to an interpreter, so we need to decide whether we can call this
  // text or whether it's data.
  SNIFFER_ENTRY_WITH_FUNC("#!", &nsUnknownDecoder::LastDitchSniff),

  // XXXbz should (and can) we also include the various ways that <?xml can
  // appear as UTF-16 and such?  See http://www.w3.org/TR/REC-xml#sec-guessing
  SNIFFER_ENTRY_WITH_FUNC("<?xml", &nsUnknownDecoder::SniffForXML)
};

PRUint32 nsUnknownDecoder::sSnifferEntryNum =
  sizeof(nsUnknownDecoder::sSnifferEntries) /
    sizeof(nsUnknownDecoder::nsSnifferEntry);

void nsUnknownDecoder::DetermineContentType(nsIRequest* aRequest)
{
  NS_ASSERTION(mContentType.IsEmpty(), "Content type is already known.");
  if (!mContentType.IsEmpty()) return;

  // First, run through all the types we can detect reliably based on
  // magic numbers
  PRUint32 i;
  for (i = 0; i < sSnifferEntryNum; ++i) {
    if (mBufferLen >= sSnifferEntries[i].mByteLen &&  // enough data
        memcmp(mBuffer, sSnifferEntries[i].mBytes, sSnifferEntries[i].mByteLen) == 0) {  // and type matches
      NS_ASSERTION(sSnifferEntries[i].mMimeType ||
                   sSnifferEntries[i].mContentTypeSniffer,
                   "Must have either a type string or a function to set the type");
      NS_ASSERTION(sSnifferEntries[i].mMimeType == nsnull ||
                   sSnifferEntries[i].mContentTypeSniffer == nsnull,
                   "Both a type string and a type sniffing function set;"
                   " using type string");
      if (sSnifferEntries[i].mMimeType) {
        mContentType = sSnifferEntries[i].mMimeType;
        NS_ASSERTION(!mContentType.IsEmpty(),
                     "Content type should be known by now.");
        return;
      }
      if ((this->*(sSnifferEntries[i].mContentTypeSniffer))(aRequest)) {
        NS_ASSERTION(!mContentType.IsEmpty(),
                     "Content type should be known by now.");
        return;
      }
    }
  }

  if (TryContentSniffers(aRequest)) {
    NS_ASSERTION(!mContentType.IsEmpty(),
                 "Content type should be known by now.");
    return;
  }

  if (SniffForHTML(aRequest)) {
    NS_ASSERTION(!mContentType.IsEmpty(),
                 "Content type should be known by now.");
    return;
  }

  // We don't know what this is yet.  Before we just give up, try
  // the URI from the request.
  if (SniffURI(aRequest)) {
    NS_ASSERTION(!mContentType.IsEmpty(),
                 "Content type should be known by now.");
    return;
  }

  LastDitchSniff(aRequest);
  NS_ASSERTION(!mContentType.IsEmpty(),
               "Content type should be known by now.");
}

bool nsUnknownDecoder::TryContentSniffers(nsIRequest* aRequest)
{
  // Enumerate content sniffers
  nsCOMPtr<nsICategoryManager> catMan(do_GetService("@mozilla.org/categorymanager;1"));
  if (!catMan) {
    return false;
  }

  nsCOMPtr<nsISimpleEnumerator> sniffers;
  catMan->EnumerateCategory("content-sniffing-services", getter_AddRefs(sniffers));
  if (!sniffers) {
    return false;
  }

  bool hasMore;
  while (NS_SUCCEEDED(sniffers->HasMoreElements(&hasMore)) && hasMore) {
    nsCOMPtr<nsISupports> elem;
    sniffers->GetNext(getter_AddRefs(elem));
    NS_ASSERTION(elem, "No element even though hasMore returned true!?");

    nsCOMPtr<nsISupportsCString> sniffer_id(do_QueryInterface(elem));
    NS_ASSERTION(sniffer_id, "element is no nsISupportsCString!?");
    nsCAutoString contractid;
    nsresult rv = sniffer_id->GetData(contractid);
    if (NS_FAILED(rv)) {
      continue;
    }

    nsCOMPtr<nsIContentSniffer> sniffer(do_GetService(contractid.get()));
    if (!sniffer) {
      continue;
    }

    rv = sniffer->GetMIMETypeFromContent(aRequest, (const PRUint8*)mBuffer,
                                         mBufferLen, mContentType);
    if (NS_SUCCEEDED(rv)) {
      return true;
    }
  }

  return false;
}

bool nsUnknownDecoder::SniffForHTML(nsIRequest* aRequest)
{
  /*
   * To prevent a possible attack, we will not consider this to be
   * html content if it comes from the local file system and our prefs
   * are set right
   */
  if (!AllowSniffing(aRequest)) {
    return false;
  }

  // Now look for HTML.
  const char* str = mBuffer;
  const char* end = mBuffer + mBufferLen;

  // skip leading whitespace
  while (str != end && nsCRT::IsAsciiSpace(*str)) {
    ++str;
  }

  // did we find something like a start tag?
  if (str == end || *str != '<' || ++str == end) {
    return false;
  }

  // If we seem to be SGML or XML and we got down here, just pretend we're HTML
  if (*str == '!' || *str == '?') {
    mContentType = TEXT_HTML;
    return true;
  }

  PRUint32 bufSize = end - str;
  // We use sizeof(_tagstr) below because that's the length of _tagstr
  // with the one char " " or ">" appended.
#define MATCHES_TAG(_tagstr)                                              \
  (bufSize >= sizeof(_tagstr) &&                                          \
   (PL_strncasecmp(str, _tagstr " ", sizeof(_tagstr)) == 0 ||             \
    PL_strncasecmp(str, _tagstr ">", sizeof(_tagstr)) == 0))

  if (MATCHES_TAG("html")     ||
      MATCHES_TAG("frameset") ||
      MATCHES_TAG("body")     ||
      MATCHES_TAG("head")     ||
      MATCHES_TAG("script")   ||
      MATCHES_TAG("iframe")   ||
      MATCHES_TAG("a")        ||
      MATCHES_TAG("img")      ||
      MATCHES_TAG("table")    ||
      MATCHES_TAG("title")    ||
      MATCHES_TAG("link")     ||
      MATCHES_TAG("base")     ||
      MATCHES_TAG("style")    ||
      MATCHES_TAG("div")      ||
      MATCHES_TAG("p")        ||
      MATCHES_TAG("font")     ||
      MATCHES_TAG("applet")   ||
      MATCHES_TAG("meta")     ||
      MATCHES_TAG("center")   ||
      MATCHES_TAG("form")     ||
      MATCHES_TAG("isindex")  ||
      MATCHES_TAG("h1")       ||
      MATCHES_TAG("h2")       ||
      MATCHES_TAG("h3")       ||
      MATCHES_TAG("h4")       ||
      MATCHES_TAG("h5")       ||
      MATCHES_TAG("h6")       ||
      MATCHES_TAG("b")        ||
      MATCHES_TAG("pre")) {

    mContentType = TEXT_HTML;
    return true;
  }

#undef MATCHES_TAG

  return false;
}

bool nsUnknownDecoder::SniffForXML(nsIRequest* aRequest)
{
  // Just like HTML, this should be able to be shut off.
  if (!AllowSniffing(aRequest)) {
    return false;
  }

  // First see whether we can glean anything from the uri...
  if (!SniffURI(aRequest)) {
    // Oh well; just generic XML will have to do
    mContentType = TEXT_XML;
  }

  return true;
}

bool nsUnknownDecoder::SniffURI(nsIRequest* aRequest)
{
  nsCOMPtr<nsIMIMEService> mimeService(do_GetService("@mozilla.org/mime;1"));
  if (mimeService) {
    nsCOMPtr<nsIChannel> channel = do_QueryInterface(aRequest);
    if (channel) {
      nsCOMPtr<nsIURI> uri;
      nsresult result = channel->GetURI(getter_AddRefs(uri));
      if (NS_SUCCEEDED(result) && uri) {
        nsCAutoString type;
        result = mimeService->GetTypeFromURI(uri, type);
        if (NS_SUCCEEDED(result)) {
          mContentType = type;
          return true;
        }
      }
    }
  }

  return false;
}

// This macro is based on RFC 2046 Section 4.1.2.  Treat any char 0-31
// except the 9-13 range (\t, \n, \v, \f, \r) and char 27 (used by
// encodings like Shift_JIS) as non-text
#define IS_TEXT_CHAR(ch)                                     \
  (((unsigned char)(ch)) > 31 || (9 <= (ch) && (ch) <= 13) || (ch) == 27)

bool nsUnknownDecoder::LastDitchSniff(nsIRequest* aRequest)
{
  // All we can do now is try to guess whether this is text/plain or
  // application/octet-stream

  // First, check for a BOM.  If we see one, assume this is text/plain
  // in whatever encoding.  If there is a BOM _and_ text we will
  // always have at least 4 bytes in the buffer (since the 2-byte BOMs
  // are for 2-byte encodings and the UTF-8 BOM is 3 bytes).
  if (mBufferLen >= 4) {
    const unsigned char* buf = (const unsigned char*)mBuffer;
    if ((buf[0] == 0xFE && buf[1] == 0xFF) || // UTF-16, Big Endian
        (buf[0] == 0xFF && buf[1] == 0xFE) || // UTF-16 or UCS-4, Little Endian
        (buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) || // UTF-8
        (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF)) { // UCS-4, Big Endian

      mContentType = TEXT_PLAIN;
      return true;
    }
  }

  // Now see whether the buffer has any non-text chars.  If not, then let's
  // just call it text/plain...
  //
  PRUint32 i;
  for (i=0; i<mBufferLen && IS_TEXT_CHAR(mBuffer[i]); i++);

  if (i == mBufferLen) {
    mContentType = TEXT_PLAIN;
  }
  else {
    mContentType = APPLICATION_OCTET_STREAM;
  }

  return true;
}


nsresult nsUnknownDecoder::FireListenerNotifications(nsIRequest* request,
                                                     nsISupports *aCtxt)
{
  nsresult rv = NS_OK;

  if (!mNextListener) return NS_ERROR_FAILURE;

  if (!mContentType.IsEmpty()) {
    nsCOMPtr<nsIViewSourceChannel> viewSourceChannel =
      do_QueryInterface(request);
    if (viewSourceChannel) {
      rv = viewSourceChannel->SetOriginalContentType(mContentType);
    } else {
      nsCOMPtr<nsIChannel> channel = do_QueryInterface(request, &rv);
      if (NS_SUCCEEDED(rv)) {
        // Set the new content type on the channel...
        rv = channel->SetContentType(mContentType);
      }
    }

    NS_ASSERTION(NS_SUCCEEDED(rv), "Unable to set content type on channel!");

    if (NS_FAILED(rv)) {
      // Cancel the request to make sure it has the correct status if
      // mNextListener looks at it.
      request->Cancel(rv);
      mNextListener->OnStartRequest(request, aCtxt);
      return rv;
    }
  }

  // Fire the OnStartRequest(...)
  rv = mNextListener->OnStartRequest(request, aCtxt);

  if (!mBuffer) return NS_ERROR_OUT_OF_MEMORY;

  // If the request was canceled, then we need to treat that equivalently
  // to an error returned by OnStartRequest.
  if (NS_SUCCEEDED(rv))
    request->GetStatus(&rv);

  // Fire the first OnDataAvailable for the data that was read from the
  // stream into the sniffer buffer...
  if (NS_SUCCEEDED(rv) && (mBufferLen > 0)) {
    PRUint32 len = 0;
    nsCOMPtr<nsIInputStream> in;
    nsCOMPtr<nsIOutputStream> out;

    // Create a pipe and fill it with the data from the sniffer buffer.
    rv = NS_NewPipe(getter_AddRefs(in), getter_AddRefs(out),
                    MAX_BUFFER_SIZE, MAX_BUFFER_SIZE);

    if (NS_SUCCEEDED(rv)) {
      rv = out->Write(mBuffer, mBufferLen, &len);
      if (NS_SUCCEEDED(rv)) {
        if (len == mBufferLen) {
          rv = mNextListener->OnDataAvailable(request, aCtxt, in, 0, len);
        } else {
          NS_ERROR("Unable to write all the data into the pipe.");
          rv = NS_ERROR_FAILURE;
        }
      }
    }
  }

  delete [] mBuffer;
  mBuffer = nsnull;
  mBufferLen = 0;

  return rv;
}

void
nsBinaryDetector::DetermineContentType(nsIRequest* aRequest)
{
  nsCOMPtr<nsIHttpChannel> httpChannel = do_QueryInterface(aRequest);
  if (!httpChannel) {
    return;
  }

  // It's an HTTP channel.  Check for the text/plain mess
  nsCAutoString contentTypeHdr;
  httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Type"),
                                 contentTypeHdr);
  nsCAutoString contentType;
  httpChannel->GetContentType(contentType);

  // Make sure to do a case-sensitive exact match comparison here.  Apache
  // 1.x just sends text/plain for "unknown", while Apache 2.x sends
  // text/plain with a ISO-8859-1 charset.  Debian's Apache version, just to
  // be different, sends text/plain with iso-8859-1 charset.  For extra fun,
  // FC7, RHEL4, and Ubuntu Feisty send charset=UTF-8.  Don't do general
  // case-insensitive comparison, since we really want to apply this crap as
  // rarely as we can.
  if (!contentType.EqualsLiteral("text/plain") ||
      (!contentTypeHdr.EqualsLiteral("text/plain") &&
       !contentTypeHdr.EqualsLiteral("text/plain; charset=ISO-8859-1") &&
       !contentTypeHdr.EqualsLiteral("text/plain; charset=iso-8859-1") &&
       !contentTypeHdr.EqualsLiteral("text/plain; charset=UTF-8"))) {
    return;
  }

  // Check whether we have content-encoding.  If we do, don't try to
  // detect the type.
  // XXXbz we could improve this by doing a local decompress if we
  // wanted, I'm sure.
  nsCAutoString contentEncoding;
  httpChannel->GetResponseHeader(NS_LITERAL_CSTRING("Content-Encoding"),
                                 contentEncoding);
  if (!contentEncoding.IsEmpty()) {
    return;
  }

  LastDitchSniff(aRequest);
  if (mContentType.Equals(APPLICATION_OCTET_STREAM)) {
    // We want to guess at it instead
    mContentType = APPLICATION_GUESS_FROM_EXT;
  } else {
    // Let the text/plain type we already have be, so that other content
    // sniffers can also get a shot at this data.
    mContentType.Truncate();
  }
}