Bug 464314 - view-source link-browsing doesn't decode entities. r+sr=mrbkap

2024-09-13 09:24:08 -07:00 · 2009-01-16 16:07:12 +01:00 · 2009-01-16 16:07:12 +01:00 · eb67a63210
commit eb67a63210
parent 6e81518c41
2 changed files with 200 additions and 4 deletions
--- a/parser/htmlparser/src/nsViewSourceHTML.cpp
+++ b/parser/htmlparser/src/nsViewSourceHTML.cpp
@ -76,6 +76,7 @@
 #include "nsUnicharUtils.h"
 #include "nsPrintfCString.h"
 #include "nsNetUtil.h"
+#include "nsHTMLEntities.h"

 #include "nsIServiceManager.h"

@ -1000,8 +1001,10 @@ PRBool CViewSourceHTML::IsUrlAttribute(const nsAString& tagName,
  // This doesn't feel like the ideal place for this, but the alternatives don't
  // seem all that nice either.
  if (isHref && tagName.LowerCaseEqualsLiteral("base")) {
-    const nsSubstring& baseSpec = TrimTokenValue(attrValue);
-    SetBaseURI(baseSpec);
+    const nsAString& baseSpec = TrimTokenValue(attrValue);
+    nsAutoString expandedBaseSpec;
+    ExpandEntities(baseSpec, expandedBaseSpec);
+    SetBaseURI(expandedBaseSpec);
  }

  return isHref || isSrc;
@ -1080,8 +1083,11 @@ nsresult CViewSourceHTML::CreateViewSourceURL(const nsAString& linkUrl,
  rv = GetBaseURI(getter_AddRefs(baseURI));
  NS_ENSURE_SUCCESS(rv, rv);

-  // Use the link URL and the base URI to build a URI for the link.
-  rv = NS_NewURI(getter_AddRefs(hrefURI), linkUrl, charset.get(), baseURI);
+  // Use the link URL and the base URI to build a URI for the link.  Note that
+  // the link URL may have untranslated entities in it.
+  nsAutoString expandedLinkUrl;
+  ExpandEntities(linkUrl, expandedLinkUrl);
+  rv = NS_NewURI(getter_AddRefs(hrefURI), expandedLinkUrl, charset.get(), baseURI);
  NS_ENSURE_SUCCESS(rv, rv);

  // Get the absolute URL from the link URI.
@ -1230,3 +1236,188 @@ nsresult CViewSourceHTML::SetBaseURI(const nsAString& baseSpec) {
  mBaseURI = baseURI;
  return NS_OK;
 }
+
+void CViewSourceHTML::ExpandEntities(const nsAString& textIn, nsString& textOut)
+{  
+  nsAString::const_iterator iter, end;
+  textIn.BeginReading(iter);
+  textIn.EndReading(end);
+
+  // The outer loop treats the input as a sequence of pairs of runs.  The first
+  // run of each pair is just a run of regular characters.  The second run is
+  // something that looks like it might be an entity reference, e.g. "&amp;".
+  // Any regular run may be empty, and the entity run can be skipped at the end
+  // of the text.  Apparent entities that can't be translated are copied
+  // verbatim.  In particular this allows for raw ampersands in the input.
+  // Special care is taken to handle the end of the text at any point inside
+  // the loop.
+  while (iter != end) {
+    // Copy characters to textOut until but not including the first ampersand.
+    for (; iter != end; ++iter) {
+      PRUnichar ch = *iter;
+      if (ch == kAmpersand) {
+        break;
+      }
+      textOut.Append(ch);
+    }
+
+    // We have a possible entity.  If the entity is well-formed (or well-enough
+    // formed) copy the entity value to "textOut".  Otherwise, copy the "entity"
+    // source characters to "textOut" verbatim.  Either way, advance "iter" to
+    // the first position after the entity/entity-like-thing.
+    CopyPossibleEntity(iter, end, textOut);
+  }
+}
+
+static PRBool InRange(PRUnichar ch, unsigned char chLow, unsigned char chHigh)
+{
+  return (chLow <= ch) && (ch <= chHigh);
+}
+
+static PRBool IsDigit(PRUnichar ch)
+{ 
+  return InRange(ch, '0', '9');
+}
+
+static PRBool IsHexDigit(PRUnichar ch)
+{
+  return IsDigit(ch) || InRange(ch, 'A', 'F') || InRange(ch, 'a', 'f');
+}
+
+static PRBool IsAlphaNum(PRUnichar ch)
+{
+  return InRange(ch, 'A', 'Z') || InRange(ch, 'a', 'z') || IsDigit(ch);
+}
+
+static PRBool IsAmpersand(PRUnichar ch)
+{
+  return ch == kAmpersand;
+}
+
+static PRBool IsHashsign(PRUnichar ch)
+{
+  return ch == kHashsign;
+}
+
+static PRBool IsXx(PRUnichar ch)
+{
+  return (ch == 'X') || (ch == 'x');
+}
+
+static PRBool IsSemicolon(PRUnichar ch)
+{
+  return ch == kSemicolon;
+}
+
+static PRBool ConsumeChar(nsAString::const_iterator& start,
+                          const nsAString::const_iterator &end,
+                          PRBool (*testFun)(PRUnichar ch))
+{
+  if (start == end) {
+    return PR_FALSE;
+  }
+  if (!testFun(*start)) {
+    return PR_FALSE;
+  }
+  ++start;
+  return PR_TRUE;
+}
+
+void CViewSourceHTML::CopyPossibleEntity(nsAString::const_iterator& iter,
+                                         const nsAString::const_iterator& end,
+                                         nsAString& textBuffer)
+{
+  // Note that "iter" is passed by reference, and we need to make sure that
+  // we update its position as we parse characters, so the caller will know
+  // how much text we processed.
+
+  // Remember where we started.
+  const nsAString::const_iterator start(iter);
+  
+  // Our possible entity must at least start with an '&' -- bail if it doesn't.
+  if (!ConsumeChar(iter, end, IsAmpersand)) {
+    return;
+  }
+
+  // Identify the entity "body" and classify it.
+  nsAString::const_iterator startBody, endBody;
+  enum {TYPE_ID, TYPE_DECIMAL, TYPE_HEXADECIMAL} entityType;
+  if (ConsumeChar(iter, end, IsHashsign)) {
+    if (ConsumeChar(iter, end, IsXx)) {
+      startBody = iter;
+      entityType = TYPE_HEXADECIMAL;
+      while (ConsumeChar(iter, end, IsHexDigit)) {
+        // empty
+      }
+    } else {
+      startBody = iter;
+      entityType = TYPE_DECIMAL;
+      while (ConsumeChar(iter, end, IsDigit)) {
+        // empty
+      }
+    }
+  } else {
+    startBody = iter;
+    entityType = TYPE_ID;
+    // The parser seems to allow some other characters, such as ":" and "_".
+    // However, all of the entities that we know about (see nsHTMLEntityList.h)
+    // are strictly alphanumeric.
+    while (ConsumeChar(iter, end, IsAlphaNum)) {
+      // empty
+    }
+  }
+
+  // Record the end of the entity body.
+  endBody = iter;
+  
+  // If the entity body is terminated with a semicolon, consume that too.
+  PRBool properlyTerminated = ConsumeChar(iter, end, IsSemicolon);
+
+  // If the entity body is empty, then it's not really an entity.  Copy what
+  // we've parsed verbatim, and return immediately.
+  if (startBody == endBody) {
+    textBuffer.Append(Substring(start, iter));
+    return;
+  }
+
+  // Construct a string from the body range.  Note that we need a regular
+  // string since substrings don't provide ToInteger().
+  nsAutoString entityBody(Substring(startBody, endBody));
+
+  // Decode the entity to a Unicode character.
+  PRInt32 entityCode = -1;
+  switch (entityType) {
+  case TYPE_ID:
+    entityCode = nsHTMLEntities::EntityToUnicode(entityBody);
+    break;
+  case TYPE_DECIMAL:
+    entityCode = ToUnicode(entityBody, 10, -1);
+    break;
+  case TYPE_HEXADECIMAL:
+    entityCode = ToUnicode(entityBody, 16, -1);
+    break;
+  default:
+    NS_NOTREACHED("Unknown entity type!");
+    break;
+  }
+
+  // Note that the parser does not require terminating semicolons for entities
+  // with eight bit values.  We want to allow the same here.
+  if (properlyTerminated || ((0 <= entityCode) && (entityCode < 256))) {
+    textBuffer.Append((PRUnichar) entityCode);
+  } else {
+    // If the entity is malformed in any way, just copy the source text verbatim.
+    textBuffer.Append(Substring(start, iter));
+  }
+}
+
+PRInt32 CViewSourceHTML::ToUnicode(const nsString &strNum, PRInt32 radix, PRInt32 fallback)
+{
+  PRInt32 result;
+  PRInt32 code = strNum.ToInteger(&result, radix);
+  if (result == NS_OK) {
+    return code;
+  }
+  return fallback;
+}
+
--- a/parser/htmlparser/src/nsViewSourceHTML.h
+++ b/parser/htmlparser/src/nsViewSourceHTML.h
@ -126,6 +126,11 @@ private:
    PRBool IsTokenValueTrimmableCharacter(PRUnichar ch);
    nsresult GetBaseURI(nsIURI **result);
    nsresult SetBaseURI(const nsAString& baseSpec);
+    static void ExpandEntities(const nsAString& textIn, nsString& textOut);
+    static void CopyPossibleEntity(nsAString::const_iterator& iter,
+                                   const nsAString::const_iterator& end,
+                                   nsAString& textBuffer);
+    static PRInt32 ToUnicode(const nsString &strNum, PRInt32 radix, PRInt32 fallback);

 protected: