Treat unpaired surrogate characters in UTF-16 as illegal characters. Bug 317216. Patch by me and Peter Annema <jag@tty.nl>, r=me, jag, jshin; sr=dveditz

2024-09-13 09:24:08 -07:00 · 2009-02-22 02:08:27 -08:00 · 2009-02-22 02:08:27 -08:00 · c1e02f694e
commit c1e02f694e
parent c4850d0d07
4 changed files with 224 additions and 58 deletions
--- a/intl/uconv/tests/unit/test_bug317216.js
+++ b/intl/uconv/tests/unit/test_bug317216.js
@ -0,0 +1,109 @@
+/* Test case for bug 317216
+ *
+ * Uses nsIConverterInputStream to decode UTF-16 text with valid surrogate
+ * pairs and lone surrogate characters
+ *
+ * Sample text is: U+1D420 MATHEMATICAL BOLD SMALL G (UTF-16: D835 DC20)
+ *
+ * The test uses buffers of 4 different lengths to test end of buffer in mid-
+ * UTF16 character and mid-surrogate pair
+ */
+
+const test = [
+// 0: Valid surrogate pair
+              ["%00%2D%00%2D%D8%35%DC%20%00%2D%00%2D",
+//    expected: surrogate pair
+               "--\uD835\uDC20--"],
+// 1: Lone high surrogate
+              ["%00%2D%00%2D%D8%35%00%2D%00%2D",
+//    expected: one replacement char
+               "--\uFFFD--"],
+// 2: Lone low surrogate
+              ["%00%2D%00%2D%DC%20%00%2D%00%2D",
+//    expected: one replacement char
+               "--\uFFFD--"],
+// 3: Two high surrogates
+              ["%00%2D%00%2D%D8%35%D8%35%00%2D%00%2D",
+//    expected: two replacement chars
+               "--\uFFFD\uFFFD--"],
+// 4: Two low surrogates
+              ["%00%2D%00%2D%DC%20%DC%20%00%2D%00%2D",
+//    expected: two replacement chars
+              "--\uFFFD\uFFFD--"],
+// 5: Low surrogate followed by high surrogate
+              ["%00%2D%00%2D%DC%20%D8%35%00%2D%00%2D",
+//    expected: two replacement chars
+               "--\uFFFD\uFFFD--"],
+// 6: Lone high surrogate followed by valid surrogate pair
+              ["%00%2D%00%2D%D8%35%D8%35%DC%20%00%2D%00%2D",
+//    expected: replacement char followed by surrogate pair
+               "--\uFFFD\uD835\uDC20--"],
+// 7: Lone low surrogate followed by valid surrogate pair
+              ["%00%2D%00%2D%DC%20%D8%35%DC%20%00%2D%00%2D",
+//    expected: replacement char followed by surrogate pair
+               "--\uFFFD\uD835\uDC20--"],
+// 8: Valid surrogate pair followed by lone high surrogate
+              ["%00%2D%00%2D%D8%35%DC%20%D8%35%00%2D%00%2D",
+//    expected: surrogate pair followed by replacement char
+               "--\uD835\uDC20\uFFFD--"],
+// 9: Valid surrogate pair followed by lone low surrogate
+              ["%00%2D%00%2D%D8%35%DC%20%DC%20%00%2D%00%2D",
+//    expected: surrogate pair followed by replacement char
+               "--\uD835\uDC20\uFFFD--"],
+// 10: Lone high surrogate at the end of the input
+              ["%00%2D%00%2D%00%2D%00%2D%D8%35%",
+//    expected: nothing
+               "----"],
+// 11: Half code unit at the end of the input
+              ["%00%2D%00%2D%00%2D%00%2D%D8",
+//    expected: nothing
+              "----"]];
+
+const IOService = Components.Constructor("@mozilla.org/network/io-service;1",
+                                         "nsIIOService");
+const ConverterInputStream =
+      Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
+                             "nsIConverterInputStream",
+                             "init");
+const ios = new IOService();
+
+function testCase(testText, expectedText, bufferLength, charset)
+{
+  var dataURI = "data:text/plain;charset=" + charset + "," + testText;
+
+  var channel = ios.newChannel(dataURI, "", null);
+  var testInputStream = channel.open();
+  var testConverter = new ConverterInputStream(testInputStream,
+                                               charset,
+                                               bufferLength,
+                                               0xFFFD);
+
+  if (!(testConverter instanceof
+        Components.interfaces.nsIUnicharLineInputStream))
+    throw "not line input stream";
+
+  var outStr = "";
+  var more;
+  do {
+    // read the line and check for eof
+    var line = {};
+    more = testConverter.readLine(line);
+    outStr += line.value;
+  } while (more);
+
+  // escape the strings before comparing for better readability
+  do_check_eq(escape(outStr), escape(expectedText));
+}
+
+// Byte-swap %-encoded UTF-16: exchange every two adjacent %-triplets
+function flip(str) { return str.replace(/(%..)(%..)/g, "$2$1"); }
+
+function run_test()
+{
+  for (var i = 0; i < 12; ++i) {
+    for (var bufferLength = 4; bufferLength < 8; ++ bufferLength) {
+      testCase(test[i][0], test[i][1], bufferLength, "UTF-16BE");
+      testCase(flip(test[i][0]), test[i][1], bufferLength, "UTF-16LE");
+    }
+  }
+}
--- a/intl/uconv/tests/unit/test_bug340714.js
+++ b/intl/uconv/tests/unit/test_bug340714.js
@ -63,7 +63,21 @@ function testCase(withBOM, charset, charsetDec, decoder, bufferLength)
      outStr += line.value;
  } while (more);

-  do_check_eq(outStr, expected);
+  if (outStr != expected) {
+    dump("Failed with BOM = " + withBOM + "; charset = " + charset +
+	 "; charset declaration = " + charsetDec + "; decoder = " + decoder +
+	 "; bufferLength = " + bufferLength + "\n");
+    if (outStr.length == expected.length) {
+      for (i = 0; i < outStr.length; ++i) {
+	if (outStr.charCodeAt(i) != expected.charCodeAt(i)) {
+	  dump(i + ": " + outStr.charCodeAt(i).toString(16) + " != " + expected.charCodeAt(i).toString(16) + "\n");
+	}
+      }
+    }
+  }
+
+  // escape the strings before comparing for better readability
+  do_check_eq(escape(outStr), escape(expected));
 }

 function run_test()
--- a/intl/uconv/ucvlatin/nsUCS2BEToUnicode.cpp
+++ b/intl/uconv/ucvlatin/nsUCS2BEToUnicode.cpp
@ -38,6 +38,7 @@
 #include "nsUCConstructors.h"
 #include "nsUCS2BEToUnicode.h"
 #include "nsUCvLatinDll.h"
+#include "nsCharTraits.h"
 #include <string.h>
 #include "prtypes.h"

@ -46,11 +47,12 @@
 #define STATE_FIRST_CALL      2
 #define STATE_FOUND_BOM       3

-// XXX : illegal surrogate code points are just passed through !!
 static nsresult
-UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aData, const char * aSrc,
+UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aOddByte,
+                      PRUnichar& aOddHighSurrogate, const char * aSrc,
                      PRInt32 * aSrcLength, PRUnichar * aDest,
-                      PRInt32 * aDestLength)
+                      PRInt32 * aDestLength,
+                      PRBool aSwapBytes)
 {
  const char* src = aSrc;
  const char* srcEnd = aSrc + *aSrcLength;
@ -81,41 +83,80 @@ UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aData, const char * aSrc,
    aState = STATE_NORMAL;
  }

-  PRInt32 copybytes;
+  if (src == srcEnd) {
+    *aDestLength = 0;
+    return NS_OK;
+  }

-  if((STATE_HALF_CODE_POINT == aState) && (src < srcEnd))
-  {
-    if(dest >= destEnd)
+  PRUnichar oddHighSurrogate = aOddHighSurrogate;
+
+  const char* srcEvenEnd;
+
+  PRUnichar u;
+  if (aState == STATE_HALF_CODE_POINT) {
+    // the 1st byte of a 16-bit code unit was stored in |aOddByte| in the
+    // previous run while the 2nd byte has to come from |*src|.
+    aState = STATE_NORMAL;
+#ifdef IS_BIG_ENDIAN
+    u = (aOddByte << 8) | *src++; // safe, we know we have at least one byte.
+#else
+    u = (*src++ << 8) | aOddByte; // safe, we know we have at least one byte.
+#endif
+    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+    goto have_codepoint;
+  } else {
+    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+  }
+
+  while (src != srcEvenEnd) {
+    if (dest == destEnd)
      goto error;

-    char tmpbuf[2];
+    u = *(const PRUnichar*)src;
+    src += 2;

-    // the 1st byte of a 16-bit code unit was stored in |aData| in the previous
-    // run while the 2nd byte has to come from |*src|. We just have to copy
-    // 'byte-by-byte'. Byte-swapping, if necessary, will be done in |Convert| of
-    // LE and BE converters.
-    PRUnichar * up = (PRUnichar*) &tmpbuf[0];
-    tmpbuf[0]= aData;
-    tmpbuf[1]= *src++;
-    *dest++ = *up;
+have_codepoint:
+    if (aSwapBytes)
+      u = u << 8 | u >> 8;
+
+    if (!IS_SURROGATE(u)) {
+      if (oddHighSurrogate) {
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+        if (dest == destEnd)
+          goto error;
+        oddHighSurrogate = 0;
+      }
+      *dest++ = u;
+    } else if (NS_IS_HIGH_SURROGATE(u)) {
+      if (oddHighSurrogate) {
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+        if (dest == destEnd)
+          goto error;
+      }
+      oddHighSurrogate = u;
+    }
+    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
+      if (oddHighSurrogate) {
+        if (dest == destEnd - 1) {
+          *dest++ = UCS2_REPLACEMENT_CHAR;
+          goto error;
+        }
+        *dest++ = oddHighSurrogate;
+        *dest++ = u;
+        oddHighSurrogate = 0;
+      } else {
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+      }
+    }
  }
-  
-  copybytes = (destEnd-dest)*2;
-  // if |srcEnd-src| is odd, we copy one fewer bytes.
-  if(copybytes > (~1 & (srcEnd - src)))
-      copybytes = ~1 & (srcEnd - src);
-  memcpy(dest,src,copybytes);
-  src +=copybytes;
-  dest +=(copybytes/2);
-  if(srcEnd == src)  { // srcLength was even.
-     aState = STATE_NORMAL;
-  } else if(1 == (srcEnd - src) ) { // srcLength was odd. 
-     aState = STATE_HALF_CODE_POINT;
-     aData  = *src++;  // store the lead byte of a 16-bit unit for the next run.
-  } else  {
-     goto error;
+  if (src != srcEnd) {
+    // store the lead byte of a 16-bit unit for the next run.
+    aOddByte = *src++;
+    aState = STATE_HALF_CODE_POINT;
  }
-  
+
+  aOddHighSurrogate = oddHighSurrogate;
+
  *aDestLength = dest - aDest;
  *aSrcLength =  src  - aSrc; 
  return NS_OK;
@ -126,18 +167,12 @@ error:
  return  NS_OK_UDEC_MOREOUTPUT;
 }

-static void
-SwapBytes(PRUnichar *aDest, PRInt32 aLen)
-{
-  for (PRUnichar *p = aDest; aLen > 0; ++p, --aLen)
-     *p = ((*p & 0xff) << 8) | ((*p >> 8) & 0xff);
-}
-
 NS_IMETHODIMP
 nsUTF16ToUnicodeBase::Reset()
 {
  mState = STATE_FIRST_CALL;
-  mData = 0;
+  mOddByte = 0;
+  mOddHighSurrogate = 0;
  return NS_OK;
 }

@ -145,8 +180,10 @@ NS_IMETHODIMP
 nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength, 
                                   PRInt32 * aDestLength)
 {
-  // the left-over byte of the previous run has to be taken into account.
-  *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2;
+  // the left-over data of the previous run have to be taken into account.
+  *aDestLength = (aSrcLength +
+                    ((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2 +
+                 ((mOddHighSurrogate != 0) ? 1 : 0);
  return NS_OK;
 }

@ -174,12 +211,14 @@ nsUTF16BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
    }
 #endif

-  nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength,
-                                      aDest, aDestLength);
-
+  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
+                                      aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_LITTLE_ENDIAN
-  SwapBytes(aDest, *aDestLength);
+                                      PR_TRUE
+#else
+                                      PR_FALSE
 #endif
+                                      );
  return rv;
 }

@ -206,12 +245,14 @@ nsUTF16LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
    }
 #endif
    
-  nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength, aDest,
-                                      aDestLength);
-
+  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
+                                      aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_BIG_ENDIAN
-  SwapBytes(aDest, *aDestLength);
+                                      PR_TRUE
+#else
+                                      PR_FALSE
 #endif
+                                      );
  return rv;
 }

@ -262,17 +303,16 @@ nsUTF16ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
      }
    }
    
-    nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength, aDest,
-                                        aDestLength);
-
+    nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
+                                        aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_BIG_ENDIAN
-    if (mEndian == kLittleEndian)
+                                        (mEndian == kLittleEndian)
 #elif defined(IS_LITTLE_ENDIAN)
-    if (mEndian == kBigEndian)
+                                        (mEndian == kBigEndian)
 #else
    #error "Unknown endianness"
 #endif
-      SwapBytes(aDest, *aDestLength);
+                                        );

    // If BOM is not found and we're to return NS_OK, signal that BOM
    // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
--- a/intl/uconv/ucvlatin/nsUCS2BEToUnicode.h
+++ b/intl/uconv/ucvlatin/nsUCS2BEToUnicode.h
@ -58,7 +58,10 @@ public:

 protected:
  PRUint8 mState;
-  PRUint8 mData;
+  // to store an odd byte left over between runs
+  PRUint8 mOddByte;
+  // to store an odd high surrogate left over between runs
+  PRUnichar mOddHighSurrogate;
 };

 // UTF-16 big endian