Bug 933885 part 4c: overhaul dom/encoding/test/unit/test_utf.js for correctness and efficiency. r=emk

2024-09-13 09:24:08 -07:00 · 2014-01-25 23:34:47 -05:00 · 2014-01-25 23:34:47 -05:00 · 8207da1fc0
commit 8207da1fc0
parent 32dfa93f19
1 changed files with 143 additions and 126 deletions
--- a/dom/encoding/test/unit/test_utf.js
+++ b/dom/encoding/test/unit/test_utf.js
@ -1,51 +1,43 @@
 // NOTE: Requires testharness.js
 // http://www.w3.org/2008/webapps/wiki/Harness

-function testEncodeDecode(encoding, min, max) {
-  function cpname(n) {
-    return 'U+' + ((n <= 0xFFFF) ?
-                   ('0000' + n.toString(16).toUpperCase()).slice(-4) :
-                   n.toString(16).toUpperCase());
+// Extension to testharness.js API which avoids logging enormous strings
+// on a coding failure.
+//
+// actual, expected: the strings to compare.
+// description:      prefix for every assertion message emitted here.
+//
+// On mismatch, only the two lengths or the first differing code unit
+// (formatted with cpname) are reported, never the strings themselves.
+function assert_string_equals(actual, expected, description) {
+  // short circuit success case
+  if (actual === expected) {
+    assert_true(true, description + ": <actual> === <expected>");
+    return;
  }

-  test(
-    function() {
-      var string, i, j, BATCH_SIZE = 0x1000;
-      for (i = min; i < max; i += BATCH_SIZE) {
-        string = '';
-        for (j = i; j < i + BATCH_SIZE && j < max; j += 1) {
-          if (0xd800 <= j && j <= 0xdfff) {
-            // surrogate half
-            continue;
-          } else if (j > 0xffff) {
-            // outside BMP - encode as surrogate pair
-            string += String.fromCharCode(
-              0xd800 + ((j >> 10) & 0x3ff),
-              0xdc00 + (j & 0x3ff));
-          } else {
-            string += String.fromCharCode(i);
-          }
-        }
-        var encoded = new TextEncoder(encoding).encode(string);
-        var decoded = new TextDecoder(encoding).decode(encoded);
-        assert_equals(string, decoded, 'Round trip ' + cpname(i) + " - " + cpname(j));
-      }
-    },
-    encoding + " - Encode/Decode Range " + cpname(min) + " - " + cpname(max)
-  );
-}
+  // length check
+  assert_equals(actual.length, expected.length,
+                description + ": string lengths");

-testEncodeDecode('UTF-8', 0, 0x10FFFF);
-testEncodeDecode('UTF-16LE', 0, 0x10FFFF);
-testEncodeDecode('UTF-16BE', 0, 0x10FFFF);
+  // Lengths are equal here, so walk both strings in lock step and
+  // report only the first code unit that differs.
+  var i, a, b;
+  for (i = 0; i < actual.length; i++) {
+    a = actual.charCodeAt(i);
+    b = expected.charCodeAt(i);
+    if (a !== b) {
+      assert_true(false,
+                  description +
+                  ": code unit " + i.toString() + " unequal: " +
+                  cpname(a) + " != " + cpname(b)); // doesn't return
+    }
+  }
+
+  // It should be impossible to get here, because the initial
+  // comparison failed, so either the length comparison or the
+  // codeunit-by-codeunit comparison should also fail.
+  assert_true(false, description + ": failed to detect string difference");
+}

 // Inspired by:
 // http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html
 function encode_utf8(string) {
+  // Reference UTF-8 encoder built from the legacy escape functions:
+  // encodeURIComponent() percent-escapes the UTF-8 bytes of the input and
+  // unescape() turns every %XX back into one character whose code is that
+  // byte, so each code unit of utf8 below is a single octet.
  var utf8 = unescape(encodeURIComponent(string));
-  var octets = [], i;
+  // Preallocated typed array, one element per octet.
+  var octets = new Uint8Array(utf8.length), i;
  for (i = 0; i < utf8.length; i += 1) {
-    octets.push(utf8.charCodeAt(i));
+    octets[i] = utf8.charCodeAt(i);
  }
  return octets;
 }
@ -55,95 +47,120 @@ function decode_utf8(octets) {
  return decodeURIComponent(escape(utf8));
 }

-test(
-  function() {
-    var actual, expected, str, i, j, BATCH_SIZE = 0x1000;
-
-    for (i = 0; i < 0x10FFFF; i += BATCH_SIZE) {
-      str = '';
-      for (j = i; j < i + BATCH_SIZE; j += 1) {
-        if (0xd800 <= j && j <= 0xdfff) {
-          // surrogate half
-          continue;
-        } else if (j > 0xffff) {
-          // outside BMP - encode as surrogate pair
-          str += String.fromCharCode(
-            0xd800 + ((j >> 10) & 0x3ff),
-            0xdc00 + (j & 0x3ff));
-        } else {
-          str += String.fromCharCode(i);
-        }
-      }
-      expected = encode_utf8(str);
-
-      actual = new TextEncoder('UTF-8').encode(str);
-      assert_array_equals(actual, expected, 'expected equal encodings');
-    }
-  },
-  "UTF-8 encoding (compare against unescape/encodeURIComponent)"
-);
-
-test(
-  function() {
-    var encoded, actual, expected, str, i, j, BATCH_SIZE = 0x1000;
-
-    for (i = 0; i < 0x10FFFF; i += BATCH_SIZE) {
-      str = '';
-      for (j = i; j < i + BATCH_SIZE; j += 1) {
-        if (0xd800 <= j && j <= 0xdfff) {
-          // surrogate half
-          continue;
-        } else if (j > 0xffff) {
-          // outside BMP - encode as surrogate pair
-          str += String.fromCharCode(
-            0xd800 + ((j >> 10) & 0x3ff),
-            0xdc00 + (j & 0x3ff));
-        } else {
-          str += String.fromCharCode(i);
-        }
-      }
-
-      encoded = encode_utf8(str);
-
-      expected = decode_utf8(encoded);
-      actual = new TextDecoder('UTF-8').decode(new Uint8Array(encoded));
-
-      assert_equals(actual, expected, 'expected equal decodings');
-    }
-  },
-  "UTF-8 decoding (compare against decodeURIComponent/escape)"
-);
-
-function testEncodeDecodeSample(encoding, string, expected) {
-  test(
-    function() {
-      var encoded = new TextEncoder(encoding).encode(string);
-      assert_array_equals(encoded, expected, 'expected equal encodings ' + encoding);
-
-      var decoded = new TextDecoder(encoding).decode(new Uint8Array(expected));
-      assert_equals(decoded, string, 'expected equal decodings ' + encoding);
-    },
-    encoding + " - Encode/Decode - reference sample"
-  );
+// Helpers for test_utf_roundtrip.
+// Format a code point (or code unit) for assertion messages: "U+XXXX" for
+// values up to 0xFFFF, "U+XXXXXX" above that.  Anything that is not an
+// ordinary number is returned via its own toString().
+function cpname(n) {
+  // Only a non-NaN number survives "+ 0" unchanged; NaN, strings, etc.
+  // are passed through as text rather than formatted as hex.
+  var isOrdinaryNumber = (n + 0 === n);
+  if (!isOrdinaryNumber) {
+    return n.toString();
+  }
+
+  var width;
+  if (n > 0xFFFF) {
+    width = 6;
+  } else {
+    width = 4;
+  }
+
+  // Left-pad with zeros, then keep the last `width` characters.
+  var hex = n.toString(16).toUpperCase();
+  var padded = '000000' + hex;
+  return 'U+' + padded.slice(-width);
 }

-testEncodeDecodeSample(
-  "utf-8",
-  "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD", // z, cent, CJK water, G-Clef, Private-use character
-  [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD]
-);
-testEncodeDecodeSample(
-  "utf-16le",
-  "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD", // z, cent, CJK water, G-Clef, Private-use character
-  [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF]
-);
-testEncodeDecodeSample(
-  "utf-16be",
-  "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD", // z, cent, CJK water, G-Clef, Private-use character
-  [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD]
-);
-testEncodeDecodeSample(
-  "utf-16",
-  "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD", // z, cent, CJK water, G-Clef, Private-use character
-  [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF]
-);
+// Build a string containing, in ascending order, every code point in
+// [from, from+len) except the surrogate range U+D800..U+DFFF.  Code points
+// above U+FFFF are emitted as UTF-16 surrogate pairs.
+function genblock(from, len) {
+  var end = from + len;
+  var index, cursor, point, offset;
+  var unitCount, units;
+
+  // Count of integers shared by the half-open ranges [min1, max1) and
+  // [min2, max2); zero when they do not intersect.
+  function overlap(min1, max1, min2, max2) {
+    var lo = Math.max(min1, min2);
+    var hi = Math.min(max1, max2);
+    return Math.max(0, hi - lo);
+  }
+
+  // Code units needed per code point:
+  //    1 unit   U+000000 through U+00D7FF
+  //    0 units  U+00D800 through U+00DFFF (surrogates are skipped)
+  //    1 unit   U+00E000 through U+00FFFF
+  //    2 units  U+010000 through U+10FFFF
+  var belowSurrogates = overlap(from, end, 0x000000, 0x00D800);
+  var aboveSurrogates = overlap(from, end, 0x00E000, 0x010000);
+  var supplementary   = overlap(from, end, 0x010000, 0x110000);
+  unitCount = belowSurrogates + aboveSurrogates + supplementary * 2;
+
+  units = new Uint16Array(unitCount);
+  cursor = 0;
+  for (index = 0; index < len; index++) {
+    point = from + index;
+
+    var isSurrogate = (0xD800 <= point && point <= 0xDFFF);
+    if (isSurrogate) {
+      continue;
+    }
+
+    if (point <= 0xFFFF) {
+      units[cursor++] = point;
+      continue;
+    }
+
+    // Split the 20-bit offset into high and low surrogates.
+    offset = point - 0x10000;
+    units[cursor++] = 0xD800 + (offset >> 10);
+    units[cursor++] = 0xDC00 + (offset & 0x3FF);
+  }
+  return String.fromCharCode.apply(null, units);
+}
+
+// Walk the whole code point space in BLOCK_SIZE chunks.  For each chunk,
+// check that UTF-16LE, UTF-16BE and UTF-8 each round-trip through
+// TextEncoder/TextDecoder, and that the UTF-8 results agree with the
+// encode_utf8/decode_utf8 reference helpers.
+function test_utf_roundtrip () {
+  var MIN_CODEPOINT = 0;
+  var MAX_CODEPOINT = 0x10FFFF;   // inclusive
+  var BLOCK_SIZE = 0x1000;
+
+  var block, block_tag, i, encoded, decoded, exp_encoded, exp_decoded;
+
+  // NOTE(review): these constructors are given an encoding label; the
+  // current Encoding Standard's TextEncoder only produces UTF-8, so the
+  // UTF-16 cases presumably rely on the engine's legacy support - confirm.
+  // The coders are created once and reused for every block.
+  var TE_U16LE = new TextEncoder("UTF-16LE");
+  var TD_U16LE = new TextDecoder("UTF-16LE");
+
+  var TE_U16BE = new TextEncoder("UTF-16BE");
+  var TD_U16BE = new TextDecoder("UTF-16BE");
+
+  var TE_U8    = new TextEncoder("UTF-8");
+  var TD_U8    = new TextDecoder("UTF-8");
+
+  // "<=" because MAX_CODEPOINT is itself a valid code point; the final
+  // block must be generated even if the range were block-aligned.
+  for (i = MIN_CODEPOINT; i <= MAX_CODEPOINT; i += BLOCK_SIZE) {
+    block_tag = cpname(i) + " - " + cpname(i + BLOCK_SIZE - 1);
+    block = genblock(i, BLOCK_SIZE);
+
+    // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
+    // (the decoded text is the actual value, the generated block the
+    // expected one)
+    encoded = TE_U16LE.encode(block);
+    decoded = TD_U16LE.decode(encoded);
+    assert_string_equals(decoded, block, "UTF-16LE round trip " + block_tag);
+
+    encoded = TE_U16BE.encode(block);
+    decoded = TD_U16BE.decode(encoded);
+    assert_string_equals(decoded, block, "UTF-16BE round trip " + block_tag);
+
+    encoded = TE_U8.encode(block);
+    decoded = TD_U8.decode(encoded);
+    assert_string_equals(decoded, block, "UTF-8 round trip " + block_tag);
+
+    // test TextEncoder(UTF-8) against the older idiom; encoded/decoded
+    // still hold the UTF-8 results from just above
+    exp_encoded = encode_utf8(block);
+    assert_array_equals(encoded, exp_encoded,
+                        "UTF-8 reference encoding " + block_tag);
+
+    exp_decoded = decode_utf8(exp_encoded);
+    assert_string_equals(decoded, exp_decoded,
+                         "UTF-8 reference decoding " + block_tag);
+  }
+}
+
+// Encode one fixed sample string with each supported label and compare
+// against hand-written byte sequences, then decode those bytes and
+// compare against the sample.
+function test_utf_samples () {
+  // z, cent, CJK water, G-Clef, Private-use character
+  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";
+
+  // [label, expected bytes] pairs, checked in this order.
+  var table = [
+    ["utf-8",
+     [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD]],
+    ["utf-16le",
+     [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF]],
+    ["utf-16",
+     [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF]],
+    ["utf-16be",
+     [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD]]
+  ];
+
+  var k, label, bytes, encoded, decoded;
+  for (k = 0; k < table.length; k++) {
+    label = table[k][0];
+    bytes = table[k][1];
+
+    encoded = new TextEncoder(label).encode(sample);
+    assert_array_equals(encoded, bytes,
+                        "expected equal encodings - " + label);
+
+    decoded = new TextDecoder(label).decode(new Uint8Array(bytes));
+    assert_equals(decoded, sample,
+                  "expected equal decodings - " + label);
+  }
+}
+
+// Register the two suites with testharness.js (see the NOTE at the top of
+// the file).  The short fixed-sample check is registered before the
+// exhaustive sweep over all code points.
+test(test_utf_samples,
+     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample");
+
+// Whole-range round trip plus comparison with the URI-escape reference.
+test(test_utf_roundtrip,
+     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and "+
+     "agreement with encode/decodeURIComponent");