const Cc = Components.classes;
const Ci = Components.interfaces;
const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;

// XPCOM stream constructors. They are created in run_test() rather than at
// load time, so nothing is instantiated until the harness starts the test.
var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;

// Directory containing the files named in UNICODE_FILES; set in run_test().
var dataDir;

/**
 * Test entry point (invoked by the test harness).  Builds the stream
 * constructors, locates the data directory and runs every sub-test in order.
 */
function run_test() {
  BIS = Components.Constructor("@mozilla.org/binaryinputstream;1",
                               "nsIBinaryInputStream",
                               "setInputStream");
  BOS = Components.Constructor("@mozilla.org/binaryoutputstream;1",
                               "nsIBinaryOutputStream",
                               "setOutputStream");
  _Pipe = Components.Constructor("@mozilla.org/pipe;1",
                                 "nsIPipe",
                                 "init");
  COS = Components.Constructor("@mozilla.org/intl/converter-output-stream;1",
                               "nsIConverterOutputStream",
                               "init");
  FIS = Components.Constructor("@mozilla.org/network/file-input-stream;1",
                               "nsIFileInputStream",
                               "init");
  _SS = Components.Constructor("@mozilla.org/storagestream;1",
                               "nsIStorageStream",
                               "init");
  CIS = Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                               "nsIConverterInputStream",
                               "init");

  dataDir = do_get_file("data/");

  test_utf8_1();
  test_utf16_1();
  test_utf16_2();
  test_utf16_3();
  test_cross_conversion();
}

// Strings that are written through a converter output stream and then decoded
// again by the hand-written UTF8/UTF16 readers below.
const UNICODE_STRINGS = [
  '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',

  'AZaz09 \u007F ' +               // U+000000 to U+00007F
  '\u0080 \u0398 \u03BB \u0725 ' + // U+000080 to U+0007FF
  '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF

  // No strings containing non-BMP code points are listed here.  JS strings
  // are sequences of 16-bit code units, so such code points would have to be
  // written as surrogate pairs; stringToCodePoints() combines valid pairs so
  // that its output is comparable with what the readers below return.
];

// test conversion equality -- keys are names of files containing equivalent
// Unicode data, values are the encoding of the file in the format expected by
// nsIConverter(In|Out)putStream.init
const UNICODE_FILES = {
  "unicode-conversion.utf8.txt": "UTF-8",
  "unicode-conversion.utf16.txt": "UTF-16",
  "unicode-conversion.utf16le.txt": "UTF-16LE",
  "unicode-conversion.utf16be.txt": "UTF-16BE"
};

/**
 * Writes every entry of UNICODE_STRINGS through a converter output stream
 * using the given charset, decodes the resulting bytes with a reader and
 * fails the test if the decoded values differ from the source string.
 *
 * @param charset
 *   charset name handed to the converter output stream
 * @param makeReader
 *   function taking the pipe's input stream and returning an object with a
 *   readUnit() method (a UTF8 or UTF16 instance)
 */
function checkStringsRoundTrip(charset, makeReader) {
  for (var i = 0; i < UNICODE_STRINGS.length; i++) {
    var pipe = Pipe();
    var conv = new COS(pipe.outputStream, charset, 1024, 0x0);
    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    conv.close();

    if (!equal(makeReader(pipe.inputStream),
               stringToCodePoints(UNICODE_STRINGS[i])))
      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
  }
}

/** Round-trips UNICODE_STRINGS through UTF-8. */
function test_utf8_1() {
  checkStringsRoundTrip("UTF-8", function(input) {
    return new UTF8(input);
  });
}

/** Round-trips UNICODE_STRINGS through UTF-16 with a byte-order mark. */
function test_utf16_1() {
  checkStringsRoundTrip("UTF-16", function(input) {
    // One argument only: the reader determines endianness from the BOM.
    return new UTF16(input);
  });
}

/** Round-trips UNICODE_STRINGS through UTF-16LE. */
function test_utf16_2() {
  checkStringsRoundTrip("UTF-16LE", function(input) {
    return new UTF16(input, false);
  });
}

/** Round-trips UNICODE_STRINGS through UTF-16BE. */
function test_utf16_3() {
  checkStringsRoundTrip("UTF-16BE", function(input) {
    return new UTF16(input, true);
  });
}

/**
 * For every ordered pair of files in UNICODE_FILES, decodes both with their
 * declared encodings and checks that the resulting character streams are
 * identical.  The first file of each pair is buffered in a storage stream so
 * that it can be re-read once per second file.
 */
function test_cross_conversion() {
  for (var fn1 in UNICODE_FILES) {
    // Copy the raw bytes of fn1 into a storage stream.
    var fin = getBinaryInputStream(fn1);
    var ss = StorageStream();
    var bos = new BOS(ss.getOutputStream(0));

    var av;
    while ((av = fin.available()) > 0) {
      var data = fin.readByteArray(av);
      bos.writeByteArray(data, data.length);
    }
    fin.close();
    bos.close();

    for (var fn2 in UNICODE_FILES) {
      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
      var unichar = new CIS(ss.newInputStream(0),
                            UNICODE_FILES[fn1], 8192, 0x0);

      if (!equalUnicharStreams(unichar, fin2))
        do_throw("unequal streams: " +
                 UNICODE_FILES[fn1] + ", " +
                 UNICODE_FILES[fn2]);
    }
  }
}


// utility functions

/** Creates a storage stream with 8192-byte segments and a 2^32 - 1 size cap. */
function StorageStream() {
  return new _SS(8192, Math.pow(2, 32) - 1, null);
}

/**
 * Opens the named file inside dataDir for reading.
 *
 * @param filename
 *   leaf name of a file in dataDir
 * @returns
 *   a file input stream created with the CLOSE_ON_EOF behavior flag
 */
function openDataFileStream(filename) {
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // 0o644 replaces the legacy octal literal 0644 (same value), which is a
  // syntax error in strict-mode code.
  return new FIS(file, PR_RDONLY, 0o644,
                 Ci.nsIFileInputStream.CLOSE_ON_EOF);
}

/**
 * Opens a data file as a stream of characters.
 *
 * @param filename
 *   leaf name of a file in dataDir
 * @param encoding
 *   charset name used to decode the file
 * @returns
 *   a converter input stream wrapping the file
 */
function getUnicharInputStream(filename, encoding) {
  return new CIS(openDataFileStream(filename), encoding, 8192, 0x0);
}

/**
 * Opens a data file as a stream of raw bytes.
 *
 * @param filename
 *   leaf name of a file in dataDir
 * @param encoding
 *   unused; kept so existing callers that pass it keep working
 * @returns
 *   a binary input stream wrapping the file
 */
function getBinaryInputStream(filename, encoding) {
  return new BIS(openDataFileStream(filename));
}

/**
 * Checks that a reader yields exactly the given sequence of values.
 *
 * @param stream
 *   object whose readUnit() returns the next value, or a negative number at
 *   end of stream
 * @param codePoints
 *   array of the expected values
 * @returns
 *   true if the reader produced every expected value, in order, and nothing
 *   else; false otherwise
 */
function equal(stream, codePoints) {
  var currIndex = 0;
  while (true) {
    var unit = stream.readUnit();
    if (unit < 0)
      return currIndex == codePoints.length;
    // Reading past the end of codePoints compares against undefined, so a
    // stream that is longer than expected is reported as unequal here.
    if (unit !== codePoints[currIndex++])
      return false;
  }
}

/**
 * Compares two character streams chunk by chunk.
 *
 * @param s1
 *   first stream; must provide readString(count, outObject)
 * @param s2
 *   second stream; must provide readString(count, outObject)
 * @returns
 *   true if both streams deliver identical chunks until both report that
 *   zero characters were read; false (after printing diagnostics) at the
 *   first difference
 */
function equalUnicharStreams(s1, s2) {
  var r1, r2;
  var str1 = {}, str2 = {};
  while (true) {
    r1 = s1.readString(1024, str1);
    r2 = s2.readString(1024, str2);

    if (r1 != r2 || str1.value != str2.value) {
      print("r1: " + r1 + ", r2: " + r2);
      print(str1.value.length);
      print(str2.value.length);
      return false;
    }
    if (r1 == 0 && r2 == 0)
      return true;
  }
}

/**
 * Converts a JS string into an array of Unicode code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * the single supplementary code point it encodes, matching what the UTF8 and
 * UTF16 readers return.  Every other 16-bit unit (including an unpaired
 * surrogate) is passed through unchanged, so strings containing only BMP
 * characters map one-to-one onto their charCodeAt() values.
 *
 * @param str
 *   the string to convert
 * @returns
 *   array of numeric code points
 */
function stringToCodePoints(str) {
  var codePoints = [];
  for (var i = 0; i < str.length; i++) {
    var unit = str.charCodeAt(i);
    if (unit >= 0xD800 && unit <= 0xDBFF && i + 1 < str.length) {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        codePoints.push(0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00));
        i++; // the low surrogate has been consumed as well
        continue;
      }
    }
    codePoints.push(unit);
  }
  return codePoints;
}

/** Returns a mask with the n lowest bits set, i.e. 2^n - 1. */
function lowbits(n) {
  return Math.pow(2, n) - 1;
}

/** Creates a pipe using 10 segments of 1024 bytes each. */
function Pipe() {
  return new _Pipe(false, false, 1024, 10, null);
}


// complex charset readers

/**
 * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 */
function UTF8(stream) {
  this._stream = new BIS(stream);
}
UTF8.prototype = {
  // returns numeric code point at front of stream encoded in UTF-8, -1 if at
  // end of stream, or throws if valid (and properly encoded!) code point not
  // found
  readUnit: function() {
    var str = this._stream;

    var c, c2, c3, c4, rv;

    // if at end of stream, must distinguish failure to read any bytes
    // (correct behavior) from failure to read some byte after the first
    // in the character
    try {
      c = str.read8();
    } catch (e) {
      return -1;
    }

    if (c < 0x80)
      return c;

    if (c < 0xC0) // c < 11000000
    {
      // byte doesn't have enough leading ones (must be at least two)
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    c2 = str.read8();
    if (c2 >= 0xC0 || c2 < 0x80)
      throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

    if (c < 0xE0) // c < 11100000
    {
      // two-byte between U+000080 and U+0007FF
      rv = ((lowbits(5) & c) << 6) +
           (lowbits(6) & c2);
      // no upper bounds-check needed, by previous lines; the lower bound
      // rejects overlong encodings
      if (rv >= 0x80)
        return rv;
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    c3 = str.read8();
    if (c3 >= 0xC0 || c3 < 0x80)
      throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

    if (c < 0xF0) // c < 11110000
    {
      // three-byte between U+000800 and U+00FFFF
      rv = ((lowbits(4) & c) << 12) +
           ((lowbits(6) & c2) << 6) +
           (lowbits(6) & c3);
      // no upper bounds-check needed, by previous lines; values below 0x800
      // (overlong) and the surrogate range 0xD800-0xDFFF are rejected
      if (rv >= 0xE000 ||
          (rv >= 0x800 && rv <= 0xD7FF))
        return rv;
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    c4 = str.read8();
    if (c4 >= 0xC0 || c4 < 0x80)
      throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

    if (c < 0xF8) // c < 11111000
    {
      // four-byte between U+010000 and U+10FFFF
      rv = ((lowbits(3) & c) << 18) +
           ((lowbits(6) & c2) << 12) +
           ((lowbits(6) & c3) << 6) +
           (lowbits(6) & c4);
      // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
      if (rv >= 0x10000 && rv <= 0x10FFFF)
        return rv;
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    // 11111000 or greater -- no UTF-8 mapping
    throw NS_ERROR_ILLEGAL_VALUE;
  }
};

/**
 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 * @param bigEndian
 *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
 *   a byte-order mark
 */
function UTF16(stream, bigEndian) {
  this._stream = new BIS(stream);
  if (arguments.length > 1) {
    this._bigEndian = bigEndian;
  } else {
    // The mapping below treats a read16() result of 0xFEFF as big-endian,
    // i.e. it relies on read16() assembling the two bytes most-significant
    // byte first.
    var bom = this._stream.read16();
    if (bom == 0xFEFF)
      this._bigEndian = true;
    else if (bom == 0xFFFE)
      this._bigEndian = false;
    else
      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
  }
}
UTF16.prototype = {
  // returns numeric code point at front of stream encoded in UTF-16,
  // -1 if at end of stream, or throws if UTF-16 code point not found
  readUnit: function() {
    var str = this._stream;
    var bigEndian = this._bigEndian;

    // Assembles a 16-bit code unit from two bytes given in stream order.
    function toCodeUnit(first, second) {
      return bigEndian ? (first << 8) + second
                       : (second << 8) + first;
    }

    // if at end of stream, must distinguish failure to read any bytes
    // (correct behavior) from failure to read some byte after the first
    // in the character
    try {
      var b1 = str.read8();
    } catch (e) {
      return -1;
    }
    var b2 = str.read8();

    var w1 = toCodeUnit(b1, b2);

    if (w1 > 0xDBFF && w1 < 0xE000) {
      // second surrogate, but expecting none or first
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    if (w1 > 0xD7FF && w1 < 0xDC00) {
      // non-BMP, use surrogate pair
      b1 = str.read8();
      b2 = str.read8();
      var w2 = toCodeUnit(b1, b2);
      if (w2 < 0xDC00 || w2 > 0xDFFF)
        throw NS_ERROR_ILLEGAL_VALUE;

      // The high surrogate (w1) supplies the upper ten bits and the low
      // surrogate (w2) the lower ten bits of the offset from U+10000.  With
      // both surrogates range-checked above, the result always lies within
      // U+10000..U+10FFFF, so no further bounds check is required.
      return 0x10000 +
             ((lowbits(10) & w1) << 10) +
             (lowbits(10) & w2);
    }

    // non-surrogate
    return w1;
  }
};