2012-11-09 09:45:25 -08:00
|
|
|
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
|
2013-04-16 13:47:10 -07:00
|
|
|
* vim: set ts=8 sts=4 et sw=4 tw=99:
|
2012-11-09 09:45:25 -08:00
|
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
|
|
|
|
2013-06-19 17:59:09 -07:00
|
|
|
#ifndef js_CharacterEncoding_h
|
|
|
|
#define js_CharacterEncoding_h
|
2012-11-09 09:45:25 -08:00
|
|
|
|
2013-09-19 12:24:53 -07:00
|
|
|
#include "mozilla/NullPtr.h"
|
2012-11-09 09:45:25 -08:00
|
|
|
#include "mozilla/Range.h"
|
|
|
|
|
2013-09-26 15:34:54 -07:00
|
|
|
#include "js/TypeDecls.h"
|
2013-08-13 15:34:12 -07:00
|
|
|
#include "js/Utility.h"
|
|
|
|
|
2013-09-26 15:34:54 -07:00
|
|
|
namespace js {
|
|
|
|
struct ThreadSafeContext;
|
|
|
|
}
|
|
|
|
|
2014-07-11 07:22:37 -07:00
|
|
|
class JSFlatString;
|
|
|
|
|
2012-11-09 09:45:25 -08:00
|
|
|
namespace JS {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* By default, all C/C++ 1-byte-per-character strings passed into the JSAPI
|
|
|
|
* are treated as ISO/IEC 8859-1, also known as Latin-1. That is, each
|
|
|
|
* byte is treated as a 2-byte character, and there is no way to pass in a
|
|
|
|
* string containing characters beyond U+00FF.
|
|
|
|
*/
|
2014-05-31 01:44:32 -07:00
|
|
|
class Latin1Chars : public mozilla::Range<Latin1Char>
|
2012-11-09 09:45:25 -08:00
|
|
|
{
|
2014-05-31 01:44:32 -07:00
|
|
|
typedef mozilla::Range<Latin1Char> Base;
|
2012-11-09 09:45:25 -08:00
|
|
|
|
|
|
|
public:
|
|
|
|
Latin1Chars() : Base() {}
|
2014-05-31 01:44:32 -07:00
|
|
|
Latin1Chars(char *aBytes, size_t aLength) : Base(reinterpret_cast<Latin1Char *>(aBytes), aLength) {}
|
2014-06-07 05:59:52 -07:00
|
|
|
Latin1Chars(const Latin1Char *aBytes, size_t aLength)
|
|
|
|
: Base(const_cast<Latin1Char *>(aBytes), aLength)
|
|
|
|
{}
|
2013-02-17 22:56:32 -08:00
|
|
|
Latin1Chars(const char *aBytes, size_t aLength)
|
2014-05-31 01:44:32 -07:00
|
|
|
: Base(reinterpret_cast<Latin1Char *>(const_cast<char *>(aBytes)), aLength)
|
2013-01-15 13:51:56 -08:00
|
|
|
{}
|
2012-11-09 09:45:25 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A Latin1Chars, but with \0 termination for C compatibility.
|
|
|
|
*/
|
2014-05-31 01:44:32 -07:00
|
|
|
class Latin1CharsZ : public mozilla::RangedPtr<Latin1Char>
|
2012-11-09 09:45:25 -08:00
|
|
|
{
|
2014-05-31 01:44:32 -07:00
|
|
|
typedef mozilla::RangedPtr<Latin1Char> Base;
|
2012-11-09 09:45:25 -08:00
|
|
|
|
|
|
|
public:
|
2013-09-19 12:24:53 -07:00
|
|
|
Latin1CharsZ() : Base(nullptr, 0) {}
|
2012-11-09 09:45:25 -08:00
|
|
|
|
2013-02-17 22:56:32 -08:00
|
|
|
Latin1CharsZ(char *aBytes, size_t aLength)
|
2014-05-31 01:44:32 -07:00
|
|
|
: Base(reinterpret_cast<Latin1Char *>(aBytes), aLength)
|
2012-11-09 09:45:25 -08:00
|
|
|
{
|
2014-02-17 22:24:15 -08:00
|
|
|
MOZ_ASSERT(aBytes[aLength] == '\0');
|
2012-11-09 09:45:25 -08:00
|
|
|
}
|
|
|
|
|
2014-05-31 01:44:32 -07:00
|
|
|
Latin1CharsZ(Latin1Char *aBytes, size_t aLength)
|
2013-02-17 22:56:32 -08:00
|
|
|
: Base(aBytes, aLength)
|
2012-11-09 09:45:25 -08:00
|
|
|
{
|
2014-02-17 22:24:15 -08:00
|
|
|
MOZ_ASSERT(aBytes[aLength] == '\0');
|
2012-11-09 09:45:25 -08:00
|
|
|
}
|
|
|
|
|
2013-12-19 10:56:24 -08:00
|
|
|
using Base::operator=;
|
|
|
|
|
2012-11-09 09:45:25 -08:00
|
|
|
char *c_str() { return reinterpret_cast<char *>(get()); }
|
|
|
|
};
|
|
|
|
|
2013-07-09 23:17:32 -07:00
|
|
|
/*
 * A (pointer, length) range of |unsigned char| intended to hold UTF-8 encoded
 * text. The constructors only adapt pointer types; nothing in this class
 * validates the encoding.
 */
class UTF8Chars : public mozilla::Range<unsigned char>
{
    typedef mozilla::Range<unsigned char> Base;

  public:
    // Delegates to Base's default constructor.
    UTF8Chars()
      : Base()
    {}

    // Const |char| buffer. Base stores a non-const pointer, so the constness
    // of |aBytes| is cast away here.
    UTF8Chars(const char *aBytes, size_t aLength)
      : Base(const_cast<unsigned char *>(reinterpret_cast<const unsigned char *>(aBytes)), aLength)
    {}

    // Mutable |char| buffer, reinterpreted as |unsigned char|.
    UTF8Chars(char *aBytes, size_t aLength)
      : Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
    {}
};
|
|
|
|
|
2012-11-09 09:45:25 -08:00
|
|
|
/*
|
|
|
|
* SpiderMonkey also deals directly with UTF-8 encoded text in some places.
|
|
|
|
*/
|
|
|
|
class UTF8CharsZ : public mozilla::RangedPtr<unsigned char>
|
|
|
|
{
|
|
|
|
typedef mozilla::RangedPtr<unsigned char> Base;
|
|
|
|
|
|
|
|
public:
|
2013-09-19 12:24:53 -07:00
|
|
|
UTF8CharsZ() : Base(nullptr, 0) {}
|
2012-11-09 09:45:25 -08:00
|
|
|
|
2013-02-17 22:56:32 -08:00
|
|
|
UTF8CharsZ(char *aBytes, size_t aLength)
|
|
|
|
: Base(reinterpret_cast<unsigned char *>(aBytes), aLength)
|
2012-11-09 09:45:25 -08:00
|
|
|
{
|
2014-02-17 22:24:15 -08:00
|
|
|
MOZ_ASSERT(aBytes[aLength] == '\0');
|
2012-11-09 09:45:25 -08:00
|
|
|
}
|
2013-02-21 18:58:52 -08:00
|
|
|
|
|
|
|
UTF8CharsZ(unsigned char *aBytes, size_t aLength)
|
|
|
|
: Base(aBytes, aLength)
|
|
|
|
{
|
2014-02-17 22:24:15 -08:00
|
|
|
MOZ_ASSERT(aBytes[aLength] == '\0');
|
2013-02-21 18:58:52 -08:00
|
|
|
}
|
|
|
|
|
2013-12-19 10:56:24 -08:00
|
|
|
using Base::operator=;
|
|
|
|
|
2013-02-21 18:58:52 -08:00
|
|
|
char *c_str() { return reinterpret_cast<char *>(get()); }
|
2012-11-09 09:45:25 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* SpiderMonkey uses a 2-byte character representation: it is a
|
|
|
|
* 2-byte-at-a-time view of a UTF-16 byte stream. This is similar to UCS-2,
|
|
|
|
* but unlike UCS-2, we do not strip UTF-16 extension bytes. This allows a
|
|
|
|
* sufficiently dedicated JavaScript program to be fully unicode-aware by
|
|
|
|
* manually interpreting UTF-16 extension characters embedded in the JS
|
|
|
|
* string.
|
|
|
|
*/
|
|
|
|
class TwoByteChars : public mozilla::Range<jschar>
|
|
|
|
{
|
|
|
|
typedef mozilla::Range<jschar> Base;
|
|
|
|
|
|
|
|
public:
|
|
|
|
TwoByteChars() : Base() {}
|
2013-02-17 22:56:32 -08:00
|
|
|
TwoByteChars(jschar *aChars, size_t aLength) : Base(aChars, aLength) {}
|
|
|
|
TwoByteChars(const jschar *aChars, size_t aLength) : Base(const_cast<jschar *>(aChars), aLength) {}
|
2012-11-09 09:45:25 -08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A TwoByteChars, but \0 terminated for compatibility with JSFlatString.
|
|
|
|
*/
|
|
|
|
class TwoByteCharsZ : public mozilla::RangedPtr<jschar>
|
|
|
|
{
|
|
|
|
typedef mozilla::RangedPtr<jschar> Base;
|
|
|
|
|
|
|
|
public:
|
2013-09-19 12:24:53 -07:00
|
|
|
TwoByteCharsZ() : Base(nullptr, 0) {}
|
2013-07-09 23:17:32 -07:00
|
|
|
|
2012-11-09 09:45:25 -08:00
|
|
|
TwoByteCharsZ(jschar *chars, size_t length)
|
|
|
|
: Base(chars, length)
|
|
|
|
{
|
2014-02-17 22:24:15 -08:00
|
|
|
MOZ_ASSERT(chars[length] == '\0');
|
2012-11-09 09:45:25 -08:00
|
|
|
}
|
2013-12-19 10:56:24 -08:00
|
|
|
|
|
|
|
using Base::operator=;
|
2012-11-09 09:45:25 -08:00
|
|
|
};
|
|
|
|
|
2014-01-30 14:58:53 -08:00
|
|
|
typedef mozilla::RangedPtr<const jschar> ConstCharPtr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Like TwoByteChars, but the chars are const.
|
|
|
|
*/
|
|
|
|
class ConstTwoByteChars : public mozilla::RangedPtr<const jschar>
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
ConstTwoByteChars(const ConstTwoByteChars &s) : ConstCharPtr(s) {}
|
2014-05-25 18:46:24 -07:00
|
|
|
MOZ_IMPLICIT ConstTwoByteChars(const mozilla::RangedPtr<const jschar> &s) : ConstCharPtr(s) {}
|
2014-01-30 14:58:53 -08:00
|
|
|
ConstTwoByteChars(const jschar *s, size_t len) : ConstCharPtr(s, len) {}
|
|
|
|
ConstTwoByteChars(const jschar *pos, const jschar *start, size_t len)
|
|
|
|
: ConstCharPtr(pos, start, len)
|
|
|
|
{}
|
|
|
|
|
|
|
|
using ConstCharPtr::operator=;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2012-11-09 09:45:25 -08:00
|
|
|
/*
|
|
|
|
* Convert a 2-byte character sequence to "ISO-Latin-1". This works by
|
|
|
|
* truncating each 2-byte pair in the sequence to a 1-byte pair. If the source
|
|
|
|
* contains any UTF-16 extension characters, then this may give invalid Latin1
|
|
|
|
* output. The returned string is zero terminated. The returned string or the
|
|
|
|
* returned string's |start()| must be freed with JS_free or js_free,
|
|
|
|
* respectively. If allocation fails, an OOM error will be set and the method
|
2013-09-19 12:24:53 -07:00
|
|
|
* will return a nullptr chars (which can be tested for with the ! operator).
|
2012-11-09 09:45:25 -08:00
|
|
|
* This method cannot trigger GC.
|
|
|
|
*/
|
|
|
|
extern Latin1CharsZ
|
2014-06-20 03:39:44 -07:00
|
|
|
LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx,
|
|
|
|
const mozilla::Range<const jschar> tbchars);
|
2012-11-09 09:45:25 -08:00
|
|
|
|
2014-06-17 06:18:23 -07:00
|
|
|
template <typename CharT>
|
2013-02-21 18:58:52 -08:00
|
|
|
extern UTF8CharsZ
|
2014-06-17 06:18:23 -07:00
|
|
|
CharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, const mozilla::Range<const CharT> chars);
|
2013-02-21 18:58:52 -08:00
|
|
|
|
2013-07-09 23:17:32 -07:00
|
|
|
/*
 * Declaration only; the definition is not part of this header.
 *
 * NOTE(review): judging by the name, this presumably decodes the single UTF-8
 * sequence of |utf8Length| bytes at |utf8Buffer| into one UCS-4 code point.
 * Behaviour on malformed input is not visible here -- confirm against the
 * definition.
 */
uint32_t
Utf8ToOneUcs4Char(
    const uint8_t *utf8Buffer,
    int utf8Length);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inflate bytes in UTF-8 encoding to jschars.
|
|
|
|
* - On error, returns an empty TwoByteCharsZ.
|
|
|
|
* - On success, returns a malloc'd TwoByteCharsZ, and updates |outlen| to hold
|
|
|
|
* its length; the length value excludes the trailing null.
|
|
|
|
*/
|
|
|
|
extern TwoByteCharsZ
|
|
|
|
UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The same as UTF8CharsToNewTwoByteCharsZ(), except that any malformed UTF-8 characters
|
|
|
|
* will be replaced by \uFFFD. No exception will be thrown for malformed UTF-8
|
|
|
|
* input.
|
|
|
|
*/
|
|
|
|
extern TwoByteCharsZ
|
|
|
|
LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen);
|
|
|
|
|
2014-07-11 07:22:37 -07:00
|
|
|
/*
|
|
|
|
* Returns the length of the char buffer required to encode |s| as UTF8.
|
|
|
|
* Does not include the null-terminator.
|
|
|
|
*/
|
|
|
|
JS_PUBLIC_API(size_t)
|
|
|
|
GetDeflatedUTF8StringLength(JSFlatString *s);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Encode |src| as UTF8. The caller must ensure |dst| has enough space.
|
|
|
|
* Does not write the null terminator.
|
|
|
|
*/
|
|
|
|
JS_PUBLIC_API(void)
|
|
|
|
DeflateStringToUTF8Buffer(JSFlatString *src, mozilla::RangedPtr<char> dst);
|
|
|
|
|
2012-11-09 09:45:25 -08:00
|
|
|
} // namespace JS
|
|
|
|
|
|
|
|
/*
 * JS_free overload for Latin1CharsZ: passes the pointer obtained from
 * |ptr.get()| to js_free. |ptr| itself is not reset here.
 */
inline void
JS_free(JS::Latin1CharsZ &ptr)
{
    js_free(static_cast<void *>(ptr.get()));
}
|
2013-02-21 18:58:52 -08:00
|
|
|
/*
 * JS_free overload for UTF8CharsZ: passes the pointer obtained from
 * |ptr.get()| to js_free. |ptr| itself is not reset here.
 */
inline void
JS_free(JS::UTF8CharsZ &ptr)
{
    js_free(static_cast<void *>(ptr.get()));
}
|
2012-11-09 09:45:25 -08:00
|
|
|
|
2013-06-19 17:59:09 -07:00
|
|
|
#endif /* js_CharacterEncoding_h */
|