2007-03-22 10:30:00 -07:00
|
|
|
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
2012-05-21 04:12:37 -07:00
|
|
|
/* This Source Code Form is subject to the terms of the Mozilla Public
|
|
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
|
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
2007-03-22 10:30:00 -07:00
|
|
|
|
|
|
|
#include "nsUCConstructors.h"
|
|
|
|
#include "nsUCS2BEToUnicode.h"
|
|
|
|
#include "nsUCvLatinDll.h"
|
2009-02-22 02:08:27 -08:00
|
|
|
#include "nsCharTraits.h"
|
2007-03-22 10:30:00 -07:00
|
|
|
#include <string.h>
|
|
|
|
#include "prtypes.h"
|
|
|
|
|
2010-10-20 09:11:16 -07:00
|
|
|
// Decoder states shared by the UTF-16 converters in this file.

// Nothing special is pending; decoding continues with the next code unit.
#define STATE_NORMAL              0
// The previous buffer ended in the middle of a 16-bit code unit; its first
// byte is kept in the "odd byte" slot until the second byte arrives.
#define STATE_HALF_CODE_POINT     1
// No input has been examined yet, so BOM handling still has to happen.
#define STATE_FIRST_CALL          2
// A BOM was recognised at the start of the buffer and must be skipped.
#define STATE_FOUND_BOM           3
// A complete surrogate pair was decoded but could not be written because the
// output buffer had only one free slot; it is emitted on the next call.
#define STATE_ODD_SURROGATE_PAIR  4
|
2007-03-22 10:30:00 -07:00
|
|
|
|
|
|
|
/**
 * Shared UTF-16 decoding loop used by all converters in this file.
 *
 * Reads 16-bit code units from |aSrc| (byte-swapping each one when
 * |aSwapBytes| is true) and writes them to |aDest|. Unpaired surrogates are
 * replaced with UCS2_REPLACEMENT_CHAR. Data that cannot be completed within
 * this call is carried over to the next one through the in/out state:
 *
 * @param aState            one of the STATE_* values; updated on return.
 * @param aOddByte          first byte of a code unit split across buffers
 *                          (valid while aState == STATE_HALF_CODE_POINT).
 * @param aOddHighSurrogate high surrogate still waiting for its partner, or
 *                          the high half of a stashed pair.
 * @param aOddLowSurrogate  low half of a stashed pair
 *                          (valid while aState == STATE_ODD_SURROGATE_PAIR).
 * @param aSrc              input bytes.
 * @param aSrcLength        in: number of input bytes; out: bytes consumed.
 * @param aDest             output buffer.
 * @param aDestLength       in: capacity in PRUnichars; out: units written.
 * @param aSwapBytes        true when the input byte order differs from the
 *                          byte order this build reads natively.
 *
 * @return NS_OK when all input was consumed, NS_OK_UDEC_MOREOUTPUT when the
 *         output buffer filled up first, or NS_ERROR_ILLEGAL_INPUT when the
 *         very first code unit is a byte-swapped BOM (0xFFFE).
 */
static nsresult
UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aOddByte,
                      PRUnichar& aOddHighSurrogate, PRUnichar& aOddLowSurrogate,
                      const char * aSrc,
                      PRInt32 * aSrcLength, PRUnichar * aDest,
                      PRInt32 * aDestLength,
                      bool aSwapBytes)
{
  const char* src = aSrc;
  const char* srcEnd = aSrc + *aSrcLength;
  PRUnichar* dest = aDest;
  PRUnichar* destEnd = aDest + *aDestLength;

  switch(aState) {
    case STATE_FOUND_BOM:
      // The caller already recognised the BOM; just step over it.
      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
      src+=2;
      aState = STATE_NORMAL;
      break;

    case STATE_FIRST_CALL: { // first time called
      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
      // Eliminate BOM (0xFEFF). Note that different endian case is taken care
      // of in |Convert| of LE and BE converters. Here, we only have to
      // deal with the same endian case. That is, 0xFFFE (byte-swapped BOM) is
      // illegal.
      // |src| is a byte pointer with no alignment guarantee, so copy the
      // first unit out instead of dereferencing it as a PRUnichar.
      PRUnichar firstUnit;
      memcpy(&firstUnit, src, sizeof(firstUnit));
      if(0xFEFF == firstUnit) {
        src+=2;
      } else if(0xFFFE == firstUnit) {
        *aSrcLength=0;
        *aDestLength=0;
        return NS_ERROR_ILLEGAL_INPUT;
      }
      aState = STATE_NORMAL;
      break;
    }

    case STATE_ODD_SURROGATE_PAIR:
      if (*aDestLength < 1) {
        // No room for anything: keep the stashed pair and ask for more
        // output space without touching |aDest|.
        *aSrcLength = 0;
        *aDestLength = 0;
        return NS_OK_UDEC_MOREOUTPUT;
      }
      if (*aDestLength < 2) {
        // The pair does not fit in a one-unit buffer; substitute it.
        *dest++ = UCS2_REPLACEMENT_CHAR;
      } else {
        *dest++ = aOddHighSurrogate;
        *dest++ = aOddLowSurrogate;
      }
      // Either way the stashed pair has now been accounted for exactly once.
      aOddHighSurrogate = aOddLowSurrogate = 0;
      aState = STATE_NORMAL;
      break;

    case STATE_NORMAL:
    case STATE_HALF_CODE_POINT:
    default:
      break;
  }

  if (src == srcEnd) {
    *aDestLength = dest - aDest;
    return NS_OK;
  }

  PRUnichar oddHighSurrogate = aOddHighSurrogate;

  const char* srcEvenEnd;

  PRUnichar u;

  // True while |u| was assembled from |aOddByte| plus one byte of |src|
  // (rather than from two bytes of |src|); needed to un-read it correctly.
  bool unitFromOddByte = false;

  if (aState == STATE_HALF_CODE_POINT) {
    // The jump below bypasses the loop's own capacity check, so do it here
    // before consuming anything.
    if (dest == destEnd)
      goto error;

    // the 1st byte of a 16-bit code unit was stored in |aOddByte| in the
    // previous run while the 2nd byte has to come from |*src|.
    aState = STATE_NORMAL;
    // Go through PRUint8 so that bytes >= 0x80 are not sign-extended when
    // plain |char| is signed.
#ifdef IS_BIG_ENDIAN
    u = (PRUnichar(aOddByte) << 8) | PRUint8(*src++); // safe, we know we have at least one byte.
#else
    u = (PRUnichar(PRUint8(*src++)) << 8) | aOddByte; // safe, we know we have at least one byte.
#endif
    unitFromOddByte = true;
    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    goto have_codepoint;
  } else {
    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
  }

  while (src != srcEvenEnd) {
    if (dest == destEnd)
      goto error;

#if !defined(__sparc__) && !defined(__arm__)
    u = *(const PRUnichar*)src;
#else
    memcpy(&u, src, 2);
#endif
    src += 2;
    unitFromOddByte = false;

have_codepoint:
    if (aSwapBytes)
      u = u << 8 | u >> 8;

    if (!IS_SURROGATE(u)) {
      if (oddHighSurrogate) {
        // The pending high surrogate never got its partner.
        *dest++ = UCS2_REPLACEMENT_CHAR;
        oddHighSurrogate = 0;
        if (dest == destEnd) {
          // No room left for |u|: un-read it so the next call decodes it
          // again instead of silently dropping it.
          if (unitFromOddByte) {
            --src;
            aState = STATE_HALF_CODE_POINT; // |aOddByte| is still intact
          } else {
            src -= 2;
          }
          goto error;
        }
      }
      *dest++ = u;
    } else if (NS_IS_HIGH_SURROGATE(u)) {
      if (oddHighSurrogate) {
        // Two high surrogates in a row: the first one is orphaned.
        *dest++ = UCS2_REPLACEMENT_CHAR;
      }
      // Remembering |u| needs no output space; a full buffer is detected at
      // the top of the next iteration.
      oddHighSurrogate = u;
    }
    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
      if (oddHighSurrogate && *aDestLength > 1) {
        if (dest + 1 >= destEnd) {
          // Only one slot left: stash the whole pair for the next call.
          aOddLowSurrogate = u;
          aOddHighSurrogate = oddHighSurrogate;
          aState = STATE_ODD_SURROGATE_PAIR;
          goto error;
        }
        *dest++ = oddHighSurrogate;
        *dest++ = u;
      } else {
        // Either a lone low surrogate, or a pair that can never fit in a
        // one-unit output buffer.
        *dest++ = UCS2_REPLACEMENT_CHAR;
      }
      oddHighSurrogate = 0;
    }
  }
  if (src != srcEnd) {
    // store the lead byte of a 16-bit unit for the next run.
    aOddByte = *src++;
    aState = STATE_HALF_CODE_POINT;
  }

  aOddHighSurrogate = oddHighSurrogate;

  *aDestLength = dest - aDest;
  *aSrcLength = src - aSrc;
  return NS_OK;

error:
  // Persist the surrogate bookkeeping on this exit too; otherwise a high
  // surrogate consumed or resolved during this call would be forgotten or
  // reported twice.
  aOddHighSurrogate = oddHighSurrogate;
  *aDestLength = dest - aDest;
  *aSrcLength = src - aSrc;
  return NS_OK_UDEC_MOREOUTPUT;
}
|
|
|
|
|
|
|
|
/**
 * Returns the decoder to its initial condition: no carried-over byte or
 * surrogates, and the next Convert() call is treated as the first one.
 */
NS_IMETHODIMP
nsUTF16ToUnicodeBase::Reset()
{
  // Drop everything carried over from earlier buffers...
  mOddByte = 0;
  mOddHighSurrogate = 0;
  mOddLowSurrogate = 0;
  // ...and start over.
  mState = STATE_FIRST_CALL;
  return NS_OK;
}
|
|
|
|
|
|
|
|
/**
 * Computes an output size, in PRUnichars, for |aSrcLength| input bytes,
 * counting data left over from the previous Convert() call.
 * |aSrc| is not inspected.
 */
NS_IMETHODIMP
nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength,
                                   PRInt32 * aDestLength)
{
  // A byte held back from the previous run pairs up with the new input.
  PRInt32 byteCount = aSrcLength;
  if (STATE_HALF_CODE_POINT == mState) {
    ++byteCount;
  }

  PRInt32 unitCount = byteCount / 2;

  // Surrogates carried over from the previous run may each add one unit.
  if (mOddHighSurrogate) {
    ++unitCount;
  }
  if (mOddLowSurrogate) {
    ++unitCount;
  }

  *aDestLength = unitCount;
  return NS_OK;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Decodes big-endian UTF-16.
 *
 * On the first call a buffer shorter than two bytes cannot be examined for a
 * BOM: an empty buffer yields NS_OK, a one-byte buffer
 * NS_ERROR_ILLEGAL_INPUT, and nothing is consumed or produced in either
 * case. Otherwise the work is delegated to |UTF16ConvertToUnicode|, with
 * byte swapping enabled on little-endian builds.
 *
 * @param aSrc        input bytes.
 * @param aSrcLength  in: number of input bytes; out: bytes consumed.
 * @param aDest       output buffer.
 * @param aDestLength in: capacity in PRUnichars; out: units written.
 */
NS_IMETHODIMP
nsUTF16BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
                            PRUnichar * aDest, PRInt32 * aDestLength)
{
  if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
  {
    nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
    *aSrcLength=0;
    *aDestLength=0;
    return res;
  }
#ifdef IS_LITTLE_ENDIAN
  // Remove the BOM if we're little-endian. The 'same endian' case with the
  // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
  if(STATE_FIRST_CALL == mState) // Called for the first time.
  {
    mState = STATE_NORMAL;
    // |aSrc| is a byte pointer with no alignment guarantee, so copy the
    // first unit out instead of dereferencing it as a PRUnichar.
    PRUnichar firstUnit;
    memcpy(&firstUnit, aSrc, sizeof(firstUnit));
    if(0xFFFE == firstUnit) {
      // eliminate BOM (on LE machines, BE BOM is 0xFFFE)
      mState = STATE_FOUND_BOM;
    } else if(0xFEFF == firstUnit) {
      // A little-endian BOM contradicts this decoder's byte order.
      *aSrcLength=0;
      *aDestLength=0;
      return NS_ERROR_ILLEGAL_INPUT;
    }
  }
#endif

  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
                                      mOddLowSurrogate,
                                      aSrc, aSrcLength, aDest, aDestLength,
#ifdef IS_LITTLE_ENDIAN
                                      true
#else
                                      false
#endif
                                      );
  return rv;
}
|
|
|
|
|
|
|
|
/**
 * Decodes little-endian UTF-16.
 *
 * On the first call a buffer shorter than two bytes cannot be examined for a
 * BOM: an empty buffer yields NS_OK, a one-byte buffer
 * NS_ERROR_ILLEGAL_INPUT, and nothing is consumed or produced in either
 * case. Otherwise the work is delegated to |UTF16ConvertToUnicode|, with
 * byte swapping enabled on big-endian builds.
 *
 * @param aSrc        input bytes.
 * @param aSrcLength  in: number of input bytes; out: bytes consumed.
 * @param aDest       output buffer.
 * @param aDestLength in: capacity in PRUnichars; out: units written.
 */
NS_IMETHODIMP
nsUTF16LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
                            PRUnichar * aDest, PRInt32 * aDestLength)
{
  if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
  {
    nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
    *aSrcLength=0;
    *aDestLength=0;
    return res;
  }
#ifdef IS_BIG_ENDIAN
  // Remove the BOM if we're big-endian. The 'same endian' case with the
  // leading BOM will be taken care of by |UTF16ConvertToUnicode|.
  if(STATE_FIRST_CALL == mState) // first time called
  {
    mState = STATE_NORMAL;
    // |aSrc| is a byte pointer with no alignment guarantee, so copy the
    // first unit out instead of dereferencing it as a PRUnichar.
    PRUnichar firstUnit;
    memcpy(&firstUnit, aSrc, sizeof(firstUnit));
    if(0xFFFE == firstUnit) {
      // eliminate BOM (on BE machines, LE BOM is 0xFFFE)
      mState = STATE_FOUND_BOM;
    } else if(0xFEFF == firstUnit) {
      // A big-endian BOM contradicts this decoder's byte order.
      *aSrcLength=0;
      *aDestLength=0;
      return NS_ERROR_ILLEGAL_INPUT;
    }
  }
#endif

  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
                                      mOddLowSurrogate,
                                      aSrc, aSrcLength, aDest, aDestLength,
#ifdef IS_BIG_ENDIAN
                                      true
#else
                                      false
#endif
                                      );
  return rv;
}
|
|
|
|
|
|
|
|
/**
 * Forgets the detected byte order and whether a BOM was seen, then resets
 * the shared base-class decoding state.
 */
NS_IMETHODIMP
nsUTF16ToUnicode::Reset()
{
  mFoundBOM = false;
  mEndian = kUnknown;
  return nsUTF16ToUnicodeBase::Reset();
}
|
|
|
|
|
|
|
|
/**
 * Decodes UTF-16 whose byte order is not known in advance.
 *
 * On the first call the first two bytes decide the byte order: a BOM if
 * present, otherwise a guess based on which of the two bytes is zero, falling
 * back to big-endian. A first buffer shorter than two bytes cannot be
 * examined: an empty one yields NS_OK, a one-byte one
 * NS_ERROR_ILLEGAL_INPUT, with nothing consumed or produced.
 *
 * @param aSrc        input bytes.
 * @param aSrcLength  in: number of input bytes; out: bytes consumed.
 * @param aDest       output buffer.
 * @param aDestLength in: capacity in PRUnichars; out: units written.
 *
 * @return the result of |UTF16ConvertToUnicode|, except that NS_OK becomes
 *         NS_OK_UDEC_NOBOMFOUND while no BOM has been seen.
 */
NS_IMETHODIMP
nsUTF16ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
                          PRUnichar * aDest, PRInt32 * aDestLength)
{
  if (STATE_FIRST_CALL == mState) { // first time called
    if (*aSrcLength < 2) {
      // Not enough bytes to look for a BOM.
      const nsresult shortResult =
        (0 == *aSrcLength) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
      *aSrcLength = 0;
      *aDestLength = 0;
      return shortResult;
    }

    const PRUint8 firstByte = PRUint8(aSrc[0]);
    const PRUint8 secondByte = PRUint8(aSrc[1]);

    // check if BOM (0xFEFF) is at the beginning, remove it if found, and
    // set mEndian accordingly.
    const bool littleEndianBOM = (0xFF == firstByte && 0xFE == secondByte);
    const bool bigEndianBOM = (0xFE == firstByte && 0xFF == secondByte);

    if (littleEndianBOM || bigEndianBOM) {
      mState = STATE_FOUND_BOM;
      mEndian = littleEndianBOM ? kLittleEndian : kBigEndian;
      mFoundBOM = true;
    } else {
      mState = STATE_NORMAL;
      // BOM is not found, but we can use a simple heuristic to determine
      // the endianness. Assume the first character is [U+0001, U+00FF].
      // Not always valid, but it's very likely to hold for html/xml/css.
      //   0xhh 0x00 (hh != 00)  -> little-endian
      //   0x00 0xhh (hh != 00)  -> big-endian
      // Neither BOM nor 'plausible' byte patterns at the beginning:
      // just assume it's BE (following Unicode standard)
      // and let the garbage show up in the browser. (security concern?)
      // (bug 246194)
      mEndian = (firstByte && !secondByte) ? kLittleEndian : kBigEndian;
    }
  }

  // Swap bytes whenever the detected order differs from the build's order.
#ifdef IS_BIG_ENDIAN
  const bool swapBytes = (mEndian == kLittleEndian);
#elif defined(IS_LITTLE_ENDIAN)
  const bool swapBytes = (mEndian == kBigEndian);
#else
  #error "Unknown endianness"
#endif

  const nsresult rv = UTF16ConvertToUnicode(mState, mOddByte,
                                            mOddHighSurrogate,
                                            mOddLowSurrogate,
                                            aSrc, aSrcLength,
                                            aDest, aDestLength,
                                            swapBytes);

  // If BOM is not found and we're to return NS_OK, signal that BOM
  // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
  if (NS_OK == rv && !mFoundBOM) {
    return NS_OK_UDEC_NOBOMFOUND;
  }
  return rv;
}
|