gecko/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp

351 lines
9.4 KiB
C++
Raw Normal View History

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2012-05-21 04:12:37 -07:00
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsUTF16ToUnicode.h"
#include "nsCharTraits.h"
#include "mozilla/Endian.h"
#include <string.h> // memcpy
// Decoder states kept in |mState|. The values are bit patterns:
// STATE_SECOND_BYTE deliberately contains the STATE_HALF_CODE_POINT bit, which
// is what GetMaxLength() tests to account for a byte left over from the
// previous run.
enum {
  // Steady state: no partial code unit is pending.
  STATE_NORMAL             = 0,
  // The first byte of a 16-bit code unit is stored in |mOddByte|.
  STATE_HALF_CODE_POINT    = 1 << 0,
  // Nothing has been converted since the last Reset(); a BOM may follow.
  STATE_FIRST_CALL         = 1 << 1,
  // The first byte of a possible BOM has been seen; waiting for the second.
  STATE_SECOND_BYTE        = STATE_HALF_CODE_POINT | STATE_FIRST_CALL,
  // A complete surrogate pair is stored in |mOddHighSurrogate| and
  // |mOddLowSurrogate| because the output buffer had no room for it.
  STATE_ODD_SURROGATE_PAIR = 1 << 2
};
/**
 * Streaming UTF-16 decoder core shared by the converters in this file.
 *
 * Reads 16-bit code units from |aSrc|, optionally byte-swapping each one, and
 * writes them to |aDest|. State that spans calls lives in members:
 * |mState|, |mOddByte| (first byte of a split code unit), |mOddHighSurrogate|
 * (a high surrogate still waiting for its low half) and |mOddLowSurrogate|
 * (only used while a whole pair is parked in STATE_ODD_SURROGATE_PAIR).
 *
 * An unpaired surrogate either aborts with NS_ERROR_ILLEGAL_INPUT (when
 * |mErrBehavior| is kOnError_Signal) or is replaced by UCS2_REPLACEMENT_CHAR.
 *
 * @param aSrc        input bytes
 * @param aSrcLength  in: number of bytes available at |aSrc|;
 *                    out: number of bytes consumed
 * @param aDest       output buffer
 * @param aDestLength in: capacity of |aDest| in char16_t units;
 *                    out: number of units written
 * @param aSwapBytes  true when every code unit read must be byte-swapped
 * @return NS_OK when all input was converted and nothing is pending,
 *         NS_OK_UDEC_MOREINPUT when an odd byte or a high surrogate is
 *         pending, NS_OK_UDEC_MOREOUTPUT when |aDest| is too small, or
 *         NS_ERROR_ILLEGAL_INPUT as described above.
 */
nsresult
nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
                                            int32_t * aSrcLength,
                                            char16_t * aDest,
                                            int32_t * aDestLength,
                                            bool aSwapBytes)
{
  const char* src = aSrc;
  const char* srcEnd = aSrc + *aSrcLength;

  char16_t* dest = aDest;
  char16_t* destEnd = aDest + *aDestLength;

  // Both locals are initialized before the first |goto| so that no jump
  // bypasses an initialization and so that the |error| exit can always write
  // |oddHighSurrogate| back to the member.
  char16_t oddHighSurrogate = mOddHighSurrogate;
  // How many bytes of *this* call's input make up the code unit currently in
  // |u|: 2 in the main loop, 1 when the first byte came from |mOddByte|.
  int32_t unitBytes = 2;

  switch(mState) {
    case STATE_FIRST_CALL:
      // The caller found a BOM at the start of the buffer: skip it.
      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
      src+=2;
      mState = STATE_NORMAL;
      break;

    case STATE_SECOND_BYTE:
      // The caller found the second BOM byte at the start of the buffer.
      NS_ASSERTION(*aSrcLength > 0, "buffer too short");
      src++;
      mState = STATE_NORMAL;
      break;

    case STATE_ODD_SURROGATE_PAIR:
      // A complete pair was parked by the previous run; flush it first.
      if (*aDestLength < 2)
        goto error;
      else {
        *dest++ = mOddHighSurrogate;
        *dest++ = mOddLowSurrogate;
        mOddHighSurrogate = mOddLowSurrogate = 0;
        mState = STATE_NORMAL;
      }
      break;

    case STATE_NORMAL:
    case STATE_HALF_CODE_POINT:
    default:
      break;
  }

  // Re-read the member: the STATE_ODD_SURROGATE_PAIR case above clears it.
  oddHighSurrogate = mOddHighSurrogate;

  if (src == srcEnd) {
    *aDestLength = dest - aDest;
    return (mState != STATE_NORMAL || oddHighSurrogate) ?
           NS_OK_UDEC_MOREINPUT : NS_OK;
  }

  const char* srcEvenEnd;

  char16_t u;
  if (mState == STATE_HALF_CODE_POINT) {
    if (dest == destEnd)
      goto error;

    // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
    // previous run while the 2nd byte has to come from |*src|.
    mState = STATE_NORMAL;
    unitBytes = 1;
    // Assemble the unit in native byte order, exactly as the 2-byte read in
    // the main loop would. The incoming byte is widened through uint8_t so
    // that a negative |char| is never shifted.
#if MOZ_BIG_ENDIAN
    u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
#else
    u = (uint8_t(*src++) << 8) | mOddByte; // safe, we know we have at least one byte.
#endif
    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    goto have_codepoint;
  } else {
    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
  }

  while (src != srcEvenEnd) {
    if (dest == destEnd)
      goto error;

    // |src| carries no alignment guarantee, so copy the two bytes instead of
    // dereferencing a cast pointer.
    memcpy(&u, src, sizeof(u));
    src += 2;
    unitBytes = 2;

have_codepoint:
    if (aSwapBytes)
      u = u << 8 | u >> 8;

    if (!IS_SURROGATE(u)) {
      if (oddHighSurrogate) {
        // The pending high surrogate is not followed by a low surrogate.
        if (mErrBehavior == kOnError_Signal) {
          goto error2;
        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
        oddHighSurrogate = 0;
        if (dest == destEnd) {
          // The orphan has been dealt with but there is no room left for |u|.
          // Hand |u| back to the caller instead of dropping it: rewind the
          // bytes it took from this buffer, and if its first byte came from
          // |mOddByte| (still intact) re-enter the half-code-point state.
          src -= unitBytes;
          if (unitBytes == 1)
            mState = STATE_HALF_CODE_POINT;
          goto error;
        }
      }
      *dest++ = u;
    } else if (NS_IS_HIGH_SURROGATE(u)) {
      if (oddHighSurrogate) {
        // Two high surrogates in a row: the earlier one is an orphan.
        if (mErrBehavior == kOnError_Signal) {
          goto error2;
        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
        // No overflow check is needed here: remembering |u| takes no output
        // space. A full buffer is detected at the top of the next iteration,
        // by which time |u| is safely held in |oddHighSurrogate|.
      }
      oddHighSurrogate = u;
    }
    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
      if (oddHighSurrogate && *aDestLength > 1) {
        if (dest + 1 >= destEnd) {
          // Room for only one unit: park the whole pair for the next run.
          mOddLowSurrogate = u;
          mOddHighSurrogate = oddHighSurrogate;
          mState = STATE_ODD_SURROGATE_PAIR;
          goto error;
        }
        *dest++ = oddHighSurrogate;
        *dest++ = u;
      } else {
        // Either a low surrogate with no preceding high surrogate, or an
        // output buffer whose total capacity is below two units.
        if (mErrBehavior == kOnError_Signal) {
          goto error2;
        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
      }
      oddHighSurrogate = 0;
    }
  }
  if (src != srcEnd) {
    // store the lead byte of a 16-bit unit for the next run.
    mOddByte = *src++;
    mState = STATE_HALF_CODE_POINT;
  }

  mOddHighSurrogate = oddHighSurrogate;

  *aDestLength = dest - aDest;
  *aSrcLength =  src  - aSrc;
  return (mState != STATE_NORMAL || oddHighSurrogate) ?
         NS_OK_UDEC_MOREINPUT : NS_OK;

error:
  // Persist the surrogate state as well: every code unit counted in
  // |*aSrcLength| has been fully processed, so the member must not keep a
  // high surrogate that was already resolved (or miss one that was just read).
  mOddHighSurrogate = oddHighSurrogate;
  *aDestLength = dest - aDest;
  *aSrcLength =  src  - aSrc;
  return  NS_OK_UDEC_MOREOUTPUT;

error2:
  // NOTE(review): the reported source length is one byte short of what was
  // read; presumably callers rely on this to resynchronise - confirm.
  *aDestLength = dest - aDest;
  *aSrcLength = --src - aSrc;
  return  NS_ERROR_ILLEGAL_INPUT;
}
/**
 * Discards all carried-over decoder state: clears the stored odd byte and
 * both stored surrogates, and puts the state machine back into
 * STATE_FIRST_CALL. Always returns NS_OK.
 */
NS_IMETHODIMP
nsUTF16ToUnicodeBase::Reset()
{
  mOddHighSurrogate = mOddLowSurrogate = 0;
  mOddByte = 0;
  mState = STATE_FIRST_CALL;
  return NS_OK;
}
/**
 * Computes an upper bound, in char16_t units, for the output of converting
 * |aSrcLength| more bytes given the state left behind by the previous run.
 *
 * @param aSrc        unused
 * @param aSrcLength  number of input bytes about to be converted
 * @param aDestLength out: the computed bound
 * @return always NS_OK
 */
NS_IMETHODIMP
nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength,
                                   int32_t * aDestLength)
{
  // A byte held over from the previous run combines with the new input.
  int32_t byteCount = aSrcLength;
  if (mState & STATE_HALF_CODE_POINT) {
    ++byteCount;
  }

  int32_t maxUnits = byteCount / 2;

  // Each stored surrogate may still produce one output unit.
  if (mOddHighSurrogate) {
    ++maxUnits;
  }
  if (mOddLowSurrogate) {
    ++maxUnits;
  }

  *aDestLength = maxUnits;
  return NS_OK;
}
/**
 * Decodes big-endian UTF-16. Until the first two bytes of the stream have
 * been seen, this looks for a big-endian BOM (bytes 0xFE 0xFF), which may be
 * split across two calls; |mState| is left at STATE_FIRST_CALL or
 * STATE_SECOND_BYTE to tell UTF16ConvertToUnicode() to skip the BOM bytes at
 * the start of this buffer. Everything else is delegated to
 * UTF16ConvertToUnicode().
 *
 * Parameters and return values are those of UTF16ConvertToUnicode(); in
 * addition NS_OK is returned for an empty first buffer and
 * NS_OK_UDEC_MOREINPUT while only part of a possible BOM is available.
 */
NS_IMETHODIMP
nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
                            char16_t * aDest, int32_t * aDestLength)
{
  switch (mState) {
    case STATE_FIRST_CALL:
      if (*aSrcLength < 2) {
        if (*aSrcLength < 1) {
          *aDestLength = 0;
          return NS_OK;
        }
        if (uint8_t(*aSrc) != 0xFE) {
          // Cannot be a BOM: treat the single byte as ordinary data.
          mState = STATE_NORMAL;
          break;
        }
        // Possibly the first half of a BOM; decide when the next byte arrives.
        *aDestLength = 0;
        mState = STATE_SECOND_BYTE;
        return NS_OK_UDEC_MOREINPUT;
      }
      // Compare the BOM byte by byte: |aSrc| has no alignment guarantee, and
      // a bytewise test is independent of the host byte order.
      if (uint8_t(aSrc[0]) != 0xFE || uint8_t(aSrc[1]) != 0xFF) {
        mState = STATE_NORMAL;
      }
      break;

    case STATE_SECOND_BYTE:
      if (*aSrcLength < 1) {
        *aDestLength = 0;
        return NS_OK_UDEC_MOREINPUT;
      }
      if (uint8_t(*aSrc) != 0xFF) {
        // Not a BOM after all: the 0xFE consumed earlier is the first byte of
        // a real code unit.
        mOddByte = 0xFE;
        mState = STATE_HALF_CODE_POINT;
      }
      break;
  }

  // Big-endian input needs swapping only on little-endian hosts.
  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
                               bool(MOZ_LITTLE_ENDIAN));
}
/**
 * Decodes little-endian UTF-16. Until the first two bytes of the stream have
 * been seen, this looks for a little-endian BOM (bytes 0xFF 0xFE), which may
 * be split across two calls; |mState| is left at STATE_FIRST_CALL or
 * STATE_SECOND_BYTE to tell UTF16ConvertToUnicode() to skip the BOM bytes at
 * the start of this buffer. Everything else is delegated to
 * UTF16ConvertToUnicode().
 *
 * Parameters and return values are those of UTF16ConvertToUnicode(); in
 * addition NS_OK is returned for an empty first buffer and
 * NS_OK_UDEC_MOREINPUT while only part of a possible BOM is available.
 */
NS_IMETHODIMP
nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
                            char16_t * aDest, int32_t * aDestLength)
{
  switch (mState) {
    case STATE_FIRST_CALL:
      if (*aSrcLength < 2) {
        if (*aSrcLength < 1) {
          *aDestLength = 0;
          return NS_OK;
        }
        if (uint8_t(*aSrc) != 0xFF) {
          // Cannot be a BOM: treat the single byte as ordinary data.
          mState = STATE_NORMAL;
          break;
        }
        // Possibly the first half of a BOM; decide when the next byte arrives.
        *aDestLength = 0;
        mState = STATE_SECOND_BYTE;
        return NS_OK_UDEC_MOREINPUT;
      }
      // Compare the BOM byte by byte: |aSrc| has no alignment guarantee, and
      // a bytewise test is independent of the host byte order.
      if (uint8_t(aSrc[0]) != 0xFF || uint8_t(aSrc[1]) != 0xFE) {
        mState = STATE_NORMAL;
      }
      break;

    case STATE_SECOND_BYTE:
      if (*aSrcLength < 1) {
        *aDestLength = 0;
        return NS_OK_UDEC_MOREINPUT;
      }
      if (uint8_t(*aSrc) != 0xFE) {
        // Not a BOM after all: the 0xFF consumed earlier is the first byte of
        // a real code unit.
        mOddByte = 0xFF;
        mState = STATE_HALF_CODE_POINT;
      }
      break;
  }

  // Little-endian input needs swapping only on big-endian hosts.
  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
                               bool(MOZ_BIG_ENDIAN));
}
/**
 * Resets the shared decoder state through nsUTF16ToUnicodeBase::Reset() and
 * forgets the detected byte order and whether a BOM was seen. Returns the
 * base class's result.
 */
NS_IMETHODIMP
nsUTF16ToUnicode::Reset()
{
  // The base reset touches only the state, odd byte and stored surrogates,
  // so the order relative to the two assignments below does not matter.
  const nsresult baseResult = nsUTF16ToUnicodeBase::Reset();
  mFoundBOM = false;
  mEndian = kUnknown;
  return baseResult;
}
/**
 * Decodes UTF-16 whose byte order is not known in advance. On the first call
 * after Reset() the first two bytes decide the byte order:
 *   - 0xFF 0xFE: little-endian BOM;
 *   - 0xFE 0xFF: big-endian BOM;
 *   - no BOM, first byte non-zero and second byte zero: little-endian;
 *   - anything else: big-endian.
 * When a BOM is found |mState| stays STATE_FIRST_CALL, so
 * UTF16ConvertToUnicode() skips it.
 *
 * A first buffer shorter than two bytes is not converted: an empty one yields
 * NS_OK, a one-byte one NS_ERROR_ILLEGAL_INPUT, and both lengths are set to 0.
 *
 * Otherwise returns the result of UTF16ConvertToUnicode(), except that NS_OK
 * becomes NS_OK_UDEC_NOBOMFOUND when no BOM has been seen.
 */
NS_IMETHODIMP
nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
                          char16_t * aDest, int32_t * aDestLength)
{
  if (mState == STATE_FIRST_CALL) {
    if (*aSrcLength < 2) {
      const nsresult status =
        (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
      *aSrcLength = 0;
      *aDestLength = 0;
      return status;
    }

    const uint8_t first = uint8_t(aSrc[0]);
    const uint8_t second = uint8_t(aSrc[1]);

    if (first == 0xFF && second == 0xFE) {
      mEndian = kLittleEndian;
      mFoundBOM = true;
    } else if (first == 0xFE && second == 0xFF) {
      mEndian = kBigEndian;
      mFoundBOM = true;
    } else {
      // No BOM. Use a simple heuristic: assume the first character is in
      // [U+0001, U+00FF], which is likely for html/xml/css. A "0xhh 0x00"
      // start (hh != 00) then means little-endian. Every other pattern,
      // including "0x00 0xhh", is taken as big-endian (following the Unicode
      // standard), letting garbage show up if the guess is wrong
      // (bug 246194).
      mState = STATE_NORMAL;
      mEndian = (first != 0 && second == 0) ? kLittleEndian : kBigEndian;
    }
  }

  // Swap bytes whenever the stream's byte order differs from the host's.
#if MOZ_BIG_ENDIAN
  const bool swapBytes = (mEndian == kLittleEndian);
#else
  const bool swapBytes = (mEndian == kBigEndian);
#endif

  const nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest,
                                            aDestLength, swapBytes);

  // A plain success is reported as "no BOM found" when none was seen; every
  // other result is passed through unchanged.
  if (rv == NS_OK && !mFoundBOM) {
    return NS_OK_UDEC_NOBOMFOUND;
  }
  return rv;
}