/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim:expandtab:shiftwidth=2:tabstop=2: */ /* ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Mozilla Communicator client code. * * The Initial Developer of the Original Code is * Netscape Communications Corporation. * Portions created by the Initial Developer are Copyright (C) 1998 * the Initial Developer. All Rights Reserved. * * Contributor(s): * Jungshik Shin * * Alternatively, the contents of this file may be used under the terms of * either of the GNU General Public License Version 2 or later (the "GPL"), * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ #include "nsUCSupport.h" #include "nsUTF32ToUnicode.h" #include "nsCharTraits.h" #include //---------------------------------------------------------------------- // static functions and macro definition common to nsUTF32(BE|LE)ToUnicode #if defined(IS_BIG_ENDIAN) || defined(__arm__) #define LE_STRING_TO_UCS4(s) \ (PRUint8(*(s)) | (PRUint8(*((s) + 1)) << 8) | \ (PRUint8(*((s) + 2)) << 16) | (PRUint8(*((s) + 3)) << 24)) #else #define LE_STRING_TO_UCS4(s) (*(PRUint32*) (s)) #endif #if defined(IS_BIG_ENDIAN) && !defined(__sparc__) #define BE_STRING_TO_UCS4(s) (*(PRUint32*) (s)) #else #define BE_STRING_TO_UCS4(s) \ (PRUint8(*((s) + 3)) | (PRUint8(*((s) + 2)) << 8) | \ (PRUint8(*((s) + 1)) << 16) | (PRUint8(*(s)) << 24)) #endif static nsresult ConvertCommon(const char * aSrc, PRInt32 * aSrcLength, PRUnichar * aDest, PRInt32 * aDestLength, PRUint16 * aState, PRUint8 * aBuffer, PRBool aIsLE) { NS_ENSURE_TRUE(*aState < 4, NS_ERROR_INVALID_ARG); NS_ENSURE_TRUE(*aDestLength > 0, NS_ERROR_INVALID_ARG); const char *src = aSrc; const char *srcEnd = aSrc + *aSrcLength; PRUnichar *dest = aDest; PRUnichar *destEnd = aDest + *aDestLength; if (*aState > *aSrcLength) { memcpy(aBuffer + 4 - *aState, src, *aSrcLength); *aDestLength = 0; *aState -= *aSrcLength; return NS_OK_UDEC_MOREINPUT; } PRUint32 ucs4; // prev. run left a partial UTF-32 seq. if (*aState > 0) { memcpy(aBuffer + 4 - *aState, src, *aState); ucs4 = aIsLE ? LE_STRING_TO_UCS4(aBuffer) : BE_STRING_TO_UCS4(aBuffer); if (ucs4 < 0x10000L) // BMP { *dest++= IS_SURROGATE(ucs4) ? UCS2_REPLACEMENT_CHAR : PRUnichar(ucs4); } else if (ucs4 < 0x110000L) // plane 1 through plane 16 { if (destEnd - dest < 2) { *aSrcLength = 0; *aDestLength = 0; return NS_OK_UDEC_MOREOUTPUT; } *dest++= H_SURROGATE(ucs4); *dest++= L_SURROGATE(ucs4); } // Codepoints in plane 17 and higher (> 0x10ffff) // are not representable in UTF-16 we use for the internal // character representation. This is not a problem // because Unicode/ISO 10646 will never assign characters // in plane 17 and higher. Therefore, we convert them // to Unicode replacement character (0xfffd). else *dest++ = UCS2_REPLACEMENT_CHAR; src += *aState; *aState = 0; } nsresult rv = NS_OK; // conversion result for ( ; src < srcEnd && dest < destEnd; src += 4) { if (srcEnd - src < 4) { // fill up aBuffer until src buffer gets exhausted. memcpy(aBuffer, src, srcEnd - src); *aState = 4 - (srcEnd - src); // set add. char to read in next run src = srcEnd; rv = NS_OK_UDEC_MOREINPUT; break; } ucs4 = aIsLE ? LE_STRING_TO_UCS4(src) : BE_STRING_TO_UCS4(src); if (ucs4 < 0x10000L) // BMP { *dest++= IS_SURROGATE(ucs4) ? UCS2_REPLACEMENT_CHAR : PRUnichar(ucs4); } else if (ucs4 < 0x110000L) // plane 1 through plane 16 { if (destEnd - dest < 2) break; // ((ucs4 - 0x10000) >> 10) + 0xd800; *dest++= H_SURROGATE(ucs4); *dest++= L_SURROGATE(ucs4); } else // plane 17 and higher *dest++ = UCS2_REPLACEMENT_CHAR; } //output not finished, output buffer too short if((NS_OK == rv) && (src < srcEnd) && (dest >= destEnd)) rv = NS_OK_UDEC_MOREOUTPUT; *aSrcLength = src - aSrc; *aDestLength = dest - aDest; return rv; } //---------------------------------------------------------------------- // Class nsUTF32ToUnicode [implementation] nsUTF32ToUnicodeBase::nsUTF32ToUnicodeBase() : nsBasicDecoderSupport() { Reset(); } //---------------------------------------------------------------------- // Subclassing of nsDecoderSupport class [implementation] NS_IMETHODIMP nsUTF32ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength, PRInt32 * aDestLength) { // Non-BMP characters take two PRUnichars(a pair of surrogate codepoints) // so that we have to divide by 2 instead of 4 for the worst case. *aDestLength = aSrcLength / 2; return NS_OK; } //---------------------------------------------------------------------- // Subclassing of nsBasicDecoderSupport class [implementation] NS_IMETHODIMP nsUTF32ToUnicodeBase::Reset() { // the number of additional bytes to read to complete UTF-32 4byte seq. mState = 0; memset(mBufferInc, 0, 4); return NS_OK; } //---------------------------------------------------------------------- // Class nsUTF32BEToUnicode [implementation] //---------------------------------------------------------------------- // Subclassing of nsUTF32ToUnicodeBase class [implementation] NS_IMETHODIMP nsUTF32BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength, PRUnichar * aDest, PRInt32 * aDestLength) { return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState, mBufferInc, PR_FALSE); } //---------------------------------------------------------------------- // Class nsUTF32LEToUnicode [implementation] //---------------------------------------------------------------------- // Subclassing of nsUTF32ToUnicodeBase class [implementation] NS_IMETHODIMP nsUTF32LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength, PRUnichar * aDest, PRInt32 * aDestLength) { return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState, mBufferInc, PR_TRUE); } //---------------------------------------------------------------------- // Class nsUTF32ToUnicode [implementation] //---------------------------------------------------------------------- // Subclassing of nsUTF32ToUnicodeBase class [implementation] NS_IMETHODIMP nsUTF32ToUnicode::Reset() { nsresult rv = nsUTF32ToUnicodeBase::Reset(); mState = 4; mEndian = kUnknown; mFoundBOM = PR_FALSE; return rv; } NS_IMETHODIMP nsUTF32ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength, PRUnichar * aDest, PRInt32 * aDestLength) { PRBool foundBOM = PR_FALSE; if (4 == mState) // Called for the first time. { if (*aSrcLength < 4) return NS_ERROR_ILLEGAL_INPUT; // check if BOM (0xFEFF) is at the beginning, remove it if found, and // set mEndian accordingly. if (0xFF == PRUint8(aSrc[0]) && 0xFE == PRUint8(aSrc[1]) && 0 == PRUint8(aSrc[2]) && 0 == PRUint8(aSrc[3])) { aSrc += 4; *aSrcLength -= 4; mState = 0; mEndian = kLittleEndian; mFoundBOM = foundBOM = PR_TRUE; } else if (0 == PRUint8(aSrc[0]) && 0 == PRUint8(aSrc[1]) && 0xFE == PRUint8(aSrc[2]) && 0xFF == PRUint8(aSrc[3])) { aSrc += 4; *aSrcLength -= 4; mState = 0; mEndian = kBigEndian; mFoundBOM = foundBOM = PR_TRUE; } // BOM is not found, but we can use a simple heuristic to determine // the endianness. Assume the first character is [U+0001, U+FFFF]. // Not always valid, but it's very likely to hold for html/xml/css. #if 0 // BE case will be handled below else if (!aSrc[0] && !aSrc[1] && (aSrc[2] || aSrc[3])) { // 0x00 0x00 0xhh 0xhh (hh != 00) mState = 0; mEndian = kBigEndian; } #endif else if ((aSrc[0] || aSrc[1]) && !aSrc[2] && !aSrc[3]) { // 0xhh 0xhh 0x00 0x00 (hh != 00) mState = 0; mEndian = kLittleEndian; } else { // Neither BOM nor 'plausible' byte patterns at the beginning. // Just assume it's BE (following Unicode standard) // and let the garbage show up in the browser. (security concern?) // (bug 246194) mState = 0; mEndian = kBigEndian; } } nsresult rv = ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState, mBufferInc, mEndian == kLittleEndian); if (foundBOM) *aSrcLength += 4; // need to consume BOM // If BOM is not found and we're to return NS_OK, signal that BOM // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode| return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv; } // XXX : What to do with 'unflushed' mBufferInc?? : Finish()