gecko/intl/uconv/ucvlatin/nsUTF32ToUnicode.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:expandtab:shiftwidth=2:tabstop=2: 
 */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Communicator client code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 1998
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Jungshik Shin <jshin@mailaps.org>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nsUCSupport.h"
#include "nsUTF32ToUnicode.h"
#include "nsCharTraits.h"
#include <string.h>

//----------------------------------------------------------------------
// static functions and macro definition common to nsUTF32(BE|LE)ToUnicode

// Assemble one 32-bit code unit from the four bytes starting at |s|.
//
// Both macros build the value byte by byte regardless of host endianness:
//  * |s| is a byte pointer with no alignment guarantee (the source pointer is
//    advanced by 1-3 bytes after a carried-over partial sequence), so reading
//    it through a PRUint32* would be an unaligned, type-punned access.
//  * Every byte is widened to PRUint32 before shifting; a PRUint8 operand
//    would be promoted to (signed) int and a byte >= 0x80 shifted left by 24
//    would overflow it.
//
// NOTE: |s| is evaluated four times; pass a plain pointer expression without
// side effects.
#define LE_STRING_TO_UCS4(s)                                       \
        (PRUint32(PRUint8(*(s))) |                                 \
         (PRUint32(PRUint8(*((s) + 1))) << 8) |                    \
         (PRUint32(PRUint8(*((s) + 2))) << 16) |                   \
         (PRUint32(PRUint8(*((s) + 3))) << 24))

#define BE_STRING_TO_UCS4(s)                                       \
        (PRUint32(PRUint8(*((s) + 3))) |                           \
         (PRUint32(PRUint8(*((s) + 2))) << 8) |                    \
         (PRUint32(PRUint8(*((s) + 1))) << 16) |                   \
         (PRUint32(PRUint8(*(s))) << 24))
 
/**
 * Decode UTF-32 bytes into UTF-16; shared by all decoders in this file.
 *
 * A code unit that is split across two calls is kept in |aBuffer|, and
 * |*aState| records how many more bytes are needed to complete it.
 *
 * @param aSrc        input bytes
 * @param aSrcLength  in: number of bytes available at |aSrc|;
 *                    out: number of bytes consumed (left untouched when the
 *                    whole input was swallowed into |aBuffer| without
 *                    completing a code unit)
 * @param aDest       output buffer
 * @param aDestLength in: capacity of |aDest| in PRUnichars, must be > 0;
 *                    out: number of PRUnichars written
 * @param aState      in/out: bytes still missing from the code unit held in
 *                    |aBuffer|; must be in [0, 3] on entry
 * @param aBuffer     4-byte scratch area holding an incomplete code unit
 * @param aIsLE       true to read the input as little-endian, false as
 *                    big-endian
 * @return NS_OK when all input was converted, NS_OK_UDEC_MOREINPUT when the
 *         input ended inside a code unit, NS_OK_UDEC_MOREOUTPUT when |aDest|
 *         is too small for the remaining input, NS_ERROR_INVALID_ARG when
 *         |*aState| or |*aDestLength| is out of range.
 */
static nsresult ConvertCommon(const char * aSrc,
                              PRInt32 * aSrcLength,
                              PRUnichar * aDest,
                              PRInt32 * aDestLength,
                              PRUint16 * aState,
                              PRUint8  * aBuffer,
                              PRBool aIsLE)
{

  NS_ENSURE_TRUE(*aState < 4, NS_ERROR_INVALID_ARG);
  NS_ENSURE_TRUE(*aDestLength > 0, NS_ERROR_INVALID_ARG);

  const char *src = aSrc;
  const char *srcEnd = aSrc + *aSrcLength;

  PRUnichar *dest = aDest;
  PRUnichar *destEnd = aDest + *aDestLength;

  // Not even enough input to complete the pending code unit: stash what we
  // got behind the bytes already buffered and ask for more.
  if (*aState > *aSrcLength)
  {
    memcpy(aBuffer + 4 - *aState, src, *aSrcLength);
    *aDestLength = 0;
    *aState -= *aSrcLength;
    return NS_OK_UDEC_MOREINPUT;
  }

  PRUint32 ucs4;

  // prev. run left a partial UTF-32 seq.
  if (*aState > 0)
  {
    // Complete the buffered code unit with the first |*aState| input bytes.
    memcpy(aBuffer + 4 - *aState, src, *aState);
    ucs4 =  aIsLE ? LE_STRING_TO_UCS4(aBuffer) : BE_STRING_TO_UCS4(aBuffer);
    if (ucs4 < 0x10000L)  // BMP
    {
      // A lone surrogate code point is not a valid UTF-32 value.
      *dest++= IS_SURROGATE(ucs4) ? UCS2_REPLACEMENT_CHAR : PRUnichar(ucs4);
    }
    else if (ucs4 < 0x110000L)  // plane 1 through plane 16
    {
      if (destEnd - dest < 2)
      {
        // Nothing consumed, nothing written; |*aState| is left as is so the
        // next call redoes the completion with a larger output buffer.
        *aSrcLength = 0;
        *aDestLength = 0;
        return NS_OK_UDEC_MOREOUTPUT;
      }
      *dest++= H_SURROGATE(ucs4);
      *dest++= L_SURROGATE(ucs4);
    }
    // Codepoints in plane 17 and higher (> 0x10ffff)
    // are not representable in UTF-16 we use for the internal
    // character representation. This is not a problem
    // because Unicode/ISO 10646 will never assign characters
    // in plane 17 and higher. Therefore, we convert them
    // to Unicode replacement character (0xfffd).
    else
      *dest++ = UCS2_REPLACEMENT_CHAR;
    src += *aState;
    *aState = 0;
  }

  nsresult rv = NS_OK;  // conversion result

  for ( ; src < srcEnd && dest < destEnd; src += 4)
  {
    if (srcEnd - src < 4)
    {
      // fill up aBuffer until src buffer gets exhausted.
      memcpy(aBuffer, src, srcEnd - src);
      *aState = 4 - (srcEnd - src); // set add. char to read in next run
      src = srcEnd;
      rv = NS_OK_UDEC_MOREINPUT;
      break;
    }

    ucs4 =  aIsLE ? LE_STRING_TO_UCS4(src) : BE_STRING_TO_UCS4(src);
    if (ucs4 < 0x10000L)  // BMP
    {
      *dest++= IS_SURROGATE(ucs4) ? UCS2_REPLACEMENT_CHAR : PRUnichar(ucs4);
    }
    else if (ucs4 < 0x110000L)  // plane 1 through plane 16
    {
      if (destEnd - dest < 2)
      {
        // Only one slot is left but a surrogate pair needs two. The
        // post-loop check below does not catch this case (dest < destEnd
        // still holds), so report the short output buffer here instead of
        // falling through with NS_OK and unconsumed input.
        rv = NS_OK_UDEC_MOREOUTPUT;
        break;
      }
      // ((ucs4 - 0x10000) >> 10) + 0xd800;
      *dest++= H_SURROGATE(ucs4);
      *dest++= L_SURROGATE(ucs4);
    }
    else                       // plane 17 and higher
      *dest++ = UCS2_REPLACEMENT_CHAR;
  }

  //output not finished, output buffer too short
  if((NS_OK == rv) && (src < srcEnd) && (dest >= destEnd))
    rv = NS_OK_UDEC_MOREOUTPUT;

  *aSrcLength = src - aSrc;
  *aDestLength  = dest - aDest;

  return rv;
}


//----------------------------------------------------------------------
// Class nsUTF32ToUnicode [implementation]

nsUTF32ToUnicodeBase::nsUTF32ToUnicodeBase()
  : nsBasicDecoderSupport()
{
  // Start with no pending bytes. The call is spelled with its class name
  // because, inside this constructor, Reset() resolves to this class's
  // version anyway.
  nsUTF32ToUnicodeBase::Reset();
}

//----------------------------------------------------------------------
// Subclassing of nsDecoderSupport class [implementation]

/**
 * Give an upper bound for the number of PRUnichars that converting
 * |aSrcLength| more input bytes can produce.
 *
 * @param aSrc        input bytes (not inspected)
 * @param aSrcLength  number of input bytes about to be converted
 * @param aDestLength out: upper bound for the output length in PRUnichars
 * @return NS_OK
 */
NS_IMETHODIMP nsUTF32ToUnicodeBase::GetMaxLength(const char * aSrc,
                                                 PRInt32 aSrcLength,
                                                 PRInt32 * aDestLength)
{
  // Non-BMP characters take two PRUnichars (a pair of surrogate codepoints),
  // so every complete 4-byte code unit may need two output slots.
  //
  // Up to three bytes of an incomplete code unit can be pending from the
  // previous Convert() call, so |aSrcLength| new bytes may complete one code
  // unit more than aSrcLength / 4. Counting that extra unit also keeps the
  // estimate non-zero, which Convert() requires of its output buffer.
  *aDestLength = (aSrcLength / 4 + 1) * 2;
  return NS_OK;
}


//----------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [implementation]

NS_IMETHODIMP nsUTF32ToUnicodeBase::Reset()
{
  // Forget any bytes of an incomplete 4-byte sequence kept in the buffer.
  memset(mBufferInc, 0, 4);

  // mState is the number of additional bytes to read to complete a UTF-32
  // 4-byte sequence; zero means nothing is pending.
  mState = 0;

  return NS_OK;
}


//----------------------------------------------------------------------
// Class nsUTF32BEToUnicode [implementation]

//----------------------------------------------------------------------
// Subclassing of nsUTF32ToUnicodeBase class [implementation]

NS_IMETHODIMP nsUTF32BEToUnicode::Convert(const char * aSrc,
                                          PRInt32 * aSrcLength,
                                          PRUnichar * aDest,
                                          PRInt32 * aDestLength)
{
  // This decoder always reads its input as big-endian; no BOM handling is
  // done here.
  const PRBool inputIsLittleEndian = PR_FALSE;

  const nsresult status = ConvertCommon(aSrc, aSrcLength,
                                        aDest, aDestLength,
                                        &mState, mBufferInc,
                                        inputIsLittleEndian);
  return status;
}

//----------------------------------------------------------------------
// Class nsUTF32LEToUnicode [implementation]
  
//----------------------------------------------------------------------
// Subclassing of nsUTF32ToUnicodeBase class [implementation]

NS_IMETHODIMP nsUTF32LEToUnicode::Convert(const char * aSrc,
                                          PRInt32 * aSrcLength,
                                          PRUnichar * aDest,
                                          PRInt32 * aDestLength)
{
  // This decoder always reads its input as little-endian; no BOM handling is
  // done here.
  const PRBool inputIsLittleEndian = PR_TRUE;

  const nsresult status = ConvertCommon(aSrc, aSrcLength,
                                        aDest, aDestLength,
                                        &mState, mBufferInc,
                                        inputIsLittleEndian);
  return status;
}

//----------------------------------------------------------------------
// Class nsUTF32ToUnicode [implementation]

//----------------------------------------------------------------------
// Subclassing of nsUTF32ToUnicodeBase class [implementation]

NS_IMETHODIMP nsUTF32ToUnicode::Reset()
{
  // Let the base class clear the carry-over buffer first; its result is what
  // this method reports.
  const nsresult baseResult = nsUTF32ToUnicodeBase::Reset();

  // Byte order is undecided again and no BOM has been seen.
  mFoundBOM = PR_FALSE;
  mEndian = kUnknown;

  // 4 is the marker Convert() tests for to recognise its first call; it must
  // be set after the base-class Reset(), which zeroes mState.
  mState = 4;

  return baseResult;
}

/**
 * Decode UTF-32 whose byte order is not known in advance.
 *
 * On the first call after Reset() the leading four bytes decide the byte
 * order: a BOM if present (it is consumed but not output), otherwise a
 * heuristic on the position of the zero bytes, otherwise big-endian.
 * Subsequent calls keep using that byte order.
 *
 * @param aSrc        input bytes
 * @param aSrcLength  in: bytes available; out: bytes consumed, BOM included
 * @param aDest       output buffer
 * @param aDestLength in: capacity in PRUnichars; out: PRUnichars written
 * @return NS_ERROR_ILLEGAL_INPUT if the first call supplies fewer than four
 *         bytes; NS_OK_UDEC_NOBOMFOUND in place of NS_OK when no BOM has
 *         been seen; otherwise whatever ConvertCommon returns.
 */
NS_IMETHODIMP nsUTF32ToUnicode::Convert(const char * aSrc,
                                        PRInt32 * aSrcLength,
                                        PRUnichar * aDest,
                                        PRInt32 * aDestLength)
{
  // Set only on the call that actually skips a BOM, so that the four skipped
  // bytes can be added back to the consumed count below.
  PRBool skippedBOM = PR_FALSE;

  if (4 == mState) // Called for the first time.
  {
    if (*aSrcLength < 4)
      return NS_ERROR_ILLEGAL_INPUT;

    const PRUint8 b0 = PRUint8(aSrc[0]);
    const PRUint8 b1 = PRUint8(aSrc[1]);
    const PRUint8 b2 = PRUint8(aSrc[2]);
    const PRUint8 b3 = PRUint8(aSrc[3]);

    // BOM (U+FEFF) in little-endian and big-endian byte order respectively.
    const PRBool isLittleEndianBOM =
      (0xFF == b0 && 0xFE == b1 && 0 == b2 && 0 == b3);
    const PRBool isBigEndianBOM =
      (0 == b0 && 0 == b1 && 0xFE == b2 && 0xFF == b3);

    if (isLittleEndianBOM || isBigEndianBOM) {
      // Remove the BOM from the input and remember the byte order it names.
      if (isLittleEndianBOM)
        mEndian = kLittleEndian;
      else
        mEndian = kBigEndian;
      mFoundBOM = PR_TRUE;
      skippedBOM = PR_TRUE;
      aSrc += 4;
      *aSrcLength -= 4;
    }
    else if ((b0 || b1) && !b2 && !b3) {
      // No BOM, but a simple heuristic applies: assume the first character
      // is in [U+0001, U+FFFF]. Not always valid, but very likely to hold
      // for html/xml/css. Bytes 0xhh 0xhh 0x00 0x00 (hh != 00) then mean
      // little-endian. The mirrored pattern 0x00 0x00 0xhh 0xhh needs no
      // test of its own because it ends up in the big-endian default below.
      mEndian = kLittleEndian;
    }
    else {
      // Neither BOM nor 'plausible' byte patterns at the beginning.
      // Just assume it's BE (following Unicode standard)
      // and let the garbage show up in the browser. (security concern?)
      // (bug 246194)
      mEndian = kBigEndian;
    }

    // Byte order is settled; from here on mState is the pending-byte count
    // that ConvertCommon maintains.
    mState = 0;
  }

  const PRBool inputIsLittleEndian = (mEndian == kLittleEndian);
  nsresult rv = ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,
                              mBufferInc, inputIsLittleEndian);

  if (skippedBOM)
    *aSrcLength += 4; // need to consume BOM

  // If BOM is not found and we're to return NS_OK, signal that BOM
  // is not found. Otherwise, return |rv| from |ConvertCommon|.
  if (rv == NS_OK && !mFoundBOM)
    return NS_OK_UDEC_NOBOMFOUND;
  return rv;
}

// XXX : What to do with 'unflushed' mBufferInc?? : Finish()
Free the (distributed) Lizard! Automatic merge from CVS: Module mozilla: tag HG_REPO_INITIAL_IMPORT at 22 Mar 2007 10:30 PDT, 2007-03-22 10:30:00 -07:00			`/* -- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -- */`
			`/* vim:expandtab:shiftwidth=2:tabstop=2:`
			`*/`
			`/* *** BEGIN LICENSE BLOCK ***`
			`* Version: MPL 1.1/GPL 2.0/LGPL 2.1`
			`*`
			`* The contents of this file are subject to the Mozilla Public License Version`
			`* 1.1 (the "License"); you may not use this file except in compliance with`
			`* the License. You may obtain a copy of the License at`
			`* http://www.mozilla.org/MPL/`
			`*`
			`* Software distributed under the License is distributed on an "AS IS" basis,`
			`* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License`
			`* for the specific language governing rights and limitations under the`
			`* License.`
			`*`
			`* The Original Code is Mozilla Communicator client code.`
			`*`
			`* The Initial Developer of the Original Code is`
			`* Netscape Communications Corporation.`
			`* Portions created by the Initial Developer are Copyright (C) 1998`
			`* the Initial Developer. All Rights Reserved.`
			`*`
			`* Contributor(s):`
			`* Jungshik Shin <jshin@mailaps.org>`
			`*`
			`* Alternatively, the contents of this file may be used under the terms of`
			`* either of the GNU General Public License Version 2 or later (the "GPL"),`
			`* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),`
			`* in which case the provisions of the GPL or the LGPL are applicable instead`
			`* of those above. If you wish to allow use of your version of this file only`
			`* under the terms of either the GPL or the LGPL, and not to allow others to`
			`* use your version of this file under the terms of the MPL, indicate your`
			`* decision by deleting the provisions above and replace them with the notice`
			`* and other provisions required by the GPL or the LGPL. If you do not delete`
			`* the provisions above, a recipient may use your version of this file under`
			`* the terms of any one of the MPL, the GPL or the LGPL.`
			`*`
			`* *** END LICENSE BLOCK *** */`

			`#include "nsUCSupport.h"`
			`#include "nsUTF32ToUnicode.h"`
			`#include "nsCharTraits.h"`
			`#include <string.h>`

			`//----------------------------------------------------------------------`
			`// static functions and macro definition common to nsUTF32(BE\|LE)ToUnicode`

			`#ifdef IS_BIG_ENDIAN`
			`#define LE_STRING_TO_UCS4(s) \`
			`(PRUint8((s)) \| (PRUint8(((s) + 1)) << 8) \| \`
			`(PRUint8(((s) + 2)) << 16) \| (PRUint8(((s) + 3)) << 24))`
			`#else`
			`#define LE_STRING_TO_UCS4(s) ((PRUint32) (s))`
			`#endif`

			`#ifdef IS_BIG_ENDIAN`
			`#define BE_STRING_TO_UCS4(s) ((PRUint32) (s))`
			`#else`
			`#define BE_STRING_TO_UCS4(s) \`
			`(PRUint8(((s) + 3)) \| (PRUint8(((s) + 2)) << 8) \| \`
			`(PRUint8(((s) + 1)) << 16) \| (PRUint8((s)) << 24))`
			`#endif`

			`static nsresult ConvertCommon(const char * aSrc,`
			`PRInt32 * aSrcLength,`
			`PRUnichar * aDest,`
			`PRInt32 * aDestLength,`
			`PRUint16 * aState,`
			`PRUint8 * aBuffer,`
			`PRBool aIsLE)`
			`{`

			`NS_ENSURE_TRUE(*aState < 4, NS_ERROR_INVALID_ARG);`
			`NS_ENSURE_TRUE(*aDestLength > 0, NS_ERROR_INVALID_ARG);`

			`const char *src = aSrc;`
			`const char srcEnd = aSrc + aSrcLength;`

			`PRUnichar *dest = aDest;`
			`PRUnichar destEnd = aDest + aDestLength;`

			`if (aState > aSrcLength)`
			`{`
			`memcpy(aBuffer + 4 - aState, src, aSrcLength);`
			`*aDestLength = 0;`
			`aState -= aSrcLength;`
			`return NS_OK_UDEC_MOREINPUT;`
			`}`

			`PRUint32 ucs4;`

			`// prev. run left a partial UTF-32 seq.`
			`if (*aState > 0)`
			`{`
			`memcpy(aBuffer + 4 - aState, src, aState);`
			`ucs4 = aIsLE ? LE_STRING_TO_UCS4(aBuffer) : BE_STRING_TO_UCS4(aBuffer);`
			`if (ucs4 < 0x10000L) // BMP`
			`{`
			`*dest++= IS_SURROGATE(ucs4) ? UCS2_REPLACEMENT_CHAR : PRUnichar(ucs4);`
			`}`
			`else if (ucs4 < 0x110000L) // plane 1 through plane 16`
			`{`
			`if (destEnd - dest < 2)`
			`{`
			`*aSrcLength = 0;`
			`*aDestLength = 0;`
			`return NS_OK_UDEC_MOREOUTPUT;`
			`}`
			`*dest++= H_SURROGATE(ucs4);`
			`*dest++= L_SURROGATE(ucs4);`
			`}`
			`// Codepoints in plane 17 and higher (> 0x10ffff)`
			`// are not representable in UTF-16 we use for the internal`
			`// character representation. This is not a problem`
			`// because Unicode/ISO 10646 will never assign characters`
			`// in plane 17 and higher. Therefore, we convert them`
			`// to Unicode replacement character (0xfffd).`
			`else`
			`*dest++ = UCS2_REPLACEMENT_CHAR;`
			`src += *aState;`
			`*aState = 0;`
			`}`

			`nsresult rv = NS_OK; // conversion result`

			`for ( ; src < srcEnd && dest < destEnd; src += 4)`
			`{`
			`if (srcEnd - src < 4)`
			`{`
			`// fill up aBuffer until src buffer gets exhausted.`
			`memcpy(aBuffer, src, srcEnd - src);`
			`*aState = 4 - (srcEnd - src); // set add. char to read in next run`
			`src = srcEnd;`
			`rv = NS_OK_UDEC_MOREINPUT;`
			`break;`
			`}`

			`ucs4 = aIsLE ? LE_STRING_TO_UCS4(src) : BE_STRING_TO_UCS4(src);`
			`if (ucs4 < 0x10000L) // BMP`
			`{`
			`*dest++= IS_SURROGATE(ucs4) ? UCS2_REPLACEMENT_CHAR : PRUnichar(ucs4);`
			`}`
			`else if (ucs4 < 0x110000L) // plane 1 through plane 16`
			`{`
			`if (destEnd - dest < 2)`
			`break;`
			`// ((ucs4 - 0x10000) >> 10) + 0xd800;`
			`*dest++= H_SURROGATE(ucs4);`
			`*dest++= L_SURROGATE(ucs4);`
			`}`
			`else // plane 17 and higher`
			`*dest++ = UCS2_REPLACEMENT_CHAR;`
			`}`

			`//output not finished, output buffer too short`
			`if((NS_OK == rv) && (src < srcEnd) && (dest >= destEnd))`
			`rv = NS_OK_UDEC_MOREOUTPUT;`

			`*aSrcLength = src - aSrc;`
			`*aDestLength = dest - aDest;`

			`return rv;`
			`}`


			`//----------------------------------------------------------------------`
			`// Class nsUTF32ToUnicode [implementation]`

Bug 335531 - Correct misuse of UTF-16BE, UTF-16LE, UTF-32BE, and UTF-32LE charset labels; r=smontagu sr=dbaron 2008-12-06 11:08:26 -08:00			`nsUTF32ToUnicodeBase::nsUTF32ToUnicodeBase() : nsBasicDecoderSupport()`
Free the (distributed) Lizard! Automatic merge from CVS: Module mozilla: tag HG_REPO_INITIAL_IMPORT at 22 Mar 2007 10:30 PDT, 2007-03-22 10:30:00 -07:00			`{`
			`Reset();`
			`}`

			`//----------------------------------------------------------------------`
			`// Subclassing of nsDecoderSupport class [implementation]`

Bug 335531 - Correct misuse of UTF-16BE, UTF-16LE, UTF-32BE, and UTF-32LE charset labels; r=smontagu sr=dbaron 2008-12-06 11:08:26 -08:00			`NS_IMETHODIMP nsUTF32ToUnicodeBase::GetMaxLength(const char * aSrc,`
			`PRInt32 aSrcLength,`
			`PRInt32 * aDestLength)`
Free the (distributed) Lizard! Automatic merge from CVS: Module mozilla: tag HG_REPO_INITIAL_IMPORT at 22 Mar 2007 10:30 PDT, 2007-03-22 10:30:00 -07:00			`{`
			`// Non-BMP characters take two PRUnichars(a pair of surrogate codepoints)`
			`// so that we have to divide by 2 instead of 4 for the worst case.`
			`*aDestLength = aSrcLength / 2;`
			`return NS_OK;`
			`}`


			`//----------------------------------------------------------------------`
			`// Subclassing of nsBasicDecoderSupport class [implementation]`

Bug 335531 - Correct misuse of UTF-16BE, UTF-16LE, UTF-32BE, and UTF-32LE charset labels; r=smontagu sr=dbaron 2008-12-06 11:08:26 -08:00			`NS_IMETHODIMP nsUTF32ToUnicodeBase::Reset()`
Free the (distributed) Lizard! Automatic merge from CVS: Module mozilla: tag HG_REPO_INITIAL_IMPORT at 22 Mar 2007 10:30 PDT, 2007-03-22 10:30:00 -07:00			`{`
			`// the number of additional bytes to read to complete UTF-32 4byte seq.`
			`mState = 0;`
			`memset(mBufferInc, 0, 4);`
			`return NS_OK;`

			`}`


			`//----------------------------------------------------------------------`
			`// Class nsUTF32BEToUnicode [implementation]`

			`//----------------------------------------------------------------------`
Bug 335531 - Correct misuse of UTF-16BE, UTF-16LE, UTF-32BE, and UTF-32LE charset labels; r=smontagu sr=dbaron 2008-12-06 11:08:26 -08:00			`// Subclassing of nsUTF32ToUnicodeBase class [implementation]`
Free the (distributed) Lizard! Automatic merge from CVS: Module mozilla: tag HG_REPO_INITIAL_IMPORT at 22 Mar 2007 10:30 PDT, 2007-03-22 10:30:00 -07:00
			`NS_IMETHODIMP nsUTF32BEToUnicode::Convert(const char * aSrc,`
			`PRInt32 * aSrcLength,`
			`PRUnichar * aDest,`
			`PRInt32 * aDestLength)`
			`{`
			`return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,`
			`mBufferInc, PR_FALSE);`
			`}`

			`//----------------------------------------------------------------------`
			`// Class nsUTF32LEToUnicode [implementation]`

			`//----------------------------------------------------------------------`
Bug 335531 - Correct misuse of UTF-16BE, UTF-16LE, UTF-32BE, and UTF-32LE charset labels; r=smontagu sr=dbaron 2008-12-06 11:08:26 -08:00			`// Subclassing of nsUTF32ToUnicodeBase class [implementation]`
Free the (distributed) Lizard! Automatic merge from CVS: Module mozilla: tag HG_REPO_INITIAL_IMPORT at 22 Mar 2007 10:30 PDT, 2007-03-22 10:30:00 -07:00
			`NS_IMETHODIMP nsUTF32LEToUnicode::Convert(const char * aSrc,`
			`PRInt32 * aSrcLength,`
			`PRUnichar * aDest,`
			`PRInt32 * aDestLength)`
			`{`
			`return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,`
			`mBufferInc, PR_TRUE);`
			`}`

Bug 335531 - Correct misuse of UTF-16BE, UTF-16LE, UTF-32BE, and UTF-32LE charset labels; r=smontagu sr=dbaron 2008-12-06 11:08:26 -08:00			`//----------------------------------------------------------------------`
			`// Class nsUTF32ToUnicode [implementation]`

			`//----------------------------------------------------------------------`
			`// Subclassing of nsUTF32ToUnicodeBase class [implementation]`

			`NS_IMETHODIMP nsUTF32ToUnicode::Reset()`
			`{`
			`nsresult rv = nsUTF32ToUnicodeBase::Reset();`
			`mState = 4;`
			`mEndian = kUnknown;`
			`mFoundBOM = PR_FALSE;`
			`return rv;`
			`}`

			`NS_IMETHODIMP nsUTF32ToUnicode::Convert(const char * aSrc,`
			`PRInt32 * aSrcLength,`
			`PRUnichar * aDest,`
			`PRInt32 * aDestLength)`
			`{`
			`PRBool foundBOM = PR_FALSE;`
			`if (4 == mState) // Called for the first time.`
			`{`
			`if (*aSrcLength < 4)`
			`return NS_ERROR_ILLEGAL_INPUT;`

			`// check if BOM (0xFEFF) is at the beginning, remove it if found, and`
			`// set mEndian accordingly.`
			`if (0xFF == PRUint8(aSrc[0]) && 0xFE == PRUint8(aSrc[1]) &&`
			`0 == PRUint8(aSrc[2]) && 0 == PRUint8(aSrc[3])) {`
			`aSrc += 4;`
			`*aSrcLength -= 4;`
			`mState = 0;`
			`mEndian = kLittleEndian;`
			`mFoundBOM = foundBOM = PR_TRUE;`
			`}`
			`else if (0 == PRUint8(aSrc[0]) && 0 == PRUint8(aSrc[1]) &&`
			`0xFE == PRUint8(aSrc[2]) && 0xFF == PRUint8(aSrc[3])) {`
			`aSrc += 4;`
			`*aSrcLength -= 4;`
			`mState = 0;`
			`mEndian = kBigEndian;`
			`mFoundBOM = foundBOM = PR_TRUE;`
			`}`
			`// BOM is not found, but we can use a simple heuristic to determine`
			`// the endianness. Assume the first character is [U+0001, U+FFFF].`
			`// Not always valid, but it's very likely to hold for html/xml/css.`
			`#if 0 // BE case will be handled below`
			`else if (!aSrc[0] && !aSrc[1] && (aSrc[2] \|\| aSrc[3])) { // 0x00 0x00 0xhh 0xhh (hh != 00)`
			`mState = 0;`
			`mEndian = kBigEndian;`
			`}`
			`#endif`
			`else if ((aSrc[0] \|\| aSrc[1]) && !aSrc[2] && !aSrc[3]) { // 0xhh 0xhh 0x00 0x00 (hh != 00)`
			`mState = 0;`
			`mEndian = kLittleEndian;`
			`}`
			`else { // Neither BOM nor 'plausible' byte patterns at the beginning.`
			`// Just assume it's BE (following Unicode standard)`
			`// and let the garbage show up in the browser. (security concern?)`
			`// (bug 246194)`
			`mState = 0;`
			`mEndian = kBigEndian;`
			`}`
			`}`

			`nsresult rv = ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,`
			`mBufferInc, mEndian == kLittleEndian);`
			`if (foundBOM)`
			`*aSrcLength += 4; // need to consume BOM`

			`// If BOM is not found and we're to return NS_OK, signal that BOM`
			`// is not found. Otherwise, return \|rv\| from \|UTF16ConvertToUnicode\|`
			`return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;`
			`}`

Free the (distributed) Lizard! Automatic merge from CVS: Module mozilla: tag HG_REPO_INITIAL_IMPORT at 22 Mar 2007 10:30 PDT, 2007-03-22 10:30:00 -07:00			`// XXX : What to do with 'unflushed' mBufferInc?? : Finish()`