1341 lines
27 KiB
C
1341 lines
27 KiB
C
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
 *  Copyright (C) 2011 Jeffrey Stedfast
 *
 *  Permission is hereby granted, free of charge, to any person
 *  obtaining a copy of this software and associated documentation
 *  files (the "Software"), to deal in the Software without
 *  restriction, including without limitation the rights to use, copy,
 *  modify, merge, publish, distribute, sublicense, and/or sell copies
 *  of the Software, and to permit persons to whom the Software is
 *  furnished to do so, subject to the following conditions:
 *
 *  The above copyright notice and this permission notice shall be
 *  included in all copies or substantial portions of the Software.
 *
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 *  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 *  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 *  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 *  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 *  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 *  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *  DEALINGS IN THE SOFTWARE.
 */
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <glib.h>
|
|
#include <string.h>
|
|
#ifdef HAVE_ICONV_H
|
|
#include <iconv.h>
|
|
#endif
|
|
#include <errno.h>
|
|
|
|
/* FORCE_INLINE (type) spells the declaration prefix for a function returning
 * `type` that the compiler is asked to inline unconditionally. */
#if defined(_MSC_VER)
#define FORCE_INLINE(RET_TYPE) __forceinline RET_TYPE
#else
#define FORCE_INLINE(RET_TYPE) inline RET_TYPE __attribute__((always_inline))
#endif

/* Compile-time switches: a non-zero value selects the switch-based
 * (unrolled) variant inside the UTF-8 decoder / encoder instead of the loop. */
#define UNROLL_DECODE_UTF8 0
#define UNROLL_ENCODE_UTF8 0
|
|
|
|
typedef int (* Decoder) (char *inbuf, size_t inleft, gunichar *outchar);
|
|
typedef int (* Encoder) (gunichar c, char *outbuf, size_t outleft);
|
|
|
|
struct _GIConv {
|
|
Decoder decode;
|
|
Encoder encode;
|
|
gunichar c;
|
|
#ifdef HAVE_ICONV
|
|
iconv_t cd;
|
|
#endif
|
|
};
|
|
|
|
static int decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar);
|
|
static int encode_utf32be (gunichar c, char *outbuf, size_t outleft);
|
|
|
|
static int decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar);
|
|
static int encode_utf32le (gunichar c, char *outbuf, size_t outleft);
|
|
|
|
static int decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar);
|
|
static int encode_utf16be (gunichar c, char *outbuf, size_t outleft);
|
|
|
|
static int decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar);
|
|
static int encode_utf16le (gunichar c, char *outbuf, size_t outleft);
|
|
|
|
static FORCE_INLINE (int) decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar);
|
|
static int encode_utf8 (gunichar c, char *outbuf, size_t outleft);
|
|
|
|
static int decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar);
|
|
static int encode_latin1 (gunichar c, char *outbuf, size_t outleft);
|
|
|
|
#if G_BYTE_ORDER == G_LITTLE_ENDIAN
|
|
#define decode_utf32 decode_utf32le
|
|
#define encode_utf32 encode_utf32le
|
|
#define decode_utf16 decode_utf16le
|
|
#define encode_utf16 encode_utf16le
|
|
#else
|
|
#define decode_utf32 decode_utf32be
|
|
#define encode_utf32 encode_utf32be
|
|
#define decode_utf16 decode_utf16be
|
|
#define encode_utf16 encode_utf16be
|
|
#endif
|
|
|
|
static struct {
|
|
const char *name;
|
|
Decoder decoder;
|
|
Encoder encoder;
|
|
} charsets[] = {
|
|
{ "ISO-8859-1", decode_latin1, encode_latin1 },
|
|
{ "ISO8859-1", decode_latin1, encode_latin1 },
|
|
{ "UTF-32BE", decode_utf32be, encode_utf32be },
|
|
{ "UTF-32LE", decode_utf32le, encode_utf32le },
|
|
{ "UTF-16BE", decode_utf16be, encode_utf16be },
|
|
{ "UTF-16LE", decode_utf16le, encode_utf16le },
|
|
{ "UTF-32", decode_utf32, encode_utf32 },
|
|
{ "UTF-16", decode_utf16, encode_utf16 },
|
|
{ "UTF-8", decode_utf8, encode_utf8 },
|
|
{ "US-ASCII", decode_latin1, encode_latin1 },
|
|
{ "Latin1", decode_latin1, encode_latin1 },
|
|
{ "ASCII", decode_latin1, encode_latin1 },
|
|
{ "UTF32", decode_utf32, encode_utf32 },
|
|
{ "UTF16", decode_utf16, encode_utf16 },
|
|
{ "UTF8", decode_utf8, encode_utf8 },
|
|
};
|
|
|
|
|
|
GIConv
|
|
g_iconv_open (const char *to_charset, const char *from_charset)
|
|
{
|
|
#ifdef HAVE_ICONV
|
|
iconv_t icd = (iconv_t) -1;
|
|
#endif
|
|
Decoder decoder = NULL;
|
|
Encoder encoder = NULL;
|
|
GIConv cd;
|
|
guint i;
|
|
|
|
if (!to_charset || !from_charset || !to_charset[0] || !from_charset[0]) {
|
|
errno = EINVAL;
|
|
|
|
return (GIConv) -1;
|
|
}
|
|
|
|
for (i = 0; i < G_N_ELEMENTS (charsets); i++) {
|
|
if (!g_ascii_strcasecmp (charsets[i].name, from_charset))
|
|
decoder = charsets[i].decoder;
|
|
|
|
if (!g_ascii_strcasecmp (charsets[i].name, to_charset))
|
|
encoder = charsets[i].encoder;
|
|
}
|
|
|
|
if (!encoder || !decoder) {
|
|
#ifdef HAVE_ICONV
|
|
if ((icd = iconv_open (to_charset, from_charset)) == (iconv_t) -1)
|
|
return (GIConv) -1;
|
|
#else
|
|
errno = EINVAL;
|
|
|
|
return (GIConv) -1;
|
|
#endif
|
|
}
|
|
|
|
cd = (GIConv) g_malloc (sizeof (struct _GIConv));
|
|
cd->decode = decoder;
|
|
cd->encode = encoder;
|
|
cd->c = -1;
|
|
|
|
#ifdef HAVE_ICONV
|
|
cd->cd = icd;
|
|
#endif
|
|
|
|
return cd;
|
|
}
|
|
|
|
int
|
|
g_iconv_close (GIConv cd)
|
|
{
|
|
#ifdef HAVE_ICONV
|
|
if (cd->cd != (iconv_t) -1)
|
|
iconv_close (cd->cd);
|
|
#endif
|
|
|
|
g_free (cd);
|
|
|
|
return 0;
|
|
}
|
|
|
|
gsize
|
|
g_iconv (GIConv cd, gchar **inbytes, gsize *inbytesleft,
|
|
gchar **outbytes, gsize *outbytesleft)
|
|
{
|
|
gsize inleft, outleft;
|
|
char *inptr, *outptr;
|
|
gunichar c;
|
|
int rc = 0;
|
|
|
|
#ifdef HAVE_ICONV
|
|
if (cd->cd != (iconv_t) -1) {
|
|
/* Note: gsize may have a different size than size_t, so we need to
|
|
remap inbytesleft and outbytesleft to size_t's. */
|
|
size_t *outleftptr, *inleftptr;
|
|
size_t n_outleft, n_inleft;
|
|
|
|
if (inbytesleft) {
|
|
n_inleft = *inbytesleft;
|
|
inleftptr = &n_inleft;
|
|
} else {
|
|
inleftptr = NULL;
|
|
}
|
|
|
|
if (outbytesleft) {
|
|
n_outleft = *outbytesleft;
|
|
outleftptr = &n_outleft;
|
|
} else {
|
|
outleftptr = NULL;
|
|
}
|
|
#if defined(__NetBSD__)
|
|
return iconv (cd->cd, (const gchar **)inbytes, inleftptr, outbytes, outleftptr);
|
|
#else
|
|
return iconv (cd->cd, inbytes, inleftptr, outbytes, outleftptr);
|
|
#endif
|
|
}
|
|
#endif
|
|
|
|
if (outbytes == NULL || outbytesleft == NULL) {
|
|
/* reset converter */
|
|
cd->c = -1;
|
|
return 0;
|
|
}
|
|
|
|
inleft = inbytesleft ? *inbytesleft : 0;
|
|
inptr = inbytes ? *inbytes : NULL;
|
|
outleft = *outbytesleft;
|
|
outptr = *outbytes;
|
|
|
|
if ((c = cd->c) != (gunichar) -1)
|
|
goto encode;
|
|
|
|
while (inleft > 0) {
|
|
if ((rc = cd->decode (inptr, inleft, &c)) < 0)
|
|
break;
|
|
|
|
inleft -= rc;
|
|
inptr += rc;
|
|
|
|
encode:
|
|
if ((rc = cd->encode (c, outptr, outleft)) < 0)
|
|
break;
|
|
|
|
c = (gunichar) -1;
|
|
outleft -= rc;
|
|
outptr += rc;
|
|
}
|
|
|
|
if (inbytesleft)
|
|
*inbytesleft = inleft;
|
|
|
|
if (inbytes)
|
|
*inbytes = inptr;
|
|
|
|
*outbytesleft = outleft;
|
|
*outbytes = outptr;
|
|
cd->c = c;
|
|
|
|
return rc < 0 ? -1 : 0;
|
|
}
|
|
|
|
/*
|
|
* Unicode encoders and decoders
|
|
*/
|
|
|
|
static int
|
|
decode_utf32be (char *inbuf, size_t inleft, gunichar *outchar)
|
|
{
|
|
unsigned char *inptr = (unsigned char *) inbuf;
|
|
gunichar c;
|
|
|
|
if (inleft < 4) {
|
|
errno = EINVAL;
|
|
return -1;
|
|
}
|
|
|
|
c = (inptr[0] << 24) | (inptr[1] << 16) | (inptr[2] << 8) | inptr[3];
|
|
|
|
if (c >= 0xd800 && c < 0xe000) {
|
|
errno = EILSEQ;
|
|
return -1;
|
|
} else if (c >= 0x110000) {
|
|
errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
*outchar = c;
|
|
|
|
return 4;
|
|
}
|
|
|
|
static int
|
|
decode_utf32le (char *inbuf, size_t inleft, gunichar *outchar)
|
|
{
|
|
unsigned char *inptr = (unsigned char *) inbuf;
|
|
gunichar c;
|
|
|
|
if (inleft < 4) {
|
|
errno = EINVAL;
|
|
return -1;
|
|
}
|
|
|
|
c = (inptr[3] << 24) | (inptr[2] << 16) | (inptr[1] << 8) | inptr[0];
|
|
|
|
if (c >= 0xd800 && c < 0xe000) {
|
|
errno = EILSEQ;
|
|
return -1;
|
|
} else if (c >= 0x110000) {
|
|
errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
*outchar = c;
|
|
|
|
return 4;
|
|
}
|
|
|
|
static int
|
|
encode_utf32be (gunichar c, char *outbuf, size_t outleft)
|
|
{
|
|
unsigned char *outptr = (unsigned char *) outbuf;
|
|
|
|
if (outleft < 4) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
outptr[0] = (c >> 24) & 0xff;
|
|
outptr[1] = (c >> 16) & 0xff;
|
|
outptr[2] = (c >> 8) & 0xff;
|
|
outptr[3] = c & 0xff;
|
|
|
|
return 4;
|
|
}
|
|
|
|
static int
|
|
encode_utf32le (gunichar c, char *outbuf, size_t outleft)
|
|
{
|
|
unsigned char *outptr = (unsigned char *) outbuf;
|
|
|
|
if (outleft < 4) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
outptr[0] = c & 0xff;
|
|
outptr[1] = (c >> 8) & 0xff;
|
|
outptr[2] = (c >> 16) & 0xff;
|
|
outptr[3] = (c >> 24) & 0xff;
|
|
|
|
return 4;
|
|
}
|
|
|
|
static int
|
|
decode_utf16be (char *inbuf, size_t inleft, gunichar *outchar)
|
|
{
|
|
unsigned char *inptr = (unsigned char *) inbuf;
|
|
gunichar2 c;
|
|
gunichar u;
|
|
|
|
if (inleft < 2) {
|
|
errno = EINVAL;
|
|
return -1;
|
|
}
|
|
|
|
u = (inptr[0] << 8) | inptr[1];
|
|
|
|
if (u < 0xd800) {
|
|
/* 0x0000 -> 0xd7ff */
|
|
*outchar = u;
|
|
return 2;
|
|
} else if (u < 0xdc00) {
|
|
/* 0xd800 -> 0xdbff */
|
|
if (inleft < 4) {
|
|
errno = EINVAL;
|
|
return -2;
|
|
}
|
|
|
|
c = (inptr[2] << 8) | inptr[3];
|
|
|
|
if (c < 0xdc00 || c > 0xdfff) {
|
|
errno = EILSEQ;
|
|
return -2;
|
|
}
|
|
|
|
u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
|
|
*outchar = u;
|
|
|
|
return 4;
|
|
} else if (u < 0xe000) {
|
|
/* 0xdc00 -> 0xdfff */
|
|
errno = EILSEQ;
|
|
return -1;
|
|
} else {
|
|
/* 0xe000 -> 0xffff */
|
|
*outchar = u;
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
static int
|
|
decode_utf16le (char *inbuf, size_t inleft, gunichar *outchar)
|
|
{
|
|
unsigned char *inptr = (unsigned char *) inbuf;
|
|
gunichar2 c;
|
|
gunichar u;
|
|
|
|
if (inleft < 2) {
|
|
errno = EINVAL;
|
|
return -1;
|
|
}
|
|
|
|
u = (inptr[1] << 8) | inptr[0];
|
|
|
|
if (u < 0xd800) {
|
|
/* 0x0000 -> 0xd7ff */
|
|
*outchar = u;
|
|
return 2;
|
|
} else if (u < 0xdc00) {
|
|
/* 0xd800 -> 0xdbff */
|
|
if (inleft < 4) {
|
|
errno = EINVAL;
|
|
return -2;
|
|
}
|
|
|
|
c = (inptr[3] << 8) | inptr[2];
|
|
|
|
if (c < 0xdc00 || c > 0xdfff) {
|
|
errno = EILSEQ;
|
|
return -2;
|
|
}
|
|
|
|
u = ((u - 0xd800) << 10) + (c - 0xdc00) + 0x0010000UL;
|
|
*outchar = u;
|
|
|
|
return 4;
|
|
} else if (u < 0xe000) {
|
|
/* 0xdc00 -> 0xdfff */
|
|
errno = EILSEQ;
|
|
return -1;
|
|
} else {
|
|
/* 0xe000 -> 0xffff */
|
|
*outchar = u;
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
static int
|
|
encode_utf16be (gunichar c, char *outbuf, size_t outleft)
|
|
{
|
|
unsigned char *outptr = (unsigned char *) outbuf;
|
|
gunichar2 ch;
|
|
gunichar c2;
|
|
|
|
if (c < 0x10000) {
|
|
if (outleft < 2) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
outptr[0] = (c >> 8) & 0xff;
|
|
outptr[1] = c & 0xff;
|
|
|
|
return 2;
|
|
} else {
|
|
if (outleft < 4) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
c2 = c - 0x10000;
|
|
|
|
ch = (gunichar2) ((c2 >> 10) + 0xd800);
|
|
outptr[0] = (ch >> 8) & 0xff;
|
|
outptr[1] = ch & 0xff;
|
|
|
|
ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
|
|
outptr[2] = (ch >> 8) & 0xff;
|
|
outptr[3] = ch & 0xff;
|
|
|
|
return 4;
|
|
}
|
|
}
|
|
|
|
static int
|
|
encode_utf16le (gunichar c, char *outbuf, size_t outleft)
|
|
{
|
|
unsigned char *outptr = (unsigned char *) outbuf;
|
|
gunichar2 ch;
|
|
gunichar c2;
|
|
|
|
if (c < 0x10000) {
|
|
if (outleft < 2) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
outptr[0] = c & 0xff;
|
|
outptr[1] = (c >> 8) & 0xff;
|
|
|
|
return 2;
|
|
} else {
|
|
if (outleft < 4) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
c2 = c - 0x10000;
|
|
|
|
ch = (gunichar2) ((c2 >> 10) + 0xd800);
|
|
outptr[0] = ch & 0xff;
|
|
outptr[1] = (ch >> 8) & 0xff;
|
|
|
|
ch = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
|
|
outptr[2] = ch & 0xff;
|
|
outptr[3] = (ch >> 8) & 0xff;
|
|
|
|
return 4;
|
|
}
|
|
}
|
|
|
|
static FORCE_INLINE (int)
|
|
decode_utf8 (char *inbuf, size_t inleft, gunichar *outchar)
|
|
{
|
|
unsigned char *inptr = (unsigned char *) inbuf;
|
|
gunichar u;
|
|
int n, i;
|
|
|
|
u = *inptr;
|
|
|
|
if (u < 0x80) {
|
|
/* simple ascii case */
|
|
*outchar = u;
|
|
return 1;
|
|
} else if (u < 0xc2) {
|
|
errno = EILSEQ;
|
|
return -1;
|
|
} else if (u < 0xe0) {
|
|
u &= 0x1f;
|
|
n = 2;
|
|
} else if (u < 0xf0) {
|
|
u &= 0x0f;
|
|
n = 3;
|
|
} else if (u < 0xf8) {
|
|
u &= 0x07;
|
|
n = 4;
|
|
} else if (u < 0xfc) {
|
|
u &= 0x03;
|
|
n = 5;
|
|
} else if (u < 0xfe) {
|
|
u &= 0x01;
|
|
n = 6;
|
|
} else {
|
|
errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
if (n > inleft) {
|
|
errno = EINVAL;
|
|
return -1;
|
|
}
|
|
|
|
#if UNROLL_DECODE_UTF8
|
|
switch (n) {
|
|
case 6: u = (u << 6) | (*++inptr ^ 0x80);
|
|
case 5: u = (u << 6) | (*++inptr ^ 0x80);
|
|
case 4: u = (u << 6) | (*++inptr ^ 0x80);
|
|
case 3: u = (u << 6) | (*++inptr ^ 0x80);
|
|
case 2: u = (u << 6) | (*++inptr ^ 0x80);
|
|
}
|
|
#else
|
|
for (i = 1; i < n; i++)
|
|
u = (u << 6) | (*++inptr ^ 0x80);
|
|
#endif
|
|
|
|
*outchar = u;
|
|
|
|
return n;
|
|
}
|
|
|
|
static int
|
|
encode_utf8 (gunichar c, char *outbuf, size_t outleft)
|
|
{
|
|
unsigned char *outptr = (unsigned char *) outbuf;
|
|
int base, n, i;
|
|
|
|
if (c < 0x80) {
|
|
outptr[0] = c;
|
|
return 1;
|
|
} else if (c < 0x800) {
|
|
base = 192;
|
|
n = 2;
|
|
} else if (c < 0x10000) {
|
|
base = 224;
|
|
n = 3;
|
|
} else if (c < 0x200000) {
|
|
base = 240;
|
|
n = 4;
|
|
} else if (c < 0x4000000) {
|
|
base = 248;
|
|
n = 5;
|
|
} else {
|
|
base = 252;
|
|
n = 6;
|
|
}
|
|
|
|
if (outleft < n) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
#if UNROLL_ENCODE_UTF8
|
|
switch (n) {
|
|
case 6: outptr[5] = (c & 0x3f) | 0x80; c >>= 6;
|
|
case 5: outptr[4] = (c & 0x3f) | 0x80; c >>= 6;
|
|
case 4: outptr[3] = (c & 0x3f) | 0x80; c >>= 6;
|
|
case 3: outptr[2] = (c & 0x3f) | 0x80; c >>= 6;
|
|
case 2: outptr[1] = (c & 0x3f) | 0x80; c >>= 6;
|
|
case 1: outptr[0] = c | base;
|
|
}
|
|
#else
|
|
for (i = n - 1; i > 0; i--) {
|
|
outptr[i] = (c & 0x3f) | 0x80;
|
|
c >>= 6;
|
|
}
|
|
|
|
outptr[0] = c | base;
|
|
#endif
|
|
|
|
return n;
|
|
}
|
|
|
|
static int
|
|
decode_latin1 (char *inbuf, size_t inleft, gunichar *outchar)
|
|
{
|
|
*outchar = (unsigned char) *inbuf;
|
|
return 1;
|
|
}
|
|
|
|
static int
|
|
encode_latin1 (gunichar c, char *outbuf, size_t outleft)
|
|
{
|
|
if (outleft < 1) {
|
|
errno = E2BIG;
|
|
return -1;
|
|
}
|
|
|
|
if (c > 0xff) {
|
|
errno = EILSEQ;
|
|
return -1;
|
|
}
|
|
|
|
*outbuf = (char) c;
|
|
|
|
return 1;
|
|
}
|
|
|
|
|
|
/*
|
|
* Simple conversion API
|
|
*/
|
|
|
|
/* Value handed out by g_convert_error_quark (). */
static gpointer error_quark = "ConvertError";

/*
 * g_convert_error_quark:
 *
 * Returns: the pointer stored in the file-level error_quark variable.
 */
gpointer
g_convert_error_quark (void)
{
	gpointer quark = error_quark;

	return quark;
}
|
|
|
|
gchar *
|
|
g_convert (const gchar *str, gssize len, const gchar *to_charset, const gchar *from_charset,
|
|
gsize *bytes_read, gsize *bytes_written, GError **err)
|
|
{
|
|
gsize outsize, outused, outleft, inleft, grow, rc;
|
|
char *result, *outbuf, *inbuf;
|
|
gboolean flush = FALSE;
|
|
gboolean done = FALSE;
|
|
GIConv cd;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
g_return_val_if_fail (to_charset != NULL, NULL);
|
|
g_return_val_if_fail (from_charset != NULL, NULL);
|
|
|
|
if ((cd = g_iconv_open (to_charset, from_charset)) == (GIConv) -1) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
|
|
"Conversion from %s to %s not supported.",
|
|
from_charset, to_charset);
|
|
|
|
if (bytes_written)
|
|
*bytes_written = 0;
|
|
|
|
if (bytes_read)
|
|
*bytes_read = 0;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
inleft = len < 0 ? strlen (str) : len;
|
|
inbuf = (char *) str;
|
|
|
|
outleft = outsize = MAX (inleft, 8);
|
|
outbuf = result = g_malloc (outsize + 4);
|
|
|
|
do {
|
|
if (!flush)
|
|
rc = g_iconv (cd, &inbuf, &inleft, &outbuf, &outleft);
|
|
else
|
|
rc = g_iconv (cd, NULL, NULL, &outbuf, &outleft);
|
|
|
|
if (rc == (gsize) -1) {
|
|
switch (errno) {
|
|
case E2BIG:
|
|
/* grow our result buffer */
|
|
grow = MAX (inleft, 8) << 1;
|
|
outused = outbuf - result;
|
|
outsize += grow;
|
|
outleft += grow;
|
|
result = g_realloc (result, outsize + 4);
|
|
outbuf = result + outused;
|
|
break;
|
|
case EINVAL:
|
|
/* incomplete input, stop converting and terminate here */
|
|
if (flush)
|
|
done = TRUE;
|
|
else
|
|
flush = TRUE;
|
|
break;
|
|
case EILSEQ:
|
|
/* illegal sequence in the input */
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "%s", g_strerror (errno));
|
|
|
|
if (bytes_read) {
|
|
/* save offset of the illegal input sequence */
|
|
*bytes_read = (inbuf - str);
|
|
}
|
|
|
|
if (bytes_written)
|
|
*bytes_written = 0;
|
|
|
|
g_iconv_close (cd);
|
|
g_free (result);
|
|
return NULL;
|
|
default:
|
|
/* unknown errno */
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "%s", g_strerror (errno));
|
|
|
|
if (bytes_written)
|
|
*bytes_written = 0;
|
|
|
|
if (bytes_read)
|
|
*bytes_read = 0;
|
|
|
|
g_iconv_close (cd);
|
|
g_free (result);
|
|
return NULL;
|
|
}
|
|
} else if (flush) {
|
|
/* input has been converted and output has been flushed */
|
|
break;
|
|
} else {
|
|
/* input has been converted, need to flush the output */
|
|
flush = TRUE;
|
|
}
|
|
} while (!done);
|
|
|
|
g_iconv_close (cd);
|
|
|
|
/* Note: not all charsets can be null-terminated with a single
|
|
null byte. UCS2, for example, needs 2 null bytes and UCS4
|
|
needs 4. I hope that 4 null bytes is enough to terminate all
|
|
multibyte charsets? */
|
|
|
|
/* null-terminate the result */
|
|
memset (outbuf, 0, 4);
|
|
|
|
if (bytes_written)
|
|
*bytes_written = outbuf - result;
|
|
|
|
if (bytes_read)
|
|
*bytes_read = inbuf - str;
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/*
|
|
* Unicode conversion
|
|
*/
|
|
|
|
/**
|
|
* An explanation of the conversion can be found at:
|
|
* http://home.tiscali.nl/t876506/utf8tbl.html
|
|
*
|
|
**/
|
|
gint
|
|
g_unichar_to_utf8 (gunichar c, gchar *outbuf)
|
|
{
|
|
int base, n, i;
|
|
|
|
if (c < 0x80) {
|
|
base = 0;
|
|
n = 1;
|
|
} else if (c < 0x800) {
|
|
base = 192;
|
|
n = 2;
|
|
} else if (c < 0x10000) {
|
|
base = 224;
|
|
n = 3;
|
|
} else if (c < 0x200000) {
|
|
base = 240;
|
|
n = 4;
|
|
} else if (c < 0x4000000) {
|
|
base = 248;
|
|
n = 5;
|
|
} else if (c < 0x80000000) {
|
|
base = 252;
|
|
n = 6;
|
|
} else {
|
|
return -1;
|
|
}
|
|
|
|
if (outbuf != NULL) {
|
|
for (i = n - 1; i > 0; i--) {
|
|
/* mask off 6 bits worth and add 128 */
|
|
outbuf[i] = (c & 0x3f) | 0x80;
|
|
c >>= 6;
|
|
}
|
|
|
|
/* first character has a different base */
|
|
outbuf[0] = c | base;
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
static FORCE_INLINE (int)
|
|
g_unichar_to_utf16 (gunichar c, gunichar2 *outbuf)
|
|
{
|
|
gunichar c2;
|
|
|
|
if (c < 0xd800) {
|
|
if (outbuf)
|
|
*outbuf = (gunichar2) c;
|
|
|
|
return 1;
|
|
} else if (c < 0xe000) {
|
|
return -1;
|
|
} else if (c < 0x10000) {
|
|
if (outbuf)
|
|
*outbuf = (gunichar2) c;
|
|
|
|
return 1;
|
|
} else if (c < 0x110000) {
|
|
if (outbuf) {
|
|
c2 = c - 0x10000;
|
|
|
|
outbuf[0] = (gunichar2) ((c2 >> 10) + 0xd800);
|
|
outbuf[1] = (gunichar2) ((c2 & 0x3ff) + 0xdc00);
|
|
}
|
|
|
|
return 2;
|
|
} else {
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
gunichar *
|
|
g_utf8_to_ucs4_fast (const gchar *str, glong len, glong *items_written)
|
|
{
|
|
gunichar *outbuf, *outptr;
|
|
char *inptr;
|
|
glong n, i;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
n = g_utf8_strlen (str, len);
|
|
|
|
if (items_written)
|
|
*items_written = n;
|
|
|
|
outptr = outbuf = g_malloc ((n + 1) * sizeof (gunichar));
|
|
inptr = (char *) str;
|
|
|
|
for (i = 0; i < n; i++) {
|
|
*outptr++ = g_utf8_get_char (inptr);
|
|
inptr = g_utf8_next_char (inptr);
|
|
}
|
|
|
|
*outptr = 0;
|
|
|
|
return outbuf;
|
|
}
|
|
|
|
static gunichar2 *
|
|
eg_utf8_to_utf16_general (const gchar *str, glong len, glong *items_read, glong *items_written, gboolean include_nuls, GError **err)
|
|
{
|
|
gunichar2 *outbuf, *outptr;
|
|
size_t outlen = 0;
|
|
size_t inleft;
|
|
char *inptr;
|
|
gunichar c;
|
|
int u, n;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
if (len < 0) {
|
|
if (include_nuls) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, "Conversions with embedded nulls must pass the string length");
|
|
return NULL;
|
|
}
|
|
|
|
len = strlen (str);
|
|
}
|
|
|
|
inptr = (char *) str;
|
|
inleft = len;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
|
|
goto error;
|
|
|
|
if (c == 0 && !include_nuls)
|
|
break;
|
|
|
|
if ((u = g_unichar_to_utf16 (c, NULL)) < 0) {
|
|
errno = EILSEQ;
|
|
goto error;
|
|
}
|
|
|
|
outlen += u;
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
if (items_read)
|
|
*items_read = inptr - str;
|
|
|
|
if (items_written)
|
|
*items_written = outlen;
|
|
|
|
outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
|
|
inptr = (char *) str;
|
|
inleft = len;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
|
|
break;
|
|
|
|
if (c == 0 && !include_nuls)
|
|
break;
|
|
|
|
outptr += g_unichar_to_utf16 (c, outptr);
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
*outptr = '\0';
|
|
|
|
return outbuf;
|
|
|
|
error:
|
|
if (errno == EILSEQ) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
} else if (items_read) {
|
|
/* partial input is ok if we can let our caller know... */
|
|
} else {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
|
"Partial byte sequence encountered in the input.");
|
|
}
|
|
|
|
if (items_read)
|
|
*items_read = inptr - str;
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
gunichar2 *
|
|
g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
|
|
{
|
|
return eg_utf8_to_utf16_general (str, len, items_read, items_written, FALSE, err);
|
|
}
|
|
|
|
gunichar2 *
|
|
eg_utf8_to_utf16_with_nuls (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
|
|
{
|
|
return eg_utf8_to_utf16_general (str, len, items_read, items_written, TRUE, err);
|
|
}
|
|
|
|
gunichar *
|
|
g_utf8_to_ucs4 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **err)
|
|
{
|
|
gunichar *outbuf, *outptr;
|
|
size_t outlen = 0;
|
|
size_t inleft;
|
|
char *inptr;
|
|
gunichar c;
|
|
int n;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
if (len < 0)
|
|
len = strlen (str);
|
|
|
|
inptr = (char *) str;
|
|
inleft = len;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf8 (inptr, inleft, &c)) < 0) {
|
|
if (errno == EILSEQ) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
} else if (items_read) {
|
|
/* partial input is ok if we can let our caller know... */
|
|
break;
|
|
} else {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
|
"Partial byte sequence encountered in the input.");
|
|
}
|
|
|
|
if (items_read)
|
|
*items_read = inptr - str;
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
return NULL;
|
|
} else if (c == 0)
|
|
break;
|
|
|
|
outlen += 4;
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
if (items_written)
|
|
*items_written = outlen / 4;
|
|
|
|
if (items_read)
|
|
*items_read = inptr - str;
|
|
|
|
outptr = outbuf = g_malloc (outlen + 4);
|
|
inptr = (char *) str;
|
|
inleft = len;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf8 (inptr, inleft, &c)) < 0)
|
|
break;
|
|
else if (c == 0)
|
|
break;
|
|
|
|
*outptr++ = c;
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
*outptr = 0;
|
|
|
|
return outbuf;
|
|
}
|
|
|
|
gchar *
|
|
g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
|
|
{
|
|
char *inptr, *outbuf, *outptr;
|
|
size_t outlen = 0;
|
|
size_t inleft;
|
|
gunichar c;
|
|
int n;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
if (len < 0) {
|
|
len = 0;
|
|
while (str[len])
|
|
len++;
|
|
}
|
|
|
|
inptr = (char *) str;
|
|
inleft = len * 2;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
|
|
if (n == -2 && inleft > 2) {
|
|
/* This means that the first UTF-16 char was read, but second failed */
|
|
inleft -= 2;
|
|
inptr += 2;
|
|
}
|
|
|
|
if (errno == EILSEQ) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
} else if (items_read) {
|
|
/* partial input is ok if we can let our caller know... */
|
|
break;
|
|
} else {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
|
"Partial byte sequence encountered in the input.");
|
|
}
|
|
|
|
if (items_read)
|
|
*items_read = (inptr - (char *) str) / 2;
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
return NULL;
|
|
} else if (c == 0)
|
|
break;
|
|
|
|
outlen += g_unichar_to_utf8 (c, NULL);
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
if (items_read)
|
|
*items_read = (inptr - (char *) str) / 2;
|
|
|
|
if (items_written)
|
|
*items_written = outlen;
|
|
|
|
outptr = outbuf = g_malloc (outlen + 1);
|
|
inptr = (char *) str;
|
|
inleft = len * 2;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
|
|
break;
|
|
else if (c == 0)
|
|
break;
|
|
|
|
outptr += g_unichar_to_utf8 (c, outptr);
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
*outptr = '\0';
|
|
|
|
return outbuf;
|
|
}
|
|
|
|
gunichar *
|
|
g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **err)
|
|
{
|
|
gunichar *outbuf, *outptr;
|
|
size_t outlen = 0;
|
|
size_t inleft;
|
|
char *inptr;
|
|
gunichar c;
|
|
int n;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
if (len < 0) {
|
|
len = 0;
|
|
while (str[len])
|
|
len++;
|
|
}
|
|
|
|
inptr = (char *) str;
|
|
inleft = len * 2;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf16 (inptr, inleft, &c)) < 0) {
|
|
if (n == -2 && inleft > 2) {
|
|
/* This means that the first UTF-16 char was read, but second failed */
|
|
inleft -= 2;
|
|
inptr += 2;
|
|
}
|
|
|
|
if (errno == EILSEQ) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
} else if (items_read) {
|
|
/* partial input is ok if we can let our caller know... */
|
|
break;
|
|
} else {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
|
|
"Partial byte sequence encountered in the input.");
|
|
}
|
|
|
|
if (items_read)
|
|
*items_read = (inptr - (char *) str) / 2;
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
return NULL;
|
|
} else if (c == 0)
|
|
break;
|
|
|
|
outlen += 4;
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
if (items_read)
|
|
*items_read = (inptr - (char *) str) / 2;
|
|
|
|
if (items_written)
|
|
*items_written = outlen / 4;
|
|
|
|
outptr = outbuf = g_malloc (outlen + 4);
|
|
inptr = (char *) str;
|
|
inleft = len * 2;
|
|
|
|
while (inleft > 0) {
|
|
if ((n = decode_utf16 (inptr, inleft, &c)) < 0)
|
|
break;
|
|
else if (c == 0)
|
|
break;
|
|
|
|
*outptr++ = c;
|
|
inleft -= n;
|
|
inptr += n;
|
|
}
|
|
|
|
*outptr = 0;
|
|
|
|
return outbuf;
|
|
}
|
|
|
|
gchar *
|
|
g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
|
|
{
|
|
char *outbuf, *outptr;
|
|
size_t outlen = 0;
|
|
glong i;
|
|
int n;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
if (len < 0) {
|
|
for (i = 0; str[i] != 0; i++) {
|
|
if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
if (items_read)
|
|
*items_read = i;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
outlen += n;
|
|
}
|
|
} else {
|
|
for (i = 0; i < len && str[i] != 0; i++) {
|
|
if ((n = g_unichar_to_utf8 (str[i], NULL)) < 0) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
if (items_read)
|
|
*items_read = i;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
outlen += n;
|
|
}
|
|
}
|
|
|
|
len = i;
|
|
|
|
outptr = outbuf = g_malloc (outlen + 1);
|
|
for (i = 0; i < len; i++)
|
|
outptr += g_unichar_to_utf8 (str[i], outptr);
|
|
*outptr = 0;
|
|
|
|
if (items_written)
|
|
*items_written = outlen;
|
|
|
|
if (items_read)
|
|
*items_read = i;
|
|
|
|
return outbuf;
|
|
}
|
|
|
|
gunichar2 *
|
|
g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **err)
|
|
{
|
|
gunichar2 *outbuf, *outptr;
|
|
size_t outlen = 0;
|
|
glong i;
|
|
int n;
|
|
|
|
g_return_val_if_fail (str != NULL, NULL);
|
|
|
|
if (len < 0) {
|
|
for (i = 0; str[i] != 0; i++) {
|
|
if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
if (items_read)
|
|
*items_read = i;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
outlen += n;
|
|
}
|
|
} else {
|
|
for (i = 0; i < len && str[i] != 0; i++) {
|
|
if ((n = g_unichar_to_utf16 (str[i], NULL)) < 0) {
|
|
g_set_error (err, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
|
|
"Illegal byte sequence encounted in the input.");
|
|
|
|
if (items_written)
|
|
*items_written = 0;
|
|
|
|
if (items_read)
|
|
*items_read = i;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
outlen += n;
|
|
}
|
|
}
|
|
|
|
len = i;
|
|
|
|
outptr = outbuf = g_malloc ((outlen + 1) * sizeof (gunichar2));
|
|
for (i = 0; i < len; i++)
|
|
outptr += g_unichar_to_utf16 (str[i], outptr);
|
|
*outptr = 0;
|
|
|
|
if (items_written)
|
|
*items_written = outlen;
|
|
|
|
if (items_read)
|
|
*items_read = i;
|
|
|
|
return outbuf;
|
|
}
|