/* -*- mode:c; tab-width:8; c-basic-offset:8; indent-tabs-mode:nil; -*- */ /* Copyright (C) 2016 by Ronnie Sahlberg This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program; if not, see . */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #ifdef HAVE_STDINT_H #include #endif #ifdef HAVE_STDLIB_H #include #endif #ifdef HAVE_STRING_H #include #endif #ifdef HAVE_TIME_H #include #endif #ifdef HAVE_SYS_TIME_H #include #endif #ifdef STDC_HEADERS #include #endif #include "compat.h" #include "portable-endian.h" #include #include #include "libsmb2-private.h" /* Count number of leading 1 bits in the char */ static int l1(char c) { int i = 0; while (c & 0x80) { i++; c <<= 1; } return i; } /* Validates that utf8 points to a valid utf8 codepoint. * Will update **utf8 to point at the next character in the string. * return 1 if the encoding is valid and requires one UTF-16 code unit, * 2 if the encoding is valid and requires two UTF-16 code units * -1 if it's invalid. * If the encoding is valid the codepoint will be returned in *cp. */ static int validate_utf8_cp(const char **utf8, uint16_t *ret) { int c = *(*utf8)++; int l, l_tmp; uint32_t cp; l = l_tmp = l1(c); switch (l) { case 0: /* 7-bit ascii is always ok */ *ret = c & 0x7f; return 1; case 1: /* 10.. .... can never start a new codepoint */ return -1; case 2: case 3: case 4: cp = c & (0x7f >> l); /* 2, 3 and 4 byte sequences must always be followed by exactly * 1, 2 or 3 chars matching 10.. .... */ while(--l_tmp) { c = *(*utf8)++; if (l1(c) != 1) { return -1; } cp <<= 6; cp |= (c & 0x3f); } /* Check for overlong sequences */ switch (l) { case 2: if (cp < 0x80) return -1; break; case 3: if (cp < 0x800) return -1; break; case 4: if (cp < 0x10000) return -1; break; default: break; } /* Write the code point in either one or two UTC-16 code units */ if (cp < 0xd800 || (cp - 0xe000) < 0x2000) { /* Single UTF-16 code unit */ *ret = cp; return 1; } else if (cp < 0xe000) { /* invalid unicode range */ return -1; } else if (cp < 0x110000) { cp -= 0x10000; *ret = 0xd800 | (cp >> 10); *(ret+1) = 0xdc00 | (cp & 0x3ff) ; return 2; } else { /* invalid unicode range */ return -1; } } return -1; } /* Validate that the given string is properly formatted UTF8. * Returns >=0 if valid UTF8 and -1 if not. */ static int validate_utf8_str(const char *utf8) { const char *u = utf8; int i = 0; int cp_length; uint16_t cp[2]; while (*u) { cp_length = validate_utf8_cp(&u, cp); if (cp_length < 0) { return -1; } i += cp_length; } return i; } /* Convert a UTF8 string into UTF-16LE */ struct smb2_utf16 * smb2_utf8_to_utf16(const char *utf8) { struct smb2_utf16 *utf16; int i, len; len = validate_utf8_str(utf8); if (len < 0) { return NULL; } utf16 = (struct smb2_utf16 *)(malloc(offsetof(struct smb2_utf16, val) + 2 * len)); if (utf16 == NULL) { return NULL; } utf16->len = len; i = 0; while (i < len) { switch(validate_utf8_cp(&utf8, &utf16->val[i])) { case 1: utf16->val[i] = htole16(utf16->val[i]); i += 1; break; case 2: utf16->val[i] = htole16(utf16->val[i]); utf16->val[i+1] = htole16(utf16->val[i+1]); i += 2; break; default: /* Won't happen since we wouldn't have gotten here if the UTF-8 string was invalid */ break; } } return utf16; } static int utf16_size(const uint16_t *utf16, size_t utf16_len) { int length = 0; const uint16_t *utf16_end = utf16 + utf16_len; while (utf16 < utf16_end) { uint32_t code = le16toh(*utf16++); if (code < 0x80) { length += 1; /* One UTF-16 code unit maps to one UTF-8 code unit */ } else if (code < 0x800) { length += 2; /* One UTF-16 code unit maps to two UTF-8 code units */ } else if (code < 0xd800 || code - 0xe000 < 0x2000) { length += 3; } else if (code < 0xdc00) { /* Surrogate pair */ uint32_t trail; if (utf16 == utf16_end) { /* It's possible the stream ends with a leading code unit, which is an error */ return length + 3; /* Replacement char */ } trail = le16toh(*utf16); if (trail - 0xdc00 < 0x400) { /* Check that 0xdc00 <= trail < 0xe000 */ code = 0x10000 + ((code & 0x3ff) << 10) + (trail & 0x3ff); if (code < 0x10000) { length += 3; /* Two UTF-16 code units map to three UTF-8 code units */ } else { length += 4; /* Two UTF-16 code units map to four UTF-8 code units */ } utf16++; } else { /* Invalid trailing code unit. It's still valid on its own though so only the first unit gets replaced */ length += 3; /* Replacement char */ } } else { /* 0xdc00 <= code < 0xe00, which makes code a trailing code unit without a leading one, which is invalid */ length += 3; /* Replacement char */ } } return length; } /* * Convert a UTF-16LE string into UTF8 */ const char * smb2_utf16_to_utf8(const uint16_t *utf16, size_t utf16_len) { int utf8_len = 1; char *str, *tmp; const uint16_t *utf16_end; /* How many bytes do we need for utf8 ? */ utf8_len += utf16_size(utf16, utf16_len); str = tmp = (char*)malloc(utf8_len); if (str == NULL) { return NULL; } str[utf8_len - 1] = 0; utf16_end = utf16 + utf16_len; while (utf16 < utf16_end) { uint32_t code = le16toh(*utf16++); if (code < 0x80) { *tmp++ = code; /* One UTF-16 code unit maps to one UTF-8 code unit */ } else if (code < 0x800) { *tmp++ = 0xc0 | (code >> 6); /* One UTF-16 code unit maps to two UTF-8 code units */ *tmp++ = 0x80 | ((code ) & 0x3f); } else if (code < 0xD800 || code - 0xe000 < 0x2000) { *tmp++ = 0xe0 | (code >> 12); /* All other values where we only have one UTF-16 code unit map to 3 UTF-8 code units */ *tmp++ = 0x80 | ((code >> 6) & 0x3f); *tmp++ = 0x80 | ((code ) & 0x3f); } else if (code < 0xdc00) { /* Surrogate pair */ uint32_t trail; if (utf16 == utf16_end) { /* It's possible the stream ends with a leading code unit, which is an error */ *tmp++ = 0xef; *tmp++ = 0xbf; *tmp++ = 0xbd; /* Replacement char */ return str; } trail = le16toh(*utf16); if (trail - 0xdc00 < 0x400) { /* Check that 0xdc00 <= trail < 0xe000 */ code = 0x10000 + ((code & 0x3ff) << 10) + (trail & 0x3ff); if (code < 0x10000) { *tmp++ = 0xe0 | (code >> 12); *tmp++ = 0x80 | ((code >> 6) & 0x3f); *tmp++ = 0x80 | ((code ) & 0x3f); } else { *tmp++ = 0xF0 | (code >> 18); *tmp++ = 0x80 | ((code >> 12) & 0x3F); *tmp++ = 0x80 | ((code >> 6) & 0x3F); *tmp++ = 0x80 | (code & 0x3F); } utf16++; } else { /* Invalid trailing code unit. It's still valid on its own though so only the first unit gets replaced */ *tmp++ = 0xef; *tmp++ = 0xbf; *tmp++ = 0xbd; /* Replacement char */ } } else { /* 0xdc00 <= code < 0xe00, which makes code a trailing code unit without a leading one, which is invalid */ *tmp++ = 0xef; *tmp++ = 0xbf; *tmp++ = 0xbd; /* Replacement char */ } } return str; }