Files
2024-04-29 20:48:28 -03:00

308 lines
11 KiB
C

/* -*- mode:c; tab-width:8; c-basic-offset:8; indent-tabs-mode:nil; -*- */
/*
Copyright (C) 2016 by Ronnie Sahlberg <ronniesahlberg@gmail.com>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#ifdef HAVE_STDINT_H
#include <stdint.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_TIME_H
#include <time.h>
#endif
#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif
#ifdef STDC_HEADERS
#include <stddef.h>
#endif
#include "compat.h"
#include "portable-endian.h"
#include <smb2.h>
#include <libsmb2.h>
#include "libsmb2-private.h"
/* Count the number of leading 1 bits in the char.
 * Returns a value between 0 (top bit clear) and 8 (all bits set).
 */
static int
l1(char c)
{
        /* Plain char may be signed, and left-shifting a negative value is
         * undefined behaviour, so do the bit twiddling on an unsigned copy.
         */
        unsigned char bits = (unsigned char)c;
        int count = 0;

        while (bits & 0x80) {
                count++;
                /* Bits shifted past bit 7 are dropped when the result is
                 * stored back into the 8-bit variable, so the loop ends
                 * after at most eight iterations.
                 */
                bits <<= 1;
        }
        return count;
}
/* Decode one UTF-8 encoded codepoint and check that it is well formed.
 * *utf8 is advanced past every byte that was read, whether or not the
 * sequence turned out to be valid.
 * Returns 1 if the encoding is valid and needs one UTF-16 code unit,
 * 2 if the encoding is valid and needs two UTF-16 code units,
 * -1 if the encoding is invalid.
 * On success the code unit(s) are stored in ret[0] (and ret[1] when 2 is
 * returned) without any byte swapping, so ret must have room for two entries.
 */
static int
validate_utf8_cp(const char **utf8, uint16_t *ret)
{
        /* Smallest codepoint that legitimately needs a sequence of the
         * given length; anything below it is an overlong encoding.
         */
        static const uint32_t smallest[5] = { 0, 0, 0x80, 0x800, 0x10000 };
        int byte = *(*utf8)++;
        int seq_len = l1(byte);
        int remaining;
        uint32_t value;

        if (seq_len == 0) {
                /* 7-bit ascii is always ok */
                *ret = byte & 0x7f;
                return 1;
        }
        if (seq_len == 1 || seq_len > 4) {
                /* 10.. .... can never start a new codepoint, and lead
                 * bytes announcing more than four bytes are rejected.
                 */
                return -1;
        }

        /* Payload bits carried by the lead byte */
        value = byte & (0x7f >> seq_len);

        /* The lead byte must be followed by exactly seq_len - 1 bytes
         * matching 10.. ...., each contributing six more bits.
         */
        for (remaining = seq_len - 1; remaining > 0; remaining--) {
                byte = *(*utf8)++;
                if (l1(byte) != 1) {
                        return -1;
                }
                value = (value << 6) | (byte & 0x3f);
        }

        /* Reject overlong sequences */
        if (value < smallest[seq_len]) {
                return -1;
        }
        /* The surrogate range can not be encoded as a codepoint */
        if (value >= 0xd800 && value < 0xe000) {
                return -1;
        }
        /* Beyond the last codepoint UTF-16 can represent */
        if (value >= 0x110000) {
                return -1;
        }

        if (value < 0x10000) {
                /* Single UTF-16 code unit */
                *ret = value;
                return 1;
        }

        /* Split into a leading/trailing surrogate pair */
        value -= 0x10000;
        ret[0] = 0xd800 | (value >> 10);
        ret[1] = 0xdc00 | (value & 0x3ff);
        return 2;
}
/* Check that the NUL terminated string is well formed UTF-8.
 * Returns the number of UTF-16 code units needed to represent the string
 * (>= 0) if it is valid, and -1 as soon as an invalid sequence is found.
 */
static int
validate_utf8_str(const char *utf8)
{
        const char *cursor;
        uint16_t scratch[2];    /* decoded units are discarded, only the count is kept */
        int total = 0;

        for (cursor = utf8; *cursor != '\0'; ) {
                int units = validate_utf8_cp(&cursor, scratch);

                if (units < 0) {
                        return -1;
                }
                total += units;
        }
        return total;
}
/* Convert a NUL terminated UTF8 string into UTF-16LE.
 * Returns a newly malloc()ed struct smb2_utf16 whose len field holds the
 * number of UTF-16 code units stored in val (no terminator is appended).
 * Returns NULL if utf8 is NULL, if it is not valid UTF-8, or if the
 * allocation fails.
 */
struct smb2_utf16 *
smb2_utf8_to_utf16(const char *utf8)
{
        struct smb2_utf16 *utf16;
        int i, len;

        if (utf8 == NULL) {
                return NULL;
        }

        /* First pass: validate and count the UTF-16 code units we need */
        len = validate_utf8_str(utf8);
        if (len < 0) {
                return NULL;
        }

        utf16 = (struct smb2_utf16 *)(malloc(offsetof(struct smb2_utf16, val) + 2 * (size_t)len));
        if (utf16 == NULL) {
                return NULL;
        }
        utf16->len = len;

        /* Second pass: decode straight into the buffer in host byte order,
         * then convert each unit that was written to little-endian.
         */
        i = 0;
        while (i < len) {
                switch (validate_utf8_cp(&utf8, &utf16->val[i])) {
                case 1:
                        utf16->val[i] = htole16(utf16->val[i]);
                        i += 1;
                        break;
                case 2:
                        utf16->val[i] = htole16(utf16->val[i]);
                        utf16->val[i+1] = htole16(utf16->val[i+1]);
                        i += 2;
                        break;
                default:
                        /* Should not happen since the string passed validation
                         * above. Bail out instead of looping forever, because
                         * i would never advance on this path.
                         */
                        free(utf16);
                        return NULL;
                }
        }
        return utf16;
}
/* Compute how many bytes of UTF-8 are needed to encode a UTF-16LE string.
 * utf16 points at utf16_len code units stored in little-endian byte order.
 * A leading surrogate without a trailing one, or a trailing surrogate
 * without a leading one, is counted as the three byte replacement char.
 * The returned count does not include room for a terminating NUL.
 */
static int
utf16_size(const uint16_t *utf16, size_t utf16_len)
{
        int length = 0;
        const uint16_t *utf16_end = utf16 + utf16_len;

        while (utf16 < utf16_end) {
                uint32_t code = le16toh(*utf16++);

                if (code < 0x80) {
                        length += 1; /* One UTF-16 code unit maps to one UTF-8 code unit */
                } else if (code < 0x800) {
                        length += 2; /* One UTF-16 code unit maps to two UTF-8 code units */
                } else if (code < 0xd800 || code >= 0xe000) {
                        length += 3; /* Everything else outside the surrogate range needs three */
                } else if (code < 0xdc00) { /* Leading surrogate */
                        uint32_t trail;

                        if (utf16 == utf16_end) {
                                /* The stream ends with a leading code unit, which is an error */
                                length += 3; /* Replacement char */
                                break;
                        }
                        trail = le16toh(*utf16);
                        if (trail - 0xdc00 < 0x400) { /* Check that 0xdc00 <= trail < 0xe000 */
                                /* A valid pair always decodes to U+10000..U+10FFFF,
                                 * so it always takes four UTF-8 code units.
                                 */
                                length += 4;
                                utf16++;
                        } else {
                                /* Invalid trailing code unit. It is left in place to be
                                 * handled on its own, so only the leading unit is replaced.
                                 */
                                length += 3; /* Replacement char */
                        }
                } else {
                        /* 0xdc00 <= code < 0xe000: a trailing code unit without a leading one */
                        length += 3; /* Replacement char */
                }
        }
        return length;
}
/*
 * Convert a UTF-16LE string into UTF8.
 * utf16 points at utf16_len code units stored in little-endian byte order.
 * Returns a newly malloc()ed, NUL terminated UTF-8 string, or NULL if the
 * allocation fails. Nothing here releases the buffer, so the caller is
 * presumably expected to free() it.
 * A leading surrogate without a trailing one, or a trailing surrogate
 * without a leading one, is converted to the replacement char U+FFFD.
 */
const char *
smb2_utf16_to_utf8(const uint16_t *utf16, size_t utf16_len)
{
        int utf8_len = 1;       /* Room for the terminating NUL */
        char *str, *tmp;
        const uint16_t *utf16_end;

        /* How many bytes do we need for utf8 ? */
        utf8_len += utf16_size(utf16, utf16_len);
        str = tmp = (char*)malloc(utf8_len);
        if (str == NULL) {
                return NULL;
        }
        str[utf8_len - 1] = 0;

        utf16_end = utf16 + utf16_len;
        while (utf16 < utf16_end) {
                uint32_t code = le16toh(*utf16++);

                if (code >= 0xd800 && code < 0xe000) {
                        /* Surrogate range. Only a leading unit (< 0xdc00) that is
                         * immediately followed by a trailing unit forms a codepoint.
                         * trail stays 0 in every other case, which fails the range
                         * check below.
                         */
                        uint32_t trail = 0;

                        if (code < 0xdc00 && utf16 < utf16_end) {
                                trail = le16toh(*utf16);
                        }
                        if (trail - 0xdc00 < 0x400) { /* Check that 0xdc00 <= trail < 0xe000 */
                                /* Always lands in U+10000..U+10FFFF */
                                code = 0x10000 + ((code & 0x3ff) << 10) + (trail & 0x3ff);
                                utf16++;
                        } else {
                                /* An unpaired unit is replaced. An invalid trailing unit
                                 * is not consumed; it is handled on its own next round.
                                 */
                                code = 0xfffd; /* Replacement char, encodes as ef bf bd */
                        }
                }

                if (code < 0x80) {
                        *tmp++ = (char)code;
                } else if (code < 0x800) {
                        *tmp++ = (char)(0xc0 | (code >> 6));
                        *tmp++ = (char)(0x80 | (code & 0x3f));
                } else if (code < 0x10000) {
                        *tmp++ = (char)(0xe0 | (code >> 12));
                        *tmp++ = (char)(0x80 | ((code >> 6) & 0x3f));
                        *tmp++ = (char)(0x80 | (code & 0x3f));
                } else {
                        *tmp++ = (char)(0xf0 | (code >> 18));
                        *tmp++ = (char)(0x80 | ((code >> 12) & 0x3f));
                        *tmp++ = (char)(0x80 | ((code >> 6) & 0x3f));
                        *tmp++ = (char)(0x80 | (code & 0x3f));
                }
        }
        return str;
}