Commit 0c180317 authored by Namjae Jeon's avatar Namjae Jeon Committed by Steve French
Browse files

ksmbd: add support for surrogate pair conversion



ksmbd is missing supporting to convert filename included surrogate pair
characters. It triggers a "file or folder does not exist" error in
Windows client.

[Steps to Reproduce for bug]
1. Create surrogate pair file
 touch $(echo -e '\xf0\x9d\x9f\xa3')
 touch $(echo -e '\xf0\x9d\x9f\xa4')

2. Try to open these files in ksmbd share through Windows client.

This patch update unicode functions not to consider about surrogate pair
(and IVS).

Reviewed-by: default avatarMarios Makassikis <mmakassikis@freebox.fr>
Tested-by: default avatarMarios Makassikis <mmakassikis@freebox.fr>
Signed-off-by: default avatarNamjae Jeon <linkinjeon@kernel.org>
Signed-off-by: default avatarSteve French <stfrench@microsoft.com>
parent ecce70cf
Loading
Loading
Loading
Loading
+138 −49
Original line number Diff line number Diff line
@@ -13,46 +13,10 @@
#include "unicode.h"
#include "smb_common.h"

/*
 * smb_utf16_bytes() - how long will a string be after conversion?
 * @from:	pointer to input string
 * @maxbytes:	don't go past this many bytes of input string
 * @codepage:	destination codepage
 *
 * Walk a utf16le string and return the number of bytes that the string will
 * be after being converted to the given charset, not including any null
 * termination required. Don't walk past maxbytes in the source buffer.
 *
 * Return:	string length after conversion
 */
static int smb_utf16_bytes(const __le16 *from, int maxbytes,
			   const struct nls_table *codepage)
{
	int i;
	int charlen, outlen = 0;
	int maxwords = maxbytes / 2;
	char tmp[NLS_MAX_CHARSET_SIZE];
	__u16 ftmp;

	for (i = 0; i < maxwords; i++) {
		ftmp = get_unaligned_le16(&from[i]);
		if (ftmp == 0)
			break;

		charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE);
		if (charlen > 0)
			outlen += charlen;
		else
			outlen++;
	}

	return outlen;
}

/*
 * cifs_mapchar() - convert a host-endian char to proper char in codepage
 * @target:	where converted character should be copied
 * @src_char:	2 byte host-endian source character
 * @from:	host-endian source string
 * @cp:		codepage to which character should be converted
 * @mapchar:	should character be mapped according to mapchars mount option?
 *
@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
 * Return:	string length after conversion
 */
static int
cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
	     bool mapchar)
{
	int len = 1;
	__u16 src_char;

	src_char = *from;

	if (!mapchar)
		goto cp_convert;
@@ -104,12 +71,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,

cp_convert:
	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
	if (len <= 0) {
	if (len <= 0)
		goto surrogate_pair;

	goto out;

surrogate_pair:
	/* convert SURROGATE_PAIR and IVS */
	if (strcmp(cp->charset, "utf8"))
		goto unknown;
	len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
	if (len <= 0)
		goto unknown;
	return len;

unknown:
	*target = '?';
	len = 1;
	goto out;
}

	goto out;
/*
 * smb_utf16_bytes() - compute converted string length
 * @from:	pointer to input string
 * @maxbytes:	input string length
 * @codepage:	destination codepage
 *
 * Walk a utf16le string and return the number of bytes that the string will
 * be after being converted to the given charset, not including any null
 * termination required. Don't walk past maxbytes in the source buffer.
 *
 * Return:	string length after conversion
 */
static int smb_utf16_bytes(const __le16 *from, int maxbytes,
			   const struct nls_table *codepage)
{
	int i, j;
	int charlen, outlen = 0;
	int maxwords = maxbytes / 2;
	char tmp[NLS_MAX_CHARSET_SIZE];
	__u16 ftmp[3];

	for (i = 0; i < maxwords; i++) {
		ftmp[0] = get_unaligned_le16(&from[i]);
		if (ftmp[0] == 0)
			break;
		for (j = 1; j <= 2; j++) {
			if (i + j < maxwords)
				ftmp[j] = get_unaligned_le16(&from[i + j]);
			else
				ftmp[j] = 0;
		}

		charlen = cifs_mapchar(tmp, ftmp, codepage, 0);
		if (charlen > 0)
			outlen += charlen;
		else
			outlen++;
	}

	return outlen;
}

/*
@@ -139,12 +160,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
			  const struct nls_table *codepage, bool mapchar)
{
	int i, charlen, safelen;
	int i, j, charlen, safelen;
	int outlen = 0;
	int nullsize = nls_nullsize(codepage);
	int fromwords = fromlen / 2;
	char tmp[NLS_MAX_CHARSET_SIZE];
	__u16 ftmp;
	__u16 ftmp[3];	/* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */

	/*
	 * because the chars can be of varying widths, we need to take care
@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);

	for (i = 0; i < fromwords; i++) {
		ftmp = get_unaligned_le16(&from[i]);
		if (ftmp == 0)
		ftmp[0] = get_unaligned_le16(&from[i]);
		if (ftmp[0] == 0)
			break;
		for (j = 1; j <= 2; j++) {
			if (i + j < fromwords)
				ftmp[j] = get_unaligned_le16(&from[i + j]);
			else
				ftmp[j] = 0;
		}

		/*
		 * check to see if converting this character might make the
@@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
		/* put converted char into 'to' buffer */
		charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
		outlen += charlen;

		/*
		 * charlen (=bytes of UTF-8 for 1 character)
		 * 4bytes UTF-8(surrogate pair) is charlen=4
		 * (4bytes UTF-16 code)
		 * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
		 * (2 UTF-8 pairs divided to 2 UTF-16 pairs)
		 */
		if (charlen == 4)
			i++;
		else if (charlen >= 5)
			/* 5-6bytes UTF-8 */
			i += 2;
	}

	/* properly null-terminate string */
@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
	char src_char;
	__le16 dst_char;
	wchar_t tmp;
	wchar_t wchar_to[6];	/* UTF-16 */
	int ret;
	unicode_t u;

	if (!mapchars)
		return smb_strtoUTF16(target, source, srclen, cp);
@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
			 * if no match, use question mark, which at least in
			 * some cases serves as wild card
			 */
			if (charlen < 1) {
			if (charlen > 0)
				goto ctoUTF16;

			/* convert SURROGATE_PAIR */
			if (strcmp(cp->charset, "utf8"))
				goto unknown;
			if (*(source + i) & 0x80) {
				charlen = utf8_to_utf32(source + i, 6, &u);
				if (charlen < 0)
					goto unknown;
			} else
				goto unknown;
			ret  = utf8s_to_utf16s(source + i, charlen,
					UTF16_LITTLE_ENDIAN,
					wchar_to, 6);
			if (ret < 0)
				goto unknown;

			i += charlen;
			dst_char = cpu_to_le16(*wchar_to);
			if (charlen <= 3)
				/* 1-3bytes UTF-8 to 2bytes UTF-16 */
				put_unaligned(dst_char, &target[j]);
			else if (charlen == 4) {
				/*
				 * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
				 * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
				 * (charlen=3+4 or 4+4)
				 */
				put_unaligned(dst_char, &target[j]);
				dst_char = cpu_to_le16(*(wchar_to + 1));
				j++;
				put_unaligned(dst_char, &target[j]);
			} else if (charlen >= 5) {
				/* 5-6bytes UTF-8 to 6bytes UTF-16 */
				put_unaligned(dst_char, &target[j]);
				dst_char = cpu_to_le16(*(wchar_to + 1));
				j++;
				put_unaligned(dst_char, &target[j]);
				dst_char = cpu_to_le16(*(wchar_to + 2));
				j++;
				put_unaligned(dst_char, &target[j]);
			}
			continue;

unknown:
			dst_char = cpu_to_le16(0x003f);
			charlen = 1;
		}
		}

ctoUTF16:
		/*
		 * character may take more than one byte in the source string,
		 * but will take exactly two bytes in the target string