ksmbd: add support for surrogate pair conversion (0c180317) · Commits · git / linux-nf

fs/smb/server/unicode.c

+138 −49

Original line number	Diff line number	Diff line
		@@ -13,46 +13,10 @@
		#include "unicode.h"
		#include "smb_common.h"

		/*
		 * smb_utf16_bytes() - size of a UTF-16LE string once converted
		 * @from:	start of the UTF-16LE input
		 * @maxbytes:	upper bound, in bytes, on how much of @from may be read
		 * @codepage:	NLS table describing the destination charset
		 *
		 * Scans @from one 16-bit unit at a time, stopping at the first zero unit
		 * or once @maxbytes / 2 units have been examined, and sums the length that
		 * @codepage->uni2char() reports for each unit. A unit for which uni2char()
		 * does not report a positive length is counted as a single byte. No room
		 * for a terminator is included in the result.
		 *
		 * Return: number of bytes the converted string occupies
		 */
		static int smb_utf16_bytes(const __le16 *from, int maxbytes,
					   const struct nls_table *codepage)
		{
			char scratch[NLS_MAX_CHARSET_SIZE];
			const int nr_units = maxbytes / 2;
			int total = 0;
			int pos = 0;

			while (pos < nr_units) {
				__u16 unit = get_unaligned_le16(&from[pos]);
				int n;

				/* a zero unit terminates the input string */
				if (!unit)
					break;

				n = codepage->uni2char(unit, scratch, NLS_MAX_CHARSET_SIZE);
				/* unconvertible units still take up one output byte */
				total += (n > 0) ? n : 1;
				pos++;
			}

			return total;
		}

		/*
		* cifs_mapchar() - convert a host-endian char to proper char in codepage
		* @target: where converted character should be copied
		* @src_char: 2 byte host-endian source character
		* @from: host-endian source string
		* @cp: codepage to which character should be converted
		* @mapchar: should character be mapped according to mapchars mount option?
		*
		@@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes,
		* Return: string length after conversion
		*/
		static int
		cifs_mapchar(char target, const __u16 src_char, const struct nls_table cp,
		cifs_mapchar(char target, const __u16 from, const struct nls_table *cp,
		bool mapchar)
		{
		int len = 1;
		__u16 src_char;

		src_char = *from;

		if (!mapchar)
		goto cp_convert;
		@@ -104,12 +71,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,

		cp_convert:
		len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
		if (len <= 0) {
		if (len <= 0)
		goto surrogate_pair;

		goto out;

		surrogate_pair:
		/* convert SURROGATE_PAIR and IVS */
		if (strcmp(cp->charset, "utf8"))
		goto unknown;
		len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
		if (len <= 0)
		goto unknown;
		return len;

		unknown:
		*target = '?';
		len = 1;
		goto out;
		}

		goto out;
		/*
		 * smb_utf16_bytes() - compute converted string length
		 * @from:	start of the UTF-16LE input
		 * @maxbytes:	length of the input in bytes; no unit beyond it is read
		 * @codepage:	NLS table describing the destination charset
		 *
		 * Walks @from one 16-bit unit at a time, stopping at the first zero unit
		 * or after @maxbytes / 2 units. For every position the current unit plus
		 * the two units that follow it (zero-filled when they would lie past the
		 * end of the input) are handed to cifs_mapchar() with character mapping
		 * disabled, and the length it reports is added to the total. A
		 * non-positive length is counted as one byte. No room for a terminator is
		 * included in the result.
		 *
		 * NOTE(review): the walk always advances by a single unit, even when
		 * cifs_mapchar() converted more than one unit of the window, so the
		 * result looks like an upper bound rather than an exact size for such
		 * input - confirm against the callers.
		 *
		 * Return: string length after conversion
		 */
		static int smb_utf16_bytes(const __le16 *from, int maxbytes,
					   const struct nls_table *codepage)
		{
			const int nr_units = maxbytes / 2;
			char scratch[NLS_MAX_CHARSET_SIZE];
			__u16 window[3];
			int total = 0;
			int pos, ahead, n;

			for (pos = 0; pos < nr_units; pos++) {
				window[0] = get_unaligned_le16(&from[pos]);
				if (!window[0])
					break;

				/* look-ahead units, zero-filled past the end of the input */
				for (ahead = 1; ahead < 3; ahead++) {
					window[ahead] = 0;
					if (pos + ahead < nr_units)
						window[ahead] =
							get_unaligned_le16(&from[pos + ahead]);
				}

				n = cifs_mapchar(scratch, window, codepage, 0);
				/* a failed conversion still takes up one output byte */
				total += (n > 0) ? n : 1;
			}

			return total;
		}

		/*
		@@ -139,12 +160,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
		static int smb_from_utf16(char to, const __le16 from, int tolen, int fromlen,
		const struct nls_table *codepage, bool mapchar)
		{
		int i, charlen, safelen;
		int i, j, charlen, safelen;
		int outlen = 0;
		int nullsize = nls_nullsize(codepage);
		int fromwords = fromlen / 2;
		char tmp[NLS_MAX_CHARSET_SIZE];
		__u16 ftmp;
		__u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */

		/*
		* because the chars can be of varying widths, we need to take care
		@@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
		safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);

		for (i = 0; i < fromwords; i++) {
		ftmp = get_unaligned_le16(&from[i]);
		if (ftmp == 0)
		ftmp[0] = get_unaligned_le16(&from[i]);
		if (ftmp[0] == 0)
		break;
		for (j = 1; j <= 2; j++) {
		if (i + j < fromwords)
		ftmp[j] = get_unaligned_le16(&from[i + j]);
		else
		ftmp[j] = 0;
		}

		/*
		* check to see if converting this character might make the
		@@ -172,6 +199,19 @@ static int smb_from_utf16(char to, const __le16 from, int tolen, int fromlen,
		/* put converted char into 'to' buffer */
		charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar);
		outlen += charlen;

		/*
		* charlen (=bytes of UTF-8 for 1 character)
		* 4bytes UTF-8(surrogate pair) is charlen=4
		* (4bytes UTF-16 code)
		* 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
		* (2 UTF-8 pairs divided to 2 UTF-16 pairs)
		*/
		if (charlen == 4)
		i++;
		else if (charlen >= 5)
		/* 5-6bytes UTF-8 */
		i += 2;
		}

		/* properly null-terminate string */
		@@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
		char src_char;
		__le16 dst_char;
		wchar_t tmp;
		wchar_t wchar_to[6]; /* UTF-16 */
		int ret;
		unicode_t u;

		if (!mapchars)
		return smb_strtoUTF16(target, source, srclen, cp);
		@@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen,
		* if no match, use question mark, which at least in
		* some cases serves as wild card
		*/
		if (charlen < 1) {
		if (charlen > 0)
		goto ctoUTF16;

		/* convert SURROGATE_PAIR */
		if (strcmp(cp->charset, "utf8"))
		goto unknown;
		if (*(source + i) & 0x80) {
		charlen = utf8_to_utf32(source + i, 6, &u);
		if (charlen < 0)
		goto unknown;
		} else
		goto unknown;
		ret = utf8s_to_utf16s(source + i, charlen,
		UTF16_LITTLE_ENDIAN,
		wchar_to, 6);
		if (ret < 0)
		goto unknown;

		i += charlen;
		dst_char = cpu_to_le16(*wchar_to);
		if (charlen <= 3)
		/* 1-3bytes UTF-8 to 2bytes UTF-16 */
		put_unaligned(dst_char, &target[j]);
		else if (charlen == 4) {
		/*
		* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
		* 7-8bytes UTF-8(IVS) divided to 2 UTF-16
		* (charlen=3+4 or 4+4)
		*/
		put_unaligned(dst_char, &target[j]);
		dst_char = cpu_to_le16(*(wchar_to + 1));
		j++;
		put_unaligned(dst_char, &target[j]);
		} else if (charlen >= 5) {
		/* 5-6bytes UTF-8 to 6bytes UTF-16 */
		put_unaligned(dst_char, &target[j]);
		dst_char = cpu_to_le16(*(wchar_to + 1));
		j++;
		put_unaligned(dst_char, &target[j]);
		dst_char = cpu_to_le16(*(wchar_to + 2));
		j++;
		put_unaligned(dst_char, &target[j]);
		}
		continue;

		unknown:
		dst_char = cpu_to_le16(0x003f);
		charlen = 1;
		}
		}

		ctoUTF16:
		/*
		* character may take more than one byte in the source string,
		* but will take exactly two bytes in the target string