diff --git a/core/io/file_access.cpp b/core/io/file_access.cpp index 7d8da1b11c2..da25f23917f 100644 --- a/core/io/file_access.cpp +++ b/core/io/file_access.cpp @@ -388,9 +388,7 @@ String FileAccess::get_as_utf8_string() const { w[len] = 0; String s; - if (s.parse_utf8((const char *)w)) { - return String(); - } + s.parse_utf8((const char *)w); return s; } @@ -516,7 +514,6 @@ String FileAccess::get_pascal_string() { String ret; ret.parse_utf8(cs.ptr()); - return ret; } diff --git a/core/io/marshalls.cpp b/core/io/marshalls.cpp index f71ea5c39e0..8ee19f274e4 100644 --- a/core/io/marshalls.cpp +++ b/core/io/marshalls.cpp @@ -78,7 +78,7 @@ static Error _decode_string(const uint8_t *&buf, int &len, int *r_len, String &r ERR_FAIL_COND_V(strlen < 0 || strlen + pad > len, ERR_FILE_EOF); String str; - ERR_FAIL_COND_V(str.parse_utf8((const char *)buf, strlen), ERR_INVALID_DATA); + ERR_FAIL_COND_V(str.parse_utf8((const char *)buf, strlen) != OK, ERR_INVALID_DATA); r_string = str; // Add padding diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp index df1aae53707..beefe54fafd 100644 --- a/core/string/ustring.cpp +++ b/core/string/ustring.cpp @@ -323,7 +323,13 @@ void String::copy_from(const char *p_cstr) { char32_t *dst = this->ptrw(); for (size_t i = 0; i <= len; i++) { - dst[i] = p_cstr[i]; + uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]); + if (c == 0 && i < len) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + } else { + dst[i] = c; + } } } @@ -350,7 +356,13 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) { char32_t *dst = this->ptrw(); for (int i = 0; i < len; i++) { - dst[i] = p_cstr[i]; + uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]); + if (c == 0) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + } else { + dst[i] = c; + } } dst[len] = 0; } @@ -376,14 +388,21 @@ void String::copy_from(const wchar_t *p_cstr, const int p_clip_to) { } void String::copy_from(const char32_t &p_char) { - resize(2); - char32_t *dst = ptrw(); - if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + "."); - dst[0] = 0xfffd; - } else { - dst[0] = p_char; + if (p_char == 0) { + print_unicode_error("NUL character", true); + return; } + if ((p_char & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char)); + } + if (p_char > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char)); + } + + resize(2); + + char32_t *dst = ptrw(); + dst[0] = p_char; dst[1] = 0; } @@ -437,12 +456,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) { dst[p_length] = 0; for (int i = 0; i < p_length; i++) { - if ((p_char[i] >= 0xd800 && p_char[i] <= 0xdfff) || (p_char[i] > 0x10ffff)) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char[i], 16) + "."); - dst[i] = 0xfffd; - } else { - dst[i] = p_char[i]; + if (p_char[i] == 0) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + continue; } + if ((p_char[i] & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i])); + } + if (p_char[i] > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i])); + } + dst[i] = p_char[i]; } } @@ -481,7 +506,7 @@ String operator+(const wchar_t *p_chr, const String &p_str) { // wchar_t is 16-bit String tmp = String::utf16((const char16_t *)p_chr); #else - // wchar_t is 32-bi + // wchar_t is 32-bit String tmp = (const char32_t *)p_chr; #endif tmp += p_str; @@ -527,7 +552,13 @@ String &String::operator+=(const char *p_str) { char32_t *dst = ptrw() + lhs_len; for (size_t i = 0; i <= rhs_len; i++) { - dst[i] = p_str[i]; + uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]); + if (c == 0 && i < rhs_len) { + print_unicode_error("NUL character", true); + dst[i] = 0x20; + } else { + dst[i] = c; + } } return *this; @@ -550,15 +581,21 @@ String &String::operator+=(const char32_t *p_str) { } String &String::operator+=(char32_t p_char) { + if (p_char == 0) { + print_unicode_error("NUL character", true); + return *this; + } + if ((p_char & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char)); + } + if (p_char > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char)); + } + const int lhs_len = length(); resize(lhs_len + 2); char32_t *dst = ptrw(); - if ((p_char >= 0xd800 && p_char <= 0xdfff) || (p_char > 0x10ffff)) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(p_char, 16) + "."); - dst[lhs_len] = 0xfffd; - } else { - dst[lhs_len] = p_char; - } + dst[lhs_len] = p_char; dst[lhs_len + 1] = 0; return *this; @@ -1583,6 +1620,14 @@ String String::hex_encode_buffer(const uint8_t *p_buffer, int p_len) { return ret; } +void String::print_unicode_error(const String &p_message, bool p_critical) const { + if (p_critical) { + print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message)); + } else { + print_error(vformat("Unicode parsing error: %s", p_message)); + } +} + CharString String::ascii(bool p_allow_extended) const { if (!length()) { return CharString(); @@ -1596,7 +1641,7 @@ CharString String::ascii(bool p_allow_extended) const { if ((c <= 0x7f) || (c <= 0xff && p_allow_extended)) { cs[i] = c; } else { - print_error("Unicode parsing error: Cannot represent " + num_int64(c, 16) + " as ASCII/Latin-1 character."); + print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c)); cs[i] = 0x20; } } @@ -1611,11 +1656,9 @@ String String::utf8(const char *p_utf8, int p_len) { return ret; } -bool String::parse_utf8(const char *p_utf8, int p_len) { -#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-8?"); - +Error String::parse_utf8(const char *p_utf8, int p_len) { if (!p_utf8) { - return true; + return ERR_INVALID_DATA; } String aux; @@ -1635,14 +1678,17 @@ bool String::parse_utf8(const char *p_utf8, int p_len) { } } + bool decode_error = false; + bool decode_failed = false; { const char *ptrtmp = p_utf8; const char *ptrtmp_limit = &p_utf8[p_len]; int skip = 0; + uint8_t c_start = 0; while (ptrtmp != ptrtmp_limit && *ptrtmp) { - if (skip == 0) { - uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp); + uint8_t c = *ptrtmp >= 0 ? *ptrtmp : uint8_t(256 + *ptrtmp); + if (skip == 0) { /* Determine the number of characters in sequence */ if ((c & 0x80) == 0) { skip = 0; @@ -1652,20 +1698,34 @@ bool String::parse_utf8(const char *p_utf8, int p_len) { skip = 2; } else if ((c & 0xf8) == 0xf0) { skip = 3; + } else if ((c & 0xfc) == 0xf8) { + skip = 4; + } else if ((c & 0xfe) == 0xfc) { + skip = 5; } else { - UNICERROR("invalid skip at " + num_int64(cstr_size)); - return true; //invalid utf8 + skip = 0; + print_unicode_error(vformat("Invalid UTF-8 leading byte (%x)", c), true); + decode_failed = true; } + c_start = c; if (skip == 1 && (c & 0x1e) == 0) { - UNICERROR("overlong rejected at " + num_int64(cstr_size)); - return true; //reject overlong + print_unicode_error(vformat("Overlong encoding (%x ...)", c)); + decode_error = true; } - str_size++; - } else { - --skip; + if ((c_start == 0xe0 && skip == 2 && c < 0xa0) || (c_start == 0xf0 && skip == 3 && c < 0x90) || (c_start == 0xf8 && skip == 4 && c < 0x88) || (c_start == 0xfc && skip == 5 && c < 0x84)) { + print_unicode_error(vformat("Overlong encoding (%x %x ...)", c_start, c)); + decode_error = true; + } + if (c < 0x80 || c > 0xbf) { + print_unicode_error(vformat("Invalid UTF-8 continuation byte (%x ... %x ...)", c_start, c), true); + decode_failed = true; + skip = 0; + } else { + --skip; + } } cstr_size++; @@ -1673,80 +1733,91 @@ bool String::parse_utf8(const char *p_utf8, int p_len) { } if (skip) { - UNICERROR("no space left"); - return true; //not enough space + print_unicode_error(vformat("Missing %d UTF-8 continuation byte(s)", skip), true); + decode_failed = true; } } if (str_size == 0) { clear(); - return false; + return OK; // empty string } resize(str_size + 1); char32_t *dst = ptrw(); dst[str_size] = 0; + int skip = 0; + uint32_t unichar = 0; while (cstr_size) { - int len = 0; + uint8_t c = *p_utf8 >= 0 ? *p_utf8 : uint8_t(256 + *p_utf8); - /* Determine the number of characters in sequence */ - if ((*p_utf8 & 0x80) == 0) { - len = 1; - } else if ((*p_utf8 & 0xe0) == 0xc0) { - len = 2; - } else if ((*p_utf8 & 0xf0) == 0xe0) { - len = 3; - } else if ((*p_utf8 & 0xf8) == 0xf0) { - len = 4; + if (skip == 0) { + /* Determine the number of characters in sequence */ + if ((c & 0x80) == 0) { + *(dst++) = c; + unichar = 0; + skip = 0; + } else if ((c & 0xe0) == 0xc0) { + unichar = (0xff >> 3) & c; + skip = 1; + } else if ((c & 0xf0) == 0xe0) { + unichar = (0xff >> 4) & c; + skip = 2; + } else if ((c & 0xf8) == 0xf0) { + unichar = (0xff >> 5) & c; + skip = 3; + } else if ((c & 0xfc) == 0xf8) { + unichar = (0xff >> 6) & c; + skip = 4; + } else if ((c & 0xfe) == 0xfc) { + unichar = (0xff >> 7) & c; + skip = 5; + } else { + *(dst++) = 0x20; + unichar = 0; + skip = 0; + } } else { - UNICERROR("invalid len"); - return true; //invalid UTF8 - } - - if (len > cstr_size) { - UNICERROR("no space left"); - return true; //not enough space - } - - if (len == 2 && (*p_utf8 & 0x1E) == 0) { - UNICERROR("no space left"); - return true; //reject overlong - } - - /* Convert the first character */ - - uint32_t unichar = 0; - - if (len == 1) { - unichar = *p_utf8; - } else { - unichar = (0xff >> (len + 1)) & *p_utf8; - - for (int i = 1; i < len; i++) { - if ((p_utf8[i] & 0xc0) != 0x80) { - UNICERROR("invalid utf8"); - return true; //invalid utf8 + if (c < 0x80 || c > 0xbf) { + *(dst++) = 0x20; + skip = 0; + } else { + unichar = (unichar << 6) | (c & 0x3f); + --skip; + if (skip == 0) { + if (unichar == 0) { + print_unicode_error("NUL character", true); + decode_failed = true; + unichar = 0x20; + } + if ((unichar & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", unichar)); + decode_error = true; + } + if (unichar > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar)); + decode_error = true; + } + *(dst++) = unichar; } - if (unichar == 0 && i == 2 && ((p_utf8[i] & 0x7f) >> (7 - len)) == 0) { - UNICERROR("invalid utf8 overlong"); - return true; //no overlong - } - unichar = (unichar << 6) | (p_utf8[i] & 0x3f); } } - if (unichar >= 0xd800 && unichar <= 0xdfff) { - UNICERROR("invalid code point"); - return CharString(); - } - *(dst++) = unichar; - cstr_size -= len; - p_utf8 += len; + cstr_size--; + p_utf8++; + } + if (skip) { + *(dst++) = 0x20; } - return false; -#undef UNICERROR + if (decode_failed) { + return ERR_INVALID_DATA; + } else if (decode_error) { + return ERR_PARSE_ERROR; + } else { + return OK; + } } CharString String::utf8() const { @@ -1765,15 +1836,17 @@ CharString String::utf8() const { fl += 2; } else if (c <= 0xffff) { // 16 bits fl += 3; - } else if (c <= 0x0010ffff) { // 21 bits + } else if (c <= 0x001fffff) { // 21 bits fl += 4; + } else if (c <= 0x03ffffff) { // 26 bits + fl += 5; + print_unicode_error(vformat("Invalid unicode codepoint (%x)", c)); + } else if (c <= 0x7fffffff) { // 31 bits + fl += 6; + print_unicode_error(vformat("Invalid unicode codepoint (%x)", c)); } else { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return CharString(); - } - if (c >= 0xd800 && c <= 0xdfff) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return CharString(); + fl += 1; + print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-8", c), true); } } @@ -1799,11 +1872,26 @@ CharString String::utf8() const { APPEND_CHAR(uint32_t(0xe0 | ((c >> 12) & 0x0f))); // Top 4 bits. APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Middle 6 bits. APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. - } else { // 21 bits + } else if (c <= 0x001fffff) { // 21 bits APPEND_CHAR(uint32_t(0xf0 | ((c >> 18) & 0x07))); // Top 3 bits. APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper middle 6 bits. APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits. APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. + } else if (c <= 0x03ffffff) { // 26 bits + APPEND_CHAR(uint32_t(0xf8 | ((c >> 24) & 0x03))); // Top 2 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Upper middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. + } else if (c <= 0x7fffffff) { // 31 bits + APPEND_CHAR(uint32_t(0xfc | ((c >> 30) & 0x01))); // Top 1 bit. + APPEND_CHAR(uint32_t(0x80 | ((c >> 24) & 0x3f))); // Upper upper middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 18) & 0x3f))); // Lower upper middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 12) & 0x3f))); // Upper lower middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. + } else { + APPEND_CHAR(0x20); } } #undef APPEND_CHAR @@ -1819,11 +1907,9 @@ String String::utf16(const char16_t *p_utf16, int p_len) { return ret; } -bool String::parse_utf16(const char16_t *p_utf16, int p_len) { -#define UNICERROR(m_err) print_error("Unicode parsing error: " + String(m_err) + ". Is the string valid UTF-16?"); - +Error String::parse_utf16(const char16_t *p_utf16, int p_len) { if (!p_utf16) { - return true; + return ERR_INVALID_DATA; } String aux; @@ -1850,80 +1936,90 @@ bool String::parse_utf16(const char16_t *p_utf16, int p_len) { } } + bool decode_error = false; { const char16_t *ptrtmp = p_utf16; const char16_t *ptrtmp_limit = &p_utf16[p_len]; - int skip = 0; + uint32_t c_prev = 0; + bool skip = false; while (ptrtmp != ptrtmp_limit && *ptrtmp) { uint32_t c = (byteswap) ? BSWAP16(*ptrtmp) : *ptrtmp; - if (skip == 0) { - if ((c & 0xfffffc00) == 0xd800) { - skip = 1; // lead surrogate - } else if ((c & 0xfffffc00) == 0xdc00) { - UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size)); - return true; // invalid UTF16 - } else { - skip = 0; + + if ((c & 0xfffffc00) == 0xd800) { // lead surrogate + if (skip) { + print_unicode_error(vformat("Unpaired lead surrogate (%x [trail?] %x)", c_prev, c)); + decode_error = true; } - str_size++; + skip = true; + } else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate + if (skip) { + str_size--; + } else { + print_unicode_error(vformat("Unpaired trail surrogate (%x [lead?] %x)", c_prev, c)); + decode_error = true; + } + skip = false; } else { - if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate - --skip; - } else { - UNICERROR("invalid utf16 surrogate at " + num_int64(cstr_size)); - return true; // invalid UTF16 - } + skip = false; } + c_prev = c; + str_size++; cstr_size++; ptrtmp++; } if (skip) { - UNICERROR("no space left"); - return true; // not enough space + print_unicode_error(vformat("Unpaired lead surrogate (%x [eol])", c_prev)); + decode_error = true; } } if (str_size == 0) { clear(); - return false; + return OK; // empty string } resize(str_size + 1); char32_t *dst = ptrw(); dst[str_size] = 0; + bool skip = false; + uint32_t c_prev = 0; while (cstr_size) { - int len = 0; uint32_t c = (byteswap) ? BSWAP16(*p_utf16) : *p_utf16; - if ((c & 0xfffffc00) == 0xd800) { - len = 2; + if ((c & 0xfffffc00) == 0xd800) { // lead surrogate + if (skip) { + *(dst++) = c_prev; // unpaired, store as is + } + skip = true; + } else if ((c & 0xfffffc00) == 0xdc00) { // trail surrogate + if (skip) { + *(dst++) = (c_prev << 10UL) + c - ((0xd800 << 10UL) + 0xdc00 - 0x10000); // decode pair + } else { + *(dst++) = c; // unpaired, store as is + } + skip = false; } else { - len = 1; + *(dst++) = c; + skip = false; } - if (len > cstr_size) { - UNICERROR("no space left"); - return true; //not enough space - } - - uint32_t unichar = 0; - if (len == 1) { - unichar = c; - } else { - uint32_t c2 = (byteswap) ? BSWAP16(p_utf16[1]) : p_utf16[1]; - unichar = (c << 10UL) + c2 - ((0xd800 << 10UL) + 0xdc00 - 0x10000); - } - - *(dst++) = unichar; - cstr_size -= len; - p_utf16 += len; + cstr_size--; + p_utf16++; + c_prev = c; } - return false; -#undef UNICERROR + if (skip) { + *(dst++) = c_prev; + } + + if (decode_error) { + return ERR_PARSE_ERROR; + } else { + return OK; + } } Char16String String::utf16() const { @@ -1938,15 +2034,14 @@ Char16String String::utf16() const { uint32_t c = d[i]; if (c <= 0xffff) { // 16 bits. fl += 1; + if ((c & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", c)); + } } else if (c <= 0x10ffff) { // 32 bits. fl += 2; } else { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return Char16String(); - } - if (c >= 0xd800 && c <= 0xdfff) { - print_error("Unicode parsing error: Invalid unicode codepoint " + num_int64(c, 16) + "."); - return Char16String(); + print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as UTF-16", c), true); + fl += 1; } } @@ -1965,9 +2060,11 @@ Char16String String::utf16() const { if (c <= 0xffff) { // 16 bits. APPEND_CHAR(c); - } else { // 32 bits. + } else if (c <= 0x10ffff) { // 32 bits. APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate. APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate. + } else { + APPEND_CHAR(0x20); } } #undef APPEND_CHAR diff --git a/core/string/ustring.h b/core/string/ustring.h index 11d09743816..1b8bf3d234b 100644 --- a/core/string/ustring.h +++ b/core/string/ustring.h @@ -271,6 +271,9 @@ public: bool is_valid_string() const; + /* debug, error messages */ + void print_unicode_error(const String &p_message, bool p_critical = false) const; + /* complex helpers */ String substr(int p_from, int p_chars = -1) const; int find(const String &p_str, int p_from = 0) const; ///< return <0 if failed @@ -373,11 +376,11 @@ public: CharString ascii(bool p_allow_extended = false) const; CharString utf8() const; - bool parse_utf8(const char *p_utf8, int p_len = -1); //return true on error + Error parse_utf8(const char *p_utf8, int p_len = -1); static String utf8(const char *p_utf8, int p_len = -1); Char16String utf16() const; - bool parse_utf16(const char16_t *p_utf16, int p_len = -1); //return true on error + Error parse_utf16(const char16_t *p_utf16, int p_len = -1); static String utf16(const char16_t *p_utf16, int p_len = -1); static uint32_t hash(const char32_t *p_cstr, int p_len); /* hash the string */ diff --git a/drivers/unix/dir_access_unix.cpp b/drivers/unix/dir_access_unix.cpp index c4b42806fde..39dbadf3cd5 100644 --- a/drivers/unix/dir_access_unix.cpp +++ b/drivers/unix/dir_access_unix.cpp @@ -343,7 +343,7 @@ Error DirAccessUnix::change_dir(String p_dir) { String prev_dir; char real_current_dir_name[2048]; ERR_FAIL_COND_V(getcwd(real_current_dir_name, 2048) == nullptr, ERR_BUG); - if (prev_dir.parse_utf8(real_current_dir_name)) { + if (prev_dir.parse_utf8(real_current_dir_name) != OK) { prev_dir = real_current_dir_name; //no utf8, maybe latin? } @@ -505,7 +505,7 @@ DirAccessUnix::DirAccessUnix() { // set current directory to an absolute path of the current directory char real_current_dir_name[2048]; ERR_FAIL_COND(getcwd(real_current_dir_name, 2048) == nullptr); - if (current_dir.parse_utf8(real_current_dir_name)) { + if (current_dir.parse_utf8(real_current_dir_name) != OK) { current_dir = real_current_dir_name; } diff --git a/drivers/unix/os_unix.cpp b/drivers/unix/os_unix.cpp index 52a4d538e17..091287c652c 100644 --- a/drivers/unix/os_unix.cpp +++ b/drivers/unix/os_unix.cpp @@ -313,7 +313,12 @@ Error OS_Unix::execute(const String &p_path, const List &p_arguments, St if (p_pipe_mutex) { p_pipe_mutex->lock(); } - (*r_pipe) += String::utf8(buf); + String pipe_out; + if (pipe_out.parse_utf8(buf) == OK) { + (*r_pipe) += pipe_out; + } else { + (*r_pipe) += String(buf); // If not valid UTF-8 try decode as Latin-1 + } if (p_pipe_mutex) { p_pipe_mutex->unlock(); } diff --git a/modules/gdscript/gdscript.cpp b/modules/gdscript/gdscript.cpp index e6aeef2fd1b..bf83353ead7 100644 --- a/modules/gdscript/gdscript.cpp +++ b/modules/gdscript/gdscript.cpp @@ -1050,7 +1050,7 @@ Error GDScript::load_source_code(const String &p_path) { w[len] = 0; String s; - if (s.parse_utf8((const char *)w)) { + if (s.parse_utf8((const char *)w) != OK) { ERR_FAIL_V_MSG(ERR_INVALID_DATA, "Script '" + p_path + "' contains invalid unicode (UTF-8), so it was not loaded. Please ensure that scripts are saved in valid UTF-8 unicode."); } diff --git a/modules/gdscript/gdscript_cache.cpp b/modules/gdscript/gdscript_cache.cpp index 4c15fca91ee..48d5fbc5699 100644 --- a/modules/gdscript/gdscript_cache.cpp +++ b/modules/gdscript/gdscript_cache.cpp @@ -157,7 +157,7 @@ String GDScriptCache::get_source_code(const String &p_path) { source_file.write[len] = 0; String source; - if (source.parse_utf8((const char *)source_file.ptr())) { + if (source.parse_utf8((const char *)source_file.ptr()) != OK) { ERR_FAIL_V_MSG("", "Script '" + p_path + "' contains invalid unicode (UTF-8), so it was not loaded. Please ensure that scripts are saved in valid UTF-8 unicode."); } return source; diff --git a/modules/mono/build_scripts/make_android_mono_config.py b/modules/mono/build_scripts/make_android_mono_config.py index 1920ef1c1a6..3459244bc27 100644 --- a/modules/mono/build_scripts/make_android_mono_config.py +++ b/modules/mono/build_scripts/make_android_mono_config.py @@ -43,7 +43,7 @@ String get_godot_android_mono_config() { Compression::decompress(w, config_uncompressed_size, config_compressed_data, config_compressed_size, Compression::MODE_DEFLATE); String s; - if (s.parse_utf8((const char *)w, data.size())) { + if (s.parse_utf8((const char *)w, data.size()) != OK) { ERR_FAIL_V(String()); } return s; diff --git a/modules/mono/utils/path_utils.cpp b/modules/mono/utils/path_utils.cpp index 15a0b281813..a1905dfcfea 100644 --- a/modules/mono/utils/path_utils.cpp +++ b/modules/mono/utils/path_utils.cpp @@ -62,7 +62,8 @@ String cwd() { } String result; - if (result.parse_utf16(buffer.ptr())) { + result.parse_utf16(buffer.ptr()); + if (result.is_empty()) { return "."; } return result.simplify_path(); @@ -73,7 +74,7 @@ String cwd() { } String result; - if (result.parse_utf8(buffer)) { + if (result.parse_utf8(buffer) != OK) { return "."; } @@ -114,7 +115,8 @@ String realpath(const String &p_path) { ::CloseHandle(hFile); String result; - if (result.parse_utf16(buffer.ptr())) { + result.parse_utf16(buffer.ptr()); + if (result.is_empty()) { return p_path; } @@ -127,10 +129,10 @@ String realpath(const String &p_path) { } String result; - bool parse_ok = result.parse_utf8(resolved_path); + Error parse_ok = result.parse_utf8(resolved_path); ::free(resolved_path); - if (parse_ok) { + if (parse_ok != OK) { return p_path; } diff --git a/modules/mono/utils/string_utils.cpp b/modules/mono/utils/string_utils.cpp index 64b68b70af3..975f2d8332b 100644 --- a/modules/mono/utils/string_utils.cpp +++ b/modules/mono/utils/string_utils.cpp @@ -178,7 +178,7 @@ Error read_all_file_utf8(const String &p_path, String &r_content) { w[len] = 0; String source; - if (source.parse_utf8((const char *)w)) { + if (source.parse_utf8((const char *)w) != OK) { ERR_FAIL_V(ERR_INVALID_DATA); } diff --git a/scene/resources/text_file.cpp b/scene/resources/text_file.cpp index 96a47c37c4f..0404e1f79bb 100644 --- a/scene/resources/text_file.cpp +++ b/scene/resources/text_file.cpp @@ -64,7 +64,7 @@ Error TextFile::load_text(const String &p_path) { w[len] = 0; String s; - ERR_FAIL_COND_V_MSG(s.parse_utf8((const char *)w), ERR_INVALID_DATA, "Script '" + p_path + "' contains invalid unicode (UTF-8), so it was not loaded. Please ensure that scripts are saved in valid UTF-8 unicode."); + ERR_FAIL_COND_V_MSG(s.parse_utf8((const char *)w) != OK, ERR_INVALID_DATA, "Script '" + p_path + "' contains invalid unicode (UTF-8), so it was not loaded. Please ensure that scripts are saved in valid UTF-8 unicode."); text = s; path = p_path; return OK; diff --git a/tests/core/string/test_string.h b/tests/core/string/test_string.h index 0b191d2d948..0c5704d6c97 100644 --- a/tests/core/string/test_string.h +++ b/tests/core/string/test_string.h @@ -89,12 +89,12 @@ TEST_CASE("[String] UTF8") { static const char32_t u32str[] = { 0x0045, 0x0020, 0x304A, 0x360F, 0x3088, 0x3046, 0x1F3A4, 0 }; static const uint8_t u8str[] = { 0x45, 0x20, 0xE3, 0x81, 0x8A, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0 }; String s = u32str; - bool err = s.parse_utf8(s.utf8().get_data()); - CHECK(!err); + Error err = s.parse_utf8(s.utf8().get_data()); + CHECK(err == OK); CHECK(s == u32str); err = s.parse_utf8((const char *)u8str); - CHECK(!err); + CHECK(err == OK); CHECK(s == u32str); CharString cs = (const char *)u8str; @@ -106,12 +106,12 @@ TEST_CASE("[String] UTF16") { static const char32_t u32str[] = { 0x0045, 0x0020, 0x304A, 0x360F, 0x3088, 0x3046, 0x1F3A4, 0 }; static const char16_t u16str[] = { 0x0045, 0x0020, 0x304A, 0x360F, 0x3088, 0x3046, 0xD83C, 0xDFA4, 0 }; String s = u32str; - bool err = s.parse_utf16(s.utf16().get_data()); - CHECK(!err); + Error err = s.parse_utf16(s.utf16().get_data()); + CHECK(err == OK); CHECK(s == u32str); err = s.parse_utf16(u16str); - CHECK(!err); + CHECK(err == OK); CHECK(s == u32str); Char16String cs = u16str; @@ -123,8 +123,8 @@ TEST_CASE("[String] UTF8 with BOM") { static const char32_t u32str[] = { 0x0045, 0x0020, 0x304A, 0x360F, 0x3088, 0x3046, 0x1F3A4, 0 }; static const uint8_t u8str[] = { 0xEF, 0xBB, 0xBF, 0x45, 0x20, 0xE3, 0x81, 0x8A, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0 }; String s; - bool err = s.parse_utf8((const char *)u8str); - CHECK(!err); + Error err = s.parse_utf8((const char *)u8str); + CHECK(err == OK); CHECK(s == u32str); CharString cs = (const char *)u8str; @@ -137,12 +137,12 @@ TEST_CASE("[String] UTF16 with BOM") { static const char16_t u16str[] = { 0xFEFF, 0x0020, 0x0045, 0x304A, 0x360F, 0x3088, 0x3046, 0xD83C, 0xDFA4, 0 }; static const char16_t u16str_swap[] = { 0xFFFE, 0x2000, 0x4500, 0x4A30, 0x0F36, 0x8830, 0x4630, 0x3CD8, 0xA4DF, 0 }; String s; - bool err = s.parse_utf16(u16str); - CHECK(!err); + Error err = s.parse_utf16(u16str); + CHECK(err == OK); CHECK(s == u32str); err = s.parse_utf16(u16str_swap); - CHECK(!err); + CHECK(err == OK); CHECK(s == u32str); Char16String cs = u16str; @@ -152,29 +152,48 @@ TEST_CASE("[String] UTF16 with BOM") { CHECK(String::utf16(cs) == s); } -TEST_CASE("[String] Invalid UTF8") { +TEST_CASE("[String] Invalid UTF8 (non-standard)") { ERR_PRINT_OFF - static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0x8F, 0xE3, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0 }; + static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 }; + // + +2 +2 +2 +3 overlong +3 unpaired +2 + static const char32_t u32str[] = { 0x45, 0x304A, 0x3088, 0x3046, 0x1F3A4, 0x20AC, 0xD801, 0 }; String s; - bool err = s.parse_utf8((const char *)u8str); - CHECK(err); - CHECK(s.is_empty()); + Error err = s.parse_utf8((const char *)u8str); + CHECK(err == ERR_PARSE_ERROR); + CHECK(s == u32str); CharString cs = (const char *)u8str; - CHECK(String::utf8(cs).is_empty()); + CHECK(String::utf8(cs) == s); ERR_PRINT_ON } -TEST_CASE("[String] Invalid UTF16") { +TEST_CASE("[String] Invalid UTF8 (unrecoverable)") { + ERR_PRINT_OFF + static const uint8_t u8str[] = { 0x45, 0xE3, 0x81, 0x8A, 0x8F, 0xE3, 0xE3, 0x98, 0x8F, 0xE3, 0x82, 0x88, 0xE3, 0x81, 0x86, 0xC0, 0x80, 0xF0, 0x9F, 0x8E, 0xA4, 0xF0, 0x82, 0x82, 0xAC, 0xED, 0xA0, 0x81, 0 }; + // + +2 inv +2 inv inv inv +2 +2 ovl NUL +1 +3 overlong +3 unpaired +2 + static const char32_t u32str[] = { 0x45, 0x304A, 0x20, 0x20, 0x20, 0x20, 0x3088, 0x3046, 0x20, 0x1F3A4, 0x20AC, 0xD801, 0 }; + String s; + Error err = s.parse_utf8((const char *)u8str); + CHECK(err == ERR_INVALID_DATA); + CHECK(s == u32str); + + CharString cs = (const char *)u8str; + CHECK(String::utf8(cs) == s); + ERR_PRINT_ON +} + +TEST_CASE("[String] Invalid UTF16 (non-standard)") { ERR_PRINT_OFF static const char16_t u16str[] = { 0x0045, 0x304A, 0x3088, 0x3046, 0xDFA4, 0 }; + // + + + + unpaired + static const char32_t u32str[] = { 0x0045, 0x304A, 0x3088, 0x3046, 0xDFA4, 0 }; String s; - bool err = s.parse_utf16(u16str); - CHECK(err); - CHECK(s.is_empty()); + Error err = s.parse_utf16(u16str); + CHECK(err == ERR_PARSE_ERROR); + CHECK(s == u32str); Char16String cs = u16str; - CHECK(String::utf16(cs).is_empty()); + CHECK(String::utf16(cs) == s); ERR_PRINT_ON } @@ -262,8 +281,8 @@ TEST_CASE("[String] Test chr") { CHECK(String::chr('H') == "H"); CHECK(String::chr(0x3012)[0] == 0x3012); ERR_PRINT_OFF - CHECK(String::chr(0xd812)[0] == 0xfffd); // Unpaired UTF-16 surrogate - CHECK(String::chr(0x20d812)[0] == 0xfffd); // Outside UTF-32 range + CHECK(String::chr(0xd812)[0] == 0xd812); // Unpaired UTF-16 surrogate + CHECK(String::chr(0x20d812)[0] == 0x20d812); // Outside UTF-32 range ERR_PRINT_ON } @@ -1125,9 +1144,9 @@ TEST_CASE("[String] lstrip and rstrip") { #undef STRIP_TEST } -TEST_CASE("[String] ensuring empty string into parse_utf8 passes empty string") { +TEST_CASE("[String] Ensuring empty string into parse_utf8 passes empty string") { String empty; - CHECK(empty.parse_utf8(nullptr, -1)); + CHECK(empty.parse_utf8(nullptr, -1) == ERR_INVALID_DATA); } TEST_CASE("[String] Cyrillic to_lower()") { @@ -1440,8 +1459,8 @@ TEST_CASE("[String] validate_node_name") { String name_with_spaces = "Name with spaces"; CHECK(name_with_spaces.validate_node_name() == "Name with spaces"); - String name_with_kana = "Name with kana ゴドツ"; - CHECK(name_with_kana.validate_node_name() == "Name with kana ゴドツ"); + String name_with_kana = U"Name with kana ゴドツ"; + CHECK(name_with_kana.validate_node_name() == U"Name with kana ゴドツ"); String name_with_invalid_chars = "Name with invalid characters :.@removed!"; CHECK(name_with_invalid_chars.validate_node_name() == "Name with invalid characters removed!");