From b5e3238109a67b5c59ddb90c0646655f1d9d506c Mon Sep 17 00:00:00 2001 From: bruvzg <7645683+bruvzg@users.noreply.github.com> Date: Wed, 5 Jul 2023 12:49:16 +0300 Subject: [PATCH] [TextServer] Fix get_word_breaks and it uses. --- doc/classes/TextServer.xml | 5 +- editor/debugger/script_editor_debugger.cpp | 2 +- modules/text_server_adv/text_server_adv.cpp | 156 ++++++++++++-------- modules/text_server_fb/text_server_fb.cpp | 134 +++++++++++------ platform/linuxbsd/tts_linux.cpp | 8 +- 5 files changed, 192 insertions(+), 113 deletions(-) diff --git a/doc/classes/TextServer.xml b/doc/classes/TextServer.xml index 34137a18ef5..4fa9700f9cf 100644 --- a/doc/classes/TextServer.xml +++ b/doc/classes/TextServer.xml @@ -1739,8 +1739,9 @@ When [param chars_per_line] is greater than zero, line break boundaries are returned instead. [codeblock] var ts = TextServerManager.get_primary_interface() - print(ts.string_get_word_breaks("Godot Engine")) # Prints [0, 5, 6, 12] - print(ts.string_get_word_breaks("Godot Engine", "en", 5)) # Prints [0, 5, 6, 11, 11, 12] + print(ts.string_get_word_breaks("The Godot Engine, 4")) # Prints [0, 3, 4, 9, 10, 16, 18, 19], which corresponds to the following substrings: "The", "Godot", "Engine", "4" + print(ts.string_get_word_breaks("The Godot Engine, 4", "en", 5)) # Prints [0, 3, 4, 9, 10, 15, 15, 19], which corresponds to the following substrings: "The", "Godot", "Engin", "e, 4" + print(ts.string_get_word_breaks("The Godot Engine, 4", "en", 10)) # Prints [0, 9, 10, 19], which corresponds to the following substrings: "The Godot", "Engine, 4" [/codeblock] diff --git a/editor/debugger/script_editor_debugger.cpp b/editor/debugger/script_editor_debugger.cpp index b062b20000a..5e96daf69c8 100644 --- a/editor/debugger/script_editor_debugger.cpp +++ b/editor/debugger/script_editor_debugger.cpp @@ -840,7 +840,7 @@ void ScriptEditorDebugger::_set_reason_text(const String &p_reason, MessageType for (int i = 0; i < boundaries.size(); i += 2) { const int start = boundaries[i]; const int end = boundaries[i + 1]; - lines.append(p_reason.substr(start, end - start + 1)); + lines.append(p_reason.substr(start, end - start)); } reason->set_tooltip_text(String("\n").join(lines)); diff --git a/modules/text_server_adv/text_server_adv.cpp b/modules/text_server_adv/text_server_adv.cpp index 33ba2da761c..0c87199635e 100644 --- a/modules/text_server_adv/text_server_adv.cpp +++ b/modules/text_server_adv/text_server_adv.cpp @@ -7048,10 +7048,10 @@ PackedInt32Array TextServerAdvanced::_string_get_word_breaks(const String &p_str HashSet breaks; UErrorCode err = U_ZERO_ERROR; - UBreakIterator *bi = ubrk_open(UBRK_LINE, lang.ascii().get_data(), (const UChar *)utf16.get_data(), utf16.length(), &err); + UBreakIterator *bi = ubrk_open(UBRK_WORD, lang.ascii().get_data(), (const UChar *)utf16.get_data(), utf16.length(), &err); if (U_SUCCESS(err)) { while (ubrk_next(bi) != UBRK_DONE) { - int pos = _convert_pos(p_string, utf16, ubrk_current(bi)) - 1; + int pos = _convert_pos(p_string, utf16, ubrk_current(bi)); if (pos != p_string.length() - 1) { breaks.insert(pos); } @@ -7061,79 +7061,111 @@ PackedInt32Array TextServerAdvanced::_string_get_word_breaks(const String &p_str PackedInt32Array ret; - int line_start = 0; - int line_end = 0; // End of last word on current line. - int word_start = 0; // -1 if no word encountered. Leading spaces are part of a word. - int word_length = 0; + if (p_chars_per_line > 0) { + int line_start = 0; + int last_break = -1; + int line_length = 0; - for (int i = 0; i < p_string.length(); i++) { - const char32_t c = p_string[i]; + for (int i = 0; i < p_string.length(); i++) { + const char32_t c = p_string[i]; - if (is_linebreak(c)) { - // Force newline. - ret.push_back(line_start); - ret.push_back(i); - line_start = i + 1; - line_end = line_start; - word_start = line_start; - word_length = 0; - } else if (c == 0xfffc) { - continue; - } else if ((u_ispunct(c) && c != 0x005F) || is_underscore(c) || c == '\t' || is_whitespace(c)) { - // A whitespace ends current word. - if (word_length > 0) { - line_end = i - 1; - word_start = -1; - word_length = 0; - } - } else if (breaks.has(i)) { - // End current word, no space. - if (word_length > 0) { - line_end = i; - word_start = i + 1; - word_length = 0; - } - if (p_chars_per_line <= 0) { - ret.push_back(line_start); - ret.push_back(line_end + 1); - line_start = word_start; - line_end = line_start; - } - } else { - if (word_start == -1) { - word_start = i; - if (p_chars_per_line <= 0) { + bool is_lb = is_linebreak(c); + bool is_ws = is_whitespace(c); + bool is_p = (u_ispunct(c) && c != 0x005F) || is_underscore(c) || c == '\t' || c == 0xfffc; + + if (is_lb) { + if (line_length > 0) { ret.push_back(line_start); - ret.push_back(line_end + 1); - line_start = word_start; - line_end = line_start; + ret.push_back(i); } + line_start = i; + line_length = 0; + last_break = -1; + continue; + } else if (breaks.has(i) || is_ws || is_p) { + last_break = i; } - word_length += 1; - if (p_chars_per_line > 0) { - if (word_length > p_chars_per_line) { - // Word too long: wrap before current character. + if (line_length == p_chars_per_line) { + if (last_break != -1) { + int last_break_w_spaces = last_break; + while (last_break > line_start && is_whitespace(p_string[last_break - 1])) { + last_break--; + } + if (line_start != last_break) { + ret.push_back(line_start); + ret.push_back(last_break); + } + while (last_break_w_spaces < p_string.length() && is_whitespace(p_string[last_break_w_spaces])) { + last_break_w_spaces++; + } + line_start = last_break_w_spaces; + if (last_break_w_spaces < i) { + line_length = i - last_break_w_spaces; + } else { + i = last_break_w_spaces; + line_length = 0; + } + } else { ret.push_back(line_start); ret.push_back(i); line_start = i; - line_end = i; - word_start = i; - word_length = 1; - } else if (i - line_start + 1 > p_chars_per_line) { - // Line too long: wrap after the last word. - ret.push_back(line_start); - ret.push_back(line_end + 1); - line_start = word_start; - line_end = line_start; + line_length = 0; } + last_break = -1; } + line_length++; + } + if (line_length > 0) { + ret.push_back(line_start); + ret.push_back(p_string.length()); + } + } else { + int word_start = 0; // -1 if no word encountered. Leading spaces are part of a word. + int word_length = 0; + + for (int i = 0; i < p_string.length(); i++) { + const char32_t c = p_string[i]; + + bool is_lb = is_linebreak(c); + bool is_ws = is_whitespace(c); + bool is_p = (u_ispunct(c) && c != 0x005F) || is_underscore(c) || c == '\t' || c == 0xfffc; + + if (word_start == -1) { + if (!is_lb && !is_ws && !is_p) { + word_start = i; + } + continue; + } + + if (is_lb) { + if (word_start != -1 && word_length > 0) { + ret.push_back(word_start); + ret.push_back(i); + } + word_start = -1; + word_length = 0; + } else if (breaks.has(i) || is_ws || is_p) { + if (word_start != -1 && word_length > 0) { + ret.push_back(word_start); + ret.push_back(i); + } + if (is_ws || is_p) { + word_start = -1; + } else { + word_start = i; + } + word_length = 0; + } + + word_length++; + } + if (word_start != -1 && word_length > 0) { + ret.push_back(word_start); + ret.push_back(p_string.length()); } } - if (line_start < p_string.length()) { - ret.push_back(line_start); - ret.push_back(p_string.length()); - } + return ret; } diff --git a/modules/text_server_fb/text_server_fb.cpp b/modules/text_server_fb/text_server_fb.cpp index 697c3366c5f..6cf6b236edb 100644 --- a/modules/text_server_fb/text_server_fb.cpp +++ b/modules/text_server_fb/text_server_fb.cpp @@ -4492,65 +4492,105 @@ String TextServerFallback::_string_to_title(const String &p_string, const String PackedInt32Array TextServerFallback::_string_get_word_breaks(const String &p_string, const String &p_language, int64_t p_chars_per_line) const { PackedInt32Array ret; - int line_start = 0; - int line_end = 0; // End of last word on current line. - int word_start = 0; // -1 if no word encountered. Leading spaces are part of a word. - int word_length = 0; + if (p_chars_per_line > 0) { + int line_start = 0; + int last_break = -1; + int line_length = 0; - for (int i = 0; i < p_string.length(); i++) { - const char32_t c = p_string[i]; + for (int i = 0; i < p_string.length(); i++) { + const char32_t c = p_string[i]; - if (is_linebreak(c)) { - // Force newline. - ret.push_back(line_start); - ret.push_back(i); - line_start = i + 1; - line_end = line_start; - word_start = line_start; - word_length = 0; - } else if (c == 0xfffc) { - continue; - } else if ((is_punct(c) && c != 0x005F) || is_underscore(c) || c == '\t' || is_whitespace(c)) { - // A whitespace ends current word. - if (word_length > 0) { - line_end = i - 1; - word_start = -1; - word_length = 0; - } - } else { - if (word_start == -1) { - word_start = i; - if (p_chars_per_line <= 0) { + bool is_lb = is_linebreak(c); + bool is_ws = is_whitespace(c); + bool is_p = (is_punct(c) && c != 0x005F) || is_underscore(c) || c == '\t' || c == 0xfffc; + + if (is_lb) { + if (line_length > 0) { ret.push_back(line_start); - ret.push_back(line_end + 1); - line_start = word_start; - line_end = line_start; + ret.push_back(i); } + line_start = i; + line_length = 0; + last_break = -1; + continue; + } else if (is_ws || is_p) { + last_break = i; } - word_length += 1; - if (p_chars_per_line > 0) { - if (word_length > p_chars_per_line) { - // Word too long: wrap before current character. + if (line_length == p_chars_per_line) { + if (last_break != -1) { + int last_break_w_spaces = last_break; + while (last_break > line_start && is_whitespace(p_string[last_break - 1])) { + last_break--; + } + if (line_start != last_break) { + ret.push_back(line_start); + ret.push_back(last_break); + } + while (last_break_w_spaces < p_string.length() && is_whitespace(p_string[last_break_w_spaces])) { + last_break_w_spaces++; + } + line_start = last_break_w_spaces; + if (last_break_w_spaces < i) { + line_length = i - last_break_w_spaces; + } else { + i = last_break_w_spaces; + line_length = 0; + } + } else { ret.push_back(line_start); ret.push_back(i); line_start = i; - line_end = i; - word_start = i; - word_length = 1; - } else if (i - line_start + 1 > p_chars_per_line) { - // Line too long: wrap after the last word. - ret.push_back(line_start); - ret.push_back(line_end + 1); - line_start = word_start; - line_end = line_start; + line_length = 0; } + last_break = -1; } + line_length++; + } + if (line_length > 0) { + ret.push_back(line_start); + ret.push_back(p_string.length()); + } + } else { + int word_start = 0; // -1 if no word encountered. Leading spaces are part of a word. + int word_length = 0; + + for (int i = 0; i < p_string.length(); i++) { + const char32_t c = p_string[i]; + + bool is_lb = is_linebreak(c); + bool is_ws = is_whitespace(c); + bool is_p = (is_punct(c) && c != 0x005F) || is_underscore(c) || c == '\t' || c == 0xfffc; + + if (word_start == -1) { + if (!is_lb && !is_ws && !is_p) { + word_start = i; + } + continue; + } + + if (is_lb) { + if (word_start != -1 && word_length > 0) { + ret.push_back(word_start); + ret.push_back(i); + } + word_start = -1; + word_length = 0; + } else if (is_ws || is_p) { + if (word_start != -1 && word_length > 0) { + ret.push_back(word_start); + ret.push_back(i); + } + word_start = -1; + word_length = 0; + } + + word_length++; + } + if (word_start != -1 && word_length > 0) { + ret.push_back(word_start); + ret.push_back(p_string.length()); } - } - if (line_start < p_string.length()) { - ret.push_back(line_start); - ret.push_back(p_string.length()); } return ret; } diff --git a/platform/linuxbsd/tts_linux.cpp b/platform/linuxbsd/tts_linux.cpp index 46291bb4dad..6c1f49f0466 100644 --- a/platform/linuxbsd/tts_linux.cpp +++ b/platform/linuxbsd/tts_linux.cpp @@ -149,12 +149,18 @@ void TTS_Linux::_speech_event(int p_msg_id, int p_type) { } PackedInt32Array breaks = TS->string_get_word_breaks(message.text, language); + int prev_end = -1; for (int i = 0; i < breaks.size(); i += 2) { const int start = breaks[i]; const int end = breaks[i + 1]; - text += message.text.substr(start, end - start + 1); + if (prev_end != -1 && prev_end != start) { + text += message.text.substr(prev_end, start - prev_end); + } + text += message.text.substr(start, end - start); text += ""; + prev_end = end; } + spd_set_synthesis_voice(synth, message.voice.utf8().get_data()); spd_set_volume(synth, message.volume * 2 - 100); spd_set_voice_pitch(synth, (message.pitch - 1) * 100);