Add Unicode support to String.to_*_case() methods

2024-11-21 19:42:43 +00:00 · 2024-02-22 12:18:45 +03:00 · 2024-02-22 12:18:45 +03:00 · c0aa88ae4f
commit c0aa88ae4f
parent 16d61427ca
4 changed files with 1387 additions and 45 deletions
--- a/core/string/char_range.inc
+++ b/core/string/char_range.inc
--- a/core/string/char_utils.h
+++ b/core/string/char_utils.h
@@ -35,24 +35,43 @@

 #include "char_range.inc"

+#define BSEARCH_CHAR_RANGE(m_array)                      \
+	int low = 0;                                         \
+	int high = sizeof(m_array) / sizeof(m_array[0]) - 1; \
+	int middle;                                          \
+                                                         \
+	while (low <= high) {                                \
+		middle = (low + high) / 2;                       \
+                                                         \
+		if (c < m_array[middle].start) {                 \
+			high = middle - 1;                           \
+		} else if (c > m_array[middle].end) {            \
+			low = middle + 1;                            \
+		} else {                                         \
+			return true;                                 \
+		}                                                \
+	}                                                    \
+                                                         \
+	return false
+
 static _FORCE_INLINE_ bool is_unicode_identifier_start(char32_t c) {
-	for (int i = 0; xid_start[i].start != 0; i++) {
-		if (c >= xid_start[i].start && c <= xid_start[i].end) {
-			return true;
-		}
-	}
-	return false;
+	BSEARCH_CHAR_RANGE(xid_start);
 }

 static _FORCE_INLINE_ bool is_unicode_identifier_continue(char32_t c) {
-	for (int i = 0; xid_continue[i].start != 0; i++) {
-		if (c >= xid_continue[i].start && c <= xid_continue[i].end) {
-			return true;
-		}
-	}
-	return false;
+	BSEARCH_CHAR_RANGE(xid_continue);
 }

+static _FORCE_INLINE_ bool is_unicode_upper_case(char32_t c) {
+	BSEARCH_CHAR_RANGE(uppercase_letter);
+}
+
+static _FORCE_INLINE_ bool is_unicode_lower_case(char32_t c) {
+	BSEARCH_CHAR_RANGE(lowercase_letter);
+}
+
+#undef BSEARCH_CHAR_RANGE
+
 static _FORCE_INLINE_ bool is_ascii_upper_case(char32_t c) {
 	return (c >= 'A' && c <= 'Z');
 }
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -1044,17 +1044,17 @@ String String::_camelcase_to_underscore() const {
 	int start_index = 0;

 	for (int i = 1; i < size(); i++) {
-		bool is_prev_upper = is_ascii_upper_case(cstr[i - 1]);
-		bool is_prev_lower = is_ascii_lower_case(cstr[i - 1]);
+		bool is_prev_upper = is_unicode_upper_case(cstr[i - 1]);
+		bool is_prev_lower = is_unicode_lower_case(cstr[i - 1]);
 		bool is_prev_digit = is_digit(cstr[i - 1]);

-		bool is_curr_upper = is_ascii_upper_case(cstr[i]);
-		bool is_curr_lower = is_ascii_lower_case(cstr[i]);
+		bool is_curr_upper = is_unicode_upper_case(cstr[i]);
+		bool is_curr_lower = is_unicode_lower_case(cstr[i]);
 		bool is_curr_digit = is_digit(cstr[i]);

 		bool is_next_lower = false;
 		if (i + 1 < size()) {
-			is_next_lower = is_ascii_lower_case(cstr[i + 1]);
+			is_next_lower = is_unicode_lower_case(cstr[i + 1]);
 		}

 		const bool cond_a = is_prev_lower && is_curr_upper; // aA
--- a/tests/core/string/test_string.h
+++ b/tests/core/string/test_string.h
@@ -1300,39 +1300,54 @@ TEST_CASE("[String] Capitalize against many strings") {
 	input = "snake_case_function( snake_case_arg )";
 	output = "Snake Case Function( Snake Case Arg )";
 	CHECK(input.capitalize() == output);
+
+	input = U"словоСлово_слово слово";
+	output = U"Слово Слово Слово Слово";
+	CHECK(input.capitalize() == output);
+
+	input = U"λέξηΛέξη_λέξη λέξη";
+	output = U"Λέξη Λέξη Λέξη Λέξη";
+	CHECK(input.capitalize() == output);
+
+	input = U"բառԲառ_բառ բառ";
+	output = U"Բառ Բառ Բառ Բառ";
+	CHECK(input.capitalize() == output);
 }

 struct StringCasesTestCase {
-	const char *input;
-	const char *camel_case;
-	const char *pascal_case;
-	const char *snake_case;
+	const char32_t *input;
+	const char32_t *camel_case;
+	const char32_t *pascal_case;
+	const char32_t *snake_case;
 };

 TEST_CASE("[String] Checking case conversion methods") {
 	StringCasesTestCase test_cases[] = {
 		/* clang-format off */
-		{ "2D",                "2d",              "2d",              "2d"                },
-		{ "2d",                "2d",              "2d",              "2d"                },
-		{ "2db",               "2Db",             "2Db",             "2_db"              },
-		{ "Vector3",           "vector3",         "Vector3",         "vector_3"          },
-		{ "sha256",            "sha256",          "Sha256",          "sha_256"           },
-		{ "Node2D",            "node2d",          "Node2d",          "node_2d"           },
-		{ "RichTextLabel",     "richTextLabel",   "RichTextLabel",   "rich_text_label"   },
-		{ "HTML5",             "html5",           "Html5",           "html_5"            },
-		{ "Node2DPosition",    "node2dPosition",  "Node2dPosition",  "node_2d_position"  },
-		{ "Number2Digits",     "number2Digits",   "Number2Digits",   "number_2_digits"   },
-		{ "get_property_list", "getPropertyList", "GetPropertyList", "get_property_list" },
-		{ "get_camera_2d",     "getCamera2d",     "GetCamera2d",     "get_camera_2d"     },
-		{ "_physics_process",  "physicsProcess",  "PhysicsProcess",  "_physics_process"  },
-		{ "bytes2var",         "bytes2Var",       "Bytes2Var",       "bytes_2_var"       },
-		{ "linear2db",         "linear2Db",       "Linear2Db",       "linear_2_db"       },
-		{ "sha256sum",         "sha256Sum",       "Sha256Sum",       "sha_256_sum"       },
-		{ "camelCase",         "camelCase",       "CamelCase",       "camel_case"        },
-		{ "PascalCase",        "pascalCase",      "PascalCase",      "pascal_case"       },
-		{ "snake_case",        "snakeCase",       "SnakeCase",       "snake_case"        },
-		{ "Test TEST test",    "testTestTest",    "TestTestTest",    "test_test_test"    },
-		{ nullptr,             nullptr,           nullptr,           nullptr             },
+		{ U"2D",                     U"2d",                   U"2d",                   U"2d"                      },
+		{ U"2d",                     U"2d",                   U"2d",                   U"2d"                      },
+		{ U"2db",                    U"2Db",                  U"2Db",                  U"2_db"                    },
+		{ U"Vector3",                U"vector3",              U"Vector3",              U"vector_3"                },
+		{ U"sha256",                 U"sha256",               U"Sha256",               U"sha_256"                 },
+		{ U"Node2D",                 U"node2d",               U"Node2d",               U"node_2d"                 },
+		{ U"RichTextLabel",          U"richTextLabel",        U"RichTextLabel",        U"rich_text_label"         },
+		{ U"HTML5",                  U"html5",                U"Html5",                U"html_5"                  },
+		{ U"Node2DPosition",         U"node2dPosition",       U"Node2dPosition",       U"node_2d_position"        },
+		{ U"Number2Digits",          U"number2Digits",        U"Number2Digits",        U"number_2_digits"         },
+		{ U"get_property_list",      U"getPropertyList",      U"GetPropertyList",      U"get_property_list"       },
+		{ U"get_camera_2d",          U"getCamera2d",          U"GetCamera2d",          U"get_camera_2d"           },
+		{ U"_physics_process",       U"physicsProcess",       U"PhysicsProcess",       U"_physics_process"        },
+		{ U"bytes2var",              U"bytes2Var",            U"Bytes2Var",            U"bytes_2_var"             },
+		{ U"linear2db",              U"linear2Db",            U"Linear2Db",            U"linear_2_db"             },
+		{ U"sha256sum",              U"sha256Sum",            U"Sha256Sum",            U"sha_256_sum"             },
+		{ U"camelCase",              U"camelCase",            U"CamelCase",            U"camel_case"              },
+		{ U"PascalCase",             U"pascalCase",           U"PascalCase",           U"pascal_case"             },
+		{ U"snake_case",             U"snakeCase",            U"SnakeCase",            U"snake_case"              },
+		{ U"Test TEST test",         U"testTestTest",         U"TestTestTest",         U"test_test_test"          },
+		{ U"словоСлово_слово слово", U"словоСловоСловоСлово", U"СловоСловоСловоСлово", U"слово_слово_слово_слово" },
+		{ U"λέξηΛέξη_λέξη λέξη",     U"λέξηΛέξηΛέξηΛέξη",     U"ΛέξηΛέξηΛέξηΛέξη",     U"λέξη_λέξη_λέξη_λέξη"     },
+		{ U"բառԲառ_բառ բառ",         U"բառԲառԲառԲառ",         U"ԲառԲառԲառԲառ",         U"բառ_բառ_բառ_բառ"         },
+		{ nullptr,                   nullptr,                 nullptr,                 nullptr                    },
 		/* clang-format on */
 	};