From 2dd409fd70fcd2d61138419fb4db763486bb3a80 Mon Sep 17 00:00:00 2001 From: Dan Nicholson Date: Fri, 1 Nov 2024 13:49:51 -0600 Subject: [PATCH] Improve locale comparison Since 9456454109, country-specific locales are always preferred over country-less locales even when the OS locale country doesn't match. For example, running the Godot editor with locale es_ES will result in the es_AR locale being chosen even though the es locale would be better. The change happened because the scores of the es_AR locale and the es locale are the same when compared to es_ES. Change this by parsing locale strings into a Locale structure and decreasing the score when script, country or variant are set in both locales but do not match. For the es_ES case, this causes the es_AR score to be decreased since the countries don't match. On the other hand, the score of the es locale is not decreased since it doesn't specify a country. --- core/string/translation_server.cpp | 137 +++++++++++--------- core/string/translation_server.h | 18 +++ tests/core/string/test_translation_server.h | 58 ++++++++- 3 files changed, 148 insertions(+), 65 deletions(-) diff --git a/core/string/translation_server.cpp b/core/string/translation_server.cpp index 92b473b61f0..4f09360ba89 100644 --- a/core/string/translation_server.cpp +++ b/core/string/translation_server.cpp @@ -118,36 +118,45 @@ void TranslationServer::init_locale_info() { } } -String TranslationServer::standardize_locale(const String &p_locale) const { - return _standardize_locale(p_locale, false); +TranslationServer::Locale::operator String() const { + String out = language; + if (!script.is_empty()) { + out = out + "_" + script; + } + if (!country.is_empty()) { + out = out + "_" + country; + } + if (!variant.is_empty()) { + out = out + "_" + variant; + } + return out; } -String TranslationServer::_standardize_locale(const String &p_locale, bool p_add_defaults) const { +TranslationServer::Locale::Locale(const TranslationServer &p_server, const String &p_locale, bool 
p_add_defaults) { // Replaces '-' with '_' for macOS style locales. String univ_locale = p_locale.replace("-", "_"); // Extract locale elements. - String lang_name, script_name, country_name, variant_name; Vector locale_elements = univ_locale.get_slice("@", 0).split("_"); - lang_name = locale_elements[0]; + language = locale_elements[0]; if (locale_elements.size() >= 2) { if (locale_elements[1].length() == 4 && is_ascii_upper_case(locale_elements[1][0]) && is_ascii_lower_case(locale_elements[1][1]) && is_ascii_lower_case(locale_elements[1][2]) && is_ascii_lower_case(locale_elements[1][3])) { - script_name = locale_elements[1]; + script = locale_elements[1]; } if (locale_elements[1].length() == 2 && is_ascii_upper_case(locale_elements[1][0]) && is_ascii_upper_case(locale_elements[1][1])) { - country_name = locale_elements[1]; + country = locale_elements[1]; } } if (locale_elements.size() >= 3) { if (locale_elements[2].length() == 2 && is_ascii_upper_case(locale_elements[2][0]) && is_ascii_upper_case(locale_elements[2][1])) { - country_name = locale_elements[2]; - } else if (variant_map.has(locale_elements[2].to_lower()) && variant_map[locale_elements[2].to_lower()] == lang_name) { - variant_name = locale_elements[2].to_lower(); + country = locale_elements[2]; + } else if (p_server.variant_map.has(locale_elements[2].to_lower()) && p_server.variant_map[locale_elements[2].to_lower()] == language) { + variant = locale_elements[2].to_lower(); } } if (locale_elements.size() >= 4) { - if (variant_map.has(locale_elements[3].to_lower()) && variant_map[locale_elements[3].to_lower()] == lang_name) { - variant_name = locale_elements[3].to_lower(); + if (p_server.variant_map.has(locale_elements[3].to_lower()) && p_server.variant_map[locale_elements[3].to_lower()] == language) { + variant = locale_elements[3].to_lower(); } } @@ -155,71 +164,62 @@ String TranslationServer::_standardize_locale(const String &p_locale, bool p_add Vector script_extra = univ_locale.get_slice("@", 
1).split(";"); for (int i = 0; i < script_extra.size(); i++) { if (script_extra[i].to_lower() == "cyrillic") { - script_name = "Cyrl"; + script = "Cyrl"; break; } else if (script_extra[i].to_lower() == "latin") { - script_name = "Latn"; + script = "Latn"; break; } else if (script_extra[i].to_lower() == "devanagari") { - script_name = "Deva"; + script = "Deva"; break; - } else if (variant_map.has(script_extra[i].to_lower()) && variant_map[script_extra[i].to_lower()] == lang_name) { - variant_name = script_extra[i].to_lower(); + } else if (p_server.variant_map.has(script_extra[i].to_lower()) && p_server.variant_map[script_extra[i].to_lower()] == language) { + variant = script_extra[i].to_lower(); } } // Handles known non-ISO language names used e.g. on Windows. - if (locale_rename_map.has(lang_name)) { - lang_name = locale_rename_map[lang_name]; + if (p_server.locale_rename_map.has(language)) { + language = p_server.locale_rename_map[language]; } // Handle country renames. - if (country_rename_map.has(country_name)) { - country_name = country_rename_map[country_name]; + if (p_server.country_rename_map.has(country)) { + country = p_server.country_rename_map[country]; } // Remove unsupported script codes. - if (!script_map.has(script_name)) { - script_name = ""; + if (!p_server.script_map.has(script)) { + script = ""; } // Add script code base on language and country codes for some ambiguous cases. 
if (p_add_defaults) { - if (script_name.is_empty()) { - for (int i = 0; i < locale_script_info.size(); i++) { - const LocaleScriptInfo &info = locale_script_info[i]; - if (info.name == lang_name) { - if (country_name.is_empty() || info.supported_countries.has(country_name)) { - script_name = info.script; + if (script.is_empty()) { + for (int i = 0; i < p_server.locale_script_info.size(); i++) { + const LocaleScriptInfo &info = p_server.locale_script_info[i]; + if (info.name == language) { + if (country.is_empty() || info.supported_countries.has(country)) { + script = info.script; break; } } } } - if (!script_name.is_empty() && country_name.is_empty()) { + if (!script.is_empty() && country.is_empty()) { // Add conntry code based on script for some ambiguous cases. - for (int i = 0; i < locale_script_info.size(); i++) { - const LocaleScriptInfo &info = locale_script_info[i]; - if (info.name == lang_name && info.script == script_name) { - country_name = info.default_country; + for (int i = 0; i < p_server.locale_script_info.size(); i++) { + const LocaleScriptInfo &info = p_server.locale_script_info[i]; + if (info.name == language && info.script == script) { + country = info.default_country; break; } } } } +} - // Combine results. 
- String out = lang_name; - if (!script_name.is_empty()) { - out = out + "_" + script_name; - } - if (!country_name.is_empty()) { - out = out + "_" + country_name; - } - if (!variant_name.is_empty()) { - out = out + "_" + variant_name; - } - return out; +String TranslationServer::standardize_locale(const String &p_locale) const { + return Locale(*this, p_locale, false).operator String(); } int TranslationServer::compare_locales(const String &p_locale_a, const String &p_locale_b) const { @@ -234,8 +234,8 @@ int TranslationServer::compare_locales(const String &p_locale_a, const String &p return *cached_result; } - String locale_a = _standardize_locale(p_locale_a, true); - String locale_b = _standardize_locale(p_locale_b, true); + Locale locale_a = Locale(*this, p_locale_a, true); + Locale locale_b = Locale(*this, p_locale_b, true); if (locale_a == locale_b) { // Exact match. @@ -243,26 +243,41 @@ int TranslationServer::compare_locales(const String &p_locale_a, const String &p return 10; } - Vector locale_a_elements = locale_a.split("_"); - Vector locale_b_elements = locale_b.split("_"); - if (locale_a_elements[0] != locale_b_elements[0]) { + if (locale_a.language != locale_b.language) { // No match. locale_compare_cache.insert(cache_key, 0); return 0; } - // Matching language, both locales have extra parts. - // Return number of matching elements. - int matching_elements = 1; - for (int i = 1; i < locale_a_elements.size(); i++) { - for (int j = 1; j < locale_b_elements.size(); j++) { - if (locale_a_elements[i] == locale_b_elements[j]) { - matching_elements++; - } + // Matching language, both locales have extra parts. Compare the + // remaining elements. If both elements are non-empty, check the + // match to increase or decrease the score. If either element or + // both are empty, leave the score as is. 
+ int score = 5; + if (!locale_a.script.is_empty() && !locale_b.script.is_empty()) { + if (locale_a.script == locale_b.script) { + score++; + } else { + score--; } } - locale_compare_cache.insert(cache_key, matching_elements); - return matching_elements; + if (!locale_a.country.is_empty() && !locale_b.country.is_empty()) { + if (locale_a.country == locale_b.country) { + score++; + } else { + score--; + } + } + if (!locale_a.variant.is_empty() && !locale_b.variant.is_empty()) { + if (locale_a.variant == locale_b.variant) { + score++; + } else { + score--; + } + } + + locale_compare_cache.insert(cache_key, score); + return score; } String TranslationServer::get_locale_name(const String &p_locale) const { diff --git a/core/string/translation_server.h b/core/string/translation_server.h index 2438349a69b..fac41035ae1 100644 --- a/core/string/translation_server.h +++ b/core/string/translation_server.h @@ -64,6 +64,24 @@ class TranslationServer : public Object { }; static Vector locale_script_info; + struct Locale { + String language; + String script; + String country; + String variant; + + bool operator==(const Locale &p_locale) const { + return (p_locale.language == language) && + (p_locale.script == script) && + (p_locale.country == country) && + (p_locale.variant == variant); + } + + operator String() const; + + Locale(const TranslationServer &p_server, const String &p_locale, bool p_add_defaults); + }; + static HashMap language_map; static HashMap script_map; static HashMap locale_rename_map; diff --git a/tests/core/string/test_translation_server.h b/tests/core/string/test_translation_server.h index ac1599f2e81..57fdf21fa6f 100644 --- a/tests/core/string/test_translation_server.h +++ b/tests/core/string/test_translation_server.h @@ -110,18 +110,50 @@ TEST_CASE("[TranslationServer] Comparing locales") { locale_a = "sr-Latn-CS"; locale_b = "sr-Latn-RS"; - // Two elements from locales match. + // Script matches (+1) but country doesn't (-1). 
res = ts->compare_locales(locale_a, locale_b); - CHECK(res == 2); + CHECK(res == 5); locale_a = "uz-Cyrl-UZ"; locale_b = "uz-Latn-UZ"; - // Two elements match, but they are not sequentual. + // Country matches (+1) but script doesn't (-1). res = ts->compare_locales(locale_a, locale_b); - CHECK(res == 2); + CHECK(res == 5); + + locale_a = "aa-Latn-ER"; + locale_b = "aa-Latn-ER-saaho"; + + // Script and country match (+2) with variant on one locale (+0). + res = ts->compare_locales(locale_a, locale_b); + + CHECK(res == 7); + + locale_a = "uz-Cyrl-UZ"; + locale_b = "uz-Latn-KG"; + + // Both script and country mismatched (-2). + res = ts->compare_locales(locale_a, locale_b); + + CHECK(res == 3); + + locale_a = "es-ES"; + locale_b = "es-AR"; + + // Mismatched country (-1). + res = ts->compare_locales(locale_a, locale_b); + + CHECK(res == 4); + + locale_a = "es"; + locale_b = "es-AR"; + + // No country for one locale (+0). + res = ts->compare_locales(locale_a, locale_b); + + CHECK(res == 5); locale_a = "es-EC"; locale_b = "fr-LU"; @@ -130,6 +162,24 @@ TEST_CASE("[TranslationServer] Comparing locales") { res = ts->compare_locales(locale_a, locale_b); CHECK(res == 0); + + locale_a = "zh-HK"; + locale_b = "zh"; + + // In full standardization, zh-HK becomes zh_Hant_HK and zh becomes + // zh_Hans_CN. Both script and country mismatch (-2). + res = ts->compare_locales(locale_a, locale_b); + + CHECK(res == 3); + + locale_a = "zh-CN"; + locale_b = "zh"; + + // In full standardization, zh and zh-CN both become zh_Hans_CN for an + // exact match. + res = ts->compare_locales(locale_a, locale_b); + + CHECK(res == 10); } } // namespace TestTranslationServer