Improve locale comparison

Since 9456454109, country-specific locales are always preferred over
country-less locales even when the OS locale country doesn't match. For
example, running the Godot editor with locale es_ES will result in the
es_AR locale being chosen even though the es locale would be better.

The change happened because the scores of the es_AR locale and the es
locale are the same when compared to es_ES. Change this by parsing
locale strings into a Locale structure and decreasing the score when
script, country or variant are set in both but not matched. For the
es_ES case, this causes the es_AR score to be decreased since the
countries don't match. On the other hand, the es locale is not decreased
since it doesn't specify a country.
This commit is contained in:
Dan Nicholson 2024-11-01 13:49:51 -06:00
parent 1bffd6c73b
commit 2dd409fd70
3 changed files with 148 additions and 65 deletions

View File

@ -118,36 +118,45 @@ void TranslationServer::init_locale_info() {
}
}
String TranslationServer::standardize_locale(const String &p_locale) const {
return _standardize_locale(p_locale, false);
// Serializes the locale as "language[_script][_country][_variant]". Each of
// the optional components is appended, prefixed with "_", only when it is
// non-empty.
TranslationServer::Locale::operator String() const {
	String result = language;
	// Components are appended in this fixed order: script, country, variant.
	const String *optional_parts[] = { &script, &country, &variant };
	for (const String *part : optional_parts) {
		if (part->is_empty()) {
			continue;
		}
		result = result + "_" + *part;
	}
	return result;
}
String TranslationServer::_standardize_locale(const String &p_locale, bool p_add_defaults) const {
TranslationServer::Locale::Locale(const TranslationServer &p_server, const String &p_locale, bool p_add_defaults) {
// Replaces '-' with '_' for macOS style locales.
String univ_locale = p_locale.replace("-", "_");
// Extract locale elements.
String lang_name, script_name, country_name, variant_name;
Vector<String> locale_elements = univ_locale.get_slice("@", 0).split("_");
lang_name = locale_elements[0];
language = locale_elements[0];
if (locale_elements.size() >= 2) {
if (locale_elements[1].length() == 4 && is_ascii_upper_case(locale_elements[1][0]) && is_ascii_lower_case(locale_elements[1][1]) && is_ascii_lower_case(locale_elements[1][2]) && is_ascii_lower_case(locale_elements[1][3])) {
script_name = locale_elements[1];
script = locale_elements[1];
}
if (locale_elements[1].length() == 2 && is_ascii_upper_case(locale_elements[1][0]) && is_ascii_upper_case(locale_elements[1][1])) {
country_name = locale_elements[1];
country = locale_elements[1];
}
}
if (locale_elements.size() >= 3) {
if (locale_elements[2].length() == 2 && is_ascii_upper_case(locale_elements[2][0]) && is_ascii_upper_case(locale_elements[2][1])) {
country_name = locale_elements[2];
} else if (variant_map.has(locale_elements[2].to_lower()) && variant_map[locale_elements[2].to_lower()] == lang_name) {
variant_name = locale_elements[2].to_lower();
country = locale_elements[2];
} else if (p_server.variant_map.has(locale_elements[2].to_lower()) && p_server.variant_map[locale_elements[2].to_lower()] == language) {
variant = locale_elements[2].to_lower();
}
}
if (locale_elements.size() >= 4) {
if (variant_map.has(locale_elements[3].to_lower()) && variant_map[locale_elements[3].to_lower()] == lang_name) {
variant_name = locale_elements[3].to_lower();
if (p_server.variant_map.has(locale_elements[3].to_lower()) && p_server.variant_map[locale_elements[3].to_lower()] == language) {
variant = locale_elements[3].to_lower();
}
}
@ -155,71 +164,62 @@ String TranslationServer::_standardize_locale(const String &p_locale, bool p_add
Vector<String> script_extra = univ_locale.get_slice("@", 1).split(";");
for (int i = 0; i < script_extra.size(); i++) {
if (script_extra[i].to_lower() == "cyrillic") {
script_name = "Cyrl";
script = "Cyrl";
break;
} else if (script_extra[i].to_lower() == "latin") {
script_name = "Latn";
script = "Latn";
break;
} else if (script_extra[i].to_lower() == "devanagari") {
script_name = "Deva";
script = "Deva";
break;
} else if (variant_map.has(script_extra[i].to_lower()) && variant_map[script_extra[i].to_lower()] == lang_name) {
variant_name = script_extra[i].to_lower();
} else if (p_server.variant_map.has(script_extra[i].to_lower()) && p_server.variant_map[script_extra[i].to_lower()] == language) {
variant = script_extra[i].to_lower();
}
}
// Handles known non-ISO language names used e.g. on Windows.
if (locale_rename_map.has(lang_name)) {
lang_name = locale_rename_map[lang_name];
if (p_server.locale_rename_map.has(language)) {
language = p_server.locale_rename_map[language];
}
// Handle country renames.
if (country_rename_map.has(country_name)) {
country_name = country_rename_map[country_name];
if (p_server.country_rename_map.has(country)) {
country = p_server.country_rename_map[country];
}
// Remove unsupported script codes.
if (!script_map.has(script_name)) {
script_name = "";
if (!p_server.script_map.has(script)) {
script = "";
}
// Add script code based on language and country codes for some ambiguous cases.
if (p_add_defaults) {
if (script_name.is_empty()) {
for (int i = 0; i < locale_script_info.size(); i++) {
const LocaleScriptInfo &info = locale_script_info[i];
if (info.name == lang_name) {
if (country_name.is_empty() || info.supported_countries.has(country_name)) {
script_name = info.script;
if (script.is_empty()) {
for (int i = 0; i < p_server.locale_script_info.size(); i++) {
const LocaleScriptInfo &info = p_server.locale_script_info[i];
if (info.name == language) {
if (country.is_empty() || info.supported_countries.has(country)) {
script = info.script;
break;
}
}
}
}
if (!script_name.is_empty() && country_name.is_empty()) {
if (!script.is_empty() && country.is_empty()) {
// Add country code based on script for some ambiguous cases.
for (int i = 0; i < locale_script_info.size(); i++) {
const LocaleScriptInfo &info = locale_script_info[i];
if (info.name == lang_name && info.script == script_name) {
country_name = info.default_country;
for (int i = 0; i < p_server.locale_script_info.size(); i++) {
const LocaleScriptInfo &info = p_server.locale_script_info[i];
if (info.name == language && info.script == script) {
country = info.default_country;
break;
}
}
}
}
}
// Combine results.
String out = lang_name;
if (!script_name.is_empty()) {
out = out + "_" + script_name;
}
if (!country_name.is_empty()) {
out = out + "_" + country_name;
}
if (!variant_name.is_empty()) {
out = out + "_" + variant_name;
}
return out;
// Parses `p_locale` into a Locale (with `p_add_defaults` disabled) and
// returns its string form.
String TranslationServer::standardize_locale(const String &p_locale) const {
	const Locale parsed(*this, p_locale, false);
	return parsed.operator String();
}
int TranslationServer::compare_locales(const String &p_locale_a, const String &p_locale_b) const {
@ -234,8 +234,8 @@ int TranslationServer::compare_locales(const String &p_locale_a, const String &p
return *cached_result;
}
String locale_a = _standardize_locale(p_locale_a, true);
String locale_b = _standardize_locale(p_locale_b, true);
Locale locale_a = Locale(*this, p_locale_a, true);
Locale locale_b = Locale(*this, p_locale_b, true);
if (locale_a == locale_b) {
// Exact match.
@ -243,26 +243,41 @@ int TranslationServer::compare_locales(const String &p_locale_a, const String &p
return 10;
}
Vector<String> locale_a_elements = locale_a.split("_");
Vector<String> locale_b_elements = locale_b.split("_");
if (locale_a_elements[0] != locale_b_elements[0]) {
if (locale_a.language != locale_b.language) {
// No match.
locale_compare_cache.insert(cache_key, 0);
return 0;
}
// Matching language, both locales have extra parts.
// Return number of matching elements.
int matching_elements = 1;
for (int i = 1; i < locale_a_elements.size(); i++) {
for (int j = 1; j < locale_b_elements.size(); j++) {
if (locale_a_elements[i] == locale_b_elements[j]) {
matching_elements++;
}
// Matching language, both locales have extra parts. Compare the
// remaining elements. If both elements are non-empty, check the
// match to increase or decrease the score. If either element or
// both are empty, leave the score as is.
int score = 5;
if (!locale_a.script.is_empty() && !locale_b.script.is_empty()) {
if (locale_a.script == locale_b.script) {
score++;
} else {
score--;
}
}
locale_compare_cache.insert(cache_key, matching_elements);
return matching_elements;
if (!locale_a.country.is_empty() && !locale_b.country.is_empty()) {
if (locale_a.country == locale_b.country) {
score++;
} else {
score--;
}
}
if (!locale_a.variant.is_empty() && !locale_b.variant.is_empty()) {
if (locale_a.variant == locale_b.variant) {
score++;
} else {
score--;
}
}
locale_compare_cache.insert(cache_key, score);
return score;
}
String TranslationServer::get_locale_name(const String &p_locale) const {

View File

@ -64,6 +64,24 @@ class TranslationServer : public Object {
};
static Vector<LocaleScriptInfo> locale_script_info;
// Components of a locale identifier, stored as separate strings. The
// comparison code treats an empty string as "component not set".
struct Locale {
	String language;
	String script;
	String country;
	String variant;

	// Two locales are equal only when all four components are equal.
	bool operator==(const Locale &p_locale) const {
		if (p_locale.language != language) {
			return false;
		}
		if (p_locale.script != script) {
			return false;
		}
		if (p_locale.country != country) {
			return false;
		}
		return p_locale.variant == variant;
	}

	// Joins the non-empty components with "_" (defined out of line).
	operator String() const;

	// Parses `p_locale` into its components using the lookup tables of
	// `p_server`; `p_add_defaults` enables filling in a default script and
	// country for some ambiguous cases.
	Locale(const TranslationServer &p_server, const String &p_locale, bool p_add_defaults);
};
static HashMap<String, String> language_map;
static HashMap<String, String> script_map;
static HashMap<String, String> locale_rename_map;

View File

@ -110,18 +110,50 @@ TEST_CASE("[TranslationServer] Comparing locales") {
locale_a = "sr-Latn-CS";
locale_b = "sr-Latn-RS";
// Two elements from locales match.
// Script matches (+1) but country doesn't (-1).
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 2);
CHECK(res == 5);
locale_a = "uz-Cyrl-UZ";
locale_b = "uz-Latn-UZ";
// Two elements match, but they are not sequentual.
// Country matches (+1) but script doesn't (-1).
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 2);
CHECK(res == 5);
locale_a = "aa-Latn-ER";
locale_b = "aa-Latn-ER-saaho";
// Script and country match (+2) with variant on one locale (+0).
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 7);
locale_a = "uz-Cyrl-UZ";
locale_b = "uz-Latn-KG";
// Both script and country mismatched (-2).
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 3);
locale_a = "es-ES";
locale_b = "es-AR";
// Mismatched country (-1).
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 4);
locale_a = "es";
locale_b = "es-AR";
// No country for one locale (+0).
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 5);
locale_a = "es-EC";
locale_b = "fr-LU";
@ -130,6 +162,24 @@ TEST_CASE("[TranslationServer] Comparing locales") {
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 0);
locale_a = "zh-HK";
locale_b = "zh";
// In full standardization, zh-HK becomes zh_Hant_HK and zh becomes
// zh_Hans_CN. Both script and country mismatch (-2).
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 3);
locale_a = "zh-CN";
locale_b = "zh";
// In full standardization, zh and zh-CN both become zh_Hans_CN for an
// exact match.
res = ts->compare_locales(locale_a, locale_b);
CHECK(res == 10);
}
} // namespace TestTranslationServer