diff options
| author | Dirk-Jan C. Binnema <djcb@djcbsoftware.nl> | 2025-10-25 16:53:57 +0300 |
|---|---|---|
| committer | Dirk-Jan C. Binnema <djcb@djcbsoftware.nl> | 2025-10-25 16:55:43 +0300 |
| commit | 825fed4be8c5e68b25a7b04cfccff46cdd86e4f2 (patch) | |
| tree | 18041e0006e9c07b17b018d4ba776aeaee7abf0e | |
| parent | 5a2b7290a5866f44d69f3aac9e6364ddb364adc8 (diff) | |
utils: add utf8_clean helper
Converts text into valid UTF-8 if it isn't already.
| -rw-r--r-- | lib/utils/mu-utils.hh | 20 | ||||
| -rw-r--r-- | lib/utils/tests/test-utils.cc | 23 |
2 files changed, 38 insertions, 5 deletions
diff --git a/lib/utils/mu-utils.hh b/lib/utils/mu-utils.hh index 32bb0dd..11c0d9a 100644 --- a/lib/utils/mu-utils.hh +++ b/lib/utils/mu-utils.hh @@ -165,6 +165,24 @@ static inline bool contains_unbroken_script(const std::string& str) { } /** + * If the string is already valid utf8, return it + * otherwise, return a valid utf8 version + * + * @param str some string + * + * @return a utf8-string + */ +static inline std::string utf8_clean(std::string&& str) { + if (!g_utf8_validate(str.c_str(), static_cast<gssize>(str.length()), {})) { + gchar* clean{g_utf8_make_valid( + str.c_str(), static_cast<gssize>(str.length()))}; + str = clean; + g_free(clean); + } + return std::move(str); +} + +/** * Flatten a string -- down-case and fold diacritics. * * @param str a string @@ -172,7 +190,7 @@ static inline bool contains_unbroken_script(const std::string& str) { * @return a flattened string */ std::string utf8_flatten(const char* str); -inline std::string +static inline std::string utf8_flatten(const std::string& s) { return utf8_flatten(s.c_str()); } diff --git a/lib/utils/tests/test-utils.cc b/lib/utils/tests/test-utils.cc index fe0d075..0551bdb 100644 --- a/lib/utils/tests/test-utils.cc +++ b/lib/utils/tests/test-utils.cc @@ -1,5 +1,5 @@ /* -** Copyright (C) 2017-2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl> +** Copyright (C) 2017-2025 Dirk-Jan C. 
Binnema <djcb@djcbsoftware.nl> ** ** This library is free software; you can redistribute it and/or ** modify it under the terms of the GNU Lesser General Public License @@ -150,6 +150,22 @@ test_parse_size() } static void +test_utf8_clean() +{ + assert_equal(utf8_clean("James Holden"), "James Holden"); + + const uint8_t invalid_bytes[] ={ 'a' , 0xff, 'c', '\0'}; + std::string invalid{reinterpret_cast<const char*>(invalid_bytes), + sizeof(invalid_bytes) - 1}; + + g_assert_false(g_utf8_validate(invalid.c_str(), invalid.length(), nullptr)); + + const auto valid = utf8_clean(std::move(invalid)); + g_assert_true(g_utf8_validate(valid.c_str(), valid.length(), {})); + assert_equal(valid, "a\357\277\275c"); // replacement char +} + +static void test_flatten() { CaseVec cases = { @@ -159,7 +175,7 @@ test_flatten() {"đodø", true, "dodo"}, // don't touch combining characters in CJK etc. - {"スポンサーシップ募集",true, "スポンサーシップ募集"} + {"スポンサーシップ募集", true, "スポンサーシップ募集"} }; test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); }); @@ -192,7 +208,6 @@ test_clean() test_cases(cases, [](auto s, auto f) { return utf8_clean(s); }); } - static void test_word_break() { @@ -206,7 +221,6 @@ test_word_break() test_cases(cases, [](auto s, auto f) { return utf8_wordbreak(s); }); } - static void test_format() { @@ -327,6 +341,7 @@ main(int argc, char* argv[]) g_test_add_func("/utils/date-basic", test_date_basic); g_test_add_func("/utils/date-ymwdhMs", test_date_ymwdhMs); g_test_add_func("/utils/parse-size", test_parse_size); + g_test_add_func("/utils/utf8-clean", test_utf8_clean); g_test_add_func("/utils/flatten", test_flatten); g_test_add_func("/utils/remove-ctrl", test_remove_ctrl); g_test_add_func("/utils/clean", test_clean); |
