summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDirk-Jan C. Binnema <djcb@djcbsoftware.nl>2025-10-25 16:53:57 +0300
committerDirk-Jan C. Binnema <djcb@djcbsoftware.nl>2025-10-25 16:55:43 +0300
commit825fed4be8c5e68b25a7b04cfccff46cdd86e4f2 (patch)
tree18041e0006e9c07b17b018d4ba776aeaee7abf0e
parent5a2b7290a5866f44d69f3aac9e6364ddb364adc8 (diff)
utils: add utf8_clean helper
To morph text into valid utf8 if it isn't already.
-rw-r--r--lib/utils/mu-utils.hh20
-rw-r--r--lib/utils/tests/test-utils.cc23
2 files changed, 38 insertions, 5 deletions
diff --git a/lib/utils/mu-utils.hh b/lib/utils/mu-utils.hh
index 32bb0dd..11c0d9a 100644
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@@ -165,6 +165,24 @@ static inline bool contains_unbroken_script(const std::string& str) {
}
/**
+ * If the string is already valid utf8, return it;
+ * otherwise, return a valid utf8 version.
+ *
+ * @param str some string
+ *
+ * @return a utf8-string
+ */
+static inline std::string utf8_clean(std::string&& str) {
+ if (!g_utf8_validate(str.c_str(), static_cast<gssize>(str.length()), {})) { // {} => null 'end' out-pointer; only the yes/no result is used
+ gchar* clean{g_utf8_make_valid(
+ str.c_str(), static_cast<gssize>(str.length()))};
+ str = clean; // copy the repaired C string into str before the buffer is freed
+ g_free(clean); // release the buffer returned by g_utf8_make_valid
+ }
+ return std::move(str); // str is a named rvalue-reference parameter: move it out explicitly
+}
+
+/**
* Flatten a string -- down-case and fold diacritics.
*
* @param str a string
@@ -172,7 +190,7 @@ static inline bool contains_unbroken_script(const std::string& str) {
* @return a flattened string
*/
std::string utf8_flatten(const char* str);
-inline std::string
+static inline std::string
utf8_flatten(const std::string& s) {
return utf8_flatten(s.c_str());
}
diff --git a/lib/utils/tests/test-utils.cc b/lib/utils/tests/test-utils.cc
index fe0d075..0551bdb 100644
--- a/lib/utils/tests/test-utils.cc
+++ b/lib/utils/tests/test-utils.cc
@@ -1,5 +1,5 @@
/*
-** Copyright (C) 2017-2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+** Copyright (C) 2017-2025 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
@@ -150,6 +150,22 @@ test_parse_size()
}
static void
+test_utf8_clean() // checks utf8_clean on an already-valid string and on a string with an invalid byte
+{
+ assert_equal(utf8_clean("James Holden"), "James Holden"); // valid input must come back unchanged
+
+ const uint8_t invalid_bytes[] ={ 'a' , 0xff, 'c', '\0'}; // 0xff makes the sequence invalid utf8 (verified below)
+ std::string invalid{reinterpret_cast<const char*>(invalid_bytes),
+ sizeof(invalid_bytes) - 1}; // length excludes the trailing '\0'
+
+ g_assert_false(g_utf8_validate(invalid.c_str(), invalid.length(), nullptr)); // sanity-check the fixture really is invalid
+
+ const auto valid = utf8_clean(std::move(invalid));
+ g_assert_true(g_utf8_validate(valid.c_str(), valid.length(), {}));
+ assert_equal(valid, "a\357\277\275c"); // replacement char: octal 357 277 275 == bytes EF BF BD
+}
+
+static void
test_flatten()
{
CaseVec cases = {
@@ -159,7 +175,7 @@ test_flatten()
{"đodø", true, "dodo"},
// don't touch combining characters in CJK etc.
- {"スポンサーシップ募集",true, "スポンサーシップ募集"}
+ {"スポンサーシップ募集", true, "スポンサーシップ募集"}
};
test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); });
@@ -192,7 +208,6 @@ test_clean()
test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
}
-
static void
test_word_break()
{
@@ -206,7 +221,6 @@ test_word_break()
test_cases(cases, [](auto s, auto f) { return utf8_wordbreak(s); });
}
-
static void
test_format()
{
@@ -327,6 +341,7 @@ main(int argc, char* argv[])
g_test_add_func("/utils/date-basic", test_date_basic);
g_test_add_func("/utils/date-ymwdhMs", test_date_ymwdhMs);
g_test_add_func("/utils/parse-size", test_parse_size);
+ g_test_add_func("/utils/utf8-clean", test_utf8_clean);
g_test_add_func("/utils/flatten", test_flatten);
g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
g_test_add_func("/utils/clean", test_clean);