utils: add utf8_clean helper

Add a helper that converts text into valid UTF-8 if it isn't already.
author: Dirk-Jan C. Binnema <djcb@djcbsoftware.nl> 2025-10-25 16:53:57 +0300
committer: Dirk-Jan C. Binnema <djcb@djcbsoftware.nl> 2025-10-25 16:55:43 +0300
commit: 825fed4be8c5e68b25a7b04cfccff46cdd86e4f2 (patch)
tree: 18041e0006e9c07b17b018d4ba776aeaee7abf0e
parent: 5a2b7290a5866f44d69f3aac9e6364ddb364adc8 (diff)
2 files changed, 38 insertions, 5 deletions
diff --git a/lib/utils/mu-utils.hh b/lib/utils/mu-utils.hh
index 32bb0dd..11c0d9a 100644
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@@ -165,6 +165,24 @@ static inline bool contains_unbroken_script(const std::string& str) {
 }
 
 /**
+ * Return str as-is when it already validates as UTF-8 (g_utf8_validate);
+ * otherwise return the output of g_utf8_make_valid() for it.
+ *
+ * @param str some string; it is consumed (moved into the return value)
+ *
+ * @return a valid UTF-8 string
+ */
+static inline std::string utf8_clean(std::string&& str) {
+	if (!g_utf8_validate(str.c_str(), static_cast<gssize>(str.length()), {})) {
+		gchar* clean{g_utf8_make_valid(
+				str.c_str(), static_cast<gssize>(str.length()))};
+		str = clean; // copy the NUL-terminated result back into str
+		g_free(clean); // release the buffer returned by g_utf8_make_valid
+	}
+	return std::move(str); // named rvalue-ref param: move it out, don't copy
+}
+
+/**
  * Flatten a string -- down-case and fold diacritics.
  *
  * @param str a string
@@ -172,7 +190,7 @@ static inline bool contains_unbroken_script(const std::string& str) {
  * @return a flattened string
  */
 std::string utf8_flatten(const char* str);
-inline std::string
+static inline std::string
 utf8_flatten(const std::string& s) {
 	return utf8_flatten(s.c_str());
 }
diff --git a/lib/utils/tests/test-utils.cc b/lib/utils/tests/test-utils.cc
index fe0d075..0551bdb 100644
--- a/lib/utils/tests/test-utils.cc
+++ b/lib/utils/tests/test-utils.cc
@@ -1,5 +1,5 @@
 /*
-** Copyright (C) 2017-2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
+** Copyright (C) 2017-2025 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
 **
 **  This library is free software; you can redistribute it and/or
 **  modify it under the terms of the GNU Lesser General Public License
@@ -150,6 +150,22 @@ test_parse_size()
 }
 
 static void
+test_utf8_clean() // valid input is kept; invalid bytes get replaced
+{
+	assert_equal(utf8_clean("James Holden"), "James Holden"); // already valid
+	// 0xff can never occur in UTF-8, so "a\xffc" must fail validation
+	const uint8_t invalid_bytes[] ={ 'a' , 0xff, 'c', '\0'};
+	std::string invalid{reinterpret_cast<const char*>(invalid_bytes),
+		sizeof(invalid_bytes) - 1}; // - 1: leave out the trailing '\0'
+
+	g_assert_false(g_utf8_validate(invalid.c_str(), invalid.length(), nullptr));
+	// after cleaning, the string validates and 0xff has become U+FFFD
+	const auto valid = utf8_clean(std::move(invalid));
+	g_assert_true(g_utf8_validate(valid.c_str(), valid.length(), {}));
+	assert_equal(valid, "a\357\277\275c"); // replacement char
+}
+
+static void
 test_flatten()
 {
 	CaseVec cases = {
@@ -159,7 +175,7 @@ test_flatten()
 	    {"đodø",		true, "dodo"},
 
 	    // don't touch combining characters in CJK etc.
-	    {"スポンサーシップ募集",true, "スポンサーシップ募集"}
+	    {"スポンサーシップ募集", true, "スポンサーシップ募集"}
 	};
 
 	test_cases(cases, [](auto s, auto f) { return utf8_flatten(s); });
@@ -192,7 +208,6 @@ test_clean()
 	test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
 }
 
-
 static void
 test_word_break()
 {
@@ -206,7 +221,6 @@ test_word_break()
 	test_cases(cases, [](auto s, auto f) { return utf8_wordbreak(s); });
 }
 
-
 static void
 test_format()
 {
@@ -327,6 +341,7 @@ main(int argc, char* argv[])
 	g_test_add_func("/utils/date-basic", test_date_basic);
 	g_test_add_func("/utils/date-ymwdhMs", test_date_ymwdhMs);
 	g_test_add_func("/utils/parse-size", test_parse_size);
+	g_test_add_func("/utils/utf8-clean", test_utf8_clean);
 	g_test_add_func("/utils/flatten", test_flatten);
 	g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
 	g_test_add_func("/utils/clean", test_clean);
author	Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>	2025-10-25 16:53:57 +0300
committer	Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>	2025-10-25 16:55:43 +0300
commit	825fed4be8c5e68b25a7b04cfccff46cdd86e4f2 (patch)
tree	18041e0006e9c07b17b018d4ba776aeaee7abf0e
parent	5a2b7290a5866f44d69f3aac9e6364ddb364adc8 (diff)