diff --git a/lib/utils/mu-utils.cc b/lib/utils/mu-utils.cc
index e60b65ff..d3fea3f6 100644
--- a/lib/utils/mu-utils.cc
+++ b/lib/utils/mu-utils.cc
@@ -207,6 +207,78 @@ Mu::utf8_clean(const std::string& dirty)
 	return std::string{g_strstrip(gstr->str)};
 }
 
+
+// Tell whether uc separates words. The punctuation list is inspired by
+// Xapian's termgenerator; control characters and SPC are included as well,
+// so that a mixed run of any of these collapses into one SPC.
+static bool
+is_wordbreak(gunichar uc)
+{
+	if (g_unichar_iscntrl(uc) || uc == ' ')
+		return true;
+
+	switch (uc) {
+	case '\'':
+	case '&':
+	case 0xb7:   // MIDDLE DOT
+	case 0x5f4:  // HEBREW PUNCTUATION GERSHAYIM
+	case 0x2019: // RIGHT SINGLE QUOTATION MARK
+	case 0x201b: // SINGLE HIGH-REVERSED-9 QUOTATION MARK
+	case 0x2027: // HYPHENATION POINT
+	case ',':
+	case '.':
+	case ';':
+	case '+':
+	case '#':
+	case '-':
+	case 0x037e: // GREEK QUESTION MARK
+	case 0x0589: // ARMENIAN FULL STOP
+	case 0x060D: // ARABIC DATE SEPARATOR
+	case 0x07F8: // NKO COMMA
+	case 0x2044: // FRACTION SLASH
+	case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
+	case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
+	case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
+		return true;
+	default:
+		return false;
+	}
+}
+
+std::string
+Mu::utf8_wordbreak(const std::string& txt)
+{
+	g_autoptr(GString) gstr = g_string_sized_new(txt.length());
+
+	const char*       cur = txt.data();
+	const char* const end = cur + txt.size();
+
+	bool spc{}; // true while the last character appended is a separating SPC
+	while (cur < end) {
+		// Decode with bounds and validity checks: g_utf8_get_char() and
+		// g_utf8_next_char() have neither, so a truncated multi-byte
+		// sequence at the end of txt would take them past its end.
+		const gunichar uc = g_utf8_get_char_validated(cur, static_cast<gssize>(end - cur));
+		// (gunichar)-1: invalid sequence; (gunichar)-2: incomplete one
+		const bool valid = uc != static_cast<gunichar>(-1) && uc != static_cast<gunichar>(-2);
+
+		// A valid sequence lies within [cur, end), so skipping it is safe;
+		// otherwise, drop just the offending byte.
+		cur = valid ? g_utf8_next_char(cur) : cur + 1;
+
+		if (valid && !is_wordbreak(uc)) {
+			g_string_append_unichar(gstr, uc);
+			spc = false;
+		} else if (!spc) { // only the first separator of a run yields a SPC
+			g_string_append_c(gstr, ' ');
+			spc = true;
+		}
+	}
+
+	return std::string{g_strstrip(gstr->str)};
+}
+
+
 std::string
 Mu::remove_ctrl(const std::string& str)
 {
diff --git a/lib/utils/mu-utils.hh b/lib/utils/mu-utils.hh
index 6ca0e85f..115d0ae2 100644
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@@ -187,6 +187,22 @@ utf8_flatten(const std::string& s) {
  */
 std::string utf8_clean(const std::string& dirty);
 
+
+/**
+ * Replace each run of word-break characters with a single SPC, and strip
+ * leading and trailing whitespace.
+ *
+ * Word-break characters are a set of punctuation characters inspired by
+ * Xapian's term generator, control characters and SPC itself; bytes that do
+ * not form valid UTF-8 are treated as word-breaks as well.
+ *
+ * @param txt UTF-8 encoded text
+ *
+ * @return txt with the word-breaks replaced
+ */
+std::string utf8_wordbreak(const std::string& txt);
+
+
 /**
  * Remove ctrl characters, replacing them with ' '; subsequent
  * ctrl characters are replaced by a single ' '
diff --git a/lib/utils/tests/test-utils.cc b/lib/utils/tests/test-utils.cc
index f0e98412..24642cf8 100644
--- a/lib/utils/tests/test-utils.cc
+++ b/lib/utils/tests/test-utils.cc
@@ -188,6 +188,30 @@ test_clean()
 	test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
 }
 
+
+static void
+test_word_break()
+{
+	CaseVec cases = {
+		{"aap+noot&mies", true, "aap noot mies"},
+		{"hallo", true, "hallo"},
+		{" foo-bar###cuux,fnorb ", true, "foo bar cuux fnorb"},
+		// empty input / nothing but word-breaks
+		{"", true, ""},
+		{"+-#", true, ""},
+		// mixed runs of punctuation, spaces and control characters collapse
+		{"foo - bar\t\n+cuux", true, "foo bar cuux"},
+		// non-ASCII: U+2019 and U+00B7 break words, U+00E9 does not
+		{"foo\xe2\x80\x99" "bar\xc2\xb7" "cuux", true, "foo bar cuux"},
+		{"caf\xc3\xa9-au-lait", true, "caf\xc3\xa9 au lait"},
+		// truncated multi-byte sequence at the end
+		{"foo\xe2", true, "foo"},
+	};
+
+	test_cases(cases, [](auto s, auto f) { return utf8_wordbreak(s); });
+}
+
+
 static void
 test_format()
 {
@@ -313,6 +337,7 @@ main(int argc, char* argv[])
 	g_test_add_func("/utils/flatten", test_flatten);
 	g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
 	g_test_add_func("/utils/clean", test_clean);
+	g_test_add_func("/utils/word-break", test_word_break);
 	g_test_add_func("/utils/format", test_format);
 	g_test_add_func("/utils/summarize", test_summarize);
 	g_test_add_func("/utils/split", test_split);