utils: add utf8_wordbreak

Determine if a string has wordbreaks in a mostly Xapian-compatible way. We need this to determine what strings should be considered "phrases".
2023-09-17 10:01:15 +03:00
parent 94c90bd0c5
commit 7cbab21099
3 changed files with 80 additions and 0 deletions
--- a/lib/utils/mu-utils.cc
+++ b/lib/utils/mu-utils.cc
@ -207,6 +207,60 @@ Mu::utf8_clean(const std::string& dirty)
 	return std::string{g_strstrip(gstr->str)};
 }
 std::string
 Mu::utf8_wordbreak(const std::string& txt)
 {
 	g_autoptr(GString) gstr = g_string_sized_new(txt.length());
 	bool spc{};
 	for (auto cur = txt.c_str(); cur && *cur; cur = g_utf8_next_char(cur)) {
 		const gunichar uc = g_utf8_get_char(cur);
 		if (g_unichar_iscntrl(uc)) {
 			g_string_append_c(gstr, ' ');
 			continue;
 		}
 		// inspired by Xapian's termgenerator.
 		switch(uc) {
 		case '\'':
 		case '&':
 		case 0xb7:
 		case 0x5f4:
 		case 0x2019:
 		case 0x201b:
 		case 0x2027:
 		case ',':
 		case '.':
 		case ';':
 		case '+':
 		case '#':
 		case '-':
 		case 0x037e: // GREEK QUESTION MARK
 		case 0x0589: // ARMENIAN FULL STOP
 		case 0x060D: // ARABIC DATE SEPARATOR
 		case 0x07F8: // NKO COMMA
 		case 0x2044: // FRACTION SLASH
 		case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
 		case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
 		case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
 			if (spc)
 				break;
 			spc = true;
 			g_string_append_c(gstr, ' ');
 			break;
 		default:
 			spc = false;
 			g_string_append_unichar(gstr, uc);
 			break;
 		}
 	}
 	return std::string{g_strstrip(gstr->str)};
 }
 std::string
 Mu::remove_ctrl(const std::string& str)
 {
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@ -187,6 +187,17 @@ utf8_flatten(const std::string& s) {
 */
 std::string utf8_clean(const std::string& dirty);
 /**
 * Replace word-break characters (a set inspired by Xapian's term
 * generator) with spaces: each run of consecutive word-break characters
 * becomes a single SPC. Control characters are replaced by SPC as well,
 * and leading and trailing whitespace is stripped from the result.
 *
 * @param txt text, expected to be UTF-8 encoded
 *
 * @return a copy of txt with the word-breaks replaced by SPC
 */
 std::string utf8_wordbreak(const std::string& txt);
 /**
 * Remove ctrl characters, replacing them with ' '; subsequent
 * ctrl characters are replaced by a single ' '
--- a/lib/utils/tests/test-utils.cc
+++ b/lib/utils/tests/test-utils.cc
@ -188,6 +188,20 @@ test_clean()
 	test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
 }
 // Check utf8_wordbreak() against a table of inputs and expected outputs.
 static void
 test_word_break()
 {
 	// adapter for test_cases(); the second argument is not needed by
 	// utf8_wordbreak() and is ignored.
 	const auto wordbreak = [](auto input, auto /*unused*/) {
 		return utf8_wordbreak(input);
 	};
 	// each case: input, flag (ignored by the adapter above), expected output
 	CaseVec cases = {
 	    {"aap+noot&mies", true, "aap noot mies"},
 	    {"hallo", true, "hallo"},
 	    {"  foo-bar###cuux,fnorb  ", true, "foo bar cuux fnorb"},
 	};
 	test_cases(cases, wordbreak);
 }
 static void
 test_format()
 {
@ -313,6 +327,7 @@ main(int argc, char* argv[])
 	g_test_add_func("/utils/flatten", test_flatten);
 	g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
 	g_test_add_func("/utils/clean", test_clean);
 	g_test_add_func("/utils/word-break", test_word_break);
 	g_test_add_func("/utils/format", test_format);
 	g_test_add_func("/utils/summarize", test_summarize);
 	g_test_add_func("/utils/split", test_split);