utils: add utf8_wordbreak

Determine if a string has wordbreaks in a mostly Xapian-compatible way. We need this to determine what strings should be considered "phrases".
2023-09-17 10:01:15 +03:00
parent 94c90bd0c5
commit 7cbab21099
3 changed files with 80 additions and 0 deletions
--- a/lib/utils/mu-utils.cc
+++ b/lib/utils/mu-utils.cc
@ -207,6 +207,60 @@ Mu::utf8_clean(const std::string& dirty)
 	return std::string{g_strstrip(gstr->str)};
 }

+
+// Replace each run of word-break chars in txt (punctuation listed below,
+// spaces, control chars, invalid UTF-8) by one ' '; strip the result.
+std::string
+Mu::utf8_wordbreak(const std::string& txt)
+{
+	g_autoptr(GString) gstr = g_string_sized_new(txt.length());
+	const char* const end = txt.c_str() + txt.length();
+
+	bool spc{}; // true while the last char appended was a break-space
+	for (auto cur = txt.c_str(); cur < end && *cur;) {
+		// Bounded decode: g_utf8_get_char()/g_utf8_next_char() would
+		// read past the terminator on a truncated trailing sequence.
+		const gunichar uc = g_utf8_get_char_validated(cur, end - cur);
+		const bool valid = uc != static_cast<gunichar>(-1) &&
+				   uc != static_cast<gunichar>(-2);
+		// an undecodable byte is skipped on its own and acts as a break
+		cur = valid ? g_utf8_next_char(cur) : cur + 1;
+
+		bool brk = !valid || g_unichar_iscntrl(uc);
+		switch (uc) { // inspired by Xapian's termgenerator.
+		case ' ': case '\'': case '&': case ',': case '.':
+		case ';': case '+': case '#': case '-':
+		case 0xb7:   // MIDDLE DOT
+		case 0x5f4:  // HEBREW PUNCTUATION GERSHAYIM
+		case 0x2019: // RIGHT SINGLE QUOTATION MARK
+		case 0x201b: // SINGLE HIGH-REVERSED-9 QUOTATION MARK
+		case 0x2027: // HYPHENATION POINT
+		case 0x037e: // GREEK QUESTION MARK
+		case 0x0589: // ARMENIAN FULL STOP
+		case 0x060D: // ARABIC DATE SEPARATOR
+		case 0x07F8: // NKO COMMA
+		case 0x2044: // FRACTION SLASH
+		case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
+		case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
+		case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
+			brk = true;
+			break;
+		default:
+			break;
+		}
+		if (!brk) {
+			spc = false;
+			g_string_append_unichar(gstr, uc);
+		} else if (!spc) { // only the first break of a run emits ' '
+			spc = true;
+			g_string_append_c(gstr, ' ');
+		}
+	}
+
+	return std::string{g_strstrip(gstr->str)};
+}
+
+
 std::string
 Mu::remove_ctrl(const std::string& str)
 {
--- a/lib/utils/mu-utils.hh
+++ b/lib/utils/mu-utils.hh
@ -187,6 +187,17 @@ utf8_flatten(const std::string& s) {
 */
 std::string utf8_clean(const std::string& dirty);

+
+/**
+ * Replace wordbreak chars (mostly as recognized by Xapian) with SPC; a run
+ * of consecutive wordbreak punctuation chars yields a single SPC.
+ *
+ * @param txt UTF-8 text
+ * @return copy of txt with the replacements; outer whitespace is stripped
+ */
+std::string utf8_wordbreak(const std::string& txt);
+
+
 /**
 * Remove ctrl characters, replacing them with ' '; subsequent
 * ctrl characters are replaced by a single ' '
--- a/lib/utils/tests/test-utils.cc
+++ b/lib/utils/tests/test-utils.cc
@ -188,6 +188,20 @@ test_clean()
 	test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
 }

+
+static void
+test_word_break()
+{
+	// per case: input text, a flag the callback ignores, result to compare
+	CaseVec wb_cases = {
+	    {"aap+noot&mies",            true,  "aap noot mies"},
+	    {"hallo",                    true,  "hallo"},
+	    {"  foo-bar###cuux,fnorb  ", true, "foo bar cuux fnorb"},
+	};
+	test_cases(wb_cases, [](auto txt, auto) { return utf8_wordbreak(txt); });
+}
+
+
 static void
 test_format()
 {
@ -313,6 +327,7 @@ main(int argc, char* argv[])
 	g_test_add_func("/utils/flatten", test_flatten);
 	g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
 	g_test_add_func("/utils/clean", test_clean);
+	g_test_add_func("/utils/word-break", test_word_break);
 	g_test_add_func("/utils/format", test_format);
 	g_test_add_func("/utils/summarize", test_summarize);
 	g_test_add_func("/utils/split", test_split);