utils: add utf8_wordbreak

Determine if a string has wordbreaks in a mostly Xapian-compatible way. We need this to determine what strings should be considered "phrases".
2023-09-17 10:01:15 +03:00
parent 94c90bd0c5
commit 7cbab21099
3 changed files with 80 additions and 0 deletions
--- a/lib/utils/mu-utils.cc
+++ b/lib/utils/mu-utils.cc
@@ -207,6 +207,60 @@ Mu::utf8_clean(const std::string& dirty)
 	return std::string{g_strstrip(gstr->str)};
 }

+
+/// Replace each run of word-breaking characters in txt by a single space, with
+/// rules modelled on Xapian's term generator, and trim surrounding whitespace.
+/// Control characters and malformed UTF-8 bytes each become a space; scanning
+/// stops at the first NUL byte.
+std::string
+Mu::utf8_wordbreak(const std::string& txt)
+{
+	g_autoptr(GString) gstr = g_string_sized_new(txt.length());
+	const char* const end = txt.data() + txt.size();
+
+	bool spc{}; // true after a break character, until the next ordinary one
+	for (auto cur = txt.data(); cur < end && *cur;) {
+		// bounded decode, so truncated input cannot make us read past the buffer
+		const gunichar uc = g_utf8_get_char_validated(cur, end - cur);
+		const bool valid = uc != static_cast<gunichar>(-1) &&
+			uc != static_cast<gunichar>(-2);
+		cur = valid ? g_utf8_next_char(cur) : cur + 1;
+
+		if (!valid || g_unichar_iscntrl(uc)) {
+			g_string_append_c(gstr, ' ');
+			continue;
+		}
+		switch(uc) { // break characters, inspired by Xapian's termgenerator
+		case '\'': case '&': case ',': case '.': case ';':
+		case '+': case '#': case '-':
+		case 0xb7:   // MIDDLE DOT
+		case 0x5f4:  // HEBREW PUNCTUATION GERSHAYIM
+		case 0x2019: // RIGHT SINGLE QUOTATION MARK
+		case 0x201b: // SINGLE HIGH-REVERSED-9 QUOTATION MARK
+		case 0x2027: // HYPHENATION POINT
+		case 0x037e: // GREEK QUESTION MARK
+		case 0x0589: // ARMENIAN FULL STOP
+		case 0x060D: // ARABIC DATE SEPARATOR
+		case 0x07F8: // NKO COMMA
+		case 0x2044: // FRACTION SLASH
+		case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
+		case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
+		case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
+			if (!spc) // collapse a run of breaks into one space
+				g_string_append_c(gstr, ' ');
+			spc = true;
+			break;
+		default:
+			spc = false;
+			g_string_append_unichar(gstr, uc);
+			break;
+		}
+	}
+
+	return std::string{g_strstrip(gstr->str)};
+}
+
+
 std::string
 Mu::remove_ctrl(const std::string& str)
 {