utils: add utf8_wordbreak

Determine if a string has wordbreaks in a mostly Xapian-compatible way.
We need this to determine what strings should be considered "phrases".
This commit is contained in:
Dirk-Jan C. Binnema
2023-09-17 10:01:15 +03:00
parent 94c90bd0c5
commit 7cbab21099
3 changed files with 80 additions and 0 deletions

View File

@ -207,6 +207,60 @@ Mu::utf8_clean(const std::string& dirty)
return std::string{g_strstrip(gstr->str)}; return std::string{g_strstrip(gstr->str)};
} }
/**
 * Replace word-break characters in @p txt with spaces, in a way inspired by
 * Xapian's term generator.
 *
 * - every run of consecutive word-break characters becomes a single ' ';
 * - every control character becomes a ' ' (not collapsed);
 * - bytes that do not form valid UTF-8 are treated as word-break characters;
 * - processing stops at the first NUL byte;
 * - leading and trailing whitespace is stripped from the result.
 *
 * @param txt UTF-8 text
 *
 * @return the text with word-breaks replaced by spaces
 */
std::string
Mu::utf8_wordbreak(const std::string& txt)
{
	g_autoptr(GString) gstr = g_string_sized_new(txt.length());
	bool spc{}; // true while we are inside a run of word-break characters

	// emit at most one SPC per run of word-break characters
	const auto append_break = [&] {
		if (spc)
			return;
		spc = true;
		g_string_append_c(gstr, ' ');
	};

	const char* const end = txt.data() + txt.size();
	for (auto cur = txt.data(); cur < end && *cur;) {
		// g_utf8_get_char / g_utf8_next_char require valid UTF-8 and can
		// walk past the terminating NUL on a truncated sequence; decode
		// with validation, bounded by the end of the string instead.
		const gunichar uc = g_utf8_get_char_validated(cur, end - cur);
		if (uc == static_cast<gunichar>(-1) || uc == static_cast<gunichar>(-2)) {
			// malformed or incomplete sequence: it cannot be part
			// of a word, so skip one byte and treat it as a break
			append_break();
			++cur;
			continue;
		}
		// the sequence at cur is complete and valid, so this is in bounds
		cur = g_utf8_next_char(cur);

		if (g_unichar_iscntrl(uc)) {
			// control characters always yield a SPC and do not
			// take part in the run-collapsing above
			g_string_append_c(gstr, ' ');
			continue;
		}

		// inspired by Xapian's termgenerator.
		switch (uc) {
		case '\'':
		case '&':
		case 0xb7:
		case 0x5f4:
		case 0x2019:
		case 0x201b:
		case 0x2027:
		case ',':
		case '.':
		case ';':
		case '+':
		case '#':
		case '-':
		case 0x037e: // GREEK QUESTION MARK
		case 0x0589: // ARMENIAN FULL STOP
		case 0x060D: // ARABIC DATE SEPARATOR
		case 0x07F8: // NKO COMMA
		case 0x2044: // FRACTION SLASH
		case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
		case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
		case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
			append_break();
			break;
		default:
			spc = false;
			g_string_append_unichar(gstr, uc);
			break;
		}
	}

	// g_strstrip works in place on the GString's buffer; the result is
	// copied into the returned std::string before gstr is released.
	return std::string{g_strstrip(gstr->str)};
}
std::string std::string
Mu::remove_ctrl(const std::string& str) Mu::remove_ctrl(const std::string& str)
{ {

View File

@ -187,6 +187,17 @@ utf8_flatten(const std::string& s) {
*/ */
std::string utf8_clean(const std::string& dirty); std::string utf8_clean(const std::string& dirty);
/**
 * Replace word-break characters (chosen in a mostly Xapian-compatible way)
 * with spaces: each run of consecutive word-break characters becomes a
 * single SPC, and each control character becomes a SPC. Leading and
 * trailing whitespace is stripped from the result.
 *
 * Used to determine which strings should be considered "phrases".
 *
 * @param txt text
 *
 * @return the text with word-breaks replaced by SPC
 */
std::string utf8_wordbreak(const std::string& txt);
/** /**
* Remove ctrl characters, replacing them with ' '; subsequent * Remove ctrl characters, replacing them with ' '; subsequent
* ctrl characters are replaced by a single ' ' * ctrl characters are replaced by a single ' '

View File

@ -188,6 +188,20 @@ test_clean()
test_cases(cases, [](auto s, auto f) { return utf8_clean(s); }); test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
} }
static void
test_word_break()
{
CaseVec cases = {
{"aap+noot&mies", true, "aap noot mies"},
{"hallo", true, "hallo"},
{" foo-bar###cuux,fnorb ", true, "foo bar cuux fnorb"},
};
test_cases(cases, [](auto s, auto f) { return utf8_wordbreak(s); });
}
static void static void
test_format() test_format()
{ {
@ -313,6 +327,7 @@ main(int argc, char* argv[])
g_test_add_func("/utils/flatten", test_flatten); g_test_add_func("/utils/flatten", test_flatten);
g_test_add_func("/utils/remove-ctrl", test_remove_ctrl); g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
g_test_add_func("/utils/clean", test_clean); g_test_add_func("/utils/clean", test_clean);
g_test_add_func("/utils/word-break", test_word_break);
g_test_add_func("/utils/format", test_format); g_test_add_func("/utils/format", test_format);
g_test_add_func("/utils/summarize", test_summarize); g_test_add_func("/utils/summarize", test_summarize);
g_test_add_func("/utils/split", test_split); g_test_add_func("/utils/split", test_split);