utils: add utf8_wordbreak
Determine if a string has wordbreaks in a mostly Xapian-compatible way. We need this to determine what strings should be considered "phrases".
This commit is contained in:
@ -207,6 +207,60 @@ Mu::utf8_clean(const std::string& dirty)
|
||||
return std::string{g_strstrip(gstr->str)};
|
||||
}
|
||||
|
||||
|
||||
std::string
|
||||
Mu::utf8_wordbreak(const std::string& txt)
|
||||
{
|
||||
g_autoptr(GString) gstr = g_string_sized_new(txt.length());
|
||||
|
||||
bool spc{};
|
||||
for (auto cur = txt.c_str(); cur && *cur; cur = g_utf8_next_char(cur)) {
|
||||
const gunichar uc = g_utf8_get_char(cur);
|
||||
|
||||
if (g_unichar_iscntrl(uc)) {
|
||||
g_string_append_c(gstr, ' ');
|
||||
continue;
|
||||
}
|
||||
// inspired by Xapian's termgenerator.
|
||||
|
||||
switch(uc) {
|
||||
case '\'':
|
||||
case '&':
|
||||
case 0xb7:
|
||||
case 0x5f4:
|
||||
case 0x2019:
|
||||
case 0x201b:
|
||||
case 0x2027:
|
||||
case ',':
|
||||
case '.':
|
||||
case ';':
|
||||
case '+':
|
||||
case '#':
|
||||
case '-':
|
||||
case 0x037e: // GREEK QUESTION MARK
|
||||
case 0x0589: // ARMENIAN FULL STOP
|
||||
case 0x060D: // ARABIC DATE SEPARATOR
|
||||
case 0x07F8: // NKO COMMA
|
||||
case 0x2044: // FRACTION SLASH
|
||||
case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
|
||||
case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
|
||||
case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
|
||||
if (spc)
|
||||
break;
|
||||
spc = true;
|
||||
g_string_append_c(gstr, ' ');
|
||||
break;
|
||||
default:
|
||||
spc = false;
|
||||
g_string_append_unichar(gstr, uc);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return std::string{g_strstrip(gstr->str)};
|
||||
}
|
||||
|
||||
|
||||
std::string
|
||||
Mu::remove_ctrl(const std::string& str)
|
||||
{
|
||||
|
||||
@ -187,6 +187,17 @@ utf8_flatten(const std::string& s) {
|
||||
*/
|
||||
std::string utf8_clean(const std::string& dirty);
|
||||
|
||||
|
||||
/**
|
||||
* Replace each run of wordbreak chars (as recognized by Xapian) with a single SPC
|
||||
*
|
||||
* @param txt UTF-8 text to process
|
||||
*
|
||||
* @return the text with wordbreak chars replaced by SPC, stripped of leading and trailing whitespace
|
||||
*/
|
||||
std::string utf8_wordbreak(const std::string& txt);
|
||||
|
||||
|
||||
/**
|
||||
* Remove ctrl characters, replacing them with ' '; subsequent
|
||||
* ctrl characters are replaced by a single ' '
|
||||
|
||||
@ -188,6 +188,20 @@ test_clean()
|
||||
test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_word_break()
|
||||
{
|
||||
CaseVec cases = {
|
||||
{"aap+noot&mies", true, "aap noot mies"},
|
||||
{"hallo", true, "hallo"},
|
||||
{" foo-bar###cuux,fnorb ", true, "foo bar cuux fnorb"},
|
||||
};
|
||||
|
||||
test_cases(cases, [](auto s, auto f) { return utf8_wordbreak(s); });
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
test_format()
|
||||
{
|
||||
@ -313,6 +327,7 @@ main(int argc, char* argv[])
|
||||
g_test_add_func("/utils/flatten", test_flatten);
|
||||
g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
|
||||
g_test_add_func("/utils/clean", test_clean);
|
||||
g_test_add_func("/utils/word-break", test_word_break);
|
||||
g_test_add_func("/utils/format", test_format);
|
||||
g_test_add_func("/utils/summarize", test_summarize);
|
||||
g_test_add_func("/utils/split", test_split);
|
||||
|
||||
Reference in New Issue
Block a user