utils: add utf8_wordbreak
Determine if a string has wordbreaks in a mostly Xapian-compatible way. We need this to determine what strings should be considered "phrases".
This commit is contained in:
@ -207,6 +207,60 @@ Mu::utf8_clean(const std::string& dirty)
|
||||
return std::string{g_strstrip(gstr->str)};
|
||||
}
|
||||
|
||||
|
||||
std::string
|
||||
Mu::utf8_wordbreak(const std::string& txt)
|
||||
{
|
||||
g_autoptr(GString) gstr = g_string_sized_new(txt.length());
|
||||
|
||||
bool spc{};
|
||||
for (auto cur = txt.c_str(); cur && *cur; cur = g_utf8_next_char(cur)) {
|
||||
const gunichar uc = g_utf8_get_char(cur);
|
||||
|
||||
if (g_unichar_iscntrl(uc)) {
|
||||
g_string_append_c(gstr, ' ');
|
||||
continue;
|
||||
}
|
||||
// inspired by Xapian's termgenerator.
|
||||
|
||||
switch(uc) {
|
||||
case '\'':
|
||||
case '&':
|
||||
case 0xb7:
|
||||
case 0x5f4:
|
||||
case 0x2019:
|
||||
case 0x201b:
|
||||
case 0x2027:
|
||||
case ',':
|
||||
case '.':
|
||||
case ';':
|
||||
case '+':
|
||||
case '#':
|
||||
case '-':
|
||||
case 0x037e: // GREEK QUESTION MARK
|
||||
case 0x0589: // ARMENIAN FULL STOP
|
||||
case 0x060D: // ARABIC DATE SEPARATOR
|
||||
case 0x07F8: // NKO COMMA
|
||||
case 0x2044: // FRACTION SLASH
|
||||
case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
|
||||
case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
|
||||
case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
|
||||
if (spc)
|
||||
break;
|
||||
spc = true;
|
||||
g_string_append_c(gstr, ' ');
|
||||
break;
|
||||
default:
|
||||
spc = false;
|
||||
g_string_append_unichar(gstr, uc);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return std::string{g_strstrip(gstr->str)};
|
||||
}
|
||||
|
||||
|
||||
std::string
|
||||
Mu::remove_ctrl(const std::string& str)
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user