utils: add utf8_wordbreak
Determine if a string has wordbreaks in a mostly Xapian-compatible way. We need this to determine what strings should be considered "phrases".
This commit is contained in:
@ -207,6 +207,60 @@ Mu::utf8_clean(const std::string& dirty)
|
|||||||
return std::string{g_strstrip(gstr->str)};
|
return std::string{g_strstrip(gstr->str)};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
std::string
|
||||||
|
Mu::utf8_wordbreak(const std::string& txt)
|
||||||
|
{
|
||||||
|
g_autoptr(GString) gstr = g_string_sized_new(txt.length());
|
||||||
|
|
||||||
|
bool spc{};
|
||||||
|
for (auto cur = txt.c_str(); cur && *cur; cur = g_utf8_next_char(cur)) {
|
||||||
|
const gunichar uc = g_utf8_get_char(cur);
|
||||||
|
|
||||||
|
if (g_unichar_iscntrl(uc)) {
|
||||||
|
g_string_append_c(gstr, ' ');
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// inspired by Xapian's termgenerator.
|
||||||
|
|
||||||
|
switch(uc) {
|
||||||
|
case '\'':
|
||||||
|
case '&':
|
||||||
|
case 0xb7:
|
||||||
|
case 0x5f4:
|
||||||
|
case 0x2019:
|
||||||
|
case 0x201b:
|
||||||
|
case 0x2027:
|
||||||
|
case ',':
|
||||||
|
case '.':
|
||||||
|
case ';':
|
||||||
|
case '+':
|
||||||
|
case '#':
|
||||||
|
case '-':
|
||||||
|
case 0x037e: // GREEK QUESTION MARK
|
||||||
|
case 0x0589: // ARMENIAN FULL STOP
|
||||||
|
case 0x060D: // ARABIC DATE SEPARATOR
|
||||||
|
case 0x07F8: // NKO COMMA
|
||||||
|
case 0x2044: // FRACTION SLASH
|
||||||
|
case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
|
||||||
|
case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
|
||||||
|
case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
|
||||||
|
if (spc)
|
||||||
|
break;
|
||||||
|
spc = true;
|
||||||
|
g_string_append_c(gstr, ' ');
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
spc = false;
|
||||||
|
g_string_append_unichar(gstr, uc);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::string{g_strstrip(gstr->str)};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
Mu::remove_ctrl(const std::string& str)
|
Mu::remove_ctrl(const std::string& str)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -187,6 +187,17 @@ utf8_flatten(const std::string& s) {
|
|||||||
*/
|
*/
|
||||||
std::string utf8_clean(const std::string& dirty);
|
std::string utf8_clean(const std::string& dirty);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Replace all wordbreak chars (as recognized by Xapian) by a single SPC
|
||||||
|
*
|
||||||
|
* @param txt text
|
||||||
|
*
|
||||||
|
* @return the converted string, with leading and trailing whitespace removed
|
||||||
|
*/
|
||||||
|
std::string utf8_wordbreak(const std::string& txt);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Remove ctrl characters, replacing them with ' '; subsequent
|
* Remove ctrl characters, replacing them with ' '; subsequent
|
||||||
* ctrl characters are replaced by a single ' '
|
* ctrl characters are replaced by a single ' '
|
||||||
|
|||||||
@ -188,6 +188,20 @@ test_clean()
|
|||||||
test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
|
test_cases(cases, [](auto s, auto f) { return utf8_clean(s); });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_word_break()
|
||||||
|
{
|
||||||
|
CaseVec cases = {
|
||||||
|
{"aap+noot&mies", true, "aap noot mies"},
|
||||||
|
{"hallo", true, "hallo"},
|
||||||
|
{" foo-bar###cuux,fnorb ", true, "foo bar cuux fnorb"},
|
||||||
|
};
|
||||||
|
|
||||||
|
test_cases(cases, [](auto s, auto f) { return utf8_wordbreak(s); });
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_format()
|
test_format()
|
||||||
{
|
{
|
||||||
@ -313,6 +327,7 @@ main(int argc, char* argv[])
|
|||||||
g_test_add_func("/utils/flatten", test_flatten);
|
g_test_add_func("/utils/flatten", test_flatten);
|
||||||
g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
|
g_test_add_func("/utils/remove-ctrl", test_remove_ctrl);
|
||||||
g_test_add_func("/utils/clean", test_clean);
|
g_test_add_func("/utils/clean", test_clean);
|
||||||
|
g_test_add_func("/utils/word-break", test_word_break);
|
||||||
g_test_add_func("/utils/format", test_format);
|
g_test_add_func("/utils/format", test_format);
|
||||||
g_test_add_func("/utils/summarize", test_summarize);
|
g_test_add_func("/utils/summarize", test_summarize);
|
||||||
g_test_add_func("/utils/split", test_split);
|
g_test_add_func("/utils/split", test_split);
|
||||||
|
|||||||
Reference in New Issue
Block a user