utils: handle "unbroken" scripts

Do not remove combining characters from scripts without explicit word
boundaries, such as those for CJK.

Reuse some Xapian code for that.
This commit is contained in:
Dirk-Jan C. Binnema
2023-09-09 11:37:53 +03:00
parent 080cf43b4a
commit 9c28c65d45
4 changed files with 162 additions and 9 deletions

View File

@ -44,6 +44,8 @@
#include <glib/gprintf.h>
#include "mu-utils.hh"
#include "mu-unbroken.hh"
#include "mu-error.hh"
#include "mu-option.hh"
@ -112,12 +114,28 @@ gx_utf8_flatten(const gchar* str, gssize len)
} // namespace
/**
 * Report whether a string contains at least one character for which
 * is_unbroken_script() returns true.
 *
 * The string is walked one UTF-8 character at a time and the scan stops at
 * the first match.
 *
 * NOTE(review): the input is presumably expected to be valid,
 * NUL-terminated UTF-8, since it is decoded with g_utf8_get_char() without
 * prior validation -- confirm against callers.
 *
 * @param str string to inspect; may be NULL
 *
 * @return true if any character matches; false otherwise, including when
 * str is NULL or empty
 */
bool
Mu::contains_unbroken_script(const char *str)
{
	// Nothing to scan.
	if (!str)
		return false;

	for (auto cursor = str; *cursor; cursor = g_utf8_next_char(cursor)) {
		if (is_unbroken_script(g_utf8_get_char(cursor)))
			return true;
	}

	return false;
}
std::string // gx_utf8_flatten
Mu::utf8_flatten(const char* str)
{
if (!str)
return {};
if (contains_unbroken_script(str))
return std::string{str};
// the pure-ascii case
if (g_str_is_ascii(str)) {
auto l = g_ascii_strdown(str, -1);