utils: handle "unbroken" scripts
Do not remove combining characters from scripts without explicit word boundaries, such as those for CJK. Reuse some Xapian code for that.
This commit is contained in:
@ -44,6 +44,8 @@
|
||||
#include <glib/gprintf.h>
|
||||
|
||||
#include "mu-utils.hh"
|
||||
#include "mu-unbroken.hh"
|
||||
|
||||
#include "mu-error.hh"
|
||||
#include "mu-option.hh"
|
||||
|
||||
@ -112,12 +114,28 @@ gx_utf8_flatten(const gchar* str, gssize len)
|
||||
|
||||
} // namespace
|
||||
|
||||
bool
|
||||
Mu::contains_unbroken_script(const char *str)
|
||||
{
|
||||
while (str && *str) {
|
||||
auto uc = g_utf8_get_char(str);
|
||||
if (is_unbroken_script(uc))
|
||||
return true;
|
||||
str = g_utf8_next_char(str);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string // gx_utf8_flatten
|
||||
Mu::utf8_flatten(const char* str)
|
||||
{
|
||||
if (!str)
|
||||
return {};
|
||||
|
||||
if (contains_unbroken_script(str))
|
||||
return std::string{str};
|
||||
|
||||
// the pure-ascii case
|
||||
if (g_str_is_ascii(str)) {
|
||||
auto l = g_ascii_strdown(str, -1);
|
||||
|
||||
Reference in New Issue
Block a user