utils: handle "unbroken" scripts

Do not remove combining characters from scripts without explicit word
boundaries, such as those used for CJK.

Reuse some Xapian code for that.
This commit is contained in:
Dirk-Jan C. Binnema
2023-09-09 11:37:53 +03:00
parent 080cf43b4a
commit 9c28c65d45
4 changed files with 162 additions and 9 deletions

View File

@ -154,7 +154,19 @@ std::tm mu_time(T t={}, bool use_utc=false) {
using StringVec = std::vector<std::string>;
/**
* Flatten a string -- downcase and fold diacritics etc.
* Does the string contain script without explicit word separators?
*
* @param str a string
*
* @return true or false
*/
bool contains_unbroken_script(const char* str);
/**
 * Convenience overload of contains_unbroken_script() taking a std::string.
 *
 * Forwards the string's NUL-terminated buffer to the const char* overload.
 *
 * @param str a string
 *
 * @return whatever the const char* overload returns for str.c_str()
 */
static inline bool contains_unbroken_script(const std::string& str) {
	const char* cstr = str.c_str();
	return contains_unbroken_script(cstr);
}
/**
* Flatten a string -- down-case and fold diacritics.
*
* @param str a string
*