From 55ffb524dba768ff703764b175902fe78fc46cc8 Mon Sep 17 00:00:00 2001 From: djcb Date: Sat, 28 Oct 2017 14:13:09 +0300 Subject: [PATCH] tokenizer: clean unicode-aware --- lib/parser/tokenizer.cc | 21 +++------------------ lib/parser/utils.cc | 26 ++++++++++++++++++++++++++ lib/parser/utils.hh | 11 +++++++++++ 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/lib/parser/tokenizer.cc b/lib/parser/tokenizer.cc index 8c616d9b..d1fc0e97 100644 --- a/lib/parser/tokenizer.cc +++ b/lib/parser/tokenizer.cc @@ -18,6 +18,8 @@ */ #include "tokenizer.hh" +#include "utils.hh" + #include #include #include @@ -113,29 +115,12 @@ eat_token (std::string& food, size_t& pos) } -static std::string -cleanup (const std::string& dirty) -{ - auto clean = dirty; - - // only accept spc as whitespace - for (auto f = clean.begin(); f != clean.end(); ++f) - if (*f < ' ') - *f = ' '; - - clean.erase (0, clean.find_first_not_of(" ")); - clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space - - return clean; -} - - Mux::Tokens Mux::tokenize (const std::string& s) { Tokens tokens{}; - std::string food = cleanup(s); + std::string food = utf8_clean(s); size_t pos{0}; if (s.empty()) diff --git a/lib/parser/utils.cc b/lib/parser/utils.cc index 105c0f58..789ba1a5 100644 --- a/lib/parser/utils.cc +++ b/lib/parser/utils.cc @@ -110,6 +110,32 @@ Mux::utf8_flatten (const std::string& str) } +std::string +Mux::utf8_clean (const std::string& dirty) +{ + GString *gstr = g_string_sized_new (dirty.length()); + + for (auto cur = dirty.c_str(); cur && *cur; cur = g_utf8_next_char (cur)) { + + const gunichar uc = g_utf8_get_char (cur); + if (g_unichar_iscntrl (uc)) + g_string_append_c (gstr, ' '); + else + g_string_append_unichar (gstr, uc); + } + + std::string clean(gstr->str, gstr->len); + g_string_free (gstr, TRUE); + + clean.erase (0, clean.find_first_not_of(" ")); + clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space + + return clean; +} + + + + std::vector Mux::split (const std::string& str, const std::string& sepa) { diff --git a/lib/parser/utils.hh b/lib/parser/utils.hh index f9389f9e..28494c15 100644 --- a/lib/parser/utils.hh +++ b/lib/parser/utils.hh @@ -34,6 +34,17 @@ namespace Mux { */ std::string utf8_flatten (const std::string& str); + +/** + * Replace all control characters with spaces, and remove leading and trailing space. + * + * @param dirty an unclean string + * + * @return a cleaned-up string. + */ +std::string utf8_clean (const std::string& dirty); + + /** * Split a string in parts *