tokenizer: clean unicode-aware

2017-10-28 14:13:09 +03:00
parent 0e5e8b6bce
commit 55ffb524db
3 changed files with 40 additions and 18 deletions
--- a/lib/parser/tokenizer.cc
+++ b/lib/parser/tokenizer.cc
@ -18,6 +18,8 @@
 */
 #include "tokenizer.hh"
 #include "utils.hh"
 #include <cctype>
 #include <iostream>
 #include <algorithm>
@ -113,29 +115,12 @@ eat_token (std::string& food, size_t& pos)
 }
 /**
  * Normalize whitespace in a string: every byte below 0x20 (tab, newline and
  * other ASCII control codes) becomes a plain space, then leading and trailing
  * spaces are stripped.
  *
  * @param dirty an unclean string
  *
  * @return a cleaned-up copy; empty when the input holds nothing but
  * whitespace / control bytes.
  */
 static std::string
 cleanup (const std::string& dirty)
 {
 	auto clean = dirty;
 	// only accept spc as whitespace. Compare as unsigned char: where plain
 	// char is signed, bytes >= 0x80 (i.e. every byte of a multi-byte UTF-8
 	// sequence) are negative and would wrongly be blanked out as well.
 	for (auto& c : clean)
 		if (static_cast<unsigned char>(c) < ' ')
 			c = ' ';
 	clean.erase (0, clean.find_first_not_of(' ')); // remove leading space
 	clean.erase (clean.find_last_not_of(' ') + 1); // remove trailing space
 	return clean;
 }
 Mux::Tokens
 Mux::tokenize (const std::string& s)
 {
 	Tokens tokens{};
-	std::string food = cleanup(s);
+	std::string food = utf8_clean(s);
 	size_t pos{0};
 	if (s.empty())
--- a/lib/parser/utils.cc
+++ b/lib/parser/utils.cc
@ -110,6 +110,32 @@ Mux::utf8_flatten (const std::string& str)
 }
 /**
  * Replace all control characters in a UTF-8 string with spaces, and remove
  * leading and trailing spaces. Bytes that do not form a valid UTF-8 sequence
  * are replaced with a space too, so malformed input cannot make the scan run
  * past the end of the buffer.
  *
  * @param dirty an unclean string; it is processed up to its first NUL byte,
  * if there is one.
  *
  * @return a cleaned-up string.
  */
 std::string
 Mux::utf8_clean (const std::string& dirty)
 {
 	GString *gstr = g_string_sized_new (dirty.length());
 	const char *cur = dirty.c_str();
 	const char *const end = cur + dirty.length();
 	while (cur < end && *cur) {
 		// validated decode, bounded by what is left of the buffer; yields
 		// (gunichar)-1 for a malformed and (gunichar)-2 for a truncated
 		// sequence instead of invoking undefined behavior.
 		const gunichar uc = g_utf8_get_char_validated (cur, end - cur);
 		if (uc == static_cast<gunichar>(-1) ||
 		    uc == static_cast<gunichar>(-2)) {
 			g_string_append_c (gstr, ' ');
 			++cur; // resynchronize at the very next byte
 			continue;
 		}
 		if (g_unichar_iscntrl (uc))
 			g_string_append_c (gstr, ' ');
 		else
 			g_string_append_unichar (gstr, uc);
 		// safe here: the sequence at cur was just validated, so the
 		// unchecked skip cannot jump beyond end.
 		cur = g_utf8_next_char (cur);
 	}
 	std::string clean(gstr->str, gstr->len);
 	g_string_free (gstr, TRUE);
 	clean.erase (0, clean.find_first_not_of(" ")); // remove leading space
 	clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space
 	return clean;
 }
 std::vector<std::string>
 Mux::split (const std::string& str, const std::string& sepa)
 {
--- a/lib/parser/utils.hh
+++ b/lib/parser/utils.hh
@ -34,6 +34,17 @@ namespace Mux {
 */
 std::string utf8_flatten (const std::string& str);
 /**
 * Replace all control characters in a string with spaces, and remove leading
 * and trailing spaces. The input is decoded as UTF-8 and scanned up to its
 * first NUL byte.
 *
 * @param dirty an unclean string
 *
 * @return a cleaned-up string.
 */
 std::string utf8_clean (const std::string& dirty);
 /**
 * Split a string in parts
 *