From 55ffb524dba768ff703764b175902fe78fc46cc8 Mon Sep 17 00:00:00 2001
From: djcb <djcb@djcbsoftware.nl>
Date: Sat, 28 Oct 2017 14:13:09 +0300
Subject: [PATCH] tokenizer: make cleanup unicode-aware

---
 lib/parser/tokenizer.cc | 21 +++------------------
 lib/parser/utils.cc     | 26 ++++++++++++++++++++++++++
 lib/parser/utils.hh     | 11 +++++++++++
 3 files changed, 40 insertions(+), 18 deletions(-)
diff --git a/lib/parser/tokenizer.cc b/lib/parser/tokenizer.cc
index 8c616d9b..d1fc0e97 100644
--- a/lib/parser/tokenizer.cc
+++ b/lib/parser/tokenizer.cc
@@ -18,6 +18,8 @@
 */
 
 #include "tokenizer.hh"
+#include "utils.hh"
+
 #include <cctype>
 #include <iostream>
 #include <algorithm>
@@ -113,29 +115,12 @@ eat_token (std::string& food, size_t& pos)
 }
 
 
-static std::string
-cleanup (const std::string& dirty)
-{
-	auto clean = dirty;
-
-	// only accept spc as whitespace
-	for (auto f = clean.begin(); f != clean.end(); ++f)
-		if (*f < ' ')
-			*f = ' ';
-
-	clean.erase (0, clean.find_first_not_of(" "));
-	clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space
-
-	return clean;
-}
-
-
 Mux::Tokens
 Mux::tokenize (const std::string& s)
 {
 	Tokens tokens{};
 
-	std::string food = cleanup(s);
+	std::string food = utf8_clean(s);
 	size_t pos{0};
 
 	if (s.empty())
diff --git a/lib/parser/utils.cc b/lib/parser/utils.cc
index 105c0f58..789ba1a5 100644
--- a/lib/parser/utils.cc
+++ b/lib/parser/utils.cc
@@ -110,6 +110,32 @@ Mux::utf8_flatten (const std::string& str)
 }
 
 
+std::string
+Mux::utf8_clean (const std::string& dirty)
+{
+	// Control chars and invalid UTF-8 bytes become spaces; result is trimmed.
+	GString *gstr = g_string_sized_new (dirty.length());
+	const char *const end = dirty.c_str() + dirty.length();
+	for (const char *cur = dirty.c_str(); cur < end && *cur; ) {
+		// validated decode yields (gunichar)-1/-2 for invalid/truncated input
+		const gunichar uc = g_utf8_get_char_validated (cur, end - cur);
+		const bool bad = (uc == (gunichar)-1 || uc == (gunichar)-2);
+		if (bad || g_unichar_iscntrl (uc))
+			g_string_append_c (gstr, ' ');
+		else
+			g_string_append_unichar (gstr, uc);
+		// skip a single bad byte so we never step past the terminator
+		cur = bad ? cur + 1 : g_utf8_next_char (cur);
+	}
+
+	std::string clean(gstr->str, gstr->len);
+	g_string_free (gstr, TRUE);
+	clean.erase (0, clean.find_first_not_of(" "));
+	clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space
+	return clean;
+}
+
+
 std::vector<std::string>
 Mux::split (const std::string& str, const std::string& sepa)
 {
diff --git a/lib/parser/utils.hh b/lib/parser/utils.hh
index f9389f9e..28494c15 100644
--- a/lib/parser/utils.hh
+++ b/lib/parser/utils.hh
@@ -34,6 +34,17 @@ namespace Mux {
  */
 std::string utf8_flatten (const std::string& str);
 
+
+/**
+ * Replace Unicode control characters with spaces; trim leading/trailing spaces.
+ *
+ * @param dirty an unclean UTF-8 string
+ *
+ * @return a cleaned-up copy of the string.
+ */
+std::string utf8_clean (const std::string& dirty);
+
+
 /**
  * Split a string in parts
  *