tokenizer: clean unicode-aware

2017-10-28 14:13:09 +03:00
parent 0e5e8b6bce
commit 55ffb524db
3 changed files with 40 additions and 18 deletions
--- a/lib/parser/utils.hh
+++ b/lib/parser/utils.hh
@@ -34,6 +34,17 @@ namespace Mux {
 */
 std::string utf8_flatten (const std::string& str);

+
+/**
+ * Replace all control characters with spaces, and remove leading and trailing spaces.
+ *
+ * @param dirty an unclean string
+ *
+ * @return a cleaned-up string.
+ */
+std::string utf8_clean (const std::string& dirty);
+
+
 /**
 * Split a string in parts
 *