tokenizer: clean unicode-aware

This commit is contained in:
djcb
2017-10-28 14:13:09 +03:00
parent 0e5e8b6bce
commit 55ffb524db
3 changed files with 40 additions and 18 deletions

View File

@ -18,6 +18,8 @@
*/
#include "tokenizer.hh"
#include "utils.hh"
#include <cctype>
#include <iostream>
#include <algorithm>
@ -113,29 +115,12 @@ eat_token (std::string& food, size_t& pos)
}
static std::string
cleanup (const std::string& dirty)
{
	auto clean = dirty;
	// Only accept SPC as whitespace: map control bytes (0x00-0x1F) to ' '.
	// Compare through unsigned char — with a plain (signed) char, UTF-8
	// lead/continuation bytes (>= 0x80) are negative and would wrongly
	// test as < ' ', destroying all multibyte characters.
	for (auto f = clean.begin(); f != clean.end(); ++f)
		if (static_cast<unsigned char>(*f) < ' ')
			*f = ' ';
	// trim leading and trailing spaces; an all-space string becomes ""
	// (find_first_not_of -> npos erases everything; npos + 1 == 0 for
	// the trailing erase, which likewise empties the string)
	clean.erase (0, clean.find_first_not_of(" "));
	clean.erase (clean.find_last_not_of(" ") + 1); // remove trailing space
	return clean;
}
Mux::Tokens
Mux::tokenize (const std::string& s)
{
Tokens tokens{};
std::string food = cleanup(s);
std::string food = utf8_clean(s);
size_t pos{0};
if (s.empty())