lib: implement new query parser
mu's query parser is the piece of software that turns your queries
into something the Xapian database can understand. So, if you query
"maildir:/inbox and subject:bla" this must be translated into a
Xapian::Query object which will retrieve the sought after messages.
Since mu's beginning, almost a decade ago, this parser was based on
Xapian's default Xapian::QueryParser. It worked okay, but wasn't really
designed for the mu use-case, and had a bit of trouble with anything
that's not A..Z (think: spaces, special characters, unicode etc.).
Over the years, mu added quite a bit of pre-processing trickery to
deal with that. Still, there were corner cases and bugs that were
practically unfixable.
The solution to all of this is to have a custom query processor that
replaces Xapian's, and write it from the ground up to deal with the
special characters etc. I wrote one, as part of my "future, post-1.0
mu" research project, and I have now backported it to the mu 0.9.19.
From a technical perspective, this is a major cleanup, and allows us
to get rid of much of the fragile preprocessing both for indexing and
querying. From an end-user perspective this (hopefully) means that
many of the little parsing issues are gone, and it opens the way for
some new features.
From an end-user perspective:
- better support for special characters.
- regexp search! yes, you can now search for regular expressions, e.g.
subject:/h.ll?o/
will find subjects with hallo, hello, halo, philosophy, ...
As you can imagine, this can be a _heavy_ operation on the database,
and might take quite a bit longer than a normal query; but it can be
quite useful.
This commit is contained in:
128
lib/parser/tokenizer.cc
Normal file
128
lib/parser/tokenizer.cc
Normal file
@ -0,0 +1,128 @@
|
||||
/*
|
||||
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This library is free software; you can redistribute it and/or
|
||||
** modify it under the terms of the GNU Lesser General Public License
|
||||
** as published by the Free Software Foundation; either version 2.1
|
||||
** of the License, or (at your option) any later version.
|
||||
**
|
||||
** This library is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
** Lesser General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU Lesser General Public
|
||||
** License along with this library; if not, write to the Free
|
||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
** 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#include "tokenizer.hh"
|
||||
#include <cctype>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
using namespace Mux;
|
||||
|
||||
/**
 * Is @c c a token separator? Separators are blanks (space, tab) and
 * the characters ':', '(', ')' and '"', which carry meaning in the
 * query language.
 *
 * @param c a character from the input
 *
 * @return true if @c c separates tokens, false otherwise
 */
static bool
is_separator (char c)
{
	// cast to unsigned char first: passing a negative char (e.g. a
	// byte from a UTF-8 multi-byte sequence) to isblank() is
	// undefined behavior.
	if (isblank(static_cast<unsigned char>(c)))
		return true;

	const std::string seps{":()\""};
	return seps.find(c) != std::string::npos;
}
|
||||
|
||||
|
||||
/**
 * Turn a bare word into either an operator token ("and", "or",
 * "xor", "not", matched case-insensitively) or, failing that, a
 * plain data token. The original-case string is kept as the token's
 * value.
 *
 * @param pos position (in the input) associated with the token
 * @param val the raw word
 *
 * @return the corresponding Token
 */
static Mux::Token
op_or_value (size_t pos, const std::string& val)
{
	auto s = val;
	// cast to unsigned char before tolower(): passing a negative
	// char (e.g. a UTF-8 continuation byte) is undefined behavior.
	std::transform(s.begin(), s.end(), s.begin(),
		       [](unsigned char c) {
			       return static_cast<char>(::tolower(c));
		       });

	if (s == "and")
		return Token{pos, Token::Type::And, val};
	else if (s == "or")
		return Token{pos, Token::Type::Or, val};
	else if (s == "xor")
		return Token{pos, Token::Type::Xor, val};
	else if (s == "not")
		return Token{pos, Token::Type::Not, val};
	else
		return Token{pos, Token::Type::Data, val};
}
|
||||
|
||||
/**
 * Push a single character back onto the front of the remaining
 * input, rewinding the position counter by one.
 *
 * @param food the remaining input; @c kar is prepended to it
 * @param kar  the character to push back
 * @param pos  the current position, decremented by one
 */
static void
unread_char (std::string& food, char kar, size_t& pos)
{
	food.insert (food.begin(), kar);
	--pos;
}
|
||||
|
||||
/**
 * Consume one token from the front of @c food. Characters are eaten
 * one at a time; a small state machine tracks whether we are inside
 * a double-quoted phrase (@c quoted) and whether the previous
 * character was an unconsumed backslash (@c escaped).
 *
 * @param food the remaining input; consumed characters are erased
 *             from its front (and possibly one is pushed back via
 *             unread_char)
 * @param pos  running position counter, advanced per consumed char
 *
 * @return the next Token. Words "and"/"or"/"xor"/"not" become
 *         operator tokens (via op_or_value); '(' / ')' become
 *         Open/Close; everything else is Data.
 */
static Mux::Token
eat_token (std::string& food, size_t& pos)
{
	bool quoted{};
	bool escaped{};
	std::string value {};

	while (!food.empty()) {

		// take the next character and consume it.
		const auto kar = food[0];
		food.erase(0, 1);
		++pos;

		// a backslash toggles the escape state; a first ('odd')
		// backslash is swallowed here, while an escaped backslash
		// falls through and is appended to the value below.
		if (kar == '\\') {
			escaped = !escaped;
			if (escaped)
				continue;
		}

		// an unescaped '"' while inside a quoted phrase closes
		// it: the accumulated phrase becomes one Data token.
		if (kar == '"' && !escaped && quoted)
			return Token{pos, Token::Type::Data, value};

		// separators only have their special meaning outside
		// quotes and when not escaped.
		if (!quoted && !escaped && is_separator(kar)) {

			// a separator (other than ':', which stays part
			// of e.g. "field:value") ends a pending word:
			// push the separator back so the next call sees
			// it, and emit the word.
			if (!value.empty() && kar != ':') {
				unread_char (food, kar, pos);
				return op_or_value(pos, value);
			}

			// an opening '"' starts a quoted phrase.
			if (kar == '"')
				quoted = true;

			// the opening quote itself and any blanks are
			// not part of any token.
			if (quoted || isblank(kar))
				continue;

			// parens are single-character tokens.
			switch (kar) {
			case '(': return {pos, Token::Type::Open, "("};
			case ')': return {pos, Token::Type::Close,")"};
			default: break;
			}
		}

		// ordinary character (or escaped/quoted separator, or
		// ':'): accumulate it; any escape is now spent.
		value += kar;
		escaped = false;
	}

	// input exhausted: whatever accumulated (possibly empty, or an
	// unterminated quoted phrase) is returned as Data.
	return {pos, Token::Type::Data, value};
}
|
||||
|
||||
|
||||
/**
 * Split query string @c s into a sequence of tokens.
 *
 * @param s a query string
 *
 * @return the tokens found in @c s; empty when @c s is empty
 */
Mux::Tokens
Mux::tokenize (const std::string& s)
{
	if (s.empty())
		return {};

	Tokens result{};
	std::string remaining{s};
	size_t offset{0};

	while (!remaining.empty())
		result.emplace_back(eat_token (remaining, offset));

	return result;
}
|
||||
Reference in New Issue
Block a user