lib: implement new query parser
mu's query parser is the piece of software that turns your queries
into something the Xapian database can understand. So, if you query
"maildir:/inbox and subject:bla" this must be translated into a
Xapian::Query object which will retrieve the sought after messages.
Since mu's beginning, almost a decade ago, this parser was based on
Xapian's default Xapian::QueryParser. It works okay, but wasn't really
designed for the mu use-case, and had a bit of trouble with anything
that's not A..Z (think: spaces, special characters, unicode etc.).
Over the years, mu added quite a bit of pre-processing trickery to
deal with that. Still, there were corner cases and bugs that were
practically unfixable.
The solution to all of this is to have a custom query processor that
replaces Xapian's, and write it from the ground up to deal with the
special characters etc. I wrote one, as part of my "future, post-1.0
mu" research project, and I have now backported it to mu 0.9.19.
From a technical perspective, this is a major cleanup, and allows us
to get rid of much of the fragile preprocessing both for indexing and
querying. From an end-user perspective this (hopefully) means that
many of the little parsing issues are gone, and it opens the way for
some new features.
From an end-user perspective:
- better support for special characters.
- regexp search! yes, you can now search for regular expressions, e.g.
subject:/h.ll?o/
will find subjects with hallo, hello, halo, philosophy, ...
As you can imagine, this can be a _heavy_ operation on the database,
and might take quite a bit longer than a normal query; but it can be
quite useful.
This commit is contained in:
346
lib/parser/parser.cc
Normal file
346
lib/parser/parser.cc
Normal file
@ -0,0 +1,346 @@
|
||||
/*
|
||||
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This library is free software; you can redistribute it and/or
|
||||
** modify it under the terms of the GNU Lesser General Public License
|
||||
** as published by the Free Software Foundation; either version 2.1
|
||||
** of the License, or (at your option) any later version.
|
||||
**
|
||||
** This library is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
** Lesser General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU Lesser General Public
|
||||
** License along with this library; if not, write to the Free
|
||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
** 02110-1301, USA.
|
||||
*/
|
||||
#include "parser.hh"
|
||||
#include "tokenizer.hh"
|
||||
#include "utils.hh"
|
||||
|
||||
using namespace Mux;
|
||||
|
||||
// 3 precedence levels: units (NOT,()) > factors (AND) > terms (OR|XOR)
|
||||
|
||||
// query -> <term-1> | ε
|
||||
// <term-1> -> <factor-1> <term-2> | ε
|
||||
// <term-2> -> OR|XOR <term-1> | ε
|
||||
// <factor-1> -> <unit> <factor-2> | ε
|
||||
// <factor-2> -> [AND]|AND NOT <factor-1> | ε
|
||||
// <unit> -> NOT <unit> | ( <term-1> ) | <data>
|
||||
// <data> -> <value> | <range> | <regex>
|
||||
// <value> -> [field:]value
|
||||
// <range> -> [field:][lower]..[upper]
|
||||
// <regex> -> [field:]/regex/
|
||||
|
||||
|
||||
// BUG(...): expands to a std::runtime_error (it constructs the exception, it
// does not throw it) whose message is the current source line (__LINE__)
// followed by ": BUG: " and the format()-ed arguments. Used in this file as
// `throw BUG("...")`.
#define BUG(...) std::runtime_error (format("%u: BUG: ",__LINE__) \
|
||||
+ format(__VA_ARGS__))
|
||||
|
||||
// Return a copy of the next token without consuming it.
//
// front() is called unconditionally, so callers are expected to have
// verified that `tokens` is not empty before peeking.
static Token
look_ahead (const Mux::Tokens& tokens)
{
	const auto& next = tokens.front();
	return next;
}
|
||||
|
||||
// Build a tree whose only node has type Node::Type::Empty; the parser
// functions below return it when there is nothing (more) to parse.
static Mux::Tree
empty()
{
	Mux::Tree nothing{{Node::Type::Empty}};
	return nothing;
}
|
||||
|
||||
static Mux::Tree term_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings);
|
||||
|
||||
|
||||
// Turn a plain value into a tree.
//
// @param fields   the field(s) the value applies to; must not be empty
// @param v        the raw value; it is passed through utf8_flatten() first
// @param pos      position of the token in the query (currently unused here)
// @param proc     processor used to post-process the value per field
// @param warnings warning sink (currently unused here)
//
// @return a single Value leaf when there is exactly one field, otherwise an
//         OpOr node with one Value leaf per field (a 'multi-field' such as
//         "recip:").
// @throws std::runtime_error (via BUG) when `fields` is empty.
static Mux::Tree
value (const ProcIface::FieldInfoVec& fields, const std::string& v,
       size_t pos, ProcPtr proc, WarningVec& warnings)
{
	auto flat = utf8_flatten(v);

	if (fields.empty())
		throw BUG("expected one or more fields");

	// build the Value leaf for one field-info entry
	const auto leaf = [&](const auto& info) {
		return Tree({Node::Type::Value,
			     std::make_unique<Value>(
				     info.field, info.prefix, info.id,
				     proc->process_value(info.field, flat))});
	};

	if (fields.size() > 1) {
		// several fields: any of them may match
		Tree any_of(Node{Node::Type::OpOr});
		for (const auto& info: fields)
			any_of.add_child (leaf(info));
		return any_of;
	}

	return leaf(fields.front());
}
|
||||
|
||||
// Turn a "/regex/" value into a tree.
//
// The first and last character of `v` (the slashes) are stripped and the rest
// is passed through utf8_flatten() and compiled as a std::regex. Every term
// returned by proc->process_regex() for every field becomes a Value leaf
// (with an empty prefix) below a single OpOr node.
//
// If anything inside the try-block throws, a warning "invalid regexp" is
// recorded at `pos` and the input is handled as a plain value instead.
//
// @throws std::runtime_error (via BUG) when `v` is shorter than 2 characters.
static Mux::Tree
regex (const ProcIface::FieldInfoVec& fields, const std::string& v,
       size_t pos, ProcPtr proc, WarningVec& warnings)
{
	if (v.length() < 2)
		throw BUG("expected regexp, got '%s'", v.c_str());

	// drop the enclosing '/' characters
	const auto pattern = utf8_flatten(v.substr(1, v.length()-2));

	try {
		Tree matches(Node{Node::Type::OpOr});
		const std::regex matcher (pattern);

		for (const auto& info: fields)
			for (const auto& term: proc->process_regex (info.field, matcher))
				matches.add_child (Tree(
					{Node::Type::Value,
					 std::make_unique<Value>(info.field, "",
								 info.id, term)}));
		return matches;

	} catch (...) {
		// NOTE(review): this catches every exception, not just
		// std::regex_error, and reports all of them as "invalid regexp".
		warnings.push_back ({pos, "invalid regexp"});
		return value (fields, v, pos, proc, warnings);
	}
}
|
||||
|
||||
|
||||
|
||||
// Turn "lower..upper" into a tree.
//
// Only the first entry of `fields` is considered. When the processor does not
// treat that field as a range field, the text "lower..upper" is handled as a
// plain value instead. Otherwise the bounds are processed and, if they come
// out in the wrong order, processed again with the arguments swapped.
//
// @throws std::runtime_error (via BUG) when `fields` is empty.
static Mux::Tree
range (const ProcIface::FieldInfoVec& fields, const std::string& lower,
       const std::string& upper, size_t pos, ProcPtr proc,
       WarningVec& warnings)
{
	if (fields.empty())
		throw BUG("expected field");

	const auto& first = fields.front();

	if (proc->is_range_field(first.field)) {
		auto bounds = proc->process_range (first.field, lower, upper);
		if (bounds.lower > bounds.upper) // reversed? try the other way around
			bounds = proc->process_range (first.field, upper, lower);

		return Tree({{Node::Type::Range},
			     std::make_unique<Range>(first.field, first.prefix,
						     first.id,
						     bounds.lower, bounds.upper)});
	}

	// not a range field; treat the whole thing as a value
	return value (fields, lower + ".." + upper, pos, proc, warnings);
}
|
||||
|
||||
|
||||
// Parse a <data> element: [field:]value, [field:]lower..upper or
// [field:]/regex/.
//
// The next token is always consumed; when it is not a Data token a warning
// "expected: value" is recorded and its text is still interpreted as data.
// Callers must ensure `tokens` is not empty.
//
// The token is split at its first ':' into field and value, unless the ':' is
// the first or last character (then the whole token is the value). An unknown
// field yields a warning and the complete "field:value" text is searched as a
// value in the default field(s).
static Mux::Tree
data (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	const auto tok = look_ahead(tokens);
	if (tok.type != Token::Type::Data)
		warnings.push_back ({tok.pos, "expected: value"});

	tokens.pop_front();

	std::string field, val;
	const auto colon     = tok.str.find (":");
	const bool has_field = colon != std::string::npos &&
		colon != 0 && colon != tok.str.length()-1;
	if (has_field) {
		field = tok.str.substr(0, colon);
		val   = tok.str.substr(colon + 1);
	} else
		val = tok.str;

	auto fields = proc->process_field (field);
	if (fields.empty()) { // not a valid field...
		warnings.push_back ({tok.pos, format ("invalid field '%s'", field.c_str())});
		fields = proc->process_field ("");
		// fallback, treat the whole of foo:bar as a value
		return value (fields, field + ":" + val, tok.pos, proc, warnings);
	}

	// does it look like a regexp, or like a wildcard?
	if (val.length() >= 2) {
		const bool slashed = val.front() == '/' && val.back() == '/';
		if (slashed)
			return regex (fields, val, tok.pos, proc, warnings);
		if (val.back() == '*') // transform wildcard into regexp
			return regex (fields,
				      "/" + val.substr(0, val.length()-1) + ".*/",
				      tok.pos, proc, warnings);
	}

	// does it look like a range?
	const auto dots = val.find("..");
	if (dots != std::string::npos)
		return range(fields, val.substr(0, dots), val.substr(dots + 2),
			     tok.pos, proc, warnings);

	// if nothing else, it's a value.
	return value (fields, val, tok.pos, proc, warnings);
}
|
||||
|
||||
// Parse a <unit>: a negated unit, a parenthesized sub-expression, or data.
//
// - no tokens left: records the warning "expected: unit" (at position 0)
//   and returns an empty tree;
// - NOT: consumes it and returns an OpNot node wrapping the following unit;
// - '(': consumes it, parses a <term-1> and then expects ')'; a missing or
//   unexpected closing token only yields a warning, the sub-tree is returned
//   regardless;
// - anything else is handed to data().
static Mux::Tree
unit (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	if (tokens.empty()) {
		warnings.push_back ({0, "expected: unit"});
		return empty();
	}

	const auto token = look_ahead (tokens);

	switch (token.type) {
	case Token::Type::Not: {
		tokens.pop_front();
		Tree negated{{Node::Type::OpNot}};
		negated.add_child(unit (tokens, proc, warnings));
		return negated;
	}
	case Token::Type::Open: {
		tokens.pop_front();
		auto sub = term_1 (tokens, proc, warnings);

		if (tokens.empty()) {
			warnings.push_back({token.pos, "expected: ')'"});
			return sub;
		}

		const auto closer = look_ahead(tokens);
		if (closer.type != Token::Type::Close)
			warnings.push_back(
				{closer.pos,
				 std::string("expected: ')' but got ") +
				 closer.str});
		else
			tokens.pop_front();

		return sub;
	}
	default:
		break;
	}

	return data (tokens, proc, warnings);
}
|
||||
|
||||
static Mux::Tree factor_1 (Mux::Tokens& tokens, ProcPtr proc,
|
||||
WarningVec& warnings);
|
||||
|
||||
// Parse <factor-2>: the optional right-hand side of an AND-level expression.
//
// @param tokens   remaining tokens; operator tokens that are recognized are
//                 consumed
// @param op       out-parameter; set to OpAnd or OpAndNot when a right-hand
//                 side is started, left untouched otherwise
// @param proc     processor handed on to the sub-parsers
// @param warnings warning sink handed on to the sub-parsers
//
// @return an empty tree when there are no tokens left or the next token
//         cannot start a right-hand side; otherwise whatever factor_1()
//         produces for the remaining tokens.
//
// Recognized starts:
//   AND        -> OpAnd
//   AND NOT    -> OpAndNot
//   '(' / data -> OpAnd     (implicit AND; the token is not consumed)
//   NOT        -> OpAndNot  (implicit AND NOT)
static Mux::Tree
factor_2 (Mux::Tokens& tokens, Node::Type& op, ProcPtr proc,
	  WarningVec& warnings)
{
	if (tokens.empty())
		return empty();

	const auto token = look_ahead(tokens);

	switch (token.type) {
	case Token::Type::And: {
		tokens.pop_front();
		// The AND may have been the very last token (e.g. "foo and"),
		// so only peek at the next one if there is one; the missing
		// operand is then reported by unit() via factor_1() below.
		if (!tokens.empty() &&
		    look_ahead(tokens).type == Token::Type::Not) { // AND NOT is a unit
			tokens.pop_front();
			op = Node::Type::OpAndNot;
		} else
			op = Node::Type::OpAnd;
	} break;
	case Token::Type::Open:
	case Token::Type::Data:
		op = Node::Type::OpAnd; // implicit AND
		break;
	case Token::Type::Not:
		tokens.pop_front();
		op = Node::Type::OpAndNot; // implicit AND NOT
		break;

	default:
		return empty();
	}

	return factor_1 (tokens, proc, warnings);
}
|
||||
|
||||
// Parse <factor-1>: a unit, optionally followed by an AND / AND NOT
// right-hand side.
//
// When factor_2() yields an empty tree, the unit is returned as-is;
// otherwise both become the children (left, right) of a node whose type is
// the operator reported by factor_2().
static Mux::Tree
factor_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	Node::Type op { Node::Type::Invalid };

	auto lhs = unit (tokens, proc, warnings);
	auto rhs = factor_2 (tokens, op, proc, warnings);

	if (!rhs.empty()) {
		Tree combined {{op}};
		combined.add_child(std::move(lhs));
		combined.add_child(std::move(rhs));
		return combined;
	}

	return lhs;
}
|
||||
|
||||
|
||||
// Parse <term-2>: the optional right-hand side of an OR / XOR expression.
//
// @param op out-parameter; set to OpOr or OpXor when the next token is OR or
//           XOR, left untouched otherwise
//
// @return an empty tree when no tokens are left or the next token is neither
//         OR nor XOR (a warning "expected OR|XOR" is recorded unless that
//         token is a ')'); otherwise the operator is consumed and the result
//         of term_1() on the remaining tokens is returned.
static Mux::Tree
term_2 (Mux::Tokens& tokens, Node::Type& op, ProcPtr proc,
	WarningVec& warnings)
{
	if (tokens.empty())
		return empty();

	const auto token = look_ahead (tokens);

	if (token.type == Token::Type::Or)
		op = Node::Type::OpOr;
	else if (token.type == Token::Type::Xor)
		op = Node::Type::OpXor;
	else {
		if (token.type != Token::Type::Close)
			warnings.push_back({token.pos, "expected OR|XOR"});
		return empty();
	}

	tokens.pop_front(); // consume the operator

	return term_1 (tokens, proc, warnings);
}
|
||||
|
||||
// Parse <term-1>: a factor, optionally followed by an OR / XOR right-hand
// side.
//
// When term_2() yields an empty tree, the factor is returned as-is;
// otherwise both become the children (left, right) of a node whose type is
// the operator reported by term_2().
static Mux::Tree
term_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	Node::Type op { Node::Type::Invalid };

	auto lhs = factor_1 (tokens, proc, warnings);
	auto rhs = term_2 (tokens, op, proc, warnings);

	if (!rhs.empty()) {
		Tree combined {{op}};
		combined.add_child(std::move(lhs));
		combined.add_child(std::move(rhs));
		return combined;
	}

	return lhs;
}
|
||||
|
||||
// Parse a complete query: an empty tree when there are no tokens at all,
// otherwise the result of term_1().
static Mux::Tree
query (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	return tokens.empty() ? empty () : term_1 (tokens, proc, warnings);
}
|
||||
|
||||
// Parse the query expression `expr` into a tree.
//
// @param expr     the query string; it is split into tokens with tokenize()
// @param warnings receives the warnings recorded while parsing
// @param proc     processor used for fields, values, ranges and regexps
//
// @return the parsed tree. If a std::runtime_error is thrown during
//         tokenizing or parsing, its message is written to std::cerr and an
//         empty tree is returned instead; other exception types propagate.
Mux::Tree
Mux::parse (const std::string& expr, WarningVec& warnings, ProcPtr proc)
{
	try {
		auto toks = tokenize (expr);
		return query (toks, proc, warnings);

	} catch (const std::runtime_error& err) {
		std::cerr << err.what() << std::endl;
	}

	return empty();
}
|
||||
Reference in New Issue
Block a user