lib/parser -> lib/query

And update the names to follow the mu- convention.
This commit is contained in:
Dirk-Jan C. Binnema
2020-02-20 21:53:24 +02:00
parent a132f5c21f
commit 20ce7b7066
17 changed files with 39 additions and 39 deletions

99
lib/query/Makefile.am Normal file
View File

@ -0,0 +1,99 @@
## Copyright (C) 2017-2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software Foundation,
## Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# Shared test rules; TEST_PROGS is only appended to below, so it is presumably
# first defined in gtest.mk -- TODO confirm.
include $(top_srcdir)/gtest.mk
@VALGRIND_CHECK_RULES@
# C++ compiler flags for every target in this directory.
AM_CXXFLAGS= \
-I$(srcdir)/.. \
-I$(top_srcdir)/lib \
$(GLIB_CFLAGS) \
$(XAPIAN_CXXFLAGS) \
$(WARN_CXXFLAGS) \
$(ASAN_CXXFLAGS) \
$(CODE_COVERAGE_CFLAGS) \
-Wno-inline \
-Wno-switch-enum
# Preprocessor flags for every target in this directory.
AM_CPPFLAGS= \
$(CODE_COVERAGE_CPPFLAGS)
# Linker flags for every target in this directory.
AM_LDFLAGS= \
$(ASAN_LDFLAGS) \
$(WARN_LDFLAGS)
# Helper programs; built, but not installed.
noinst_PROGRAMS= \
tokenize \
parse
# The query library (tokenizer, parser, Xapian query generation); a
# convenience library that is not installed.
noinst_LTLIBRARIES= \
libmu-query.la
libmu_query_la_SOURCES= \
mu-data.hh \
mu-parser.cc \
mu-parser.hh \
mu-proc-iface.hh \
mu-tokenizer.cc \
mu-tokenizer.hh \
mu-tree.hh \
mu-xapian.cc \
mu-xapian.hh
libmu_query_la_LIBADD= \
$(WARN_LDFLAGS) \
$(GLIB_LIBS) \
$(XAPIAN_LIBS) \
../utils/libmu-utils.la \
$(CODE_COVERAGE_LIBS)
# Suppressions file used for the valgrind checks.
VALGRIND_SUPPRESSIONS_FILES= \
${top_srcdir}/mu.supp
# 'tokenize' helper program.
tokenize_SOURCES= \
tokenize.cc
tokenize_LDADD= \
$(WARN_LDFLAGS) \
libmu-query.la
# 'parse' helper program.
parse_SOURCES= \
parse.cc
parse_LDADD= \
$(WARN_LDFLAGS) \
libmu-query.la
# The test programs are built like the other non-installed programs.
noinst_PROGRAMS+=$(TEST_PROGS)
# Tokenizer unit test.
TEST_PROGS+= \
test-tokenizer
test_tokenizer_SOURCES= \
test-tokenizer.cc
test_tokenizer_LDADD= \
libmu-query.la
# Parser unit test.
TEST_PROGS+= \
test-parser
test_parser_SOURCES= \
test-parser.cc
test_parser_LDADD= \
libmu-query.la
# Every test program is run by 'make check'.
TESTS=$(TEST_PROGS)
include $(top_srcdir)/aminclude_static.am

155
lib/query/mu-data.hh Normal file
View File

@ -0,0 +1,155 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef __DATA_HH__
#define __DATA_HH__
#include <string>
#include <iostream>
#include <regex>
#include <utils//mu-utils.hh>
namespace Mu {
/**
 * Base type for the data carried by a parse-tree node. A data item is either
 * a Value or a Range; a Value may still hold a regular expression, but that is
 * not a separate type here.
 */
struct Data {
	/** The kind of data item. */
	enum class Type { Value, Range };

	virtual ~Data() = default;

	Type        type;   /**< type of data */
	std::string field;  /**< full name of the field */
	std::string prefix; /**< Xapian prefix for the field */
	unsigned    id;     /**< Xapian value number for the field */

protected:
	/**
	 * Construct a data item; protected, so only subclasses can do so.
	 *
	 * @param _type the type of data
	 * @param _field the field
	 * @param _prefix the xapian prefix
	 * @param _id xapian value number
	 */
	Data (Type _type, const std::string& _field, const std::string& _prefix,
	      unsigned _id):
		type(_type),
		field(_field),
		prefix(_prefix),
		id(_id)
	{}
};
/**
 * Write a human-readable name for a data type to a stream.
 *
 * @param os an output stream
 * @param t a data type
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<< (std::ostream& os, Data::Type t)
{
	switch (t) {
	case Data::Type::Value: return os << "value";
	case Data::Type::Range: return os << "range";
	default:                return os << "bug";
	}
}
/**
 * Range type -- [a..b]
 */
struct Range: public Data {
	std::string lower; /**< lower bound */
	std::string upper; /**< upper bound */

	/**
	 * Construct a range
	 *
	 * @param _field the field
	 * @param _prefix the xapian prefix
	 * @param _id xapian value number
	 * @param _lower lower bound
	 * @param _upper upper bound
	 */
	Range (const std::string& _field, const std::string& _prefix,
	       unsigned _id,
	       const std::string& _lower, const std::string& _upper):
		Data(Data::Type::Range, _field, _prefix, _id),
		lower(_lower),
		upper(_upper)
	{}
};
/**
* Basic value
*
*/
struct Value: public Data {
/**
* Construct a Value
*
* @param _field the field
* @param _prefix the xapian prefix
* @param _id xapian value number
* @param _value the value
*/
Value (const std::string& _field, const std::string& _prefix,
unsigned _id, const std::string& _value, bool _phrase = false):
Data(Value::Type::Value, _field, _prefix, _id),
value(_value), phrase(_phrase) {}
std::string value; /**< the value */
bool phrase;
};
/**
* operator<<
*
* @param os an output stream
* @param v a data ptr
*
* @return the updated output stream
*/
inline std::ostream&
operator<< (std::ostream& os, const std::unique_ptr<Data>& v)
{
switch (v->type) {
case Data::Type::Value: {
const auto bval = dynamic_cast<Value*> (v.get());
os << ' ' << quote(v->field) << ' '
<< quote(utf8_flatten(bval->value));
if (bval->phrase)
os << " (ph)";
break;
}
case Data::Type::Range: {
const auto rval = dynamic_cast<Range*> (v.get());
os << ' ' << quote(v->field) << ' '
<< quote(rval->lower) << ' '
<< quote(rval->upper);
break;
}
default:
os << "unexpected type";
break;
}
return os;
}
} // namespace Mu
#endif /* __DATA_HH__ */

344
lib/query/mu-parser.cc Normal file
View File

@ -0,0 +1,344 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include "mu-parser.hh"
#include "mu-tokenizer.hh"
#include "utils/mu-utils.hh"
#include "utils/mu-error.hh"
using namespace Mu;
// 3 precedence levels: units (NOT,()) > factors (AND) > terms (OR, XOR)
// query -> <term-1> | ε
// <term-1> -> <factor-1> <term-2> | ε
// <term-2> -> OR|XOR <term-1> | ε
// <factor-1> -> <unit> <factor-2> | ε
// <factor-2> -> [AND]|AND NOT <factor-1> | ε
// <unit> -> [NOT] <term-1> | ( <term-1> ) | <data>
// <data> -> <value> | <range> | <regex>
// <value> -> [field:]value
// <range> -> [field:][lower]..[upper]
// <regex> -> [field:]/regex/
// Build (but do not throw) a Mu::Error with code Internal; the message is the
// printf-style formatted arguments, prefixed with the source line number and
// "BUG: ". Used as `throw BUG("...")`.
#define BUG(...) Mu::Error (Error::Code::Internal, format("%u: BUG: ",__LINE__) \
+ format(__VA_ARGS__))
// Return a copy of the first token without consuming it. The token list must
// not be empty; every caller in this file checks that beforehand.
static Token
look_ahead (const Mu::Tokens& tokens)
{
	return *tokens.begin();
}
// Create a tree that consists of a single Empty node.
static Mu::Tree
empty()
{
	return Tree{Node{Node::Type::Empty}};
}
static Mu::Tree term_1 (Mu::Tokens& tokens, ProcPtr proc, WarningVec& warnings);
// Build the tree for a plain value. For a single field that is one Value node;
// for a 'multi-field' such as "recip:" it is an OR node with one Value child
// per field. The value is flattened and then passed through the processor for
// each field. The pos and warnings parameters are not used here.
static Mu::Tree
value (const ProcIface::FieldInfoVec& fields, const std::string& v,
       size_t pos, ProcPtr proc, WarningVec& warnings)
{
	auto val = utf8_flatten(v);

	if (fields.empty())
		throw BUG("expected one or more fields");

	// create the Value leaf for one field
	const auto leaf = [&](const ProcIface::FieldInfo& info) {
		return Tree({Node::Type::Value,
			     std::make_unique<Value>(
				     info.field, info.prefix, info.id,
				     proc->process_value(info.field, val),
				     info.supports_phrase)});
	};

	if (fields.size() == 1)
		return leaf(fields.front());

	Tree alternatives(Node{Node::Type::OpOr});
	for (const auto& info: fields)
		alternatives.add_child (leaf(info));

	return alternatives;
}
// Build the tree for a /regex/ value: an OR node with one Value child for
// every term the processor returns for the regex, over all fields; an empty
// tree if there are no such terms. If anything throws while doing so, add an
// "invalid regexp" warning and treat the input as a plain value.
static Mu::Tree
regex (const ProcIface::FieldInfoVec& fields, const std::string& v,
       size_t pos, ProcPtr proc, WarningVec& warnings)
{
	if (v.length() < 2)
		throw BUG("expected regexp, got '%s'", v.c_str());

	// drop the first and the last character (the slashes)
	const auto pattern = utf8_flatten(v.substr(1, v.length()-2));

	try {
		const auto rx = std::regex (pattern);
		Tree alternatives(Node{Node::Type::OpOr});

		for (const auto& info: fields)
			for (const auto& term: proc->process_regex (info.field, rx))
				alternatives.add_child (Tree(
					{Node::Type::Value,
					 std::make_unique<Value>(info.field, "",
								 info.id, term)}));

		if (alternatives.children.empty())
			return empty();

		return alternatives;

	} catch (...) {
		// fallback
		warnings.push_back ({pos, "invalid regexp"});
		return value (fields, v, pos, proc, warnings);
	}
}
// Build the tree for lower..upper on the first of the given fields. If that
// field is not a range field, "lower..upper" is handled as a plain value.
// When the processed lower bound is greater than the upper bound, the bounds
// are processed again in swapped order.
static Mu::Tree
range (const ProcIface::FieldInfoVec& fields, const std::string& lower,
       const std::string& upper, size_t pos, ProcPtr proc,
       WarningVec& warnings)
{
	if (fields.empty())
		throw BUG("expected field");

	const auto& info = fields.front();
	if (!proc->is_range_field(info.field))
		return value (fields, lower + ".." + upper, pos, proc, warnings);

	auto bounds = proc->process_range (info.field, lower, upper);
	if (bounds.lower > bounds.upper)
		bounds = proc->process_range (info.field, upper, lower);

	return Tree({Node::Type::Range,
		     std::make_unique<Range>(info.field, info.prefix, info.id,
					     bounds.lower, bounds.upper)});
}
// Consume one token and turn it into a data tree: a regex, a range or a plain
// value. The token is split into field and value at its first ':' unless that
// colon is the first or the last character. A token that is not of type Data
// gets a warning but is processed all the same. The token list must not be
// empty.
static Mu::Tree
data (Mu::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	const auto token = look_ahead(tokens);
	if (token.type != Token::Type::Data)
		warnings.push_back ({token.pos, "expected: value"});

	tokens.pop_front();

	std::string field;
	std::string val{token.str};

	const auto colon = token.str.find (':');
	const bool has_field = colon != std::string::npos && colon != 0 &&
		colon != token.str.length()-1;
	if (has_field) {
		field = token.str.substr(0, colon);
		val   = token.str.substr(colon + 1);
	}

	auto fields = proc->process_field (field);
	if (fields.empty()) { // not valid field...
		warnings.push_back ({token.pos, format ("invalid field '%s'", field.c_str())});
		fields = proc->process_field ("");
		// fallback, treat the whole of foo:bar as a value
		return value (fields, field + ":" + val, token.pos, proc, warnings);
	}

	// does it look like a regexp?
	if (val.length() >= 2 && val.front() == '/' && val.back() == '/')
		return regex (fields, val, token.pos, proc, warnings);

	// does it look like a range?
	const auto dotdot = val.find("..");
	if (dotdot != std::string::npos)
		return range (fields, val.substr(0, dotdot), val.substr(dotdot + 2),
			      token.pos, proc, warnings);

	// range field without a range - treat as field:val..val
	if (proc->is_range_field(fields.front().field))
		return range (fields, val, val, token.pos, proc, warnings);

	// if nothing else, it's a value.
	return value (fields, val, token.pos, proc, warnings);
}
// Parse a unit: NOT followed by a unit, a parenthesized term, or a data item.
// A missing unit or a missing / unexpected closing parenthesis is reported
// through the warnings.
static Mu::Tree
unit (Mu::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	if (tokens.empty()) {
		warnings.push_back ({0, "expected: unit"});
		return empty();
	}

	const auto token = look_ahead (tokens);
	switch (token.type) {
	case Token::Type::Not: {
		tokens.pop_front();
		Tree negation(Node{Node::Type::OpNot});
		negation.add_child(unit (tokens, proc, warnings));
		return negation;
	}
	case Token::Type::Open: {
		tokens.pop_front();
		auto inner = term_1 (tokens, proc, warnings);

		if (tokens.empty()) {
			// ran out of input; report at the opening parenthesis
			warnings.push_back({token.pos, "expected: ')'"});
			return inner;
		}

		const auto closer = look_ahead(tokens);
		if (closer.type == Token::Type::Close)
			tokens.pop_front();
		else
			warnings.push_back(
				{closer.pos,
				 std::string("expected: ')' but got ") +
				 closer.str});
		return inner;
	}
	default:
		return data (tokens, proc, warnings);
	}
}
static Mu::Tree factor_1 (Mu::Tokens& tokens, ProcPtr proc,
WarningVec& warnings);
// Parse the right-hand side of an AND, if there is one. An explicit AND token
// is consumed; a following '(', data item or NOT counts as an implicit AND and
// is left in place. In both cases op is set to OpAnd and the rest is parsed as
// a factor. Otherwise nothing is consumed and an empty tree is returned.
static Mu::Tree
factor_2 (Mu::Tokens& tokens, Node::Type& op, ProcPtr proc,
	  WarningVec& warnings)
{
	if (tokens.empty())
		return empty();

	const auto type = look_ahead(tokens).type;
	if (type == Token::Type::And)
		tokens.pop_front();
	else if (type != Token::Type::Open &&
		 type != Token::Type::Data &&
		 type != Token::Type::Not)
		return empty();
	// else: implicit AND

	op = Node::Type::OpAnd;

	return factor_1 (tokens, proc, warnings);
}
// Parse a factor: a unit, optionally followed by the right-hand side of an
// AND. Without a right-hand side the unit itself is returned; otherwise an
// operator node with the unit and the right-hand side as children.
static Mu::Tree
factor_1 (Mu::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	Node::Type op { Node::Type::Invalid };

	auto lhs = unit (tokens, proc, warnings);
	auto rhs = factor_2 (tokens, op, proc, warnings);

	if (rhs.empty())
		return lhs;

	Tree combined(Node{op});
	combined.add_child(std::move(lhs));
	combined.add_child(std::move(rhs));

	return combined;
}
// Parse the right-hand side of an OR / XOR, if there is one. On OR or XOR, op
// is set accordingly, the operator is consumed and the rest is parsed as a
// term. Any other token yields an empty tree, with a warning unless the token
// is a closing parenthesis.
static Mu::Tree
term_2 (Mu::Tokens& tokens, Node::Type& op, ProcPtr proc,
	WarningVec& warnings)
{
	if (tokens.empty())
		return empty();

	const auto token = look_ahead (tokens);
	if (token.type == Token::Type::Or)
		op = Node::Type::OpOr;
	else if (token.type == Token::Type::Xor)
		op = Node::Type::OpXor;
	else {
		if (token.type != Token::Type::Close)
			warnings.push_back({token.pos, "expected OR|XOR"});
		return empty();
	}

	tokens.pop_front();

	return term_1 (tokens, proc, warnings);
}
// Parse a term: a factor, optionally followed by the right-hand side of an
// OR / XOR. Without a right-hand side the factor itself is returned; otherwise
// an operator node with the factor and the right-hand side as children.
static Mu::Tree
term_1 (Mu::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	Node::Type op { Node::Type::Invalid };

	auto lhs = factor_1 (tokens, proc, warnings);
	auto rhs = term_2 (tokens, op, proc, warnings);

	if (rhs.empty())
		return lhs;

	Tree combined(Node{op});
	combined.add_child(std::move(lhs));
	combined.add_child(std::move(rhs));

	return combined;
}
// Parse a whole query: an empty tree when there are no tokens, a term
// otherwise.
static Mu::Tree
query (Mu::Tokens& tokens, ProcPtr proc, WarningVec& warnings)
{
	if (!tokens.empty())
		return term_1 (tokens, proc, warnings);

	return empty ();
}
// Tokenize and parse expr. A std::runtime_error raised while doing so is
// written to standard error and results in an empty tree.
Mu::Tree
Mu::parse (const std::string& expr, WarningVec& warnings, ProcPtr proc)
{
	try {
		Tokens tokens{tokenize (expr)};
		return query (tokens, proc, warnings);

	} catch (const std::runtime_error& err) {
		std::cerr << err.what() << std::endl;
	}

	return empty();
}

89
lib/query/mu-parser.hh Normal file
View File

@ -0,0 +1,89 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef __PARSER_HH__
#define __PARSER_HH__
#include <string>
#include <vector>
#include <memory>
#include <query/mu-data.hh>
#include <query/mu-tree.hh>
#include <query/mu-proc-iface.hh>
// A simple recursive-descent parser for queries. Follows the Xapian syntax,
// but better handles non-alphanum; also implements regexp
namespace Mu {
/**
 * A warning produced while parsing: a position in the query string plus a
 * message.
 */
struct Warning {
	size_t            pos; /**< pos in string */
	const std::string msg; /**< warning message */

	/**
	 * Compare two warnings
	 *
	 * @param rhs right-hand side
	 *
	 * @return true if both the position and the message are equal; false
	 * otherwise
	 */
	bool operator==(const Warning& rhs) const {
		return rhs.pos == pos && rhs.msg == msg;
	}
};
/**
 * Write a warning to a stream, as position:message
 *
 * @param os an output stream
 * @param w a warning
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<< (std::ostream& os, const Warning& w)
{
	return os << w.pos << ":" << w.msg;
}
/** Vector of the warnings collected while parsing. */
using WarningVec=std::vector<Warning>;
/** Reference to the processor the parser uses; the parser does not take ownership. */
using ProcPtr = const std::unique_ptr<ProcIface>&;
/**
* Parse a query string
*
* @param query a query string
* @param warnings vec to receive warnings
* @param proc a Processor object; when omitted, a temporary DummyProc is used
*
* @return a parse-tree
*/
Tree parse (const std::string& query, WarningVec& warnings,
ProcPtr proc = std::make_unique<DummyProc>());
} // namespace Mu
#endif /* __PARSER_HH__ */

132
lib/query/mu-proc-iface.hh Normal file
View File

@ -0,0 +1,132 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef __PROC_IFACE_HH__
#define __PROC_IFACE_HH__
#include <string>
#include <vector>
#include <tuple>
#include <regex>
namespace Mu {
/**
 * Interface through which the parser resolves field names and processes
 * values, ranges and regular expressions.
 */
struct ProcIface {

	virtual ~ProcIface() = default;

	/**
	 * Information about one field that a field string maps to.
	 */
	struct FieldInfo {
		const std::string field;           /**< name of the field */
		const std::string prefix;          /**< prefix for the field */
		bool              supports_phrase; /**< passed on as the 'phrase' flag of values */
		unsigned          id;              /**< id of the field */
	};
	using FieldInfoVec = std::vector<FieldInfo>;

	/**
	 * A processed range
	 */
	struct Range {
		std::string lower; /**< lower bound */
		std::string upper; /**< upper bound */
	};

	/**
	 * Get the fields for the given fieldstr, or empty if there are none
	 *
	 * @param field a fieldstr, e.g "subject" or "s" for the subject field
	 *
	 * @return a vector with the "exploded" fields. E.g. "s" might map to
	 * the single subject field, while "recip" could map to the to, cc and
	 * bcc fields.
	 */
	virtual FieldInfoVec process_field (const std::string& field) const = 0;

	/**
	 * Process a value
	 *
	 * @param field a field name
	 * @param value a value
	 *
	 * @return the processed value
	 */
	virtual std::string process_value (
		const std::string& field, const std::string& value) const = 0;

	/**
	 * Is this a range field?
	 *
	 * @param field some field
	 *
	 * @return true if it is a range-field; false otherwise.
	 */
	virtual bool is_range_field (const std::string& field) const = 0;

	/**
	 * Process a range field
	 *
	 * @param field a fieldstr, e.g "date" or "d" for the date field
	 * @param lower lower bound or empty
	 * @param upper upper bound or empty
	 *
	 * @return the processed range
	 */
	virtual Range process_range (const std::string& field, const std::string& lower,
				     const std::string& upper) const = 0;

	/**
	 * Process a regular expression for a field
	 *
	 * @param field a field name
	 * @param rx a regular expression
	 *
	 * @return a vector of strings; the parser creates one value for each
	 * of them
	 */
	virtual std::vector<std::string>
	process_regex (const std::string& field, const std::regex& rx) const = 0;

}; // ProcIface
struct DummyProc: public ProcIface { // For testing
std::vector<FieldInfo>
process_field (const std::string& field) const override {
return {{ field, "x", false, 0 }};
}
std::string
process_value (const std::string& field, const std::string& value) const override {
return value;
}
bool is_range_field (const std::string& field) const override {
return field == "range";
}
Range process_range (const std::string& field, const std::string& lower,
const std::string& upper) const override {
return { lower, upper };
}
std::vector<std::string>
process_regex (const std::string& field, const std::regex& rx) const override {
return {};
}
}; //Dummy
} // Mu
#endif /* __PROC_IFACE_HH__ */

133
lib/query/mu-tokenizer.cc Normal file
View File

@ -0,0 +1,133 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include "mu-tokenizer.hh"
#include "utils/mu-utils.hh"
#include <cctype>
#include <iostream>
#include <algorithm>
using namespace Mu;
/**
 * Does this character separate tokens? Separators are blanks and the
 * parentheses '(' and ')'.
 *
 * @param c a character; may be any byte of a UTF-8 sequence
 *
 * @return true if c is a separator; false otherwise
 */
static bool
is_separator (char c)
{
	// the <cctype> functions are only defined for values representable as
	// unsigned char (and EOF); a plain char holding a UTF-8 byte >= 0x80 can
	// be negative, so convert first
	if (std::isblank(static_cast<unsigned char>(c)))
		return true;

	return c == '(' || c == ')';
}
/**
 * Create a token for a word: an operator token if the word is "and", "or",
 * "xor" or "not" (in any letter case), a data token otherwise. The token
 * keeps the word as it was written.
 *
 * @param pos position to store in the token
 * @param val the word
 *
 * @return the token
 */
static Mu::Token
op_or_value (size_t pos, const std::string& val)
{
	auto lowered = val;
	// tolower is only defined for values representable as unsigned char;
	// convert explicitly so that non-ASCII UTF-8 bytes are handled safely
	std::transform(lowered.begin(), lowered.end(), lowered.begin(),
		       [](unsigned char c) {
			       return static_cast<char>(std::tolower(c));
		       });

	auto type = Token::Type::Data;
	if (lowered == "and")
		type = Token::Type::And;
	else if (lowered == "or")
		type = Token::Type::Or;
	else if (lowered == "xor")
		type = Token::Type::Xor;
	else if (lowered == "not")
		type = Token::Type::Not;

	return Token{pos, type, val};
}
/**
 * Push a character back onto the front of the input and step the position
 * back by one.
 *
 * @param food the remaining input; kar is prepended to it
 * @param kar the character to push back
 * @param pos the position; decremented by one
 */
static void
unread_char (std::string& food, char kar, size_t& pos)
{
	food.insert(food.begin(), kar);
	pos -= 1;
}
/**
* Consume one token from the front of the input.
*
* Characters are removed from food one by one, and pos is incremented for each
* of them. Backslash escapes and double quotes are handled here; a separator
* that ends a word is pushed back so that the next call sees it.
*
* @param food the remaining input; the consumed characters are erased from it
* @param pos running count of the consumed characters; updated in place
*
* @return the token. Its pos is the value of pos at the moment of returning,
* i.e. the count of characters consumed so far, not the offset at which the
* token started.
*/
static Mu::Token
eat_token (std::string& food, size_t& pos)
{
bool quoted{};
bool escaped{};
std::string value {};
while (!food.empty()) {
// take the next character off the front of the input
const auto kar = food[0];
food.erase(0, 1);
++pos;
// a backslash toggles the escape state: the backslash that switches escaping
// on is dropped; an escaped backslash falls through and is kept as a character
if (kar == '\\') {
escaped = !escaped;
if (escaped)
continue;
}
// an unescaped quote while inside quotes ends the token, which is always
// Data (so a quoted "and" is not an operator); any other quote switches
// quoting on and is not added to the value.
// NOTE(review): the latter includes an escaped quote (\"), and 'escaped'
// stays set in that case -- confirm whether a literal quote was intended.
if (kar == '"') {
if (!escaped && quoted)
return Token{pos, Token::Type::Data, value};
else {
quoted = true;
continue;
}
}
// outside quotes, an unescaped separator (a blank or a parenthesis) either
// ends the word collected so far or is handled on its own
if (!quoted && !escaped && is_separator(kar)) {
// end of a word: push the separator back for the next call and decide
// whether the word is an operator or data
if (!value.empty() && kar != ':') {
unread_char (food, kar, pos);
return op_or_value(pos, value);
}
// no word yet: skip blanks ('quoted' is always false in this branch)
if (quoted || isblank(kar))
continue;
// parentheses are tokens by themselves
switch (kar) {
case '(': return {pos, Token::Type::Open, "("};
case ')': return {pos, Token::Type::Close,")"};
default: break;
}
}
// an ordinary character: add it to the value; any escape ends here
value += kar;
escaped = false;
}
// the input ran out: whatever was collected (possibly nothing) is returned as
// Data; it does not go through op_or_value
return {pos, Token::Type::Data, value};
}
// Split a query string into tokens: the cleaned-up string is consumed one
// token at a time until nothing is left. An empty string gives no tokens.
Mu::Tokens
Mu::tokenize (const std::string& s)
{
	auto remaining = utf8_clean(s);
	if (s.empty())
		return {};

	Tokens result{};
	for (size_t offset{0}; !remaining.empty();)
		result.emplace_back(eat_token (remaining, offset));

	return result;
}

140
lib/query/mu-tokenizer.hh Normal file
View File

@ -0,0 +1,140 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef __TOKENIZER_HH__
#define __TOKENIZER_HH__
#include <string>
#include <vector>
#include <deque>
#include <ostream>
#include <stdexcept>
// A simple tokenizer, which turns a string into a deque of tokens
//
// It recognizes '(', ')', '*' 'and', 'or', 'xor', 'not'
//
// Note that even if we recognize those at the lexical level, they might be demoted to mere strings
// when we're creating the parse tree.
//
// Furthermore, we detect ranges ("a..b") and regexps (/../) at the parser level, since we need a
// bit more context to resolve ambiguities.
namespace Mu {
/**
 * A token: its position, its type and the text it was created from.
 */
struct Token {
	/** The kind of token. */
	enum class Type {
		Data,  /**< e.g., banana or date:..456 */

		// Brackets
		Open,  /**< ( */
		Close, /**< ) */

		// Unary operators
		Not,   /**< logical not */

		// Binary operators
		And,   /**< logical and */
		Or,    /**< logical or */
		Xor,   /**< logical xor */

		Empty, /**< nothing */
	};

	size_t            pos{};  /**< position in string */
	Type              type{}; /**< token type */
	const std::string str{};  /**< data for this token */

	/**
	 * Compare two tokens
	 *
	 * @param rhs right-hand side
	 *
	 * @return true if position, type and data are all equal; false
	 * otherwise
	 */
	bool operator==(const Token& rhs) const {
		return rhs.pos == pos &&
			rhs.type == type &&
			rhs.str == str;
	}
};
/**
 * Write a token type to a stream, e.g. "<data>" or "<and>". Throws
 * std::runtime_error for any type not listed in the switch.
 *
 * @param os an output stream
 * @param t a token type
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<< (std::ostream& os, Token::Type t)
{
	switch (t) {
	case Token::Type::Data:  return os << "<data>";
	case Token::Type::Open:  return os << "<open>";
	case Token::Type::Close: return os << "<close>";
	case Token::Type::Not:   return os << "<not>";
	case Token::Type::And:   return os << "<and>";
	case Token::Type::Or:    return os << "<or>";
	case Token::Type::Xor:   return os << "<xor>";
	default: // can't happen, but pacify compiler
		throw std::runtime_error ("<<bug>>");
	}
}
/**
 * Write a token to a stream: its position and type, followed by its data in
 * square brackets when there is any.
 *
 * @param os an output stream
 * @param t a token
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<< (std::ostream& os, const Token& t)
{
	os << t.pos << ": " << t.type;

	if (t.str.empty())
		return os;

	return os << " [" << t.str << "]";
}
/** A sequence of tokens, in the order in which they appear in the string. */
using Tokens = std::deque<Token>;
/**
* Tokenize a string into a deque of tokens. The tokenization always succeeds, i.e., it ignores
* errors such as a missing end-".
*
* @param s a string
*
* @return a deque of tokens
*/
Tokens tokenize (const std::string& s);
} // namespace Mu
#endif /* __TOKENIZER_HH__ */

111
lib/query/mu-tree.hh Normal file
View File

@ -0,0 +1,111 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef TREE_HH__
#define TREE_HH__
#include <vector>
#include <string>
#include <iostream>
#include <query/mu-data.hh>
#include <utils/mu-error.hh>
namespace Mu {
// A node in the parse tree
struct Node {
enum class Type {
Empty, // only for empty trees
OpAnd,
OpOr,
OpXor,
OpAndNot,
OpNot,
Value,
Range,
Invalid
};
Node(Type _type, std::unique_ptr<Data>&& _data):
type{_type}, data{std::move(_data)} {}
Node(Type _type): type{_type} {}
Node(Node&& rhs) = default;
Type type;
std::unique_ptr<Data> data;
static const char* type_name (Type t) {
switch (t) {
case Type::Empty: return ""; break;
case Type::OpAnd: return "and"; break;
case Type::OpOr: return "or"; break;
case Type::OpXor: return "xor"; break;
case Type::OpAndNot: return "andnot"; break;
case Type::OpNot: return "not"; break;
case Type::Value: return "value"; break;
case Type::Range: return "range"; break;
case Type::Invalid: return "<invalid>"; break;
default:
throw Mu::Error(Error::Code::Internal, "unexpected type");
}
}
static constexpr bool is_binop(Type t) {
return t == Type::OpAnd || t == Type::OpAndNot ||
t == Type::OpOr || t == Type::OpXor;
}
};
/**
 * Write a node to a stream: the name of its type, followed by its data if it
 * has any.
 *
 * @param os an output stream
 * @param t a node
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<< (std::ostream& os, const Node& t)
{
	os << Node::type_name(t.type);

	if (!t.data)
		return os;

	return os << t.data;
}
/**
 * A parse tree: a node with zero or more subtrees.
 */
struct Tree {
	Node              node;     /**< the node at the root of this tree */
	std::vector<Tree> children; /**< the subtrees, in the order they were added */

	/** Create a tree without children from a node. */
	Tree(Node&& _node): node(std::move(_node)) {}
	Tree(Tree&& rhs) = default;

	/** Append a subtree. */
	void add_child (Tree&& child) {
		children.push_back(std::move(child));
	}

	/** Is the root of this tree an Empty node? */
	bool empty() const {
		return Node::Type::Empty == node.type;
	}
};
/**
 * Write a tree to a stream: the root node followed by all subtrees, the whole
 * enclosed in parentheses.
 *
 * @param os an output stream
 * @param tree a tree
 *
 * @return the updated output stream
 */
inline std::ostream&
operator<< (std::ostream& os, const Tree& tree)
{
	os << '(' << tree.node;

	for (const auto& child : tree.children)
		os << child;

	return os << ')';
}
} // namespace Mu
#endif /* TREE_HH__ */

120
lib/query/mu-xapian.cc Normal file
View File

@ -0,0 +1,120 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /*HAVE_CONFIG_H*/
#include <xapian.h>
#include "mu-xapian.hh"
#include <utils/mu-error.hh>
using namespace Mu;
// Turn an operator node into a Xapian query. NOT must have exactly one child
// and becomes <all> AND NOT child; the other operators combine the queries for
// all of their children.
static Xapian::Query
xapian_query_op (const Mu::Tree& tree)
{
	// OpNot x ::= <all> AND NOT x
	if (tree.node.type == Node::Type::OpNot) {
		if (tree.children.size() != 1)
			throw std::runtime_error ("invalid # of children");
		return Xapian::Query (Xapian::Query::OP_AND_NOT,
				      Xapian::Query::MatchAll,
				      xapian_query(tree.children.front()));
	}

	Xapian::Query::op op;
	switch (tree.node.type) {
	case Node::Type::OpAnd:
		op = Xapian::Query::OP_AND;
		break;
	case Node::Type::OpOr:
		op = Xapian::Query::OP_OR;
		break;
	case Node::Type::OpXor:
		op = Xapian::Query::OP_XOR;
		break;
	case Node::Type::OpAndNot:
		op = Xapian::Query::OP_AND_NOT;
		break;
	default:
		throw Mu::Error (Error::Code::Internal, "invalid op"); // bug
	}

	std::vector<Xapian::Query> subqueries;
	for (const auto& child: tree.children)
		subqueries.emplace_back(xapian_query(child));

	return Xapian::Query(op, subqueries.begin(), subqueries.end());
}
// Create the query for a single term: the prefix of the value followed by str.
// When Xapian has OP_WILDCARD, wildcards are allowed and str is longer than
// one character and ends in '*', a wildcard query for str without that '*' is
// created instead.
static Xapian::Query
make_query (const Value* val, const std::string& str, bool maybe_wildcard)
{
#ifdef XAPIAN_HAVE_OP_WILDCARD
	const auto len{str.length()};
	const bool is_wildcard = maybe_wildcard && len > 1 && str[len - 1] == '*';
	if (is_wildcard)
		return Xapian::Query(Xapian::Query::OP_WILDCARD,
				     val->prefix + str.substr(0, len - 1));
#endif /*XAPIAN_HAVE_OP_WILDCARD*/

	return Xapian::Query(val->prefix + str);
}
// Turn a Value node into a Xapian query. A non-phrase value becomes a single
// (possibly wildcard) term. A phrase value is split on spaces: one part is
// handled like a single term, several parts become an OP_PHRASE query in which
// wildcards are not expanded.
//
// Throws Mu::Error if the node does not carry a Value.
static Xapian::Query
xapian_query_value (const Mu::Tree& tree)
{
	const auto v = dynamic_cast<const Value*> (tree.node.data.get());
	if (!v) // no data, or data of another type; don't dereference
		throw Mu::Error (Error::Code::Internal, "expected a value"); // bug

	if (!v->phrase)
		return make_query(v, v->value, true/*maybe-wildcard*/);

	const auto parts = split (v->value, " ");
	if (parts.empty())
		return Xapian::Query::MatchNothing; // shouldn't happen

	if (parts.size() == 1)
		return make_query(v, parts.front(), true/*maybe-wildcard*/);

	std::vector<Xapian::Query> phvec;
	for (const auto& p: parts) // by reference; no need to copy each part
		phvec.emplace_back(make_query(v, p, false/*no wildcards*/));

	return Xapian::Query (Xapian::Query::OP_PHRASE, phvec.begin(), phvec.end());
}
// Turn a Range node into a Xapian value-range query on the value number of the
// field, from the lower to the upper bound.
//
// Throws Mu::Error if the node does not carry a Range.
static Xapian::Query
xapian_query_range (const Mu::Tree& tree)
{
	const auto r = dynamic_cast<const Range*> (tree.node.data.get());
	if (!r) // no data, or data of another type; don't dereference
		throw Mu::Error (Error::Code::Internal, "expected a range"); // bug

	return Xapian::Query(Xapian::Query::OP_VALUE_RANGE,
			     static_cast<Xapian::valueno>(r->id),
			     r->lower, r->upper);
}
// Turn a parse tree into a Xapian query by dispatching on the type of its root
// node: an empty tree gives an empty query; operators, values and ranges each
// have their own helper. Any other node type is a bug and throws Mu::Error.
Xapian::Query
Mu::xapian_query (const Mu::Tree& tree)
{
	const auto type = tree.node.type;

	if (type == Node::Type::Empty)
		return Xapian::Query();

	// is_binop covers OpAnd, OpAndNot, OpOr and OpXor
	if (type == Node::Type::OpNot || Node::is_binop(type))
		return xapian_query_op (tree);

	if (type == Node::Type::Value)
		return xapian_query_value (tree);

	if (type == Node::Type::Range)
		return xapian_query_range (tree);

	throw Mu::Error (Error::Code::Internal, "invalid query"); // bug
}

40
lib/query/mu-xapian.hh Normal file
View File

@ -0,0 +1,40 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#ifndef __XAPIAN_HH__
#define __XAPIAN_HH__
#include <xapian.h>
#include <query/mu-parser.hh>
namespace Mu {
/**
* Transform a parse-tree into a Xapian query object.
*
* An empty node yields an empty Xapian::Query; value, range and operator
* nodes are converted into the corresponding Xapian queries.
*
* @param tree a parse tree
*
* @return a Xapian query object
*
* @throws Mu::Error (Error::Code::Internal) for a node of unknown type
*/
Xapian::Query xapian_query (const Mu::Tree& tree);
} // namespace Mu
#endif /* __XAPIAN_HH__ */

41
lib/query/parse.cc Normal file
View File

@ -0,0 +1,41 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <string>
#include <iostream>
#include "mu-parser.hh"
// Parse the expression formed by the command-line arguments; warnings go to
// stderr, the resulting parse tree to stdout.
int
main (int argc, char *argv[])
{
	// join the arguments, each one preceded by a single space
	std::string expr;
	for (int n = 1; n < argc; ++n) {
		expr += " ";
		expr += argv[n];
	}

	Mu::WarningVec warnings;
	const auto tree = Mu::parse (expr, warnings);

	for (const auto& warning: warnings)
		std::cerr << "1:" << warning.pos << ": " << warning.msg << std::endl;

	std::cout << tree << std::endl;

	return 0;
}

146
lib/query/test-parser.cc Normal file
View File

@ -0,0 +1,146 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <vector>
#include <glib.h>
#include <iostream>
#include <sstream>
#include "mu-parser.hh"
using namespace Mu;
// A single parser test case; instances are brace-initialized in the tests
// below, so the order of the fields matters.
struct Case {
// the query expression handed to the parser
const std::string expr;
// the expected textual form of the resulting parse tree
const std::string expected;
// expected warnings; test_cases() does not compare these at the moment
// (that check is commented out)
WarningVec warnings;
};
// the list of cases run by a single test
using CaseVec = std::vector<Case>;
// Parse the expression of every case and require that the textual form of
// the resulting tree equals the expected string. With verbose test output
// enabled, the expression, the expectation and the result are printed first.
static void
test_cases(const CaseVec& cases)
{
	for (const auto& casus : cases) {
		WarningVec warnings;
		std::stringstream ss;
		ss << parse (casus.expr, warnings);

		if (g_test_verbose())
			std::cout << "\n"
				  << casus.expr << std::endl
				  << "exp:" << casus.expected << std::endl
				  << "got:" << ss.str() << std::endl;

		g_assert_true (casus.expected == ss.str());

		// The warnings are not verified yet; disabled check:
		// g_assert_cmpuint (casus.warnings.size(), ==, warnings.size());
		// for (auto i = 0; i != (int)casus.warnings.size(); ++i) {
		//	std::cout << "exp:" << casus.warnings[i] << std::endl;
		//	std::cout << "got:" << warnings[i] << std::endl;
		//	g_assert_true (casus.warnings[i] == warnings[i]);
		// }
	}
}
static void
test_basic ()
{
CaseVec cases = {
//{ "", R"#((atom :value ""))#"},
{ "foo", R"#((value "" "foo"))#", },
{ "foo or bar",
R"#((or(value "" "foo")(value "" "bar")))#" },
{ "foo and bar",
R"#((and(value "" "foo")(value "" "bar")))#"},
};
test_cases (cases);
}
static void
test_complex ()
{
CaseVec cases = {
{ "foo and bar or cuux",
R"#((or(and(value "" "foo")(value "" "bar")))#" +
std::string(R"#((value "" "cuux")))#") },
{ "a and not b",
R"#((and(value "" "a")(not(value "" "b"))))#"
},
{ "a and b and c",
R"#((and(value "" "a")(and(value "" "b")(value "" "c"))))#"
},
{ "(a or b) and c",
R"#((and(or(value "" "a")(value "" "b"))(value "" "c")))#"
},
{ "a b", // implicit and
R"#((and(value "" "a")(value "" "b")))#"
},
{ "a not b", // implicit and not
R"#((and(value "" "a")(not(value "" "b"))))#"
},
{ "not b", // implicit and not
R"#((not(value "" "b")))#"
}
};
test_cases (cases);
}
static void
test_range ()
{
CaseVec cases = {
{ "range:a..b", // implicit and
R"#((range "range" "a" "b"))#"
},
};
test_cases (cases);
}
static void
test_flatten ()
{
CaseVec cases = {
{ " Mötørhęåđ", R"#((value "" "motorhead"))#" }
};
test_cases (cases);
}
// Register the parser tests with the GLib test framework and run them;
// the return value of g_test_run() becomes the exit code.
int
main (int argc, char *argv[])
{
	// g_test_init() is variadic and expects a null *pointer* as the
	// terminator; nullptr is converted to a null void* when passed through
	// an ellipsis, whereas NULL may be a plain integer constant in C++.
	g_test_init (&argc, &argv, nullptr);

	g_test_add_func ("/parser/basic", test_basic);
	g_test_add_func ("/parser/complex", test_complex);
	g_test_add_func ("/parser/range", test_range);
	g_test_add_func ("/parser/flatten", test_flatten);

	return g_test_run ();
}

158
lib/query/test-tokenizer.cc Normal file
View File

@ -0,0 +1,158 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <vector>
#include <glib.h>
#include <iostream>
#include <sstream>
#include "mu-tokenizer.hh"
// A single tokenizer test case; instances are brace-initialized in the
// tests below, so the order of the fields matters.
struct Case {
// the string handed to the tokenizer
const char *str;
// the tokens the tokenizer is expected to produce, in order
const Mu::Tokens tokens;
};
// the list of cases run by a single test
using CaseVec = std::vector<Case>;
using namespace Mu;
using TT = Token::Type;
// Tokenize the string of every case and require the same number of tokens
// as expected, and pairwise equal tokens. With verbose test output enabled,
// every compared pair is printed to stderr first.
static void
test_cases(const CaseVec& cases)
{
	for (const auto& casus : cases) {
		const auto tokens = tokenize (casus.str);

		// equal sizes make indexing both sequences with 'u' safe below
		g_assert_cmpuint ((guint)tokens.size(),==,(guint)casus.tokens.size());

		size_t u = 0;
		while (u != tokens.size()) {
			if (g_test_verbose())
				std::cerr << "case " << u << " " << casus.str << std::endl
					  << "exp: '" << casus.tokens[u] << "'" << std::endl
					  << "got: '" << tokens[u] << "'" << std::endl;

			g_assert_true (tokens[u] == casus.tokens[u]);
			++u;
		}
	}
}
static void
test_basic ()
{
CaseVec cases = {
{ "", {} },
{ "foo", Tokens{Token{3, TT::Data, "foo"}}},
{ "foo bar cuux", Tokens{Token{3, TT::Data, "foo"},
Token{7, TT::Data, "bar"},
Token{12, TT::Data, "cuux"}}},
{ "\"foo bar\"", Tokens{ Token{9, TT::Data, "foo bar"}}},
// ie. ignore missing closing '"'
{ "\"foo bar", Tokens{ Token{8, TT::Data, "foo bar"}}},
};
test_cases (cases);
}
static void
test_specials ()
{
CaseVec cases = {
{ ")*(", Tokens{Token{1, TT::Close, ")"},
Token{2, TT::Data, "*"},
Token{3, TT::Open, "("}}},
{ "\")*(\"", Tokens{Token{5, TT::Data, ")*("}}},
};
test_cases (cases);
}
static void
test_ops ()
{
CaseVec cases = {
{ "foo and bar oR cuux XoR fnorb",
Tokens{Token{3, TT::Data, "foo"},
Token{7, TT::And, "and"},
Token{11, TT::Data, "bar"},
Token{14, TT::Or, "oR"},
Token{19, TT::Data, "cuux"},
Token{23, TT::Xor, "XoR"},
Token{29, TT::Data, "fnorb"}}},
{ "NOT (aap or mies)",
Tokens{Token{3, TT::Not, "NOT"},
Token{5, TT::Open, "("},
Token{8, TT::Data, "aap"},
Token{11, TT::Or, "or"},
Token{16, TT::Data, "mies"},
Token{17, TT::Close, ")"}}}
};
test_cases (cases);
}
static void
test_escape ()
{
CaseVec cases = {
{ "foo\"bar\"", Tokens{Token{8, TT::Data, "foobar"}}},
{ "\"fnorb\"", Tokens{Token{7, TT::Data, "fnorb"}}},
{ "\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "fnorb"}}},
{ "foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foobar"}}}
};
test_cases (cases);
}
// Stream every token of an expression, each followed by a space, and
// compare the result with the expected textual representation.
static void
test_to_string ()
{
	std::stringstream ss;

	// iterate by reference; the tokens only need to be read, not copied
	for (const auto& t: tokenize ("foo and bar xor not cuux or fnorb"))
		ss << t << ' ';

	g_assert_true (ss.str() ==
		       "3: <data> [foo] 7: <and> [and] 11: <data> [bar] "
		       "15: <xor> [xor] 19: <not> [not] 24: <data> [cuux] "
		       "27: <or> [or] 33: <data> [fnorb] ");
}
// Register the tokenizer tests with the GLib test framework and run them;
// the return value of g_test_run() becomes the exit code.
int
main (int argc, char *argv[])
{
	// g_test_init() is variadic and expects a null *pointer* as the
	// terminator; nullptr is converted to a null void* when passed through
	// an ellipsis, whereas NULL may be a plain integer constant in C++.
	g_test_init (&argc, &argv, nullptr);

	g_test_add_func ("/tokens/basic", test_basic);
	g_test_add_func ("/tokens/specials", test_specials);
	g_test_add_func ("/tokens/ops", test_ops);
	g_test_add_func ("/tokens/escape", test_escape);
	g_test_add_func ("/tokens/to-string", test_to_string);

	return g_test_run ();
}

38
lib/query/tokenize.cc Normal file
View File

@ -0,0 +1,38 @@
/*
** Copyright (C) 2017-2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <string>
#include <iostream>
#include "mu-tokenizer.hh"
// Tokenize the expression formed by the command-line arguments and print
// one token per line on stdout.
int
main (int argc, char *argv[])
{
	// join the arguments, each one preceded by a single space
	std::string expr;
	for (int n = 1; n < argc; ++n) {
		expr += " ";
		expr += argv[n];
	}

	const auto tokens = Mu::tokenize (expr);
	for (const auto& token : tokens)
		std::cout << token << std::endl;

	return 0;
}