diff --git a/lib/parser/Makefile.am b/lib/parser/Makefile.am new file mode 100644 index 00000000..c1557ac5 --- /dev/null +++ b/lib/parser/Makefile.am @@ -0,0 +1,87 @@ +## Copyright (C) 2017 Dirk-Jan C. Binnema +## +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 3 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with this program; if not, write to the Free Software Foundation, +## Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +include $(top_srcdir)/gtest.mk + +@VALGRIND_CHECK_RULES@ + +noinst_PROGRAMS= \ + tokenize \ + parse + +tokenize_SOURCES= \ + tokenize.cc + +tokenize_LDADD= \ + $(GCOV_LDADD) \ + libmuxparser.la + +parse_SOURCES= \ + parse.cc + +parse_LDADD= \ + $(GCOV_LDADD) \ + libmuxparser.la + +AM_CXXFLAGS= \ + -I$(srcdir)/.. 
\ + -I$(top_srcdir)/lib \ + $(GLIB_CFLAGS) \ + $(XAPIAN_CXXFLAGS) \ + $(WARN_CXXFLAGS) \ + $(GCOV_CFLAGS) \ + -Wno-inline \ + -Wno-switch-enum + +libmuxparser_la_LIBADD= \ + $(WARN_LDFLAGS) \ + $(GLIB_LIBS) \ + $(XAPIAN_LIBS) \ + $(GCOV_LDADD) + +noinst_LTLIBRARIES= \ + libmuxparser.la + +libmuxparser_la_SOURCES= \ + data.hh \ + parser.cc \ + parser.hh \ + proc-iface.hh \ + tokenizer.cc \ + tokenizer.hh \ + tree.hh \ + utils.cc \ + utils.hh \ + xapian.cc \ + xapian.hh + +VALGRIND_SUPPRESSIONS_FILES= ${top_srcdir}/mux.supp + +noinst_PROGRAMS+=$(TEST_PROGS) + +TEST_PROGS += test-tokenizer +test_tokenizer_SOURCES=test-tokenizer.cc +test_tokenizer_LDADD=libmuxparser.la + +TEST_PROGS += test-parser +test_parser_SOURCES=test-parser.cc +test_parser_LDADD=libmuxparser.la + +TEST_PROGS += test-utils +test_utils_SOURCES=test-utils.cc +test_utils_LDADD=libmuxparser.la + +TESTS=$(TEST_PROGS) diff --git a/lib/parser/data.hh b/lib/parser/data.hh new file mode 100644 index 00000000..513db514 --- /dev/null +++ b/lib/parser/data.hh @@ -0,0 +1,151 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. 
+*/ + +#ifndef __DATA_HH__ +#define __DATA_HH__ + +#include +#include +#include + +#include + +namespace Mux { + +// class representing some data item; either a Value or a Range a Value can still be a Regex (but +// that's not a separate type here) +struct Data { + enum class Type { Value, Range }; + virtual ~Data() = default; + + Type type; /**< type of data */ + std::string field; /**< full name of the field */ + std::string prefix; /**< Xapian prefix for thef field */ + unsigned id; /**< Xapian value no for the field */ + +protected: + Data (Type _type, const std::string& _field, const std::string& _prefix, + unsigned _id): type(_type), field(_field), prefix(_prefix), id(_id) {} +}; + + +/** + * operator<< + * + * @param os an output stream + * @param t a data type + * + * @return the updated output stream + */ +inline std::ostream& +operator<< (std::ostream& os, Data::Type t) +{ + switch (t) { + case Data::Type::Value: os << "value"; break; + case Data::Type::Range: os << "range"; break; + default: os << "bug"; break; + } + return os; +} + + +/** + * Range type -- [a..b] + */ +struct Range: public Data { + /** + * Construct a range + * + * @param _field the field + * @param _prefix the xapian prefix + * @param _id xapian value number + * @param _lower lower bound + * @param _upper upper bound + */ + Range (const std::string& _field, const std::string& _prefix, + unsigned _id, + const std::string& _lower,const std::string& _upper): + + Data(Data::Type::Range, _field, _prefix, _id), + lower(_lower), upper(_upper) {} + + std::string lower; /**< lower bound */ + std::string upper; /**< upper bound */ +}; + + +/** + * Basic value + * + */ +struct Value: public Data { + /** + * Construct a Value + * + * @param _field the field + * @param _prefix the xapian prefix + * @param _id xapian value number + * @param _value the value + */ + Value (const std::string& _field, const std::string& _prefix, + unsigned _id, const std::string& _value): + Data(Value::Type::Value, 
_field, _prefix, _id), + value(_value) {} + + std::string value; /**< the value */ +}; + + +/** + * operator<< + * + * @param os an output stream + * @param v a data ptr + * + * @return the updated output stream + */ +inline std::ostream& +operator<< (std::ostream& os, const std::unique_ptr& v) +{ + switch (v->type) { + case Data::Type::Value: { + const auto bval = dynamic_cast (v.get()); + os << ' ' << quote(v->field) << ' ' + << quote(utf8_flatten(bval->value)); + break; + } + case Data::Type::Range: { + const auto rval = dynamic_cast (v.get()); + os << ' ' << quote(v->field) << ' ' + << quote(rval->lower) << ' ' + << quote(rval->upper); + break; + } + default: + os << "unexpected type"; + break; + } + + return os; +} + +} // namespace Mux + + +#endif /* __DATA_HH__ */ diff --git a/lib/parser/dummy-processor.hh b/lib/parser/dummy-processor.hh new file mode 100644 index 00000000..436dc8c2 --- /dev/null +++ b/lib/parser/dummy-processor.hh @@ -0,0 +1,30 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. 
+*/ +#ifndef __DUMMY_PROCESSOR_HH__ +#define __DUMMY_PROCESSOR_HH__ + +#include +#include +#include + +namespace Mux { + + + +#endif /* __FIELDS_HH__ */ diff --git a/lib/parser/parse.cc b/lib/parser/parse.cc new file mode 100644 index 00000000..297d11d7 --- /dev/null +++ b/lib/parser/parse.cc @@ -0,0 +1,41 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#include +#include +#include "parser.hh" + +int +main (int argc, char *argv[]) +{ + std::string s; + + for (auto i = 1; i < argc; ++i) + s += " " + std::string(argv[i]); + + Mux::WarningVec warnings; + + const auto tree = Mux::parse (s, warnings); + for (const auto& w: warnings) + std::cerr << "1:" << w.pos << ": " << w.msg << std::endl; + + std::cout << tree << std::endl; + + return 0; +} diff --git a/lib/parser/parser.cc b/lib/parser/parser.cc new file mode 100644 index 00000000..cf4ac25a --- /dev/null +++ b/lib/parser/parser.cc @@ -0,0 +1,346 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. 
+** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ +#include "parser.hh" +#include "tokenizer.hh" +#include "utils.hh" + +using namespace Mux; + +// 3 precedence levels: units (NOT,()) > factors (OR) > terms (AND) + +// query -> | ε +// -> | ε +// -> OR|XOR | ε +// -> | ε +// -> [AND]|AND NOT | ε +// -> [NOT] | ( ) | +// -> | | +// -> [field:]value +// -> [field:][lower]..[upper] +// -> [field:]/regex/ + + +#define BUG(...) std::runtime_error (format("%u: BUG: ",__LINE__) \ + + format(__VA_ARGS__)) + +static Token +look_ahead (const Mux::Tokens& tokens) +{ + return tokens.front(); +} + +static Mux::Tree +empty() +{ + return {{Node::Type::Empty}}; +} + +static Mux::Tree term_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings); + + +static Mux::Tree +value (const ProcIface::FieldInfoVec& fields, const std::string& v, + size_t pos, ProcPtr proc, WarningVec& warnings) +{ + auto val = utf8_flatten(v); + + if (fields.empty()) + throw BUG("expected one or more fields"); + + if (fields.size() == 1) { + const auto item = fields.front(); + return Tree({Node::Type::Value, + std::make_unique( + item.field, item.prefix, item.id, + proc->process_value(item.field, val))}); + } + + // a 'multi-field' such as "recip:" + Tree tree(Node{Node::Type::OpOr}); + for (const auto& item: fields) + tree.add_child (Tree({Node::Type::Value, + std::make_unique( + item.field, item.prefix, item.id, + proc->process_value(item.field, val))})); + return tree; +} + +static Mux::Tree +regex (const ProcIface::FieldInfoVec& fields, const std::string& v, + size_t 
pos, ProcPtr proc, WarningVec& warnings) +{ + if (v.length() < 2) + throw BUG("expected regexp, got '%s'", v.c_str()); + + const auto rxstr = utf8_flatten(v.substr(1, v.length()-2)); + + try { + Tree tree(Node{Node::Type::OpOr}); + const auto rx = std::regex (rxstr); + for (const auto& field: fields) { + const auto terms = proc->process_regex (field.field, rx); + for (const auto& term: terms) { + tree.add_child (Tree( + {Node::Type::Value, + std::make_unique(field.field, "", + field.id, term)})); + } + } + return tree; + + } catch (...) { + // fallback + warnings.push_back ({pos, "invalid regexp"}); + return value (fields, v, pos, proc, warnings); + } +} + + + +static Mux::Tree +range (const ProcIface::FieldInfoVec& fields, const std::string& lower, + const std::string& upper, size_t pos, ProcPtr proc, + WarningVec& warnings) +{ + if (fields.empty()) + throw BUG("expected field"); + + const auto& field = fields.front(); + if (!proc->is_range_field(field.field)) + return value (fields, lower + ".." + upper, pos, proc, warnings); + + auto prange = proc->process_range (field.field, lower, upper); + if (prange.lower > prange.upper) + prange = proc->process_range (field.field, upper, lower); + + return Tree({{Node::Type::Range}, + std::make_unique(field.field, field.prefix, field.id, + prange.lower, prange.upper)}); +} + + +static Mux::Tree +data (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings) +{ + const auto token = look_ahead(tokens); + if (token.type != Token::Type::Data) + warnings.push_back ({token.pos, "expected: value"}); + + tokens.pop_front(); + + std::string field, val; + const auto col = token.str.find (":"); + if (col != 0 && col != std::string::npos && col != token.str.length()-1) { + field = token.str.substr(0, col); + val = token.str.substr(col + 1); + } else + val = token.str; + + auto fields = proc->process_field (field); + if (fields.empty()) {// not valid field... 
+ warnings.push_back ({token.pos, format ("invalid field '%s'", field.c_str())}); + fields = proc->process_field (""); + // fallback, treat the whole of foo:bar as a value + return value (fields, field + ":" + val, token.pos, proc, warnings); + } + + // does it look like a regexp? + if (val.length()>=2) { + if (val[0]=='/' && val[val.length()-1] == '/') + return regex (fields, val, token.pos, proc, warnings); + else if (val[val.length()-1] == '*') + return regex (fields, // transfrom wildcard into regexp + "/" + val.substr(0, val.length()-1) + ".*/", + token.pos, proc, warnings); + } + + // does it look like a range? + const auto dotdot = val.find(".."); + if (dotdot != std::string::npos) + return range(fields, val.substr(0, dotdot), val.substr(dotdot + 2), + token.pos, proc, warnings); + + // if nothing else, it's a value. + return value (fields, val, token.pos, proc, warnings); +} + +static Mux::Tree +unit (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings) +{ + if (tokens.empty()) { + warnings.push_back ({0, "expected: unit"}); + return empty(); + } + + const auto token = look_ahead (tokens); + + if (token.type == Token::Type::Not) { + tokens.pop_front(); + Tree tree{{Node::Type::OpNot}}; + tree.add_child(unit (tokens, proc, warnings)); + return tree; + } + + if (token.type == Token::Type::Open) { + tokens.pop_front(); + auto tree = term_1 (tokens, proc, warnings); + if (tokens.empty()) + warnings.push_back({token.pos, "expected: ')'"}); + else { + const auto token2 = look_ahead(tokens); + if (token2.type == Token::Type::Close) + tokens.pop_front(); + else { + warnings.push_back( + {token2.pos, + std::string("expected: ')' but got ") + + token2.str}); + } + + } + return tree; + } + + return data (tokens, proc, warnings); +} + +static Mux::Tree factor_1 (Mux::Tokens& tokens, ProcPtr proc, + WarningVec& warnings); + +static Mux::Tree +factor_2 (Mux::Tokens& tokens, Node::Type& op, ProcPtr proc, + WarningVec& warnings) +{ + if (tokens.empty()) + return 
empty(); + + const auto token = look_ahead(tokens); + + switch (token.type) { + case Token::Type::And: { + tokens.pop_front(); + const auto token2 = look_ahead(tokens); + if (token2.type == Token::Type::Not) { // AND NOT is a unit + tokens.pop_front(); + op = Node::Type::OpAndNot; + } else + op = Node::Type::OpAnd; + } break; + case Token::Type::Open: + case Token::Type::Data: + op = Node::Type::OpAnd; // implicit AND + break; + case Token::Type::Not: + tokens.pop_front(); + op = Node::Type::OpAndNot; // implicit AND NOT + break; + + + default: + return empty(); + } + + return factor_1 (tokens, proc, warnings); +} + +static Mux::Tree +factor_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings) +{ + Node::Type op { Node::Type::Invalid }; + + auto t = unit (tokens, proc, warnings); + auto a2 = factor_2 (tokens, op, proc, warnings); + + if (a2.empty()) + return t; + + Tree tree {{op}}; + tree.add_child(std::move(t)); + tree.add_child(std::move(a2)); + + return tree; +} + + +static Mux::Tree +term_2 (Mux::Tokens& tokens, Node::Type& op, ProcPtr proc, + WarningVec& warnings) +{ + if (tokens.empty()) + return empty(); + + const auto token = look_ahead (tokens); + + switch (token.type) { + case Token::Type::Or: + op = Node::Type::OpOr; + break; + case Token::Type::Xor: + op = Node::Type::OpXor; + break; + default: + if (token.type != Token::Type::Close) + warnings.push_back({token.pos, "expected OR|XOR"}); + return empty(); + } + + tokens.pop_front(); + + return term_1 (tokens, proc, warnings); +} + +static Mux::Tree +term_1 (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings) +{ + Node::Type op { Node::Type::Invalid }; + + auto t = factor_1 (tokens, proc, warnings); + auto o2 = term_2 (tokens, op, proc, warnings); + + if (o2.empty()) + return t; + else { + Tree tree {{op}}; + tree.add_child(std::move(t)); + tree.add_child(std::move(o2)); + return tree; + } +} + +static Mux::Tree +query (Mux::Tokens& tokens, ProcPtr proc, WarningVec& warnings) +{ + if 
(tokens.empty()) + return empty (); + else + return term_1 (tokens, proc, warnings); +} + +Mux::Tree +Mux::parse (const std::string& expr, WarningVec& warnings, ProcPtr proc) +{ + try { + auto tokens = tokenize (expr); + return query (tokens, proc, warnings); + + } catch (const std::runtime_error& ex) { + std::cerr << ex.what() << std::endl; + return empty(); + } +} diff --git a/lib/parser/parser.hh b/lib/parser/parser.hh new file mode 100644 index 00000000..34cbb529 --- /dev/null +++ b/lib/parser/parser.hh @@ -0,0 +1,89 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + + +#ifndef __PARSER_HH__ +#define __PARSER_HH__ + +#include +#include +#include + +#include +#include +#include + +// A simple recursive-descent parser for queries. 
Follows the Xapian syntax, +// but better handles non-alphanum; also implements regexp + +namespace Mux { + +/** + * A parser warning + * + */ +struct Warning { + size_t pos; /**< pos in string */ + const std::string msg; /**< warning message */ + + /** + * operator== + * + * @param rhs right-hand side + * + * @return true if rhs is equal to this; false otherwise + */ + bool operator==(const Warning& rhs) const { + return pos == rhs.pos && msg == rhs.msg; + } +}; + + +/** + * operator<< + * + * @param os an output stream + * @param w a warning + * + * @return the updated output stream + */ +inline std::ostream& +operator<< (std::ostream& os, const Warning& w) +{ + os << w.pos << ":" << w.msg; + return os; +} + +/** + * Parse a query string + * + * @param query a query string + * @param warnings vec to receive warnings + * @param proc a Processor object + * + * @return a parse-tree + */ +using WarningVec=std::vector; +using ProcPtr = const std::unique_ptr&; +Tree parse (const std::string& query, WarningVec& warnings, + ProcPtr proc = std::make_unique()); + +} // namespace Mux + +#endif /* __PARSER_HH__ */ diff --git a/lib/parser/proc-iface.hh b/lib/parser/proc-iface.hh new file mode 100644 index 00000000..f6633dd2 --- /dev/null +++ b/lib/parser/proc-iface.hh @@ -0,0 +1,131 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. 
// ---- lib/parser/proc-iface.hh ----
#ifndef __PROC_IFACE_HH__
#define __PROC_IFACE_HH__

#include <regex>
#include <string>
#include <vector>

namespace Mux {

/**
 * Interface through which the parser turns field names, values, ranges and
 * regular expressions into their processed forms.
 */
struct ProcIface {

	virtual ~ProcIface() = default;

	/** Description of one (internal) field */
	struct FieldInfo {
		const std::string	field;	/**< full name of the field */
		const std::string	prefix;	/**< prefix for the field */
		unsigned		id;	/**< value number for the field */
	};
	using FieldInfoVec = std::vector<FieldInfo>;

	/**
	 * Get the "shortcut"/internal fields for the given fieldstr, or
	 * empty if there is none
	 *
	 * @param field a fieldstr, e.g "subject" or "s" for the subject field
	 *
	 * @return a vector with "exploded" values, with a code and a
	 * fullname. E.g. "s" might map to [<"S","subject">], while "recip"
	 * could map to [<"to", "T">, <"cc", "C">, <"bcc", "B">]
	 */
	virtual FieldInfoVec process_field (const std::string& field) const = 0;

	/**
	 * Process a value
	 *
	 * @param field a field name
	 * @param value a value
	 *
	 * @return the processed value
	 */
	virtual std::string process_value (
		const std::string& field, const std::string& value) const = 0;

	/**
	 * Is this a range field?
	 *
	 * @param field some field
	 *
	 * @return true if it is a range-field; false otherwise.
	 */
	virtual bool is_range_field (const std::string& field) const = 0;

	/** A processed range */
	struct Range {
		std::string lower;	/**< lower bound */
		std::string upper;	/**< upper bound */
	};

	/**
	 * Process a range field
	 *
	 * @param field a fieldstr, e.g "date" or "d" for the date field
	 * @param lower lower bound or empty
	 * @param upper upper bound or empty
	 *
	 * @return the processed range
	 */
	virtual Range process_range (const std::string& field, const std::string& lower,
				     const std::string& upper) const = 0;

	/**
	 * Process a regular expression for a field
	 *
	 * @param field a field name
	 * @param rx a regular expression
	 *
	 * @return the terms for this field and expression; the parser adds
	 * one value per returned term
	 */
	virtual std::vector<std::string>
	process_regex (const std::string& field, const std::regex& rx) const = 0;

}; // ProcIface


/**
 * Pass-through processor, for testing: every field name is accepted as-is,
 * values and ranges are returned unchanged, no field is a range field and
 * regular expressions never yield terms.
 */
struct DummyProc: public ProcIface { // For testing

	/** @return a single FieldInfo with the given name, prefix "x", id 0 */
	std::vector<FieldInfo>
	process_field (const std::string& field) const override {
		return {{ field, "x", 0 }};
	}

	/** @return @p value, unchanged */
	std::string
	process_value (const std::string& field, const std::string& value) const override {
		return value;
	}

	/** @return always false */
	bool is_range_field (const std::string& field) const override {
		return false;
	}

	/** @return the bounds, unchanged */
	Range process_range (const std::string& field, const std::string& lower,
			     const std::string& upper) const override {
		return { lower, upper };
	}

	/** @return always empty */
	std::vector<std::string>
	process_regex (const std::string& field, const std::regex& rx) const override {
		return {};
	}
}; //Dummy


} // Mux

#endif /* __PROC_IFACE_HH__ */
+** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#include +#include + +#include +#include + +#include "parser.hh" +using namespace Mux; + +struct Case { + const std::string expr; + const std::string expected; + WarningVec warnings; +}; + +using CaseVec = std::vector; + +static void +test_cases(const CaseVec& cases) +{ + for (const auto& casus : cases ) { + + WarningVec warnings; + const auto tree = parse (casus.expr, warnings); + + std::stringstream ss; + ss << tree; + + if (g_test_verbose()) { + std::cout << "\n"; + std::cout << casus.expr << std::endl; + std::cout << "exp:" << casus.expected << std::endl; + std::cout << "got:" << ss.str() << std::endl; + } + g_assert_true (casus.expected == ss.str()); + + // g_assert_cmpuint (casus.warnings.size(), ==, warnings.size()); + // for (auto i = 0; i != (int)casus.warnings.size(); ++i) { + // std::cout << "exp:" << casus.warnings[i] << std::endl; + // std::cout << "got:" << warnings[i] << std::endl; + // g_assert_true (casus.warnings[i] == warnings[i]); + // } + } +} + +static void +test_basic () +{ + CaseVec cases = { + //{ "", R"#((atom :value ""))#"}, + { "foo", R"#((value "" "foo"))#", }, + { "foo or bar", + R"#((or(value "" "foo")(value "" "bar")))#" }, + { "foo and bar", + R"#((and(value "" "foo")(value "" "bar")))#"}, + }; + + test_cases (cases); +} + +static void +test_complex () +{ + CaseVec cases = { + { "foo and bar or cuux", + R"#((or(and(value "" "foo")(value "" "bar")))#" + + std::string(R"#((value "" "cuux")))#") }, + { "a and not b", + R"#((andnot(value "" "a")(value 
"" "b")))#" + }, + { "a and b and c", + R"#((and(value "" "a")(and(value "" "b")(value "" "c"))))#" + }, + { "(a or b) and c", + R"#((and(or(value "" "a")(value "" "b"))(value "" "c")))#" + } + }; + + test_cases (cases); +} + +static void +test_flatten () +{ + CaseVec cases = { + { " Mötørhęåđ", R"#((value "" "motorhead"))#" } + }; + + test_cases (cases); +} + +int +main (int argc, char *argv[]) +{ + g_test_init (&argc, &argv, NULL); + + g_test_add_func ("/parser/basic", test_basic); + g_test_add_func ("/parser/complex", test_complex); + g_test_add_func ("/parser/flatten", test_flatten); + + return g_test_run (); +} diff --git a/lib/parser/test-tokenizer.cc b/lib/parser/test-tokenizer.cc new file mode 100644 index 00000000..2e4cdeec --- /dev/null +++ b/lib/parser/test-tokenizer.cc @@ -0,0 +1,143 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. 
+*/ + +#include +#include +#include + +#include "tokenizer.hh" + +struct Case { + const char *str; + const Mux::Tokens tokens; +}; + +using CaseVec = std::vector; + +using namespace Mux; +using TT = Token::Type; + +static void +test_cases(const CaseVec& cases) +{ + for (const auto& casus : cases ) { + const auto tokens = tokenize (casus.str); + + g_assert_cmpuint ((guint)tokens.size(),==,(guint)casus.tokens.size()); + for (size_t u = 0; u != tokens.size(); ++u) { + if (g_test_verbose()) { + std::cerr << "case " << u << " " << casus.str << std::endl; + std::cerr << "exp: '" << casus.tokens[u] << "'" << std::endl; + std::cerr << "got: '" << tokens[u] << "'" << std::endl; + + } + g_assert_true (tokens[u] == casus.tokens[u]); + } + } +} + +static void +test_basic () +{ + CaseVec cases = { + { "", {} }, + + { "foo", Tokens{Token{3, TT::Data, "foo"}}}, + + { "foo bar cuux", Tokens{Token{3, TT::Data, "foo"}, + Token{7, TT::Data, "bar"}, + Token{12, TT::Data, "cuux"}}}, + + { "\"foo bar\"", Tokens{ Token{9, TT::Data, "foo bar"}}}, + + // ie. 
ignore missing closing '"' + { "\"foo bar", Tokens{ Token{8, TT::Data, "foo bar"}}}, + + }; + + test_cases (cases); +} + +static void +test_specials () +{ + CaseVec cases = { + { ")*(", Tokens{Token{1, TT::Close, ")"}, + Token{2, TT::Data, "*"}, + Token{3, TT::Open, "("}}}, + { "\")*(\"", Tokens{Token{5, TT::Data, ")*("}}}, + }; + + test_cases (cases); +} + + +static void +test_ops () +{ + CaseVec cases = { + { "foo and bar oR cuux XoR fnorb", + Tokens{Token{3, TT::Data, "foo"}, + Token{7, TT::And, "and"}, + Token{11, TT::Data, "bar"}, + Token{14, TT::Or, "oR"}, + Token{19, TT::Data, "cuux"}, + Token{23, TT::Xor, "XoR"}, + Token{29, TT::Data, "fnorb"}}}, + { "NOT (aap or mies)", + Tokens{Token{3, TT::Not, "NOT"}, + Token{5, TT::Open, "("}, + Token{8, TT::Data, "aap"}, + Token{11, TT::Or, "or"}, + Token{16, TT::Data, "mies"}, + Token{17, TT::Close, ")"}}} + }; + + + test_cases (cases); +} + + +static void +test_escape () +{ + CaseVec cases = { + { "foo\"bar\"", Tokens{Token{3, TT::Data, "foo"}, + Token{8, TT::Data, "bar"}}}, + { "\"fnorb\"", Tokens{Token{7, TT::Data, "fnorb"}}}, + { "\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "\"fnorb\""}}}, + { "foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foo\"bar\""}}} + }; + + test_cases (cases); +} + + +int +main (int argc, char *argv[]) +{ + g_test_init (&argc, &argv, NULL); + + g_test_add_func ("/tokens/basic", test_basic); + g_test_add_func ("/tokens/specials", test_specials); + g_test_add_func ("/tokens/ops", test_ops); + g_test_add_func ("/tokens/escape", test_escape); + + return g_test_run (); +} diff --git a/lib/parser/test-utils.cc b/lib/parser/test-utils.cc new file mode 100644 index 00000000..7772b193 --- /dev/null +++ b/lib/parser/test-utils.cc @@ -0,0 +1,95 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. 
Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#include +#include + +#include +#include + +#include "parser.hh" +using namespace Mux; + +struct Case { + const std::string expr; + bool is_first; + const std::string expected; +}; +using CaseVec = std::vector; +using ProcFunc = std::function; + + +static void +test_cases(const CaseVec& cases, ProcFunc proc) +{ + for (const auto& casus : cases ) { + + const auto res = proc(casus.expr, casus.is_first); + if (g_test_verbose()) { + std::cout << "\n"; + std::cout << casus.expr << ' ' << casus.is_first << std::endl; + std::cout << "exp:" << casus.expected << std::endl; + std::cout << "got:" << res << std::endl; + } + + g_assert_true (casus.expected == res); + } +} + +static void +test_date () +{ + g_setenv ("TZ", "Europe/Helsinki", TRUE); + + CaseVec cases = { + { "2015-09-18T09:10:23", true, "001442556623" }, + { "1972-12-14T09:10:23", true, "000093165023" }, + { "1854-11-18T17:10:23", true, "000000000000" }, + { "fnorb", true, "000000000000" }, + { "fnorb", false, "999999999999" }, + { "", false, "999999999999" }, + { "", true, "000000000000" } + }; + + test_cases (cases, [](auto s, auto f){ return date_to_time_t_string(s,f); }); +} + +static void +test_size () +{ + CaseVec cases = { + { "456", true, "0000000456" 
}, + { "", false, "9999999999" }, + { "", true, "0000000000" }, + }; + + test_cases (cases, [](auto s, auto f){ return size_to_string(s,f); }); +} + + +int +main (int argc, char *argv[]) +{ + g_test_init (&argc, &argv, NULL); + + g_test_add_func ("/utils/process-date", test_date); + g_test_add_func ("/utils/process-size", test_size); + + return g_test_run (); +} diff --git a/lib/parser/tokenize.cc b/lib/parser/tokenize.cc new file mode 100644 index 00000000..13c17882 --- /dev/null +++ b/lib/parser/tokenize.cc @@ -0,0 +1,38 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#include +#include + +#include "tokenizer.hh" + +int +main (int argc, char *argv[]) +{ + std::string s; + + for (auto i = 1; i < argc; ++i) + s += " " + std::string(argv[i]); + + const auto tvec = Mux::tokenize (s); + for (const auto& t : tvec) + std::cout << t << std::endl; + + return 0; +} diff --git a/lib/parser/tokenizer.cc b/lib/parser/tokenizer.cc new file mode 100644 index 00000000..7267ff25 --- /dev/null +++ b/lib/parser/tokenizer.cc @@ -0,0 +1,128 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. 
Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#include "tokenizer.hh" +#include +#include +#include + +using namespace Mux; + +static bool +is_separator (char c) +{ + const auto seps = std::string (":()\""); + + if (isblank(c)) + return true; + else + return seps.find(c) != std::string::npos; +} + + +static Mux::Token +op_or_value (size_t pos, const std::string& val) +{ + auto s = val; + std::transform(s.begin(), s.end(), s.begin(), ::tolower); + + if (s == "and") + return Token{pos, Token::Type::And, val}; + else if (s == "or") + return Token{pos, Token::Type::Or, val}; + else if (s == "xor") + return Token{pos, Token::Type::Xor, val}; + else if (s == "not") + return Token{pos, Token::Type::Not, val}; + else + return Token{pos, Token::Type::Data, val}; +} + +static void +unread_char (std::string& food, char kar, size_t& pos) +{ + food = kar + food; + --pos; +} + +static Mux::Token +eat_token (std::string& food, size_t& pos) +{ + bool quoted{}; + bool escaped{}; + std::string value {}; + + while (!food.empty()) { + + const auto kar = food[0]; + food.erase(0, 1); + ++pos; + + if (kar == '\\') { + escaped = !escaped; + if (escaped) + continue; + } + + if (kar == '"' && !escaped && quoted) + return Token{pos, Token::Type::Data, value}; + + if (!quoted && 
!escaped && is_separator(kar)) { + + if (!value.empty() && kar != ':') { + unread_char (food, kar, pos); + return op_or_value(pos, value); + } + + if (kar == '"') + quoted = true; + + if (quoted || isblank(kar)) + continue; + + switch (kar) { + case '(': return {pos, Token::Type::Open, "("}; + case ')': return {pos, Token::Type::Close,")"}; + default: break; + } + } + + value += kar; + escaped = false; + } + + return {pos, Token::Type::Data, value}; +} + + +Mux::Tokens +Mux::tokenize (const std::string& s) +{ + Tokens tokens{}; + std::string food{s}; + size_t pos{0}; + + if (s.empty()) + return {}; + + while (!food.empty()) + tokens.emplace_back(eat_token (food, pos)); + + return tokens; +} diff --git a/lib/parser/tokenizer.hh b/lib/parser/tokenizer.hh new file mode 100644 index 00000000..1f0da014 --- /dev/null +++ b/lib/parser/tokenizer.hh @@ -0,0 +1,140 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. 
+*/ + +#ifndef __TOKENIZER_HH__ +#define __TOKENIZER_HH__ + +#include +#include +#include +#include +#include + +// A simple tokenizer, which turns a string into a deque of tokens +// +// It recognizes '(', ')', '*', 'and', 'or', 'xor', 'not' +// +// Note that even if we recognize those at the lexical level, they might be demoted to mere strings +// when we're creating the parse tree. +// +// Furthermore, we detect ranges ("a..b") and regexps (/../) at the parser level, since we need a +// bit more context to resolve ambiguities. + +namespace Mux { + +// A token +struct Token { + enum class Type { + Data, /**< e.g., banana or date:..456 */ + + // Brackets + Open, /**< ( */ + Close, /**< ) */ + + // Unops + Not, /**< logical not */ + + // Binops + And, /**< logical and */ + Or, /**< logical or */ + Xor, /**< logical xor */ + + Empty, /**< nothing */ + }; + + size_t pos{}; /**< position in string */ + Type type{}; /**< token type */ + const std::string str{}; /**< data for this token */ + + /** + * operator== + * + * @param rhs right-hand side + * + * @return true if rhs is equal to this; false otherwise + */ + bool operator==(const Token& rhs) const { + return pos == rhs.pos && + type == rhs.type && + str == rhs.str; + } +}; + +/** + * operator<< + * + * @param os an output stream + * @param t a token type + * + * @return the updated output stream + */ +inline std::ostream& +operator<< (std::ostream& os, Token::Type t) +{ + switch (t) { + case Token::Type::Data: os << ""; break; + + case Token::Type::Open: os << ""; break; + case Token::Type::Close: os << "";break; + + case Token::Type::Not: os << ""; break; + case Token::Type::And: os << ""; break; + case Token::Type::Or: os << ""; break; + case Token::Type::Xor: os << ""; break; + + default: // unexpected value; note that Type::Empty has no case above and also ends up here + throw std::runtime_error ("<>"); + } + + return os; +} + +/** + * operator<< + * + * @param os an output stream + * @param t a token + * + * @return the updated output stream + */ 
+inline std::ostream& +operator<< (std::ostream& os, const Token& t) +{ + os << t.pos << ": " << t.type; + + if (!t.str.empty()) + os << " [" << t.str << "]"; + + return os; +} + +/** + * Tokenize a string into a deque of tokens. The tokenization always succeeds, i.e., errors such as + * a missing closing '"' are ignored. + * + * @param s a string + * + * @return a deque of tokens + */ +using Tokens = std::deque; +Tokens tokenize (const std::string& s); + +} // namespace Mux + +#endif /* __TOKENIZER_HH__ */ diff --git a/lib/parser/tree.hh b/lib/parser/tree.hh new file mode 100644 index 00000000..6ceaaa9d --- /dev/null +++ b/lib/parser/tree.hh @@ -0,0 +1,104 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. 
+*/ + +#include +#include +#include + +#include + +namespace Mux { + +// A node in the parse tree +struct Node { + enum class Type { + Empty, // only for empty trees + OpAnd, + OpOr, + OpXor, + OpAndNot, + OpNot, + Value, + Range, + Invalid + }; + + Node(Type _type, std::unique_ptr&& _data): + type{_type}, data{std::move(_data)} {} + Node(Type _type): type{_type} {} + Node(Node&& rhs) = default; + + Type type; + std::unique_ptr data; + + static constexpr const char* type_name (Type t) { + switch (t) { + case Type::Empty: return ""; break; + case Type::OpAnd: return "and"; break; + case Type::OpOr: return "or"; break; + case Type::OpXor: return "xor"; break; + case Type::OpAndNot: return "andnot"; break; + case Type::OpNot: return "not"; break; + case Type::Value: return "value"; break; + case Type::Range: return "range"; break; + case Type::Invalid: return ""; break; + default: + throw std::runtime_error ("bug"); + } + } + + static constexpr bool is_binop(Type t) { + return t == Type::OpAnd || t == Type::OpAndNot || + t == Type::OpOr || t == Type::OpXor; + } +}; + +inline std::ostream& +operator<< (std::ostream& os, const Node& t) +{ + os << Node::type_name(t.type); + if (t.data) + os << t.data; + + return os; +} + +struct Tree { + Tree(Node&& _node): node(std::move(_node)) {} + Tree(Tree&& rhs) = default; + + void add_child (Tree&& child) { children.emplace_back(std::move(child)); } + bool empty() const { return node.type == Node::Type::Empty; } + + Node node; + std::vector children; +}; + +inline std::ostream& +operator<< (std::ostream& os, const Tree& tree) +{ + os << '(' << tree.node; + for (const auto& subtree : tree.children) + os << subtree; + os << ')'; + + return os; +} + +} // namespace Mux diff --git a/lib/parser/utils.cc b/lib/parser/utils.cc new file mode 100644 index 00000000..26fbdaae --- /dev/null +++ b/lib/parser/utils.cc @@ -0,0 +1,349 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. 
Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#define GNU_SOURCE +#include +#include + +#include "utils.hh" + +#include +#include +#include + +#include + +using namespace Mux; + +namespace { + +static gunichar +unichar_tolower (gunichar uc) +{ + if (!g_unichar_isalpha(uc)) + return uc; + + if (g_unichar_get_script (uc) != G_UNICODE_SCRIPT_LATIN) + return g_unichar_tolower (uc); + + switch (uc) + { + case 0x00e6: + case 0x00c6: return 'e'; /* æ */ + case 0x00f8: return 'o'; /* ø */ + case 0x0110: + case 0x0111: return 'd'; /* đ */ + /* todo: many more */ + default: return g_unichar_tolower (uc); + } +} + +/** + * gx_utf8_flatten: + * @str: a UTF-8 string + * @len: the length of @str, or -1 if it is %NULL-terminated + * + * Flatten some UTF-8 string; that is, downcase it and remove any diacritics. + * + * Returns: (transfer full): a flattened string, free with g_free(). 
+ */ +static char* +gx_utf8_flatten (const gchar *str, gssize len) +{ + GString *gstr; + char *norm, *cur; + + g_return_val_if_fail (str, NULL); + + norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL); + if (!norm) + return NULL; + + gstr = g_string_sized_new (strlen (norm)); + + for (cur = norm; cur && *cur; cur = g_utf8_next_char (cur)) + { + gunichar uc; + + uc = g_utf8_get_char (cur); + if (g_unichar_combining_class (uc) != 0) + continue; + + g_string_append_unichar (gstr, unichar_tolower(uc)); + } + + g_free (norm); + + return g_string_free (gstr, FALSE); +} + +} // namespace + + +std::string // gx_utf8_flatten +Mux::utf8_flatten (const std::string& str) +{ + char *flat = gx_utf8_flatten (str.c_str(), str.length()); + if (!flat) + return {}; + + std::string s(flat); + g_free (flat); + + return s; +} + + +std::string +Mux::quote (const std::string& str) +{ + char *s = g_strescape (str.c_str(), NULL); + if (!s) + return {}; + + std::string res (s); + g_free (s); + + return "\"" + res + "\""; +} + + std::string + Mux::format (const char *frm, ...) + { + va_list args; + + va_start (args, frm); + + char *s = {}; + const auto res = vasprintf (&s, frm, args); + va_end (args); + if (res == -1) { + std::cerr << "string format failed" << std::endl; + return {}; + } + + std::string str = s; + free (s); + + return str; + } + +constexpr const auto InternalDateFormat = "%012" G_GINT64_FORMAT; +constexpr const char InternalDateMin[] = "000000000000"; +constexpr const char InternalDateMax[] = "999999999999"; +static_assert(sizeof(InternalDateMin) == 12 + 1); +static_assert(sizeof(InternalDateMax) == 12 + 1); + +static std::string +date_boundary (bool is_first) +{ + return is_first ? 
InternalDateMin : InternalDateMax; +} + +std::string +Mux::date_to_time_t_string (time_t t) +{ + char buf[sizeof(InternalDateMax)]; + snprintf (buf, sizeof(buf), InternalDateFormat, t); + + return buf; +} + + +static std::string +delta_ymwdhMs (const std::string& expr) +{ + char *endptr; + auto num = strtol (expr.c_str(), &endptr, 10); + if (num <= 0 || num > 9999 || !endptr || !*endptr) + return date_boundary (true); + + int years, months, weeks, days, hours, minutes, seconds; + years = months = weeks = days = hours = minutes = seconds = 0; + + switch (endptr[0]) { + case 's': seconds = num; break; + case 'M': minutes = num; break; + case 'h': hours = num; break; + case 'd': days = num; break; + case 'w': weeks = num; break; + case 'm': months = num; break; + case 'y': years = num; break; + default: + return date_boundary (true); + } + + GDateTime *then, *now = g_date_time_new_now_local (); + if (weeks != 0) + then = g_date_time_add_weeks (now, -weeks); + else + then = g_date_time_add_full (now, -years, -months,-days, + -hours, -minutes, -seconds); + + time_t t = MAX (0, (gint64)g_date_time_to_unix (then)); + + g_date_time_unref (then); + g_date_time_unref (now); + + return date_to_time_t_string (t); +} + + +static std::string +special_date (const std::string& d, bool is_first) +{ + if (d == "now") + return date_to_time_t_string (time(NULL)); + + else if (d == "today") { + + GDateTime *dt, *midnight; + dt = g_date_time_new_now_local (); + + if (!is_first) { + GDateTime *tmp = dt; + dt = g_date_time_add_days (dt, 1); + g_date_time_unref (tmp); + } + + midnight = g_date_time_add_full (dt, 0, 0, 0, + -g_date_time_get_hour(dt), + -g_date_time_get_minute (dt), + -g_date_time_get_second (dt)); + time_t t = MAX(0, (gint64)g_date_time_to_unix (midnight)); + g_date_time_unref (dt); + g_date_time_unref (midnight); + return date_to_time_t_string ((time_t)t); + + } else + return date_boundary (is_first); +} + + +constexpr const char UserDateMin[] = "19700101000000"; 
+constexpr const char UserDateMax[] = "29993112235959"; + +std::string +Mux::date_to_time_t_string (const std::string& dstr, bool is_first) +{ + gint64 t; + struct tm tbuf; + GDateTime *dtime; + + /* one-sided dates */ + if (dstr.empty()) + return date_boundary (is_first); + else if (is_first && dstr.find_first_of("ymdwhMs") != std::string::npos) + return delta_ymwdhMs (dstr); + + std::string date (is_first ? UserDateMin : UserDateMax); + std::copy_if (dstr.begin(), dstr.end(), date.begin(),[](auto c){return isdigit(c);}); + + memset (&tbuf, 0, sizeof tbuf); + if (!strptime (date.c_str(), "%Y%m%d%H%M%S", &tbuf) && + !strptime (date.c_str(), "%Y%m%d%H%M", &tbuf) && + !strptime (date.c_str(), "%Y%m%d", &tbuf) && + !strptime (date.c_str(), "%Y%m", &tbuf) && + !strptime (date.c_str(), "%Y", &tbuf)) + return special_date (date, is_first); + + dtime = g_date_time_new_local (tbuf.tm_year + 1900, + tbuf.tm_mon + 1, + tbuf.tm_mday, + tbuf.tm_hour, + tbuf.tm_min, + tbuf.tm_sec); + if (!dtime) { + g_warning ("invalid %s date '%s'", + is_first ? "lower" : "upper", date.c_str()); + return date_boundary (is_first); + } + + t = (gint64)g_date_time_to_unix (dtime); + g_date_time_unref (dtime); + + if (t < 0 || t > 9999999999) + return date_boundary (is_first); + else + return date_to_time_t_string (t); +} + + +constexpr const auto SizeFormat = "%010" G_GINT64_FORMAT; + +constexpr const char SizeMin[] = "0000000000"; +constexpr const char SizeMax[] = "9999999999"; +static_assert(sizeof(SizeMin) == 10 + 1); +static_assert(sizeof(SizeMax) == 10 + 1); + +static std::string +size_boundary (bool is_first) +{ + return is_first ? 
SizeMin : SizeMax; +} + +std::string +Mux::size_to_string (int64_t size) +{ + char buf[sizeof(SizeMax)]; + snprintf (buf, sizeof(buf), SizeFormat, size); + + return buf; +} + +std::string +Mux::size_to_string (const std::string& val, bool is_first) +{ + std::string str; + GRegex *rx; + GMatchInfo *minfo; + + /* one-sided ranges */ + if (val.empty()) + return size_boundary (is_first); + + rx = g_regex_new ("(\\d+)(b|k|kb|m|mb|g|gb)?", + G_REGEX_CASELESS, (GRegexMatchFlags)0, NULL); + minfo = NULL; + if (g_regex_match (rx, val.c_str(), (GRegexMatchFlags)0, &minfo)) { + gint64 size; + char *s; + + s = g_match_info_fetch (minfo, 1); + size = atoll (s); + g_free (s); + + s = g_match_info_fetch (minfo, 2); + switch (s ? g_ascii_tolower(s[0]) : 0) { + case 'k': size *= 1024; break; + case 'm': size *= (1024 * 1024); break; + case 'g': size *= (1024 * 1024 * 1024); break; + default: break; + } + + g_free (s); + str = size_to_string (size); + } else + str = size_boundary (is_first); + + g_regex_unref (rx); + g_match_info_unref (minfo); + + return str; +} diff --git a/lib/parser/utils.hh b/lib/parser/utils.hh new file mode 100644 index 00000000..2a916430 --- /dev/null +++ b/lib/parser/utils.hh @@ -0,0 +1,100 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. 
+** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#include + +#ifndef __UTILS_HH__ +#define __UTILS_HH__ + +namespace Mux { + +/** + * Flatten a string -- downcase and fold diacritics etc. + * + * @param str a string + * + * @return a flattened string + */ +std::string utf8_flatten (const std::string& str); + +/** + * Quote & escape a string + * + * @param str a string + * + * @return quoted string + */ +std::string quote (const std::string& str); + +/** + * Format a string, printf style + * + * @param frm format string + * @param ... parameters + * + * @return a formatted string + */ +std::string format (const char *frm, ...) + __attribute__((format(printf, 1, 2))); + +/** + * Convert an ISO date to the corresponding time expressed as a string + * with a zero-padded, 12-digit time_t + * + * @param date the date expression; empty for an open-ended range + * @param first true for the lower bound of a range, false for the upper bound + * + * @return the time_t string, or the lower/upper boundary value if the date cannot be converted + */ +std::string date_to_time_t_string (const std::string& date, bool first); + +/** + * time_t expressed as a string with a zero-padded, 12-digit time_t + * + * @param t a time_t value + * + * @return the time_t string + */ +std::string date_to_time_t_string (time_t t); + + + +/** + * Convert a size string to a size in bytes + * + * @param sizestr the size string + * @param first true for the lower bound of a range, false for the upper bound + * + * @return the size expressed as a string with the decimal number of bytes, zero-padded to 10 digits + */ +std::string size_to_string (const std::string& sizestr, bool first); + +/** + * Convert a size into a size in bytes string + * + * @param size the size, + * as a number of bytes + * + * @return the size expressed as a string with the decimal number of bytes, zero-padded to 10 digits + */ +std::string size_to_string (int64_t size); + +} // namespace Mux + +#endif /* __UTILS_HH__ */ diff --git a/lib/parser/xapian.cc b/lib/parser/xapian.cc new file mode 100644 index 00000000..a14e304f --- /dev/null +++ b/lib/parser/xapian.cc @@ -0,0 +1,75 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. 
Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + +#include +#include "parser/xapian.hh" + +using namespace Mux; + +static Xapian::Query +xapian_query_op (const Mux::Tree& tree) +{ + Xapian::Query::op op; + + switch (tree.node.type) { + case Node::Type::OpNot: // OpNot x ::= AND NOT x + if (tree.children.size() != 1) + throw std::runtime_error ("invalid # of children"); + return Xapian::Query (Xapian::Query::OP_AND_NOT, + Xapian::Query::MatchAll, + xapian_query(tree.children.front())); + case Node::Type::OpAnd: op = Xapian::Query::OP_AND; break; + case Node::Type::OpOr: op = Xapian::Query::OP_OR; break; + case Node::Type::OpXor: op = Xapian::Query::OP_XOR; break; + case Node::Type::OpAndNot: op = Xapian::Query::OP_AND_NOT; break; + default: throw std::runtime_error ("invalid op"); // bug + } + + std::vector childvec; + for (const auto& subtree: tree.children) + childvec.emplace_back(xapian_query(subtree)); + + return Xapian::Query(op, childvec.begin(), childvec.end()); +} + +Xapian::Query +Mux::xapian_query (const Mux::Tree& tree) +{ + switch (tree.node.type) { + case Node::Type::Empty: + return Xapian::Query(); + case Node::Type::OpNot: + case Node::Type::OpAnd: + case Node::Type::OpOr: + case Node::Type::OpXor: + case Node::Type::OpAndNot: + return 
xapian_query_op (tree); + case Node::Type::Value: { + const auto v = dynamic_cast (tree.node.data.get()); + return Xapian::Query(v->prefix + v->value); + } + case Node::Type::Range: { + const auto r = dynamic_cast (tree.node.data.get()); + return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, + (Xapian::valueno)r->id, r->lower, r->upper); + } + default: + throw std::runtime_error ("invalid query"); // bug + } +} diff --git a/lib/parser/xapian.hh b/lib/parser/xapian.hh new file mode 100644 index 00000000..4f30a01b --- /dev/null +++ b/lib/parser/xapian.hh @@ -0,0 +1,40 @@ +/* +** Copyright (C) 2017 Dirk-Jan C. Binnema +** +** This library is free software; you can redistribute it and/or +** modify it under the terms of the GNU Lesser General Public License +** as published by the Free Software Foundation; either version 2.1 +** of the License, or (at your option) any later version. +** +** This library is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +** Lesser General Public License for more details. +** +** You should have received a copy of the GNU Lesser General Public +** License along with this library; if not, write to the Free +** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA +** 02110-1301, USA. +*/ + + +#ifndef __XAPIAN_HH__ +#define __XAPIAN_HH__ + +#include +#include + +namespace Mux { + +/** + * Transform a parse-tree into a Xapian query object + * + * @param tree a parse tree + * + * @return a Xapian query object + */ +Xapian::Query xapian_query (const Mux::Tree& tree); + +}; + +#endif /* __XAPIAN_H__ */