From 7cd7d118e284df0e17967f5874afdd3806f10c4a Mon Sep 17 00:00:00 2001 From: djcb Date: Thu, 26 Oct 2017 21:31:22 +0300 Subject: [PATCH] query-parser: support phrase queries --- lib/parser/test-tokenizer.cc | 7 +++--- lib/parser/tokenizer.cc | 20 +++++++++-------- lib/parser/utils.cc | 14 ++++++++++++ lib/parser/utils.hh | 12 +++++++++++ lib/parser/xapian.cc | 42 ++++++++++++++++++++++++++++-------- 5 files changed, 73 insertions(+), 22 deletions(-) diff --git a/lib/parser/test-tokenizer.cc b/lib/parser/test-tokenizer.cc index 2e4cdeec..a773890c 100644 --- a/lib/parser/test-tokenizer.cc +++ b/lib/parser/test-tokenizer.cc @@ -118,11 +118,10 @@ static void test_escape () { CaseVec cases = { - { "foo\"bar\"", Tokens{Token{3, TT::Data, "foo"}, - Token{8, TT::Data, "bar"}}}, + { "foo\"bar\"", Tokens{Token{8, TT::Data, "foobar"}}}, { "\"fnorb\"", Tokens{Token{7, TT::Data, "fnorb"}}}, - { "\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "\"fnorb\""}}}, - { "foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foo\"bar\""}}} + { "\\\"fnorb\\\"", Tokens{Token{9, TT::Data, "fnorb"}}}, + { "foo\\\"bar\\\"", Tokens{Token{10, TT::Data, "foobar"}}} }; test_cases (cases); diff --git a/lib/parser/tokenizer.cc b/lib/parser/tokenizer.cc index 7267ff25..75e0dfcf 100644 --- a/lib/parser/tokenizer.cc +++ b/lib/parser/tokenizer.cc @@ -27,12 +27,11 @@ using namespace Mux; static bool is_separator (char c) { - const auto seps = std::string (":()\""); - if (isblank(c)) return true; - else - return seps.find(c) != std::string::npos; + + const auto seps = std::string ("()"); + return seps.find(c) != std::string::npos; } @@ -80,8 +79,14 @@ eat_token (std::string& food, size_t& pos) continue; } - if (kar == '"' && !escaped && quoted) - return Token{pos, Token::Type::Data, value}; + if (kar == '"') { + if (!escaped && quoted) + return Token{pos, Token::Type::Data, value}; + else { + quoted = true; + continue; + } + } if (!quoted && !escaped && is_separator(kar)) { @@ -90,9 +95,6 @@ eat_token (std::string& 
food, size_t& pos) return op_or_value(pos, value); } - if (kar == '"') - quoted = true; - if (quoted || isblank(kar)) continue; diff --git a/lib/parser/utils.cc b/lib/parser/utils.cc index 6b27991d..a21b26cc 100644 --- a/lib/parser/utils.cc +++ b/lib/parser/utils.cc @@ -110,6 +110,20 @@ Mux::utf8_flatten (const std::string& str) } +std::vector<std::string> +Mux::split (const std::string& str, const std::string& sepa) +{ + char **parts = g_strsplit(str.c_str(), sepa.c_str(), -1); + std::vector<std::string> vec; + for (auto part = parts; part && *part; ++part) + vec.push_back (*part); + + g_strfreev(parts); + + return vec; +} + + std::string Mux::quote (const std::string& str) { diff --git a/lib/parser/utils.hh b/lib/parser/utils.hh index 2a916430..f9389f9e 100644 --- a/lib/parser/utils.hh +++ b/lib/parser/utils.hh @@ -18,6 +18,7 @@ */ #include <string> +#include <vector> #ifndef __UTILS_HH__ #define __UTILS_HH__ @@ -33,6 +34,17 @@ namespace Mux { */ std::string utf8_flatten (const std::string& str); +/** + * Split a string in parts + * + * @param str a string + * @param sepa the separator + * + * @return the parts. 
+ */ +std::vector<std::string> split (const std::string& str, + const std::string& sepa); + /** * Quote & escape a string * diff --git a/lib/parser/xapian.cc b/lib/parser/xapian.cc index a14e304f..229c9ce5 100644 --- a/lib/parser/xapian.cc +++ b/lib/parser/xapian.cc @@ -48,6 +48,35 @@ xapian_query_op (const Mux::Tree& tree) return Xapian::Query(op, childvec.begin(), childvec.end()); } +static Xapian::Query +xapian_query_value (const Mux::Tree& tree) +{ + const auto v = dynamic_cast<Value*> (tree.node.data.get()); + const auto parts = split (v->value, " "); + + std::vector<Xapian::Query> phvec; + for (const auto p: parts) + phvec.push_back(Xapian::Query(v->prefix + p)); + + if (parts.empty()) + return Xapian::Query::MatchNothing; // shouldn't happen + + if (parts.size() == 1) + return phvec.front(); + + return Xapian::Query (Xapian::Query::OP_PHRASE, + phvec.begin(), phvec.end()); +} + +static Xapian::Query +xapian_query_range (const Mux::Tree& tree) +{ + const auto r = dynamic_cast<Range*> (tree.node.data.get()); + return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, + (Xapian::valueno)r->id, r->lower, r->upper); +} + + Xapian::Query Mux::xapian_query (const Mux::Tree& tree) { @@ -60,15 +89,10 @@ Mux::xapian_query (const Mux::Tree& tree) case Node::Type::OpXor: case Node::Type::OpAndNot: return xapian_query_op (tree); - case Node::Type::Value: { - const auto v = dynamic_cast<Value*> (tree.node.data.get()); - return Xapian::Query(v->prefix + v->value); - } - case Node::Type::Range: { - const auto r = dynamic_cast<Range*> (tree.node.data.get()); - return Xapian::Query(Xapian::Query::OP_VALUE_RANGE, - (Xapian::valueno)r->id, r->lower, r->upper); - } + case Node::Type::Value: + return xapian_query_value (tree); + case Node::Type::Range: + return xapian_query_range (tree); default: throw std::runtime_error ("invalid query"); // bug }