query: move phrasification to mu-query-parser
Do the "phrasification" for matching fields later during query parsing; this allows for handling combination fields correctly. Also match both the normal term and the "phrase term", so we catch more cases. Update/extend unit tests. This fixes the "kata-container" issue also for body test. Fixes #2167.
@@ -76,6 +76,42 @@ struct ParseContext {
 };
 
 
+
+
+/**
+ * Indexable fields become _phrase_ fields if they contain
+ * wordbreakable data;
+ *
+ * @param field
+ * @param val
+ *
+ * @return
+ */
+static Option<Sexp>
+phrasify(const Field& field, const Sexp& val)
+{
+        if (!field.is_phrasable_term() || !val.stringp())
+                return Nothing; // nothing to phrasify
+
+        auto words{utf8_wordbreak(val.string())};
+        if (words.find(' ') == std::string::npos)
+                return Nothing; // nothing to phrasify
+
+        auto phrase = Sexp {
+                Sexp::Symbol{field.name},
+                Sexp{phrase_sym, Sexp{std::move(words)}}};
+
+        // if the field is both a normal term & phrasable, match both
+        // if they are different
+        if (val.string() != words)
+                return Sexp{or_sym,
+                        Sexp{Sexp::Symbol{field.name}, Sexp(val.string())},
+                        std::move(phrase)};
+        else
+                return phrase;
+}
+
+
 /*
  * Grammar
  *
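Concretely, for a phrasable field such as subject, phrasify() turns the matcher

        (subject "hello-world")

into

        (or (subject "hello-world") (subject (phrase "hello world")))

so that both the literal hyphenated term and the word-broken phrase can match; this is the same shape described in the comment in matcher() below.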
@@ -87,6 +123,7 @@ struct ParseContext {
 
 static Sexp query(Sexp& tokens, ParseContext& ctx);
 
+
 static Sexp
 matcher(Sexp& tokens, ParseContext& ctx)
 {
@@ -95,22 +132,38 @@ matcher(Sexp& tokens, ParseContext& ctx)
 
         auto val{*tokens.head()};
         tokens.pop_front();
-        /* special case: if we find some non-matcher type here, we need to
-         * second-guess the tokenizer */
+        /* special case: if we find some non-matcher type here, we need to second-guess the token */
         if (!looks_like_matcher(val))
                 val = Sexp{placeholder_sym, val.symbol().name};
 
+        const auto fieldsym{val.front().symbol()};
+
+        // Note the _expand_ case is what we use when processing the query 'for real';
+        // the non-expand case is only there to have a bit more human-readable Sexp for
+        // use with mu find's '--analyze'
+        //
+        // Re: phrase-fields, we map something like 'subject:hello-world'
+        // to
+        // (or (subject "hello-world") (subject (phrase "hello world")))
+
         if (ctx.expand) { /* should we expand meta-fields? */
-                const auto symbol{val.front().symbol()};
-                const auto fields = fields_from_name(symbol == placeholder_sym ? "" : symbol.name);
+                auto fields = fields_from_name(fieldsym == placeholder_sym ? "" : fieldsym.name);
                 if (!fields.empty()) {
                         Sexp vals{};
                         vals.add(or_sym);
                         for (auto&& field: fields)
+                                if (auto&& phrase{phrasify(field, *second(val))}; phrase)
+                                        vals.add(std::move(*phrase));
+                                else
                                         vals.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}});
                         val = std::move(vals);
                 }
+
+        }
+
+        if (auto&& field{field_from_name(fieldsym.name)}; field) {
+                if (auto&& phrase(phrasify(*field, *second(val))); phrase)
+                        val = std::move(*phrase);
         }
 
         return val;
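The same happens for a bare term with no explicit field: fields_from_name("") returns the combination of default search fields, and each one is run through phrasify(). Schematically (the concrete field set below is illustrative only; which fields are in the combination, and which are phrasable, depends on mu's field definitions):

        kata-containers
        =>
        (or (or (subject "kata-containers") (subject (phrase "kata containers")))
            (or (body "kata-containers") (body (phrase "kata containers")))
            ...)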
@@ -74,7 +74,6 @@ struct Element {
         ValueType value{};
 };
 struct Basic: public FieldValue<std::string> {using FieldValue::FieldValue;};
-struct Phrase: public FieldValue<std::string> {using FieldValue::FieldValue;};
 struct Regex: public FieldValue<std::string> {using FieldValue::FieldValue;};
 struct Wildcard: public FieldValue<std::string> {using FieldValue::FieldValue;};
 struct Range: public FieldValue<std::pair<std::string, std::string>> {
@@ -89,7 +88,6 @@ struct Element {
                 std::string,
                 /* value types */
                 Basic,
-                Phrase,
                 Regex,
                 Wildcard,
                 Range
@@ -152,9 +150,6 @@ struct Element {
                         }
                 } else if constexpr (std::is_same_v<T, Basic>) {
                         return Sexp { field_sym(arg.field), arg.value };
-                } else if constexpr (std::is_same_v<T, Phrase>) {
-                        return Sexp {field_sym(arg.field),
-                                Sexp{ phrase_sym, arg.value }};
                 } else if constexpr (std::is_same_v<T, Regex>) {
                         return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}};
                 } else if constexpr (std::is_same_v<T, Wildcard>) {
@@ -337,24 +332,6 @@ basify(Element&& element)
         return element;
 }
 
-static Option<Element>
-phrasify(Element&& element)
-{
-        auto&& basic{element.get_opt<Element::Basic>()};
-        if (!basic)
-                return element;
-
-        auto&& field = field_from_name(*basic->field);
-        if (!field || field->is_indexable_term()) {
-                auto&& val{basic->value};
-                if (val.find(' ') != std::string::npos)
-                        element.value = Element::Phrase{basic->field, val};
-        }
-
-        return element;
-}
-
-
 static Option<Element>
 wildcardify(Element&& element)
 {
@@ -467,7 +444,7 @@ process(const std::string& expr)
                 .and_then(opify)
                 .and_then(basify)
                 .and_then(regexpify)
-                .and_then(phrasify)
+                //.and_then(phrasify)
                 .and_then(wildcardify)
                 .and_then(rangify);
         if (element)
@@ -527,10 +504,6 @@ test_processor()
         std::vector<TestCase> cases = {
                 // basics
                 TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"},
-                TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"},
-                TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"},
-
-                // maildir must _not_ be phrasified
                 TestCase{R"(maildir:/"hello world")", R"(((maildir "/hello world")))"},
         };
 
@@ -89,7 +89,7 @@ string_nth(const Sexp& args, size_t n)
 static Result<Xapian::Query>
 phrase(const Field& field, Sexp&& s)
 {
-        if (!field.is_indexable_term())
+        if (!field.is_phrasable_term())
                 return Err(Error::Code::InvalidArgument,
                            "field {} does not support phrases", field.name);
 
@@ -273,13 +273,12 @@ parse_basic(const Field &field, Sexp &&vals, Mu::ParserFlags flags)
         default: {
                 auto q{Xapian::Query{field.xapian_term(val)}};
                 if (ngrams) { // special case: cjk; see if we can create an expanded query.
-                        if (field.is_indexable_term() && contains_unbroken_script(val))
+                        if (field.is_phrasable_term() && contains_unbroken_script(val))
                                 if (auto&& ng{ngram_expand(field, val)}; ng)
                                         return ng;
                 }
                 return q;
         }}
-
 }
 
 static Result<Xapian::Query>
@@ -420,6 +419,8 @@ using TestCase = std::pair<std::string, std::string>;
 static void
 test_xapian()
 {
+        allow_warnings();
+
         auto&& testhome{unwrap(make_temp_dir())};
         auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)};
         auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))};
@@ -429,7 +430,8 @@ test_xapian()
         auto&& zz{make_xapian_query(store, R"(subject:"hello world")")};
         assert_valid_result(zz);
         /* LCOV_EXCL_START*/
-        if (zz->get_description() != R"(Query((Shello PHRASE 2 Sworld)))") {
+        if (zz->get_description() != R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))") {
+                mu_println("{}", zz->get_description());
                 if (mu_test_mu_hacker()) {
                         // in the mu hacker case, we want to be warned if Xapian changed.
                         g_critical("xapian version mismatch");
@@ -446,7 +448,8 @@ test_xapian()
                 TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"},
                 TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"},
                 TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"},
-                TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"},
+                TestCase{R"(subject:"hello world")",
+                        R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))"},
                 TestCase{R"(subject:/boo/")", R"(Query())"},
 
                 // ranges.
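On the Xapian side, the new expected descriptions correspond to OR-ing a single literal term with an OP_PHRASE query over the word-broken terms. A minimal sketch using the Xapian C++ API directly (the 'S' subject prefix and term spellings are taken from the expected test output above; this is not mu's term-generation code):

#include <xapian.h>
#include <iostream>
#include <string>
#include <vector>

int main()
{
        // literal, un-broken term as indexed under the subject prefix 'S'
        Xapian::Query literal("Shello world");

        // phrase query over the individual word-broken terms
        const std::vector<std::string> words{"Shello", "Sworld"};
        Xapian::Query phrase(Xapian::Query::OP_PHRASE, words.begin(), words.end());

        // match either form
        Xapian::Query query(Xapian::Query::OP_OR, literal, phrase);

        // expected (per the test case above; exact formatting may vary across Xapian versions):
        // Query((Shello world OR (Shello PHRASE 2 Sworld)))
        std::cout << query.get_description() << '\n';
}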
@@ -131,6 +131,7 @@ I said: "Aujourd'hui!"
         }};
         TempDir tdir;
         auto store{make_test_store(tdir.path(), test_msgs, {})};
+        store.commit();
 
         // matches
         for (auto&& expr: {
@@ -692,6 +693,8 @@ Date: Wed, 26 Oct 2022 11:01:54 -0700
 To: example@example.com
 Subject: kata-containers
 
+voodoo-containers
+
 Boo!
 )"},
         }};
|
|||||||
TempDir tdir;
|
TempDir tdir;
|
||||||
auto store{make_test_store(tdir.path(), test_msgs, {})};
|
auto store{make_test_store(tdir.path(), test_msgs, {})};
|
||||||
/* true: match; false: no match */
|
/* true: match; false: no match */
|
||||||
const auto cases = std::array<std::pair<const char*, bool>, 3>{{
|
const auto cases = std::vector<std::pair<const char*, bool>>{{
|
||||||
{"subject:kata", true},
|
{"subject:kata", true},
|
||||||
{"subject:containers", true},
|
{"subject:containers", true},
|
||||||
{"subject:kata-containers", true}
|
{"subject:kata-containers", true},
|
||||||
|
{"subject:\"kata containers\"", true},
|
||||||
|
{"voodoo-containers", true},
|
||||||
|
{"voodoo containers", true}
|
||||||
}};
|
}};
|
||||||
|
|
||||||
for (auto&& test: cases) {
|
for (auto&& test: cases) {
|
||||||
|
|||||||
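With phrasification done at query-parse time, the hyphenated cases from #2167 match however the query is written; for example (illustrative invocations):

        mu find subject:kata-containers
        mu find subject:containers
        mu find 'subject:"kata containers"'
        mu find voodoo-containers

each of which should now find the test message above.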