diff --git a/lib/mu-query-parser.cc b/lib/mu-query-parser.cc index 77333d2e..3af828d2 100644 --- a/lib/mu-query-parser.cc +++ b/lib/mu-query-parser.cc @@ -76,6 +76,42 @@ struct ParseContext { }; + + +/** + * Phrasable fields become _phrase_ fields if their value contains + * wordbreakable data (i.e. a space after utf8_wordbreak()). + * + * @param field the field the value is matched against + * @param val the value to match; only string values are considered + * + * @return a phrase Sexp (or'ed with the plain term when that differs), or Nothing + */ +static Option +phrasify(const Field& field, const Sexp& val) +{ + if (!field.is_phrasable_term() || !val.stringp()) + return Nothing; // nothing to phrasify + + auto words{utf8_wordbreak(val.string())}; + if (words.find(' ') == std::string::npos) + return Nothing; // nothing to phrasify + + auto phrase = Sexp { + Sexp::Symbol{field.name}, + Sexp{phrase_sym, Sexp{std::move(words)}}}; + + // if the field is both a normal term & phrasable, match both + // if they are different. NOTE(review): `words` was passed through std::move() above, so this may compare against a moved-from string -- confirm + if (val.string() != words) + return Sexp{or_sym, + Sexp {Sexp::Symbol{field.name}, Sexp(val.string())}, + std::move(phrase)}; + else + return phrase; +} + + /* * Grammar * @@ -87,6 +123,7 @@ struct ParseContext { static Sexp query(Sexp& tokens, ParseContext& ctx); + static Sexp matcher(Sexp& tokens, ParseContext& ctx) { @@ -95,22 +132,38 @@ matcher(Sexp& tokens, ParseContext& ctx) auto val{*tokens.head()}; tokens.pop_front(); - - /* special case: if we find some non-matcher type here, we need to - * second-guess the tokenizer */ + /* special case: if we find some non-matcher type here, we need to second-guess the token */ if (!looks_like_matcher(val)) val = Sexp{placeholder_sym, val.symbol().name}; + const auto fieldsym{val.front().symbol()}; + + // Note the _expand_ case is what we use when processing the query 'for real'; + // the non-expand case is only to have a bit more human-readable Sexp for use + // by mu find's '--analyze' + // + // Re: phrase-fields: we map something like 'subject:hello-world' + // to + // (or (subject "hello-world") (subject (phrase "hello world"))) + if (ctx.expand) { /* should we expand meta-fields? 
*/ - const auto symbol{val.front().symbol()}; - const auto fields = fields_from_name(symbol == placeholder_sym ? "" : symbol.name); + auto fields = fields_from_name(fieldsym == placeholder_sym ? "" : fieldsym.name); if (!fields.empty()) { Sexp vals{}; vals.add(or_sym); for (auto&& field: fields) - vals.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}}); + if (auto&& phrase{phrasify(field, *second(val))}; phrase) + vals.add(std::move(*phrase)); + else + vals.add(Sexp{Sexp::Symbol{field.name}, Sexp{*second(val)}}); val = std::move(vals); } + + } + + if (auto&& field{field_from_name(fieldsym.name)}; field) { + if (auto&& phrase(phrasify(*field, *second(val))); phrase) + val = std::move(*phrase); } return val; diff --git a/lib/mu-query-processor.cc b/lib/mu-query-processor.cc index 9900c4e1..80aa3402 100644 --- a/lib/mu-query-processor.cc +++ b/lib/mu-query-processor.cc @@ -74,7 +74,6 @@ struct Element { ValueType value{}; }; struct Basic: public FieldValue {using FieldValue::FieldValue;}; - struct Phrase: public FieldValue {using FieldValue::FieldValue;}; struct Regex: public FieldValue {using FieldValue::FieldValue;}; struct Wildcard: public FieldValue {using FieldValue::FieldValue;}; struct Range: public FieldValue> { @@ -89,7 +88,6 @@ struct Element { std::string, /* value types */ Basic, - Phrase, Regex, Wildcard, Range @@ -152,9 +150,6 @@ struct Element { } } else if constexpr (std::is_same_v) { return Sexp { field_sym(arg.field), arg.value }; - } else if constexpr (std::is_same_v) { - return Sexp {field_sym(arg.field), - Sexp{ phrase_sym, arg.value }}; } else if constexpr (std::is_same_v) { return Sexp { field_sym(arg.field), Sexp{ regex_sym, arg.value}}; } else if constexpr (std::is_same_v) { @@ -337,24 +332,6 @@ basify(Element&& element) return element; } -static Option -phrasify(Element&& element) -{ - auto&& basic{element.get_opt()}; - if (!basic) - return element; - - auto&& field = field_from_name(*basic->field); - if (!field || 
field->is_indexable_term()) { - auto&& val{basic->value}; - if (val.find(' ') != std::string::npos) - element.value = Element::Phrase{basic->field, val}; - } - - return element; -} - - static Option wildcardify(Element&& element) { @@ -467,7 +444,7 @@ process(const std::string& expr) .and_then(opify) .and_then(basify) .and_then(regexpify) - .and_then(phrasify) + //.and_then(phrasify) .and_then(wildcardify) .and_then(rangify); if (element) @@ -527,10 +504,6 @@ test_processor() std::vector cases = { // basics TestCase{R"(hello world)", R"(((_ "hello") (_ "world")))"}, - TestCase{R"("hello world")", R"(((_ (phrase "hello world"))))"}, - TestCase{R"(subject:"hello world")", R"(((subject (phrase "hello world"))))"}, - - // maildir must _not_ be phrasified TestCase{R"(maildir:/"hello world")", R"(((maildir "/hello world")))"}, }; diff --git a/lib/mu-query-xapianizer.cc b/lib/mu-query-xapianizer.cc index 249dda26..e9cdf52b 100644 --- a/lib/mu-query-xapianizer.cc +++ b/lib/mu-query-xapianizer.cc @@ -89,7 +89,7 @@ string_nth(const Sexp& args, size_t n) static Result phrase(const Field& field, Sexp&& s) { - if (!field.is_indexable_term()) + if (!field.is_phrasable_term()) return Err(Error::Code::InvalidArgument, "field {} does not support phrases", field.name); @@ -273,13 +273,12 @@ parse_basic(const Field &field, Sexp &&vals, Mu::ParserFlags flags) default: { auto q{Xapian::Query{field.xapian_term(val)}}; if (ngrams) { // special case: cjk; see if we can create an expanded query. 
- if (field.is_indexable_term() && contains_unbroken_script(val)) + if (field.is_phrasable_term() && contains_unbroken_script(val)) if (auto&& ng{ngram_expand(field, val)}; ng) return ng; } return q; }} - } static Result @@ -420,6 +419,8 @@ using TestCase = std::pair; static void test_xapian() { + allow_warnings(); + auto&& testhome{unwrap(make_temp_dir())}; auto&& dbpath{runtime_path(RuntimePath::XapianDb, testhome)}; auto&& store{unwrap(Store::make_new(dbpath, join_paths(testhome, "test-maildir")))}; @@ -429,7 +430,8 @@ test_xapian() auto&& zz{make_xapian_query(store, R"(subject:"hello world")")}; assert_valid_result(zz); /* LCOV_EXCL_START*/ - if (zz->get_description() != R"(Query((Shello PHRASE 2 Sworld)))") { + if (zz->get_description() != R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))") { + mu_println("{}", zz->get_description()); if (mu_test_mu_hacker()) { // in the mu hacker case, we want to be warned if Xapian changed. g_critical("xapian version mismatch"); @@ -446,7 +448,8 @@ test_xapian() TestCase{R"(i:87h766tzzz.fsf@gnus.org)", R"(Query(I87h766tzzz.fsf@gnus.org))"}, TestCase{R"(subject:foo to:bar)", R"(Query((Sfoo AND Tbar)))"}, TestCase{R"(subject:"cuux*")", R"(Query(WILDCARD SYNONYM Scuux))"}, - TestCase{R"(subject:"hello world")", R"(Query((Shello PHRASE 2 Sworld)))"}, + TestCase{R"(subject:"hello world")", + R"(Query((Shello world OR (Shello PHRASE 2 Sworld))))"}, TestCase{R"(subject:/boo/")", R"(Query())"}, // ranges. diff --git a/lib/tests/test-mu-store-query.cc b/lib/tests/test-mu-store-query.cc index 6148c1d2..8117ceb7 100644 --- a/lib/tests/test-mu-store-query.cc +++ b/lib/tests/test-mu-store-query.cc @@ -131,6 +131,7 @@ I said: "Aujourd'hui!" }}; TempDir tdir; auto store{make_test_store(tdir.path(), test_msgs, {})}; + store.commit(); // matches for (auto&& expr: { @@ -692,6 +693,8 @@ Date: Wed, 26 Oct 2022 11:01:54 -0700 To: example@example.com Subject: kata-containers +voodoo-containers + Boo! )"}, }}; @@ -699,10 +702,13 @@ Boo! 
TempDir tdir; auto store{make_test_store(tdir.path(), test_msgs, {})}; /* true: match; false: no match */ - const auto cases = std::array, 3>{{ + const auto cases = std::vector>{{ {"subject:kata", true}, {"subject:containers", true}, - {"subject:kata-containers", true} + {"subject:kata-containers", true}, + {"subject:\"kata containers\"", true}, + {"voodoo-containers", true}, + {"voodoo containers", true} }}; for (auto&& test: cases) {