From 9bf580de3d979e2d5528c1b0b08dfd9b6e58bebf Mon Sep 17 00:00:00 2001 From: "Dirk-Jan C. Binnema" Date: Fri, 10 Jun 2022 00:43:47 +0300 Subject: [PATCH] message: refactor/improve attachment heuristic a bit Also check for X-MS-Has-Attach --- lib/message/mu-message-part.cc | 28 +++++++++++++ lib/message/mu-message-part.hh | 12 +++++- lib/message/mu-message.cc | 74 +++++++++++++++------------------- lib/message/test-mu-message.cc | 41 +++++++++++++++++++ 4 files changed, 112 insertions(+), 43 deletions(-) diff --git a/lib/message/mu-message-part.cc b/lib/message/mu-message-part.cc index ec686650..8f0b7638 100644 --- a/lib/message/mu-message-part.cc +++ b/lib/message/mu-message-part.cc @@ -158,3 +158,31 @@ MessagePart::is_encrypted() const noexcept { return mime_object().is_multipart_encrypted(); } + +bool /* heuristic */ +MessagePart::looks_like_attachment() const noexcept +{ + auto matches=[](const MimeContentType& ctype, + const std::initializer_list>& ctypes) { + return std::find_if(ctypes.begin(), ctypes.end(), [&](auto&& item){ + return ctype.is_type(item.first, item.second); }) != ctypes.end(); + }; + + const auto ctype{mime_object().content_type()}; + if (!ctype) + return false; // no content-type: not an attachment. + + // we consider some parts _not_ to be attachments regardless of disposition + if (matches(*ctype,{{"application", "pgp-keys"}})) + return false; + + // we consider some parts to be attachments regardless of disposition + if (matches(*ctype,{{"image", "*"}, + {"audio", "*"}, + {"application", "*"}, + {"application", "x-patch"}})) + return true; + + // otherwise, rely on the disposition + return is_attachment(); +} diff --git a/lib/message/mu-message-part.hh b/lib/message/mu-message-part.hh index 05e21eb7..b955fc88 100644 --- a/lib/message/mu-message-part.hh +++ b/lib/message/mu-message-part.hh @@ -105,13 +105,23 @@ public: /** * Does this part have an "attachment" disposition? Otherwise it is * "inline". 
Note that does *not* map 1:1 to a message's HasAttachment - * flag. + * flag (which uses looks_like_attachment()). * * @return true or false. */ bool is_attachment() const noexcept; + /** + * Does this part appear to be an attachment from an end-user's point of + * view? This uses some heuristics to guess. Some parts for which + * is_attachment() is true may not "really" be attachments, and + * vice-versa. + * + * @return true or false. + */ + bool looks_like_attachment() const noexcept; + /** * Is this part signed? * diff --git a/lib/message/mu-message.cc b/lib/message/mu-message.cc index e7ae98b1..5d0f825b 100644 --- a/lib/message/mu-message.cc +++ b/lib/message/mu-message.cc @@ -340,45 +340,6 @@ get_mailing_list(const MimeMessage& mime_msg) return to_string_opt_gchar(std::move(res)); } -static bool /* heuristic */ -looks_like_attachment(const MimeObject& parent, - const MimePart& part, const MimeContentType& ctype) -{ - constexpr std::array, 4> att_types = {{ - {"image", "*"}, - {"audio", "*"}, - {"application", "*"}, - {"application", "x-patch"} - }}; - - if (parent) { /* crypto multipart children are not considered attachments */ - if (const auto parent_ctype{parent.content_type()}; parent_ctype) { - if (parent_ctype->is_type("multipart", "signed") || - parent_ctype->is_type("multipart", "encrypted")) - return false; - } - } - - /* we also consider patches, images, audio, and non-pgp-signature - * application attachments to be attachments... 
*/ - if (ctype.is_type("*", "pgp-signature")) - return false; /* don't consider as a signature */ - - if (ctype.is_type("text", "*") && - (ctype.is_type("*", "plain") || ctype.is_type("*", "html"))) - return false; /* not a signature */ - - /* if not one of those special types, consider it any attachment - * if it says so */ - if (part.is_attachment()) - return true; - - const auto it = seq_find_if(att_types, [&](auto&& item){ - return ctype.is_type(item.first, item.second); - }); - return it != att_types.cend(); /* if found, it's an attachment */ -} - static void append_text(Option& str, Option app) { @@ -403,19 +364,38 @@ accumulate_text(const MimePart& part, Message::Private& info, append_text(info.body_html, part.to_string()); } + +static bool /* heuristic */ +looks_like_attachment(const MimeObject& parent, const MessagePart& mpart) +{ + if (parent) { /* crypto multipart children are not considered attachments */ + if (const auto parent_ctype{parent.content_type()}; parent_ctype) { + if (parent_ctype->is_type("multipart", "signed") || + parent_ctype->is_type("multipart", "encrypted")) + return false; + } + } + + return mpart.looks_like_attachment(); +} + + static void process_part(const MimeObject& parent, const MimePart& part, - Message::Private& info) + Message::Private& info, const MessagePart& mpart) { const auto ctype{part.content_type()}; if (!ctype) return; - if (looks_like_attachment(parent, part, *ctype)) // flag as calendar, if not already if (none_of(info.flags & Flags::Calendar) && ctype->is_type("text", "calendar")) info.flags |= Flags::Calendar; + + // flag as attachment, if not already. + if (none_of(info.flags & Flags::HasAttachment) && + looks_like_attachment(parent, mpart)) info.flags |= Flags::HasAttachment; // if there are text parts, gather. 
@@ -499,7 +479,7 @@ handle_object(const MimeObject& parent, info.parts.emplace_back(obj); if (obj.is_part()) - process_part(parent, obj, info); + process_part(parent, obj, info, info.parts.back()); else if (obj.is_message_part()) process_message_part(obj, info); else if (obj.is_multipart_signed()) @@ -553,6 +533,16 @@ process_message(const MimeMessage& mime_msg, const std::string& path, info.mailing_list = get_mailing_list(mime_msg); if (info.mailing_list) info.flags |= Flags::MailingList; + + // Microsoft override; Outlook messages can tell us directly + // whether they have attachments + const auto ms_atthdr{mime_msg.header("X-MS-Has-Attach")}; + if (ms_atthdr) { + if (*ms_atthdr == "yes") + info.flags |= Flags::HasAttachment; + else + info.flags &= ~Flags::HasAttachment; + } } static Mu::Result diff --git a/lib/message/test-mu-message.cc b/lib/message/test-mu-message.cc index 62286ca4..a277445f 100644 --- a/lib/message/test-mu-message.cc +++ b/lib/message/test-mu-message.cc @@ -20,6 +20,7 @@ #include "mu-message.hh" #include "mu-mime-object.hh" #include +#include <regex> using namespace Mu; @@ -569,7 +570,45 @@ Moi, part.mime_type().value_or("boo").c_str()); } +static void +test_message_ms_attach() +{ + const std::string msgtext = +R"(Date: Thu, 31 Jul 2008 14:57:25 -0400 +From: "John Milton" +Subject: Fere libenter homines id quod volunt credunt +To: "Julius Caesar" +Message-id: <3BE9E6535E3029448670913581E7A1A20D852173@emss35m06.us.lmco.com> +X-MS-Has-Attach: +MIME-version: 1.0 +Content-type: text/plain; charset=us-ascii +Content-transfer-encoding: 7BIT +OF Mans First Disobedience, and the Fruit +Of that Forbidden Tree, whose mortal tast +Brought Death into the World, and all our woe, +With loss of Eden, till one greater Man +)"; + + { + auto message{Message::make_from_text(msgtext)}; + g_assert_true(!!message); + g_assert_true(message->flags() == (Flags::None)); + } + + { + const auto text2 = std::regex_replace( + msgtext, std::regex{"X-MS-Has-Attach:"}, + "X-MS-Has-Attach: yes"); + + 
g_message("%s", text2.c_str()); + + auto message{Message::make_from_text(text2)}; + + g_assert_true(!!message); + g_assert_true(message->flags() == (Flags::HasAttachment)); + } +} static void @@ -841,6 +880,8 @@ main(int argc, char* argv[]) test_message_multipart_mixed_rfc822); g_test_add_func("/message/message/detect-attachment", test_message_detect_attachment); + g_test_add_func("/message/message/x-ms-has-attach", + test_message_ms_attach); g_test_add_func("/message/message/calendar", test_message_calendar); g_test_add_func("/message/message/fail",