message: use html-to-text scraper for html parts

We were dumping the HTML parts as-is into the Xapian indexer; however,
it's better to strip the HTML markup first and pass only the text.

We use the new built-in html->text scraper for that.
This commit is contained in:
Dirk-Jan C. Binnema
2023-07-23 14:46:11 +03:00
parent 56b8fad89e
commit b795242d5a
7 changed files with 31 additions and 69 deletions

View File

@ -336,11 +336,11 @@ get_mailing_list(const MimeMessage& mime_msg)
}
static void
append_text(Option<std::string>& str, Option<std::string> app)
append_text(Option<std::string>& str, Option<std::string>&& app)
{
if (!str)
str = app;
else if (app)
if (!str && app)
str = std::move(*app);
else if (str && app)
str.value() += app.value();
}
@ -407,17 +407,18 @@ process_message_part(const MimeMessagePart& msg_part,
return;
submsg->for_each([&](auto&& parent, auto&& child_obj) {
/* XXX: we only handle one level */
/* NOTE: we only handle one level; ideally, we'd apply the whole
parsing machinery recursively, so this is a little crude. */
if (!child_obj.is_part())
return;
const auto ctype{child_obj.content_type()};
if (!ctype || !ctype->is_type("text", "*"))
if (const auto ctype{child_obj.content_type()}; !ctype)
return;
append_text(info.embedded, MimePart{child_obj}.to_string());
else if (ctype->is_type("text", "plain"))
append_text(info.embedded, MimePart{child_obj}.to_string());
else if (ctype->is_type("text", "html")) {
if (auto&& str{MimePart{child_obj}.to_string()}; str)
append_text(info.embedded, html_to_text(*str));
}
});
}
@ -662,6 +663,8 @@ fill_document(Message::Private& priv)
break;
case Field::Id::BodyText:
doc.add(field.id, priv.body_txt);
if (priv.body_html)
doc.add(field.id, html_to_text(*priv.body_html));
break;
case Field::Id::Cc:
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -725,10 +728,6 @@ fill_document(Message::Private& priv)
case Field::Id::To:
doc.add(field.id, mime_msg.contacts(Contact::Type::To));
break;
/* internal fields */
case Field::Id::XBodyHtml:
doc.add(field.id, priv.body_html);
break;
/* LCOV_EXCL_START */
case Field::Id::_count_:
default: