message: try to detect body text language

Try to detect the language of the e-mail body text and store it in a new "language" field, so messages can be searched by language code (e.g. lang:nl).
This commit is contained in:
Dirk-Jan C. Binnema
2023-05-11 23:22:29 +03:00
parent ad64093183
commit 7f2eeb1010
4 changed files with 29 additions and 9 deletions

View File

@ -165,7 +165,6 @@ test_field_from_name()
Field::Id::Bcc); Field::Id::Bcc);
} }
static void static void
test_xapian_term() test_xapian_term()
{ {

View File

@ -52,6 +52,7 @@ struct Field {
File, /**< Filename */ File, /**< Filename */
Flags, /**< Message flags */ Flags, /**< Message flags */
From, /**< Message sender */ From, /**< Message sender */
Language, /**< Body language */
Maildir, /**< Maildir path */ Maildir, /**< Maildir path */
MailingList, /**< Mailing list */ MailingList, /**< Mailing list */
MessageId, /**< Message Id */ MessageId, /**< Message Id */
@ -252,7 +253,6 @@ static constexpr std::array<Field, Field::id_size()>
Field::Flag::IncludeInSexp | Field::Flag::IncludeInSexp |
Field::Flag::IndexableTerm, Field::Flag::IndexableTerm,
}, },
{ {
Field::Id::Changed, Field::Id::Changed,
Field::Type::TimeT, Field::Type::TimeT,
@ -316,6 +316,17 @@ static constexpr std::array<Field, Field::id_size()>
Field::Flag::IncludeInSexp | Field::Flag::IncludeInSexp |
Field::Flag::IndexableTerm, Field::Flag::IndexableTerm,
}, },
{
Field::Id::Language,
Field::Type::String,
"language", "lang",
"ISO 639-1 language code for body",
"lang:nl",
'a',
Field::Flag::BooleanTerm |
Field::Flag::Value |
Field::Flag::IncludeInSexp
},
{ {
Field::Id::Maildir, Field::Id::Maildir,
Field::Type::String, Field::Type::String,

View File

@ -29,6 +29,7 @@
#include <utils/mu-utils.hh> #include <utils/mu-utils.hh>
#include <utils/mu-error.hh> #include <utils/mu-error.hh>
#include <utils/mu-option.hh> #include <utils/mu-option.hh>
#include <utils/mu-lang-detector.hh>
#include <atomic> #include <atomic>
#include <mutex> #include <mutex>
@ -67,6 +68,8 @@ struct Message::Private {
Option<std::string> body_txt; Option<std::string> body_txt;
Option<std::string> body_html; Option<std::string> body_html;
Option<std::string> embedded; Option<std::string> embedded;
Option<std::string> language; /* body ISO language code */
}; };
@ -531,6 +534,14 @@ process_message(const MimeMessage& mime_msg, const std::string& path,
info.mailing_list = get_mailing_list(mime_msg); info.mailing_list = get_mailing_list(mime_msg);
if (info.mailing_list) if (info.mailing_list)
info.flags |= Flags::MailingList; info.flags |= Flags::MailingList;
if (info.body_txt) { /* attempt to get the body-language */
if (const auto lang{detect_language(info.body_txt.value())}; lang) {
info.language = lang->code;
g_debug("detected language: %s", lang->code);
} else
g_debug("could not detect language");
}
} }
static Mu::Result<std::string> static Mu::Result<std::string>
@ -586,8 +597,6 @@ fake_message_id(const std::string& path)
* based on a field. So we add them here. * based on a field. So we add them here.
*/ */
static void static void
doc_add_list_post(Document& doc, const MimeMessage& mime_msg) doc_add_list_post(Document& doc, const MimeMessage& mime_msg)
{ {
@ -643,7 +652,7 @@ fill_document(Message::Private& priv)
doc_add_reply_to(doc, mime_msg); /* only in sexp */ doc_add_reply_to(doc, mime_msg); /* only in sexp */
field_for_each([&](auto&& field) { field_for_each([&](auto&& field) {
/* insist on expliclity handling each */ /* insist on explicitly handling each */
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch" #pragma GCC diagnostic error "-Wswitch"
switch(field.id) { switch(field.id) {
@ -652,7 +661,6 @@ fill_document(Message::Private& priv)
break; break;
case Field::Id::BodyText: case Field::Id::BodyText:
doc.add(field.id, priv.body_txt); doc.add(field.id, priv.body_txt);
break; break;
case Field::Id::Cc: case Field::Id::Cc:
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc)); doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
@ -676,6 +684,9 @@ fill_document(Message::Private& priv)
case Field::Id::From: case Field::Id::From:
doc.add(field.id, mime_msg.contacts(Contact::Type::From)); doc.add(field.id, mime_msg.contacts(Contact::Type::From));
break; break;
case Field::Id::Language:
doc.add(field.id, priv.language);
break;
case Field::Id::Maildir: /* already */ case Field::Id::Maildir: /* already */
break; break;
case Field::Id::MailingList: case Field::Id::MailingList:

View File

@ -110,7 +110,6 @@ public:
} }
/* LCOV_EXCL_STOP */ /* LCOV_EXCL_STOP */
/** /**
* Construct a message from a string. This is mostly useful for testing. * Construct a message from a string. This is mostly useful for testing.
* *
@ -406,8 +405,8 @@ public:
const std::vector<Part>& parts() const; const std::vector<Part>& parts() const;
/** /**
* Get the path to a cche directory for this message, which * Get the path to a cache directory for this message, which is useful
* is useful for temporarily saving attachments * for temporarily saving attachments
* *
* @param index optionally, create <cache-path>/<index> instead; * @param index optionally, create <cache-path>/<index> instead;
* this is useful for having part-specific subdirectories. * this is useful for having part-specific subdirectories.