message: try to detect body text language
Try to detect the language of the e-mail body and make it searchable.
This commit is contained in:
@ -165,7 +165,6 @@ test_field_from_name()
|
|||||||
Field::Id::Bcc);
|
Field::Id::Bcc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_xapian_term()
|
test_xapian_term()
|
||||||
{
|
{
|
||||||
|
|||||||
@ -52,6 +52,7 @@ struct Field {
|
|||||||
File, /**< Filename */
|
File, /**< Filename */
|
||||||
Flags, /**< Message flags */
|
Flags, /**< Message flags */
|
||||||
From, /**< Message sender */
|
From, /**< Message sender */
|
||||||
|
Language, /**< Body language */
|
||||||
Maildir, /**< Maildir path */
|
Maildir, /**< Maildir path */
|
||||||
MailingList, /**< Mailing list */
|
MailingList, /**< Mailing list */
|
||||||
MessageId, /**< Message Id */
|
MessageId, /**< Message Id */
|
||||||
@ -252,7 +253,6 @@ static constexpr std::array<Field, Field::id_size()>
|
|||||||
Field::Flag::IncludeInSexp |
|
Field::Flag::IncludeInSexp |
|
||||||
Field::Flag::IndexableTerm,
|
Field::Flag::IndexableTerm,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
Field::Id::Changed,
|
Field::Id::Changed,
|
||||||
Field::Type::TimeT,
|
Field::Type::TimeT,
|
||||||
@ -316,6 +316,17 @@ static constexpr std::array<Field, Field::id_size()>
|
|||||||
Field::Flag::IncludeInSexp |
|
Field::Flag::IncludeInSexp |
|
||||||
Field::Flag::IndexableTerm,
|
Field::Flag::IndexableTerm,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
Field::Id::Language,
|
||||||
|
Field::Type::String,
|
||||||
|
"language", "lang",
|
||||||
|
"ISO 639-1 language code for body",
|
||||||
|
"lang:nl",
|
||||||
|
'a',
|
||||||
|
Field::Flag::BooleanTerm |
|
||||||
|
Field::Flag::Value |
|
||||||
|
Field::Flag::IncludeInSexp
|
||||||
|
},
|
||||||
{
|
{
|
||||||
Field::Id::Maildir,
|
Field::Id::Maildir,
|
||||||
Field::Type::String,
|
Field::Type::String,
|
||||||
|
|||||||
@ -29,6 +29,7 @@
|
|||||||
#include <utils/mu-utils.hh>
|
#include <utils/mu-utils.hh>
|
||||||
#include <utils/mu-error.hh>
|
#include <utils/mu-error.hh>
|
||||||
#include <utils/mu-option.hh>
|
#include <utils/mu-option.hh>
|
||||||
|
#include <utils/mu-lang-detector.hh>
|
||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
@ -67,6 +68,8 @@ struct Message::Private {
|
|||||||
Option<std::string> body_txt;
|
Option<std::string> body_txt;
|
||||||
Option<std::string> body_html;
|
Option<std::string> body_html;
|
||||||
Option<std::string> embedded;
|
Option<std::string> embedded;
|
||||||
|
|
||||||
|
Option<std::string> language; /* body ISO language code */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
@ -531,6 +534,14 @@ process_message(const MimeMessage& mime_msg, const std::string& path,
|
|||||||
info.mailing_list = get_mailing_list(mime_msg);
|
info.mailing_list = get_mailing_list(mime_msg);
|
||||||
if (info.mailing_list)
|
if (info.mailing_list)
|
||||||
info.flags |= Flags::MailingList;
|
info.flags |= Flags::MailingList;
|
||||||
|
|
||||||
|
if (info.body_txt) { /* attempt to get the body-language */
|
||||||
|
if (const auto lang{detect_language(info.body_txt.value())}; lang) {
|
||||||
|
info.language = lang->code;
|
||||||
|
g_debug("detected language: %s", lang->code);
|
||||||
|
} else
|
||||||
|
g_debug("could not detect language");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static Mu::Result<std::string>
|
static Mu::Result<std::string>
|
||||||
@ -586,8 +597,6 @@ fake_message_id(const std::string& path)
|
|||||||
* based on a field. So we add them here.
|
* based on a field. So we add them here.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
doc_add_list_post(Document& doc, const MimeMessage& mime_msg)
|
doc_add_list_post(Document& doc, const MimeMessage& mime_msg)
|
||||||
{
|
{
|
||||||
@ -643,7 +652,7 @@ fill_document(Message::Private& priv)
|
|||||||
doc_add_reply_to(doc, mime_msg); /* only in sexp */
|
doc_add_reply_to(doc, mime_msg); /* only in sexp */
|
||||||
|
|
||||||
field_for_each([&](auto&& field) {
|
field_for_each([&](auto&& field) {
|
||||||
/* insist on expliclity handling each */
|
/* insist on explicitly handling each */
|
||||||
#pragma GCC diagnostic push
|
#pragma GCC diagnostic push
|
||||||
#pragma GCC diagnostic error "-Wswitch"
|
#pragma GCC diagnostic error "-Wswitch"
|
||||||
switch(field.id) {
|
switch(field.id) {
|
||||||
@ -652,7 +661,6 @@ fill_document(Message::Private& priv)
|
|||||||
break;
|
break;
|
||||||
case Field::Id::BodyText:
|
case Field::Id::BodyText:
|
||||||
doc.add(field.id, priv.body_txt);
|
doc.add(field.id, priv.body_txt);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case Field::Id::Cc:
|
case Field::Id::Cc:
|
||||||
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
|
doc.add(field.id, mime_msg.contacts(Contact::Type::Cc));
|
||||||
@ -676,6 +684,9 @@ fill_document(Message::Private& priv)
|
|||||||
case Field::Id::From:
|
case Field::Id::From:
|
||||||
doc.add(field.id, mime_msg.contacts(Contact::Type::From));
|
doc.add(field.id, mime_msg.contacts(Contact::Type::From));
|
||||||
break;
|
break;
|
||||||
|
case Field::Id::Language:
|
||||||
|
doc.add(field.id, priv.language);
|
||||||
|
break;
|
||||||
case Field::Id::Maildir: /* already */
|
case Field::Id::Maildir: /* already */
|
||||||
break;
|
break;
|
||||||
case Field::Id::MailingList:
|
case Field::Id::MailingList:
|
||||||
|
|||||||
@ -110,7 +110,6 @@ public:
|
|||||||
}
|
}
|
||||||
/* LCOV_EXCL_STOP */
|
/* LCOV_EXCL_STOP */
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a message from a string. This is mostly useful for testing.
|
* Construct a message from a string. This is mostly useful for testing.
|
||||||
*
|
*
|
||||||
@ -406,8 +405,8 @@ public:
|
|||||||
const std::vector<Part>& parts() const;
|
const std::vector<Part>& parts() const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the path to a cche directory for this message, which
|
* Get the path to a cache directory for this message, which is useful
|
||||||
* is useful for temporarily saving attachments
|
* for temporarily saving attachments
|
||||||
*
|
*
|
||||||
* @param index optionally, create <cache-path>/<index> instead;
|
* @param index optionally, create <cache-path>/<index> instead;
|
||||||
* this is useful for having part-specific subdirectories.
|
* this is useful for having part-specific subdirectories.
|
||||||
|
|||||||
Reference in New Issue
Block a user