* add indexing/searching for text-based mime parts

This commit is contained in:
djcb
2011-11-24 00:11:45 +02:00
parent 5ac9693681
commit cc6738c195
4 changed files with 209 additions and 75 deletions

View File

@ -512,8 +512,8 @@ stream_to_string (GMimeStream *stream, size_t buflen)
}
static gchar*
part_to_string (GMimePart *part, gboolean *err)
gchar*
mu_msg_mime_part_to_string (GMimePart *part, gboolean *err)
{
GMimeDataWrapper *wrapper;
GMimeStream *stream = NULL;
@ -557,31 +557,42 @@ cleanup:
}
static char*
get_body (MuMsgFile *self, gboolean want_html)
GMimePart*
mu_msg_mime_get_body_part (GMimeMessage *msg, gboolean want_html)
{
GetBodyData data;
char *str;
gboolean err;
g_return_val_if_fail (self, NULL);
g_return_val_if_fail (GMIME_IS_MESSAGE(self->_mime_msg), NULL);
g_return_val_if_fail (GMIME_IS_MESSAGE(msg), NULL);
memset (&data, 0, sizeof(GetBodyData));
data._want_html = want_html;
err = FALSE;
g_mime_message_foreach (self->_mime_msg,
g_mime_message_foreach (msg,
(GMimeObjectForeachFunc)get_body_cb,
&data);
if (want_html)
str = data._html_part ?
part_to_string (GMIME_PART(data._html_part), &err) :
NULL;
return (GMimePart*)data._html_part;
else
str = data._txt_part ?
part_to_string (GMIME_PART(data._txt_part), &err) :
NULL;
return (GMimePart*)data._txt_part;
}
static char*
get_body (MuMsgFile *self, gboolean want_html)
{
GMimePart *part;
g_return_val_if_fail (self, NULL);
g_return_val_if_fail (GMIME_IS_MESSAGE(self->_mime_msg), NULL);
part = mu_msg_mime_get_body_part (self->_mime_msg, want_html);
if (GMIME_IS_PART(part)) {
gboolean err;
gchar *str;
err = FALSE;
str = mu_msg_mime_part_to_string (part, &err);
/* note, str may be NULL (no body), but that's not necessarily
* an error; we only warn when an actual error occurred */
@ -592,6 +603,9 @@ get_body (MuMsgFile *self, gboolean want_html)
return str;
}
return NULL;
}
static gboolean
@ -672,6 +686,18 @@ maybe_cleanup (const char* str, const char *path, gboolean *do_free)
}
/**
 * Translate a recipient message-field id into the matching GMime
 * recipient type.
 *
 * @param mfid one of MU_MSG_FIELD_ID_TO, MU_MSG_FIELD_ID_CC or
 * MU_MSG_FIELD_ID_BCC
 *
 * @return the corresponding GMIME_RECIPIENT_TYPE_* value; for any other
 * field id the function bails out through g_return_val_if_reached with -1
 */
G_GNUC_CONST static GMimeRecipientType
recipient_type (MuMsgFieldId mfid)
{
	if (mfid == MU_MSG_FIELD_ID_BCC)
		return GMIME_RECIPIENT_TYPE_BCC;

	if (mfid == MU_MSG_FIELD_ID_CC)
		return GMIME_RECIPIENT_TYPE_CC;

	if (mfid == MU_MSG_FIELD_ID_TO)
		return GMIME_RECIPIENT_TYPE_TO;

	/* only the three recipient fields are valid here */
	g_return_val_if_reached (-1);
}
char*
mu_msg_file_get_str_field (MuMsgFile *self, MuMsgFieldId mfid,
gboolean *do_free)
@ -683,17 +709,18 @@ mu_msg_file_get_str_field (MuMsgFile *self, MuMsgFieldId mfid,
switch (mfid) {
case MU_MSG_FIELD_ID_BCC: *do_free = TRUE;
return get_recipient (self, GMIME_RECIPIENT_TYPE_BCC);
case MU_MSG_FIELD_ID_ATTACH_TEXT: *do_free = TRUE;
return NULL; /* FIXME */
case MU_MSG_FIELD_ID_BODY_TEXT: *do_free = TRUE;
return get_body (self, FALSE);
case MU_MSG_FIELD_ID_BCC:
case MU_MSG_FIELD_ID_CC:
case MU_MSG_FIELD_ID_TO: *do_free = TRUE;
return get_recipient (self, recipient_type(mfid));
case MU_MSG_FIELD_ID_BODY_TEXT:
case MU_MSG_FIELD_ID_BODY_HTML: *do_free = TRUE;
return get_body (self, TRUE);
case MU_MSG_FIELD_ID_CC: *do_free = TRUE;
return get_recipient (self, GMIME_RECIPIENT_TYPE_CC);
return get_body
(self, mfid == MU_MSG_FIELD_ID_BODY_HTML ? TRUE : FALSE);
case MU_MSG_FIELD_ID_FROM:
return (char*)maybe_cleanup
@ -707,9 +734,6 @@ mu_msg_file_get_str_field (MuMsgFile *self, MuMsgFieldId mfid,
(g_mime_message_get_subject (self->_mime_msg),
self->_path, do_free);
case MU_MSG_FIELD_ID_TO: *do_free = TRUE;
return get_recipient (self, GMIME_RECIPIENT_TYPE_TO);
case MU_MSG_FIELD_ID_MSGID:
return (char*)g_mime_message_get_message_id (self->_mime_msg);

View File

@ -66,10 +66,30 @@ struct _PartData {
unsigned _idx;
MuMsgPartForeachFunc _func;
gpointer _user_data;
GMimePart *_body_part;
};
typedef struct _PartData PartData;
/**
 * Convert the MIME object behind a MuMsgPart to a string.
 *
 * @param part a MuMsgPart; its data member must be non-NULL
 * @param err if non-NULL, it is reset to FALSE before the conversion is
 * attempted, so callers may pass an uninitialized flag; afterwards it
 * holds whatever mu_msg_mime_part_to_string stored in it
 *
 * @return the string produced by mu_msg_mime_part_to_string (to be freed
 * by the caller), or NULL when the arguments are invalid or the part's
 * data is not a GMimePart
 */
char*
mu_msg_part_to_string (MuMsgPart *part, gboolean *err)
{
	g_return_val_if_fail (part && part->data, NULL);

	/* start from a defined "no error" state; without this the flag was
	 * left untouched whenever the data is not a GMimePart */
	if (err)
		*err = FALSE;

	/* only GMimePart objects can be converted; anything else simply has
	 * no text, which is not reported as an error */
	if (!GMIME_IS_PART(part->data))
		return NULL;

	return mu_msg_mime_part_to_string (GMIME_PART(part->data), err);
}
static ssize_t
get_part_size (GMimePart *part)
{
@ -100,6 +120,10 @@ part_foreach_cb (GMimeObject *parent, GMimeObject *part, PartData *pdata)
memset (&pi, 0, sizeof pi);
pi.index = pdata->_idx++;
pi.content_id = (char*)g_mime_object_get_content_id (part);
pi.data = (gpointer)part;
/* check if this is the body part */
pi.is_body = ((void*)pdata->_body_part == (void*)part);
ct = g_mime_object_get_content_type (part);
@ -112,11 +136,13 @@ part_foreach_cb (GMimeObject *parent, GMimeObject *part, PartData *pdata)
pi.disposition = (char*)g_mime_object_get_disposition (part);
pi.file_name = (char*)g_mime_part_get_filename (GMIME_PART(part));
pi.size = get_part_size (GMIME_PART(part));
}
} else
return; /* only deal with GMimePart */
pdata->_func(pdata->_msg, &pi, pdata->_user_data);
}
static gboolean
load_msg_file_maybe (MuMsg *msg)
{
@ -149,14 +175,18 @@ mu_msg_part_foreach (MuMsg *msg, MuMsgPartForeachFunc func,
gpointer user_data)
{
PartData pdata;
GMimeMessage *mime_msg;
g_return_if_fail (msg);
g_return_if_fail (msg && msg->_file && msg->_file->_mime_msg);
if (!load_msg_file_maybe (msg))
return;
mime_msg = msg->_file->_mime_msg;
pdata._msg = msg;
pdata._idx = 0;
pdata._body_part = mu_msg_mime_get_body_part (mime_msg, FALSE);
pdata._func = func;
pdata._user_data = user_data;
@ -269,6 +299,7 @@ mu_msg_part_filepath (MuMsg *msg, const char* targetdir, guint partidx)
gchar*
mu_msg_part_filepath_cache (MuMsg *msg, guint partid)
{

View File

@ -51,6 +51,9 @@ struct _MuMsgPart {
gpointer data; /* opaque data */
gboolean is_body; /* TRUE if this is probably the
* message body*/
/* if TRUE, mu_msg_part_destroy will free the member vars
* as well*/
gboolean own_members;
@ -77,6 +80,17 @@ typedef struct _MuMsgPart MuMsgPart;
#define mu_msg_part_content_id(pi) ((pi)->content_id)
/**
* convert a MuMsgPart to a string
*
* @param part a MuMsgPart
* @param err will receive TRUE if there was an error, FALSE otherwise
*
* @return utf8 string for this MIME part, to be freed by caller
*/
char* mu_msg_part_to_string (MuMsgPart *part, gboolean *err);
/**
* does this msg part look like an attachment?
*

View File

@ -66,7 +66,7 @@ _MuStore::rollback_transaction () {
}
/* we cache these prefix strings, so we don't have to allocate the all
/* we cache these prefix strings, so we don't have to allocate them all
* the time; this should save 10-20 string allocs per message */
G_GNUC_CONST static const std::string&
prefix (MuMsgFieldId mfid)
@ -162,6 +162,7 @@ mu_store_set_metadata (MuStore *store, const char *key, const char *val,
try {
store->db_writable()->set_metadata (key, val);
return TRUE;
} MU_STORE_CATCH_BLOCK_RETURN(err, FALSE);
} MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN(err, MU_ERROR_XAPIAN, FALSE);
@ -212,8 +213,6 @@ add_terms_values_date (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid)
}
}
/* TODO: we could pre-calculate the add_term values for FLAGS */
/* pre-calculate; optimization */
G_GNUC_CONST static const std::string&
flag_val (char flagchar)
@ -310,7 +309,8 @@ static void
add_terms_values_str (Xapian::Document& doc, char *val,
MuMsgFieldId mfid)
{
/* the value is what we'll display; the unchanged original */
/* the value is what we display in search results; the
* unchanged original */
if (mu_msg_field_xapian_value(mfid))
doc.add_value ((Xapian::valueno)mfid, val);
@ -318,7 +318,8 @@ add_terms_values_str (Xapian::Document& doc, char *val,
if (mu_msg_field_normalize (mfid))
mu_str_normalize_in_place (val, TRUE);
if (mu_msg_field_xapian_escape (mfid))
mu_str_ascii_xapian_escape_in_place (val);
mu_str_ascii_xapian_escape_in_place (val,
TRUE /*esc_space*/);
if (mu_msg_field_xapian_index (mfid)) {
Xapian::TermGenerator termgen;
@ -402,36 +403,86 @@ struct PartData {
MuMsgFieldId _mfid;
};
/**
 * Index the textual content of a MIME part, so it can be searched under
 * the MU_MSG_FIELD_ID_ATTACH_TEXT prefix.
 *
 * Only parts whose content type is listed in txt_types are handled; the
 * part's text is normalized and handed to a Xapian::TermGenerator that
 * adds terms (without positions) to pdata->_doc.
 *
 * @param part the message part to index
 * @param pdata carries the Xapian document (_doc) that receives the terms
 *
 * @return TRUE if the part's text was indexed; FALSE if the part has no
 * (complete) content type, is not of a supported type, or could not be
 * converted to a string
 */
static gboolean
index_text_part (MuMsgPart *part, PartData *pdata)
{
	/* content types we index; a subtype of "*" matches any subtype */
	static const struct { const char *type; const char *subtype; } txt_types[] = {
		{ "text",    "plain"  },
		{ "message", "rfc822" },
	};

	unsigned u;
	gboolean txt_type, err;
	char *txt, *norm;

	/* a part without type or subtype cannot match anything; bail out
	 * before handing NULL to strcasecmp */
	if (!part->type || !part->subtype)
		return FALSE;

	/* check whether it's a type we need to store */
	txt_type = FALSE;
	for (u = 0; u != G_N_ELEMENTS(txt_types) && !txt_type; ++u) {
		if ((strcasecmp (part->type, txt_types[u].type) == 0) &&
		    ((strcasecmp (part->subtype, txt_types[u].subtype) == 0) ||
		     (strcmp (txt_types[u].subtype, "*") == 0)))
			txt_type = TRUE;
	}

	if (!txt_type)
		return FALSE; /* not a supported text type */

	/* the flag must have a defined value before it is read below; the
	 * conversion is not guaranteed to write it on every path */
	err = FALSE;
	txt = mu_msg_part_to_string (part, &err);
	if (!txt || err) {
		g_free (txt); /* don't leak a partial result; NULL is fine */
		return FALSE;
	}

	Xapian::TermGenerator termgen;
	termgen.set_document(pdata->_doc);

	norm = mu_str_normalize (txt, TRUE);
	termgen.index_text_without_positions
		(norm, 1, prefix(MU_MSG_FIELD_ID_ATTACH_TEXT));

	g_free (norm);
	g_free (txt);

	return TRUE;
}
static void
each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
{
static const std::string
att (prefix(MU_MSG_FIELD_ID_ATTACH)),
mime (prefix(MU_MSG_FIELD_ID_ATTACH_MIME_TYPE));
file (prefix(MU_MSG_FIELD_ID_FILE)),
mime (prefix(MU_MSG_FIELD_ID_MIME));
if (mu_msg_part_looks_like_attachment (part, TRUE) &&
(part->file_name)) {
/* save the mime type of any part */
if (part->type) {
/* note, we use '_' instead of '/' to separate
* type/subtype -- Xapian doesn't treat '/' as
* desired, so we use '_' and pre-process queries; see
* mu_query_preprocess */
char ctype[MuStore::MAX_TERM_LENGTH + 1];
snprintf (ctype, sizeof(ctype), "%s_%s",
part->type, part->subtype);
pdata->_doc.add_term
(mime + std::string(ctype, 0, MuStore::MAX_TERM_LENGTH));
}
/* save the name of anything that has a filename */
if (part->file_name) {
char val[MuStore::MAX_TERM_LENGTH + 1];
strncpy (val, part->file_name, sizeof(val));
/* now, let's create a term... */
mu_str_normalize_in_place (val, TRUE);
mu_str_ascii_xapian_escape_in_place (val);
mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/);
pdata->_doc.add_term
(att + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
/* save the mime type */
if (part->type) {
gchar *str;
str = g_strdup_printf ("%s/%s", part->type, part->subtype);
pdata->_doc.add_term
(mime + std::string(str, 0, MuStore::MAX_TERM_LENGTH));
g_free (str);
} else
pdata->_doc.add_term (mime + "application/octet-stream");
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
}
/* now, for non-body parts with some MIME-types, index the
* content as well */
if (!part->is_body)
index_text_part (part, pdata);
}
@ -465,8 +516,7 @@ add_terms_values_body (Xapian::Document& doc, MuMsg *msg,
termgen.set_document(doc);
norm = mu_str_normalize (str, TRUE);
termgen.index_text_without_positions
(norm, 1, prefix(mfid));
termgen.index_text_without_positions (norm, 1, prefix(mfid));
g_free (norm);
}
@ -478,6 +528,24 @@ struct _MsgDoc {
};
typedef struct _MsgDoc MsgDoc;
/**
 * Fallback handler for message fields without special treatment:
 * dispatch on the field's kind (numeric, string or string-list) and let
 * the matching add_terms_values_* helper add the terms/values for the
 * field to the document.
 *
 * @param mfid the message field to add
 * @param msgdoc holds the Xapian document (_doc) and the message (_msg)
 */
static void
add_terms_values_default (MuMsgFieldId mfid, MsgDoc* msgdoc)
{
	if (mu_msg_field_is_numeric (mfid)) {
		add_terms_values_number (*msgdoc->_doc, msgdoc->_msg, mfid);
		return;
	}

	if (mu_msg_field_is_string (mfid)) {
		add_terms_values_string (*msgdoc->_doc, msgdoc->_msg, mfid);
		return;
	}

	if (mu_msg_field_is_string_list (mfid)) {
		add_terms_values_string_list (*msgdoc->_doc, msgdoc->_msg, mfid);
		return;
	}

	/* a field that is none of the above kinds is not expected here */
	g_return_if_reached ();
}
static void
add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc)
{
@ -495,26 +563,19 @@ add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc)
case MU_MSG_FIELD_ID_BODY_TEXT:
add_terms_values_body (*msgdoc->_doc, msgdoc->_msg, mfid);
break;
case MU_MSG_FIELD_ID_ATTACH: /* also takes care of MU_MSG_FIELD_ID_ATTACH_MIME */
/* note: add_terms_values_attach handles these three msgfields */
case MU_MSG_FIELD_ID_FILE:
add_terms_values_attach (*msgdoc->_doc, msgdoc->_msg, mfid);
case MU_MSG_FIELD_ID_MIME:
case MU_MSG_FIELD_ID_ATTACH_TEXT:
break;
case MU_MSG_FIELD_ID_ATTACH_MIME_TYPE:
///////////////////////////////////////////
case MU_MSG_FIELD_ID_UID:
break; /* already taken care of elsewhere */
default:
if (mu_msg_field_is_numeric (mfid))
add_terms_values_number (*msgdoc->_doc, msgdoc->_msg,
mfid);
else if (mu_msg_field_is_string (mfid))
add_terms_values_string (*msgdoc->_doc,
msgdoc->_msg,
mfid);
else if (mu_msg_field_is_string_list(mfid))
add_terms_values_string_list (*msgdoc->_doc,
msgdoc->_msg,
mfid);
else
g_return_if_reached ();
return add_terms_values_default (mfid, msgdoc);
}
}
@ -559,7 +620,11 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
/* don't normalize e-mail address, but do lowercase it */
if (!mu_str_is_empty(contact->address)) {
char *escaped = mu_str_ascii_xapian_escape (contact->address);
char *escaped;
escaped = mu_str_ascii_xapian_escape (contact->address,
FALSE /*dont esc space*/);
msgdoc->_doc->add_term
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
g_free (escaped);