From cc6738c195f88a0a982db7bbd1587274bc3ffd84 Mon Sep 17 00:00:00 2001 From: djcb Date: Thu, 24 Nov 2011 00:11:45 +0200 Subject: [PATCH] * add indexing/searching for text-based mime parts --- src/mu-msg-file.c | 92 +++++++++++++++++---------- src/mu-msg-part.c | 35 ++++++++++- src/mu-msg-part.h | 14 +++++ src/mu-store-write.cc | 143 ++++++++++++++++++++++++++++++------------ 4 files changed, 209 insertions(+), 75 deletions(-) diff --git a/src/mu-msg-file.c b/src/mu-msg-file.c index 75f57b01..fbdb8e7a 100644 --- a/src/mu-msg-file.c +++ b/src/mu-msg-file.c @@ -512,8 +512,8 @@ stream_to_string (GMimeStream *stream, size_t buflen) } -static gchar* -part_to_string (GMimePart *part, gboolean *err) +gchar* +mu_msg_mime_part_to_string (GMimePart *part, gboolean *err) { GMimeDataWrapper *wrapper; GMimeStream *stream = NULL; @@ -557,39 +557,53 @@ cleanup: } -static char* -get_body (MuMsgFile *self, gboolean want_html) +GMimePart* +mu_msg_mime_get_body_part (GMimeMessage *msg, gboolean want_html) { GetBodyData data; - char *str; - gboolean err; - g_return_val_if_fail (self, NULL); - g_return_val_if_fail (GMIME_IS_MESSAGE(self->_mime_msg), NULL); + g_return_val_if_fail (GMIME_IS_MESSAGE(msg), NULL); memset (&data, 0, sizeof(GetBodyData)); data._want_html = want_html; - err = FALSE; - g_mime_message_foreach (self->_mime_msg, + g_mime_message_foreach (msg, (GMimeObjectForeachFunc)get_body_cb, &data); if (want_html) - str = data._html_part ? - part_to_string (GMIME_PART(data._html_part), &err) : - NULL; + return (GMimePart*)data._html_part; else - str = data._txt_part ? - part_to_string (GMIME_PART(data._txt_part), &err) : - NULL; + return (GMimePart*)data._txt_part; +} - /* note, str may be NULL (no body), but that's not necessarily - * an error; we only warn when an actual error occured */ - if (err) - g_warning ("error occured while retrieving %s body " - "for message %s", - want_html ? "html" : "text", self->_path); - return str; + + +static char* +get_body (MuMsgFile *self, gboolean want_html) +{ + GMimePart *part; + + g_return_val_if_fail (self, NULL); + g_return_val_if_fail (GMIME_IS_MESSAGE(self->_mime_msg), NULL); + + part = mu_msg_mime_get_body_part (self->_mime_msg, want_html); + if (GMIME_IS_PART(part)) { + gboolean err; + gchar *str; + + err = FALSE; + str = mu_msg_mime_part_to_string (part, &err); + + /* note, str may be NULL (no body), but that's not necessarily + * an error; we only warn when an actual error occured */ + if (err) + g_warning ("error occured while retrieving %s body " + "for message %s", + want_html ? "html" : "text", self->_path); + return str; + } + + return NULL; } @@ -672,6 +686,18 @@ maybe_cleanup (const char* str, const char *path, gboolean *do_free) } +G_GNUC_CONST static GMimeRecipientType +recipient_type (MuMsgFieldId mfid) +{ + switch (mfid) { + case MU_MSG_FIELD_ID_BCC: return GMIME_RECIPIENT_TYPE_BCC; + case MU_MSG_FIELD_ID_CC: return GMIME_RECIPIENT_TYPE_CC; + case MU_MSG_FIELD_ID_TO: return GMIME_RECIPIENT_TYPE_TO; + default: g_return_val_if_reached (-1); + } +} + + char* mu_msg_file_get_str_field (MuMsgFile *self, MuMsgFieldId mfid, gboolean *do_free) @@ -683,17 +709,18 @@ mu_msg_file_get_str_field (MuMsgFile *self, MuMsgFieldId mfid, switch (mfid) { - case MU_MSG_FIELD_ID_BCC: *do_free = TRUE; - return get_recipient (self, GMIME_RECIPIENT_TYPE_BCC); + case MU_MSG_FIELD_ID_ATTACH_TEXT: *do_free = TRUE; + return NULL; /* FIXME */ - case MU_MSG_FIELD_ID_BODY_TEXT: *do_free = TRUE; - return get_body (self, FALSE); + case MU_MSG_FIELD_ID_BCC: + case MU_MSG_FIELD_ID_CC: + case MU_MSG_FIELD_ID_TO: *do_free = TRUE; + return get_recipient (self, recipient_type(mfid)); + case MU_MSG_FIELD_ID_BODY_TEXT: case MU_MSG_FIELD_ID_BODY_HTML: *do_free = TRUE; - return get_body (self, TRUE); - - case MU_MSG_FIELD_ID_CC: *do_free = TRUE; - return get_recipient (self, GMIME_RECIPIENT_TYPE_CC); + return get_body + (self, mfid == MU_MSG_FIELD_ID_BODY_HTML ? TRUE : FALSE); case MU_MSG_FIELD_ID_FROM: return (char*)maybe_cleanup @@ -707,9 +734,6 @@ mu_msg_file_get_str_field (MuMsgFile *self, MuMsgFieldId mfid, (g_mime_message_get_subject (self->_mime_msg), self->_path, do_free); - case MU_MSG_FIELD_ID_TO: *do_free = TRUE; - return get_recipient (self, GMIME_RECIPIENT_TYPE_TO); - case MU_MSG_FIELD_ID_MSGID: return (char*)g_mime_message_get_message_id (self->_mime_msg); diff --git a/src/mu-msg-part.c b/src/mu-msg-part.c index 978245a4..8a5d247f 100644 --- a/src/mu-msg-part.c +++ b/src/mu-msg-part.c @@ -66,10 +66,30 @@ struct _PartData { unsigned _idx; MuMsgPartForeachFunc _func; gpointer _user_data; + GMimePart *_body_part; }; typedef struct _PartData PartData; + +char* +mu_msg_part_to_string (MuMsgPart *part, gboolean *err) +{ + char *txt; + + g_return_val_if_fail (part && part->data, NULL); + + if (GMIME_IS_PART(part->data)) + txt = mu_msg_mime_part_to_string (GMIME_PART(part->data), + err); + else + txt = NULL; + + return txt; +} + + + static ssize_t get_part_size (GMimePart *part) { @@ -100,6 +120,10 @@ part_foreach_cb (GMimeObject *parent, GMimeObject *part, PartData *pdata) memset (&pi, 0, sizeof pi); pi.index = pdata->_idx++; pi.content_id = (char*)g_mime_object_get_content_id (part); + pi.data = (gpointer)part; + + /* check if this is the body part */ + pi.is_body = ((void*)pdata->_body_part == (void*)part); ct = g_mime_object_get_content_type (part); @@ -112,11 +136,13 @@ part_foreach_cb (GMimeObject *parent, GMimeObject *part, PartData *pdata) pi.disposition = (char*)g_mime_object_get_disposition (part); pi.file_name = (char*)g_mime_part_get_filename (GMIME_PART(part)); pi.size = get_part_size (GMIME_PART(part)); - } + } else + return; /* only deal with GMimePart */ pdata->_func(pdata->_msg, &pi, pdata->_user_data); } + static gboolean load_msg_file_maybe (MuMsg *msg) { @@ -149,14 +175,18 @@ mu_msg_part_foreach (MuMsg *msg, MuMsgPartForeachFunc func, gpointer user_data) { PartData pdata; + GMimeMessage *mime_msg; - g_return_if_fail (msg); + g_return_if_fail (msg && msg->_file && msg->_file->_mime_msg); if (!load_msg_file_maybe (msg)) return; + mime_msg = msg->_file->_mime_msg; + pdata._msg = msg; pdata._idx = 0; + pdata._body_part = mu_msg_mime_get_body_part (mime_msg, FALSE); pdata._func = func; pdata._user_data = user_data; @@ -269,6 +299,7 @@ mu_msg_part_filepath (MuMsg *msg, const char* targetdir, guint partidx) + gchar* mu_msg_part_filepath_cache (MuMsg *msg, guint partid) { diff --git a/src/mu-msg-part.h b/src/mu-msg-part.h index 470b73cb..526d2b37 100644 --- a/src/mu-msg-part.h +++ b/src/mu-msg-part.h @@ -51,6 +51,9 @@ struct _MuMsgPart { gpointer data; /* opaque data */ + gboolean is_body; /* TRUE if this is probably the + * message body*/ + /* if TRUE, mu_msg_part_destroy will free the member vars * as well*/ gboolean own_members; @@ -77,6 +80,17 @@ typedef struct _MuMsgPart MuMsgPart; #define mu_msg_part_content_id(pi) ((pi)->content_id) +/** + * convert a MuMsgPart to a string + * + * @param part a MuMsgPart + * @param err will receive TRUE if there was an error, FALSE otherwise + * + * @return utf8 string for this MIME part, to be freed by caller + */ +char* mu_msg_part_to_string (MuMsgPart *part, gboolean *err); + + /** * does this msg part look like an attachment? * diff --git a/src/mu-store-write.cc b/src/mu-store-write.cc index 417ac72a..ae0a9932 100644 --- a/src/mu-store-write.cc +++ b/src/mu-store-write.cc @@ -66,7 +66,7 @@ _MuStore::rollback_transaction () { } -/* we cache these prefix strings, so we don't have to allocate the all +/* we cache these prefix strings, so we don't have to allocate them all * the time; this should save 10-20 string allocs per message */ G_GNUC_CONST static const std::string& prefix (MuMsgFieldId mfid) @@ -162,6 +162,7 @@ mu_store_set_metadata (MuStore *store, const char *key, const char *val, try { store->db_writable()->set_metadata (key, val); return TRUE; + } MU_STORE_CATCH_BLOCK_RETURN(err, FALSE); } MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN(err, MU_ERROR_XAPIAN, FALSE); @@ -212,8 +213,6 @@ add_terms_values_date (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid) } } -/* TODO: we could pre-calculate the add_term values for FLAGS */ - /* pre-calculate; optimization */ G_GNUC_CONST static const std::string& flag_val (char flagchar) @@ -310,7 +309,8 @@ static void add_terms_values_str (Xapian::Document& doc, char *val, MuMsgFieldId mfid) { - /* the value is what we'll display; the unchanged original */ + /* the value is what we display in search results; the + * unchanged original */ if (mu_msg_field_xapian_value(mfid)) doc.add_value ((Xapian::valueno)mfid, val); @@ -318,7 +318,8 @@ add_terms_values_str (Xapian::Document& doc, char *val, if (mu_msg_field_normalize (mfid)) mu_str_normalize_in_place (val, TRUE); if (mu_msg_field_xapian_escape (mfid)) - mu_str_ascii_xapian_escape_in_place (val); + mu_str_ascii_xapian_escape_in_place (val, + TRUE /*esc_space*/); if (mu_msg_field_xapian_index (mfid)) { Xapian::TermGenerator termgen; @@ -402,36 +403,86 @@ struct PartData { MuMsgFieldId _mfid; }; + +static gboolean +index_text_part (MuMsgPart *part, PartData *pdata) +{ + unsigned u; + gboolean txt_type, err; + char *txt, *norm; + Xapian::TermGenerator termgen; + + /* check wether it's a type we need to store */ + struct { const char* type; const char *subtype; } txt_types[] = { + { "text", "plain"}, + { "message", "rfc822"}, + }; + + txt_type = FALSE; + for (u = 0; u != G_N_ELEMENTS(txt_types) && !txt_type; ++u) { + if ((strcasecmp (part->type, txt_types[u].type) == 0) && + ((strcasecmp (part->subtype, txt_types[u].subtype) == 0) || + (strcmp (txt_types[u].subtype, "*") == 0))) + txt_type = TRUE; + } + + if (!txt_type) + return FALSE; /* not a supported text type */ + + txt = mu_msg_part_to_string (part, &err); + if (!txt || err) + return FALSE; + + termgen.set_document(pdata->_doc); + + norm = mu_str_normalize (txt, TRUE); + termgen.index_text_without_positions + (norm, 1, prefix(MU_MSG_FIELD_ID_ATTACH_TEXT)); + + g_free (norm); + g_free (txt); + + return TRUE; +} + + static void each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata) { static const std::string - att (prefix(MU_MSG_FIELD_ID_ATTACH)), - mime (prefix(MU_MSG_FIELD_ID_ATTACH_MIME_TYPE)); + file (prefix(MU_MSG_FIELD_ID_FILE)), + mime (prefix(MU_MSG_FIELD_ID_MIME)); - if (mu_msg_part_looks_like_attachment (part, TRUE) && - (part->file_name)) { + /* save the mime type of any part */ + if (part->type) { + /* note, we use '_' instead of '/' to separate + * type/subtype -- Xapian doesn't treat '/' as + * desired, so we use '_' and pre-process queries; see + * mu_query_preprocess */ + char ctype[MuStore::MAX_TERM_LENGTH + 1]; + snprintf (ctype, sizeof(ctype), "%s_%s", + part->type, part->subtype); + pdata->_doc.add_term + (mime + std::string(ctype, 0, MuStore::MAX_TERM_LENGTH)); + } + /* save the name of anything that has a filename */ + if (part->file_name) { char val[MuStore::MAX_TERM_LENGTH + 1]; strncpy (val, part->file_name, sizeof(val)); /* now, let's create a term... */ mu_str_normalize_in_place (val, TRUE); - mu_str_ascii_xapian_escape_in_place (val); + mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/); pdata->_doc.add_term - (att + std::string(val, 0, MuStore::MAX_TERM_LENGTH)); - - /* save the mime type */ - if (part->type) { - gchar *str; - str = g_strdup_printf ("%s/%s", part->type, part->subtype); - pdata->_doc.add_term - (mime + std::string(str, 0, MuStore::MAX_TERM_LENGTH)); - g_free (str); - } else - pdata->_doc.add_term (mime + "application/octet-stream"); + (file + std::string(val, 0, MuStore::MAX_TERM_LENGTH)); } + + /* now, for non-body parts with some MIME-types, index the + * content as well */ + if (!part->is_body) + index_text_part (part, pdata); } @@ -465,8 +516,7 @@ add_terms_values_body (Xapian::Document& doc, MuMsg *msg, termgen.set_document(doc); norm = mu_str_normalize (str, TRUE); - termgen.index_text_without_positions - (norm, 1, prefix(mfid)); + termgen.index_text_without_positions (norm, 1, prefix(mfid)); g_free (norm); } @@ -478,6 +528,24 @@ struct _MsgDoc { }; typedef struct _MsgDoc MsgDoc; + +static void +add_terms_values_default (MuMsgFieldId mfid, MsgDoc* msgdoc) +{ + if (mu_msg_field_is_numeric (mfid)) + add_terms_values_number + (*msgdoc->_doc, msgdoc->_msg, mfid); + else if (mu_msg_field_is_string (mfid)) + add_terms_values_string + (*msgdoc->_doc, msgdoc->_msg, mfid); + else if (mu_msg_field_is_string_list(mfid)) + add_terms_values_string_list + (*msgdoc->_doc, msgdoc->_msg, mfid); + else + g_return_if_reached (); + +} + static void add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc) { @@ -495,26 +563,19 @@ add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc) case MU_MSG_FIELD_ID_BODY_TEXT: add_terms_values_body (*msgdoc->_doc, msgdoc->_msg, mfid); break; - case MU_MSG_FIELD_ID_ATTACH: /* also takes care of MU_MSG_FIELD_ID_ATTACH_MIME */ + + /* note: add_terms_values_attach handles these three msgfields */ + case MU_MSG_FIELD_ID_FILE: add_terms_values_attach (*msgdoc->_doc, msgdoc->_msg, mfid); + case MU_MSG_FIELD_ID_MIME: + case MU_MSG_FIELD_ID_ATTACH_TEXT: break; - case MU_MSG_FIELD_ID_ATTACH_MIME_TYPE: + /////////////////////////////////////////// + case MU_MSG_FIELD_ID_UID: break; /* already taken care of elsewhere */ default: - if (mu_msg_field_is_numeric (mfid)) - add_terms_values_number (*msgdoc->_doc, msgdoc->_msg, - mfid); - else if (mu_msg_field_is_string (mfid)) - add_terms_values_string (*msgdoc->_doc, - msgdoc->_msg, - mfid); - else if (mu_msg_field_is_string_list(mfid)) - add_terms_values_string_list (*msgdoc->_doc, - msgdoc->_msg, - mfid); - else - g_return_if_reached (); + return add_terms_values_default (mfid, msgdoc); } } @@ -559,7 +620,11 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc) /* don't normalize e-mail address, but do lowercase it */ if (!mu_str_is_empty(contact->address)) { - char *escaped = mu_str_ascii_xapian_escape (contact->address); + + char *escaped; + + escaped = mu_str_ascii_xapian_escape (contact->address, + FALSE /*dont esc space*/); msgdoc->_doc->add_term (std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH)); g_free (escaped);