* add indexing/searching for text-based mime parts

This commit is contained in:
djcb
2011-11-24 00:11:45 +02:00
parent 5ac9693681
commit cc6738c195
4 changed files with 209 additions and 75 deletions

View File

@ -66,7 +66,7 @@ _MuStore::rollback_transaction () {
}
/* we cache these prefix strings, so we don't have to allocate the all
/* we cache these prefix strings, so we don't have to allocate them all
* the time; this should save 10-20 string allocs per message */
G_GNUC_CONST static const std::string&
prefix (MuMsgFieldId mfid)
@ -162,6 +162,7 @@ mu_store_set_metadata (MuStore *store, const char *key, const char *val,
try {
store->db_writable()->set_metadata (key, val);
return TRUE;
} MU_STORE_CATCH_BLOCK_RETURN(err, FALSE);
} MU_XAPIAN_CATCH_BLOCK_G_ERROR_RETURN(err, MU_ERROR_XAPIAN, FALSE);
@ -212,8 +213,6 @@ add_terms_values_date (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid)
}
}
/* TODO: we could pre-calculate the add_term values for FLAGS */
/* pre-calculate; optimization */
G_GNUC_CONST static const std::string&
flag_val (char flagchar)
@ -310,7 +309,8 @@ static void
add_terms_values_str (Xapian::Document& doc, char *val,
MuMsgFieldId mfid)
{
/* the value is what we'll display; the unchanged original */
/* the value is what we display in search results; the
* unchanged original */
if (mu_msg_field_xapian_value(mfid))
doc.add_value ((Xapian::valueno)mfid, val);
@ -318,7 +318,8 @@ add_terms_values_str (Xapian::Document& doc, char *val,
if (mu_msg_field_normalize (mfid))
mu_str_normalize_in_place (val, TRUE);
if (mu_msg_field_xapian_escape (mfid))
mu_str_ascii_xapian_escape_in_place (val);
mu_str_ascii_xapian_escape_in_place (val,
TRUE /*esc_space*/);
if (mu_msg_field_xapian_index (mfid)) {
Xapian::TermGenerator termgen;
@ -402,36 +403,86 @@ struct PartData {
MuMsgFieldId _mfid;
};
/* Index the textual content of a single MIME part.
 *
 * Only parts whose content type is listed in txt_types below are
 * considered; for those, the part is converted to a string, normalized,
 * and its terms are added (without positions) to pdata->_doc under the
 * prefix for MU_MSG_FIELD_ID_ATTACH_TEXT.
 *
 * @param part  the MIME part to inspect; its type/subtype may be NULL
 * @param pdata per-message data; pdata->_doc is the document receiving terms
 *
 * @return TRUE if the part's text was indexed; FALSE if the part has no
 * (supported) content type or its text could not be obtained.
 */
static gboolean
index_text_part (MuMsgPart *part, PartData *pdata)
{
	unsigned u;
	gboolean txt_type, err, indexed;
	char *txt, *norm;

	/* the content types we index; a subtype of "*" in this table
	 * matches any subtype of the given type */
	struct { const char* type; const char *subtype; } txt_types[] = {
		{ "text",    "plain"},
		{ "message", "rfc822"},
	};

	/* the caller invokes us for every non-body part, including those
	 * without a content type; strcasecmp must never see NULL */
	if (!part->type || !part->subtype)
		return FALSE;

	/* check whether it's a type we need to store */
	txt_type = FALSE;
	for (u = 0; u != G_N_ELEMENTS(txt_types) && !txt_type; ++u) {
		if ((strcasecmp (part->type, txt_types[u].type) == 0) &&
		    ((strcasecmp (part->subtype, txt_types[u].subtype) == 0) ||
		     (strcmp (txt_types[u].subtype, "*") == 0)))
			txt_type = TRUE;
	}

	if (!txt_type)
		return FALSE; /* not a supported text type */

	/* NOTE(review): initialized defensively, in case
	 * mu_msg_part_to_string only writes 'err' on failure -- confirm */
	err = FALSE;
	txt = mu_msg_part_to_string (part, &err);
	if (!txt || err) {
		/* txt may be non-NULL even when err is set; don't leak it
		 * (g_free accepts NULL) */
		g_free (txt);
		return FALSE;
	}

	/* NOTE(review): guard against a NULL result; constructing the
	 * std::string argument below from NULL would be undefined */
	norm    = mu_str_normalize (txt, TRUE);
	indexed = FALSE;
	if (norm) {
		/* only set up the term generator once we know there is
		 * something to index */
		Xapian::TermGenerator termgen;
		termgen.set_document(pdata->_doc);
		termgen.index_text_without_positions
			(norm, 1, prefix(MU_MSG_FIELD_ID_ATTACH_TEXT));
		indexed = TRUE;
	}

	g_free (norm);
	g_free (txt);

	return indexed;
}
static void
each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
{
static const std::string
att (prefix(MU_MSG_FIELD_ID_ATTACH)),
mime (prefix(MU_MSG_FIELD_ID_ATTACH_MIME_TYPE));
file (prefix(MU_MSG_FIELD_ID_FILE)),
mime (prefix(MU_MSG_FIELD_ID_MIME));
if (mu_msg_part_looks_like_attachment (part, TRUE) &&
(part->file_name)) {
/* save the mime type of any part */
if (part->type) {
/* note, we use '_' instead of '/' to separate
* type/subtype -- Xapian doesn't treat '/' as
* desired, so we use '_' and pre-process queries; see
* mu_query_preprocess */
char ctype[MuStore::MAX_TERM_LENGTH + 1];
snprintf (ctype, sizeof(ctype), "%s_%s",
part->type, part->subtype);
pdata->_doc.add_term
(mime + std::string(ctype, 0, MuStore::MAX_TERM_LENGTH));
}
/* save the name of anything that has a filename */
if (part->file_name) {
char val[MuStore::MAX_TERM_LENGTH + 1];
strncpy (val, part->file_name, sizeof(val));
/* now, let's create a term... */
mu_str_normalize_in_place (val, TRUE);
mu_str_ascii_xapian_escape_in_place (val);
mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/);
pdata->_doc.add_term
(att + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
/* save the mime type */
if (part->type) {
gchar *str;
str = g_strdup_printf ("%s/%s", part->type, part->subtype);
pdata->_doc.add_term
(mime + std::string(str, 0, MuStore::MAX_TERM_LENGTH));
g_free (str);
} else
pdata->_doc.add_term (mime + "application/octet-stream");
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
}
/* now, for non-body parts with some MIME-types, index the
* content as well */
if (!part->is_body)
index_text_part (part, pdata);
}
@ -465,8 +516,7 @@ add_terms_values_body (Xapian::Document& doc, MuMsg *msg,
termgen.set_document(doc);
norm = mu_str_normalize (str, TRUE);
termgen.index_text_without_positions
(norm, 1, prefix(mfid));
termgen.index_text_without_positions (norm, 1, prefix(mfid));
g_free (norm);
}
@ -478,6 +528,24 @@ struct _MsgDoc {
};
typedef struct _MsgDoc MsgDoc;
/* Fallback for message fields without special handling: dispatch on the
 * field's kind (numeric, string, or string-list) and add the matching
 * terms/values to the document in msgdoc. A field of any other kind is
 * treated as a programming error and reported via g_return_if_reached. */
static void
add_terms_values_default (MuMsgFieldId mfid, MsgDoc* msgdoc)
{
	if (mu_msg_field_is_numeric (mfid)) {
		add_terms_values_number (*msgdoc->_doc, msgdoc->_msg, mfid);
		return;
	}

	if (mu_msg_field_is_string (mfid)) {
		add_terms_values_string (*msgdoc->_doc, msgdoc->_msg, mfid);
		return;
	}

	if (mu_msg_field_is_string_list(mfid)) {
		add_terms_values_string_list (*msgdoc->_doc, msgdoc->_msg, mfid);
		return;
	}

	/* none of the known field kinds */
	g_return_if_reached ();
}
static void
add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc)
{
@ -495,26 +563,19 @@ add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc)
case MU_MSG_FIELD_ID_BODY_TEXT:
add_terms_values_body (*msgdoc->_doc, msgdoc->_msg, mfid);
break;
case MU_MSG_FIELD_ID_ATTACH: /* also takes care of MU_MSG_FIELD_ID_ATTACH_MIME */
/* note: add_terms_values_attach handles these three msgfields */
case MU_MSG_FIELD_ID_FILE:
add_terms_values_attach (*msgdoc->_doc, msgdoc->_msg, mfid);
case MU_MSG_FIELD_ID_MIME:
case MU_MSG_FIELD_ID_ATTACH_TEXT:
break;
case MU_MSG_FIELD_ID_ATTACH_MIME_TYPE:
///////////////////////////////////////////
case MU_MSG_FIELD_ID_UID:
break; /* already taken care of elsewhere */
default:
if (mu_msg_field_is_numeric (mfid))
add_terms_values_number (*msgdoc->_doc, msgdoc->_msg,
mfid);
else if (mu_msg_field_is_string (mfid))
add_terms_values_string (*msgdoc->_doc,
msgdoc->_msg,
mfid);
else if (mu_msg_field_is_string_list(mfid))
add_terms_values_string_list (*msgdoc->_doc,
msgdoc->_msg,
mfid);
else
g_return_if_reached ();
return add_terms_values_default (mfid, msgdoc);
}
}
@ -559,7 +620,11 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
/* don't normalize e-mail address, but do lowercase it */
if (!mu_str_is_empty(contact->address)) {
char *escaped = mu_str_ascii_xapian_escape (contact->address);
char *escaped;
escaped = mu_str_ascii_xapian_escape (contact->address,
FALSE /*dont esc space*/);
msgdoc->_doc->add_term
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
g_free (escaped);