* mu4e: use GStringChunk* for string normalization / escaping

- this should fix the rare bug for some non-Latin unicode blocks,
  simplify some code, and possibly improve performance a bit
This commit is contained in:
djcb
2012-06-12 00:11:14 +03:00
parent 9991c6fd60
commit 423a1d7140
6 changed files with 111 additions and 92 deletions

View File

@ -294,12 +294,13 @@ mu_query_preprocess (const char *query, GError **err)
return NULL; return NULL;
for (cur = parts; cur; cur = g_slist_next(cur)) { for (cur = parts; cur; cur = g_slist_next(cur)) {
char *data;
data = (gchar*)cur->data;
/* remove accents and turn to lower-case */ /* remove accents and turn to lower-case */
cur->data = mu_str_normalize_in_place ((gchar*)cur->data, TRUE);
/* escape '@', single '_' and ':' if it's not following a /* escape '@', single '_' and ':' if it's not following a
* xapian-pfx with '_' */ * xapian-pfx with '_' */
cur->data = mu_str_xapian_escape_in_place cur->data = mu_str_xapian_escape (data, TRUE, NULL);
((gchar*)cur->data, TRUE /*escape spaces too*/); g_free (data);
} }
myquery = mu_str_from_list (parts, ' '); myquery = mu_str_from_list (parts, ' ');

View File

@ -1,6 +1,6 @@
/* -*-mode: c++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8-*- */ /* -*-mode: c++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8-*- */
/* /*
** Copyright (C) 2011 <djcb@djcbsoftware.nl> ** Copyright (C) 2011-2012 <djcb@djcbsoftware.nl>
** **
** This program is free software; you can redistribute it and/or modify it ** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the ** under the terms of the GNU General Public License as published by the

View File

@ -306,7 +306,7 @@ add_terms_values_number (Xapian::Document& doc, MuMsg *msg, MuMsgFieldId mfid)
/* for string and string-list */ /* for string and string-list */
static void static void
add_terms_values_str (Xapian::Document& doc, char *val, add_terms_values_str (Xapian::Document& doc, char *val,
MuMsgFieldId mfid) MuMsgFieldId mfid, GStringChunk *strchunk)
{ {
/* the value is what we display in search results; the /* the value is what we display in search results; the
* unchanged original */ * unchanged original */
@ -315,7 +315,7 @@ add_terms_values_str (Xapian::Document& doc, char *val,
/* now, let's create some search terms... */ /* now, let's create some search terms... */
if (mu_msg_field_normalize (mfid)) if (mu_msg_field_normalize (mfid))
mu_str_normalize_in_place (val, TRUE); val = mu_str_normalize_in_place_try (val, TRUE, strchunk);
if (mu_msg_field_xapian_index (mfid)) { if (mu_msg_field_xapian_index (mfid)) {
Xapian::TermGenerator termgen; Xapian::TermGenerator termgen;
@ -323,8 +323,8 @@ add_terms_values_str (Xapian::Document& doc, char *val,
termgen.index_text_without_positions (val, 1, prefix(mfid)); termgen.index_text_without_positions (val, 1, prefix(mfid));
} }
if (mu_msg_field_xapian_escape (mfid)) if (mu_msg_field_xapian_escape (mfid))
mu_str_xapian_escape_in_place (val, val= mu_str_xapian_escape_in_place_try (val, TRUE /*esc_space*/,
TRUE /*esc_space*/); strchunk);
if (mu_msg_field_xapian_term(mfid)) if (mu_msg_field_xapian_term(mfid))
doc.add_term (prefix(mfid) + doc.add_term (prefix(mfid) +
std::string(val, 0, _MuStore::MAX_TERM_LENGTH)); std::string(val, 0, _MuStore::MAX_TERM_LENGTH));
@ -333,31 +333,23 @@ add_terms_values_str (Xapian::Document& doc, char *val,
static void static void
add_terms_values_string (Xapian::Document& doc, MuMsg *msg, add_terms_values_string (Xapian::Document& doc, MuMsg *msg,
MuMsgFieldId mfid) MuMsgFieldId mfid, GStringChunk *strchunk)
{ {
const char *orig; const char *orig;
char *val; char *val;
size_t len;
if (!(orig = mu_msg_get_field_string (msg, mfid))) if (!(orig = mu_msg_get_field_string (msg, mfid)))
return; /* nothing to do */ return; /* nothing to do */
/* try stack-allocation, it's much faster*/ val = g_string_chunk_insert (strchunk, orig);
len = strlen (orig); add_terms_values_str (doc, val, mfid, strchunk);
val = (char*)(G_LIKELY(len < 1024)?g_alloca(len+1):g_malloc(len+1));
strcpy (val, orig);
add_terms_values_str (doc, val, mfid);
if (!(G_LIKELY(len < 1024)))
g_free (val);
} }
static void static void
add_terms_values_string_list (Xapian::Document& doc, MuMsg *msg, add_terms_values_string_list (Xapian::Document& doc, MuMsg *msg,
MuMsgFieldId mfid) MuMsgFieldId mfid, GStringChunk *strchunk)
{ {
const GSList *lst; const GSList *lst;
@ -373,21 +365,9 @@ add_terms_values_string_list (Xapian::Document& doc, MuMsg *msg,
if (lst && mu_msg_field_xapian_term (mfid)) { if (lst && mu_msg_field_xapian_term (mfid)) {
while (lst) { while (lst) {
size_t len;
char *val; char *val;
/* try stack-allocation, it's much faster*/ val = g_string_chunk_insert (strchunk, (const gchar*)lst->data);
len = strlen ((char*)lst->data); add_terms_values_str (doc, val, mfid, strchunk);
if (G_LIKELY(len < 1024))
val = (char*)g_alloca(len+1);
else
val = (char*)g_malloc(len+1);
strcpy (val, (char*)lst->data);
add_terms_values_str (doc, val, mfid);
if (!(G_LIKELY(len < 1024)))
g_free (val);
lst = g_slist_next ((GSList*)lst); lst = g_slist_next ((GSList*)lst);
} }
} }
@ -395,10 +375,11 @@ add_terms_values_string_list (Xapian::Document& doc, MuMsg *msg,
struct PartData { struct PartData {
PartData (Xapian::Document& doc, MuMsgFieldId mfid): PartData (Xapian::Document& doc, MuMsgFieldId mfid, GStringChunk *strchunk):
_doc (doc), _mfid(mfid) {} _doc (doc), _mfid(mfid), _strchunk(strchunk) {}
Xapian::Document _doc; Xapian::Document _doc;
MuMsgFieldId _mfid; MuMsgFieldId _mfid;
GStringChunk *_strchunk;
}; };
static gboolean static gboolean
@ -434,14 +415,13 @@ index_text_part (MuMsgPart *part, PartData *pdata)
termgen.set_document(pdata->_doc); termgen.set_document(pdata->_doc);
norm = mu_str_normalize (txt, TRUE); /* allocated on strchunk, no need to free */
norm = mu_str_normalize (txt, TRUE, pdata->_strchunk);
termgen.index_text_without_positions termgen.index_text_without_positions
(norm, 1, prefix(MU_MSG_FIELD_ID_EMBEDDED_TEXT)); (norm, 1, prefix(MU_MSG_FIELD_ID_EMBEDDED_TEXT));
g_free (norm);
g_free (txt); g_free (txt);
return TRUE; return TRUE;
} }
@ -470,13 +450,11 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
/* save the name of anything that has a filename */ /* save the name of anything that has a filename */
if (part->file_name) { if (part->file_name) {
char val[MuStore::MAX_TERM_LENGTH + 1]; char *val;
strncpy (val, part->file_name, sizeof(val)); /* now, let's create a term... allocated on strchunk,
* no need to free*/
/* now, let's create a term... */ val = mu_str_xapian_escape (part->file_name, TRUE /*esc space*/,
mu_str_normalize_in_place (val, TRUE); pdata->_strchunk);
mu_str_xapian_escape_in_place (val, TRUE /*esc space*/);
pdata->_doc.add_term pdata->_doc.add_term
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH)); (file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
} }
@ -490,9 +468,9 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
static void static void
add_terms_values_attach (Xapian::Document& doc, MuMsg *msg, add_terms_values_attach (Xapian::Document& doc, MuMsg *msg,
MuMsgFieldId mfid) MuMsgFieldId mfid, GStringChunk *strchunk)
{ {
PartData pdata (doc, mfid); PartData pdata (doc, mfid, strchunk);
mu_msg_part_foreach (msg, TRUE, mu_msg_part_foreach (msg, TRUE,
(MuMsgPartForeachFunc)each_part, &pdata); (MuMsgPartForeachFunc)each_part, &pdata);
} }
@ -500,7 +478,7 @@ add_terms_values_attach (Xapian::Document& doc, MuMsg *msg,
static void static void
add_terms_values_body (Xapian::Document& doc, MuMsg *msg, add_terms_values_body (Xapian::Document& doc, MuMsg *msg,
MuMsgFieldId mfid) MuMsgFieldId mfid, GStringChunk *strchunk)
{ {
const char *str; const char *str;
char *norm; char *norm;
@ -518,32 +496,32 @@ add_terms_values_body (Xapian::Document& doc, MuMsg *msg,
Xapian::TermGenerator termgen; Xapian::TermGenerator termgen;
termgen.set_document(doc); termgen.set_document(doc);
norm = mu_str_normalize (str, TRUE); /* norm is allocated on strchunk, no need for freeing */
norm = mu_str_normalize (str, TRUE, strchunk);
termgen.index_text_without_positions (norm, 1, prefix(mfid)); termgen.index_text_without_positions (norm, 1, prefix(mfid));
g_free (norm);
} }
struct _MsgDoc { struct _MsgDoc {
Xapian::Document *_doc; Xapian::Document *_doc;
MuMsg *_msg; MuMsg *_msg;
MuStore *_store; MuStore *_store;
GStringChunk *_strchunk;
}; };
typedef struct _MsgDoc MsgDoc; typedef struct _MsgDoc MsgDoc;
static void static void
add_terms_values_default (MuMsgFieldId mfid, MsgDoc* msgdoc) add_terms_values_default (MuMsgFieldId mfid, MsgDoc *msgdoc)
{ {
if (mu_msg_field_is_numeric (mfid)) if (mu_msg_field_is_numeric (mfid))
add_terms_values_number add_terms_values_number
(*msgdoc->_doc, msgdoc->_msg, mfid); (*msgdoc->_doc, msgdoc->_msg, mfid);
else if (mu_msg_field_is_string (mfid)) else if (mu_msg_field_is_string (mfid))
add_terms_values_string add_terms_values_string
(*msgdoc->_doc, msgdoc->_msg, mfid); (*msgdoc->_doc, msgdoc->_msg, mfid, msgdoc->_strchunk);
else if (mu_msg_field_is_string_list(mfid)) else if (mu_msg_field_is_string_list(mfid))
add_terms_values_string_list add_terms_values_string_list
(*msgdoc->_doc, msgdoc->_msg, mfid); (*msgdoc->_doc, msgdoc->_msg, mfid, msgdoc->_strchunk);
else else
g_return_if_reached (); g_return_if_reached ();
@ -564,13 +542,14 @@ add_terms_values (MuMsgFieldId mfid, MsgDoc* msgdoc)
add_terms_values_date (*msgdoc->_doc, msgdoc->_msg, mfid); add_terms_values_date (*msgdoc->_doc, msgdoc->_msg, mfid);
break; break;
case MU_MSG_FIELD_ID_BODY_TEXT: case MU_MSG_FIELD_ID_BODY_TEXT:
add_terms_values_body (*msgdoc->_doc, msgdoc->_msg, mfid); add_terms_values_body (*msgdoc->_doc, msgdoc->_msg, mfid, msgdoc->_strchunk);
break; break;
/* note: add_terms_values_attach handles _FILE, _MIME and /* note: add_terms_values_attach handles _FILE, _MIME and
* _ATTACH_TEXT msgfields */ * _ATTACH_TEXT msgfields */
case MU_MSG_FIELD_ID_FILE: case MU_MSG_FIELD_ID_FILE:
add_terms_values_attach (*msgdoc->_doc, msgdoc->_msg, mfid); add_terms_values_attach (*msgdoc->_doc, msgdoc->_msg, mfid,
msgdoc->_strchunk);
break; break;
case MU_MSG_FIELD_ID_MIME: case MU_MSG_FIELD_ID_MIME:
case MU_MSG_FIELD_ID_EMBEDDED_TEXT: case MU_MSG_FIELD_ID_EMBEDDED_TEXT:
@ -622,20 +601,22 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
if (!mu_str_is_empty(contact->name)) { if (!mu_str_is_empty(contact->name)) {
Xapian::TermGenerator termgen; Xapian::TermGenerator termgen;
termgen.set_document (*msgdoc->_doc); termgen.set_document (*msgdoc->_doc);
char *norm = mu_str_normalize (contact->name, TRUE); /* note: norm is added to stringchunk, no need for freeing */
char *norm = mu_str_normalize (contact->name, TRUE, msgdoc->_strchunk);
termgen.index_text_without_positions (norm, 1, pfx); termgen.index_text_without_positions (norm, 1, pfx);
g_free (norm);
} }
/* don't normalize e-mail address, but do lowercase it */ /* don't normalize e-mail address, but do lowercase it */
if (!mu_str_is_empty(contact->address)) { if (!mu_str_is_empty(contact->address)) {
char *escaped; char *escaped;
/* note: escaped is added to stringchunk, no need for
* freeing */
escaped = mu_str_xapian_escape (contact->address, escaped = mu_str_xapian_escape (contact->address,
FALSE /*dont esc space*/); FALSE /*dont esc space*/,
msgdoc->_strchunk);
msgdoc->_doc->add_term msgdoc->_doc->add_term
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH)); (std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
g_free (escaped);
/* store it also in our contacts cache */ /* store it also in our contacts cache */
if (msgdoc->_store->contacts()) if (msgdoc->_store->contacts())
@ -645,18 +626,22 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
} }
} }
#define MU_STRING_CHUNK_SIZE 8192
Xapian::Document Xapian::Document
new_doc_from_message (MuStore *store, MuMsg *msg) new_doc_from_message (MuStore *store, MuMsg *msg)
{ {
Xapian::Document doc; Xapian::Document doc;
MsgDoc docinfo = {&doc, msg, store}; MsgDoc docinfo = {&doc, msg, store, 0};
docinfo._strchunk = g_string_chunk_new (MU_STRING_CHUNK_SIZE);
mu_msg_field_foreach ((MuMsgFieldForeachFunc)add_terms_values, &docinfo); mu_msg_field_foreach ((MuMsgFieldForeachFunc)add_terms_values, &docinfo);
/* also store the contact-info as separate terms */ /* also store the contact-info as separate terms */
mu_msg_contact_foreach (msg, (MuMsgContactForeachFunc)each_contact_info, mu_msg_contact_foreach (msg, (MuMsgContactForeachFunc)each_contact_info,
&docinfo); &docinfo);
g_string_chunk_free (docinfo._strchunk);
return doc; return doc;
} }

View File

@ -30,34 +30,48 @@
char* char*
mu_str_normalize (const char *str, gboolean downcase) mu_str_normalize (const char *str, gboolean downcase, GStringChunk *strchunk)
{ {
char *mystr;
g_return_val_if_fail (str, NULL); g_return_val_if_fail (str, NULL);
return mu_str_normalize_in_place (g_strdup(str), downcase); if (strchunk)
mystr = g_string_chunk_insert (strchunk, str);
else
mystr = g_strdup (str);
return mu_str_normalize_in_place_try (mystr, downcase, strchunk);
} }
/* this implementation should work for _all_ locales. */ /* this implementation should work for _all_ locales. */
static char* static char*
mu_str_normalize_in_place_generic (char *str, gboolean downcase) mu_str_normalize_in_place_generic (char *str, gboolean downcase, GStringChunk *strchunk)
{ {
/* FIXME: add accent-folding etc. */
if (downcase) {
char *norm; char *norm;
size_t len; size_t len;
/* FIXME: add accent-folding etc. */
if (!downcase)
return str; /* nothing to do */
len = strlen (str); len = strlen (str);
norm = g_utf8_strdown (str, len); norm = g_utf8_strdown (str, len);
if (strlen (norm) > len)
g_warning ("normalized text doesn't fit :/");
memcpy (str, norm, len); if (strlen (norm) > len) {
/* this case is rare, but does happen */
char *copy;
if (!strchunk)
return norm;
copy = g_string_chunk_insert (strchunk, norm);
g_free (norm);
return copy;
} }
memcpy (str, norm, len);
return str; return str;
} }
@ -78,7 +92,7 @@ mu_str_normalize_in_place_generic (char *str, gboolean downcase)
* original 0xc3 0x9f * original 0xc3 0x9f
*/ */
char* char*
mu_str_normalize_in_place (char *str, gboolean downcase) mu_str_normalize_in_place_try (char *str, gboolean downcase, GStringChunk *strchunk)
{ {
const guchar *cur; const guchar *cur;
int i; int i;
@ -386,7 +400,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
/* our fast-path for latin-utf8 does not work -- bummer! /* our fast-path for latin-utf8 does not work -- bummer!
* use something more generic (but a bit slower) * use something more generic (but a bit slower)
*/ */
return mu_str_normalize_in_place_generic (str, downcase); return mu_str_normalize_in_place_generic (str, downcase, strchunk);
} }
} }

View File

@ -430,7 +430,7 @@ check_for_field (const char *str, gboolean *is_field, gboolean *is_range_field)
* function expects search terms (not complete queries) * function expects search terms (not complete queries)
* */ * */
char* char*
mu_str_xapian_escape_in_place (char *term, gboolean esc_space) mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk *strchunk)
{ {
unsigned char *cur; unsigned char *cur;
const char escchar = '_'; const char escchar = '_';
@ -474,15 +474,22 @@ mu_str_xapian_escape_in_place (char *term, gboolean esc_space)
} }
/* downcase and try to remove accents etc. */ /* downcase and try to remove accents etc. */
return mu_str_normalize_in_place (term, TRUE); return mu_str_normalize_in_place_try (term, TRUE, strchunk);
} }
char* char*
mu_str_xapian_escape (const char *query, gboolean esc_space) mu_str_xapian_escape (const char *query, gboolean esc_space, GStringChunk *strchunk)
{ {
char *mystr;
g_return_val_if_fail (query, NULL); g_return_val_if_fail (query, NULL);
return mu_str_xapian_escape_in_place (g_strdup(query), esc_space); if (strchunk)
mystr = g_string_chunk_insert (strchunk, query);
else
mystr = g_strdup (query);
return mu_str_xapian_escape_in_place_try (mystr, esc_space, strchunk);
} }

View File

@ -107,14 +107,16 @@ char* mu_str_summarize (const char* str, size_t max_lines)
* *
* @param str a valid utf8 string or NULL * @param str a valid utf8 string or NULL
* @param downcase if TRUE, convert the string to lowercase * @param downcase if TRUE, convert the string to lowercase
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
* *
* @return the normalize string, or NULL in case of error or str was NULL * @return the normalized string, or NULL in case of error or str was
* NULL. Unless strchunk was provided, user must g_free the string when
* no longer needed
*/ */
char* mu_str_normalize (const char *str, gboolean downcase) char* mu_str_normalize (const char *str, gboolean downcase,
GStringChunk *strchunk)
G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT; G_GNUC_MALLOC G_GNUC_WARN_UNUSED_RESULT;
/** /**
* normalize a string (i.e., collapse accented characters etc.), and * normalize a string (i.e., collapse accented characters etc.), and
* optionally, downcase it. this happens by changing the string; if * optionally, downcase it. this happens by changing the string; if
@ -123,12 +125,14 @@ char* mu_str_normalize (const char *str, gboolean downcase)
* *
* @param str a valid utf8 string or NULL * @param str a valid utf8 string or NULL
* @param downcase if TRUE, convert the string to lowercase * @param downcase if TRUE, convert the string to lowercase
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
* *
* @return the normalized string, or NULL in case of error or str was * @return the normalized string, or NULL in case of error or str was
* NULL * NULL. User only needs to free the returned string if a) return
* value != str and b) strchunk was not provided.
*/ */
char* mu_str_normalize_in_place (char *str, gboolean downcase); char* mu_str_normalize_in_place_try (char *str, gboolean downcase,
GStringChunk *strchunk);
/** /**
* escape the string for use with xapian matching. in practice, if the * escape the string for use with xapian matching. in practice, if the
@ -140,10 +144,15 @@ char* mu_str_normalize_in_place (char *str, gboolean downcase);
* *
* @param query a query string * @param query a query string
* @param esc_space escape space characters as well * @param esc_space escape space characters as well
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
*
* @return the escaped string or NULL in case of error. User only
* needs to free the returned string if a) return value != query and b)
* strchunk was not provided.
* *
* @return the escaped string or NULL in case of error
*/ */
char* mu_str_xapian_escape_in_place (char *query, gboolean esc_space); char* mu_str_xapian_escape_in_place_try (char *query, gboolean esc_space,
GStringChunk *strchunk);
/** /**
* escape the string for use with xapian matching. in practice, if the * escape the string for use with xapian matching. in practice, if the
@ -153,11 +162,14 @@ char* mu_str_xapian_escape_in_place (char *query, gboolean esc_space);
* *
* @param query a query string * @param query a query string
* @param esc_space escape space characters as well * @param esc_space escape space characters as well
* @param strchunk (optional) if non-NULL, allocate strings on strchunk
* *
* @return the escaped string (free with g_free) or NULL in case of error * @return the escaped string (free with g_free) or NULL in case of error
* Unless strchunk was provided, user must g_free the string when
* no longer needed
*/ */
char* mu_str_xapian_escape (const char *query, gboolean esc_space) char* mu_str_xapian_escape (const char *query, gboolean esc_space,
G_GNUC_WARN_UNUSED_RESULT; GStringChunk *strchunk) G_GNUC_WARN_UNUSED_RESULT;