* improve support for non-Latin languages (Cyrillic etc.) (WIP)
- change the various escaping / normalization functions to better deal with
non-ASCII, non-Latin languages, such as Russian.
It seems we can now match 'Тесла' or 'Аркона' without problems.
- added unit test.
- WIP -- needs more testing.
This commit is contained in:
@ -139,7 +139,7 @@ static const MuMsgField FIELD_DATA[] = {
|
|||||||
MU_MSG_FIELD_ID_FILE,
|
MU_MSG_FIELD_ID_FILE,
|
||||||
MU_MSG_FIELD_TYPE_STRING,
|
MU_MSG_FIELD_TYPE_STRING,
|
||||||
"file" , 'j', 'J',
|
"file" , 'j', 'J',
|
||||||
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_NORMALIZE |
|
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_ESCAPE |
|
||||||
FLAG_DONT_CACHE | FLAG_XAPIAN_PREFIX_ONLY
|
FLAG_DONT_CACHE | FLAG_XAPIAN_PREFIX_ONLY
|
||||||
},
|
},
|
||||||
|
|
||||||
@ -164,7 +164,8 @@ static const MuMsgField FIELD_DATA[] = {
|
|||||||
MU_MSG_FIELD_TYPE_STRING,
|
MU_MSG_FIELD_TYPE_STRING,
|
||||||
"path", 'l', 'L', /* 'l' for location */
|
"path", 'l', 'L', /* 'l' for location */
|
||||||
FLAG_GMIME | FLAG_XAPIAN_VALUE |
|
FLAG_GMIME | FLAG_XAPIAN_VALUE |
|
||||||
FLAG_XAPIAN_BOOLEAN | FLAG_XAPIAN_PREFIX_ONLY
|
FLAG_XAPIAN_BOOLEAN | FLAG_XAPIAN_PREFIX_ONLY |
|
||||||
|
FLAG_XAPIAN_ESCAPE
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -172,7 +173,7 @@ static const MuMsgField FIELD_DATA[] = {
|
|||||||
MU_MSG_FIELD_TYPE_STRING,
|
MU_MSG_FIELD_TYPE_STRING,
|
||||||
"maildir", 'm', 'M',
|
"maildir", 'm', 'M',
|
||||||
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_VALUE |
|
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_VALUE |
|
||||||
FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY
|
FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
||||||
@ -204,7 +205,7 @@ static const MuMsgField FIELD_DATA[] = {
|
|||||||
MU_MSG_FIELD_TYPE_STRING,
|
MU_MSG_FIELD_TYPE_STRING,
|
||||||
"subject", 's', 'S',
|
"subject", 's', 'S',
|
||||||
FLAG_GMIME | FLAG_XAPIAN_INDEX | FLAG_XAPIAN_VALUE |
|
FLAG_GMIME | FLAG_XAPIAN_INDEX | FLAG_XAPIAN_VALUE |
|
||||||
FLAG_XAPIAN_TERM | FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE
|
FLAG_XAPIAN_TERM | FLAG_XAPIAN_ESCAPE
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
@ -234,7 +235,7 @@ static const MuMsgField FIELD_DATA[] = {
|
|||||||
MU_MSG_FIELD_TYPE_STRING_LIST,
|
MU_MSG_FIELD_TYPE_STRING_LIST,
|
||||||
"tag", 'x', 'X',
|
"tag", 'x', 'X',
|
||||||
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_PREFIX_ONLY |
|
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_PREFIX_ONLY |
|
||||||
FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE
|
FLAG_XAPIAN_ESCAPE
|
||||||
},
|
},
|
||||||
|
|
||||||
{ /* special, internal field, to get a unique key */
|
{ /* special, internal field, to get a unique key */
|
||||||
|
|||||||
@ -298,7 +298,7 @@ mu_query_preprocess (const char *query, GError **err)
|
|||||||
cur->data = mu_str_normalize_in_place ((gchar*)cur->data, TRUE);
|
cur->data = mu_str_normalize_in_place ((gchar*)cur->data, TRUE);
|
||||||
/* escape '@', single '_' and ':' if it's not following a
|
/* escape '@', single '_' and ':' if it's not following a
|
||||||
* xapian-pfx with '_' */
|
* xapian-pfx with '_' */
|
||||||
cur->data = mu_str_ascii_xapian_escape_in_place
|
cur->data = mu_str_xapian_escape_in_place
|
||||||
((gchar*)cur->data, TRUE /*escape spaces too*/);
|
((gchar*)cur->data, TRUE /*escape spaces too*/);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -324,7 +324,7 @@ add_terms_values_str (Xapian::Document& doc, char *val,
|
|||||||
termgen.index_text_without_positions (val, 1, prefix(mfid));
|
termgen.index_text_without_positions (val, 1, prefix(mfid));
|
||||||
}
|
}
|
||||||
if (mu_msg_field_xapian_escape (mfid))
|
if (mu_msg_field_xapian_escape (mfid))
|
||||||
mu_str_ascii_xapian_escape_in_place (val,
|
mu_str_xapian_escape_in_place (val,
|
||||||
TRUE /*esc_space*/);
|
TRUE /*esc_space*/);
|
||||||
if (mu_msg_field_xapian_term(mfid))
|
if (mu_msg_field_xapian_term(mfid))
|
||||||
doc.add_term (prefix(mfid) +
|
doc.add_term (prefix(mfid) +
|
||||||
@ -476,7 +476,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
|
|||||||
|
|
||||||
/* now, let's create a term... */
|
/* now, let's create a term... */
|
||||||
mu_str_normalize_in_place (val, TRUE);
|
mu_str_normalize_in_place (val, TRUE);
|
||||||
mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/);
|
mu_str_xapian_escape_in_place (val, TRUE /*esc space*/);
|
||||||
|
|
||||||
pdata->_doc.add_term
|
pdata->_doc.add_term
|
||||||
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
|
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
|
||||||
@ -632,7 +632,7 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
|
|||||||
if (!mu_str_is_empty(contact->address)) {
|
if (!mu_str_is_empty(contact->address)) {
|
||||||
|
|
||||||
char *escaped;
|
char *escaped;
|
||||||
escaped = mu_str_ascii_xapian_escape (contact->address,
|
escaped = mu_str_xapian_escape (contact->address,
|
||||||
FALSE /*dont esc space*/);
|
FALSE /*dont esc space*/);
|
||||||
msgdoc->_doc->add_term
|
msgdoc->_doc->add_term
|
||||||
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
|
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
|
||||||
|
|||||||
@ -38,6 +38,30 @@ mu_str_normalize (const char *str, gboolean downcase)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* this implementation should work for _all_ locales. */
|
||||||
|
static char*
|
||||||
|
mu_str_normalize_in_place_generic (char *str, gboolean downcase)
|
||||||
|
{
|
||||||
|
/* FIXME: add accent-folding etc. */
|
||||||
|
|
||||||
|
if (downcase) {
|
||||||
|
|
||||||
|
char *norm;
|
||||||
|
size_t len;
|
||||||
|
|
||||||
|
len = strlen (str);
|
||||||
|
norm = g_utf8_strdown (str, len);
|
||||||
|
|
||||||
|
if (strlen (norm) > len)
|
||||||
|
g_warning ("normalized text doesn't fit :/");
|
||||||
|
|
||||||
|
memcpy (str, norm, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* this implementation works for accented chars in Unicode Blocks
|
* this implementation works for accented chars in Unicode Blocks
|
||||||
* 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower
|
* 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower
|
||||||
@ -66,7 +90,8 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
|
|||||||
|
|
||||||
for (i = 0, cur = (const guchar*)str; *cur; ++cur) {
|
for (i = 0, cur = (const guchar*)str; *cur; ++cur) {
|
||||||
|
|
||||||
if (G_LIKELY(*cur < 0xc3 || *cur > 0xc5)) {
|
/* special case for plain-old ascii */
|
||||||
|
if ((*cur < 0x80)) {
|
||||||
str[i++] = downcase ? tolower (*cur) : *cur;
|
str[i++] = downcase ? tolower (*cur) : *cur;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -263,7 +288,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} else { /* Latin Extended-A (0xc5) */
|
} else if (*cur == 0xc5) { /* Latin Extended-A (0xc5) */
|
||||||
++cur;
|
++cur;
|
||||||
switch (*cur) {
|
switch (*cur) {
|
||||||
case 0x81: str[i++] = downcase ? 'l': 'L'; break;
|
case 0x81: str[i++] = downcase ? 'l': 'L'; break;
|
||||||
@ -357,6 +382,11 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
|
|||||||
|
|
||||||
default: str[i++] = *cur; break;
|
default: str[i++] = *cur; break;
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
/* our fast-path for latin-utf8 does not work -- bummer!
|
||||||
|
* use something more generic (but a bit slower)
|
||||||
|
*/
|
||||||
|
return mu_str_normalize_in_place_generic (str, downcase);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
24
src/mu-str.c
24
src/mu-str.c
@ -423,9 +423,9 @@ check_for_field (const char *str, gboolean *is_field, gboolean *is_range_field)
|
|||||||
* function expects search terms (not complete queries)
|
* function expects search terms (not complete queries)
|
||||||
* */
|
* */
|
||||||
char*
|
char*
|
||||||
mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
|
mu_str_xapian_escape_in_place (char *term, gboolean esc_space)
|
||||||
{
|
{
|
||||||
gchar *cur;
|
unsigned char *cur;
|
||||||
const char escchar = '_';
|
const char escchar = '_';
|
||||||
gboolean is_field, is_range_field;
|
gboolean is_field, is_range_field;
|
||||||
unsigned colon;
|
unsigned colon;
|
||||||
@ -434,13 +434,10 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
|
|||||||
|
|
||||||
check_for_field (term, &is_field, &is_range_field);
|
check_for_field (term, &is_field, &is_range_field);
|
||||||
|
|
||||||
for (colon = 0, cur = term; *cur; ++cur) {
|
for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) {
|
||||||
|
|
||||||
*cur = tolower(*cur);
|
|
||||||
|
|
||||||
switch (*cur) {
|
switch (*cur) {
|
||||||
*cur = escchar;
|
|
||||||
break;
|
|
||||||
case '.': /* escape '..' if it's not a range field*/
|
case '.': /* escape '..' if it's not a range field*/
|
||||||
if (is_range_field && cur[1] == '.')
|
if (is_range_field && cur[1] == '.')
|
||||||
cur += 1;
|
cur += 1;
|
||||||
@ -461,21 +458,24 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
|
|||||||
case '*': /* wildcard */
|
case '*': /* wildcard */
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (!isalnum(*cur))
|
/* escape all other special stuff */
|
||||||
|
if (*cur < '0' || (*cur > '9' && *cur < 'A')
|
||||||
|
|| (*cur > 'Z' && *cur < 'a') ||
|
||||||
|
(*cur > 'z' && *cur < 0x80))
|
||||||
*cur = escchar;
|
*cur = escchar;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return term;
|
/* downcase try to remove accents etc. */
|
||||||
|
return mu_str_normalize_in_place (term, TRUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
char*
|
char*
|
||||||
mu_str_ascii_xapian_escape (const char *query, gboolean esc_space)
|
mu_str_xapian_escape (const char *query, gboolean esc_space)
|
||||||
{
|
{
|
||||||
g_return_val_if_fail (query, NULL);
|
g_return_val_if_fail (query, NULL);
|
||||||
|
|
||||||
return mu_str_ascii_xapian_escape_in_place (g_strdup(query), esc_space);
|
return mu_str_xapian_escape_in_place (g_strdup(query), esc_space);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -138,14 +138,12 @@ char* mu_str_normalize_in_place (char *str, gboolean downcase);
|
|||||||
* changing is done in-place (by changing the argument string). in any
|
* changing is done in-place (by changing the argument string). in any
|
||||||
* case, the string will be downcased.
|
* case, the string will be downcased.
|
||||||
*
|
*
|
||||||
* works for ascii strings, like e-mail addresses and message-id.
|
|
||||||
*
|
|
||||||
* @param query a query string
|
* @param query a query string
|
||||||
* @param esc_space escape space characters as well
|
* @param esc_space escape space characters as well
|
||||||
*
|
*
|
||||||
* @return the escaped string or NULL in case of error
|
* @return the escaped string or NULL in case of error
|
||||||
*/
|
*/
|
||||||
char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space);
|
char* mu_str_xapian_escape_in_place (char *query, gboolean esc_space);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* escape the string for use with xapian matching. in practice, if the
|
* escape the string for use with xapian matching. in practice, if the
|
||||||
@ -153,14 +151,12 @@ char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space);
|
|||||||
* replace ':' with '_', if it's not following a xapian-prefix (such
|
* replace ':' with '_', if it's not following a xapian-prefix (such
|
||||||
* as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
|
* as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
|
||||||
*
|
*
|
||||||
* works for ascii strings, like e-mail addresses and message-id.
|
|
||||||
*
|
|
||||||
* @param query a query string
|
* @param query a query string
|
||||||
* @param esc_space escape space characters as well
|
* @param esc_space escape space characters as well
|
||||||
*
|
*
|
||||||
* @return the escaped string (free with g_free) or NULL in case of error
|
* @return the escaped string (free with g_free) or NULL in case of error
|
||||||
*/
|
*/
|
||||||
char* mu_str_ascii_xapian_escape (const char *query, gboolean esc_space)
|
char* mu_str_xapian_escape (const char *query, gboolean esc_space)
|
||||||
G_GNUC_WARN_UNUSED_RESULT;
|
G_GNUC_WARN_UNUSED_RESULT;
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -180,7 +180,7 @@ test_mu_str_esc_to_list (void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_mu_str_ascii_xapian_escape (void)
|
test_mu_str_xapian_escape (void)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
struct {
|
struct {
|
||||||
@ -204,7 +204,7 @@ test_mu_str_ascii_xapian_escape (void)
|
|||||||
|
|
||||||
for (i = 0; i != G_N_ELEMENTS(words); ++i) {
|
for (i = 0; i != G_N_ELEMENTS(words); ++i) {
|
||||||
gchar *a = g_strdup (words[i].word);
|
gchar *a = g_strdup (words[i].word);
|
||||||
mu_str_ascii_xapian_escape_in_place (a, FALSE);
|
mu_str_xapian_escape_in_place (a, FALSE);
|
||||||
|
|
||||||
if (g_test_verbose())
|
if (g_test_verbose())
|
||||||
g_print ("expected: '%s' <=> got: '%s'\n",
|
g_print ("expected: '%s' <=> got: '%s'\n",
|
||||||
@ -216,6 +216,36 @@ test_mu_str_ascii_xapian_escape (void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
test_mu_str_xapian_escape_non_ascii (void)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct {
|
||||||
|
const char* word;
|
||||||
|
const char* esc;
|
||||||
|
} words [] = {
|
||||||
|
{ "Тесла, Никола", "тесла__никола"},
|
||||||
|
{ "Masha@Аркона.ru", "masha_аркона_ru" },
|
||||||
|
{ "foo:ελληνικά", "foo_ελληνικά" },
|
||||||
|
{ "日本語!!", "日本語__" },
|
||||||
|
};
|
||||||
|
|
||||||
|
for (i = 0; i != G_N_ELEMENTS(words); ++i) {
|
||||||
|
gchar *a = g_strdup (words[i].word);
|
||||||
|
mu_str_xapian_escape_in_place (a, FALSE);
|
||||||
|
|
||||||
|
if (g_test_verbose())
|
||||||
|
g_print ("(%s) expected: '%s' <=> got: '%s'\n",
|
||||||
|
words[i].word, words[i].esc, a);
|
||||||
|
|
||||||
|
g_assert_cmpstr (a, ==, words[i].esc);
|
||||||
|
g_free (a);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
test_mu_str_display_contact (void)
|
test_mu_str_display_contact (void)
|
||||||
{
|
{
|
||||||
@ -454,8 +484,10 @@ main (int argc, char *argv[])
|
|||||||
g_test_add_func ("/mu-str/mu-str-normalize-02",
|
g_test_add_func ("/mu-str/mu-str-normalize-02",
|
||||||
test_mu_str_normalize_02);
|
test_mu_str_normalize_02);
|
||||||
|
|
||||||
g_test_add_func ("/mu-str/mu-str-ascii-xapian-escape",
|
g_test_add_func ("/mu-str/mu-str-xapian-escape",
|
||||||
test_mu_str_ascii_xapian_escape);
|
test_mu_str_xapian_escape);
|
||||||
|
g_test_add_func ("/mu-str/mu-str-xapian-escape-non-ascii",
|
||||||
|
test_mu_str_xapian_escape_non_ascii);
|
||||||
|
|
||||||
g_test_add_func ("/mu-str/mu-str-display_contact",
|
g_test_add_func ("/mu-str/mu-str-display_contact",
|
||||||
test_mu_str_display_contact);
|
test_mu_str_display_contact);
|
||||||
|
|||||||
Reference in New Issue
Block a user