diff --git a/src/mu-msg-fields.c b/src/mu-msg-fields.c index 867f4660..d6f8702f 100644 --- a/src/mu-msg-fields.c +++ b/src/mu-msg-fields.c @@ -139,7 +139,7 @@ static const MuMsgField FIELD_DATA[] = { MU_MSG_FIELD_ID_FILE, MU_MSG_FIELD_TYPE_STRING, "file" , 'j', 'J', - FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_NORMALIZE | + FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_ESCAPE | FLAG_DONT_CACHE | FLAG_XAPIAN_PREFIX_ONLY }, @@ -164,7 +164,8 @@ static const MuMsgField FIELD_DATA[] = { MU_MSG_FIELD_TYPE_STRING, "path", 'l', 'L', /* 'l' for location */ FLAG_GMIME | FLAG_XAPIAN_VALUE | - FLAG_XAPIAN_BOOLEAN | FLAG_XAPIAN_PREFIX_ONLY + FLAG_XAPIAN_BOOLEAN | FLAG_XAPIAN_PREFIX_ONLY | + FLAG_XAPIAN_ESCAPE }, { @@ -172,7 +173,7 @@ static const MuMsgField FIELD_DATA[] = { MU_MSG_FIELD_TYPE_STRING, "maildir", 'm', 'M', FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_VALUE | - FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY + FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY }, @@ -204,7 +205,7 @@ static const MuMsgField FIELD_DATA[] = { MU_MSG_FIELD_TYPE_STRING, "subject", 's', 'S', FLAG_GMIME | FLAG_XAPIAN_INDEX | FLAG_XAPIAN_VALUE | - FLAG_XAPIAN_TERM | FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE + FLAG_XAPIAN_TERM | FLAG_XAPIAN_ESCAPE }, { @@ -234,7 +235,7 @@ static const MuMsgField FIELD_DATA[] = { MU_MSG_FIELD_TYPE_STRING_LIST, "tag", 'x', 'X', FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_PREFIX_ONLY | - FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE + FLAG_XAPIAN_ESCAPE }, { /* special, internal field, to get a unique key */ diff --git a/src/mu-query.cc b/src/mu-query.cc index 2ff95f07..49ef9d25 100644 --- a/src/mu-query.cc +++ b/src/mu-query.cc @@ -298,7 +298,7 @@ mu_query_preprocess (const char *query, GError **err) cur->data = mu_str_normalize_in_place ((gchar*)cur->data, TRUE); /* escape '@', single '_' and ':' if it's not following a * xapian-pfx with '_' */ - cur->data = mu_str_ascii_xapian_escape_in_place + cur->data = mu_str_xapian_escape_in_place ((gchar*)cur->data, TRUE /*escape 
spaces too*/); } diff --git a/src/mu-store-write.cc b/src/mu-store-write.cc index 362f4c33..8ea98c4c 100644 --- a/src/mu-store-write.cc +++ b/src/mu-store-write.cc @@ -324,7 +324,7 @@ add_terms_values_str (Xapian::Document& doc, char *val, termgen.index_text_without_positions (val, 1, prefix(mfid)); } if (mu_msg_field_xapian_escape (mfid)) - mu_str_ascii_xapian_escape_in_place (val, + mu_str_xapian_escape_in_place (val, TRUE /*esc_space*/); if (mu_msg_field_xapian_term(mfid)) doc.add_term (prefix(mfid) + @@ -476,7 +476,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata) /* now, let's create a term... */ mu_str_normalize_in_place (val, TRUE); - mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/); + mu_str_xapian_escape_in_place (val, TRUE /*esc space*/); pdata->_doc.add_term (file + std::string(val, 0, MuStore::MAX_TERM_LENGTH)); @@ -632,8 +632,8 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc) if (!mu_str_is_empty(contact->address)) { char *escaped; - escaped = mu_str_ascii_xapian_escape (contact->address, - FALSE /*dont esc space*/); + escaped = mu_str_xapian_escape (contact->address, + FALSE /*dont esc space*/); msgdoc->_doc->add_term (std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH)); g_free (escaped); diff --git a/src/mu-str-normalize.c b/src/mu-str-normalize.c index a6d6a758..9407465b 100644 --- a/src/mu-str-normalize.c +++ b/src/mu-str-normalize.c @@ -1,20 +1,20 @@ -/* +/* ** Copyright (C) 2010 Dirk-Jan C. Binnema ** ** This program is free software; you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation; either version 3 of the License, or ** (at your option) any later version. -** +** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. 
-** +** ** You should have received a copy of the GNU General Public License ** along with this program; if not, write to the Free Software Foundation, -** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -** +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +** */ #if HAVE_CONFIG_H @@ -38,6 +38,30 @@ mu_str_normalize (const char *str, gboolean downcase) } +/* generic fallback that should work for _all_ locales; it only downcases in + * place (FIXME: add accent-folding etc.) and never writes past the original NUL. */ +static char* +mu_str_normalize_in_place_generic (char *str, gboolean downcase) +{ + if (downcase) { + char *norm; + size_t len, nlen; + + len = strlen (str); + norm = g_utf8_strdown (str, len); /* newly allocated, freed below */ + nlen = strlen (norm); + if (nlen > len) { /* does not fit: cut at a UTF-8 character boundary */ + g_warning ("normalized text doesn't fit :/"); + for (nlen = len; nlen > 0 && (norm[nlen] & 0xc0) == 0x80; --nlen); + } + memcpy (str, norm, nlen); /* never read past the end of norm */ + str[nlen] = '\0'; + g_free (norm); + } + return str; +} + + /* * this implementation works for accented chars in Unicode Blocks * 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower @@ -58,23 +82,24 @@ mu_str_normalize_in_place (char *str, gboolean downcase) { const guchar *cur; int i; - + g_return_val_if_fail (str, NULL); - + if (*str == '\0') return str; - + for (i = 0, cur = (const guchar*)str; *cur; ++cur) { - if (G_LIKELY(*cur < 0xc3 || *cur > 0xc5)) { + /* special case for plain-old ascii */ + if ((*cur < 0x80)) { str[i++] = downcase ? tolower (*cur) : *cur; - continue; + continue; } - + if (*cur == 0xc3) { /* latin-1 supplement */ ++cur; switch (*cur) { - + case 0x80: case 0x81: case 0x82: @@ -82,93 +107,93 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0x84: case 0x85: str[i++] = downcase ? 'a' : 'A' ; break; - case 0x86: + case 0x86: str[i++] = downcase ? 'a' : 'A' ; str[i++] = 'e'; break; - + case 0x87: str[i++] = downcase ? 'c' : 'C'; break; - + case 0x88: case 0x89: case 0x8a: case 0x8b: str[i++] = downcase ? 'e' : 'E'; break; - + case 0x8c: case 0x8d: case 0x8e: case 0x8f: str[i++] = downcase ? 'i': 'I'; break; - + case 0x90: str[i++] = downcase ?
'd' : 'D'; break; case 0x91: str[i++] = downcase ? 'n' : 'N'; break; - + case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: str[i++] = downcase ? 'o' : 'O'; break; - + case 0x99: case 0x9a: case 0x9b: case 0x9c: str[i++] = downcase ? 'u' : 'U'; break; - + case 0x9d: str[i++] = downcase ? 'y' : 'Y'; break; case 0x9e: str[i++] = downcase ? 't' : 'T'; str[i++] = 'h'; break; - + case 0x9f: str[i++] = 's'; str[i++] = 's'; break; - + case 0xa0: case 0xa1: case 0xa2: case 0xa3: case 0xa4: case 0xa5: str[i++] = 'a'; break; - + case 0xa6: str[i++] = 'a'; str[i++] = 'e'; break; case 0xa7: str[i++] = 'c'; break; - + case 0xa8: case 0xa9: case 0xaa: case 0xab: str[i++] = 'e'; break; - + case 0xac: case 0xad: case 0xae: case 0xaf: str[i++] = 'i'; break; - + case 0xb0: str[i++] = 'd'; break; case 0xb1: str[i++] = 'n'; break; - + case 0xb2: case 0xb3: case 0xb4: case 0xb5: case 0xb6: str[i++] = 'o'; break; - + case 0xb9: case 0xba: case 0xbb: case 0xbc: str[i++] = 'u'; break; - + case 0xbd: str[i++] = 'y'; break; case 0xbe: str[i++] = 't'; str[i++] = 'h'; break; case 0xbf: str[i++] = 'y'; break; - + default: str[i++] = *cur; } } else if (*cur == 0xc4) { /* Latin Extended-A (0x04) */ ++cur; - switch (*cur) { + switch (*cur) { case 0x80: case 0x82: case 0x84: str[i++] = downcase ? 'a' : 'A'; break; @@ -194,18 +219,18 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0xa4: case 0xa6: str[i++] = downcase ? 'h' : 'H'; break; - + case 0xa8: case 0xaa: case 0xac: case 0xae: case 0xb0: str[i++] = downcase ? 'i' : 'I'; break; - + case 0xb2: str[i++] = downcase ? 'i' : 'I'; str[i++] = downcase ? 'j' : 'J'; break; - + case 0xb4: str[i++] = downcase ? 'j' : 'J'; break; @@ -215,7 +240,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0xbb: case 0xbd: case 0xbf: str[i++] = downcase ? 
'l': 'L'; break; - + case 0x81: case 0x83: case 0x85: str[i++] = 'a'; break; @@ -260,14 +285,14 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0xbe: str[i++] = 'l'; break; default: str[i++] = *cur; break; - + } - } else { /* Latin Extended-A (0xc5) */ + } else if (*cur == 0xc5) { /* Latin Extended-A (0xc5) */ ++cur; switch (*cur) { case 0x81: str[i++] = downcase ? 'l': 'L'; break; - + case 0x83: case 0x85: case 0x87: str[i++] = downcase ? 'n': 'N'; break; @@ -275,7 +300,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0x8c: case 0x8e: case 0x90: str[i++] = downcase ? 'o': 'O'; break; - + case 0x92: str[i++] = downcase ? 'o': 'O'; str[i++] = 'e'; @@ -298,7 +323,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0xaa: case 0xac: case 0xae: - case 0xb0: + case 0xb0: case 0xb2: str[i++] = downcase ? 'u': 'U'; break; case 0xb4: str[i++] = downcase ? 'w': 'W'; break; @@ -306,9 +331,9 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0xb8: str[i++] = downcase ? 'y': 'Y'; break; case 0xb9: - case 0xbb: + case 0xbb: case 0xbd: str[i++] = downcase ? 'z': 'Z'; break; - + case 0x80: case 0x82: str[i++] = 'l'; break; @@ -342,25 +367,30 @@ mu_str_normalize_in_place (char *str, gboolean downcase) case 0xab: case 0xad: case 0xaf: - case 0xb1: + case 0xb1: case 0xb3: str[i++] = 'u'; break; - + case 0xb5: str[i++] = 'w'; break; case 0xb7: str[i++] = 'y'; break; case 0xba: - case 0xbc: + case 0xbc: case 0xbe: str[i++] = 'z'; break; case 0xbf: str[i++] = 's'; break; - + default: str[i++] = *cur; break; } + } else { + /* our fast-path for latin-utf8 does not work -- bummer! 
+ * use something more generic (but a bit slower) + */ + return mu_str_normalize_in_place_generic (str, downcase); } } str[i] = '\0'; - + return str; } diff --git a/src/mu-str.c b/src/mu-str.c index dff06eb4..b6f20448 100644 --- a/src/mu-str.c +++ b/src/mu-str.c @@ -423,9 +423,9 @@ check_for_field (const char *str, gboolean *is_field, gboolean *is_range_field) * function expects search terms (not complete queries) * */ char* -mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space) +mu_str_xapian_escape_in_place (char *term, gboolean esc_space) { - gchar *cur; + unsigned char *cur; const char escchar = '_'; gboolean is_field, is_range_field; unsigned colon; @@ -434,13 +434,10 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space) check_for_field (term, &is_field, &is_range_field); - for (colon = 0, cur = term; *cur; ++cur) { - - *cur = tolower(*cur); + for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) { switch (*cur) { - *cur = escchar; - break; + case '.': /* escape '..' if it's not a range field*/ if (is_range_field && cur[1] == '.') cur += 1; @@ -461,21 +458,24 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space) case '*': /* wildcard */ break; default: - if (!isalnum(*cur)) + /* escape all other special stuff */ + if (*cur < '0' || (*cur > '9' && *cur < 'A') + || (*cur > 'Z' && *cur < 'a') || + (*cur > 'z' && *cur < 0x80)) *cur = escchar; } - } - return term; + /* downcase try to remove accents etc. 
*/ + return mu_str_normalize_in_place (term, TRUE); } char* -mu_str_ascii_xapian_escape (const char *query, gboolean esc_space) +mu_str_xapian_escape (const char *query, gboolean esc_space) { g_return_val_if_fail (query, NULL); - return mu_str_ascii_xapian_escape_in_place (g_strdup(query), esc_space); + return mu_str_xapian_escape_in_place (g_strdup(query), esc_space); } diff --git a/src/mu-str.h b/src/mu-str.h index ad9e7dcb..6cae11bd 100644 --- a/src/mu-str.h +++ b/src/mu-str.h @@ -138,14 +138,12 @@ char* mu_str_normalize_in_place (char *str, gboolean downcase); * changing is done in-place (by changing the argument string). in any * case, the string will be downcased. * - * works for ascii strings, like e-mail addresses and message-id. - * * @param query a query string * @param esc_space escape space characters as well * * @return the escaped string or NULL in case of error */ -char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space); +char* mu_str_xapian_escape_in_place (char *query, gboolean esc_space); /** * escape the string for use with xapian matching. in practice, if the @@ -153,14 +151,12 @@ char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space); * replace ':' with '_', if it's not following a xapian-prefix (such * as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]). * - * works for ascii strings, like e-mail addresses and message-id. 
- * * @param query a query string * @param esc_space escape space characters as well * * @return the escaped string (free with g_free) or NULL in case of error */ -char* mu_str_ascii_xapian_escape (const char *query, gboolean esc_space) +char* mu_str_xapian_escape (const char *query, gboolean esc_space) G_GNUC_WARN_UNUSED_RESULT; diff --git a/src/tests/test-mu-str.c b/src/tests/test-mu-str.c index 623dd425..5409641c 100644 --- a/src/tests/test-mu-str.c +++ b/src/tests/test-mu-str.c @@ -180,7 +180,7 @@ test_mu_str_esc_to_list (void) } static void -test_mu_str_ascii_xapian_escape (void) +test_mu_str_xapian_escape (void) { int i; struct { @@ -204,7 +204,7 @@ test_mu_str_ascii_xapian_escape (void) for (i = 0; i != G_N_ELEMENTS(words); ++i) { gchar *a = g_strdup (words[i].word); - mu_str_ascii_xapian_escape_in_place (a, FALSE); + mu_str_xapian_escape_in_place (a, FALSE); if (g_test_verbose()) g_print ("expected: '%s' <=> got: '%s'\n", @@ -216,6 +216,36 @@ test_mu_str_ascii_xapian_escape (void) } +static void +test_mu_str_xapian_escape_non_ascii (void) +{ + int i; + struct { + const char* word; + const char* esc; + } words [] = { + { "Тесла, Никола", "тесла__никола"}, + { "Masha@Аркона.ru", "masha_аркона_ru" }, + { "foo:ελληνικά", "foo_ελληνικά" }, + { "日本語!!", "日本語__" }, + }; + + for (i = 0; i != G_N_ELEMENTS(words); ++i) { + gchar *a = g_strdup (words[i].word); + mu_str_xapian_escape_in_place (a, FALSE); + + if (g_test_verbose()) + g_print ("(%s) expected: '%s' <=> got: '%s'\n", + words[i].word, words[i].esc, a); + + g_assert_cmpstr (a, ==, words[i].esc); + g_free (a); + } +} + + + + static void test_mu_str_display_contact (void) { @@ -454,8 +484,10 @@ main (int argc, char *argv[]) g_test_add_func ("/mu-str/mu-str-normalize-02", test_mu_str_normalize_02); - g_test_add_func ("/mu-str/mu-str-ascii-xapian-escape", - test_mu_str_ascii_xapian_escape); + g_test_add_func ("/mu-str/mu-str-xapian-escape", + test_mu_str_xapian_escape); + g_test_add_func 
("/mu-str/mu-str-xapian-escape-non-ascii", + test_mu_str_xapian_escape_non_ascii); g_test_add_func ("/mu-str/mu-str-display_contact", test_mu_str_display_contact);