* improve support for non-latin languages (cyrillic etc.) (WIP)

- change the various escaping / normalization functions to better deal with
    non-ascii, non-latin languages, such as Russian.

    It seems we can now match 'Тесла' or 'Аркона' without problems.

  - added unit test.

  - WIP -- needs more testing.
This commit is contained in:
djcb
2012-04-16 01:10:46 +03:00
parent 557ce2839b
commit 0be852b288
7 changed files with 138 additions and 79 deletions

View File

@ -139,7 +139,7 @@ static const MuMsgField FIELD_DATA[] = {
MU_MSG_FIELD_ID_FILE, MU_MSG_FIELD_ID_FILE,
MU_MSG_FIELD_TYPE_STRING, MU_MSG_FIELD_TYPE_STRING,
"file" , 'j', 'J', "file" , 'j', 'J',
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_NORMALIZE | FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_ESCAPE |
FLAG_DONT_CACHE | FLAG_XAPIAN_PREFIX_ONLY FLAG_DONT_CACHE | FLAG_XAPIAN_PREFIX_ONLY
}, },
@ -164,7 +164,8 @@ static const MuMsgField FIELD_DATA[] = {
MU_MSG_FIELD_TYPE_STRING, MU_MSG_FIELD_TYPE_STRING,
"path", 'l', 'L', /* 'l' for location */ "path", 'l', 'L', /* 'l' for location */
FLAG_GMIME | FLAG_XAPIAN_VALUE | FLAG_GMIME | FLAG_XAPIAN_VALUE |
FLAG_XAPIAN_BOOLEAN | FLAG_XAPIAN_PREFIX_ONLY FLAG_XAPIAN_BOOLEAN | FLAG_XAPIAN_PREFIX_ONLY |
FLAG_XAPIAN_ESCAPE
}, },
{ {
@ -172,7 +173,7 @@ static const MuMsgField FIELD_DATA[] = {
MU_MSG_FIELD_TYPE_STRING, MU_MSG_FIELD_TYPE_STRING,
"maildir", 'm', 'M', "maildir", 'm', 'M',
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_VALUE | FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_VALUE |
FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY
}, },
@ -204,7 +205,7 @@ static const MuMsgField FIELD_DATA[] = {
MU_MSG_FIELD_TYPE_STRING, MU_MSG_FIELD_TYPE_STRING,
"subject", 's', 'S', "subject", 's', 'S',
FLAG_GMIME | FLAG_XAPIAN_INDEX | FLAG_XAPIAN_VALUE | FLAG_GMIME | FLAG_XAPIAN_INDEX | FLAG_XAPIAN_VALUE |
FLAG_XAPIAN_TERM | FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE FLAG_XAPIAN_TERM | FLAG_XAPIAN_ESCAPE
}, },
{ {
@ -234,7 +235,7 @@ static const MuMsgField FIELD_DATA[] = {
MU_MSG_FIELD_TYPE_STRING_LIST, MU_MSG_FIELD_TYPE_STRING_LIST,
"tag", 'x', 'X', "tag", 'x', 'X',
FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_PREFIX_ONLY | FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_PREFIX_ONLY |
FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE FLAG_XAPIAN_ESCAPE
}, },
{ /* special, internal field, to get a unique key */ { /* special, internal field, to get a unique key */

View File

@ -298,7 +298,7 @@ mu_query_preprocess (const char *query, GError **err)
cur->data = mu_str_normalize_in_place ((gchar*)cur->data, TRUE); cur->data = mu_str_normalize_in_place ((gchar*)cur->data, TRUE);
/* escape '@', single '_' and ':' if it's not following a /* escape '@', single '_' and ':' if it's not following a
* xapian-pfx with '_' */ * xapian-pfx with '_' */
cur->data = mu_str_ascii_xapian_escape_in_place cur->data = mu_str_xapian_escape_in_place
((gchar*)cur->data, TRUE /*escape spaces too*/); ((gchar*)cur->data, TRUE /*escape spaces too*/);
} }

View File

@ -324,7 +324,7 @@ add_terms_values_str (Xapian::Document& doc, char *val,
termgen.index_text_without_positions (val, 1, prefix(mfid)); termgen.index_text_without_positions (val, 1, prefix(mfid));
} }
if (mu_msg_field_xapian_escape (mfid)) if (mu_msg_field_xapian_escape (mfid))
mu_str_ascii_xapian_escape_in_place (val, mu_str_xapian_escape_in_place (val,
TRUE /*esc_space*/); TRUE /*esc_space*/);
if (mu_msg_field_xapian_term(mfid)) if (mu_msg_field_xapian_term(mfid))
doc.add_term (prefix(mfid) + doc.add_term (prefix(mfid) +
@ -476,7 +476,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
/* now, let's create a term... */ /* now, let's create a term... */
mu_str_normalize_in_place (val, TRUE); mu_str_normalize_in_place (val, TRUE);
mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/); mu_str_xapian_escape_in_place (val, TRUE /*esc space*/);
pdata->_doc.add_term pdata->_doc.add_term
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH)); (file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
@ -632,8 +632,8 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
if (!mu_str_is_empty(contact->address)) { if (!mu_str_is_empty(contact->address)) {
char *escaped; char *escaped;
escaped = mu_str_ascii_xapian_escape (contact->address, escaped = mu_str_xapian_escape (contact->address,
FALSE /*dont esc space*/); FALSE /*dont esc space*/);
msgdoc->_doc->add_term msgdoc->_doc->add_term
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH)); (std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
g_free (escaped); g_free (escaped);

View File

@ -1,20 +1,20 @@
/* /*
** Copyright (C) 2010 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl> ** Copyright (C) 2010 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
** **
** This program is free software; you can redistribute it and/or modify ** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by ** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 3 of the License, or ** the Free Software Foundation; either version 3 of the License, or
** (at your option) any later version. ** (at your option) any later version.
** **
** This program is distributed in the hope that it will be useful, ** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of ** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details. ** GNU General Public License for more details.
** **
** You should have received a copy of the GNU General Public License ** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation, ** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
** **
*/ */
#if HAVE_CONFIG_H #if HAVE_CONFIG_H
@ -38,6 +38,30 @@ mu_str_normalize (const char *str, gboolean downcase)
} }
/*
 * generic normalization; this implementation should work for _all_
 * locales. for now it only downcases (when @downcase is TRUE); there is
 * no accent-folding yet.
 *
 * the change is done in-place, so the result can never use more bytes
 * than the original string. downcasing may change the byte length of
 * UTF-8 text; if the downcased text needs more room than is available,
 * a warning is logged and the result is truncated on a UTF-8 character
 * boundary. if it needs less, the string simply becomes shorter.
 *
 * @param str a NUL-terminated UTF-8 string, modified in-place
 * @param downcase whether to downcase the string
 *
 * @return @str
 */
static char*
mu_str_normalize_in_place_generic (char *str, gboolean downcase)
{
	char	*norm;
	size_t	 len, normlen;

	/* FIXME: add accent-folding etc. */
	if (!downcase)
		return str;

	len  = strlen (str);
	norm = g_utf8_strdown (str, (gssize)len);
	if (!norm)
		return str;

	normlen = strlen (norm);
	if (normlen > len) {
		g_warning ("normalized text doesn't fit :/");
		/* norm[normlen] is the first byte that gets dropped; back
		 * up while that byte is a UTF-8 continuation byte
		 * (10xxxxxx), so we never keep half a character */
		normlen = len;
		while (normlen > 0 &&
		       ((guchar)norm[normlen] & 0xc0) == 0x80)
			--normlen;
	}

	/* copy only what the downcased string really contains, and
	 * terminate explicitly, since the result may be shorter than the
	 * original */
	memcpy (str, norm, normlen);
	str[normlen] = '\0';

	/* g_utf8_strdown returns a newly allocated string */
	g_free (norm);

	return str;
}
/* /*
* this implementation works for accented chars in Unicode Blocks * this implementation works for accented chars in Unicode Blocks
* 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower * 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower
@ -58,23 +82,24 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
{ {
const guchar *cur; const guchar *cur;
int i; int i;
g_return_val_if_fail (str, NULL); g_return_val_if_fail (str, NULL);
if (*str == '\0') if (*str == '\0')
return str; return str;
for (i = 0, cur = (const guchar*)str; *cur; ++cur) { for (i = 0, cur = (const guchar*)str; *cur; ++cur) {
if (G_LIKELY(*cur < 0xc3 || *cur > 0xc5)) { /* special case for plain-old ascii */
if ((*cur < 0x80)) {
str[i++] = downcase ? tolower (*cur) : *cur; str[i++] = downcase ? tolower (*cur) : *cur;
continue; continue;
} }
if (*cur == 0xc3) { /* latin-1 supplement */ if (*cur == 0xc3) { /* latin-1 supplement */
++cur; ++cur;
switch (*cur) { switch (*cur) {
case 0x80: case 0x80:
case 0x81: case 0x81:
case 0x82: case 0x82:
@ -82,93 +107,93 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0x84: case 0x84:
case 0x85: str[i++] = downcase ? 'a' : 'A' ; break; case 0x85: str[i++] = downcase ? 'a' : 'A' ; break;
case 0x86: case 0x86:
str[i++] = downcase ? 'a' : 'A' ; str[i++] = downcase ? 'a' : 'A' ;
str[i++] = 'e'; str[i++] = 'e';
break; break;
case 0x87: str[i++] = downcase ? 'c' : 'C'; break; case 0x87: str[i++] = downcase ? 'c' : 'C'; break;
case 0x88: case 0x88:
case 0x89: case 0x89:
case 0x8a: case 0x8a:
case 0x8b: case 0x8b:
str[i++] = downcase ? 'e' : 'E'; str[i++] = downcase ? 'e' : 'E';
break; break;
case 0x8c: case 0x8c:
case 0x8d: case 0x8d:
case 0x8e: case 0x8e:
case 0x8f: str[i++] = downcase ? 'i': 'I'; break; case 0x8f: str[i++] = downcase ? 'i': 'I'; break;
case 0x90: str[i++] = downcase ? 'd' : 'D'; break; case 0x90: str[i++] = downcase ? 'd' : 'D'; break;
case 0x91: str[i++] = downcase ? 'n' : 'N'; break; case 0x91: str[i++] = downcase ? 'n' : 'N'; break;
case 0x92: case 0x92:
case 0x93: case 0x93:
case 0x94: case 0x94:
case 0x95: case 0x95:
case 0x96: str[i++] = downcase ? 'o' : 'O'; break; case 0x96: str[i++] = downcase ? 'o' : 'O'; break;
case 0x99: case 0x99:
case 0x9a: case 0x9a:
case 0x9b: case 0x9b:
case 0x9c: str[i++] = downcase ? 'u' : 'U'; break; case 0x9c: str[i++] = downcase ? 'u' : 'U'; break;
case 0x9d: str[i++] = downcase ? 'y' : 'Y'; break; case 0x9d: str[i++] = downcase ? 'y' : 'Y'; break;
case 0x9e: case 0x9e:
str[i++] = downcase ? 't' : 'T'; str[i++] = downcase ? 't' : 'T';
str[i++] = 'h'; str[i++] = 'h';
break; break;
case 0x9f: str[i++] = 's'; str[i++] = 's'; break; case 0x9f: str[i++] = 's'; str[i++] = 's'; break;
case 0xa0: case 0xa0:
case 0xa1: case 0xa1:
case 0xa2: case 0xa2:
case 0xa3: case 0xa3:
case 0xa4: case 0xa4:
case 0xa5: str[i++] = 'a'; break; case 0xa5: str[i++] = 'a'; break;
case 0xa6: str[i++] = 'a'; str[i++] = 'e'; break; case 0xa6: str[i++] = 'a'; str[i++] = 'e'; break;
case 0xa7: str[i++] = 'c'; break; case 0xa7: str[i++] = 'c'; break;
case 0xa8: case 0xa8:
case 0xa9: case 0xa9:
case 0xaa: case 0xaa:
case 0xab: str[i++] = 'e'; break; case 0xab: str[i++] = 'e'; break;
case 0xac: case 0xac:
case 0xad: case 0xad:
case 0xae: case 0xae:
case 0xaf: str[i++] = 'i'; break; case 0xaf: str[i++] = 'i'; break;
case 0xb0: str[i++] = 'd'; break; case 0xb0: str[i++] = 'd'; break;
case 0xb1: str[i++] = 'n'; break; case 0xb1: str[i++] = 'n'; break;
case 0xb2: case 0xb2:
case 0xb3: case 0xb3:
case 0xb4: case 0xb4:
case 0xb5: case 0xb5:
case 0xb6: str[i++] = 'o'; break; case 0xb6: str[i++] = 'o'; break;
case 0xb9: case 0xb9:
case 0xba: case 0xba:
case 0xbb: case 0xbb:
case 0xbc: str[i++] = 'u'; break; case 0xbc: str[i++] = 'u'; break;
case 0xbd: str[i++] = 'y'; break; case 0xbd: str[i++] = 'y'; break;
case 0xbe: str[i++] = 't'; str[i++] = 'h'; break; case 0xbe: str[i++] = 't'; str[i++] = 'h'; break;
case 0xbf: str[i++] = 'y'; break; case 0xbf: str[i++] = 'y'; break;
default: default:
str[i++] = *cur; str[i++] = *cur;
} }
} else if (*cur == 0xc4) { /* Latin Extended-A (0x04) */ } else if (*cur == 0xc4) { /* Latin Extended-A (0x04) */
++cur; ++cur;
switch (*cur) { switch (*cur) {
case 0x80: case 0x80:
case 0x82: case 0x82:
case 0x84: str[i++] = downcase ? 'a' : 'A'; break; case 0x84: str[i++] = downcase ? 'a' : 'A'; break;
@ -194,18 +219,18 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0xa4: case 0xa4:
case 0xa6: str[i++] = downcase ? 'h' : 'H'; break; case 0xa6: str[i++] = downcase ? 'h' : 'H'; break;
case 0xa8: case 0xa8:
case 0xaa: case 0xaa:
case 0xac: case 0xac:
case 0xae: case 0xae:
case 0xb0: str[i++] = downcase ? 'i' : 'I'; break; case 0xb0: str[i++] = downcase ? 'i' : 'I'; break;
case 0xb2: case 0xb2:
str[i++] = downcase ? 'i' : 'I'; str[i++] = downcase ? 'i' : 'I';
str[i++] = downcase ? 'j' : 'J'; str[i++] = downcase ? 'j' : 'J';
break; break;
case 0xb4: str[i++] = downcase ? 'j' : 'J'; break; case 0xb4: str[i++] = downcase ? 'j' : 'J'; break;
@ -215,7 +240,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0xbb: case 0xbb:
case 0xbd: case 0xbd:
case 0xbf: str[i++] = downcase ? 'l': 'L'; break; case 0xbf: str[i++] = downcase ? 'l': 'L'; break;
case 0x81: case 0x81:
case 0x83: case 0x83:
case 0x85: str[i++] = 'a'; break; case 0x85: str[i++] = 'a'; break;
@ -260,14 +285,14 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0xbe: str[i++] = 'l'; break; case 0xbe: str[i++] = 'l'; break;
default: str[i++] = *cur; break; default: str[i++] = *cur; break;
} }
} else { /* Latin Extended-A (0xc5) */ } else if (*cur == 0xc5) { /* Latin Extended-A (0xc5) */
++cur; ++cur;
switch (*cur) { switch (*cur) {
case 0x81: str[i++] = downcase ? 'l': 'L'; break; case 0x81: str[i++] = downcase ? 'l': 'L'; break;
case 0x83: case 0x83:
case 0x85: case 0x85:
case 0x87: str[i++] = downcase ? 'n': 'N'; break; case 0x87: str[i++] = downcase ? 'n': 'N'; break;
@ -275,7 +300,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0x8c: case 0x8c:
case 0x8e: case 0x8e:
case 0x90: str[i++] = downcase ? 'o': 'O'; break; case 0x90: str[i++] = downcase ? 'o': 'O'; break;
case 0x92: case 0x92:
str[i++] = downcase ? 'o': 'O'; str[i++] = downcase ? 'o': 'O';
str[i++] = 'e'; str[i++] = 'e';
@ -298,7 +323,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0xaa: case 0xaa:
case 0xac: case 0xac:
case 0xae: case 0xae:
case 0xb0: case 0xb0:
case 0xb2: str[i++] = downcase ? 'u': 'U'; break; case 0xb2: str[i++] = downcase ? 'u': 'U'; break;
case 0xb4: str[i++] = downcase ? 'w': 'W'; break; case 0xb4: str[i++] = downcase ? 'w': 'W'; break;
@ -306,9 +331,9 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0xb8: str[i++] = downcase ? 'y': 'Y'; break; case 0xb8: str[i++] = downcase ? 'y': 'Y'; break;
case 0xb9: case 0xb9:
case 0xbb: case 0xbb:
case 0xbd: str[i++] = downcase ? 'z': 'Z'; break; case 0xbd: str[i++] = downcase ? 'z': 'Z'; break;
case 0x80: case 0x80:
case 0x82: str[i++] = 'l'; break; case 0x82: str[i++] = 'l'; break;
@ -342,25 +367,30 @@ mu_str_normalize_in_place (char *str, gboolean downcase)
case 0xab: case 0xab:
case 0xad: case 0xad:
case 0xaf: case 0xaf:
case 0xb1: case 0xb1:
case 0xb3: str[i++] = 'u'; break; case 0xb3: str[i++] = 'u'; break;
case 0xb5: str[i++] = 'w'; break; case 0xb5: str[i++] = 'w'; break;
case 0xb7: str[i++] = 'y'; break; case 0xb7: str[i++] = 'y'; break;
case 0xba: case 0xba:
case 0xbc: case 0xbc:
case 0xbe: str[i++] = 'z'; break; case 0xbe: str[i++] = 'z'; break;
case 0xbf: str[i++] = 's'; break; case 0xbf: str[i++] = 's'; break;
default: str[i++] = *cur; break; default: str[i++] = *cur; break;
} }
} else {
/* our fast-path for latin-utf8 does not work -- bummer!
* use something more generic (but a bit slower)
*/
return mu_str_normalize_in_place_generic (str, downcase);
} }
} }
str[i] = '\0'; str[i] = '\0';
return str; return str;
} }

View File

@ -423,9 +423,9 @@ check_for_field (const char *str, gboolean *is_field, gboolean *is_range_field)
* function expects search terms (not complete queries) * function expects search terms (not complete queries)
* */ * */
char* char*
mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space) mu_str_xapian_escape_in_place (char *term, gboolean esc_space)
{ {
gchar *cur; unsigned char *cur;
const char escchar = '_'; const char escchar = '_';
gboolean is_field, is_range_field; gboolean is_field, is_range_field;
unsigned colon; unsigned colon;
@ -434,13 +434,10 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
check_for_field (term, &is_field, &is_range_field); check_for_field (term, &is_field, &is_range_field);
for (colon = 0, cur = term; *cur; ++cur) { for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) {
*cur = tolower(*cur);
switch (*cur) { switch (*cur) {
*cur = escchar;
break;
case '.': /* escape '..' if it's not a range field*/ case '.': /* escape '..' if it's not a range field*/
if (is_range_field && cur[1] == '.') if (is_range_field && cur[1] == '.')
cur += 1; cur += 1;
@ -461,21 +458,24 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
case '*': /* wildcard */ case '*': /* wildcard */
break; break;
default: default:
if (!isalnum(*cur)) /* escape all other special stuff */
if (*cur < '0' || (*cur > '9' && *cur < 'A')
|| (*cur > 'Z' && *cur < 'a') ||
(*cur > 'z' && *cur < 0x80))
*cur = escchar; *cur = escchar;
} }
} }
return term; /* downcase try to remove accents etc. */
return mu_str_normalize_in_place (term, TRUE);
} }
char* char*
mu_str_ascii_xapian_escape (const char *query, gboolean esc_space) mu_str_xapian_escape (const char *query, gboolean esc_space)
{ {
g_return_val_if_fail (query, NULL); g_return_val_if_fail (query, NULL);
return mu_str_ascii_xapian_escape_in_place (g_strdup(query), esc_space); return mu_str_xapian_escape_in_place (g_strdup(query), esc_space);
} }

View File

@ -138,14 +138,12 @@ char* mu_str_normalize_in_place (char *str, gboolean downcase);
* changing is done in-place (by changing the argument string). in any * changing is done in-place (by changing the argument string). in any
* case, the string will be downcased. * case, the string will be downcased.
* *
* works for ascii strings, like e-mail addresses and message-id.
*
* @param query a query string * @param query a query string
* @param esc_space escape space characters as well * @param esc_space escape space characters as well
* *
* @return the escaped string or NULL in case of error * @return the escaped string or NULL in case of error
*/ */
char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space); char* mu_str_xapian_escape_in_place (char *query, gboolean esc_space);
/** /**
* escape the string for use with xapian matching. in practice, if the * escape the string for use with xapian matching. in practice, if the
@ -153,14 +151,12 @@ char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space);
* replace ':' with '_', if it's not following a xapian-prefix (such * replace ':' with '_', if it's not following a xapian-prefix (such
* as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]). * as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
* *
* works for ascii strings, like e-mail addresses and message-id.
*
* @param query a query string * @param query a query string
* @param esc_space escape space characters as well * @param esc_space escape space characters as well
* *
* @return the escaped string (free with g_free) or NULL in case of error * @return the escaped string (free with g_free) or NULL in case of error
*/ */
char* mu_str_ascii_xapian_escape (const char *query, gboolean esc_space) char* mu_str_xapian_escape (const char *query, gboolean esc_space)
G_GNUC_WARN_UNUSED_RESULT; G_GNUC_WARN_UNUSED_RESULT;

View File

@ -180,7 +180,7 @@ test_mu_str_esc_to_list (void)
} }
static void static void
test_mu_str_ascii_xapian_escape (void) test_mu_str_xapian_escape (void)
{ {
int i; int i;
struct { struct {
@ -204,7 +204,7 @@ test_mu_str_ascii_xapian_escape (void)
for (i = 0; i != G_N_ELEMENTS(words); ++i) { for (i = 0; i != G_N_ELEMENTS(words); ++i) {
gchar *a = g_strdup (words[i].word); gchar *a = g_strdup (words[i].word);
mu_str_ascii_xapian_escape_in_place (a, FALSE); mu_str_xapian_escape_in_place (a, FALSE);
if (g_test_verbose()) if (g_test_verbose())
g_print ("expected: '%s' <=> got: '%s'\n", g_print ("expected: '%s' <=> got: '%s'\n",
@ -216,6 +216,36 @@ test_mu_str_ascii_xapian_escape (void)
} }
static void
test_mu_str_xapian_escape_non_ascii (void)
{
int i;
struct {
const char* word;
const char* esc;
} words [] = {
{ "Тесла, Никола", "тесла__никола"},
{ "Masha@Аркона.ru", "masha_аркона_ru" },
{ "foo:ελληνικά", "foo_ελληνικά" },
{ "日本語!!", "日本語__" },
};
for (i = 0; i != G_N_ELEMENTS(words); ++i) {
gchar *a = g_strdup (words[i].word);
mu_str_xapian_escape_in_place (a, FALSE);
if (g_test_verbose())
g_print ("(%s) expected: '%s' <=> got: '%s'\n",
words[i].word, words[i].esc, a);
g_assert_cmpstr (a, ==, words[i].esc);
g_free (a);
}
}
static void static void
test_mu_str_display_contact (void) test_mu_str_display_contact (void)
{ {
@ -454,8 +484,10 @@ main (int argc, char *argv[])
g_test_add_func ("/mu-str/mu-str-normalize-02", g_test_add_func ("/mu-str/mu-str-normalize-02",
test_mu_str_normalize_02); test_mu_str_normalize_02);
g_test_add_func ("/mu-str/mu-str-ascii-xapian-escape", g_test_add_func ("/mu-str/mu-str-xapian-escape",
test_mu_str_ascii_xapian_escape); test_mu_str_xapian_escape);
g_test_add_func ("/mu-str/mu-str-xapian-escape-non-ascii",
test_mu_str_xapian_escape_non_ascii);
g_test_add_func ("/mu-str/mu-str-display_contact", g_test_add_func ("/mu-str/mu-str-display_contact",
test_mu_str_display_contact); test_mu_str_display_contact);