* improve support for non-latin languages (cyrillic etc.) (WIP)

- change the various escaping / normalization functions to better deal with non-ascii, non-latin languages, such as Russian. It seems we can now match 'Тесла' or 'Аркона' without problems. - added unit test. - WIP -- needs more testing.
2012-04-16 01:10:46 +03:00
parent 557ce2839b
commit 0be852b288
7 changed files with 138 additions and 79 deletions
--- a/src/mu-msg-fields.c
+++ b/src/mu-msg-fields.c
@ -139,7 +139,7 @@ static const MuMsgField FIELD_DATA[] = {
 		MU_MSG_FIELD_ID_FILE,
 		MU_MSG_FIELD_TYPE_STRING,
 		"file" , 'j', 'J',
-		FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_NORMALIZE |
+		FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_ESCAPE |
 		FLAG_DONT_CACHE | FLAG_XAPIAN_PREFIX_ONLY
 	},

@ -164,7 +164,8 @@ static const MuMsgField FIELD_DATA[] = {
 		MU_MSG_FIELD_TYPE_STRING,
 		"path", 'l', 'L',   /* 'l' for location */
 		FLAG_GMIME | FLAG_XAPIAN_VALUE |
-		FLAG_XAPIAN_BOOLEAN  | FLAG_XAPIAN_PREFIX_ONLY
+		FLAG_XAPIAN_BOOLEAN  | FLAG_XAPIAN_PREFIX_ONLY |
+		FLAG_XAPIAN_ESCAPE
 	},

 	{
@ -172,7 +173,7 @@ static const MuMsgField FIELD_DATA[] = {
 		MU_MSG_FIELD_TYPE_STRING,
 		"maildir", 'm', 'M',
 		FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_VALUE |
-		FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY
+		FLAG_XAPIAN_ESCAPE | FLAG_XAPIAN_PREFIX_ONLY
 	},


@ -204,7 +205,7 @@ static const MuMsgField FIELD_DATA[] = {
 		MU_MSG_FIELD_TYPE_STRING,
 		"subject", 's', 'S',
 		FLAG_GMIME | FLAG_XAPIAN_INDEX | FLAG_XAPIAN_VALUE |
-		FLAG_XAPIAN_TERM | FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE
+		FLAG_XAPIAN_TERM  | FLAG_XAPIAN_ESCAPE
 	},

 	{
@ -234,7 +235,7 @@ static const MuMsgField FIELD_DATA[] = {
 		MU_MSG_FIELD_TYPE_STRING_LIST,
 		"tag", 'x', 'X',
 		FLAG_GMIME | FLAG_XAPIAN_TERM | FLAG_XAPIAN_PREFIX_ONLY |
-		FLAG_NORMALIZE | FLAG_XAPIAN_ESCAPE
+		FLAG_XAPIAN_ESCAPE
 	},

 	{	/* special, internal field, to get a unique key */
--- a/src/mu-query.cc
+++ b/src/mu-query.cc
@ -298,7 +298,7 @@ mu_query_preprocess (const char *query, GError **err)
 		cur->data = mu_str_normalize_in_place ((gchar*)cur->data, TRUE);
 		/* escape '@', single '_' and ':' if it's not following a
 		 * xapian-pfx with '_' */
-		cur->data = mu_str_ascii_xapian_escape_in_place
+		cur->data = mu_str_xapian_escape_in_place
 			((gchar*)cur->data, TRUE /*escape spaces too*/);
 	}

--- a/src/mu-store-write.cc
+++ b/src/mu-store-write.cc
@ -324,7 +324,7 @@ add_terms_values_str (Xapian::Document& doc, char *val,
 		termgen.index_text_without_positions (val, 1, prefix(mfid));
 	}
 	if (mu_msg_field_xapian_escape (mfid))
-		mu_str_ascii_xapian_escape_in_place (val,
+		mu_str_xapian_escape_in_place (val,
 						     TRUE /*esc_space*/);
 	if (mu_msg_field_xapian_term(mfid))
 		doc.add_term (prefix(mfid) +
@ -476,7 +476,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)

 		/* now, let's create a term... */
 		mu_str_normalize_in_place (val, TRUE);
-		mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/);
+		mu_str_xapian_escape_in_place (val, TRUE /*esc space*/);

 		pdata->_doc.add_term
 			(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
@ -632,7 +632,7 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
 	if (!mu_str_is_empty(contact->address)) {

 		char *escaped;
-		escaped = mu_str_ascii_xapian_escape (contact->address,
+		escaped = mu_str_xapian_escape (contact->address,
 						FALSE /*dont esc space*/);
 		msgdoc->_doc->add_term
 			(std::string  (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
--- a/src/mu-str-normalize.c
+++ b/src/mu-str-normalize.c
@ -38,6 +38,30 @@ mu_str_normalize (const char *str, gboolean downcase)
 }


+/* generic fallback for all locales: only downcases (FIXME: accent-folding);
+ * str is left untouched when the downcased text would not fit in place. */
+static char*
+mu_str_normalize_in_place_generic (char *str, gboolean downcase)
+{
+	char *norm;
+	size_t len, normlen;
+
+	if (!str || !downcase)
+		return str;
+
+	len     = strlen (str);
+	norm    = g_utf8_strdown (str, len);
+	normlen = strlen (norm);
+
+	if (normlen > len) /* truncating could split a UTF-8 sequence */
+		g_warning ("normalized text doesn't fit :/");
+	else /* copy the terminating NUL too; norm may be shorter */
+		memcpy (str, norm, normlen + 1);
+	g_free (norm);
+	return str;
+}
+
+
 /*
 * this implementation works for accented chars in Unicode Blocks
 * 'Latin-1 Supplement' and 'Latin Extended-A'. An alternative (slower
@ -66,7 +90,8 @@ mu_str_normalize_in_place (char *str, gboolean downcase)

 	for (i = 0, cur = (const guchar*)str; *cur; ++cur) {

-		if (G_LIKELY(*cur < 0xc3 || *cur > 0xc5)) {
+		/* special case for plain-old ascii */
+		if ((*cur < 0x80)) {
 			str[i++] = downcase ? tolower (*cur) : *cur;
 			continue;
 		}
@ -263,7 +288,7 @@ mu_str_normalize_in_place (char *str, gboolean downcase)

 			}

-		} else { /* Latin Extended-A (0xc5) */
+		} else if (*cur == 0xc5) { /* Latin Extended-A (0xc5) */
 			++cur;
 			switch (*cur) {
 			case 0x81: str[i++] = downcase ? 'l': 'L'; break;
@ -357,6 +382,11 @@ mu_str_normalize_in_place (char *str, gboolean downcase)

 			default:   str[i++] = *cur; break;
 			}
+		} else {
+			/* our fast-path for latin-utf8 does not work -- bummer!
+			 * use something more generic (but a bit slower)
+			 */
+			return mu_str_normalize_in_place_generic (str, downcase);
 		}
 	}

--- a/src/mu-str.c
+++ b/src/mu-str.c
@ -423,9 +423,9 @@ check_for_field (const char *str, gboolean *is_field, gboolean *is_range_field)
 * function expects search terms (not complete queries)
 * */
 char*
-mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
+mu_str_xapian_escape_in_place (char *term, gboolean esc_space)
 {
-	gchar *cur;
+	unsigned char *cur;
 	const char escchar = '_';
 	gboolean is_field, is_range_field;
 	unsigned colon;
@ -434,13 +434,10 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)

 	check_for_field (term, &is_field, &is_range_field);

-	for (colon = 0, cur = term; *cur; ++cur) {
-
-		*cur = tolower(*cur);
+	for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) {

 		switch (*cur) {
-			*cur = escchar;
-			break;
+
 		case '.': /* escape '..' if it's not a range field*/
 			if (is_range_field && cur[1] == '.')
 				cur += 1;
@ -461,21 +458,24 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
 		case '*':   /* wildcard */
 			break;
 		default:
-			if (!isalnum(*cur))
+			/* escape all other special stuff */
+			if (*cur < '0' || (*cur > '9' && *cur < 'A')
+			    || (*cur > 'Z' && *cur < 'a') ||
+			    (*cur > 'z' && *cur < 0x80))
 				*cur = escchar;
 		}
-
 	}

-	return term;
+	/* downcase try to remove accents etc. */
+	return mu_str_normalize_in_place (term, TRUE);
 }

 char*
-mu_str_ascii_xapian_escape (const char *query, gboolean esc_space)
+mu_str_xapian_escape (const char *query, gboolean esc_space)
 {
 	g_return_val_if_fail (query, NULL);

-	return mu_str_ascii_xapian_escape_in_place (g_strdup(query), esc_space);
+	return mu_str_xapian_escape_in_place (g_strdup(query), esc_space);
 }


--- a/src/mu-str.h
+++ b/src/mu-str.h
@ -138,14 +138,12 @@ char* mu_str_normalize_in_place (char *str, gboolean downcase);
 * changing is done in-place (by changing the argument string). in any
 * case, the string will be downcased.
 *
- * works for ascii strings, like e-mail addresses and message-id.
- *
 * @param query a query string
 * @param esc_space escape space characters as well
 *
 * @return the escaped string or NULL in case of error
 */
-char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space);
+char* mu_str_xapian_escape_in_place (char *query, gboolean esc_space);

 /**
 * escape the string for use with xapian matching. in practice, if the
@ -153,14 +151,12 @@ char* mu_str_ascii_xapian_escape_in_place (char *query, gboolean esc_space);
 * replace ':' with '_', if it's not following a xapian-prefix (such
 * as 'subject:', 't:' etc, as defined in mu-msg-fields.[ch]).
 *
- * works for ascii strings, like e-mail addresses and message-id.
- *
 * @param query a query string
 * @param esc_space escape space characters as well
 *
 * @return the escaped string (free with g_free) or NULL in case of error
 */
-char* mu_str_ascii_xapian_escape (const char *query, gboolean esc_space)
+char* mu_str_xapian_escape (const char *query, gboolean esc_space)
        G_GNUC_WARN_UNUSED_RESULT;


--- a/src/tests/test-mu-str.c
+++ b/src/tests/test-mu-str.c
@ -180,7 +180,7 @@ test_mu_str_esc_to_list (void)
 }

 static void
-test_mu_str_ascii_xapian_escape (void)
+test_mu_str_xapian_escape (void)
 {
 	int			i;
 	struct {
@ -204,7 +204,7 @@ test_mu_str_ascii_xapian_escape (void)

 	for (i = 0; i != G_N_ELEMENTS(words); ++i) {
 		gchar *a = g_strdup (words[i].word);
-		mu_str_ascii_xapian_escape_in_place (a, FALSE);
+		mu_str_xapian_escape_in_place (a, FALSE);

 		if (g_test_verbose())
 			g_print ("expected: '%s' <=> got: '%s'\n",
@ -216,6 +216,36 @@ test_mu_str_ascii_xapian_escape (void)
 }


+/* escaping must keep non-ASCII (Cyrillic, Greek, CJK) text intact,
+ * while still downcasing it and replacing ASCII specials with '_' */
+static void
+test_mu_str_xapian_escape_non_ascii (void)
+{
+	static const struct {
+		const char *input;
+		const char *expected;
+	} cases [] = {
+		{ "Тесла, Никола", "тесла__никола"},
+		{ "Masha@Аркона.ru", "masha_аркона_ru" },
+		{ "foo:ελληνικά", "foo_ελληνικά" },
+		{ "日本語!!", "日本語__" },
+	};
+	guint n;
+
+	for (n = 0; n < G_N_ELEMENTS(cases); ++n) {
+		gchar *got = g_strdup (cases[n].input);
+		mu_str_xapian_escape_in_place (got, FALSE);
+		if (g_test_verbose())
+			g_print ("(%s) expected: '%s' <=> got: '%s'\n",
+				 cases[n].input, cases[n].expected, got);
+		g_assert_cmpstr (got, ==, cases[n].expected);
+		g_free (got);
+	}
+}
+
+
+
+
 static void
 test_mu_str_display_contact (void)
 {
@ -454,8 +484,10 @@ main (int argc, char *argv[])
 	g_test_add_func ("/mu-str/mu-str-normalize-02",
 			 test_mu_str_normalize_02);

-	g_test_add_func ("/mu-str/mu-str-ascii-xapian-escape",
-			 test_mu_str_ascii_xapian_escape);
+	g_test_add_func ("/mu-str/mu-str-xapian-escape",
+			 test_mu_str_xapian_escape);
+	g_test_add_func ("/mu-str/mu-str-xapian-escape-non-ascii",
+			 test_mu_str_xapian_escape_non_ascii);

 	g_test_add_func ("/mu-str/mu-str-display_contact",
 			 test_mu_str_display_contact);