* improve support for non-Latin languages (Cyrillic etc.) (WIP)

- change the various escaping / normalization functions to better deal with
    non-ASCII, non-Latin languages, such as Russian.

    It seems we can now match 'Тесла' or 'Аркона' without problems.

  - added unit test.

  - WIP -- needs more testing.
This commit is contained in:
djcb
2012-04-16 01:10:46 +03:00
parent 557ce2839b
commit 0be852b288
7 changed files with 138 additions and 79 deletions

View File

@ -423,9 +423,9 @@ check_for_field (const char *str, gboolean *is_field, gboolean *is_range_field)
* function expects search terms (not complete queries)
* */
char*
mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
mu_str_xapian_escape_in_place (char *term, gboolean esc_space)
{
gchar *cur;
unsigned char *cur;
const char escchar = '_';
gboolean is_field, is_range_field;
unsigned colon;
@ -434,13 +434,10 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
check_for_field (term, &is_field, &is_range_field);
for (colon = 0, cur = term; *cur; ++cur) {
*cur = tolower(*cur);
for (colon = 0, cur = (unsigned char*)term; *cur; ++cur) {
switch (*cur) {
*cur = escchar;
break;
case '.': /* escape '..' if it's not a range field*/
if (is_range_field && cur[1] == '.')
cur += 1;
@ -461,21 +458,24 @@ mu_str_ascii_xapian_escape_in_place (char *term, gboolean esc_space)
case '*': /* wildcard */
break;
default:
if (!isalnum(*cur))
/* escape all other special stuff */
if (*cur < '0' || (*cur > '9' && *cur < 'A')
|| (*cur > 'Z' && *cur < 'a') ||
(*cur > 'z' && *cur < 0x80))
*cur = escchar;
}
}
return term;
/* downcase try to remove accents etc. */
return mu_str_normalize_in_place (term, TRUE);
}
char*
mu_str_ascii_xapian_escape (const char *query, gboolean esc_space)
mu_str_xapian_escape (const char *query, gboolean esc_space)
{
g_return_val_if_fail (query, NULL);
return mu_str_ascii_xapian_escape_in_place (g_strdup(query), esc_space);
return mu_str_xapian_escape_in_place (g_strdup(query), esc_space);
}