* fix bug normalizing mixed (e.g. Latin and Cyrillic) UTF-8 text
This commit is contained in:
@ -41,40 +41,10 @@ mu_str_normalize (const char *str, gboolean downcase, GStringChunk *strchunk)
|
|||||||
else
|
else
|
||||||
mystr = g_strdup (str);
|
mystr = g_strdup (str);
|
||||||
|
|
||||||
return mu_str_normalize_in_place_try (mystr, downcase, strchunk);
|
return mu_str_normalize_in_place (mystr, downcase, strchunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* this implementation should work for _all_ locales. */
|
|
||||||
static char*
|
|
||||||
mu_str_normalize_in_place_generic (char *str, gboolean downcase, GStringChunk *strchunk)
|
|
||||||
{
|
|
||||||
|
|
||||||
char *norm;
|
|
||||||
size_t len;
|
|
||||||
|
|
||||||
/* FIXME: add accent-folding etc. */
|
|
||||||
if (!downcase)
|
|
||||||
return str; /* nothing to do */
|
|
||||||
|
|
||||||
len = strlen (str);
|
|
||||||
norm = g_utf8_strdown (str, len);
|
|
||||||
|
|
||||||
|
|
||||||
if (strlen (norm) > len) {
|
|
||||||
/* this case is rare, but does happen */
|
|
||||||
char *copy;
|
|
||||||
if (!strchunk)
|
|
||||||
return norm;
|
|
||||||
copy = g_string_chunk_insert (strchunk, norm);
|
|
||||||
g_free (norm);
|
|
||||||
return copy;
|
|
||||||
}
|
|
||||||
|
|
||||||
memcpy (str, norm, len);
|
|
||||||
return str;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* this implementation works for accented chars in Unicode Blocks
|
* this implementation works for accented chars in Unicode Blocks
|
||||||
@ -94,7 +64,7 @@ mu_str_normalize_in_place_generic (char *str, gboolean downcase, GStringChunk *s
|
|||||||
* note-to-self: http://www.geertvanderploeg.com/unicode-gen/
|
* note-to-self: http://www.geertvanderploeg.com/unicode-gen/
|
||||||
*/
|
*/
|
||||||
char*
|
char*
|
||||||
mu_str_normalize_in_place_try (char *str, gboolean downcase, GStringChunk *strchunk)
|
mu_str_normalize_in_place (char *str, gboolean downcase, GStringChunk *strchunk)
|
||||||
{
|
{
|
||||||
const guchar *cur;
|
const guchar *cur;
|
||||||
int i;
|
int i;
|
||||||
@ -398,14 +368,34 @@ mu_str_normalize_in_place_try (char *str, gboolean downcase, GStringChunk *strch
|
|||||||
|
|
||||||
default: str[i++] = *cur; break;
|
default: str[i++] = *cur; break;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
/* our fast-path for latin-utf8 does not work -- bummer!
|
/* our fast-path for latin-utf8 does not work
|
||||||
* use something more generic (but a bit slower)
|
* -- bummer! just append the character then
|
||||||
*/
|
* */
|
||||||
return mu_str_normalize_in_place_generic (str, downcase, strchunk);
|
gunichar uc;
|
||||||
|
char buf[7];
|
||||||
|
size_t len1, len2;
|
||||||
|
|
||||||
|
len1 = g_utf8_next_char ((char*)cur) - (char*)cur;
|
||||||
|
uc = g_utf8_get_char ((char*)cur);
|
||||||
|
|
||||||
|
if (downcase)
|
||||||
|
uc = g_unichar_tolower (uc);
|
||||||
|
|
||||||
|
len2 = g_unichar_to_utf8 (uc, buf);
|
||||||
|
|
||||||
|
/* if the new char fits where the old char was,
|
||||||
|
* change it. otherwise, don't bother. */
|
||||||
|
|
||||||
|
if (len1 == len2) {
|
||||||
|
memcpy (str + i, buf, len2);
|
||||||
|
i += len2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
str[i] = '\0';
|
str[i] = '\0';
|
||||||
|
|
||||||
return str;
|
return str;
|
||||||
|
|||||||
@ -501,7 +501,7 @@ mu_str_xapian_escape_in_place_try (char *term, gboolean esc_space, GStringChunk
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* downcase try to remove accents etc. */
|
/* downcase try to remove accents etc. */
|
||||||
return mu_str_normalize_in_place_try (term, TRUE, strchunk);
|
return mu_str_normalize_in_place (term, TRUE, strchunk);
|
||||||
}
|
}
|
||||||
|
|
||||||
char*
|
char*
|
||||||
|
|||||||
@ -137,7 +137,7 @@ char* mu_str_normalize (const char *str, gboolean downcase,
|
|||||||
* NULL. User only needs to free the returned string if a) return
|
* NULL. User only needs to free the returned string if a) return
|
||||||
* value != str and b) strchunk was not provided.
|
* value != str and b) strchunk was not provided.
|
||||||
*/
|
*/
|
||||||
char* mu_str_normalize_in_place_try (char *str, gboolean downcase,
|
char* mu_str_normalize_in_place (char *str, gboolean downcase,
|
||||||
GStringChunk *strchunk);
|
GStringChunk *strchunk);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user