* improve support for non-Latin languages (Cyrillic etc.) (WIP)

- change the various escaping / normalization functions to better deal with
    non-ASCII, non-Latin languages, such as Russian.

    It seems we can now match 'Тесла' or 'Аркона' without problems.

  - added unit test.

  - WIP -- needs more testing.
This commit is contained in:
djcb
2012-04-16 01:10:46 +03:00
parent 557ce2839b
commit 0be852b288
7 changed files with 138 additions and 79 deletions

View File

@ -324,7 +324,7 @@ add_terms_values_str (Xapian::Document& doc, char *val,
termgen.index_text_without_positions (val, 1, prefix(mfid));
}
if (mu_msg_field_xapian_escape (mfid))
mu_str_ascii_xapian_escape_in_place (val,
mu_str_xapian_escape_in_place (val,
TRUE /*esc_space*/);
if (mu_msg_field_xapian_term(mfid))
doc.add_term (prefix(mfid) +
@ -476,7 +476,7 @@ each_part (MuMsg *msg, MuMsgPart *part, PartData *pdata)
/* now, let's create a term... */
mu_str_normalize_in_place (val, TRUE);
mu_str_ascii_xapian_escape_in_place (val, TRUE /*esc space*/);
mu_str_xapian_escape_in_place (val, TRUE /*esc space*/);
pdata->_doc.add_term
(file + std::string(val, 0, MuStore::MAX_TERM_LENGTH));
@ -632,8 +632,8 @@ each_contact_info (MuMsgContact *contact, MsgDoc *msgdoc)
if (!mu_str_is_empty(contact->address)) {
char *escaped;
escaped = mu_str_ascii_xapian_escape (contact->address,
FALSE /*dont esc space*/);
escaped = mu_str_xapian_escape (contact->address,
FALSE /*dont esc space*/);
msgdoc->_doc->add_term
(std::string (pfx + escaped, 0, MuStore::MAX_TERM_LENGTH));
g_free (escaped);