support xapian ngrams

Xapian supports an "ngrams" option to help with languages/scripts
without explicit word breaks, such as Chinese, Japanese and Korean.

Add some plumbing to support this in mu as well. This is experimental
for now.
This commit is contained in:
Dirk-Jan C. Binnema
2023-09-09 11:57:05 +03:00
parent f6122ecc9e
commit 264bb092f0
20 changed files with 207 additions and 81 deletions

View File

@ -202,6 +202,8 @@ topic_store(const Mu::Store& store, const Options& opts)
info.add_row({"ignored-address", c});
info.add_row({"messages in store", mu_format("{}", store.size())});
info.add_row({"support-ngrams", conf.get<Config::Id::SupportNgrams>() ? "yes" : "no"});
info.add_row({"last-change", tstamp(store.statistics().last_change)});
info.add_row({"last-index", tstamp(store.statistics().last_index)});

View File

@ -55,6 +55,8 @@ Mu::mu_cmd_init(const Options& opts)
conf.set<Config::Id::PersonalAddresses>(opts.init.my_addresses);
if (!opts.init.ignored_addresses.empty())
conf.set<Config::Id::IgnoredAddresses>(opts.init.ignored_addresses);
if (opts.init.support_ngrams)
conf.set<Config::Id::SupportNgrams>(true);
return Store::make_new(opts.runtime_path(RuntimePath::XapianDb),
opts.init.maildir, conf);

View File

@ -457,13 +457,16 @@ sub_init(CLI::App& sub, Options& opts)
"Maximum allowed message size in bytes");
sub.add_option("--batch-size", opts.init.batch_size,
"Maximum size of database transaction");
sub.add_option("--support-ngrams", opts.init.support_ngrams,
"Support CJK n-grams if for querying/indexing");
sub.add_flag("--reinit", opts.init.reinit,
"Re-initialize database with current settings")
->excludes("--maildir")
->excludes("--my-address")
->excludes("--ignored-address")
->excludes("--max-message-size")
->excludes("--batch-size");
->excludes("--batch-size")
->excludes("--support-ngrams");
}
static void

View File

@ -185,13 +185,15 @@ struct Options {
* Init
*/
struct Init {
std::string maildir; /**< where the mails are */
StringVec my_addresses; /**< personal e-mail addresses */
StringVec ignored_addresses; /**< addresses to be ignored for
std::string maildir; /**< where the mails are */
StringVec my_addresses; /**< personal e-mail addresses */
StringVec ignored_addresses; /**< addresses to be ignored for
* the contacts-cache */
OptSize max_msg_size; /**< max size for message files */
OptSize batch_size; /**< db transaction batch size */
bool reinit; /**< re-initialize */
OptSize max_msg_size; /**< max size for message files */
OptSize batch_size; /**< db transaction batch size */
bool reinit; /**< re-initialize */
bool support_ngrams; /**< support CJK etc. ngrams */
} init;
/*

View File

@ -90,6 +90,14 @@ handle_result(const Result<void>& res, const Mu::Options& opts)
int
main(int argc, char* argv[])
{
/*
* We handle this through explicit options
*/
g_unsetenv("XAPIAN_CJK_NGRAM");
/*
* set up locale
*/
setlocale(LC_ALL, "");
/*