support xapian ngrams
Xapian supports an "ngrams" option to help with languages/scripts without explicit wordbreaks, such as Chinese / Japanese / Korean. Add some plumbing for supporting this in mu as well. Experimental for now.
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
## Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
## Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
##
|
||||
## This program is free software; you can redistribute it and/or modify
|
||||
## it under the terms of the GNU General Public License as published by
|
||||
@ -38,7 +38,7 @@ lib_mu_message=static_library(
|
||||
|
||||
lib_mu_message_dep = declare_dependency(
|
||||
link_with: lib_mu_message,
|
||||
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep ],
|
||||
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep, config_h_dep ],
|
||||
include_directories:
|
||||
include_directories(['.', '..']))
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify it
|
||||
** under the terms of the GNU General Public License as published by the
|
||||
@ -16,6 +16,7 @@
|
||||
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
**
|
||||
*/
|
||||
#include "config.h"
|
||||
|
||||
#include "mu-document.hh"
|
||||
#include "mu-message.hh"
|
||||
@ -31,9 +32,14 @@
|
||||
#include <string>
|
||||
#include <utils/mu-utils.hh>
|
||||
|
||||
|
||||
using namespace Mu;
|
||||
|
||||
// backward compat: when the build configuration does not define
// HAVE_XAPIAN_FLAG_NGRAMS, map the name FLAG_NGRAMS onto FLAG_CJK_NGRAM, so
// the code below can refer to Xapian::TermGenerator::FLAG_NGRAMS
// unconditionally.
// NOTE(review): presumably FLAG_CJK_NGRAM is the name used by Xapian versions
// that predate FLAG_NGRAMS -- confirm against the Xapian release notes.
|
||||
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
|
||||
#define FLAG_NGRAMS FLAG_CJK_NGRAM
|
||||
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
|
||||
|
||||
|
||||
const Xapian::Document&
|
||||
Document::xapian_document() const
|
||||
{
|
||||
@ -58,16 +64,29 @@ Document::put_prop(const Field& field, SexpType&& val)
|
||||
std::forward<SexpType>(val));
|
||||
}
|
||||
|
||||
/**
 * Create a Xapian term-generator that is bound to a document.
 *
 * @param doc the Xapian document the generator is attached to
 * @param opts document options; when Document::Options::SupportNgrams is
 * set, the generator's flags are set to FLAG_NGRAMS
 *
 * @return the configured term-generator
 */
static Xapian::TermGenerator
make_term_generator(Xapian::Document& doc, Document::Options opts)
{
	const bool use_ngrams{any_of(opts & Document::Options::SupportNgrams)};

	Xapian::TermGenerator generator;
	if (use_ngrams) {
		generator.set_flags(Xapian::TermGenerator::FLAG_NGRAMS);
	}
	generator.set_document(doc);

	return generator;
}
|
||||
|
||||
static void
|
||||
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val)
|
||||
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val,
|
||||
Document::Options opts)
|
||||
{
|
||||
if (field.is_normal_term()) {
|
||||
doc.add_term(field.xapian_term(val));
|
||||
} else if (field.is_boolean_term()) {
|
||||
doc.add_boolean_term(field.xapian_term(val));
|
||||
} else if (field.is_indexable_term()) {
|
||||
Xapian::TermGenerator termgen;
|
||||
termgen.set_document(doc);
|
||||
auto&& termgen{make_term_generator(doc, opts)};
|
||||
termgen.index_text(utf8_flatten(val), 1, field.xapian_term());
|
||||
/* also add as 'normal' term, so some queries where the indexer
|
||||
* eats special chars also match */
|
||||
@ -88,7 +107,7 @@ Document::add(Field::Id id, const std::string& val)
|
||||
xdoc_.add_value(field.value_no(), val);
|
||||
|
||||
if (field.is_searchable())
|
||||
add_search_term(xdoc_, field, val);
|
||||
add_search_term(xdoc_, field, val, options_);
|
||||
|
||||
if (field.include_in_sexp())
|
||||
put_prop(field, val);
|
||||
@ -107,7 +126,7 @@ Document::add(Field::Id id, const std::vector<std::string>& vals)
|
||||
if (field.is_searchable())
|
||||
std::for_each(vals.begin(), vals.end(),
|
||||
[&](const auto& val) {
|
||||
add_search_term(xdoc_, field, val); });
|
||||
add_search_term(xdoc_, field, val, options_); });
|
||||
|
||||
if (field.include_in_sexp()) {
|
||||
Sexp elms{};
|
||||
@ -149,9 +168,7 @@ Document::add(Field::Id id, const Contacts& contacts)
|
||||
std::vector<std::string> cvec;
|
||||
|
||||
const std::string sepa2(1, SepaChar2);
|
||||
|
||||
Xapian::TermGenerator termgen;
|
||||
termgen.set_document(xdoc_);
|
||||
auto&& termgen{make_term_generator(xdoc_, options_)};
|
||||
|
||||
for (auto&& contact: contacts) {
|
||||
|
||||
|
||||
@ -41,17 +41,27 @@ namespace Mu {
|
||||
*/
|
||||
class Document {
|
||||
public:
|
||||
/**
 * Behavioral options for a Document; the values are distinct bits.
 */
enum struct Options {
	None          = 0,      /**< No options set. */
	SupportNgrams = 1 << 0, /**< Support ngrams, as used in CJK and
				 * other languages. */
};
|
||||
|
||||
/**
|
||||
* Construct a message for a new Xapian Document
|
||||
*
|
||||
* @param opts behavioral options
|
||||
*/
|
||||
Document() {}
|
||||
Document(Options opts = Options::None): options_{opts} {}
|
||||
|
||||
/**
|
||||
* Construct a message document based on an existing Xapian document.
|
||||
*
|
||||
* @param doc
|
||||
* @param opts behavioral options
|
||||
*/
|
||||
Document(const Xapian::Document& doc): xdoc_{doc} {}
|
||||
Document(const Xapian::Document& doc, Options opts = Options::None):
|
||||
xdoc_{doc}, options_{opts} {}
|
||||
|
||||
/**
|
||||
* DTOR
|
||||
@ -240,11 +250,12 @@ private:
|
||||
return cached_sexp_;
|
||||
}
|
||||
|
||||
|
||||
mutable Xapian::Document xdoc_;
|
||||
Options options_;
|
||||
mutable Sexp cached_sexp_;
|
||||
mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */
|
||||
};
|
||||
MU_ENABLE_BITOPS(Document::Options);
|
||||
|
||||
} // namespace Mu
|
||||
|
||||
|
||||
@ -45,9 +45,10 @@
|
||||
using namespace Mu;
|
||||
|
||||
struct Message::Private {
|
||||
Private(Message::Options options): opts{options} {}
|
||||
Private(Message::Options options):
|
||||
opts{options}, doc{doc_opts(opts)} {}
|
||||
Private(Message::Options options, Xapian::Document&& xdoc):
|
||||
opts{options}, doc{std::move(xdoc)} {}
|
||||
opts{options}, doc{std::move(xdoc), doc_opts(opts)} {}
|
||||
|
||||
Message::Options opts;
|
||||
Document doc;
|
||||
@ -70,6 +71,13 @@ struct Message::Private {
|
||||
Option<std::string> embedded;
|
||||
|
||||
Option<std::string> language; /* body ISO language code */
|
||||
|
||||
private:
|
||||
/**
 * Translate message options into the corresponding document options.
 *
 * The result is computed from the argument only (not from the opts
 * member), so it does not depend on which members have already been
 * initialized when this is called from a constructor's
 * member-initializer list.
 *
 * @param mopts message options
 *
 * @return Document::Options::SupportNgrams when mopts includes
 * Message::Options::SupportNgrams; Document::Options::None otherwise.
 */
static Document::Options doc_opts(Message::Options mopts) {
	return any_of(mopts & Message::Options::SupportNgrams) ?
		Document::Options::SupportNgrams :
		Document::Options::None;
}
|
||||
};
|
||||
|
||||
|
||||
@ -176,6 +184,11 @@ Message::document() const
|
||||
return priv_->doc;
|
||||
}
|
||||
|
||||
/* Accessor: the options stored in this message's private data. */
Message::Options
Message::options() const
{
	const auto& self{*priv_};

	return self.opts;
}
|
||||
|
||||
unsigned
|
||||
Message::docid() const
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/*
|
||||
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This program is free software; you can redistribute it and/or modify it
|
||||
** under the terms of the GNU General Public License as published by the
|
||||
@ -49,8 +49,10 @@ public:
|
||||
Decrypt = 1 << 0, /**< Attempt to decrypt */
|
||||
RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network
|
||||
* access) */
|
||||
AllowRelativePath = 1 << 2, /**< Allow relateive paths for filename
|
||||
AllowRelativePath = 1 << 2, /**< Allow relative paths for filename
|
||||
* in make_from_path */
|
||||
SupportNgrams = 1 << 3, /**< Support ngrams, as used in
|
||||
* CJK and other languages. */
|
||||
};
|
||||
|
||||
/**
|
||||
@ -60,7 +62,6 @@ public:
|
||||
*/
|
||||
Message(Message&& other) noexcept;
|
||||
|
||||
|
||||
/**
|
||||
* operator=
|
||||
*
|
||||
@ -147,6 +148,14 @@ public:
|
||||
const Document& document() const;
|
||||
|
||||
|
||||
/**
|
||||
* The message options for this message
|
||||
*
|
||||
* @return message options
|
||||
*/
|
||||
Options options() const;
|
||||
|
||||
|
||||
/**
|
||||
* Get the document-id, or 0 if non-existent.
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user