support xapian ngrams

Xapian supports an "ngrams" option to help with languages/scripts
without explicit wordbreaks, such as Chinese / Japanese / Korean.

Add some plumbing for supporting this in mu as well. Experimental for
now.
This commit is contained in:
Dirk-Jan C. Binnema
2023-09-09 11:57:05 +03:00
parent f6122ecc9e
commit 264bb092f0
20 changed files with 207 additions and 81 deletions

View File

@ -1,4 +1,4 @@
## Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
## Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@ -38,7 +38,7 @@ lib_mu_message=static_library(
lib_mu_message_dep = declare_dependency(
link_with: lib_mu_message,
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep ],
dependencies: [ xapian_dep, gmime_dep, lib_mu_utils_dep, config_h_dep ],
include_directories:
include_directories(['.', '..']))

View File

@ -1,5 +1,5 @@
/*
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
@ -16,6 +16,7 @@
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "config.h"
#include "mu-document.hh"
#include "mu-message.hh"
@ -31,9 +32,14 @@
#include <string>
#include <utils/mu-utils.hh>
using namespace Mu;
// Backward compatibility: when the build did not detect Xapian's FLAG_NGRAMS
// (HAVE_XAPIAN_FLAG_NGRAMS is not defined), map that name onto
// FLAG_CJK_NGRAM (presumably the older Xapian spelling of the same flag --
// confirm against the Xapian documentation).
// NOTE: this is a plain macro, so every FLAG_NGRAMS token that follows in
// this file is rewritten, including Xapian::TermGenerator::FLAG_NGRAMS.
#ifndef HAVE_XAPIAN_FLAG_NGRAMS
#define FLAG_NGRAMS FLAG_CJK_NGRAM
#endif /*HAVE_XAPIAN_FLAG_NGRAMS*/
const Xapian::Document&
Document::xapian_document() const
{
@ -58,16 +64,29 @@ Document::put_prop(const Field& field, SexpType&& val)
std::forward<SexpType>(val));
}
/**
 * Create a term generator that is bound to a Xapian document.
 *
 * @param doc the document the generator attaches to
 * @param opts document options; when SupportNgrams is set, the generator
 * gets the ngrams flag before it is attached to the document.
 *
 * @return the configured term generator
 */
static Xapian::TermGenerator
make_term_generator(Xapian::Document& doc, Document::Options opts)
{
	const bool want_ngrams{any_of(opts & Document::Options::SupportNgrams)};

	Xapian::TermGenerator generator;
	if (want_ngrams) {
		generator.set_flags(Xapian::TermGenerator::FLAG_NGRAMS);
	}
	generator.set_document(doc);

	return generator;
}
static void
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val)
add_search_term(Xapian::Document& doc, const Field& field, const std::string& val,
Document::Options opts)
{
if (field.is_normal_term()) {
doc.add_term(field.xapian_term(val));
} else if (field.is_boolean_term()) {
doc.add_boolean_term(field.xapian_term(val));
} else if (field.is_indexable_term()) {
Xapian::TermGenerator termgen;
termgen.set_document(doc);
auto&& termgen{make_term_generator(doc, opts)};
termgen.index_text(utf8_flatten(val), 1, field.xapian_term());
/* also add as 'normal' term, so some queries where the indexer
* eats special chars also match */
@ -88,7 +107,7 @@ Document::add(Field::Id id, const std::string& val)
xdoc_.add_value(field.value_no(), val);
if (field.is_searchable())
add_search_term(xdoc_, field, val);
add_search_term(xdoc_, field, val, options_);
if (field.include_in_sexp())
put_prop(field, val);
@ -107,7 +126,7 @@ Document::add(Field::Id id, const std::vector<std::string>& vals)
if (field.is_searchable())
std::for_each(vals.begin(), vals.end(),
[&](const auto& val) {
add_search_term(xdoc_, field, val); });
add_search_term(xdoc_, field, val, options_); });
if (field.include_in_sexp()) {
Sexp elms{};
@ -149,9 +168,7 @@ Document::add(Field::Id id, const Contacts& contacts)
std::vector<std::string> cvec;
const std::string sepa2(1, SepaChar2);
Xapian::TermGenerator termgen;
termgen.set_document(xdoc_);
auto&& termgen{make_term_generator(xdoc_, options_)};
for (auto&& contact: contacts) {

View File

@ -41,17 +41,27 @@ namespace Mu {
*/
class Document {
public:
/**
 * Behavioral options for a Document. Each enumerator occupies its own
 * bit; None means no option is set.
 */
enum struct Options {
None = 0,
SupportNgrams = 1 << 0, /**< Support ngrams, as used in
* CJK and other languages. */
};
/**
* Construct a message for a new Xapian Document
*
* @param opts behavioral options
*/
Document() {}
Document(Options opts = Options::None): options_{opts} {}
/**
* Construct a message document based on an existing Xapian document.
*
* @param doc an existing Xapian document
* @param opts behavioral options
*/
Document(const Xapian::Document& doc): xdoc_{doc} {}
Document(const Xapian::Document& doc, Options opts = Options::None):
xdoc_{doc}, options_{opts} {}
/**
* DTOR
@ -240,11 +250,12 @@ private:
return cached_sexp_;
}
mutable Xapian::Document xdoc_;
Options options_;
mutable Sexp cached_sexp_;
mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */
};
MU_ENABLE_BITOPS(Document::Options);
} // namespace Mu

View File

@ -45,9 +45,10 @@
using namespace Mu;
struct Message::Private {
Private(Message::Options options): opts{options} {}
Private(Message::Options options):
opts{options}, doc{doc_opts(opts)} {}
Private(Message::Options options, Xapian::Document&& xdoc):
opts{options}, doc{std::move(xdoc)} {}
opts{options}, doc{std::move(xdoc), doc_opts(opts)} {}
Message::Options opts;
Document doc;
@ -70,6 +71,13 @@ struct Message::Private {
Option<std::string> embedded;
Option<std::string> language; /* body ISO language code */
private:
/**
 * Map message options onto the corresponding document options.
 *
 * This is static and depends only on its argument, so it can be called
 * from the constructors' member-initializer lists without relying on
 * the order in which the members are initialized.
 *
 * @param mopts message options
 *
 * @return Document::Options::SupportNgrams when @p mopts contains
 * Message::Options::SupportNgrams; Document::Options::None otherwise.
 */
static Document::Options doc_opts(Message::Options mopts) {
	return any_of(mopts & Message::Options::SupportNgrams) ?
		Document::Options::SupportNgrams :
		Document::Options::None;
}
};
@ -176,6 +184,11 @@ Message::document() const
return priv_->doc;
}
/* Return the options stored in this message's private data. */
Message::Options
Message::options() const
{
	const auto& priv{*priv_};
	return priv.opts;
}
unsigned
Message::docid() const

View File

@ -1,5 +1,5 @@
/*
** Copyright (C) 2022 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
** Copyright (C) 2022-2023 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
@ -49,8 +49,10 @@ public:
Decrypt = 1 << 0, /**< Attempt to decrypt */
RetrieveKeys = 1 << 1, /**< Auto-retrieve crypto keys (implies network
* access) */
AllowRelativePath = 1 << 2, /**< Allow relateive paths for filename
AllowRelativePath = 1 << 2, /**< Allow relative paths for filename
* in make_from_path */
SupportNgrams = 1 << 3, /**< Support ngrams, as used in
* CJK and other languages. */
};
/**
@ -60,7 +62,6 @@ public:
*/
Message(Message&& other) noexcept;
/**
* operator=
*
@ -147,6 +148,14 @@ public:
const Document& document() const;
/**
* The message options for this message
*
* @return message options
*/
Options options() const;
/**
* Get the document-id, or 0 if non-existent.
*