support xapian ngrams

Xapian supports an "ngrams" option to help with languages/scripts
without explicit wordbreaks, such as Chinese / Japanese / Korean.

Add some plumbing for supporting this in mu as well. Experimental for
now.
This commit is contained in:
Dirk-Jan C. Binnema
2023-09-09 11:57:05 +03:00
parent f6122ecc9e
commit 264bb092f0
20 changed files with 207 additions and 81 deletions

View File

@ -41,17 +41,27 @@ namespace Mu {
*/
class Document {
public:
/**
 * Behavioral options for a Document.
 */
enum struct Options {
	/** No options set. */
	None = 0,
	/** Support ngrams, as used in CJK and other languages. */
	SupportNgrams = 1 << 0,
};
/**
* Construct a message for a new Xapian Document
*
* @param opts behavioral options
*/
Document() {}
Document(Options opts = Options::None): options_{opts} {}
/**
* Construct a message document based on an existing Xapian document.
*
* @param doc the existing Xapian document
* @param opts behavioral options
*/
Document(const Xapian::Document& doc): xdoc_{doc} {}
Document(const Xapian::Document& doc, Options opts = Options::None):
xdoc_{doc}, options_{opts} {}
/**
* DTOR
@ -240,11 +250,12 @@ private:
return cached_sexp_;
}
mutable Xapian::Document xdoc_;
Options options_;
mutable Sexp cached_sexp_;
mutable bool dirty_sexp_{}; /* xdoc's sexp is outdated */
};
MU_ENABLE_BITOPS(Document::Options);
} // namespace Mu