lib/index: Implement new indexer

Implement a new message indexer consisting of a single-threaded scanner
and a multi-threaded indexer.

This allows for a number of optimizations as well as background
indexing, though this initial version should behave similarly to the
old indexer.
This commit is contained in:
Dirk-Jan C. Binnema
2020-06-27 11:39:43 +03:00
parent 0e50bfc02c
commit 4e6bd7dfdf
12 changed files with 918 additions and 675 deletions

45
lib/index/Makefile.am Normal file
View File

@ -0,0 +1,45 @@
## Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software Foundation,
## Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
## Build rules for the indexer library (libmu-index.la).
include $(top_srcdir)/gtest.mk
## Preprocessor flags applied to every target in this directory.
AM_CPPFLAGS= \
$(CODE_COVERAGE_CPPFLAGS)
## C++ compiler flags; the -I entry adds $(top_srcdir)/lib to the include path.
AM_CXXFLAGS= \
$(WARN_CXXFLAGS) \
$(GLIB_CFLAGS) \
$(ASAN_CXXFLAGS) \
$(CODE_COVERAGE_CFLAGS) \
-I${top_srcdir}/lib
## Linker flags applied to every target in this directory.
AM_LDFLAGS= \
$(ASAN_LDFLAGS)
## libmu-index.la is listed under the noinst_ prefix (built, not installed).
noinst_LTLIBRARIES= \
libmu-index.la
## Sources of the library: the indexer and the scanner.
## NOTE(review): test-scanner.cc is not listed in any target here -- confirm
## whether it is meant to be built.
libmu_index_la_SOURCES= \
mu-indexer.cc \
mu-indexer.hh \
mu-scanner.cc \
mu-scanner.hh
## Libraries linked into libmu-index.la.
libmu_index_la_LIBADD= \
$(GLIB_LIBS) \
$(CODE_COVERAGE_LIBS)
include $(top_srcdir)/aminclude_static.am

350
lib/index/mu-indexer.cc Normal file
View File

@ -0,0 +1,350 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "mu-indexer.hh"
#include <config.h>
#include <atomic>
#include <mutex>
#include <vector>
#include <thread>
#include <condition_variable>
#include <iostream>
#include <atomic>
#include <chrono>
using namespace std::chrono_literals;
#include <xapian.h>
#include "mu-scanner.hh"
#include "utils/mu-async-queue.hh"
#include "utils/mu-error.hh"
#include "../mu-store.hh"
using namespace Mu;
/// Implementation state of Mu::Indexer.
///
/// Owns the Scanner that walks the maildir, the queue of message paths the
/// scanner produces (fq_), and the threads that consume that queue and add
/// the messages to the store.
struct Indexer::Private {
        /// Set up an (idle) indexer for @p store; the scanner is rooted at the
        /// store's root maildir and reports back through handler().
        explicit Private (Mu::Store& store):
                store_{store},
                scanner_{store_.metadata().root_maildir,
                         [this](auto&& path, auto&& statbuf, auto&& info){
                                 return handler(path, statbuf, info);
                         }},
                max_message_size_{store_.metadata().max_message_size} {

                g_message ("created indexer for %s -> %s",
                           store.metadata().root_maildir.c_str(),
                           store.metadata().database_path.c_str());
        }

        /// Stops scanning and joins all threads.
        ~Private() { stop(); }

        // NOTE(review): declared, but no definition is visible in this file.
        bool dir_predicate (const std::string& path, const struct dirent* dirent) const;

        /// Scanner callback; decides whether to descend into directories and
        /// queues message files that need (re)indexing.
        bool handler (const std::string& fullpath, struct stat *statbuf,
                      Scanner::HandleType htype);

        void maybe_start_worker(); ///< add a worker thread if the queue warrants it
        void worker();             ///< body of a worker thread
        bool cleanup();            ///< remove store messages without a file

        bool start(const Indexer::Config& conf);
        bool stop();

        Indexer::Config conf_;          ///< configuration of the current/last run
        Store&          store_;         ///< the message store we index into
        Scanner         scanner_;       ///< walks the maildir tree
        const size_t    max_message_size_; ///< bigger files are not indexed

        time_t          dirstamp_{};    ///< dirstamp of the directory last entered

        /// true when the scan / cleanup phase is not (or no longer) active;
        /// also set by stop() to ask those phases to end early.
        std::atomic<bool> scan_done_{true}, clean_done_{true};

        std::size_t              max_workers_{}; ///< upper bound for workers_; set by start()
        std::vector<std::thread> workers_;       ///< queue consumers; guarded by wlock_
        std::thread              scanner_worker_; ///< runs the scan + cleanup phases

        AsyncQueue<std::string>  fq_; ///< paths of messages waiting to be indexed

        /// Counters for the current/most recent indexing run.
        struct Progress {
                void reset() {
                        processed = updated = removed = 0;
                }
                std::atomic<size_t> processed{}; /**< Number of messages processed */
                std::atomic<size_t> updated{};   /**< Number of messages added/updated to store */
                std::atomic<size_t> removed{};   /**< Number of message removed from store */
        };
        Progress progress_;

        std::mutex lock_;  ///< taken by Indexer::start() / Indexer::stop()
        std::mutex wlock_; ///< guards workers_
};
/// Scanner callback, invoked for every directory entered/left and for every
/// candidate message file.
///
/// @param fullpath full path of the directory or file
/// @param statbuf  stat information for @p fullpath
/// @param htype    what kind of event this is
///
/// @return for EnterDir: true to descend into the directory, false to skip
/// it; for File: true if the file was queued for indexing, false if it was
/// skipped; for LeaveDir: always true.
bool
Indexer::Private::handler (const std::string& fullpath, struct stat *statbuf,
                           Scanner::HandleType htype)
{
        switch (htype) {
        case Scanner::HandleType::EnterDir: {
                // in lazy-mode, we ignore this dir if its dirstamp suggest it
                // is up-to-date (this is _not_ always true; hence we call it
                // lazy-mode)
                dirstamp_ = store_.dirstamp(fullpath);
                if (conf_.lazy_check && dirstamp_ == statbuf->st_mtime) {
                        g_debug("skip %s (seems up-to-date)", fullpath.c_str());
                        return false;
                }

                // don't index dirs with '.noindex'
                auto noindex = ::access((fullpath + "/.noindex").c_str(), F_OK) == 0;
                if (noindex) {
                        g_debug ("skip %s (has .noindex)", fullpath.c_str());
                        return false; // don't descend into this dir.
                }

                // don't index dirs with '.noupdate', unless we do a full
                // (re)index.
                if (!conf_.ignore_noupdate) {
                        auto noupdate = ::access((fullpath + "/.noupdate").c_str(), F_OK) == 0;
                        if (noupdate) {
                                g_debug ("skip %s (has .noupdate)", fullpath.c_str());
                                return false;
                        }
                }

                g_debug ("process %s", fullpath.c_str());
                return true;
        }
        case Scanner::HandleType::LeaveDir: {
                // remember when we last went through this directory
                store_.set_dirstamp(fullpath, ::time({}));
                return true;
        }
        case Scanner::HandleType::File: {
                // st_size is an off_t; convert once so that both the
                // comparison and the %zu conversion below see a size_t.
                const auto fsize = static_cast<size_t>(statbuf->st_size);
                if (fsize > max_message_size_) {
                        g_debug ("skip %s (too big: %zu bytes)",
                                 fullpath.c_str(), fsize);
                        return false;
                }

                // if the message is not in the db yet, or not up-to-date, queue
                // it for updating/inserting.
                if (statbuf->st_mtime <= dirstamp_ &&
                    store_.contains_message (fullpath)) {
                        //g_debug ("skip %s: already up-to-date");
                        return false;
                }

                fq_.push(std::string{fullpath});
                return true;
        }
        default:
                g_return_val_if_reached (false);
                return false;
        }
}
void
Indexer::Private::maybe_start_worker()
{
std::lock_guard<std::mutex> wlock{wlock_};
if (fq_.size() > workers_.size() && workers_.size() < max_workers_)
workers_.emplace_back(std::thread([this]{worker();}));
}
/// Body of a worker thread: take paths from the queue and add the
/// corresponding messages to the store, until the scan is done and the queue
/// has been drained.
void
Indexer::Private::worker()
{
        g_debug ("started worker");

        std::string path;
        for (;;) {
                if (scan_done_ && fq_.empty())
                        break; // nothing left, and nothing more will come

                // wait a while for an item; re-check the exit condition on
                // timeout.
                if (!fq_.pop (path, 250ms))
                        continue;

                //g_debug ("popped (n=%zu) path %s", fq_.size(), item.c_str());
                ++progress_.processed;

                try {
                        store_.add_message(path);
                        ++progress_.updated;
                } catch (const Mu::Error& er) {
                        g_warning ("error adding message @ %s: %s",
                                   path.c_str(), er.what());
                }

                maybe_start_worker();
        }
}
bool
Indexer::Private::cleanup()
{
g_debug ("starting cleanup");
std::vector<Store::Id> orphans_; // store messages without files.
store_.for_each([&](Store::Id id, const std::string &path) {
if (clean_done_)
return false;
if (::access(path.c_str(), F_OK) != 0) {
g_debug ("%s not found; queing id=%u for removal",
path.c_str(), id);
orphans_.emplace_back(id);
}
return !clean_done_;
});
if (orphans_.empty()) {
g_debug("nothing to clean up");
return true;
}
store_.remove_messages (orphans_);
g_debug ("removed %zu orphan messages from store", orphans_.size());
return true;
}
/// (Re)start indexing with the given configuration.
///
/// Stops any previous run, then starts one worker thread plus a thread that
/// runs the scan phase (if conf.scan) followed by the cleanup phase (if
/// conf.cleanup) and finally commits the store.
///
/// @param conf configuration for this run
///
/// @return always true
bool
Indexer::Private::start(const Indexer::Config& conf)
{
        stop();

        conf_ = conf;
        if (conf_.max_threads == 0) {
                // hardware_concurrency() may return 0 if it cannot tell
                const auto hw_threads = std::thread::hardware_concurrency();
                max_workers_ = hw_threads > 0 ? hw_threads : 1;
        } else
                max_workers_ = conf.max_threads;

        g_debug ("starting indexer with up to %zu threads", max_workers_);

        progress_.reset();
        scan_done_ = clean_done_ = false;
        {
                std::lock_guard<std::mutex> wlock{wlock_};
                workers_.emplace_back(std::thread([this]{worker();}));
        }

        scanner_worker_ = std::thread([this]{
                if (conf_.scan) {
                        g_debug("starting scanner");
                        if (!scanner_.start()) {
                                g_warning ("failed to start scanner");
                                // nothing else is going to happen in this run;
                                // without this the workers would wait forever
                                // and is_running() would stay true.
                                scan_done_ = clean_done_ = true;
                                return;
                        }
                        g_debug ("scanner finished");
                }
                // mark the phase as done even when it was not requested, so
                // the workers can finish.
                scan_done_ = true;

                if (conf_.cleanup) {
                        g_debug ("starting cleanup");
                        cleanup();
                        g_debug ("cleanup finished");
                }
                clean_done_ = true;

                store_.commit();
        });

        g_debug ("started indexer");

        return true;
}
bool
Indexer::Private::stop()
{
scanner_.stop();
scan_done_ = clean_done_ = true;
const auto w_n = workers_.size();
fq_.clear();
if (scanner_worker_.joinable())
scanner_worker_.join();
for (auto&& w: workers_)
if (w.joinable())
w.join();
workers_.clear();
if (w_n > 0)
g_debug ("stopped indexer (joined %zu worker(s))", w_n);
return true;
}
/// Create an indexer that works on @p store.
Indexer::Indexer (Store& store)
        : priv_(std::make_unique<Private>(store))
{
}

/// Defined here, where Private is a complete type.
Indexer::~Indexer() = default;
/// Start indexing with @p conf, unless indexing is already underway.
///
/// @return true if indexing was already running, otherwise the result of
/// starting it.
bool
Indexer::start(const Indexer::Config& conf)
{
        std::lock_guard<std::mutex> guard{priv_->lock_};

        // already busy? then there is nothing to start.
        return is_running() || priv_->start(conf);
}
bool
Indexer::stop()
{
std::lock_guard<std::mutex> l(priv_->lock_);
if (!is_running())
return true;
g_debug ("stopping indexer");
return priv_->stop();
}
/// Is indexing underway? That is the case while the scan or the cleanup has
/// not finished, or while there are still queued messages.
bool
Indexer::is_running() const
{
        if (!priv_->scan_done_)
                return true;
        if (!priv_->clean_done_)
                return true;

        return !priv_->fq_.empty();
}
/// Take a snapshot of the running state and the progress counters.
Indexer::Progress
Indexer::progress() const
{
        Progress snapshot;

        snapshot.running   = is_running();
        snapshot.processed = priv_->progress_.processed;
        snapshot.updated   = priv_->progress_.updated;
        snapshot.removed   = priv_->progress_.removed;

        return snapshot;
}

114
lib/index/mu-indexer.hh Normal file
View File

@ -0,0 +1,114 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#ifndef MU_INDEXER_HH__
#define MU_INDEXER_HH__
#include <memory>
#include <chrono>
namespace Mu {

struct Store;

/// Drives the process of indexing the messages of a Store.
class Indexer {
public:
        /// Create an indexer.
        ///
        /// @param store the message store to use
        Indexer (Store& store);

        /// Destructor.
        ~Indexer();

        /// Settings for an indexing run.
        struct Config {
                /// scan for new messages
                bool scan = true;
                /// clean up messages that are no longer in the file system
                bool cleanup = true;
                /// maximum number of threads to use
                size_t max_threads = 0;
                /// ignore .noupdate files
                bool ignore_noupdate = false;
                /// whether to skip directories that don't have a changed
                /// mtime
                bool lazy_check = false;
        };

        /// Start indexing; does nothing when indexing is already underway.
        ///
        /// @param conf a configuration object
        ///
        /// @return true if starting worked or indexing was already underway;
        /// false otherwise.
        bool start(const Config& conf);

        /// Stop indexing; does nothing when not indexing.
        ///
        /// @return true if we stopped indexing, or indexing was not underway;
        /// false otherwise.
        bool stop();

        /// Is an indexing process running?
        ///
        /// @return true or false.
        bool is_running() const;

        /// Describes the progress of an indexing run.
        struct Progress {
                /// is an index operation in progress?
                bool running = false;
                /// number of messages processed
                size_t processed = 0;
                /// number of messages added/updated to store
                size_t updated = 0;
                /// number of messages removed from store
                size_t removed = 0;
        };

        /// Get an object describing the current progress. It describes the
        /// most recent indexing job, and is reset upon a fresh start().
        ///
        /// @return a progress object.
        Progress progress() const;

private:
        struct Private;
        std::unique_ptr<Private> priv_;
};

} // namespace Mu
#endif /* MU_INDEXER_HH__ */

242
lib/index/mu-scanner.cc Normal file
View File

@ -0,0 +1,242 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include "mu-scanner.hh"
#include "config.h"
#include <chrono>
#include <mutex>
#include <atomic>
#include <thread>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <glib.h>
#include "utils/mu-utils.hh"
#include "utils/mu-error.hh"
using namespace Mu;
/// Implementation state of Mu::Scanner.
struct Scanner::Private {
        /// @param root_dir directory where scanning starts
        /// @param handler  callback for directories and files; must be set
        ///
        /// Throws Mu::Error when @p handler is empty.
        Private (const std::string& root_dir, Scanner::Handler handler)
                : root_dir_(root_dir), handler_(handler)
        {
                if (handler_)
                        return;

                throw Mu::Error{Error::Code::Internal, "missing handler"};
        }

        ~Private()
        {
                stop();
        }

        bool start();
        bool stop();

        /// Handle a single directory entry found in @p path.
        bool process_dentry (const std::string& path, struct dirent *dentry, bool is_maildir);
        /// Handle all entries of the directory @p path.
        bool process_dir (const std::string& path, bool is_maildir);

        const std::string      root_dir_; ///< where the scan starts
        const Scanner::Handler handler_;  ///< user callback
        std::atomic<bool>      running_{}; ///< true while a scan is underway
        std::mutex             lock_;     ///< taken by Scanner::start() / Scanner::stop()
};
/// Is this an entry we never look at, i.e. one with an empty name, "." or
/// ".."?
///
/// The name is inspected left to right, so nothing beyond its terminating
/// NUL is read.
///
/// @param dentry a directory entry
///
/// @return true if the entry is to be ignored, false otherwise
static bool
is_special_dir (const struct dirent *dentry)
{
        const char *name{dentry->d_name};

        if (name[0] == '\0')
                return true;  // empty name
        if (name[0] != '.')
                return false; // cannot be "." or ".."
        if (name[1] == '\0')
                return true;  // "."

        return name[1] == '.' && name[2] == '\0'; // ".."
}
/// Is @p dirname exactly "cur" or "new", i.e. the name of a maildir leaf
/// directory that holds messages?
static bool
is_new_cur (const char *dirname)
{
        // compare with a three-letter name, terminating NUL included; the
        // comparison stops at the first mismatch.
        const auto is_leaf = [dirname](char c0, char c1, char c2) {
                return dirname[0] == c0 && dirname[1] == c1 &&
                       dirname[2] == c2 && dirname[3] == '\0';
        };

        return is_leaf('c', 'u', 'r') || is_leaf('n', 'e', 'w');
}
bool
Scanner::Private::process_dentry (const std::string& path, struct dirent *dentry,
bool is_maildir)
{
if (is_special_dir (dentry))
return true; // ignore.
const auto fullpath{path + "/" + dentry->d_name};
struct stat statbuf;
if (::stat(fullpath.c_str(), &statbuf) != 0) {
g_warning ("failed to stat %s: %s", fullpath.c_str(), ::strerror(errno));
return false;
}
if (S_ISDIR(statbuf.st_mode)) {
const auto res = handler_(fullpath, &statbuf, Scanner::HandleType::EnterDir);
if (!res) {
//g_debug ("skipping dir %s", fullpath.c_str());
return true; // skip
}
process_dir (fullpath, is_new_cur(dentry->d_name));
return handler_(fullpath, &statbuf, Scanner::HandleType::LeaveDir);
} else if (S_ISREG(statbuf.st_mode) && is_maildir)
return handler_(fullpath, &statbuf, Scanner::HandleType::File);
g_debug ("skip %s (neither maildir-file nor directory)", fullpath.c_str());
return true;
}
/// Go through the entries of directory @p path, processing each of them,
/// until all have been seen or the scan is stopped.
///
/// @param path       the directory to read
/// @param is_maildir whether @p path is a cur/new maildir leaf
///
/// @return false if the directory could not be opened, true otherwise
bool
Scanner::Private::process_dir (const std::string& path, bool is_maildir)
{
        const auto handle = opendir (path.c_str());
        if (G_UNLIKELY(!handle)) {
                g_warning("failed to scan dir %s: %s", path.c_str(), strerror(errno));
                return false;
        }

        // TODO: sort dentries by inode order, which makes things faster for extfs.
        // see mu-maildir.c

        for (;;) {
                if (!running_)
                        break; // scan was stopped

                // readdir() returns NULL both at the end of the directory and
                // on error; only errno tells them apart.
                errno = 0;
                const auto entry{readdir(handle)};
                if (G_UNLIKELY(!entry)) {
                        if (errno == 0)
                                break; // no more entries

                        g_warning("failed to read %s: %s", path.c_str(), strerror(errno));
                        continue;
                }

                process_dentry (path, entry, is_maildir);
        }

        closedir (handle);

        return true;
}
/// Check that the root directory is usable, then scan it; blocks until the
/// scan has finished or was stopped.
///
/// @return false if the root directory path is too long, not readable, not
/// stat'able or not a directory; true otherwise.
bool
Scanner::Private::start()
{
        const auto& root{root_dir_};

        if (G_UNLIKELY(root.length() > PATH_MAX)) {
                g_warning("path too long");
                return false;
        }

        if (G_UNLIKELY(access (root.c_str(), F_OK | R_OK) != 0)) {
                g_warning("'%s' is not readable: %s", root.c_str(), strerror (errno));
                return false;
        }

        struct stat st{};
        if (G_UNLIKELY(stat (root.c_str(), &st) != 0)) {
                g_warning("'%s' is not stat'able: %s", root.c_str(), strerror (errno));
                return false;
        }

        if (G_UNLIKELY(!S_ISDIR (st.st_mode))) {
                g_warning("'%s' is not a directory", root.c_str());
                return false;
        }

        running_ = true;
        g_debug ("starting scan @ %s", root_dir_.c_str());

        // the root itself may be a maildir leaf (cur / new)
        auto leaf{g_path_get_basename(root_dir_.c_str())};
        const bool root_is_maildir =
                g_strcmp0(leaf, "cur") == 0 || g_strcmp0(leaf, "new") == 0;
        g_free(leaf);

        const auto t0{std::chrono::steady_clock::now()};
        process_dir(root_dir_, root_is_maildir);
        const auto elapsed = std::chrono::steady_clock::now() - t0;

        g_debug ("finished scan of %s in %" G_GINT64_FORMAT " ms", root_dir_.c_str(),
                 to_ms(elapsed));
        running_ = false;

        return true;
}
/// Ask a running scan to stop; does nothing when no scan is underway.
///
/// @return always true
bool
Scanner::Private::stop()
{
        if (running_) {
                g_debug ("stopping scan");
                running_ = false;
        }

        return true;
}
/// Create a scanner for @p root_dir that reports to @p handler.
Scanner::Scanner (const std::string& root_dir, Scanner::Handler handler)
        : priv_(std::make_unique<Private>(root_dir, handler))
{
}

/// Defined here, where Private is a complete type.
Scanner::~Scanner() = default;
bool
Scanner::start()
{
{
std::lock_guard<std::mutex> l(priv_->lock_);
if (priv_->running_)
return true; //nothing to do
priv_->running_ = true;
}
const auto res = priv_->start();
priv_->running_ = false;
return res;
}
bool
Scanner::stop()
{
std::lock_guard<std::mutex> l(priv_->lock_);
return priv_->stop();
}
/// Is a scan underway?
bool
Scanner::is_running() const
{
        const bool busy{priv_->running_};
        return busy;
}

96
lib/index/mu-scanner.hh Normal file
View File

@ -0,0 +1,96 @@
/*
** Copyright (C) 2020 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#ifndef MU_SCANNER_HH__
#define MU_SCANNER_HH__
#include <functional>
#include <memory>
#include <dirent.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
namespace Mu {

/// @brief Maildir scanner
///
/// Walks a maildir tree recursively and invokes the Handler callback for the
/// directories and files it finds.
///
/// The handler is _not_ called for:
///  - the directory entries '.' and '..'
///  - files that do not live directly in a cur / new leaf maildir
///
/// NOTE(review): earlier documentation also listed "files starting with '.'"
/// as filtered out; confirm against the implementation.
class Scanner {
public:
        /// The kind of event a Handler is called for.
        enum class HandleType { File, EnterDir, LeaveDir };

        /// Prototype for a handler function.
        ///
        /// For EnterDir, a false return value means the directory is skipped.
        using Handler = std::function<bool(const std::string& fullpath,
                                           struct stat*       statbuf,
                                           HandleType         htype)>;

        /// Create a scanner that scans a directory, recursively.
        ///
        /// @param root_dir root dir to start scanning
        /// @param handler  handler function, called for directory entries
        Scanner (const std::string& root_dir, Handler handler);

        /// Destructor.
        ~Scanner();

        /// Start the scan; this is a blocking call that runs until the scan
        /// has finished or (from another thread) stop() is called.
        ///
        /// @return true if starting worked; false otherwise
        bool start();

        /// Stop the scan.
        ///
        /// @return true if stopping worked; false otherwise
        bool stop();

        /// Is a scan currently running?
        ///
        /// @return true or false
        bool is_running() const;

private:
        struct Private;
        std::unique_ptr<Private> priv_;
};

} // namespace Mu
#endif /* MU_SCANNER_HH__ */

68
lib/index/test-scanner.cc Normal file
View File

@ -0,0 +1,68 @@
/*
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This library is free software; you can redistribute it and/or
** modify it under the terms of the GNU Lesser General Public License
** as published by the Free Software Foundation; either version 2.1
** of the License, or (at your option) any later version.
**
** This library is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
** Lesser General Public License for more details.
**
** You should have received a copy of the GNU Lesser General Public
** License along with this library; if not, write to the Free
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
** 02110-1301, USA.
*/
#include <vector>
#include <glib.h>
#include <iostream>
#include <sstream>
#include "mu-scanner.hh"
#include "mu-utils.hh"
using namespace Mu;
static void
test_scan_maildir ()
{
allow_warnings();
Scanner scanner{"/home/djcb/Maildir",
[](const dirent* dentry)->bool {
g_print ("%02x %s\n", dentry->d_type, dentry->d_name);
return true;
},
[](const std::string& fullpath, const struct stat* statbuf,
auto&& info)->bool {
g_print ("%s %zu\n", fullpath.c_str(), statbuf->st_size);
return true;
}
};
g_assert_true (scanner.start());
while (scanner.is_running()) {
sleep(1);
}
}
/// Test entry point: register the scanner test with the GLib test framework
/// and run it; a std::runtime_error is reported on stderr and turned into
/// exit code 1.
int
main (int argc, char *argv[])
{
        try {
                g_test_init (&argc, &argv, NULL);

                g_test_add_func ("/utils/scanner/scan-maildir", test_scan_maildir);

                return g_test_run ();

        } catch (const std::runtime_error& re) {
                std::cerr << re.what() << "\n";
                return 1;
        }
}