index: save/commit metadata after messages

Ensure the metadata (dirstamps) for messages are only written / committed _after_
the accompanying message have been written / committed.

This avoids missing updates when indexing gets terminated unexpectedly.
This commit is contained in:
Dirk-Jan C. Binnema
2022-02-03 22:51:02 +02:00
parent 12658a3dc6
commit 05393ba797
5 changed files with 174 additions and 113 deletions

View File

@ -22,6 +22,7 @@
#include <config.h> #include <config.h>
#include <atomic> #include <atomic>
#include <algorithm>
#include <mutex> #include <mutex>
#include <vector> #include <vector>
#include <thread> #include <thread>
@ -39,29 +40,41 @@ using namespace std::chrono_literals;
using namespace Mu; using namespace Mu;
struct IndexState { struct IndexState {
enum State { Idle, Scanning, Cleaning }; enum State { Idle,
Scanning,
Cleaning };
static const char* name(State s) static const char* name(State s)
{ {
switch (s) { switch (s) {
case Idle: return "idle"; case Idle:
case Scanning: return "scanning"; return "idle";
case Cleaning: return "cleaning"; case Scanning:
default: return "<error>"; return "scanning";
case Cleaning:
return "cleaning";
default:
return "<error>";
} }
} }
bool operator==(State rhs) const { return state_ == rhs; } bool operator==(State rhs) const
{
return state_ == rhs;
}
bool operator!=(State rhs) const
{
return state_ != rhs;
}
void change_to(State new_state) void change_to(State new_state)
{ {
g_debug("changing indexer state %s->%s", g_debug("changing indexer state %s->%s", name((State)state_),
name((State)state_),
name((State)new_state)); name((State)new_state));
state_ = new_state; state_ = new_state;
} }
private: private:
State state_{Idle}; std::atomic<State> state_{Idle};
}; };
struct Indexer::Private { struct Indexer::Private {
@ -74,17 +87,20 @@ struct Indexer::Private {
{ {
g_message("created indexer for %s -> %s (batch-size: %zu)", g_message("created indexer for %s -> %s (batch-size: %zu)",
store.metadata().root_maildir.c_str(), store.metadata().root_maildir.c_str(),
store.metadata().database_path.c_str(), store.metadata().database_path.c_str(), store.metadata().batch_size);
store.metadata().batch_size);
} }
~Private() { stop(); } ~Private()
{
stop();
}
bool dir_predicate(const std::string& path, const struct dirent* dirent) const; bool dir_predicate(const std::string& path, const struct dirent* dirent) const;
bool handler(const std::string& fullpath, struct stat* statbuf, Scanner::HandleType htype); bool handler(const std::string& fullpath, struct stat* statbuf, Scanner::HandleType htype);
void maybe_start_worker(); void maybe_start_worker();
void worker(); void item_worker();
void scan_worker();
bool cleanup(); bool cleanup();
@ -101,17 +117,24 @@ struct Indexer::Private {
std::vector<std::thread> workers_; std::vector<std::thread> workers_;
std::thread scanner_worker_; std::thread scanner_worker_;
AsyncQueue<std::string> fq_; struct WorkItem {
std::string full_path;
enum Type {
Dir,
File
};
Type type;
};
AsyncQueue<WorkItem> todos_;
Progress progress_; Progress progress_;
IndexState state_; IndexState state_;
std::mutex lock_, w_lock_;
std::mutex lock_, wlock_;
}; };
bool bool
Indexer::Private::handler(const std::string& fullpath, Indexer::Private::handler(const std::string& fullpath, struct stat* statbuf,
struct stat* statbuf,
Scanner::HandleType htype) Scanner::HandleType htype)
{ {
switch (htype) { switch (htype) {
@ -124,8 +147,7 @@ Indexer::Private::handler(const std::string& fullpath,
dirstamp_ = store_.dirstamp(fullpath); dirstamp_ = store_.dirstamp(fullpath);
if (conf_.lazy_check && dirstamp_ >= statbuf->st_mtime && if (conf_.lazy_check && dirstamp_ >= statbuf->st_mtime &&
htype == Scanner::HandleType::EnterNewCur) { htype == Scanner::HandleType::EnterNewCur) {
g_debug("skip %s (seems up-to-date: %s >= %s)", g_debug("skip %s (seems up-to-date: %s >= %s)", fullpath.c_str(),
fullpath.c_str(),
time_to_string("%FT%T", dirstamp_).c_str(), time_to_string("%FT%T", dirstamp_).c_str(),
time_to_string("%FT%T", statbuf->st_mtime).c_str()); time_to_string("%FT%T", statbuf->st_mtime).c_str());
return false; return false;
@ -152,7 +174,7 @@ Indexer::Private::handler(const std::string& fullpath,
return true; return true;
} }
case Scanner::HandleType::LeaveDir: { case Scanner::HandleType::LeaveDir: {
store_.set_dirstamp(fullpath, ::time(NULL)); todos_.push({fullpath, WorkItem::Type::Dir});
return true; return true;
} }
@ -160,8 +182,7 @@ Indexer::Private::handler(const std::string& fullpath,
++progress_.checked; ++progress_.checked;
if ((size_t)statbuf->st_size > max_message_size_) { if ((size_t)statbuf->st_size > max_message_size_) {
g_debug("skip %s (too big: %" G_GINT64_FORMAT " bytes)", g_debug("skip %s (too big: %" G_GINT64_FORMAT " bytes)", fullpath.c_str(),
fullpath.c_str(),
(gint64)statbuf->st_size); (gint64)statbuf->st_size);
return false; return false;
} }
@ -175,44 +196,59 @@ Indexer::Private::handler(const std::string& fullpath,
// push the remaining messages to our "todo" queue for // push the remaining messages to our "todo" queue for
// (re)parsing and adding/updating to the database. // (re)parsing and adding/updating to the database.
fq_.push(std::string{fullpath}); todos_.push({fullpath, WorkItem::Type::File});
return true; return true;
} }
default: g_return_val_if_reached(false); return false; default:
g_return_val_if_reached(false);
return false;
} }
} }
void void
Indexer::Private::maybe_start_worker() Indexer::Private::maybe_start_worker()
{ {
std::lock_guard<std::mutex> wlock{wlock_}; std::lock_guard lock{w_lock_};
if (fq_.size() > workers_.size() && workers_.size() < max_workers_) if (todos_.size() > workers_.size() && workers_.size() < max_workers_) {
workers_.emplace_back(std::thread([this] { worker(); })); workers_.emplace_back(std::thread([this] { item_worker(); }));
g_debug("added worker %zu", workers_.size());
}
} }
void void
Indexer::Private::worker() Indexer::Private::item_worker()
{ {
std::string item; WorkItem item;
g_debug("started worker"); g_debug("started worker");
while (state_ == IndexState::Scanning) { while (state_ == IndexState::Scanning) {
if (!todos_.pop(item, 250ms))
if (!fq_.pop(item, 250ms))
continue; continue;
try { try {
std::unique_lock lock{lock_}; std::lock_guard lock{w_lock_};
switch (item.type) {
store_.add_message(item, true /*use-transaction*/); case WorkItem::Type::File:
store_.add_message(item.full_path,
true /*use-transaction*/);
++progress_.updated; ++progress_.updated;
break;
case WorkItem::Type::Dir:
store_.set_dirstamp(item.full_path, ::time(NULL),
true /*use-transaction*/);
break;
default:
g_warn_if_reached();
break;
}
} catch (const Mu::Error& er) { } catch (const Mu::Error& er) {
g_warning("error adding message @ %s: %s", item.c_str(), er.what()); g_warning("error adding message @ %s: %s",
item.full_path.c_str(), er.what());
} }
maybe_start_worker(); maybe_start_worker();
std::this_thread::yield();
} }
} }
@ -227,8 +263,7 @@ Indexer::Private::cleanup()
++n; ++n;
if (::access(path.c_str(), R_OK) != 0) { if (::access(path.c_str(), R_OK) != 0) {
g_debug("cannot read %s (id=%u); queueing for removal from store", g_debug("cannot read %s (id=%u); queueing for removal from store",
path.c_str(), path.c_str(), id);
id);
orphans.emplace_back(id); orphans.emplace_back(id);
} }
@ -246,29 +281,9 @@ Indexer::Private::cleanup()
return true; return true;
} }
bool void
Indexer::Private::start(const Indexer::Config& conf) Indexer::Private::scan_worker()
{ {
stop();
conf_ = conf;
if (conf_.max_threads == 0)
max_workers_ = std::thread::hardware_concurrency();
else
max_workers_ = conf.max_threads;
g_debug("starting indexer with <= %zu worker thread(s)", max_workers_);
g_debug("indexing: %s; clean-up: %s",
conf_.scan ? "yes" : "no",
conf_.cleanup ? "yes" : "no");
state_.change_to(IndexState::Scanning);
{
/* kick off the first worker, which will spawn more if needed. */
std::lock_guard<std::mutex> wlock{wlock_};
workers_.emplace_back(std::thread([this] { worker(); }));
}
scanner_worker_ = std::thread([this] {
progress_ = {}; progress_ = {};
if (conf_.scan) { if (conf_.scan) {
@ -277,13 +292,16 @@ Indexer::Private::start(const Indexer::Config& conf)
g_warning("failed to start scanner"); g_warning("failed to start scanner");
goto leave; goto leave;
} }
g_debug("scanner finished with %zu file(s) in queue", fq_.size()); g_debug("scanner finished with %zu file(s) in queue", todos_.size());
} }
// now there may still be messages in the work queue... // now there may still be messages in the work queue...
// finish those; this is a bit ugly; perhaps we should // finish those; this is a bit ugly; perhaps we should
// handle SIGTERM etc. // handle SIGTERM etc.
while (!fq_.empty()) if (!todos_.empty())
g_debug("process %zu remaining message(s) with %zu worker(s)", todos_.size(),
workers_.size());
while (!todos_.empty())
std::this_thread::sleep_for(100ms); std::this_thread::sleep_for(100ms);
store_.commit(); store_.commit();
@ -294,11 +312,36 @@ Indexer::Private::start(const Indexer::Config& conf)
cleanup(); cleanup();
g_debug("cleanup finished"); g_debug("cleanup finished");
} }
leave: leave:
state_.change_to(IndexState::Idle); state_.change_to(IndexState::Idle);
}); }
bool
Indexer::Private::start(const Indexer::Config& conf)
{
stop();
conf_ = conf;
if (conf_.max_threads == 0) {
/* we're blocked mostly by a) filesystem and b) database;
* so it's not very useful to start many threads */
max_workers_ = std::max(
4U, std::thread::hardware_concurrency());
} else
max_workers_ = conf.max_threads;
g_debug("starting indexer with <= %zu worker thread(s)", max_workers_);
g_debug("indexing: %s; clean-up: %s", conf_.scan ? "yes" : "no",
conf_.cleanup ? "yes" : "no");
state_.change_to(IndexState::Scanning);
/* kick off the first worker, which will spawn more if needed. */
workers_.emplace_back(std::thread([this] { item_worker(); }));
/* kick the disk-scanner thread */
scanner_worker_ = std::thread([this] { scan_worker(); });
g_debug("started indexer"); g_debug("started indexer");
return true; return true;
} }
@ -306,26 +349,24 @@ bool
Indexer::Private::stop() Indexer::Private::stop()
{ {
scanner_.stop(); scanner_.stop();
state_.change_to(IndexState::Idle);
const auto w_n = workers_.size(); todos_.clear();
fq_.clear();
if (scanner_worker_.joinable()) if (scanner_worker_.joinable())
scanner_worker_.join(); scanner_worker_.join();
state_.change_to(IndexState::Idle);
for (auto&& w : workers_) for (auto&& w : workers_)
if (w.joinable()) if (w.joinable())
w.join(); w.join();
workers_.clear(); workers_.clear();
if (w_n > 0)
g_debug("stopped indexer (joined %zu worker(s))", w_n);
return true; return true;
} }
Indexer::Indexer(Store& store) : priv_{std::make_unique<Private>(store)} {} Indexer::Indexer(Store& store)
: priv_{std::make_unique<Private>(store)}
{
}
Indexer::~Indexer() = default; Indexer::~Indexer() = default;
@ -338,7 +379,7 @@ Indexer::start(const Indexer::Config& conf)
return false; return false;
} }
std::lock_guard<std::mutex> l(priv_->lock_); std::lock_guard lock(priv_->lock_);
if (is_running()) if (is_running())
return true; return true;
@ -348,7 +389,7 @@ Indexer::start(const Indexer::Config& conf)
bool bool
Indexer::stop() Indexer::stop()
{ {
std::lock_guard<std::mutex> l(priv_->lock_); std::lock_guard lock{priv_->lock_};
if (!is_running()) if (!is_running())
return true; return true;
@ -360,7 +401,7 @@ Indexer::stop()
bool bool
Indexer::is_running() const Indexer::is_running() const
{ {
return !(priv_->state_ == IndexState::Idle) || !priv_->fq_.empty(); return priv_->state_ != IndexState::Idle;
} }
Indexer::Progress Indexer::Progress

View File

@ -58,7 +58,9 @@ public:
}; };
/** /**
* Start indexing. If already underway, do nothing. * Start indexing. If already underway, do nothing. This returns
* immediately after starting, with the work being done in the
* background.
* *
* @param conf a configuration object * @param conf a configuration object
* *

View File

@ -44,7 +44,10 @@ struct Scanner::Private {
if (!handler_) if (!handler_)
throw Mu::Error{Error::Code::Internal, "missing handler"}; throw Mu::Error{Error::Code::Internal, "missing handler"};
} }
~Private() { stop(); } ~Private()
{
stop();
}
bool start(); bool start();
bool stop(); bool stop();
@ -65,7 +68,8 @@ is_special_dir(const char* d_name)
} }
bool bool
Scanner::Private::process_dentry(const std::string& path, struct dirent* dentry, bool is_maildir) Scanner::Private::process_dentry(const std::string& path, struct dirent* dentry,
bool is_maildir)
{ {
const auto d_name{dentry->d_name}; const auto d_name{dentry->d_name};
@ -83,8 +87,8 @@ Scanner::Private::process_dentry(const std::string& path, struct dirent* dentry,
if (S_ISDIR(statbuf.st_mode)) { if (S_ISDIR(statbuf.st_mode)) {
const auto new_cur = const auto new_cur =
std::strcmp(d_name, "cur") == 0 || std::strcmp(d_name, "new") == 0; std::strcmp(d_name, "cur") == 0 || std::strcmp(d_name, "new") == 0;
const auto htype = new_cur ? Scanner::HandleType::EnterNewCur const auto htype =
: Scanner::HandleType::EnterDir; new_cur ? Scanner::HandleType::EnterNewCur : Scanner::HandleType::EnterDir;
const auto res = handler_(fullpath, &statbuf, htype); const auto res = handler_(fullpath, &statbuf, htype);
if (!res) if (!res)
return true; // skip return true; // skip
@ -97,12 +101,16 @@ Scanner::Private::process_dentry(const std::string& path, struct dirent* dentry,
return handler_(fullpath, &statbuf, Scanner::HandleType::File); return handler_(fullpath, &statbuf, Scanner::HandleType::File);
g_debug("skip %s (neither maildir-file nor directory)", fullpath.c_str()); g_debug("skip %s (neither maildir-file nor directory)", fullpath.c_str());
return true; return true;
} }
bool bool
Scanner::Private::process_dir(const std::string& path, bool is_maildir) Scanner::Private::process_dir(const std::string& path, bool is_maildir)
{ {
if (!running_)
return true; /* we're done */
const auto dir = opendir(path.c_str()); const auto dir = opendir(path.c_str());
if (G_UNLIKELY(!dir)) { if (G_UNLIKELY(!dir)) {
g_warning("failed to scan dir %s: %s", path.c_str(), g_strerror(errno)); g_warning("failed to scan dir %s: %s", path.c_str(), g_strerror(errno));
@ -171,8 +179,7 @@ Scanner::Private::start()
const auto start{std::chrono::steady_clock::now()}; const auto start{std::chrono::steady_clock::now()};
process_dir(root_dir_, is_maildir); process_dir(root_dir_, is_maildir);
const auto elapsed = std::chrono::steady_clock::now() - start; const auto elapsed = std::chrono::steady_clock::now() - start;
g_debug("finished scan of %s in %" G_GINT64_FORMAT " ms", g_debug("finished scan of %s in %" G_GINT64_FORMAT " ms", root_dir_.c_str(),
root_dir_.c_str(),
to_ms(elapsed)); to_ms(elapsed));
running_ = false; running_ = false;
@ -201,15 +208,10 @@ Scanner::~Scanner() = default;
bool bool
Scanner::start() Scanner::start()
{ {
{
std::lock_guard<std::mutex> l(priv_->lock_);
if (priv_->running_) if (priv_->running_)
return true; // nothing to do return true; // nothing to do
priv_->running_ = true; const auto res = priv_->start(); /* blocks */
}
const auto res = priv_->start();
priv_->running_ = false; priv_->running_ = false;
return res; return res;
@ -218,7 +220,7 @@ Scanner::start()
bool bool
Scanner::stop() Scanner::stop()
{ {
std::lock_guard<std::mutex> l(priv_->lock_); std::lock_guard l(priv_->lock_);
return priv_->stop(); return priv_->stop();
} }

View File

@ -31,6 +31,7 @@
#include <iostream> #include <iostream>
#include <cstring> #include <cstring>
#include <vector>
#include <xapian.h> #include <xapian.h>
#include "mu-store.hh" #include "mu-store.hh"
@ -206,9 +207,12 @@ struct Store::Private {
contacts_.serialize()); contacts_.serialize());
}); });
} }
g_debug("committing transaction (n=%zu)", transaction_size_); g_debug("committing transaction (n=%zu,%zu)",
transaction_size_, metadatas_.size());
xapian_try([this] { xapian_try([this] {
writable_db().commit_transaction(); writable_db().commit_transaction();
for (auto&& mdata : metadatas_)
writable_db().set_metadata(mdata.first, mdata.second);
transaction_size_ = 0; transaction_size_ = 0;
}); });
} }
@ -278,6 +282,10 @@ struct Store::Private {
Xapian::docid add_or_update_msg(Xapian::docid docid, MuMsg* msg); Xapian::docid add_or_update_msg(Xapian::docid docid, MuMsg* msg);
Xapian::Document new_doc_from_message(MuMsg* msg); Xapian::Document new_doc_from_message(MuMsg* msg);
/* metadata to write as part of a transaction commit */
using StringPair = std::pair<std::string, std::string>;
std::vector<StringPair> metadatas_;
const bool read_only_{}; const bool read_only_{};
std::unique_ptr<Xapian::Database> db_; std::unique_ptr<Xapian::Database> db_;
@ -501,7 +509,7 @@ Store::dirstamp(const std::string& path) const
} }
void void
Store::set_dirstamp(const std::string& path, time_t tstamp) Store::set_dirstamp(const std::string& path, time_t tstamp, bool use_transaction)
{ {
std::lock_guard guard{priv_->lock_}; std::lock_guard guard{priv_->lock_};
@ -510,9 +518,15 @@ Store::set_dirstamp(const std::string& path, time_t tstamp)
const auto len = static_cast<size_t>( const auto len = static_cast<size_t>(
g_snprintf(data.data(), data.size(), "%zx", tstamp)); g_snprintf(data.data(), data.size(), "%zx", tstamp));
xapian_try([&] { /* set_metadata is not otherwise part of a "transaction" but we want it
priv_->writable_db().set_metadata(path, std::string{data.data(), len}); * to be so, so a dirstamp _only_ gets updated when the messages in the
}); * dir are commited */
Private::StringPair item{path, std::string{data.data(), len}};
if (use_transaction)
priv_->metadatas_.emplace_back(std::move(item));
else
xapian_try([&] { priv_->writable_db().set_metadata(item.first, item.second); });
} }
MuMsg* MuMsg*

View File

@ -303,8 +303,10 @@ public:
* *
* @param path a filesystem path * @param path a filesystem path
* @param tstamp the timestamp for that path * @param tstamp the timestamp for that path
* @param whether to do this as part of a transaction
*/ */
void set_dirstamp(const std::string& path, time_t tstamp); void set_dirstamp(const std::string& path, time_t tstamp,
bool use_transaction = false);
/** /**
* Get the number of documents in the document database * Get the number of documents in the document database