From cdb619e4f59b74d9a504df9244f726345c74f4e0 Mon Sep 17 00:00:00 2001 From: Daniel Colascione Date: Sat, 22 Feb 2025 14:48:22 -0500 Subject: [PATCH] Improve performance of index cleanup: use readdir(3), not access(2) This change makes index cleanup ~4x faster by changing how we determine whether a file mentioned by the database still exists on disk. Previously, we'd call access(2) for each file the database mentioned. Doing so produced a lot of system call overhead. Now, we read the directory entries of the directories containing the files whose existence we're checking, build a hash table from what we find, then do the existence check against this hash table instead of entering the kernel. The semantics of the cleanup check do change subtly, however. Previously, we checked whether the mentioned file was *readable*. Now we check merely that it exists. Extant but unreadable files in maildirs should be rare. BEFORE: $ time mu index --lazy-check lazily indexing maildir /home/dancol/Mail -> store /home/dancol/.cache/mu/xapian / indexing messages; checked: 0; updated/new: 0; cleaned-up: 0 real 0m19.310s user 0m1.803s sys 0m12.999s AFTER: $ time mu --debug index --lazy-check lazily indexing maildir /home/dancol/Mail -> store /home/dancol/.cache/mu/xapian - indexing messages; checked: 0; updated/new: 0; cleaned-up: 0 real 0m4.584s user 0m2.433s sys 0m2.133s --- lib/mu-indexer.cc | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/lib/mu-indexer.cc b/lib/mu-indexer.cc index 65fc0c6a..0e1d8e72 100644 --- a/lib/mu-indexer.cc +++ b/lib/mu-indexer.cc @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include using namespace std::chrono_literals; @@ -280,9 +282,37 @@ Indexer::Private::cleanup() size_t n{}; std::vector orphans; // store messages without files. 
+	// Cache: directory path -> names of the entries readdir(3) reported for it.
+	using DirFiles = std::unordered_set<std::string>;
+	std::unordered_map<std::string, DirFiles> dir_cache;
+	// Entry names in 'path'; if opendir(3) fails the set is empty, so all its files look missing.
+	auto get_dir_files = [](const std::string& path) -> DirFiles {
+		DirFiles ret;
+		auto dir{::opendir(path.c_str())};
+		if (dir) {
+			struct dirent* dentry;
+			while ((dentry = ::readdir(dir))) {
+				ret.emplace(dentry->d_name);
+			}
+			::closedir(dir);
+		}
+
+		return ret;
+	};
+	// True if basename(path) is among the cached entry names of dirname(path).
+	auto is_file_present = [&](const std::string& path) -> bool {
+		std::string dir = dirname(path);
+		auto [it, inserted] = dir_cache.try_emplace(dir);
+		DirFiles& dir_files = it->second;
+		if (inserted) { // first lookup in this directory: list it once
+			dir_files = get_dir_files(dir);
+		}
+		return dir_files.find(basename(path)) != dir_files.end();
+	};
+
 	store_.for_each_message_path([&](Store::Id id, const std::string& path) {
 		++n;
-		if (::access(path.c_str(), R_OK) != 0) {
+		if (!is_file_present(path)) {
 			mu_debug("cannot read {} (id={}); queuing for removal from store",
 				 path, id);
 			orphans.emplace_back(id);