/*
** Copyright (C) 2021 Dirk-Jan C. Binnema
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/

#include "mu-query-threads.hh"

// NOTE(review): the targets of the following #include directives, as well as
// several template argument lists further down (std::set, Option,
// std::unordered_map, the "template" headers), appear to be missing in this
// copy of the file -- TODO restore them from version control.
#include #include #include #include #include #include

using namespace Mu;

/// A node in the message-thread tree.
///
/// A Container optionally refers to a query match (query_match), knows its
/// parent (if any) and keeps pointers to its children in a set that is ordered
/// by compare(). Containers can neither be copied nor moved; they refer to
/// each other through raw pointers (parent / children), and this struct never
/// deletes any of them.
struct Container {
	/// The set of child pointers; both constructors pass &compare as its
	/// comparison function.
	using children_type = std::set;

	/// Create a container without a query match.
	Container(): children{&compare} {}
	/// Create a container for the given query match.
	Container(Option msg): query_match{msg}, children{&compare} {}

	// Other containers point at this one, so it has to stay where it is.
	Container(const Container&) = delete;
	Container(Container&&) = delete;

	/// Put this container under new_parent, detaching it from its current
	/// parent (if any) first.
	///
	/// NOTE(review): the second assert dereferences new_parent before the
	/// null checks below, while the else-branch suggests that a null
	/// new_parent is meant to be accepted -- confirm whether nullptr is a
	/// supported argument.
	void set_parent (Container* new_parent) {
		assert(this != new_parent);
		assert(!new_parent->is_reachable(this));

		if (new_parent == parent)
			return;

		// remove_child() also resets this->parent.
		if (parent)
			parent->remove_child(*this);

		// add_child() sets this->parent.
		if (new_parent)
			new_parent->add_child(*this);
		else
			parent = new_parent;

		assert(this->parent != this);
	}

	/// Make new_child a child of this container; new_child must not have a
	/// parent yet (asserted).
	void add_child (Container& new_child) {
		assert(!new_child.parent);
		new_child.parent = this;
		children.emplace(&new_child);
	}

	/// Hand all children over to this container's parent (or leave them
	/// parentless when there is no parent), detach this container from its
	/// parent and mark it as nuked.
	void promote_children () {
		for_each_child([&](auto&& child){
			// clear the parent first; add_child() asserts that
			// the child has none.
			child->parent = {};
			if (parent)
				parent->add_child(*child);
		});
		children.clear();

		// remove_child() resets this->parent, which the assert below
		// relies on.
		if (parent)
			parent->remove_child(*this);

		is_nuked = true;

		assert(!parent);
		assert(children.empty());
	}

	/// Detach child from this container and reset its parent pointer;
	/// child must currently be a child of this container (asserted).
	void remove_child (Container& child) {
		assert(has_child(child));
		child.parent = {};
		children.erase(&child);
		assert(!has_child(child));
	}

	/// Is child one of the direct children of this container?
	bool has_child (Container& child) const {
		return children.find(&child) != children.cend();
	}

	/// Do this container and other have the same top-most ancestor, i.e.,
	/// are they part of the same tree? other must not be null.
	bool is_reachable(Container* other) const {
		return ur_parent() == other->ur_parent();
	}

	/// Let this container, which must not have a query match yet, use the
	/// query match of other, and mark it as borrowed (so is_empty() still
	/// reports true for this container).
	void borrow_query_match (Container& other) {
		assert(!query_match);
		assert(other.query_match);
		query_match = other.query_match;
		is_borrowed_query_match = true;

		if (parent) {
			// and renew (for sorting)
			// remove_child() resets this->parent, so remember the
			// parent before removing.
			auto p{parent};
			parent->remove_child(*this);
			p->add_child(*this);
			assert(parent->has_child(*this));
		}
	}

	/// Call func with each child pointer, in set order.
	///
	/// The iterator to the next child is taken before func is called, so
	/// func may cause the current child to be removed from the set.
	template void for_each_child (Func&& func) {
		auto it{children.begin()};
		while (it != children.end()) {
			auto next = std::next(it);
			func(*it);
			it = next;
		}
	}

	/// A container is "empty" when it has no query match of its own: either
	/// none at all, or only a borrowed one.
	bool is_empty() const {
		return !query_match || is_borrowed_query_match;
	}

	Option		query_match;		   // the query match, if any
	bool		is_borrowed_query_match{}; // set by borrow_query_match()
	bool		is_nuked{};		   // set when the container was pruned
	Container*	parent{};		   // parent container, or null
	children_type	children;		   // child containers, ordered by compare()

private:
	/// Get the top-most ancestor of this container; this is the container
	/// itself when it has no parent.
	const Container* ur_parent() const {
		assert(this->parent != this);
		return parent ? parent->ur_parent() : this;
	}

	/// Ordering function for the children set.
	///
	/// When both containers have a query match, they are ordered by their
	/// date_key (compared with strcmp); when the date keys are equal or
	/// when either container lacks a query match, the pointer values
	/// decide.
	///
	/// NOTE(review): mixing the date ordering with the pointer ordering
	/// may not give a strict weak ordering when a set holds containers
	/// with and without a query match -- verify.
	static bool compare(const Container *c1, const Container *c2) {
		if (c1->query_match && c2->query_match) {
			const auto cmp{std::strcmp(c1->query_match->date_key.c_str(),
						   c2->query_match->date_key.c_str())};
			if (cmp != 0)
				return cmp < 0;
		}

		return c1 < c2;
	}
};

/// Write a description of container to os (for debugging): its address, the
/// addresses of its parent and children, the nuked / borrowed markers and, if
/// present, the query match.
static std::ostream&
operator<<(std::ostream& os, const Container& container)
{
	os << "container: " << std::right << std::setw(10) << &container
	   << ": parent: " << std::right << std::setw(10) << container.parent
	   << "\n children: ";

	for (auto&& c: container.children)
		os << std::right << std::setw(10) << c << " ";

	os << (container.is_nuked ? " nuked" : "")
	   << (container.is_borrowed_query_match ? " borrowed" : "");

	if (container.query_match)
		os << "\n " << container.query_match.value();

	return os;
}

/// The id-table maps an id (a message-id, a reference, or a path when the
/// message-id is missing or a duplicate) to its Container. The containers are
/// stored in the table itself; determine_id_table() links them through
/// pointers to the table's elements.
using IdTable = std::unordered_map;

/// Build the id-table for the query results and link the containers to their
/// parents as implied by the references of each result.
///
/// @param qres the query results; the code below uses message_id(), path(),
///   query_match(), opt_string() and references() of each item
/// @param sortfield_id field whose string value becomes the sort_key of each
///   query match; the sort_key is left untouched for MU_MSG_FIELD_ID_NONE
///
/// @return the id-table with all containers
template static IdTable
determine_id_table (QueryResultsType& qres, MuMsgFieldId sortfield_id)
{
	// 1. For each query_match
	IdTable id_table;
	for (auto&& mi: qres) {
		// results without a message-id are indexed by their path.
		const auto msgid{mi.message_id().value_or(*mi.path())};

		// 1.A If id_table contains an empty Container for this ID:
		// Store this query_match (query_match) in the Container's query_match (value) slot.
		//
		// NOTE(review): the "x", "c" and "y" thread-paths assigned
		// below look like markers for the branch that was taken;
		// update_container_query_match() overwrites thread_path later
		// for non-empty containers -- confirm they have no other use.
		auto c_it = id_table.find(msgid);
		if (c_it != id_table.end()) {
			if (!c_it->second.query_match) {
				// NOTE(review): this container was created for
				// a reference and may already sit in its
				// parent's children set; giving it a query
				// match (and a date_key below) changes what
				// Container::compare() sees for an element
				// that is already in the set -- verify that
				// this cannot break the set's ordering.
				c_it->second.query_match = mi.query_match();
				c_it->second.query_match->thread_path = "x";
			} else {
				/* special case, not in the JWZ algorithm: the container
				 * exists already and has a query_match (query-match); this
				 * means that we are seeing *another query_match* with a
				 * query_match-id we already saw... create this query_match, and
				 * mark it as a duplicate; use its path as the fake
				 * query_match-id */
				c_it = id_table.emplace(*mi.path(), mi.query_match()).first;
				c_it->second.query_match->flags |= QueryMatch::Flags::Duplicate;
				c_it->second.query_match->thread_path = "c";
			}
		} else {
			// Else:
			// Create a new Container object holding this query_match (query-match);
			// Index the Container by Query_Match-ID
			c_it = id_table.emplace(msgid, mi.query_match()).first;
			c_it->second.query_match->thread_path = "y";
		}

		Container& container{c_it->second};

		// We sort by date (ascending), *except* for the root; we don't
		// know what query_matchs will be at the root level yet, so remember
		// both. Moreover, even when sorting the top-level in descending
		// order, still sort the thread levels below that in ascending
		// order.
		if (sortfield_id != MU_MSG_FIELD_ID_NONE)
			container.query_match->sort_key =
				mi.opt_string(sortfield_id).value_or("");
		container.query_match->date_key =
			mi.opt_string(MU_MSG_FIELD_ID_DATE).value_or("");

		// 1.B
		// For each element in the query_match's References field:
		Container* parent_ref_container{};
		for (const auto& ref: mi.references()) {
			// grand_-parent -> grand_-parent -> ... -> parent.

			// Find a Container object for the given Query_Match-ID; If it exists, use it;
			// otherwise make one with a null Query_Match.
			auto ref_container = [&]()->Container* {
				auto ref_it = id_table.find(ref);
				if (ref_it == id_table.end())
					ref_it = id_table.emplace(ref,Nothing).first;
				return &ref_it->second;
			}();

			// Link the References field's Containers together in the order implied
			// by the References header.
			// * If they are already linked, don't change the existing links.
			//
			// * Do not add a link if adding that link would introduce a loop: that is,
			//   before asserting A->B, search down the children of B to see if A is
			//   reachable, and also search down the children of A to see if B is
			//   reachable. If either is already reachable as a child of the other,
			//   don't add the link.
			//
			// (is_reachable() checks for this by comparing the
			// top-most ancestors of both containers.)
			if (parent_ref_container && !ref_container->parent &&
			    !parent_ref_container->is_reachable(ref_container))
				parent_ref_container->add_child(*ref_container);

			parent_ref_container = ref_container;
		}

		// Add the query_match to the chain.
		// (i.e., below the container of the last reference, unless
		// it already has a parent or this would introduce a loop.)
		if (parent_ref_container && !container.parent &&
		    !parent_ref_container->is_reachable(&container)) {
			parent_ref_container->add_child(container);
		}
	}

	return id_table;
}

/// Recursively walk all containers under the root set.
/// For each container:
///
///     If it is an empty container with no children, nuke it.
///
/// Note: Normally such containers won't occur, but they can show up when two
/// query_matchs have References lines that disagree. For example, assuming A and
/// B are query_matchs, and 1, 2, and 3 are references for query_matchs we haven't
/// seen:
///
///         A has references: 1, 2, 3
///         B has references: 1, 3
///
/// There is ambiguity as to whether 3 is a child of 1 or of 2. So,
/// depending on the processing order, we might end up with either
///
///     -- 1
///        |-- 2
///            \-- 3
///                |-- A
///                \-- B
///
/// or
///
///     -- 1
///        |-- 2            <--- non root childless container!
///        \-- 3
///            |-- A
///            \-- B
///
/// If the Container has no Query_Match, but does have children, remove this
/// container but promote its children to this level (that is, splice them in
/// to the current child list.)
/// /// Do not promote the children if doing so would promote them to the root /// set -- unless there is only one child, in which case, do. static void prune_empty_containers (Container& container) { container.for_each_child([](auto&& child){prune_empty_containers(*child);}); // Never nuke these. if (!container.is_empty()) return; if (container.children.empty()) { // If it is an empty container with no children, nuke it. if (container.parent) container.parent->remove_child(container); container.is_nuked = true; return; } // If the Container is empty, but does have children, remove this // container but promote its children to this level (that is, splice them in // to the current child list.) // // Do not promote the children if doing so would promote them to the root // set -- unless there is only one child, in which case, do. //const auto rootset_child{!container.parent->parent}; if (container.parent || container.children.size() == 1) { container.promote_children(); container.is_nuked = true; } else if (!container.children.empty()){ // so an empty container with children. Copy the query info of the first // child, for sorting -- so the sort key "bubbles up". Renew // it so the sorting workes out. auto& first_child{*container.children.begin()}; container.borrow_query_match(*first_child); } } static void prune_empty_containers (IdTable& id_table) { for (auto&& item: id_table) { if (!item.second.parent) prune_empty_containers(item.second); } } /// Sorting. /// /// We start the sorting from the rout-vec, ie. the set of of parentless conainers. /// /// We need to sort the rootset by whatever the sortkey is (subject, date, ...); however under the /// rotset we stricly sort in ascending order by date. Containers with empty query_matchs have the /// sort key from the first of their children (recursively). // // Note, children are already stored in a (sorted) std::set, based on their date. 
// That's correct for
// all but the top-level (root) containers; so, we just need to fix those.
//
// the root_vec is the sorted vec of top-level (parent-less) containers.
using RootVec = std::vector;

/// Collect the top-level containers of the id-table and sort them by the
/// sort_key of their query match.
///
/// Containers are included when they have a query match (their own or a
/// borrowed one), have no parent and are not nuked.
///
/// @param id_table the id-table
/// @param descending only used when BUILD_TESTS is defined: sort in descending
///   instead of ascending order
///
/// @return the sorted top-level containers
static RootVec
determine_root_vec(IdTable& id_table, bool descending)
{
	RootVec root_vec;
	for (auto&& item: id_table) {
		// (c is the address of a table element, so the !c check
		// below never triggers.)
		Container* c{&item.second};
		if (!c || !c->query_match || c->parent || c->is_nuked)
			continue;
		root_vec.emplace_back(c);
	}

	// NOTE(review): the comparison function takes its arguments by
	// non-const reference although it does not modify them -- consider
	// const.
	std::sort(root_vec.begin(), root_vec.end(),
		  [&](Container*& c1, Container*& c2)->bool {
#ifdef BUILD_TESTS
			  if (descending)
				  return c2->query_match->sort_key < c1->query_match->sort_key;
			  else
				  return c1->query_match->sort_key < c2->query_match->sort_key;
#else
			  // the non-testing case, the "descending" part is handled
			  // in the "decider"
			  return c1->query_match->sort_key < c2->query_match->sort_key;
#endif /*BUILD_TESTS*/
		  });

	return root_vec;
}

/// Set the flags, thread_path and thread_level of the query match of container.
///
/// @param container the container to update; nothing happens when it is empty
///   (no query match, or only a borrowed one)
/// @param pvec the thread path of the container, one number per level; when
///   descending is set and the container has a parent, the last element is
///   replaced by its inverse, and stays that way for the caller
/// @param segment_size the number of hex digits per path element
/// @param descending whether the results are sorted in descending order
///
/// @return true if the query match was updated, false otherwise
static bool
update_container_query_match (Container& container, ThreadPathVec& pvec,
			      size_t segment_size, bool descending)
{
	if (container.is_empty())
		return false; // nothing to update.

	auto& qmatch{*container.query_match};

	if (!container.parent)
		qmatch.flags |= QueryMatch::Flags::Root;
	else if (container.parent->is_empty())
		qmatch.flags |= QueryMatch::Flags::Orphan;

	if (!container.children.empty())
		qmatch.flags |= QueryMatch::Flags::HasChild;

	if (descending && container.parent) {
		// trick xapian by giving it "inverse" sorting key so our
		// ascending-date sorted threads stay in that order
		//
		// (1U << (4 * segment_size)) - 1 is the largest number that
		// fits in segment_size hex digits.
		pvec.back() = ((1U << (4 * segment_size)) - 1) - pvec.back();
	}

	// presumably to_string() renders each element of pvec with
	// segment_size hex digits, separated by ':' (cf. the thread paths the
	// unit tests expect) -- TODO confirm against its definition.
	qmatch.thread_path  = to_string(pvec, segment_size);
	qmatch.thread_level = pvec.size() - 1;

	// ensure thread root comes before its children
	if (descending)
		qmatch.thread_path += ":z";

	return true;
}

/// Recursively assign thread paths (and the First / Last flags) to siblings and
/// everything below them.
///
/// The siblings are numbered in the order of the set, i.e. as determined by
/// Container::compare().
///
/// @param siblings the containers that share a parent
/// @param parent_path_vec the thread path of that parent
/// @param segment_size the number of hex digits per path element
/// @param descending whether the results are sorted in descending order
static void
sort_siblings (Container::children_type& siblings,
	       const ThreadPathVec& parent_path_vec,
	       size_t segment_size, bool descending)
{
	if (siblings.empty())
		return;
	else {
		// flag the first and the last of the siblings (which can be
		// one and the same container).
		const auto first{*siblings.begin()};
		if (first->query_match)
			first->query_match->flags |= QueryMatch::Flags::First;

		const auto last{*(--siblings.end())};
		if (last->query_match)
			last->query_match->flags |= QueryMatch::Flags::Last;
	}

	size_t		idx{0};
	ThreadPathVec	thread_path_vec{parent_path_vec};
	for (auto&& c: siblings) {
		thread_path_vec.emplace_back(idx++);
		// this may invert the last path element (descending); the
		// children then get their paths below the inverted one.
		update_container_query_match (*c, thread_path_vec,
					      segment_size, descending);
		if (!c->children.empty())
			sort_siblings (c->children, thread_path_vec,
				       segment_size, descending);
		thread_path_vec.pop_back();
	}
}

/// Assign thread paths to all containers in the id-table that can be reached
/// from a top-level container (see determine_root_vec()).
///
/// @param id_table the id-table
/// @param descending whether the results are sorted in descending order
static void
sort_siblings (IdTable& id_table, bool descending)
{
	if (id_table.empty())
		return;

	auto root_vec{determine_root_vec(id_table, descending)}; // sorted
	//std::cerr << "rvs" << root_vec.size() << "\n";

	// the number of hex digits needed to number as many items as there are
	// in the id-table.
	// NOTE(review): this is 0 for a table with one entry (log2(1) == 0);
	// verify that to_string() handles a zero segment size.
	const auto seg_size = static_cast( std::ceil(std::log2(id_table.size())/4.0));
	/*note: 4 == std::log2(16)*/

	ThreadPathVec path_vec;
	auto idx{0U};
	for (auto&& c: root_vec) {
		path_vec.emplace_back(idx++);
		update_container_query_match (*c, path_vec, seg_size, descending);
		sort_siblings (c->children, path_vec, seg_size, descending);
		path_vec.pop_back();
	}
}

/// Write the id-table to os (for debugging), one entry per distinct
/// thread-path, ordered by thread-path.
static std::ostream&
operator<<(std::ostream& os, const IdTable& id_table)
{
	// gather the thread-paths of all containers with a query match; the
	// set sorts them and drops the duplicates.
	std::set ids;
	for (auto&& item: id_table) {
		if (item.second.query_match)
			ids.emplace(item.second.query_match->thread_path);
	}

	for (auto&& id: ids) {
		// a linear search for the first entry with this thread-path.
		auto it = std::find_if(id_table.begin(), id_table.end(),
				       [&](auto&& item) {
					       return item.second.query_match &&
						       item.second.query_match->thread_path == id;
				       });
		assert(it != id_table.end());
		os << it->first << ": " << it->second << '\n';
	}

	return os;
}

/// Calculate the threads for the query results; this updates the query match of
/// each result (flags, sort_key, date_key, thread_path, thread_level).
///
/// @param qres the query results
/// @param sort_field the field to sort the top-level containers by
/// @param descending whether the results are sorted in descending order
template static void
calculate_threads_real (Results& qres, MuMsgFieldId sort_field, bool descending)
{
	// Step 1: build the id_table
	auto id_table{determine_id_table(qres, sort_field)};

	// // Step 2: get the root set
	// // Step 3: discard id_table
	// Nope: id-table owns the containers.

	// Step 4: prune empty containers
	prune_empty_containers(id_table);

	// Step 5: group root-set by subject.
	// Not implemented.

	// Step 6: we're done threading

	// Step 7: sort siblings. The segment-size is the number of hex-digits
	// in the thread-path string (so we can lexically compare them.)
	sort_siblings(id_table, descending);

	if (g_test_verbose())
		std::cout << "*** id-table:\n" << id_table << "\n";
}

/// Calculate the threads for qres; forwards to calculate_threads_real().
void
Mu::calculate_threads (Mu::QueryResults& qres, MuMsgFieldId sort_field,
		       bool descending)
{
	calculate_threads_real(qres, sort_field, descending);
}

#ifdef BUILD_TESTS

/// A stand-in for a query result in the unit tests; it offers the member
/// functions that determine_id_table() uses.
struct MockQueryResult {
	/// Create a mock result with the given message-id, sort key, date key
	/// and references.
	MockQueryResult(const std::string& message_id_arg,
			const std::string& sort_key_arg,
			const std::string& date_key_arg,
			const std::vector& refs_arg={}):
		message_id_{message_id_arg},
		sort_key_{sort_key_arg},
		date_key_{date_key_arg},
		refs_{refs_arg}
		{}
	/// Create a mock result with empty sort and date keys.
	MockQueryResult(const std::string& message_id_arg,
			const std::vector& refs_arg={}):
		MockQueryResult(message_id_arg, "", "", refs_arg) {}

	Option message_id() const { return message_id_;}
	Option path() const { return path_;}

	QueryMatch& query_match() { return query_match_;}
	const QueryMatch& query_match() const { return query_match_;}

	const std::vector& references() const { return refs_;}

	/// Get the date key for MU_MSG_FIELD_ID_DATE, and the sort key for any
	/// other field.
	Option opt_string(MuMsgFieldId id) const {
		if (id == MU_MSG_FIELD_ID_DATE)
			return date_key_;
		else
			return sort_key_;
	}

	Option		path_{"/"};	// the same path for every mock result
	std::string	message_id_;
	QueryMatch	query_match_{};	// updated by the threading code
	std::string	sort_key_;
	std::string	date_key_;
	std::vector	refs_;
};

using MockQueryResults = std::vector;

/// Write the thread-path and message-id of each mock result to os (for
/// debugging).
G_GNUC_UNUSED static std::ostream&
operator<<(std::ostream& os, const MockQueryResults& qrs)
{
	for (auto&& mi: qrs)
		os << mi.query_match().thread_path << " :: "
		   << mi.message_id().value_or("") << std::endl;

	return os;
}

/// Calculate the threads for the mock results; forwards to
/// calculate_threads_real().
static void
calculate_threads (MockQueryResults& qres, MuMsgFieldId sort_field, bool descending)
{
	calculate_threads_real(qres, sort_field, descending);
}

// pairs of a message-id and the thread-path that is expected for it.
using Expected = std::vector>;

/// Check that each message-id in expected is present in qrs and that its query
/// match has the expected thread-path.
static void
assert_thread_paths (MockQueryResults& qrs, const Expected& expected)
{
	for (auto&& exp: expected) {
		auto it = std::find_if(qrs.begin(), qrs.end(), [&](auto&& qr){
			return qr.message_id().value_or("") == exp.first;
		});
		g_assert_true (it != qrs.end());
		g_assert_cmpstr(exp.second.c_str(), ==,
				it->query_match().thread_path.c_str());
	}
}
/// Test a simple chain (m3 -> m2 -> m1) next to an unrelated message (m4), in
/// ascending and in descending order.
static void
test_basic()
{
	auto results = MockQueryResults {
		MockQueryResult{ "m1", "a", "1", {"m2"} },
		MockQueryResult{ "m2", "b", "2", {"m3"} },
		MockQueryResult{ "m3", "c", "3", {}},
		MockQueryResult{ "m4", "d", "4", {}}
	};

	calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);

	assert_thread_paths (results, {
			{ "m1", "0:0:0"},
			{ "m2", "0:0" },
			{ "m3", "0" },
			{ "m4", "1" }
		});

	// descending: the top-level order is reversed, the path elements
	// below the top-level are inverted (0 becomes f), and each path gets
	// a ":z" suffix.
	calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, true);

	assert_thread_paths (results, {
			{ "m1", "1:f:f:z"},
			{ "m2", "1:f:z" },
			{ "m3", "1:z" },
			{ "m4", "0:z" }
		});
}

/// Test the pruning of containers for references that are not among the
/// results (m6 and m7 below).
static void
test_prune_empty_containers()
{
	{
		// m7 should not be nuked
		// (it is at the top-level and has two children.)
		auto results = MockQueryResults {
			MockQueryResult{ "x1", "a", "1", {"m7"} },
			MockQueryResult{ "x2", "b", "2", {"m7"} },
		};

		calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);

		assert_thread_paths (results, {
				{ "x1", "0:0"},
				{ "x2", "0:1" },
			});
	}

	{
		// m7 should be nuked
		// (it has only one child, which then becomes top-level.)
		auto results = MockQueryResults {
			MockQueryResult{ "m1", "a", "1", {"m7"} },
		};

		calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);

		assert_thread_paths (results, {
				{ "m1", "0"},
			});
	}

	{
		// m6 should be nuked
		// (it is not at the top-level, so its children move up to m7.)
		auto results = MockQueryResults {
			MockQueryResult{ "m1", "a", "1", {"m7", "m6"} },
			MockQueryResult{ "m2", "b", "2", {"m7", "m6"} },
		};

		calculate_threads(results, MU_MSG_FIELD_ID_SUBJECT, false);

		assert_thread_paths (results, {
				{ "m1", "0:0"},
				{ "m2", "0:1" },
			});
	}

	{
		// m6 should be nuked
		// (as before, but with a longer chain of references and
		// sorted by date.)
		auto results = MockQueryResults {
			MockQueryResult{ "m1", "a", "1",
					 {"m28uszf59m.fsf@damtp.cam.ac.uk",
					  "CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com",
					  "m2lhwxevpt.fsf@damtp.cam.ac.uk"} },
			MockQueryResult{ "m2", "b", "2",
					 {"m28uszf59m.fsf@damtp.cam.ac.uk",
					  "CAP8THHWFDR9fJynKJHiRLayBo8wNiOCK6ghbgOK6rHboQKjDqA@mail.gmail.com",
					  "m2lhwxevpt.fsf@damtp.cam.ac.uk"} },
		};

		calculate_threads(results, MU_MSG_FIELD_ID_DATE, false);

		assert_thread_paths (results, {
				{ "m1", "0:0"},
				{ "m2", "0:1" },
			});
	}
}

/// Test references that contradict each other: messages that refer to each
/// other, a message that refers to itself and a repeated reference. The
/// expected thread paths show that the result is still a set of trees.
static void
test_id_table_inconsistent()
{
	auto results = MockQueryResults {
		MockQueryResult{ "m1", "a", "1", {"m2"} },
		MockQueryResult{ "m2", "b", "2", {"m1"} },
		MockQueryResult{ "m3", "c", "3", {"m3"} }, // self ref
		MockQueryResult{ "m4", "d", "4", {"m3", "m5"} },
		MockQueryResult{ "m5", "e", "5", {"m4", "m4"} }, // dup parent
	};

	calculate_threads(results, MU_MSG_FIELD_ID_DATE, false);
	assert_thread_paths (results, {
			{ "m2", "0"},
			{ "m1", "0:0" },
			{ "m3", "1"},
			{ "m5", "1:0" },
			{ "m4", "1:0:0"},
		});
}

/// Entry point of the unit tests: register the test functions and run them.
/// An exception that escapes is reported on stderr, with exit code 1.
int
main (int argc, char *argv[]) try
{
	g_test_init (&argc, &argv, NULL);

	g_test_add_func ("/threader/basic", test_basic);
	g_test_add_func ("/threader/prune-empty-containers", test_prune_empty_containers);
	g_test_add_func ("/threader/id-table-inconsistent", test_id_table_inconsistent);

	return g_test_run ();

} catch (const std::runtime_error& re) {
	std::cerr << re.what() << "\n";
	return 1;
} catch (...) {
	std::cerr << "caught exception\n";
	return 1;
}

#endif /*BUILD_TESTS*/