* semi-working message-threading (WIP)

This commit is contained in:
Dirk-Jan C. Binnema
2011-06-18 18:47:13 +03:00
parent d0d8356248
commit a3ec83b96d
2 changed files with 485 additions and 188 deletions

View File

@ -1,5 +1,4 @@
/* -*-mode: c; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-*/
/*
** Copyright (C) 2011 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
@ -20,241 +19,493 @@
*/
#include "mu-msg-threader.h"
#include "mu-str.h"
/* Threader object; it only holds the table of containers. */
struct _MuMsgThreader {
/* message-id string => Container*; mu_msg_threader_new creates it with
 * g_str_hash/g_str_equal, no key destructor, and container_destroy as
 * the value destructor */
GHashTable *_ids;
};
/* Verdict of the pruning pass for a container:
 *   NUKE   - the container is removed from its parent's child list
 *   SPLICE - the container is removed, after its children have been
 *            moved up to its parent
 *   OKAY   - the container is kept */
enum _ContainerState { NUKE, SPLICE, OKAY };
typedef enum _ContainerState ContainerState;
/* A node in the thread tree. */
struct _Container {
/* the message held by this container; NULL for a container that was
 * only created because another message referred to its message-id */
MuMsg *_msg;
/* the container this one is a reply to, or NULL when it has none */
struct _Container *_parent;
/* list of Container* that have this container as their parent */
GSList *_children;
};
typedef struct _Container Container;
static Container *container_new (MuMsg *msg);
MuMsg *_msg;
unsigned int _docid;
struct _Container *_parent;
GSList *_children;
ContainerState _state;
};
typedef struct _Container Container;
static Container *container_new (MuMsg *msg, unsigned docid);
static void container_destroy (Container *c);
static void container_dump (Container *c);
/* depth-first recursive traversal */
typedef gboolean (*ContainerTraverseFunc) (Container *c, gpointer user_data);
static gboolean container_traverse (Container *c, ContainerTraverseFunc func,
typedef gboolean (*ContainerTraverseFunc) (Container *c, guint level,
gpointer user_data);
static gboolean container_traverse (Container *c,
guint level,
ContainerTraverseFunc func,
gpointer user_data);
static gboolean container_traverse_list (GSList *containers, ContainerTraverseFunc func,
gpointer user_data);
static void container_add_child (Container *c, Container *child);
MuMsgThreader *
mu_msg_threader_new (void)
{
MuMsgThreader *self;
self = g_slice_new (MuMsgThreader);
self->_ids = g_hash_table_new_full (g_str_hash,
g_str_equal,
NULL, /* we don't copy msgid */
(GDestroyNotify)container_destroy);
return self;
}
/*
 * Free a threader and its container table; a NULL threader is ignored.
 */
void
mu_msg_threader_destroy (MuMsgThreader *self)
{
	if (self) {
		/* destroying the table also destroys the containers in it */
		g_hash_table_destroy (self->_ids);
		g_slice_free (MuMsgThreader, self);
	}
}
static void container_add_child (Container *c, Container *child);
static void container_promote_child (Container *c, Container *child);
static void container_remove_child (Container *c, Container *child);
static Container *store_msg_in_container (GHashTable *ids, MuMsg *msg);
static void handle_references (GHashTable *ids, Container *c);
static Container* find_or_create (GHashTable *ids, char *msgid);
static GSList *find_root_set (GHashTable *ids);
/* step 1 */ static GHashTable* create_containers (MuMsgIter *iter);
/* step 2 */ static GSList *find_root_set (GHashTable *ids);
static void prune_empty_containers (GSList *root_set);
/* static void group_root_set_by_subject (GSList *root_set); */
static void dump (GSList *root_set);
GHashTable* create_doc_id_thread_path_hash (GSList *root_set);
/* msg threading algorithm, based on JWZ's algorithm,
* http://www.jwz.org/doc/threading.html */
gboolean
mu_msg_threader_calculate (MuMsgThreader *self, MuMsgIter *iter)
GHashTable*
mu_msg_threader_calculate (MuMsgIter *iter)
{
GHashTable *id_table;
GSList *root_set;
g_return_val_if_fail (iter, FALSE);
/* 1. for all messages... */
for (mu_msg_iter_reset (iter); !mu_msg_iter_is_done (iter);
mu_msg_iter_next (iter)) {
/* step 1 */
id_table = create_containers (iter);
/* step 2 */
root_set = find_root_set (id_table);
/* step 3: skip until the end */
Container *c;
GSList *root_set;
/* 1.A */
c = store_msg_in_container (self->_ids,
mu_msg_iter_get_msg (iter, NULL));
if (!c) {
g_printerr ("failed to create container\n");
continue;
}
/* 1.B and C */
handle_references (self->_ids, c);
/* step 4: prune empty containers */
prune_empty_containers (root_set);
/* step 2 */
root_set = find_root_set (self->_ids);
{
GSList *cur;
for (cur = root_set; cur; cur = g_slist_next (cur)) {
MuMsg *msg;
msg = ((Container*)cur->data)->_msg;
if (!msg)
g_printerr ("<empty root>\n");
else {
const gchar *subj;
subj = mu_msg_get_subject (msg);
g_printerr ("%s\n", subj ? subj : "<no subject>");
}
}
}
/* recalculate root set */
root_set = find_root_set (id_table);
g_printerr ("ROOT SET\n");
dump (root_set);
g_printerr ("===\n");
}
/* step 5: group root set by subject */
// group_root_set_by_subject (root_set);
mu_msg_iter_reset (iter); /* go all the way back */
return TRUE;
g_hash_table_destroy (id_table); /* step 3*/
/* finally, deliver the docid => thread-path hash */
return create_doc_id_thread_path_hash (root_set);
}
static Container* /* 1A */
store_msg_in_container (GHashTable *ids, MuMsg *msg)
static gboolean
check_exists (Container *c, guint level, Container *c2)
{
/* 1.A */
Container *c;
const char *msgid;
msgid = mu_msg_get_msgid (msg);
g_return_val_if_fail (msgid, NULL);
c = find_or_create (ids, (gchar*)msgid);
if (!c->_msg)
c->_msg = mu_msg_ref (msg);
else
g_printerr ("%s: duplicate msgid; ignore\n",
__FUNCTION__); /* FIXME */
return c;
return c != c2;
}
/*
 * Tell whether c2 is c1 itself or one of its descendants.
 *
 * check_exists returns FALSE as soon as it meets c2, which makes
 * container_traverse stop and return FALSE; a completed traversal
 * (TRUE) therefore means c2 was not found.
 *
 * Returns TRUE when c2 was found (a warning is logged), FALSE otherwise.
 */
static gboolean
already_referenced (Container *c1, Container *c2)
{
	if (container_traverse ((Container*)c1, 0,
				(ContainerTraverseFunc)check_exists, c2))
		return FALSE; /* walked the whole subtree, no c2 */

	g_warning ("*** c2 found already!!");
	return TRUE;
}
static void
set_parent_child (Container *parent, Container *child)
{
/* FIXME: set relationship, but first check if they're not
* already linked */
if (already_referenced(child, parent))
return;
container_add_child (parent, child);
child->_parent = parent;
g_printerr ("%s: %p is a parent of %p\n", __FUNCTION__,
g_print ("%s: %p <--- %p\n", __FUNCTION__,
(gpointer)parent, (gpointer)child);
}
/*
 * Step 1B: link up the containers for the message-ids listed in the
 * References of c's message.
 *
 * The references string is split on ','; every two consecutive
 * references are registered as parent and child, and the last reference
 * is registered as the parent of c itself. Containers that do not exist
 * yet are created through find_or_create.
 *
 * ids: the message-id => Container table
 * c:   the container whose message's references are processed
 */
static void /* 1B */
handle_references (GHashTable *ids, Container *c)
{
	const gchar *refsstr;
	gchar **refs, **cur;

	refsstr = mu_msg_get_references_str (c->_msg);
	if (!refsstr)
		return; /* nothing to do */

	refs = g_strsplit (refsstr, ",", -1);

	/* go over our list of refs */
	for (cur = refs; *cur && *(cur + 1); ++cur) {
		Container *c1, *c2; /* two consecutive refs in the list;
				     * we register them as parent, child */
		c1 = find_or_create (ids, *cur);
		c2 = find_or_create (ids, *(cur + 1));
		set_parent_child (c1, c2);
	}

	/* now cur points to the final ref; we register that with
	 * ourselves... unless there is none: g_strsplit returns an empty
	 * vector for an empty string, and then *cur is NULL, which must
	 * not be used as a message-id */
	if (*cur)
		set_parent_child (find_or_create (ids, *cur), c);

	g_strfreev (refs);
}
/* find a container for the given msgid; if it does not exist yet,
* create a new one, and register it */
static Container*
find_or_create (GHashTable *ids, char *msgid)
find_or_create (GHashTable *id_table, const char* msgid, unsigned docid)
{
Container *c;
c = g_hash_table_lookup (id_table, msgid);
if (!c) {
c = container_new (NULL, docid);
g_hash_table_insert (id_table, (gpointer)msgid, c);
g_warning ("registered: %s", msgid);
} else
g_warning ("already found: %s", msgid);
g_return_val_if_fail (msgid, NULL);
c = g_hash_table_lookup (ids, msgid);
if (c) { /* we found the container? */
g_printerr ("%s: found %s\n", __FUNCTION__, msgid);
return c;
}
/* no container yet, create it */
c = container_new (NULL);
/* no need to copy msgid, as the MuMsg will be around still,
* owning the const gchar* FIXME we must g_strdup for some
* reason */
g_hash_table_insert (ids, g_strdup(msgid), c);
g_printerr ("%s: created %s\n", __FUNCTION__, msgid);
return c;
}
static void
filter_parentless (const gchar *msgid, Container *c, GSList **lst)
static void /* 1B */
handle_references (GHashTable *id_table, Container *c)
{
if (!c->_parent)
const GSList *refs, *cur;
refs = mu_msg_get_references (c->_msg);
if (!refs)
return; /* nothing to do */
/* go over our list of refs */
for (cur = refs; cur && cur->next; cur = g_slist_next (cur)) {
Container *c1, *c2; /* two consecutive refs in the list;
* we register them as parent, child */
c1 = find_or_create (id_table, (gchar*)cur->data, 0);
c2 = find_or_create (id_table, (gchar*)cur->next->data, 0);
set_parent_child (c1, c2);
}
/* now cur points to the final ref, which refers to our own
* parent... register it */
if (cur) {
Container *parent;
parent = find_or_create (id_table, (gchar*)cur->data, 0);
set_parent_child (parent, c);
}
}
/* step 1: create the containers, connect them, and fill the id_table
 *
 * Walks all messages of iter (after resetting it). For each message the
 * container for its message-id is looked up or created (the message's
 * path serves as the key when it has no message-id), the message is
 * attached to the container if it does not hold one yet, and the
 * message's references are linked up through handle_references.
 *
 * Returns a new message-id => Container table; the keys are not copied
 * and the values are released with container_destroy when the table is
 * destroyed.
 */
static GHashTable*
create_containers (MuMsgIter *iter)
{
	GHashTable *id_table;

	id_table = g_hash_table_new_full (g_str_hash,
					  g_str_equal,
					  NULL, /* we don't copy msgid */
					  (GDestroyNotify)container_destroy);

	for (mu_msg_iter_reset (iter); !mu_msg_iter_is_done (iter);
	     mu_msg_iter_next (iter)) {
		Container *c;
		MuMsg *msg;
		unsigned docid;
		const char *msgid;

		/* 1.A */
		msg = mu_msg_iter_get_msg (iter, NULL);
		if (!msg) {
			/* without a message there is nothing to thread */
			g_warning ("%s: failed to get message", __FUNCTION__);
			continue;
		}

		msgid = mu_msg_get_msgid (msg);
		docid = mu_msg_iter_get_docid (iter);

		if (!msgid) {
			const char* path;
			path = mu_msg_get_path(msg);
			if (!path) {
				/* no usable key at all; a NULL key
				 * cannot be hashed as a string */
				g_warning ("%s: msg without msgid and path",
					   __FUNCTION__);
				continue;
			}
			g_warning ("msg without msgid %s", path);
			msgid = path; /* fake it... */
		}

		c = find_or_create (id_table, msgid, docid);
		if (!c->_msg) {
			c->_msg = mu_msg_ref (msg);
			/* a container that was created earlier for a
			 * reference got docid 0; record the real docid
			 * now that its message has shown up */
			c->_docid = docid;
		}

		/* 1.B and C */
		handle_references (id_table, c);
	}

	return id_table;
}
/*
 * g_hash_table_foreach callback: prepend c to *lst when it belongs in
 * the root set, that is, when it has no parent and is not marked NUKE.
 * The key (msgid) is not used.
 */
static void
filter_root_set (const gchar *msgid, Container *c, GSList **lst)
{
	if (c->_parent || c->_state == NUKE)
		return; /* not a root, or about to be dropped */

	*lst = g_slist_prepend (*lst, c);
}
static GSList *
/* 2. Find the root set. Walk over the elements of id_table, and
gather a list of the Container objects that have no parents, but do
have children */
static GSList*
find_root_set (GHashTable *ids)
{
GSList *lst;
lst = NULL;
g_hash_table_foreach (ids, (GHFunc)filter_parentless, &lst);
g_hash_table_foreach (ids, (GHFunc)filter_root_set, &lst);
return lst;
}
/*
 * Apply the pruning verdicts stored in the containers of a child list:
 *   SPLICE - move the container's children up to its parent, then
 *            remove the container from its parent
 *   NUKE   - remove the container from its parent
 * Containers in any other state are left alone.
 *
 * containers: list of Container*; the callers in this file pass the
 *             child list of some container
 */
static void
do_pruning (GSList *containers)
{
	GSList *cur, *next;

	g_warning ("%s", __FUNCTION__);

	/* now, do stuff to our children... */
	for (cur = containers; cur; cur = next) {
		Container *child;

		/* container_remove_child takes the child out of its
		 * parent's child list, which releases the list link; when
		 * that is the list being walked here, 'cur' is gone after
		 * the call, so fetch the next link up front */
		next = g_slist_next (cur);
		child = (Container *)cur->data;

		if (child->_state == SPLICE) {
			g_printerr ("SPLICE %p\n", (void*)child);
			container_promote_child (child->_parent, child);
			container_remove_child (child->_parent, child);
		} else if (child->_state == NUKE) {
			g_printerr ("NUKE %p\n", (void*)child);
			container_remove_child (child->_parent, child);
		}
	}
}
/*
 * Mark every container in 'containers' with its pruning verdict, after
 * first (recursively) marking its children and pruning them with
 * do_pruning. The containers in the list itself are only marked; acting
 * on their state is left to the caller.
 */
static void
prune_empty_nonroot (GSList *containers)
{
	GSList *node;

	g_warning ("%s (%d container(s))",
		   __FUNCTION__, g_slist_length(containers));

	for (node = containers; node; node = node->next) {
		Container *self;

		self = (Container*)node->data;
		g_warning ("cur: %p->%p", (void*)node, (void*)node->data);

		/* bottom-up: deal with everything below us first */
		if (self->_children) {
			prune_empty_nonroot (self->_children);
			do_pruning (self->_children);
		}

		/* only containers without a message are candidates */
		if (self->_msg)
			continue;

		if (!self->_children) {
			/* A. empty and childless: nuke it */
			g_printerr ("setting %p to NUKE\n", (void*)self);
			self->_state = NUKE;
		} else if (self->_parent ||
			   g_slist_length(self->_children) == 1) {
			/* B. empty but with children: the container goes
			 * and its children take its place. Children are
			 * not promoted into the root set (no parent),
			 * except when there is exactly one child */
			g_printerr ("setting %p to SPLICE\n", (void*)self);
			self->_state = SPLICE;
		}
	}
}
/* 4. Prune empty containers
 *
 * First marks all containers and prunes everything below the root set
 * (prune_empty_nonroot); then handles the root containers that were
 * marked SPLICE: their children are made parent-less, so that a later
 * find_root_set picks them up as roots, and the emptied container is
 * marked NUKE.
 *
 * root_set: list of the root Container*; the list itself is not changed
 */
static void
prune_empty_containers (GSList *root_set)
{
	GSList *cur;

	/* everything below the root_set will be pruned */
	prune_empty_nonroot (root_set);

	/* now, clear up the root_set itself... */
	for (cur = root_set; cur; cur = g_slist_next(cur)) {
		Container *c;
		GSList *iter; /* there should be only 1... */

		c = (Container *)cur->data;
		if (c->_state != SPLICE)
			continue;

		/* make the children parent-less, so they become part of
		 * the root_set */
		for (iter = c->_children; iter; iter = g_slist_next(iter))
			((Container*)iter->data)->_parent = NULL;

		/* release the list links (not the children themselves)
		 * before dropping the only pointer to them */
		g_slist_free (c->_children);
		c->_children = NULL;
		c->_state = NUKE;
	}
}
#if 0
/* 5. group root set by subject
 *
 * Unfinished and currently compiled out (it sits in an '#if 0' section).
 * For every container in root_set it determines the subject of the
 * sub-tree and normalizes it; nothing is done with the result yet.
 *
 * NOTE(review): subject_table is created but never filled or destroyed
 * in this function. */
static void
group_root_set_by_subject (GSList *root_set)
{
GHashTable *subject_table;
GSList *cur;
/* A: Construct a new hash table, subject_table, which
* associates subject strings with Container objects. */
subject_table = g_hash_table_new (g_str_hash, g_str_equal);
for (cur = root_set; cur; cur = g_slist_next (cur)) {
const char *subject, *subj;
/* subj: the subject without Re:, Fwd: etc. */
/* Find the subject of that sub-tree: */
Container *c;
c = (Container*)cur->data;
if (c->_msg)
/* (i) if there is a message in the Container, the
* subject is the subject of that message. */
subject = mu_msg_get_subject (c->_msg);
else
/* (ii) If there is no message in the Container,
* then the Container will have at least one
* child Container, and that Container will
* have a message. Use the subject of that
* message instead. */
subject = mu_msg_get_subject (
((Container*)(c->_children->data))->_msg);
/* (iii) Strip ``Re:'', ``RE:'', ``RE[5]:'', ``Re:
* Re[4]: Re:'' and so on. */
subj = subject ? mu_str_subject_normalize (subject) : NULL;
/* (iv) If the subject is now "", give up on this
* Container. */
if (mu_str_is_empty (subj))
continue;
}
}
#endif
struct _ThreadInfo {
const char *parent_path, *prev_path;
GHashTable *hash;
unsigned seq, prev_level;
};
typedef struct _ThreadInfo ThreadInfo;
/* let's make a GtkTreePath compatible thread path */
static gboolean
add_thread_path (Container *c, guint level, ThreadInfo *ti)
{
gchar *threadpath;
if (level == 0)
threadpath = g_strdup_printf ("%05d", ti->seq++);
else {
/* see if we're the first on this level; if so, reset
* the sequence to 0 */
if (ti->prev_level != level) {
ti->seq = 0;
ti->parent_path = ti->prev_path;
}
threadpath = g_strdup_printf ("%s:%05d",
ti->parent_path,
ti->seq++);
}
g_printerr ("[%s (%u)]\n", threadpath, level);
if (c->_docid)
g_hash_table_insert (ti->hash, GUINT_TO_POINTER(c->_docid),
threadpath);
ti->prev_level = level;
ti->prev_path = threadpath;
return TRUE;
}
GHashTable*
create_doc_id_thread_path_hash (GSList *root_set)
{
ThreadInfo ti;
GSList *cur;
int i;
/* create hash docid => thread-path */
ti.hash = g_hash_table_new_full (g_direct_hash, g_direct_equal,
NULL,
(GDestroyNotify)g_free);
ti.parent_path = ti.prev_path = "";
ti.prev_level = ti.seq = 0;
for (i = 0, cur = root_set; cur; cur = g_slist_next (cur))
container_traverse ((Container*)cur->data, 0,
(ContainerTraverseFunc)add_thread_path,
&ti);
return ti.hash;
}
static void
dump_container (Container *c, guint indent, gpointer p)
{
while (indent--)
fputs (" ", stdout);
container_dump (c);
}
/*
 * Print every tree in root_set, one container per line, using
 * dump_container as the traversal callback.
 *
 * root_set: list of the root Container*
 */
static void
dump (GSList *root_set)
{
	GSList *cur;

	for (cur = root_set; cur; cur = g_slist_next (cur))
		container_traverse ((Container*)cur->data, 0,
				    (ContainerTraverseFunc)dump_container,
				    NULL);
}
static Container*
container_new (MuMsg *msg)
container_new (MuMsg *msg, unsigned docid)
{
Container *c;
c = g_slice_new0 (Container);
if (msg)
c->_msg = mu_msg_ref (msg);
c->_docid = docid;
c->_state = OKAY;
return c;
}
@ -274,29 +525,23 @@ container_destroy (Container *c)
/* depth-first traverse all children, grand-children ... n-children of
* container */
static gboolean
container_traverse (Container *c, ContainerTraverseFunc func,
gpointer user_data)
{
/* if func returns FALSE, quit */
if (!func (c, user_data))
return FALSE;
if (c->_children)
return container_traverse_list (c->_children,
func, user_data);
return TRUE;
}
static gboolean
container_traverse_list (GSList *containers, ContainerTraverseFunc func,
gpointer user_data)
container_traverse (Container *c, guint level, ContainerTraverseFunc func,
gpointer user_data)
{
GSList *cur;
int i;
for (cur = containers; cur; cur = g_slist_next (cur))
if (!container_traverse ((Container*)cur->data, func, user_data))
return FALSE; /* quit */
g_return_val_if_fail (c, FALSE);
if (!func (c, level, user_data))
return FALSE;
for (i = 0, cur = c->_children; cur; cur = g_slist_next (cur)) {
if (!container_traverse ((Container*)cur->data, level + 1,
func, user_data))
return FALSE;
}
return TRUE;
}
@ -307,6 +552,63 @@ container_add_child (Container *c, Container *child)
g_return_if_fail (c != child);
g_return_if_fail (child);
if (g_slist_find (c->_children, child)) {
g_warning ("%s: %p not adding dup child %p", __FUNCTION__,
(void*)c, (void*)child);
return;
}
child->_parent = c;
c->_children = g_slist_prepend (c->_children, child);
}
/*
 * Move all children of 'child' up one level: they get c as their parent
 * and are appended to c's child list; 'child' ends up without children.
 * 'child' itself stays in c's child list.
 *
 * c:     the container that receives the children
 * child: the container that gives up its children
 */
static void
container_promote_child (Container *c, Container *child)
{
	GSList *iter;

	g_return_if_fail (c);
	g_return_if_fail (child);
	g_return_if_fail (c != child);

	/* reparent grandchildren */
	for (iter = child->_children; iter; iter = g_slist_next(iter))
		((Container*)iter->data)->_parent = c;

	/* one concat after the loop moves the whole list; g_slist_concat
	 * reuses the links of the second list, so child must let go of
	 * them */
	c->_children = g_slist_concat (c->_children, child->_children);
	child->_children = NULL;
}
/*
 * Take 'child' out of c's child list. Neither the child itself nor its
 * _parent field is touched.
 *
 * c:     the container whose child list is changed
 * child: the container to remove from that list
 */
static void
container_remove_child (Container *c, Container *child)
{
	g_return_if_fail (c); /* c is dereferenced below */
	g_return_if_fail (child);
	g_return_if_fail (c != child);

	c->_children = g_slist_remove (c->_children, child);
}
/*
 * Print a one-line description of container c with g_print: the subject
 * of its message, the addresses of the container, its parent and its
 * message, the message-id, the number of children and the pruning state.
 */
static void
container_dump (Container *c)
{
	const char *state, *subject, *msgid;

	switch (c->_state) {
	case NUKE:   state = "NUKE"; break;
	case SPLICE: state = "SPLICE"; break;
	case OKAY:   state = "OKAY"; break;
	default:     state = "HUH"; break;
	}

	/* a message may lack a subject or a message-id; never hand a NULL
	 * pointer to a %s conversion */
	subject = c->_msg ? mu_msg_get_subject (c->_msg) : "<empty>";
	if (!subject)
		subject = "<no subject>";

	msgid = c->_msg ? mu_msg_get_msgid (c->_msg) : "";
	if (!msgid)
		msgid = "<no msgid>";

	/* g_slist_length returns a guint, hence %u */
	g_print ("[%s] { %p parent=%p msg=%p [%s] children: %u state: %s}\n",
		 subject,
		 (void*)c,
		 (void*)c->_parent, (void*)c->_msg,
		 msgid,
		 g_slist_length (c->_children), state);
}

View File

@ -28,12 +28,7 @@
G_BEGIN_DECLS
struct _MuMsgThreader;
typedef struct _MuMsgThreader MuMsgThreader;
MuMsgThreader *mu_msg_threader_new (void);
void mu_msg_threader_destroy (MuMsgThreader *self);
gboolean mu_msg_threader_calculate (MuMsgThreader *self, MuMsgIter *iter);
GHashTable *mu_msg_threader_calculate (MuMsgIter *iter);
G_END_DECLS