summaryrefslogtreecommitdiff
path: root/lib/mu-threader.c
blob: 719508d241315a2325e8194b91807f1e1a09427a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/* -*-mode: c; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-*/
/*
** Copyright (C) 2012-2013 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/
#include <math.h>   /* for log, ceil */
#include <string.h> /* for memset */

#include "mu-threader.h"
#include "mu-container.h"
#include "mu-str.h"

/* msg threading implementation based on JWZ's algorithm, as described in:
 *    http://www.jwz.org/doc/threading.html
 *
 * the implementation follows the terminology from that doc, so should
 * be understandable from that... I did change things a bit though
 *
 * the end result of the threading operation is a hashtable which maps
 * docids (ie., Xapian documents == messages) to 'thread paths'; a
 * thread path is a string denoting the 2-dimensional place of a
 * message in a list of messages,
 *
 * Msg1                        => 00000
 * Msg2                        => 00001
 *   Msg3 (child of Msg2)      => 00001:00000
 *   Msg4 (child of Msg2)      => 00001:00001
 *     Msg5 (child of Msg4)    => 00001:00001:00000
 * Msg6                        => 00002
 *
 * the padding-0's are added to make them easy to sort using strcmp;
 * the numbers are hexadecimal, and the length of the 'segments'
 * (the parts separated by the ':') is equal to ceil(log_16(matchnum))
 *
 */

/* forward declarations for the threading steps used by
 * mu_threader_calculate; the step numbers refer to the JWZ algorithm */
/* step 1 */ static GHashTable* create_containers (MuMsgIter *iter);
/* step 2 */ static MuContainer *find_root_set (GHashTable *ids);
/* step 4 */ static MuContainer* prune_empty_containers (MuContainer *root);
/* step 5 (disabled): */
/* static void group_root_set_by_subject (GSList *root_set); */
/* NOTE(review): the following function is declared with external linkage
 * but is neither defined nor called in the visible part of this file --
 * possibly a stale declaration; confirm before relying on it */
GHashTable* create_doc_id_thread_path_hash (MuContainer *root,
					    size_t match_num);

/* msg threading algorithm, based on JWZ's algorithm,
 * http://www.jwz.org/doc/threading.html
 *
 * @iter: iterator over the messages to thread; it is reset (rewound)
 *        both while building the containers and again before returning
 * @matchnum: the number of matches; passed on to
 *        mu_container_thread_info_hash_new
 * @sortfield: field to sort the root set by, or MU_MSG_FIELD_ID_NONE
 *        to skip sorting
 * @descending: sort direction, passed on to mu_container_sort
 *
 * returns the hash table created by mu_container_thread_info_hash_new
 * for the pruned (and optionally sorted) root set, or NULL when the
 * arguments are invalid.
 *
 * NOTE(review): when @matchnum is 0, the table built in step 1 is
 * returned as-is; that table is keyed by message-id strings and is
 * presumably empty in that case -- confirm against callers. */
GHashTable*
mu_threader_calculate (MuMsgIter *iter, size_t matchnum,
		       MuMsgFieldId sortfield, gboolean descending)
{
	GHashTable *id_table, *thread_ids;
	MuContainer *root_set;

	/* this function returns a pointer, so the failure value is NULL
	 * (not FALSE) */
	g_return_val_if_fail (iter, NULL);
	g_return_val_if_fail (mu_msg_field_id_is_valid (sortfield) ||
			      sortfield == MU_MSG_FIELD_ID_NONE,
			      NULL);

	/* step 1: create and link the containers */
	id_table = create_containers (iter);
	if (matchnum == 0)
		return id_table; /* just return an empty table */

	/* step 2 -- the root_set is the list of containers without parent */
	root_set = find_root_set (id_table);

	/* step 3 (discarding id_table) is postponed until the end; the
	 * table owns the containers and we still need them */

	/* step 4: prune empty containers */
	root_set = prune_empty_containers (root_set);

	/* sort root set */
	if (sortfield != MU_MSG_FIELD_ID_NONE)
		root_set = mu_container_sort (root_set, sortfield, descending,
					      NULL);

	/* step 5: group root set by subject */
	/* group_root_set_by_subject (root_set); */

	/* rewind the iterator for the caller */
	mu_msg_iter_reset (iter); /* go all the way back */

	/* finally, deliver the docid => thread-path hash */
	thread_ids = mu_container_thread_info_hash_new (root_set,
							matchnum);

	/* step 3: now the containers are no longer needed */
	g_hash_table_destroy (id_table);

	return thread_ids;
}

/* debugging helper, used as a GHFunc by assert_no_duplicates: remember
 * container 'c' in 'hash'; if it was already remembered, warn, dump the
 * container and trip an assertion. 'msgid' is not used. */
G_GNUC_UNUSED static void
check_dup (const char *msgid, MuContainer *c, GHashTable *hash)
{
	if (!g_hash_table_lookup (hash, c)) {
		/* first time we see this container */
		g_hash_table_insert (hash, c, GUINT_TO_POINTER(TRUE));
		return;
	}

	g_warning ("ALREADY!!");
	mu_container_dump (c, FALSE);
	g_assert (0);
}


/* debugging helper: run check_dup over every entry in 'ids', using a
 * temporary pointer-keyed table to track the containers seen so far */
G_GNUC_UNUSED static void
assert_no_duplicates (GHashTable *ids)
{
	GHashTable *seen = g_hash_table_new (g_direct_hash, g_direct_equal);

	g_hash_table_foreach (ids, (GHFunc)check_dup, seen);
	g_hash_table_destroy (seen);
}


/* a referred message is a message that is referred to by some other
 * message.
 *
 * look up the container registered under 'msgid' in id_table; when there
 * is none, create a message-less container and register it, using the
 * 'msgid' pointer itself as the key (no copy is made).
 *
 * '*created' is set to TRUE when a new container was made, FALSE when an
 * existing one was found; it is left untouched when 'msgid' is NULL, in
 * which case NULL is returned. */
static MuContainer*
find_or_create_referred (GHashTable *id_table, const char *msgid,
			 gboolean *created)
{
	MuContainer *container;

	g_return_val_if_fail (msgid, NULL);

	container = g_hash_table_lookup (id_table, msgid);
	if (container) {
		*created = FALSE;
		return container;
	}

	*created  = TRUE;
	container = mu_container_new (NULL, 0, msgid);
	g_hash_table_insert (id_table, (gpointer)msgid, container);
	/* assert_no_duplicates (id_table); */

	return container;
}

/* find a container for the given message; if it does not exist yet,
 * create a new one, and register it in id_table.
 *
 * the key is the message's message-id or, when it has none, its path.
 * key strings are stored by pointer (not copied).
 *
 * returns the container to process further (step 1B), or NULL when the
 * message must not be processed further: invalid arguments, a message
 * with neither message-id nor path, or a duplicate of a message-id that
 * was seen before. */
static MuContainer*
find_or_create (GHashTable *id_table, MuMsg *msg, guint docid)
{
	MuContainer *c, *dup;
	const char *msgid, *fake_msgid;

	g_return_val_if_fail (msg, NULL);
	g_return_val_if_fail (docid != 0, NULL);

	msgid = mu_msg_get_msgid (msg);
	if (!msgid)
		msgid = mu_msg_get_path (msg); /* fake it */
	if (!msgid) {
		/* id_table is a string-keyed table (see create_containers);
		 * a NULL key cannot be hashed, so skip this message */
		g_warning ("message without message-id or path");
		return NULL;
	}

	c = g_hash_table_lookup (id_table, msgid);

	/* Not seen before: create a new MuContainer object holding this
	 * message; index the MuContainer by Message-ID in id_table. */
	if (!c) {
		c = mu_container_new (msg, docid, msgid);
		g_hash_table_insert (id_table, (gpointer)msgid, c);
		/* assert_no_duplicates (id_table); */
		return c;
	}

	/* id_table contains an empty MuContainer for this ID (created
	 * because some other message referred to it): store this message
	 * in the MuContainer's message slot. */
	if (!c->msg) {
		c->msg	 = mu_msg_ref (msg);
		c->docid = docid;
		return c;
	}

	/* special case, not in the JWZ algorithm: the container exists
	 * already and has a message; this means that we are seeing
	 * *another message* with a message-id we already saw... create a
	 * container for this message, mark it as a duplicate, and make it
	 * a child of the one we saw before; use its path as a fake
	 * message-id */
	fake_msgid = mu_msg_get_path (msg);

	/* the fake id must be usable as a *new* key: a NULL key cannot be
	 * hashed, and inserting under an existing key would make the table
	 * run its value-destroy function on the container stored there,
	 * which may still be linked into the tree */
	if (!fake_msgid || g_hash_table_lookup (id_table, fake_msgid)) {
		g_warning ("cannot register duplicate message");
		return NULL;
	}

	dup	   = mu_container_new (msg, docid, fake_msgid);
	dup->flags = MU_CONTAINER_FLAG_DUP;
	mu_container_append_children (c, dup);

	g_hash_table_insert (id_table, (gpointer)fake_msgid, dup);

	return NULL; /* don't process this message further */
}

/* decide whether 'child' may be linked below 'parent': both must exist,
 * 'child' must not have a parent yet, and neither may already be
 * reachable from the other (linking would then introduce a loop).
 *
 * 'created' is currently ignored. */
static gboolean
child_elligible (MuContainer *parent, MuContainer *child, gboolean created)
{
	/* the checks are evaluated left-to-right and stop at the first
	 * one that fails */
	return parent && child &&
		!child->parent &&
		!mu_container_reachable (parent, child) &&
		!mu_container_reachable (child, parent);
}



/* step 1B/1C: link the containers for the message-ids in the references
 * of c's message into a parent/child chain, then make 'c' a child of the
 * last of them.
 *
 * 'id_table' is the message-id => container table; containers for
 * references that are not in it yet are created and registered.
 * 'c' is the container of the message being handled; c->msg is read
 * unconditionally, so it is expected to be set. */
static void /* 1B */
handle_references (GHashTable *id_table, MuContainer *c)
{
	const GSList *refs, *cur;
	MuContainer *parent;
	gboolean created;

	refs = mu_msg_get_references (c->msg);
	if (!refs)
		return; /* nothing to do */

	/* For each element in the message's References field:

	   Find a MuContainer object for the given Message-ID: If
	   there's one in id_table use that; Otherwise, make (and
	   index) one with a null Message. */

	/* go over our list of refs, in order; 'parent' trails one element
	   behind 'child' */
	created = FALSE;
	for (parent = NULL, cur = refs; cur; cur = g_slist_next (cur)) {

		MuContainer *child;
		child = find_or_create_referred (id_table, (gchar*)cur->data,
						 &created);

		/* if we find the current message in its own refs, break now
		   so that parent != c in the next step */
		if (child == c)
			break;

		/* Link the References field's MuContainers together in
		 * the order implied by the References header.

		 If they are already linked, don't change the existing
		 links.  Do not add a link if adding that link would
		 introduce a loop: that is, before asserting A->B,
		 search down the children of B to see if A is
		 reachable, and also search down the children of A to
		 see if B is reachable. If either is already reachable
		 as a child of the other, don't add the link. */

		if (child_elligible (parent, child, created))
			/* the return value is deliberately not assigned to
			   'parent' */
			mu_container_append_children (parent, child);

		parent = child;
	}

	/* 'parent' now points to the container of the last ref that was
	   visited: our direct parent (or NULL if there was none);

	   Set the parent of this message to be the last element in
	   References. Note that this message may have a parent
	   already: this can happen because we saw this ID in a
	   References field, and presumed a parent based on the other
	   entries in that field. Now that we have the actual message,
	   we can be more definitive, so throw away the old parent and
	   use this new one. Find this MuContainer in the parent's
	   children list, and unlink it.

	   Note that this could cause this message to now have no
	   parent, if it has no references field, but some message
	   referred to it as the non-first element of its
	   references. (Which would have been some kind of lie...)

	   Note that at all times, the various ``parent'' and ``child'' fields
	   must be kept inter-consistent. */

	/* note: 'created' is handed to child_elligible above, which
	   currently does not use it */

	/* So, we move c and its descendants to become a child of parent if:
	   * both are not NULL
	   * parent is not a descendant of c.
	   * both are different from each other (guaranteed in last loop) */

	if (parent && c && !(c->child && mu_container_reachable (c->child, parent))) {

		/* if c already has a parent, remove c from its parent's
		   children and reparent it, as now we know reliably who c's
		   parent is; c's own sibling and parent links are cleared
		   before it is appended below */
		if (c->parent) {
			mu_container_remove_child(c->parent, c);
			c->next = c->last = c->parent = NULL;
		}

		/* the return value is deliberately not assigned to 'parent' */
		/*parent = */mu_container_append_children (parent, c);
	}
}



/* step 1: walk all messages in 'iter' (from the start), create a
 * container for each of them, link the containers according to the
 * messages' references, and return the table that maps message-ids to
 * containers.
 *
 * the table does not free its keys; its values (the containers) are
 * released with mu_container_destroy when the table is destroyed */
static GHashTable*
create_containers (MuMsgIter *iter)
{
	GHashTable *containers;

	containers = g_hash_table_new_full (g_str_hash, g_str_equal, NULL,
					    (GDestroyNotify)mu_container_destroy);

	mu_msg_iter_reset (iter);
	while (!mu_msg_iter_is_done (iter)) {

		MuMsg *msg;
		unsigned docid;
		MuContainer *container;

		/* 1.A: the message is 'floating'; don't unref it here */
		msg   = mu_msg_iter_get_msg_floating (iter);
		docid = mu_msg_iter_get_docid (iter);

		container = find_or_create (containers, msg, docid);

		/* 1.B and C: NULL means there is nothing more to do for
		 * this message */
		if (container)
			handle_references (containers, container);

		mu_msg_iter_next (iter);
	}

	return containers;
}



/* GHFunc for find_root_set: add container 'c' to the sibling list in
 * '*root_set', unless it has a parent or is flagged as a duplicate.
 * 'msgid' (the hash key) is not used. */
static void
filter_root_set (const gchar *msgid, MuContainer *c, MuContainer **root_set)
{
	/* children and duplicates do not belong in the root set */
	if (c->parent || (c->flags & MU_CONTAINER_FLAG_DUP))
		return;

	/* the first root starts the list; later ones are appended */
	*root_set = *root_set ?
		mu_container_append_siblings (*root_set, c) : c;
}


/* 2.  Walk over the elements of id_table, and gather the MuContainer
   objects that have no parent (and are not marked as duplicates) into
   a list of siblings; returns NULL when there are none */
static MuContainer*
find_root_set (GHashTable *ids)
{
	MuContainer *roots = NULL;

	g_hash_table_foreach (ids, (GHFunc)filter_root_set, &roots);

	return roots;
}


/* callback for mu_container_foreach (see prune_empty_containers):
 * first act on the flags of c's direct children, then flag 'c' itself
 * when it holds no message.
 *
 * returns FALSE only when 'c' ended up NULL after handling the
 * children; TRUE otherwise */
static gboolean
prune_maybe (MuContainer *c)
{
	MuContainer *kid;

	/* remove children flagged for deletion, splice those flagged for
	 * splicing */
	kid = c->child;
	while (kid) {
		if (kid->flags & MU_CONTAINER_FLAG_DELETE)
			c = mu_container_remove_child (c, kid);
		else if (kid->flags & MU_CONTAINER_FLAG_SPLICE)
			c = mu_container_splice_children (c, kid);

		kid = kid->next;
	}

	g_return_val_if_fail (c, FALSE);

	/* containers with messages are left alone */
	if (c->msg)
		return TRUE;

	if (!c->child)
		/* A. a message-less container without children: mark it
		 * for deletion */
		c->flags |= MU_CONTAINER_FLAG_DELETE;
	else if (!c->child->next)
		/* B. a message-less container with exactly one child:
		 * mark it for splicing, so that the child gets promoted
		 * to this level. With more than one child, the container
		 * is kept as it is. */
		c->flags |= MU_CONTAINER_FLAG_SPLICE;

	return TRUE;
}


/* step 4: prune empty containers.
 *
 * first prune_maybe is run over the containers reachable from
 * 'root_set' (through mu_container_foreach), which handles flagged
 * children and flags message-less containers; then the flags on the
 * root-level siblings themselves are handled here.
 *
 * returns the (possibly changed) head of the root set */
static MuContainer*
prune_empty_containers (MuContainer *root_set)
{
	MuContainer *cur;

	mu_container_foreach (root_set,
			      (MuContainerForeachFunc)prune_maybe,
			      NULL);

	/* and prune the root_set itself...
	 *
	 * NOTE(review): the list is modified while walking it; this
	 * relies on the helpers leaving cur->next usable -- confirm
	 * against mu-container.c */
	for (cur = root_set; cur; cur = cur->next) {

		/* empty and childless: drop it from the root set */
		if (cur->flags & MU_CONTAINER_FLAG_DELETE)
			root_set = mu_container_remove_sibling (root_set, cur);

		/* empty with a single child: detach the child and add it
		 * to the root set.
		 * NOTE(review): 'cur' itself is not removed from the root
		 * set here -- verify whether that is intended */
		else if (cur->flags & MU_CONTAINER_FLAG_SPLICE) {
			MuContainer *newchild;
			newchild = cur->child;
			cur->child = NULL;
			root_set = mu_container_append_siblings (root_set,
								 newchild);
		}
	}

	return root_set;
}