[PATCH 3/5] c/r: introduce walk_task_subtree() to iterate through descendants

Oren Laadan orenl at librato.com
Wed Sep 30 18:47:12 PDT 2009


Introduce a helper to iterate over the descendants of a given
task. The prototype is:

int walk_task_subtree(struct task_struct *root,
		      int (*func)(struct task_struct *, void *),
      		      void *data)

The function will start with @root, and iterate through all the
descendants, including threads, in a DFS manner. Children of a task
are traversed before proceeding to the next thread of that task.

For each task, the callback @func will be called providing the task
pointer and the @data. The callback is invoked while holding the
tasklist_lock for reading. If the callback fails it should return a
negative error, and the traversal ends. If the callback succeeds, it
returns a non-negative number, and these values are summed.

On success, walk_task_subtree() returns the total summed. On failure,
it returns a negative value.

The code in checkpoint/checkpoint.c and checkpoint/restart.c has
been converted to use this helper.

Signed-off-by: Oren Laadan <orenl at cs.columbia.edu>
---
 checkpoint/checkpoint.c    |   97 ++++++++++++++--------------------------
 checkpoint/restart.c       |  105 +++++++++++++++++++-------------------------
 checkpoint/sys.c           |   55 +++++++++++++++++++++++
 include/linux/checkpoint.h |    3 +
 4 files changed, 137 insertions(+), 123 deletions(-)

diff --git a/checkpoint/checkpoint.c b/checkpoint/checkpoint.c
index dbe9e10..1eeb557 100644
--- a/checkpoint/checkpoint.c
+++ b/checkpoint/checkpoint.c
@@ -520,80 +520,51 @@ static int collect_objects(struct ckpt_ctx *ctx)
 	return ret;
 }
 
+struct ckpt_cnt_tasks {
+	struct ckpt_ctx *ctx;
+	int nr;
+};
+
 /* count number of tasks in tree (and optionally fill pid's in array) */
-static int tree_count_tasks(struct ckpt_ctx *ctx)
+static int __tree_count_tasks(struct task_struct *task, void *data)
 {
-	struct task_struct *root;
-	struct task_struct *task;
-	struct task_struct *parent;
-	struct task_struct **tasks_arr = ctx->tasks_arr;
-	int nr_tasks = ctx->nr_tasks;
-	int nr = 0;
+	struct ckpt_cnt_tasks *d = (struct ckpt_cnt_tasks *) data;
+	struct ckpt_ctx *ctx = d->ctx;
 	int ret;
 
-	read_lock(&tasklist_lock);
-
-	/* we hold the lock, so root_task->real_parent can't change */
-	task = ctx->root_task;
-	if (ctx->root_init) {
-		/* container-init: start from container parent */
-		parent = task->real_parent;
-		root = parent;
-	} else {
-		/* non-container-init: start from root task and down */
-		parent = NULL;
-		root = task;
-	}
-
-	/* count tasks via DFS scan of the tree */
-	while (1) {
-		ctx->tsk = task;  /* (for ckpt_write_err) */
+	ctx->tsk = task;  /* (for ckpt_write_err) */
 
-		/* is this task cool ? */
-		ret = may_checkpoint_task(ctx, task);
-		if (ret < 0) {
-			nr = ret;
-			break;
-		}
-		if (tasks_arr) {
-			/* unlikely... but if so then try again later */
-			if (nr == nr_tasks) {
-				nr = -EBUSY; /* cleanup in ckpt_ctx_free() */
-				break;
-			}
-			tasks_arr[nr] = task;
-			get_task_struct(task);
-		}
-		nr++;
-		/* if has children - proceed with child */
-		if (!list_empty(&task->children)) {
-			parent = task;
-			task = list_entry(task->children.next,
-					  struct task_struct, sibling);
-			continue;
-		}
-		while (task != root) {
-			/* if has sibling - proceed with sibling */
-			if (!list_is_last(&task->sibling, &parent->children)) {
-				task = list_entry(task->sibling.next,
-						  struct task_struct, sibling);
-				break;
-			}
+	/* is this task cool ? */
+	ret = may_checkpoint_task(ctx, task);
+	if (ret < 0)
+		goto out;
 
-			/* else, trace back to parent and proceed */
-			task = parent;
-			parent = parent->real_parent;
+	if (ctx->tasks_arr) {
+		if (d->nr == ctx->nr_tasks) {  /* unlikely... try again later */
+			__ckpt_write_err(ctx, "TE", "bad task count\n", -EBUSY);
+			ret = -EBUSY;
+			goto out;
 		}
-		if (task == root)
-			break;
+		ctx->tasks_arr[d->nr++] = task;
+		get_task_struct(task);
 	}
+
+	ret = 1;
+ out:
+	if (ret < 0)
+		ckpt_write_err(ctx, "", NULL);
 	ctx->tsk = NULL;
+	return ret;
+}
 
-	read_unlock(&tasklist_lock);
+static int tree_count_tasks(struct ckpt_ctx *ctx)
+{
+	struct ckpt_cnt_tasks data;
 
-	if (nr < 0)
-		ckpt_write_err(ctx, "", NULL);
-	return nr;
+	data.ctx = ctx;
+	data.nr = 0;
+
+	return walk_task_subtree(ctx->root_task, __tree_count_tasks, &data);
 }
 
 /*
diff --git a/checkpoint/restart.c b/checkpoint/restart.c
index 37454c5..c021eaf 100644
--- a/checkpoint/restart.c
+++ b/checkpoint/restart.c
@@ -804,77 +804,56 @@ static int do_restore_task(void)
  * Called by the coodinator to set the ->checkpoint_ctx pointer of the
  * root task and all its descendants.
  */
-static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root)
+static int __prepare_descendants(struct task_struct *task, void *data)
 {
-	struct task_struct *leader = root;
-	struct task_struct *parent = NULL;
-	struct task_struct *task = root;
-	int nr_pids = 0;
-	int ret = -EBUSY;
+	struct ckpt_ctx *ctx = (struct ckpt_ctx *) data;
 
-	read_lock(&tasklist_lock);
-	while (1) {
-		ckpt_debug("consider task %d\n", task_pid_vnr(task));
-		if (task_ptrace(task) & PT_PTRACED)
-			break;
-		/*
-		 * Set task->checkpoint_ctx of all non-zombie descendants.
-		 * If a descendant already has a ->checkpoint_ctx, it
-		 * must be a coordinator (for a different restart ?) so
-		 * we fail.
-		 *
-		 * Note that own ancestors cannot interfere since they
-		 * won't descend past us, as own ->checkpoint_ctx must
-		 * already be set.
-		 */
-		if (!task->exit_state) {
-			if (!set_task_ctx(task, ctx))
-				break;
-			ckpt_debug("prepare task %d\n", task_pid_vnr(task));
-			wake_up_process(task);
-			nr_pids++;
-		}
+	ckpt_debug("consider task %d\n", task_pid_vnr(task));
 
-		/* if has children - proceed with child */
-		if (!list_empty(&task->children)) {
-			parent = task;
-			task = list_entry(task->children.next,
-					  struct task_struct, sibling);
-			continue;
-		}
-		while (task != root) {
-			/* if has sibling - proceed with sibling */
-			if (!list_is_last(&task->sibling, &parent->children)) {
-				task = list_entry(task->sibling.next,
-						  struct task_struct, sibling);
-				break;
-			}
-
-			/* else, trace back to parent and proceed */
-			task = parent;
-			parent = parent->real_parent;
-		}
-		if (task == root) {
-			/* in case root task is multi-threaded */
-			root = task = next_thread(task);
-			if (root == leader) {
-				ret = 0;
-				break;
-			}
-		}
+	if (task_ptrace(task) & PT_PTRACED) {
+		ckpt_debug("ptraced task %d\n", task_pid_vnr(task));
+		return -EBUSY;
 	}
-	read_unlock(&tasklist_lock);
-	ckpt_debug("nr %d/%d  ret %d\n", ctx->nr_pids, nr_pids, ret);
+
+	/*
+	 * Set task->checkpoint_ctx of all non-zombie descendants.
+	 * If a descendant already has a ->checkpoint_ctx, it
+	 * must be a coordinator (for a different restart ?) so
+	 * we fail.
+	 *
+	 * Note that own ancestors cannot interfere since they
+	 * won't descend past us, as own ->checkpoint_ctx must
+	 * already be set.
+	 */
+	if (!task->exit_state) {
+		if (!set_task_ctx(task, ctx))
+			return -EBUSY;
+		ckpt_debug("prepare task %d\n", task_pid_vnr(task));
+		wake_up_process(task);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int prepare_descendants(struct ckpt_ctx *ctx, struct task_struct *root)
+{
+	int nr_pids;
+
+	nr_pids = walk_task_subtree(root, __prepare_descendants, ctx);
+	ckpt_debug("nr %d/%d\n", ctx->nr_pids, nr_pids);
+	if (nr_pids < 0)
+		return nr_pids;
 
 	/*
 	 * Actual tasks count may exceed ctx->nr_pids due of 'dead'
 	 * tasks used as place-holders for PGIDs, but not fall short.
 	 */
-	if (!ret && (nr_pids < ctx->nr_pids))
-		ret = -ESRCH;
+	if (nr_pids < ctx->nr_pids)
+		return -ESRCH;
 
 	atomic_set(&ctx->nr_total, nr_pids);
-	return ret;
+	return nr_pids;
 }
 
 static int wait_all_tasks_finish(struct ckpt_ctx *ctx)
@@ -991,6 +970,12 @@ static int do_restore_coord(struct ckpt_ctx *ctx, pid_t pid)
 		return -EBUSY;
 	}
 
+	/*
+	 * From now on we are committed to the restart. If anything
+	 * fails, we'll wipe out the entire subtree below us, to
+	 * ensure proper cleanup.
+	 */
+
 	if (ctx->uflags & RESTART_TASKSELF) {
 		ret = restore_task(ctx);
 		ckpt_debug("restore task: %d\n", ret);
diff --git a/checkpoint/sys.c b/checkpoint/sys.c
index 77613d7..7604089 100644
--- a/checkpoint/sys.c
+++ b/checkpoint/sys.c
@@ -276,6 +276,61 @@ void ckpt_ctx_put(struct ckpt_ctx *ctx)
 		ckpt_ctx_free(ctx);
 }
 
+int walk_task_subtree(struct task_struct *root,
+		      int (*func)(struct task_struct *, void *),
+		      void *data)
+{
+
+	struct task_struct *leader = root;
+	struct task_struct *parent = NULL;
+	struct task_struct *task = root;
+	int total = 0;
+	int ret;
+
+	read_lock(&tasklist_lock);
+	while (1) {
+		/* invoke callback on this task */
+		ret = func(task, data);
+		if (ret < 0)
+			break;
+
+		total += ret;
+
+		/* if has children - proceed with child */
+		if (!list_empty(&task->children)) {
+			parent = task;
+			task = list_entry(task->children.next,
+					  struct task_struct, sibling);
+			continue;
+		}
+
+		while (task != root) {
+			/* if has sibling - proceed with sibling */
+			if (!list_is_last(&task->sibling, &parent->children)) {
+				task = list_entry(task->sibling.next,
+						  struct task_struct, sibling);
+				break;
+			}
+
+			/* else, trace back to parent and proceed */
+			task = parent;
+			parent = parent->real_parent;
+		}
+
+		if (task == root) {
+			/* in case root task is multi-threaded */
+			root = task = next_thread(task);
+			if (root == leader)
+				break;
+		}
+	}
+	read_unlock(&tasklist_lock);
+
+	ckpt_debug("total %d ret %d\n", total, ret);
+	return (ret < 0 ? ret : total);
+}
+
+
 /**
  * sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
index b7f1796..12210e4 100644
--- a/include/linux/checkpoint.h
+++ b/include/linux/checkpoint.h
@@ -50,6 +50,9 @@
 	 RESTART_FROZEN | \
 	 RESTART_GHOST)
 
+extern int walk_task_subtree(struct task_struct *task,
+			     int (*func)(struct task_struct *, void *),
+			     void *data);
 extern void exit_checkpoint(struct task_struct *tsk);
 
 extern int ckpt_kwrite(struct ckpt_ctx *ctx, void *buf, int count);
-- 
1.6.0.4



More information about the Containers mailing list