[kernel] Fix cgroup destroy oops (rhbz 1045755)

Wed Feb 12 15:41:53 UTC 2014

commit 7845789edd0ec32e4756ef845de93c70d7641c2f
Author: Josh Boyer <jwboyer at fedoraproject.org>
Date:   Wed Feb 12 10:37:45 2014 -0500

    Fix cgroup destroy oops (rhbz 1045755)

 cgroup-fixes.patch |  342 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel.spec        |    7 +
 2 files changed, 349 insertions(+), 0 deletions(-)
---

diff --git a/cgroup-fixes.patch b/cgroup-fixes.patch
new file mode 100644
index 0000000..b76be1d
--- /dev/null
+++ b/cgroup-fixes.patch
@@ -0,0 +1,342 @@
+From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd at google.com>
+Date: Thu, 06 Feb 2014 23:56:01 +0000
+Subject: cgroup: use an ordered workqueue for cgroup destruction
+
+Sometimes the cleanup after memcg hierarchy testing gets stuck in
+mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0.
+
+There may turn out to be several causes, but a major cause is this: the
+workitem to offline parent can get run before workitem to offline child;
+parent's mem_cgroup_reparent_charges() circles around waiting for the
+child's pages to be reparented to its lrus, but it's holding cgroup_mutex
+which prevents the child from reaching its mem_cgroup_reparent_charges().
+
+Just use an ordered workqueue for cgroup_destroy_wq.
+
+tj: Committing as the temporary fix until the reverse dependency can
+    be removed from memcg.  Comment updated accordingly.
+
+Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction")
+Suggested-by: Filipe Brandenburger <filbranden at google.com>
+Signed-off-by: Hugh Dickins <hughd at google.com>
+Cc: stable at vger.kernel.org # 3.10+
+Signed-off-by: Tejun Heo <tj at kernel.org>
+---
+diff --git a/kernel/cgroup.c b/kernel/cgroup.c
+index e2f46ba..aa95591 100644
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void)
+ 	/*
+ 	 * There isn't much point in executing destruction path in
+ 	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
+-	 * Use 1 for @max_active.
++	 *
++	 * XXX: Must be ordered to make sure parent is offlined after
++	 * children.  The ordering requirement is for memcg where a
++	 * parent's offline may wait for a child's leading to deadlock.  In
++	 * the long term, this should be fixed from memcg side.
+ 	 *
+ 	 * We would prefer to do this in cgroup_init() above, but that
+ 	 * is called before init_workqueues(): so leave this until after.
+ 	 */
+-	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
++	cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0);
+ 	BUG_ON(!cgroup_destroy_wq);
+ 
+ 	/*
+--
+cgit v0.9.2
+From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj at kernel.org>
+Date: Sat, 08 Feb 2014 15:26:33 +0000
+Subject: cgroup: fix error return value in cgroup_mount()
+
+When cgroup_mount() fails to allocate an id for the root, it didn't
+set ret before jumping to unlock_drop ending up returning 0 after a
+failure.  Fix it.
+
+Signed-off-by: Tejun Heo <tj at kernel.org>
+Acked-by: Li Zefan <lizefan at huawei.com>
+Cc: stable at vger.kernel.org
+---
+diff --git a/kernel/cgroup.c b/kernel/cgroup.c
+index aa95591..793f371 100644
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
+ 		mutex_lock(&cgroup_mutex);
+ 		mutex_lock(&cgroup_root_mutex);
+ 
+-		root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
+-					   0, 1, GFP_KERNEL);
+-		if (root_cgrp->id < 0)
++		ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
++		if (ret < 0)
+ 			goto unlock_drop;
++		root_cgrp->id = ret;
+ 
+ 		/* Check for name clashes with existing mounts */
+ 		ret = -EBUSY;
+--
+cgit v0.9.2
+From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj at kernel.org>
+Date: Sat, 08 Feb 2014 15:26:33 +0000
+Subject: cgroup: fix error return from cgroup_create()
+
+cgroup_create() was returning 0 after allocation failures.  Fix it.
+
+Signed-off-by: Tejun Heo <tj at kernel.org>
+Acked-by: Li Zefan <lizefan at huawei.com>
+Cc: stable at vger.kernel.org
+---
+diff --git a/kernel/cgroup.c b/kernel/cgroup.c
+index 793f371..0eb7b86 100644
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ 	struct cgroup *cgrp;
+ 	struct cgroup_name *name;
+ 	struct cgroupfs_root *root = parent->root;
+-	int ssid, err = 0;
++	int ssid, err;
+ 	struct cgroup_subsys *ss;
+ 	struct super_block *sb = root->sb;
+ 
+@@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ 		return -ENOMEM;
+ 
+ 	name = cgroup_alloc_name(dentry);
+-	if (!name)
++	if (!name) {
++		err = -ENOMEM;
+ 		goto err_free_cgrp;
++	}
+ 	rcu_assign_pointer(cgrp->name, name);
+ 
+ 	/*
+@@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ 	 * a half-baked cgroup.
+ 	 */
+ 	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+-	if (cgrp->id < 0)
++	if (cgrp->id < 0) {
++		err = -ENOMEM;
+ 		goto err_free_name;
++	}
+ 
+ 	/*
+ 	 * Only live parents can have children.  Note that the liveliness
+--
+cgit v0.9.2
+From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj at kernel.org>
+Date: Sat, 08 Feb 2014 15:26:34 +0000
+Subject: cgroup: fix locking in cgroup_cfts_commit()
+
+cgroup_cfts_commit() walks the cgroup hierarchy that the target
+subsystem is attached to and tries to apply the file changes.  Due to
+the convolution with inode locking, it can't keep cgroup_mutex locked
+while iterating.  It currently holds only RCU read lock around the
+actual iteration and then pins the found cgroup using dget().
+
+Unfortunately, this is incorrect.  Although the iteration does check
+cgroup_is_dead() before invoking dget(), there's nothing which
+prevents the dentry from going away inbetween.  Note that this is
+different from the usual css iterations where css_tryget() is used to
+pin the css - css_tryget() tests whether the css can be pinned and
+fails if not.
+
+The problem can be solved by simply holding cgroup_mutex instead of
+RCU read lock around the iteration, which actually reduces LOC.
+
+Signed-off-by: Tejun Heo <tj at kernel.org>
+Acked-by: Li Zefan <lizefan at huawei.com>
+Cc: stable at vger.kernel.org
+---
+diff --git a/kernel/cgroup.c b/kernel/cgroup.c
+index 0eb7b86..3edf716 100644
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
+ 	 */
+ 	update_before = cgroup_serial_nr_next;
+ 
+-	mutex_unlock(&cgroup_mutex);
+-
+ 	/* add/rm files for all cgroups created before */
+-	rcu_read_lock();
+ 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
+ 		struct cgroup *cgrp = css->cgroup;
+ 
+@@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
+ 
+ 		inode = cgrp->dentry->d_inode;
+ 		dget(cgrp->dentry);
+-		rcu_read_unlock();
+-
+ 		dput(prev);
+ 		prev = cgrp->dentry;
+ 
++		mutex_unlock(&cgroup_mutex);
+ 		mutex_lock(&inode->i_mutex);
+ 		mutex_lock(&cgroup_mutex);
+ 		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
+ 			ret = cgroup_addrm_files(cgrp, cfts, is_add);
+-		mutex_unlock(&cgroup_mutex);
+ 		mutex_unlock(&inode->i_mutex);
+-
+-		rcu_read_lock();
+ 		if (ret)
+ 			break;
+ 	}
+-	rcu_read_unlock();
++	mutex_unlock(&cgroup_mutex);
+ 	dput(prev);
+ 	deactivate_super(sb);
+ 	return ret;
+--
+cgit v0.9.2
+From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001
+From: Li Zefan <lizefan at huawei.com>
+Date: Tue, 11 Feb 2014 08:05:46 +0000
+Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex
+
+Setup cgroupfs like this:
+  # mount -t cgroup -o cpuacct xxx /cgroup
+  # mkdir /cgroup/sub1
+  # mkdir /cgroup/sub2
+
+Then run these two commands:
+  # for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } &
+  # for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } &
+
+After seconds you may see this warning:
+
+------------[ cut here ]------------
+WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0()
+idr_remove called for id=6 which is not allocated.
+...
+Call Trace:
+ [<ffffffff8156063c>] dump_stack+0x7a/0x96
+ [<ffffffff810591ac>] warn_slowpath_common+0x8c/0xc0
+ [<ffffffff81059296>] warn_slowpath_fmt+0x46/0x50
+ [<ffffffff81300aa7>] sub_remove+0x87/0x1b0
+ [<ffffffff810f3f02>] ? css_killed_work_fn+0x32/0x1b0
+ [<ffffffff81300bf5>] idr_remove+0x25/0xd0
+ [<ffffffff810f2bab>] cgroup_destroy_css_killed+0x5b/0xc0
+ [<ffffffff810f4000>] css_killed_work_fn+0x130/0x1b0
+ [<ffffffff8107cdbc>] process_one_work+0x26c/0x550
+ [<ffffffff8107eefe>] worker_thread+0x12e/0x3b0
+ [<ffffffff81085f96>] kthread+0xe6/0xf0
+ [<ffffffff81570bac>] ret_from_fork+0x7c/0xb0
+---[ end trace 2d1577ec10cf80d0 ]---
+
+It's because allocating/removing cgroup ID is not properly synchronized.
+
+The bug was introduced when we converted cgroup_ida to cgroup_idr.
+While synchronization is already done inside ida_simple_{get,remove}(),
+users are responsible for concurrent calls to idr_{alloc,remove}().
+
+tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from
+cgroup_create()").
+
+Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr")
+Cc: <stable at vger.kernel.org> #3.12+
+Reported-by: Michal Hocko <mhocko at suse.cz>
+Signed-off-by: Li Zefan <lizefan at huawei.com>
+Signed-off-by: Tejun Heo <tj at kernel.org>
+---
+diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
+index 5c09759..9450f02 100644
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -166,6 +166,8 @@ struct cgroup {
+ 	 *
+ 	 * The ID of the root cgroup is always 0, and a new cgroup
+ 	 * will be assigned with a smallest available ID.
++	 *
++	 * Allocating/Removing ID must be protected by cgroup_mutex.
+ 	 */
+ 	int id;
+ 
+diff --git a/kernel/cgroup.c b/kernel/cgroup.c
+index 3edf716..52719ce 100644
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+ 		 * per-subsystem and moved to css->id so that lookups are
+ 		 * successful until the target css is released.
+ 		 */
++		mutex_lock(&cgroup_mutex);
+ 		idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
++		mutex_unlock(&cgroup_mutex);
+ 		cgrp->id = -1;
+ 
+ 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
+@@ -4168,16 +4170,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ 	rcu_assign_pointer(cgrp->name, name);
+ 
+ 	/*
+-	 * Temporarily set the pointer to NULL, so idr_find() won't return
+-	 * a half-baked cgroup.
+-	 */
+-	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+-	if (cgrp->id < 0) {
+-		err = -ENOMEM;
+-		goto err_free_name;
+-	}
+-
+-	/*
+ 	 * Only live parents can have children.  Note that the liveliness
+ 	 * check isn't strictly necessary because cgroup_mkdir() and
+ 	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+@@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ 	 */
+ 	if (!cgroup_lock_live_group(parent)) {
+ 		err = -ENODEV;
+-		goto err_free_id;
++		goto err_free_name;
++	}
++
++	/*
++	 * Temporarily set the pointer to NULL, so idr_find() won't return
++	 * a half-baked cgroup.
++	 */
++	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
++	if (cgrp->id < 0) {
++		err = -ENOMEM;
++		goto err_unlock;
+ 	}
+ 
+ 	/* Grab a reference on the superblock so the hierarchy doesn't
+@@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ 	 */
+ 	err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
+ 	if (err < 0)
+-		goto err_unlock;
++		goto err_free_id;
+ 	lockdep_assert_held(&dentry->d_inode->i_mutex);
+ 
+ 	cgrp->serial_nr = cgroup_serial_nr_next++;
+@@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
+ 
+ 	return 0;
+ 
+-err_unlock:
+-	mutex_unlock(&cgroup_mutex);
+-	/* Release the reference count that we took on the superblock */
+-	deactivate_super(sb);
+ err_free_id:
+ 	idr_remove(&root->cgroup_idr, cgrp->id);
++	/* Release the reference count that we took on the superblock */
++	deactivate_super(sb);
++err_unlock:
++	mutex_unlock(&cgroup_mutex);
+ err_free_name:
+ 	kfree(rcu_dereference_raw(cgrp->name));
+ err_free_cgrp:
+--
+cgit v0.9.2
diff --git a/kernel.spec b/kernel.spec
index 4b98a9e..00f2444 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -629,6 +629,9 @@ Patch25193: fix-exynos-hdmi-build.patch
 #rhbz 1031296
 Patch25194: tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch
 
+#rhbz 1045755
+Patch25195: cgroup-fixes.patch
+
 # END OF PATCH DEFINITIONS
 
 %endif
@@ -1280,6 +1283,9 @@ ApplyPatch fix-exynos-hdmi-build.patch
 #rhbz 1031296
 ApplyPatch tick-Clear-broadcast-pending-bit-when-switching-to-oneshot.patch
 
+#rhbz 1045755
+ApplyPatch cgroup-fixes.patch
+
 # END OF PATCH APPLICATIONS
 
 %endif
@@ -2060,6 +2066,7 @@ fi
 #                                    ||     ||
 %changelog
 * Wed Feb 12 2014 Josh Boyer <jwboyer at fedoraproject.org>
+- Fix cgroup destroy oops (rhbz 1045755)
 - Fix backtrace in amd_e400_idle (rhbz 1031296)
 
 * Tue Feb 11 2014 Josh Boyer <jwboyer at fedoraproject.org> - 3.14.0-0.rc2.git1.1