[kernel/f15] Reinstate the route cache garbage collector.

Dave Jones davej at fedoraproject.org
Wed Dec 21 18:55:06 UTC 2011


commit 3454da31210f9ef98f8369013dc34956e9bafacf
Author: Dave Jones <davej at redhat.com>
Date:   Wed Dec 21 13:37:14 2011 -0500

    Reinstate the route cache garbage collector.

 kernel.spec                         |    7 ++
 route-cache-garbage-collector.patch |  201 +++++++++++++++++++++++++++++++++++
 2 files changed, 208 insertions(+), 0 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index 42b1352..3bf1d45 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -724,6 +724,8 @@ Patch21047: iwlwifi-allow-to-switch-to-HT40-if-not-associated.patch
 #rhbz 741117
 Patch21048: b44-Use-dev_kfree_skb_irq-in-b44_tx.patch
 
+Patch22000: route-cache-garbage-collector.patch
+
 %endif
 
 BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root
@@ -1343,6 +1345,8 @@ ApplyPatch iwlwifi-allow-to-switch-to-HT40-if-not-associated.patch
 #rhbz 741117
 ApplyPatch b44-Use-dev_kfree_skb_irq-in-b44_tx.patch
 
+ApplyPatch route-cache-garbage-collector.patch
+
 # END OF PATCH APPLICATIONS
 
 %endif
@@ -1990,6 +1994,9 @@ fi
 # and build.
 
 %changelog
+* Wed Dec 21 2011 Dave Jones <davej at redhat.com> 2.6.41.5-5
+- Reinstate the route cache garbage collector.
+
 * Tue Dec 20 2011 Josh Boyer <jwboyer at redhat.com>
 - Fix config options in arm configs after latest commits
 - Backport upstream fix for b44_poll oops (rhbz #741117)
diff --git a/route-cache-garbage-collector.patch b/route-cache-garbage-collector.patch
new file mode 100644
index 0000000..6ea6e5e
--- /dev/null
+++ b/route-cache-garbage-collector.patch
@@ -0,0 +1,201 @@
+Message-ID: <1324461072.2728.19.camel at edumazet-HP-Compaq-6005-Pro-SFF-PC>
+Subject: Re: Kernel-DOS error in arp mechanism =?UTF-8?Q?=E2=80=93?= no delete off incomplete arp adresses
+From: Eric Dumazet <eric.dumazet at gmail.com>
+To: David Miller <davem at davemloft.net>
+Cc: richard.weinberger at gmail.com, gladewitz at gmx.de,
+        linux-kernel at vger.kernel.org, netdev at vger.kernel.org
+Date:	Wed, 21 Dec 2011 10:51:12 +0100
+In-Reply-To: <20111221.030727.1528369698756365464.davem at davemloft.net>
+References: <4EEC5286.3070408 at gmx.de>
+	 <CAFLxGvxjLCyMCPXtpm7a7RaOL4A4=bhCLPKD=FVAc8xOdx_CsQ at mail.gmail.com>
+	 <1324453467.2610.20.camel at edumazet-laptop>
+	 <20111221.030727.1528369698756365464.davem at davemloft.net>
+Content-Type: text/plain; charset="UTF-8"
+Content-Transfer-Encoding: 8bit
+Sender: linux-kernel-owner at vger.kernel.org
+List-ID: <linux-kernel.vger.kernel.org>
+
+Le mercredi 21 décembre 2011 à 03:07 -0500, David Miller a écrit :
+> From: Eric Dumazet <eric.dumazet at gmail.com>
+> Date: Wed, 21 Dec 2011 08:44:27 +0100
+> 
+> > David, I suggest we add back the garbage collector for current kernels,
+> > we'll remove it when the route cache really disappears?
+> > 
+> > I'll send a patch today.
+> 
+> Yes, it's the best idea.
+> 
+> We can actually remove it again as early as when route neigh's
+> are ref-less.
+
+Here is the patch I successfully tested in the neighbour stress
+situation. This is a stable candidate (2.6.39+)
+
+Thanks !
+
+[PATCH] ipv4: reintroduce route cache garbage collector
+
+Commit 2c8cec5c10b (ipv4: Cache learned PMTU information in inetpeer)
+removed the IP route cache garbage collector a bit too soon, as this gc was
+responsible for cleaning up expired routes and releasing their neighbour
+references.
+
+As pointed out by Robert Gladewitz, recent kernels can fill and exhaust
+their neighbour cache.
+
+Reintroduce the garbage collection, since we'll have to wait until our
+neighbour lookups become refcount-less before we can stop depending on it.
+
+Reported-by: Robert Gladewitz <gladewitz at gmx.de>
+Signed-off-by: Eric Dumazet <eric.dumazet at gmail.com>
+---
+ net/ipv4/route.c |  107 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 107 insertions(+)
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 46af623..252c512 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -133,6 +134,9 @@ static int ip_rt_min_advmss __read_mostly	= 256;
+ static int rt_chain_length_max __read_mostly	= 20;
+ static int redirect_genid;
+ 
++static struct delayed_work expires_work;
++static unsigned long expires_ljiffies;
++
+ /*
+  *	Interface to generic destination cache.
+  */
+@@ -830,6 +834,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
+ 	return ONE;
+ }
+ 
++static void rt_check_expire(void)
++{
++	static unsigned int rover;
++	unsigned int i = rover, goal;
++	struct rtable *rth;
++	struct rtable __rcu **rthp;
++	unsigned long samples = 0;
++	unsigned long sum = 0, sum2 = 0;
++	unsigned long delta;
++	u64 mult;
++
++	delta = jiffies - expires_ljiffies;
++	expires_ljiffies = jiffies;
++	mult = ((u64)delta) << rt_hash_log;
++	if (ip_rt_gc_timeout > 1)
++		do_div(mult, ip_rt_gc_timeout);
++	goal = (unsigned int)mult;
++	if (goal > rt_hash_mask)
++		goal = rt_hash_mask + 1;
++	for (; goal > 0; goal--) {
++		unsigned long tmo = ip_rt_gc_timeout;
++		unsigned long length;
++
++		i = (i + 1) & rt_hash_mask;
++		rthp = &rt_hash_table[i].chain;
++
++		if (need_resched())
++			cond_resched();
++
++		samples++;
++
++		if (rcu_dereference_raw(*rthp) == NULL)
++			continue;
++		length = 0;
++		spin_lock_bh(rt_hash_lock_addr(i));
++		while ((rth = rcu_dereference_protected(*rthp,
++					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
++			prefetch(rth->dst.rt_next);
++			if (rt_is_expired(rth)) {
++				*rthp = rth->dst.rt_next;
++				rt_free(rth);
++				continue;
++			}
++			if (rth->dst.expires) {
++				/* Explicit expiry set: keep only until it passes, even if in use */
++				if (time_before_eq(jiffies, rth->dst.expires)) {
++nofree:
++					tmo >>= 1;
++					rthp = &rth->dst.rt_next;
++					/*
++					 * Entries on a chain that share the
++					 * same hash inputs are counted only
++					 * once, so that entries differing just
++					 * in QOS level or other non-hash input
++					 * attributes don't unfairly skew the
++					 * chain length computation.
++					 */
++					length += has_noalias(rt_hash_table[i].chain, rth);
++					continue;
++				}
++			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
++				goto nofree;
++
++			/* Unlink the aged-off entry from the chain and free it. */
++			*rthp = rth->dst.rt_next;
++			rt_free(rth);
++		}
++		spin_unlock_bh(rt_hash_lock_addr(i));
++		sum += length;
++		sum2 += length*length;
++	}
++	if (samples) {
++		unsigned long avg = sum / samples;
++		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
++		rt_chain_length_max = max_t(unsigned long,
++					ip_rt_gc_elasticity,
++					(avg + 4*sd) >> FRACT_BITS);
++	}
++	rover = i;
++}
++
++/*
++ * rt_worker_func() is the expires_work handler, run in process context: it
++ * scans part of the hash table and re-arms itself after ip_rt_gc_interval.
++ */
++static void rt_worker_func(struct work_struct *work)
++{
++	rt_check_expire();
++	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
++}
++
+ /*
+  * Perturbation of rt_genid by a small quantity [1..256]
+  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
+@@ -3179,6 +3274,13 @@ static ctl_table ipv4_route_table[] = {
+ 		.proc_handler	= proc_dointvec_jiffies,
+ 	},
+ 	{
++		.procname	= "gc_interval",
++		.data		= &ip_rt_gc_interval,
++		.maxlen		= sizeof(int),
++		.mode		= 0644,
++		.proc_handler	= proc_dointvec_jiffies,
++	},
++	{
+ 		.procname	= "redirect_load",
+ 		.data		= &ip_rt_redirect_load,
+ 		.maxlen		= sizeof(int),
+@@ -3388,6 +3490,11 @@ int __init ip_rt_init(void)
+ 	devinet_init();
+ 	ip_fib_init();
+ 
++	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
++	expires_ljiffies = jiffies;
++	schedule_delayed_work(&expires_work,
++		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
++
+ 	if (ip_rt_proc_init())
+ 		printk(KERN_ERR "Unable to create route proc files\n");
+ #ifdef CONFIG_XFRM
+
+
+--
+To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
+the body of a message to majordomo at vger.kernel.org
+More majordomo info at  http://vger.kernel.org/majordomo-info.html
+Please read the FAQ at  http://www.tux.org/lkml/
+


More information about the scm-commits mailing list