[kernel/f15] Reinstate the route cache garbage collector.
Dave Jones
davej at fedoraproject.org
Wed Dec 21 18:55:06 UTC 2011
commit 3454da31210f9ef98f8369013dc34956e9bafacf
Author: Dave Jones <davej at redhat.com>
Date: Wed Dec 21 13:37:14 2011 -0500
Reinstate the route cache garbage collector.
kernel.spec | 7 ++
route-cache-garbage-collector.patch | 201 +++++++++++++++++++++++++++++++++++
2 files changed, 208 insertions(+), 0 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index 42b1352..3bf1d45 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -724,6 +724,8 @@ Patch21047: iwlwifi-allow-to-switch-to-HT40-if-not-associated.patch
#rhbz 741117
Patch21048: b44-Use-dev_kfree_skb_irq-in-b44_tx.patch
+Patch22000: route-cache-garbage-collector.patch
+
%endif
BuildRoot: %{_tmppath}/kernel-%{KVERREL}-root
@@ -1343,6 +1345,8 @@ ApplyPatch iwlwifi-allow-to-switch-to-HT40-if-not-associated.patch
#rhbz 741117
ApplyPatch b44-Use-dev_kfree_skb_irq-in-b44_tx.patch
+ApplyPatch route-cache-garbage-collector.patch
+
# END OF PATCH APPLICATIONS
%endif
@@ -1990,6 +1994,9 @@ fi
# and build.
%changelog
+* Wed Dec 21 2011 Dave Jones <davej at redhat.com> 2.6.41.5-5
+- Reinstate the route cache garbage collector.
+
* Tue Dec 20 2011 Josh Boyer <jwboyer at redhat.com>
- Fix config options in arm configs after latest commits
- Backport upstream fix for b44_poll oops (rhbz #741117)
diff --git a/route-cache-garbage-collector.patch b/route-cache-garbage-collector.patch
new file mode 100644
index 0000000..6ea6e5e
--- /dev/null
+++ b/route-cache-garbage-collector.patch
@@ -0,0 +1,201 @@
+Message-ID: <1324461072.2728.19.camel at edumazet-HP-Compaq-6005-Pro-SFF-PC>
+Subject: Re: Kernel-DOS error in arp mechanism =?UTF-8?Q?=E2=80=93?= no delete off incomplete arp adresses
+From: Eric Dumazet <eric.dumazet at gmail.com>
+To: David Miller <davem at davemloft.net>
+Cc: richard.weinberger at gmail.com, gladewitz at gmx.de,
+ linux-kernel at vger.kernel.org, netdev at vger.kernel.org
+Date: Wed, 21 Dec 2011 10:51:12 +0100
+In-Reply-To: <20111221.030727.1528369698756365464.davem at davemloft.net>
+References: <4EEC5286.3070408 at gmx.de>
+ <CAFLxGvxjLCyMCPXtpm7a7RaOL4A4=bhCLPKD=FVAc8xOdx_CsQ at mail.gmail.com>
+ <1324453467.2610.20.camel at edumazet-laptop>
+ <20111221.030727.1528369698756365464.davem at davemloft.net>
+Content-Type: text/plain; charset="UTF-8"
+Content-Transfer-Encoding: 8bit
+Sender: linux-kernel-owner at vger.kernel.org
+List-ID: <linux-kernel.vger.kernel.org>
+
+Le mercredi 21 décembre 2011 à 03:07 -0500, David Miller a écrit :
+> From: Eric Dumazet <eric.dumazet at gmail.com>
+> Date: Wed, 21 Dec 2011 08:44:27 +0100
+>
+> > David, I suggest we add back the garbage collector for current kernels,
+> > we'll remove it when the route cache really disappears?
+> >
+> > I'll send a patch today.
+>
+> Yes, it's the best idea.
+>
+> We can actually remove it again as early as when route neighs
+> are ref-less.
+
+Here is the patch I successfully tested in the neighbour stress
+situation. This is a stable candidate (2.6.39+)
+
+Thanks !
+
+[PATCH] ipv4: reintroduce route cache garbage collector
+
+Commit 2c8cec5c10b (ipv4: Cache learned PMTU information in inetpeer)
+removed the IP route cache garbage collector a bit too soon, as this gc
+was responsible for cleaning up expired routes and releasing their
+neighbour references.
+
+As pointed out by Robert Gladewitz, recent kernels can fill and exhaust
+their neighbour cache.
+
+Reintroduce the garbage collection, since we'll have to wait until our
+neighbour lookups become refcount-less before we can stop depending on it.
+
+Reported-by: Robert Gladewitz <gladewitz at gmx.de>
+Signed-off-by: Eric Dumazet <eric.dumazet at gmail.com>
+---
+ net/ipv4/route.c | 107 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 107 insertions(+)
+
+diff --git a/net/ipv4/route.c b/net/ipv4/route.c
+index 46af623..252c512 100644
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -133,6 +134,9 @@ static int ip_rt_min_advmss __read_mostly = 256;
+ static int rt_chain_length_max __read_mostly = 20;
+ static int redirect_genid;
+
++static struct delayed_work expires_work;
++static unsigned long expires_ljiffies;
++
+ /*
+ * Interface to generic destination cache.
+ */
+@@ -830,6 +834,97 @@ static int has_noalias(const struct rtable *head, const struct rtable *rth)
+ return ONE;
+ }
+
++static void rt_check_expire(void)
++{
++ static unsigned int rover;
++ unsigned int i = rover, goal;
++ struct rtable *rth;
++ struct rtable __rcu **rthp;
++ unsigned long samples = 0;
++ unsigned long sum = 0, sum2 = 0;
++ unsigned long delta;
++ u64 mult;
++
++ delta = jiffies - expires_ljiffies;
++ expires_ljiffies = jiffies;
++ mult = ((u64)delta) << rt_hash_log;
++ if (ip_rt_gc_timeout > 1)
++ do_div(mult, ip_rt_gc_timeout);
++ goal = (unsigned int)mult;
++ if (goal > rt_hash_mask)
++ goal = rt_hash_mask + 1;
++ for (; goal > 0; goal--) {
++ unsigned long tmo = ip_rt_gc_timeout;
++ unsigned long length;
++
++ i = (i + 1) & rt_hash_mask;
++ rthp = &rt_hash_table[i].chain;
++
++ if (need_resched())
++ cond_resched();
++
++ samples++;
++
++ if (rcu_dereference_raw(*rthp) == NULL)
++ continue;
++ length = 0;
++ spin_lock_bh(rt_hash_lock_addr(i));
++ while ((rth = rcu_dereference_protected(*rthp,
++ lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
++ prefetch(rth->dst.rt_next);
++ if (rt_is_expired(rth)) {
++ *rthp = rth->dst.rt_next;
++ rt_free(rth);
++ continue;
++ }
++ if (rth->dst.expires) {
++ /* Entry is expired even if it is in use */
++ if (time_before_eq(jiffies, rth->dst.expires)) {
++nofree:
++ tmo >>= 1;
++ rthp = &rth->dst.rt_next;
++ /*
++ * We only count entries on
++ * a chain with equal hash inputs once
++ * so that entries for different QOS
++ * levels, and other non-hash input
++ * attributes don't unfairly skew
++ * the length computation
++ */
++ length += has_noalias(rt_hash_table[i].chain, rth);
++ continue;
++ }
++ } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
++ goto nofree;
++
++ /* Cleanup aged off entries. */
++ *rthp = rth->dst.rt_next;
++ rt_free(rth);
++ }
++ spin_unlock_bh(rt_hash_lock_addr(i));
++ sum += length;
++ sum2 += length*length;
++ }
++ if (samples) {
++ unsigned long avg = sum / samples;
++ unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
++ rt_chain_length_max = max_t(unsigned long,
++ ip_rt_gc_elasticity,
++ (avg + 4*sd) >> FRACT_BITS);
++ }
++ rover = i;
++}
++
++/*
++ * rt_worker_func() is run in process context.
++ * We call rt_check_expire() to scan part of the hash table.
++ */
++static void rt_worker_func(struct work_struct *work)
++{
++ rt_check_expire();
++ schedule_delayed_work(&expires_work, ip_rt_gc_interval);
++}
++
+ /*
+ * Perturbation of rt_genid by a small quantity [1..256]
+ * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
+@@ -3179,6 +3274,13 @@ static ctl_table ipv4_route_table[] = {
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
++ .procname = "gc_interval",
++ .data = &ip_rt_gc_interval,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_jiffies,
++ },
++ {
+ .procname = "redirect_load",
+ .data = &ip_rt_redirect_load,
+ .maxlen = sizeof(int),
+@@ -3388,6 +3490,11 @@ int __init ip_rt_init(void)
+ devinet_init();
+ ip_fib_init();
+
++ INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
++ expires_ljiffies = jiffies;
++ schedule_delayed_work(&expires_work,
++ net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
++
+ if (ip_rt_proc_init())
+ printk(KERN_ERR "Unable to create route proc files\n");
+ #ifdef CONFIG_XFRM
+
+
+--
+To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
+the body of a message to majordomo at vger.kernel.org
+More majordomo info at http://vger.kernel.org/majordomo-info.html
+Please read the FAQ at http://www.tux.org/lkml/
+
More information about the scm-commits
mailing list