[kernel/f16] CVE-2012-3412 sfc: potential rDOS through TCP MSS option (rhbz 844714 845558)
Josh Boyer
jwboyer at fedoraproject.org
Fri Aug 3 13:53:08 UTC 2012
commit d1e55656898e3a21bde3e8980fe6fefe7f2b589f
Author: Josh Boyer <jwboyer at redhat.com>
Date: Fri Aug 3 09:30:37 2012 -0400
CVE-2012-3412 sfc: potential rDOS through TCP MSS option (rhbz 844714 845558)
kernel.spec | 15 ++-
...r-to-limit-number-of-GSO-segments-per-skb.patch | 70 +++++++++
...of-TSO-segments-and-minimum-TX-queue-size.patch | 156 ++++++++++++++++++++
tcp-Apply-device-TSO-segment-limit-earlier.patch | 124 ++++++++++++++++
4 files changed, 364 insertions(+), 1 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index da5f214..b002452 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -54,7 +54,7 @@ Summary: The Linux kernel
# For non-released -rc kernels, this will be appended after the rcX and
# gitX tags, so a 3 here would become part of release "0.rcX.gitX.3"
#
-%global baserelease 1
+%global baserelease 2
%global fedora_build %{baserelease}
# base_sublevel is the kernel version we're starting with and patching
@@ -716,6 +716,11 @@ Patch22060: CPU-hotplug-cpusets-suspend-Dont-modify-cpusets-during.patch
#rhbz 820039 843554
Patch22061: rds-set-correct-msg_namelen.patch
+#rhbz 845558 844714
+Patch22070: net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
+Patch22071: sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
+Patch22072: tcp-Apply-device-TSO-segment-limit-earlier.patch
+
# END OF PATCH DEFINITIONS
%endif
@@ -1337,6 +1342,11 @@ ApplyPatch CPU-hotplug-cpusets-suspend-Dont-modify-cpusets-during.patch
#rhbz 820039 843554
ApplyPatch rds-set-correct-msg_namelen.patch
+#rhbz 845558 844714
+ApplyPatch net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
+ApplyPatch sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
+ApplyPatch tcp-Apply-device-TSO-segment-limit-earlier.patch
+
# END OF PATCH APPLICATIONS
%endif
@@ -2035,6 +2045,9 @@ fi
# and build.
%changelog
+* Fri Aug 03 2012 Josh Boyer <jwboyer at redhat.com>
+- CVE-2012-3412 sfc: potential rDOS through TCP MSS option (rhbz 844714 845558)
+
* Mon Jul 30 2012 Dave Jones <davej at redhat.com> 3.4.7-1
- Linux 3.4.7
diff --git a/net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch b/net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
new file mode 100644
index 0000000..bebccf3
--- /dev/null
+++ b/net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
@@ -0,0 +1,70 @@
+From 30b678d844af3305cda5953467005cebb5d7b687 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <bhutchings at solarflare.com>
+Date: Mon, 30 Jul 2012 15:57:00 +0000
+Subject: [PATCH] net: Allow driver to limit number of GSO segments per skb
+
+A peer (or local user) may cause TCP to use a nominal MSS of as little
+as 88 (actual MSS of 76 with timestamps). Given that we have a
+sufficiently prodigious local sender and the peer ACKs quickly enough,
+it is nevertheless possible to grow the window for such a connection
+to the point that we will try to send just under 64K at once. This
+results in a single skb that expands to 861 segments.
+
+In some drivers with TSO support, such an skb will require hundreds of
+DMA descriptors; a substantial fraction of a TX ring or even more than
+a full ring. The TX queue selected for the skb may stall and trigger
+the TX watchdog repeatedly (since the problem skb will be retried
+after the TX reset). This particularly affects sfc, for which the
+issue is designated as CVE-2012-3412.
+
+Therefore:
+1. Add the field net_device::gso_max_segs holding the device-specific
+ limit.
+2. In netif_skb_features(), if the number of segments is too high then
+ mask out GSO features to force fall back to software GSO.
+
+Signed-off-by: Ben Hutchings <bhutchings at solarflare.com>
+Signed-off-by: David S. Miller <davem at davemloft.net>
+---
+ include/linux/netdevice.h | 2 ++
+ net/core/dev.c | 4 ++++
+ 2 files changed, 6 insertions(+), 0 deletions(-)
+
+diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
+index eb06e58..a9db4f3 100644
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -1300,6 +1300,8 @@ struct net_device {
+ /* for setting kernel sock attribute on TCP connection setup */
+ #define GSO_MAX_SIZE 65536
+ unsigned int gso_max_size;
++#define GSO_MAX_SEGS 65535
++ u16 gso_max_segs;
+
+ #ifdef CONFIG_DCB
+ /* Data Center Bridging netlink ops */
+diff --git a/net/core/dev.c b/net/core/dev.c
+index 0cb3fe8..f91abf8 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -2134,6 +2134,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
+ __be16 protocol = skb->protocol;
+ netdev_features_t features = skb->dev->features;
+
++ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
++ features &= ~NETIF_F_GSO_MASK;
++
+ if (protocol == htons(ETH_P_8021Q)) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ protocol = veh->h_vlan_encapsulated_proto;
+@@ -5986,6 +5989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ dev_net_set(dev, &init_net);
+
+ dev->gso_max_size = GSO_MAX_SIZE;
++ dev->gso_max_segs = GSO_MAX_SEGS;
+
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+--
+1.7.7.6
+
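To make the numbers in the commit message above concrete, here is a minimal sketch (plain userspace C, not kernel code) of the segment-count arithmetic: with a nominal MSS of 88 the effective payload per segment drops to 76 once timestamps are accounted for, and a send of just under 64K then splits into 861 segments, the figure quoted above. The 65436-byte payload is an assumption chosen so the arithmetic lines up with the commit message; the real value depends on the TCP window.

    /* segment_math.c - illustrates the GSO segment blow-up described in the
     * commit message above.  Build: cc -o segment_math segment_math.c
     * The 65436-byte payload is an assumption picked to match the quoted
     * figure of 861 segments.
     */
    #include <stdio.h>

    int main(void)
    {
        unsigned int mss_with_ts = 76;     /* nominal MSS 88 minus 12 bytes of TCP timestamps */
        unsigned int payload     = 65436;  /* "just under 64K" in a single skb */
        unsigned int segs        = (payload + mss_with_ts - 1) / mss_with_ts;

        printf("payload %u / mss %u -> %u GSO segments\n",
               payload, mss_with_ts, segs);   /* prints 861 */
        return 0;
    }

Each such segment needs at least one DMA descriptor (more when the input is fragmented), which is how one skb can outgrow a minimum-size TX ring and wedge the queue.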
diff --git a/sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch b/sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
new file mode 100644
index 0000000..07ec616
--- /dev/null
+++ b/sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
@@ -0,0 +1,156 @@
+From 7e6d06f0de3f74ca929441add094518ae332257c Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <bhutchings at solarflare.com>
+Date: Mon, 30 Jul 2012 15:57:44 +0000
+Subject: [PATCH] sfc: Fix maximum number of TSO segments and minimum TX queue
+ size
+
+Currently an skb requiring TSO may not fit within a minimum-size TX
+queue. The TX queue selected for the skb may stall and trigger the TX
+watchdog repeatedly (since the problem skb will be retried after the
+TX reset). This issue is designated as CVE-2012-3412.
+
+Set the maximum number of TSO segments for our devices to 100. This
+should make no difference to behaviour unless the actual MSS is less
+than about 700. Increase the minimum TX queue size accordingly to
+allow for 2 worst-case skbs, so that there will definitely be space
+to add an skb after we wake a queue.
+
+To avoid invalidating existing configurations, change
+efx_ethtool_set_ringparam() to fix up values that are too small rather
+than returning -EINVAL.
+
+Signed-off-by: Ben Hutchings <bhutchings at solarflare.com>
+Signed-off-by: David S. Miller <davem at davemloft.net>
+---
+ drivers/net/ethernet/sfc/efx.c | 6 ++++++
+ drivers/net/ethernet/sfc/efx.h | 14 ++++++++++----
+ drivers/net/ethernet/sfc/ethtool.c | 16 +++++++++++-----
+ drivers/net/ethernet/sfc/tx.c | 19 +++++++++++++++++++
+ 4 files changed, 46 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
+index 70554a1..65a8d49 100644
+--- a/drivers/net/ethernet/sfc/efx.c
++++ b/drivers/net/ethernet/sfc/efx.c
+@@ -1503,6 +1503,11 @@ static int efx_probe_all(struct efx_nic *efx)
+ goto fail2;
+ }
+
++ BUILD_BUG_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_RXQ_MIN_ENT);
++ if (WARN_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_TXQ_MIN_ENT(efx))) {
++ rc = -EINVAL;
++ goto fail3;
++ }
+ efx->rxq_entries = efx->txq_entries = EFX_DEFAULT_DMAQ_SIZE;
+
+ rc = efx_probe_filters(efx);
+@@ -2070,6 +2075,7 @@ static int efx_register_netdev(struct efx_nic *efx)
+ net_dev->irq = efx->pci_dev->irq;
+ net_dev->netdev_ops = &efx_netdev_ops;
+ SET_ETHTOOL_OPS(net_dev, &efx_ethtool_ops);
++ net_dev->gso_max_segs = EFX_TSO_MAX_SEGS;
+
+ rtnl_lock();
+
+diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
+index be8f915..70755c9 100644
+--- a/drivers/net/ethernet/sfc/efx.h
++++ b/drivers/net/ethernet/sfc/efx.h
+@@ -30,6 +30,7 @@ extern netdev_tx_t
+ efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
+ extern void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
+ extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
++extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
+
+ /* RX */
+ extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
+@@ -52,10 +53,15 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
+ #define EFX_MAX_EVQ_SIZE 16384UL
+ #define EFX_MIN_EVQ_SIZE 512UL
+
+-/* The smallest [rt]xq_entries that the driver supports. Callers of
+- * efx_wake_queue() assume that they can subsequently send at least one
+- * skb. Falcon/A1 may require up to three descriptors per skb_frag. */
+-#define EFX_MIN_RING_SIZE (roundup_pow_of_two(2 * 3 * MAX_SKB_FRAGS))
++/* Maximum number of TCP segments we support for soft-TSO */
++#define EFX_TSO_MAX_SEGS 100
++
++/* The smallest [rt]xq_entries that the driver supports. RX minimum
++ * is a bit arbitrary. For TX, we must have space for at least 2
++ * TSO skbs.
++ */
++#define EFX_RXQ_MIN_ENT 128U
++#define EFX_TXQ_MIN_ENT(efx) (2 * efx_tx_max_skb_descs(efx))
+
+ /* Filters */
+ extern int efx_probe_filters(struct efx_nic *efx);
+diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
+index 10536f9..8cba2df 100644
+--- a/drivers/net/ethernet/sfc/ethtool.c
++++ b/drivers/net/ethernet/sfc/ethtool.c
+@@ -680,21 +680,27 @@ static int efx_ethtool_set_ringparam(struct net_device *net_dev,
+ struct ethtool_ringparam *ring)
+ {
+ struct efx_nic *efx = netdev_priv(net_dev);
++ u32 txq_entries;
+
+ if (ring->rx_mini_pending || ring->rx_jumbo_pending ||
+ ring->rx_pending > EFX_MAX_DMAQ_SIZE ||
+ ring->tx_pending > EFX_MAX_DMAQ_SIZE)
+ return -EINVAL;
+
+- if (ring->rx_pending < EFX_MIN_RING_SIZE ||
+- ring->tx_pending < EFX_MIN_RING_SIZE) {
++ if (ring->rx_pending < EFX_RXQ_MIN_ENT) {
+ netif_err(efx, drv, efx->net_dev,
+- "TX and RX queues cannot be smaller than %ld\n",
+- EFX_MIN_RING_SIZE);
++ "RX queues cannot be smaller than %u\n",
++ EFX_RXQ_MIN_ENT);
+ return -EINVAL;
+ }
+
+- return efx_realloc_channels(efx, ring->rx_pending, ring->tx_pending);
++ txq_entries = max(ring->tx_pending, EFX_TXQ_MIN_ENT(efx));
++ if (txq_entries != ring->tx_pending)
++ netif_warn(efx, drv, efx->net_dev,
++ "increasing TX queue size to minimum of %u\n",
++ txq_entries);
++
++ return efx_realloc_channels(efx, ring->rx_pending, txq_entries);
+ }
+
+ static int efx_ethtool_set_pauseparam(struct net_device *net_dev,
+diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
+index 9b225a7..1871343 100644
+--- a/drivers/net/ethernet/sfc/tx.c
++++ b/drivers/net/ethernet/sfc/tx.c
+@@ -119,6 +119,25 @@ efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr)
+ return len;
+ }
+
++unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
++{
++ /* Header and payload descriptor for each output segment, plus
++ * one for every input fragment boundary within a segment
++ */
++ unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
++
++ /* Possibly one more per segment for the alignment workaround */
++ if (EFX_WORKAROUND_5391(efx))
++ max_descs += EFX_TSO_MAX_SEGS;
++
++ /* Possibly more for PCIe page boundaries within input fragments */
++ if (PAGE_SIZE > EFX_PAGE_SIZE)
++ max_descs += max_t(unsigned int, MAX_SKB_FRAGS,
++ DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE));
++
++ return max_descs;
++}
++
+ /*
+ * Add a socket buffer to a TX queue
+ *
+--
+1.7.7.6
+
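The sfc patch above bounds that worst case by fixing EFX_TSO_MAX_SEGS at 100 and sizing the minimum TX ring for two worst-case skbs. A back-of-the-envelope sketch of that descriptor budget follows, assuming 4 KiB pages (so MAX_SKB_FRAGS is 17) and leaving out the 5391 alignment workaround and the PCIe page-boundary term handled in efx_tx_max_skb_descs():

    /* descriptor_budget.c - rough, simplified version of the budget computed
     * by efx_tx_max_skb_descs() in the patch above.  MAX_SKB_FRAGS = 17 is an
     * assumption (4 KiB pages); workaround and PCIe terms are omitted.
     */
    #include <stdio.h>

    #define EFX_TSO_MAX_SEGS 100
    #define MAX_SKB_FRAGS    17

    int main(void)
    {
        /* header + payload descriptor per output segment, plus one for every
         * input fragment boundary within a segment */
        unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
        /* the patch sizes the minimum TX ring for two such skbs */
        unsigned int min_txq   = 2 * max_descs;

        printf("worst-case descriptors per TSO skb: %u\n", max_descs); /* 217 */
        printf("minimum TX queue entries: %u\n", min_txq);             /* 434 */
        return 0;
    }

With the limit in place the driver can guarantee there is always room to enqueue another skb after waking a stalled queue, which is why existing too-small ring configurations are fixed up rather than rejected.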
diff --git a/tcp-Apply-device-TSO-segment-limit-earlier.patch b/tcp-Apply-device-TSO-segment-limit-earlier.patch
new file mode 100644
index 0000000..d6fa7f4
--- /dev/null
+++ b/tcp-Apply-device-TSO-segment-limit-earlier.patch
@@ -0,0 +1,124 @@
+From 1485348d2424e1131ea42efc033cbd9366462b01 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <bhutchings at solarflare.com>
+Date: Mon, 30 Jul 2012 16:11:42 +0000
+Subject: [PATCH] tcp: Apply device TSO segment limit earlier
+
+Cache the device gso_max_segs in sock::sk_gso_max_segs and use it to
+limit the size of TSO skbs. This avoids the need to fall back to
+software GSO for local TCP senders.
+
+Signed-off-by: Ben Hutchings <bhutchings at solarflare.com>
+Signed-off-by: David S. Miller <davem at davemloft.net>
+---
+ include/net/sock.h | 2 ++
+ net/core/sock.c | 1 +
+ net/ipv4/tcp.c | 4 +++-
+ net/ipv4/tcp_cong.c | 3 ++-
+ net/ipv4/tcp_output.c | 21 ++++++++++++---------
+ 5 files changed, 20 insertions(+), 11 deletions(-)
+
+--- linux-3.4.noarch.orig/include/net/sock.h
++++ linux-3.4.noarch/include/net/sock.h
+@@ -216,6 +216,7 @@ struct cg_proto;
+ * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
+ * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
+ * @sk_gso_max_size: Maximum GSO segment size to build
++ * @sk_gso_max_segs: Maximum number of GSO segments
+ * @sk_lingertime: %SO_LINGER l_linger setting
+ * @sk_backlog: always used with the per-socket spinlock held
+ * @sk_callback_lock: used with the callbacks in the end of this struct
+@@ -335,6 +336,7 @@ struct sock {
+ netdev_features_t sk_route_nocaps;
+ int sk_gso_type;
+ unsigned int sk_gso_max_size;
++ u16 sk_gso_max_segs;
+ int sk_rcvlowat;
+ unsigned long sk_lingertime;
+ struct sk_buff_head sk_error_queue;
+--- linux-3.4.noarch.orig/net/core/sock.c
++++ linux-3.4.noarch/net/core/sock.c
+@@ -1411,6 +1411,7 @@ void sk_setup_caps(struct sock *sk, stru
+ } else {
+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ sk->sk_gso_max_size = dst->dev->gso_max_size;
++ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+ }
+ }
+ }
+--- linux-3.4.noarch.orig/net/ipv4/tcp.c
++++ linux-3.4.noarch/net/ipv4/tcp.c
+@@ -740,7 +740,9 @@ static unsigned int tcp_xmit_size_goal(s
+ old_size_goal + mss_now > xmit_size_goal)) {
+ xmit_size_goal = old_size_goal;
+ } else {
+- tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
++ tp->xmit_size_goal_segs =
++ min_t(u16, xmit_size_goal / mss_now,
++ sk->sk_gso_max_segs);
+ xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
+ }
+ }
+--- linux-3.4.noarch.orig/net/ipv4/tcp_cong.c
++++ linux-3.4.noarch/net/ipv4/tcp_cong.c
+@@ -291,7 +291,8 @@ int tcp_is_cwnd_limited(const struct soc
+ left = tp->snd_cwnd - in_flight;
+ if (sk_can_gso(sk) &&
+ left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
+- left * tp->mss_cache < sk->sk_gso_max_size)
++ left * tp->mss_cache < sk->sk_gso_max_size &&
++ left < sk->sk_gso_max_segs)
+ return 1;
+ return left <= tcp_max_tso_deferred_mss(tp);
+ }
+--- linux-3.4.noarch.orig/net/ipv4/tcp_output.c
++++ linux-3.4.noarch/net/ipv4/tcp_output.c
+@@ -1318,21 +1318,21 @@ static void tcp_cwnd_validate(struct soc
+ * when we would be allowed to send the split-due-to-Nagle skb fully.
+ */
+ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
+- unsigned int mss_now, unsigned int cwnd)
++ unsigned int mss_now, unsigned int max_segs)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+- u32 needed, window, cwnd_len;
++ u32 needed, window, max_len;
+
+ window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+- cwnd_len = mss_now * cwnd;
++ max_len = mss_now * max_segs;
+
+- if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
+- return cwnd_len;
++ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
++ return max_len;
+
+ needed = min(skb->len, window);
+
+- if (cwnd_len <= needed)
+- return cwnd_len;
++ if (max_len <= needed)
++ return max_len;
+
+ return needed - needed % mss_now;
+ }
+@@ -1560,7 +1560,8 @@ static int tcp_tso_should_defer(struct s
+ limit = min(send_win, cong_win);
+
+ /* If a full-sized TSO skb can be sent, do it. */
+- if (limit >= sk->sk_gso_max_size)
++ if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
++ sk->sk_gso_max_segs * tp->mss_cache))
+ goto send_now;
+
+ /* Middle in queue won't get any more data, full sendable already? */
+@@ -1786,7 +1787,9 @@ static int tcp_write_xmit(struct sock *s
+ limit = mss_now;
+ if (tso_segs > 1 && !tcp_urg_mode(tp))
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+- cwnd_quota);
++ min_t(unsigned int,
++ cwnd_quota,
++ sk->sk_gso_max_segs));
+
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
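The core of the tcp patch is the clamp added to tcp_xmit_size_goal(): the size goal is expressed in segments and bounded by the socket's cached sk_gso_max_segs, so oversized skbs are never built in the first place. A standalone sketch of that clamp follows; the numbers (a 64K goal, the sfc limit of 100 segments, the pathological MSS of 76) are assumptions for illustration.

    /* size_goal_clamp.c - standalone illustration of the clamp added to
     * tcp_xmit_size_goal() above.  All values are assumed for illustration.
     */
    #include <stdio.h>

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned int mss_now      = 76;     /* pathological MSS from the CVE description */
        unsigned int xmit_goal    = 65536;  /* goal capped by sk_gso_max_size */
        unsigned int gso_max_segs = 100;    /* what sfc now advertises (EFX_TSO_MAX_SEGS) */

        unsigned int goal_segs = min_u(xmit_goal / mss_now, gso_max_segs);
        unsigned int size_goal = goal_segs * mss_now;

        /* without the clamp: 862 segments (65512 bytes); with it: 100 segments */
        printf("size goal: %u segments, %u bytes\n", goal_segs, size_goal);
        return 0;
    }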