[kernel/f16] CVE-2012-3412 sfc: potential rDOS through TCP MSS option (rhbz 844714 845558)
Josh Boyer
jwboyer at fedoraproject.org
Fri Aug 3 13:53:08 UTC 2012
commit d1e55656898e3a21bde3e8980fe6fefe7f2b589f
Author: Josh Boyer <jwboyer at redhat.com>
Date: Fri Aug 3 09:30:37 2012 -0400
CVE-2012-3412 sfc: potential rDOS through TCP MSS option (rhbz 844714 845558)
kernel.spec | 15 ++-
...r-to-limit-number-of-GSO-segments-per-skb.patch | 70 +++++++++
...of-TSO-segments-and-minimum-TX-queue-size.patch | 156 ++++++++++++++++++++
tcp-Apply-device-TSO-segment-limit-earlier.patch | 124 ++++++++++++++++
4 files changed, 364 insertions(+), 1 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index da5f214..b002452 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -54,7 +54,7 @@ Summary: The Linux kernel
# For non-released -rc kernels, this will be appended after the rcX and
# gitX tags, so a 3 here would become part of release "0.rcX.gitX.3"
#
-%global baserelease 1
+%global baserelease 2
%global fedora_build %{baserelease}
# base_sublevel is the kernel version we're starting with and patching
@@ -716,6 +716,11 @@ Patch22060: CPU-hotplug-cpusets-suspend-Dont-modify-cpusets-during.patch
#rhbz 820039 843554
Patch22061: rds-set-correct-msg_namelen.patch
+#rhbz 845558 844714
+Patch22070: net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
+Patch22071: sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
+Patch22072: tcp-Apply-device-TSO-segment-limit-earlier.patch
+
# END OF PATCH DEFINITIONS
%endif
@@ -1337,6 +1342,11 @@ ApplyPatch CPU-hotplug-cpusets-suspend-Dont-modify-cpusets-during.patch
#rhbz 820039 843554
ApplyPatch rds-set-correct-msg_namelen.patch
+#rhbz 845558 844714
+ApplyPatch net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
+ApplyPatch sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
+ApplyPatch tcp-Apply-device-TSO-segment-limit-earlier.patch
+
# END OF PATCH APPLICATIONS
%endif
@@ -2035,6 +2045,9 @@ fi
# and build.
%changelog
+* Fri Aug 03 2012 Josh Boyer <jwboyer at redhat.com>
+- CVE-2012-3412 sfc: potential rDOS through TCP MSS option (rhbz 844714 845558)
+
* Mon Jul 30 2012 Dave Jones <davej at redhat.com> 3.4.7-1
- Linux 3.4.7
diff --git a/net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch b/net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
new file mode 100644
index 0000000..bebccf3
--- /dev/null
+++ b/net-Allow-driver-to-limit-number-of-GSO-segments-per-skb.patch
@@ -0,0 +1,70 @@
+From 30b678d844af3305cda5953467005cebb5d7b687 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <bhutchings at solarflare.com>
+Date: Mon, 30 Jul 2012 15:57:00 +0000
+Subject: [PATCH] net: Allow driver to limit number of GSO segments per skb
+
+A peer (or local user) may cause TCP to use a nominal MSS of as little
+as 88 (actual MSS of 76 with timestamps). Given that we have a
+sufficiently prodigious local sender and the peer ACKs quickly enough,
+it is nevertheless possible to grow the window for such a connection
+to the point that we will try to send just under 64K at once. This
+results in a single skb that expands to 861 segments.
+
+In some drivers with TSO support, such an skb will require hundreds of
+DMA descriptors; a substantial fraction of a TX ring or even more than
+a full ring. The TX queue selected for the skb may stall and trigger
+the TX watchdog repeatedly (since the problem skb will be retried
+after the TX reset). This particularly affects sfc, for which the
+issue is designated as CVE-2012-3412.
+
+Therefore:
+1. Add the field net_device::gso_max_segs holding the device-specific
+ limit.
+2. In netif_skb_features(), if the number of segments is too high then
+ mask out GSO features to force fall back to software GSO.
+
+Signed-off-by: Ben Hutchings <bhutchings at solarflare.com>
+Signed-off-by: David S. Miller <davem at davemloft.net>
+---
+ include/linux/netdevice.h | 2 ++
+ net/core/dev.c | 4 ++++
+ 2 files changed, 6 insertions(+), 0 deletions(-)
+
+diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
+index eb06e58..a9db4f3 100644
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -1300,6 +1300,8 @@ struct net_device {
+ /* for setting kernel sock attribute on TCP connection setup */
+ #define GSO_MAX_SIZE 65536
+ unsigned int gso_max_size;
++#define GSO_MAX_SEGS 65535
++ u16 gso_max_segs;
+
+ #ifdef CONFIG_DCB
+ /* Data Center Bridging netlink ops */
+diff --git a/net/core/dev.c b/net/core/dev.c
+index 0cb3fe8..f91abf8 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -2134,6 +2134,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
+ __be16 protocol = skb->protocol;
+ netdev_features_t features = skb->dev->features;
+
++ if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
++ features &= ~NETIF_F_GSO_MASK;
++
+ if (protocol == htons(ETH_P_8021Q)) {
+ struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
+ protocol = veh->h_vlan_encapsulated_proto;
+@@ -5986,6 +5989,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
+ dev_net_set(dev, &init_net);
+
+ dev->gso_max_size = GSO_MAX_SIZE;
++ dev->gso_max_segs = GSO_MAX_SEGS;
+
+ INIT_LIST_HEAD(&dev->napi_list);
+ INIT_LIST_HEAD(&dev->unreg_list);
+--
+1.7.7.6
+
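To make the numbers in the commit message above concrete, here is a minimal sketch (plain userspace C, not kernel code) of the segment-count arithmetic: with a nominal MSS of 88 the effective payload per segment drops to 76 once timestamps are accounted for, and a send of just under 64K then splits into 861 segments, the figure quoted above. The 65436-byte payload is an assumption chosen so the arithmetic lines up with the commit message; the real value depends on the TCP window.

    /* segment_math.c - illustrates the GSO segment blow-up described in the
     * commit message above.  Build: cc -o segment_math segment_math.c
     * The 65436-byte payload is an assumption picked to match the quoted
     * figure of 861 segments.
     */
    #include <stdio.h>

    int main(void)
    {
        unsigned int mss_with_ts = 76;     /* nominal MSS 88 minus 12 bytes of TCP timestamps */
        unsigned int payload     = 65436;  /* "just under 64K" in a single skb */
        unsigned int segs        = (payload + mss_with_ts - 1) / mss_with_ts;

        printf("payload %u / mss %u -> %u GSO segments\n",
               payload, mss_with_ts, segs);   /* prints 861 */
        return 0;
    }

Each such segment needs at least one DMA descriptor (more when the input is fragmented), which is how one skb can outgrow a minimum-size TX ring and wedge the queue.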
diff --git a/sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch b/sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
new file mode 100644
index 0000000..07ec616
--- /dev/null
+++ b/sfc-Fix-maximum-number-of-TSO-segments-and-minimum-TX-queue-size.patch
@@ -0,0 +1,156 @@
+From 7e6d06f0de3f74ca929441add094518ae332257c Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <bhutchings at solarflare.com>
+Date: Mon, 30 Jul 2012 15:57:44 +0000
+Subject: [PATCH] sfc: Fix maximum number of TSO segments and minimum TX queue
+ size
+
+Currently an skb requiring TSO may not fit within a minimum-size TX
+queue. The TX queue selected for the skb may stall and trigger the TX
+watchdog repeatedly (since the problem skb will be retried after the
+TX reset). This issue is designated as CVE-2012-3412.
+
+Set the maximum number of TSO segments for our devices to 100. This
+should make no difference to behaviour unless the actual MSS is less
+than about 700. Increase the minimum TX queue size accordingly to
+allow for 2 worst-case skbs, so that there will definitely be space
+to add an skb after we wake a queue.
+
+To avoid invalidating existing configurations, change
+efx_ethtool_set_ringparam() to fix up values that are too small rather
+than returning -EINVAL.
+
+Signed-off-by: Ben Hutchings <bhutchings at solarflare.com>
+Signed-off-by: David S. Miller <davem at davemloft.net>
+---
+ drivers/net/ethernet/sfc/efx.c | 6 ++++++
+ drivers/net/ethernet/sfc/efx.h | 14 ++++++++++----
+ drivers/net/ethernet/sfc/ethtool.c | 16 +++++++++++-----
+ drivers/net/ethernet/sfc/tx.c | 19 +++++++++++++++++++
+ 4 files changed, 46 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
+index 70554a1..65a8d49 100644
+--- a/drivers/net/ethernet/sfc/efx.c
++++ b/drivers/net/ethernet/sfc/efx.c
+@@ -1503,6 +1503,11 @@ static int efx_probe_all(struct efx_nic *efx)
+ goto fail2;
+ }
+
++ BUILD_BUG_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_RXQ_MIN_ENT);
++ if (WARN_ON(EFX_DEFAULT_DMAQ_SIZE < EFX_TXQ_MIN_ENT(efx))) {
++ rc = -EINVAL;
++ goto fail3;
++ }
+ efx->rxq_entries = efx->txq_entries = EFX_DEFAULT_DMAQ_SIZE;
+
+ rc = efx_probe_filters(efx);
+@@ -2070,6 +2075,7 @@ static int efx_register_netdev(struct efx_nic *efx)
+ net_dev->irq = efx->pci_dev->irq;
+ net_dev->netdev_ops = &efx_netdev_ops;
+ SET_ETHTOOL_OPS(net_dev, &efx_ethtool_ops);
++ net_dev->gso_max_segs = EFX_TSO_MAX_SEGS;
+
+ rtnl_lock();
+
+diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
+index be8f915..70755c9 100644
+--- a/drivers/net/ethernet/sfc/efx.h
++++ b/drivers/net/ethernet/sfc/efx.h
+@@ -30,6 +30,7 @@ extern netdev_tx_t
+ efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
+ extern void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
+ extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
++extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
+
+ /* RX */
+ extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
+@@ -52,10 +53,15 @@ extern void efx_schedule_slow_fill(struct efx_rx_queue *rx_queue);
+ #define EFX_MAX_EVQ_SIZE 16384UL
+ #define EFX_MIN_EVQ_SIZE 512UL
+
+-/* The smallest [rt]xq_entries that the driver supports. Callers of
+- * efx_wake_queue() assume that they can subsequently send at least one
+- * skb. Falcon/A1 may require up to three descriptors per skb_frag. */
+-#define EFX_MIN_RING_SIZE (roundup_pow_of_two(2 * 3 * MAX_SKB_FRAGS))
++/* Maximum number of TCP segments we support for soft-TSO */
++#define EFX_TSO_MAX_SEGS 100
++
++/* The smallest [rt]xq_entries that the driver supports. RX minimum
++ * is a bit arbitrary. For TX, we must have space for at least 2
++ * TSO skbs.
++ */
++#define EFX_RXQ_MIN_ENT 128U
++#define EFX_TXQ_MIN_ENT(efx) (2 * efx_tx_max_skb_descs(efx))
+
+ /* Filters */
+ extern int efx_probe_filters(struct efx_nic *efx);
+diff --git a/drivers/net/ethernet/sfc/ethtool.c b/drivers/net/ethernet/sfc/ethtool.c
+index 10536f9..8cba2df 100644
+--- a/drivers/net/ethernet/sfc/ethtool.c
++++ b/drivers/net/ethernet/sfc/ethtool.c
+@@ -680,21 +680,27 @@ static int efx_ethtool_set_ringparam(struct net_device *net_dev,
+ struct ethtool_ringparam *ring)
+ {
+ struct efx_nic *efx = netdev_priv(net_dev);
++ u32 txq_entries;
+
+ if (ring->rx_mini_pending || ring->rx_jumbo_pending ||
+ ring->rx_pending > EFX_MAX_DMAQ_SIZE ||
+ ring->tx_pending > EFX_MAX_DMAQ_SIZE)
+ return -EINVAL;
+
+- if (ring->rx_pending < EFX_MIN_RING_SIZE ||
+- ring->tx_pending < EFX_MIN_RING_SIZE) {
++ if (ring->rx_pending < EFX_RXQ_MIN_ENT) {
+ netif_err(efx, drv, efx->net_dev,
+- "TX and RX queues cannot be smaller than %ld\n",
+- EFX_MIN_RING_SIZE);
++ "RX queues cannot be smaller than %u\n",
++ EFX_RXQ_MIN_ENT);
+ return -EINVAL;
+ }
+
+- return efx_realloc_channels(efx, ring->rx_pending, ring->tx_pending);
++ txq_entries = max(ring->tx_pending, EFX_TXQ_MIN_ENT(efx));
++ if (txq_entries != ring->tx_pending)
++ netif_warn(efx, drv, efx->net_dev,
++ "increasing TX queue size to minimum of %u\n",
++ txq_entries);
++
++ return efx_realloc_channels(efx, ring->rx_pending, txq_entries);
+ }
+
+ static int efx_ethtool_set_pauseparam(struct net_device *net_dev,
+diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
+index 9b225a7..1871343 100644
+--- a/drivers/net/ethernet/sfc/tx.c
++++ b/drivers/net/ethernet/sfc/tx.c
+@@ -119,6 +119,25 @@ efx_max_tx_len(struct efx_nic *efx, dma_addr_t dma_addr)
+ return len;
+ }
+
++unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
++{
++ /* Header and payload descriptor for each output segment, plus
++ * one for every input fragment boundary within a segment
++ */
++ unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
++
++ /* Possibly one more per segment for the alignment workaround */
++ if (EFX_WORKAROUND_5391(efx))
++ max_descs += EFX_TSO_MAX_SEGS;
++
++ /* Possibly more for PCIe page boundaries within input fragments */
++ if (PAGE_SIZE > EFX_PAGE_SIZE)
++ max_descs += max_t(unsigned int, MAX_SKB_FRAGS,
++ DIV_ROUND_UP(GSO_MAX_SIZE, EFX_PAGE_SIZE));
++
++ return max_descs;
++}
++
+ /*
+ * Add a socket buffer to a TX queue
+ *
+--
+1.7.7.6
+
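The sfc patch above bounds that worst case by fixing EFX_TSO_MAX_SEGS at 100 and sizing the minimum TX ring for two worst-case skbs. A back-of-the-envelope sketch of that descriptor budget follows, assuming 4 KiB pages (so MAX_SKB_FRAGS is 17) and leaving out the 5391 alignment workaround and the PCIe page-boundary term handled in efx_tx_max_skb_descs():

    /* descriptor_budget.c - rough, simplified version of the budget computed
     * by efx_tx_max_skb_descs() in the patch above.  MAX_SKB_FRAGS = 17 is an
     * assumption (4 KiB pages); workaround and PCIe terms are omitted.
     */
    #include <stdio.h>

    #define EFX_TSO_MAX_SEGS 100
    #define MAX_SKB_FRAGS    17

    int main(void)
    {
        /* header + payload descriptor per output segment, plus one for every
         * input fragment boundary within a segment */
        unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
        /* the patch sizes the minimum TX ring for two such skbs */
        unsigned int min_txq   = 2 * max_descs;

        printf("worst-case descriptors per TSO skb: %u\n", max_descs); /* 217 */
        printf("minimum TX queue entries: %u\n", min_txq);             /* 434 */
        return 0;
    }

With the limit in place the driver can guarantee there is always room to enqueue another skb after waking a stalled queue, which is why existing too-small ring configurations are fixed up rather than rejected.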
diff --git a/tcp-Apply-device-TSO-segment-limit-earlier.patch b/tcp-Apply-device-TSO-segment-limit-earlier.patch
new file mode 100644
index 0000000..d6fa7f4
--- /dev/null
+++ b/tcp-Apply-device-TSO-segment-limit-earlier.patch
@@ -0,0 +1,124 @@
+From 1485348d2424e1131ea42efc033cbd9366462b01 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <bhutchings at solarflare.com>
+Date: Mon, 30 Jul 2012 16:11:42 +0000
+Subject: [PATCH] tcp: Apply device TSO segment limit earlier
+
+Cache the device gso_max_segs in sock::sk_gso_max_segs and use it to
+limit the size of TSO skbs. This avoids the need to fall back to
+software GSO for local TCP senders.
+
+Signed-off-by: Ben Hutchings <bhutchings at solarflare.com>
+Signed-off-by: David S. Miller <davem at davemloft.net>
+---
+ include/net/sock.h | 2 ++
+ net/core/sock.c | 1 +
+ net/ipv4/tcp.c | 4 +++-
+ net/ipv4/tcp_cong.c | 3 ++-
+ net/ipv4/tcp_output.c | 21 ++++++++++++---------
+ 5 files changed, 20 insertions(+), 11 deletions(-)
+
+--- linux-3.4.noarch.orig/include/net/sock.h
++++ linux-3.4.noarch/include/net/sock.h
+@@ -216,6 +216,7 @@ struct cg_proto;
+ * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
+ * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
+ * @sk_gso_max_size: Maximum GSO segment size to build
++ * @sk_gso_max_segs: Maximum number of GSO segments
+ * @sk_lingertime: %SO_LINGER l_linger setting
+ * @sk_backlog: always used with the per-socket spinlock held
+ * @sk_callback_lock: used with the callbacks in the end of this struct
+@@ -335,6 +336,7 @@ struct sock {
+ netdev_features_t sk_route_nocaps;
+ int sk_gso_type;
+ unsigned int sk_gso_max_size;
++ u16 sk_gso_max_segs;
+ int sk_rcvlowat;
+ unsigned long sk_lingertime;
+ struct sk_buff_head sk_error_queue;
+--- linux-3.4.noarch.orig/net/core/sock.c
++++ linux-3.4.noarch/net/core/sock.c
+@@ -1411,6 +1411,7 @@ void sk_setup_caps(struct sock *sk, stru
+ } else {
+ sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ sk->sk_gso_max_size = dst->dev->gso_max_size;
++ sk->sk_gso_max_segs = dst->dev->gso_max_segs;
+ }
+ }
+ }
+--- linux-3.4.noarch.orig/net/ipv4/tcp.c
++++ linux-3.4.noarch/net/ipv4/tcp.c
+@@ -740,7 +740,9 @@ static unsigned int tcp_xmit_size_goal(s
+ old_size_goal + mss_now > xmit_size_goal)) {
+ xmit_size_goal = old_size_goal;
+ } else {
+- tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
++ tp->xmit_size_goal_segs =
++ min_t(u16, xmit_size_goal / mss_now,
++ sk->sk_gso_max_segs);
+ xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
+ }
+ }
+--- linux-3.4.noarch.orig/net/ipv4/tcp_cong.c
++++ linux-3.4.noarch/net/ipv4/tcp_cong.c
+@@ -291,7 +291,8 @@ int tcp_is_cwnd_limited(const struct soc
+ left = tp->snd_cwnd - in_flight;
+ if (sk_can_gso(sk) &&
+ left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
+- left * tp->mss_cache < sk->sk_gso_max_size)
++ left * tp->mss_cache < sk->sk_gso_max_size &&
++ left < sk->sk_gso_max_segs)
+ return 1;
+ return left <= tcp_max_tso_deferred_mss(tp);
+ }
+--- linux-3.4.noarch.orig/net/ipv4/tcp_output.c
++++ linux-3.4.noarch/net/ipv4/tcp_output.c
+@@ -1318,21 +1318,21 @@ static void tcp_cwnd_validate(struct soc
+ * when we would be allowed to send the split-due-to-Nagle skb fully.
+ */
+ static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
+- unsigned int mss_now, unsigned int cwnd)
++ unsigned int mss_now, unsigned int max_segs)
+ {
+ const struct tcp_sock *tp = tcp_sk(sk);
+- u32 needed, window, cwnd_len;
++ u32 needed, window, max_len;
+
+ window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
+- cwnd_len = mss_now * cwnd;
++ max_len = mss_now * max_segs;
+
+- if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk)))
+- return cwnd_len;
++ if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
++ return max_len;
+
+ needed = min(skb->len, window);
+
+- if (cwnd_len <= needed)
+- return cwnd_len;
++ if (max_len <= needed)
++ return max_len;
+
+ return needed - needed % mss_now;
+ }
+@@ -1560,7 +1560,8 @@ static int tcp_tso_should_defer(struct s
+ limit = min(send_win, cong_win);
+
+ /* If a full-sized TSO skb can be sent, do it. */
+- if (limit >= sk->sk_gso_max_size)
++ if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
++ sk->sk_gso_max_segs * tp->mss_cache))
+ goto send_now;
+
+ /* Middle in queue won't get any more data, full sendable already? */
+@@ -1786,7 +1787,9 @@ static int tcp_write_xmit(struct sock *s
+ limit = mss_now;
+ if (tso_segs > 1 && !tcp_urg_mode(tp))
+ limit = tcp_mss_split_point(sk, skb, mss_now,
+- cwnd_quota);
++ min_t(unsigned int,
++ cwnd_quota,
++ sk->sk_gso_max_segs));
+
+ if (skb->len > limit &&
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
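The core of the tcp patch is the clamp added to tcp_xmit_size_goal(): the size goal is expressed in segments and bounded by the socket's cached sk_gso_max_segs, so oversized skbs are never built in the first place. A standalone sketch of that clamp follows; the numbers (a 64K goal, the sfc limit of 100 segments, the pathological MSS of 76) are assumptions for illustration.

    /* size_goal_clamp.c - standalone illustration of the clamp added to
     * tcp_xmit_size_goal() above.  All values are assumed for illustration.
     */
    #include <stdio.h>

    static unsigned int min_u(unsigned int a, unsigned int b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned int mss_now      = 76;     /* pathological MSS from the CVE description */
        unsigned int xmit_goal    = 65536;  /* goal capped by sk_gso_max_size */
        unsigned int gso_max_segs = 100;    /* what sfc now advertises (EFX_TSO_MAX_SEGS) */

        unsigned int goal_segs = min_u(xmit_goal / mss_now, gso_max_segs);
        unsigned int size_goal = goal_segs * mss_now;

        /* without the clamp: 862 segments (65512 bytes); with it: 100 segments */
        printf("size goal: %u segments, %u bytes\n", goal_segs, size_goal);
        return 0;
    }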