When there's a large number of interfaces (e.g. vlans), teamd loses link notifications as it cannot read them as fast as kernel is broadcasting them. This often prevents teamd starting properly if started concurrently when other links are being set up. It can also fail when it's up and running, especially in the cases where the team device itself has a lot of vlans under it. It never checks the return value of nl_recvmsgs_default() and doesn't know if it just missed important/critical events.
As said in netlink(7), it's up to userspace to resynchronize with kernel when getting ENOBUFS. Unfortunately there's no way to know what events were lost and ports might not be linked in libteam side due to lost events, so all ifinfo list must be refreshed.
Checks now return value of nl_recvmsgs_default() for event socket. In case of ENOBUFS (which libnl nicely changes to ENOMEM), refreshes all ifinfo list. get_ifinfo_list() also checks now for removed interfaces in case of missed dellink event. Currently all TEAM_IFINFO_CHANGE handlers processed events one by one, so it had to be changed to support multiple ifinfo changes. For this, ifinfo changed flags are cleared and removed entries destroyed only after all handlers have been called.
Also, increased nl_cli.sock_event receive buffers to 96k like all other sockets. Added possibility to change this via environment variable.
Signed-off-by: Antti Tiainen atiainen@forcepoint.com --- libteam/ifinfo.c | 31 +++++++++++++++++++++++-------- libteam/libteam.c | 40 +++++++++++++++++++++++++++++++++++++++- libteam/team_private.h | 3 +++ 3 files changed, 65 insertions(+), 9 deletions(-)
diff --git a/libteam/ifinfo.c b/libteam/ifinfo.c index 72155ae..9330a4f 100644 --- a/libteam/ifinfo.c +++ b/libteam/ifinfo.c @@ -72,6 +72,9 @@ struct team_ifinfo { #define CHANGED_PHYS_PORT_ID (1 << 5) #define CHANGED_PHYS_PORT_ID_LEN (1 << 6) #define CHANGED_ADMIN_STATE (1 << 7) +/* This is only used when tagging interfaces for finding + removed, and thus not included to CHANGED_ANY. */ +#define CHANGED_REFRESHING (1 << 8) #define CHANGED_ANY (CHANGED_REMOVED | CHANGED_HWADDR | \ CHANGED_HWADDR_LEN | CHANGED_IFNAME | \ CHANGED_MASTER_IFINDEX | CHANGED_PHYS_PORT_ID | \ @@ -202,7 +205,7 @@ static struct team_ifinfo *ifinfo_find(struct team_handle *th, uint32_t ifindex) return NULL; }
-static void clear_last_changed(struct team_handle *th) +void ifinfo_clear_changed(struct team_handle *th) { struct team_ifinfo *ifinfo;
@@ -236,7 +239,7 @@ static void ifinfo_destroy(struct team_ifinfo *ifinfo) free(ifinfo); }
-static void ifinfo_destroy_removed(struct team_handle *th) +void ifinfo_destroy_removed(struct team_handle *th) { struct team_ifinfo *ifinfo, *tmp;
@@ -254,8 +257,6 @@ static void obj_input_newlink(struct nl_object *obj, void *arg, bool event) uint32_t ifindex; int err;
- ifinfo_destroy_removed(th); - link = (struct rtnl_link *) obj;
ifindex = rtnl_link_get_ifindex(link); @@ -269,7 +270,7 @@ static void obj_input_newlink(struct nl_object *obj, void *arg, bool event) return; }
- clear_last_changed(th); + clear_changed(ifinfo); ifinfo_update(ifinfo, link);
if (event) @@ -292,8 +293,6 @@ static void event_handler_obj_input_dellink(struct nl_object *obj, void *arg) uint32_t ifindex; int err;
- ifinfo_destroy_removed(th); - link = (struct rtnl_link *) obj;
ifindex = rtnl_link_get_ifindex(link); @@ -311,7 +310,7 @@ static void event_handler_obj_input_dellink(struct nl_object *obj, void *arg) return; }
- clear_last_changed(th); + clear_changed(ifinfo); set_changed(ifinfo, CHANGED_REMOVED); set_call_change_handlers(th, TEAM_IFINFO_CHANGE); } @@ -367,6 +366,13 @@ int get_ifinfo_list(struct team_handle *th) }; int ret; int retry = 1; + struct team_ifinfo *ifinfo; + + /* Tag all ifinfo, this is cleared in newlink handler. + Any interface that has this after dump is processed + has been removed. */ + list_for_each_node_entry(ifinfo, &th->ifinfo_list, list) + set_changed(ifinfo, CHANGED_REFRESHING);
while (retry) { retry = 0; @@ -395,6 +401,15 @@ int get_ifinfo_list(struct team_handle *th) retry = 1; } } + + list_for_each_node_entry(ifinfo, &th->ifinfo_list, list) { + if (is_changed(ifinfo, CHANGED_REFRESHING)) { + clear_changed(ifinfo); + set_changed(ifinfo, CHANGED_REMOVED); + set_call_change_handlers(th, TEAM_IFINFO_CHANGE); + } + } + ret = check_call_change_handlers(th, TEAM_IFINFO_CHANGE); if (ret < 0) err(th, "get_ifinfo_list: check_call_change_handers failed"); diff --git a/libteam/libteam.c b/libteam/libteam.c index ac187aa..9207b61 100644 --- a/libteam/libteam.c +++ b/libteam/libteam.c @@ -236,6 +236,10 @@ int check_call_change_handlers(struct team_handle *th, break; } } + if (call_type_mask & TEAM_IFINFO_CHANGE) { + ifinfo_destroy_removed(th); + ifinfo_clear_changed(th); + } th->change_handler.pending_type_mask &= ~call_type_mask; return err; } @@ -561,6 +565,8 @@ int team_init(struct team_handle *th, uint32_t ifindex) int err; int grp_id; int val; + int eventbufsize; + const char *env;
if (!ifindex) { err(th, "Passed interface index %d is not valid.", ifindex); @@ -627,6 +633,23 @@ int team_init(struct team_handle *th, uint32_t ifindex) nl_socket_modify_cb(th->nl_cli.sock_event, NL_CB_VALID, NL_CB_CUSTOM, cli_event_handler, th); nl_cli_connect(th->nl_cli.sock_event, NETLINK_ROUTE); + + env = getenv("TEAM_EVENT_BUFSIZE"); + if (env) { + eventbufsize = strtol(env, NULL, 10); + /* ignore other errors, libnl forces minimum 32k and + too large values are truncated to system rmem_max */ + if (eventbufsize < 0) + eventbufsize = 0; + } + else eventbufsize = 98304; + + err = nl_socket_set_buffer_size(th->nl_cli.sock_event, eventbufsize, 0); + if (err) { + err(th, "Failed to set cli event socket buffer size."); + return err; + } + err = nl_socket_add_membership(th->nl_cli.sock_event, RTNLGRP_LINK); if (err < 0) { err(th, "Failed to add netlink membership."); @@ -767,7 +790,22 @@ static int get_cli_sock_event_fd(struct team_handle *th)
static int cli_sock_event_handler(struct team_handle *th) { - nl_recvmsgs_default(th->nl_cli.sock_event); + int err; + + err = nl_recvmsgs_default(th->nl_cli.sock_event); + err = -nl2syserr(err); + + /* libnl thinks ENOBUFS and ENOMEM are same. Hope it was ENOBUFS. */ + if (err == -ENOMEM) { + warn(th, "Lost link notifications from kernel."); + /* There's no way to know what events were lost and no + way to get them again. Refresh all. */ + err = get_ifinfo_list(th); + } + + if (err) + return err; + return check_call_change_handlers(th, TEAM_IFINFO_CHANGE); }
diff --git a/libteam/team_private.h b/libteam/team_private.h index a07632f..a5eb0be 100644 --- a/libteam/team_private.h +++ b/libteam/team_private.h @@ -115,6 +115,9 @@ int ifinfo_link_with_port(struct team_handle *th, uint32_t ifindex, int ifinfo_link(struct team_handle *th, uint32_t ifindex, struct team_ifinfo **p_ifinfo); void ifinfo_unlink(struct team_ifinfo *ifinfo); +void ifinfo_clear_changed(struct team_handle *th); +void ifinfo_destroy_removed(struct team_handle *th); +int get_ifinfo_list(struct team_handle *th); int get_options_handler(struct nl_msg *msg, void *arg); int option_list_alloc(struct team_handle *th); int option_list_init(struct team_handle *th);
Hi.
Little more info about that fix (or rather suggestion for one). Here's a simple example to reproduce the problem (note: needs SMP).
Manually create team device, add bunch of VLANs and put it up (required to be up first in this example), then start teamd with --take-over:
root@debian:~# ip link add name team0 type team root@debian:~# for i in `seq 100 150` ; do ip link add link team0 name team0.$i type vlan id $i ; done root@debian:~# ip link set team0 up root@debian:~# cat teamd.conf { "device": "team0", "runner": { "name": "activebackup" }, "ports": { "eth1": {}, "eth2": {} } } root@debian:~# teamd -o -N -f teamd.conf This program is not intended to be run as root. 1.26 successfully started. eth1: ethtool-link went up. eth1: Can't get port priority. Using default. Changed active port to "eth1". eth2: ethtool-link went up.
It doesn't complain that anything is wrong. But when checking state:
root@debian:~# teamdctl team0 state setup: runner: activebackup ports: eth1 link watches: link summary: up instance[link_watch_0]: name: ethtool link: up down count: 0 Failed to parse JSON port dump. command call failed (Invalid argument)
Doesn't look healthy...
root@debian:~# teamdctl team0 state dump { "ports": { "eth1": { "ifinfo": { "dev_addr": "08:00:27:81:95:cf", "dev_addr_len": 6, "ifindex": 3, "ifname": "eth1" }, "link": { "duplex": "full", "speed": 1000, "up": true }, "link_watches": { "list": { "link_watch_0": { "delay_down": 0, "delay_up": 0, "down_count": 0, "name": "ethtool", "up": true } }, }, "eth2": { "link_watches": { "list": { "link_watch_0": { "delay_down": 0, "delay_up": 0, "down_count": 0, "name": "ethtool", "up": true } } } } }, "runner": { "active_port": "eth1" }, ...
Port eth2 is missing all its ifinfo and it's not properly linked to aggregate. Reason for this can be seen if running strace:
recvmsg(8, 0x7ffcb3805ca0, 0) = -1 ENOBUFS (No buffer space available)
That's the socket that joined to RTNLGRP_LINK notifications. When teamd started, all VLANs under it got carrier up, and kernel flooded events faster than teamd could read them. In this example, it now lost events related to port eth2.
That socket uses libnl default 32k buffer size. Netlink messages are quite large (over 1k), and the buffer gets easily full. Kernel neither knows nor cares were those broadcast messages actually delivered. This cannot really be fixed by simply increasing the buffer size, as there's no magical size X that is enough for all use cases. Also the system-wide rmem_max limit is quite low (usually 100-200k), and this easily requires several megabytes of buffer when there are hundreds of VLANs.
I don't think there's any elegant way to recover from this situation. All ifinfo is invalidated at this point and must be refreshed. It cannot just refresh team device and its ports, since library side might not have them linked because events missed, and it doesn't know about teamd configuration.
-antti
Fri, Jan 27, 2017 at 03:49:18PM CET, atiainen@forcepoint.com wrote:
When there's a large number of interfaces (e.g. vlans), teamd loses link notifications as it cannot read them as fast as kernel is broadcasting them. This often prevents teamd starting properly if started concurrently when other links are being set up. It can also fail when it's up and running, especially in the cases where the team device itself has a lot of vlans under it. It never checks the return value of nl_recvmsgs_default() and doesn't know if it just missed important/critical events.
As said in netlink(7), it's up to userspace to resynchronize with kernel when getting ENOBUFS. Unfortunately there's no way to know what events were lost and ports might not be linked in libteam side due to lost events, so all ifinfo list must be refreshed.
Checks now return value of nl_recvmsgs_default() for event socket. In case of ENOBUFS (which libnl nicely changes to ENOMEM), refreshes all ifinfo list. get_ifinfo_list() also checks now for removed interfaces in case of missed dellink event. Currently all TEAM_IFINFO_CHANGE handlers processed events one by one, so it had to be changed to support multiple ifinfo changes. For this, ifinfo changed flags are cleared and removed entries destroyed only after all handlers have been called.
Also, increased nl_cli.sock_event receive buffers to 96k like all other sockets. Added possibility to change this via environment variable.
Patch looks fine, nice work. See couple of nits I inlined.
For next version, please include the information you send in a follow-up mail directly to the patch description.
Also, please run checkpatch script to find styling issues and described in SubmittingPatches file (I added it there right now)
Signed-off-by: Antti Tiainen atiainen@forcepoint.com
libteam/ifinfo.c | 31 +++++++++++++++++++++++-------- libteam/libteam.c | 40 +++++++++++++++++++++++++++++++++++++++- libteam/team_private.h | 3 +++ 3 files changed, 65 insertions(+), 9 deletions(-)
diff --git a/libteam/ifinfo.c b/libteam/ifinfo.c index 72155ae..9330a4f 100644 --- a/libteam/ifinfo.c +++ b/libteam/ifinfo.c @@ -72,6 +72,9 @@ struct team_ifinfo { #define CHANGED_PHYS_PORT_ID (1 << 5) #define CHANGED_PHYS_PORT_ID_LEN (1 << 6) #define CHANGED_ADMIN_STATE (1 << 7) +/* This is only used when tagging interfaces for finding
- removed, and thus not included to CHANGED_ANY. */
+#define CHANGED_REFRESHING (1 << 8) #define CHANGED_ANY (CHANGED_REMOVED | CHANGED_HWADDR | \ CHANGED_HWADDR_LEN | CHANGED_IFNAME | \ CHANGED_MASTER_IFINDEX | CHANGED_PHYS_PORT_ID | \ @@ -202,7 +205,7 @@ static struct team_ifinfo *ifinfo_find(struct team_handle *th, uint32_t ifindex) return NULL; }
-static void clear_last_changed(struct team_handle *th) +void ifinfo_clear_changed(struct team_handle *th) { struct team_ifinfo *ifinfo;
@@ -236,7 +239,7 @@ static void ifinfo_destroy(struct team_ifinfo *ifinfo) free(ifinfo); }
-static void ifinfo_destroy_removed(struct team_handle *th) +void ifinfo_destroy_removed(struct team_handle *th) { struct team_ifinfo *ifinfo, *tmp;
@@ -254,8 +257,6 @@ static void obj_input_newlink(struct nl_object *obj, void *arg, bool event) uint32_t ifindex; int err;
ifinfo_destroy_removed(th);
link = (struct rtnl_link *) obj;
ifindex = rtnl_link_get_ifindex(link);
@@ -269,7 +270,7 @@ static void obj_input_newlink(struct nl_object *obj, void *arg, bool event) return; }
- clear_last_changed(th);
clear_changed(ifinfo); ifinfo_update(ifinfo, link);
if (event)
@@ -292,8 +293,6 @@ static void event_handler_obj_input_dellink(struct nl_object *obj, void *arg) uint32_t ifindex; int err;
ifinfo_destroy_removed(th);
link = (struct rtnl_link *) obj;
ifindex = rtnl_link_get_ifindex(link);
@@ -311,7 +310,7 @@ static void event_handler_obj_input_dellink(struct nl_object *obj, void *arg) return; }
- clear_last_changed(th);
- clear_changed(ifinfo); set_changed(ifinfo, CHANGED_REMOVED); set_call_change_handlers(th, TEAM_IFINFO_CHANGE);
} @@ -367,6 +366,13 @@ int get_ifinfo_list(struct team_handle *th) }; int ret; int retry = 1;
- struct team_ifinfo *ifinfo;
- /* Tag all ifinfo, this is cleared in newlink handler.
Any interface that has this after dump is processed
has been removed. */
Please have all multi-line comments in format: /* Tag all ifinfo, this is cleared in newlink handler. * Any interface that has this after dump is processed * has been removed. */
list_for_each_node_entry(ifinfo, &th->ifinfo_list, list)
set_changed(ifinfo, CHANGED_REFRESHING);
while (retry) { retry = 0;
@@ -395,6 +401,15 @@ int get_ifinfo_list(struct team_handle *th) retry = 1; } }
- list_for_each_node_entry(ifinfo, &th->ifinfo_list, list) {
if (is_changed(ifinfo, CHANGED_REFRESHING)) {
clear_changed(ifinfo);
set_changed(ifinfo, CHANGED_REMOVED);
set_call_change_handlers(th, TEAM_IFINFO_CHANGE);
}
- }
- ret = check_call_change_handlers(th, TEAM_IFINFO_CHANGE); if (ret < 0) err(th, "get_ifinfo_list: check_call_change_handers failed");
diff --git a/libteam/libteam.c b/libteam/libteam.c index ac187aa..9207b61 100644 --- a/libteam/libteam.c +++ b/libteam/libteam.c @@ -236,6 +236,10 @@ int check_call_change_handlers(struct team_handle *th, break; } }
- if (call_type_mask & TEAM_IFINFO_CHANGE) {
ifinfo_destroy_removed(th);
ifinfo_clear_changed(th);
- } th->change_handler.pending_type_mask &= ~call_type_mask; return err;
} @@ -561,6 +565,8 @@ int team_init(struct team_handle *th, uint32_t ifindex) int err; int grp_id; int val;
int eventbufsize;
const char *env;
if (!ifindex) { err(th, "Passed interface index %d is not valid.", ifindex);
@@ -627,6 +633,23 @@ int team_init(struct team_handle *th, uint32_t ifindex) nl_socket_modify_cb(th->nl_cli.sock_event, NL_CB_VALID, NL_CB_CUSTOM, cli_event_handler, th); nl_cli_connect(th->nl_cli.sock_event, NETLINK_ROUTE);
- env = getenv("TEAM_EVENT_BUFSIZE");
- if (env) {
eventbufsize = strtol(env, NULL, 10);
/* ignore other errors, libnl forces minimum 32k and
too large values are truncated to system rmem_max */
if (eventbufsize < 0)
eventbufsize = 0;
- }
- else eventbufsize = 98304;
Should be: if (x) { } else { } Even if only one branch is multi-line.
- err = nl_socket_set_buffer_size(th->nl_cli.sock_event, eventbufsize, 0);
- if (err) {
err(th, "Failed to set cli event socket buffer size.");
return err;
- }
- err = nl_socket_add_membership(th->nl_cli.sock_event, RTNLGRP_LINK); if (err < 0) { err(th, "Failed to add netlink membership.");
@@ -767,7 +790,22 @@ static int get_cli_sock_event_fd(struct team_handle *th)
static int cli_sock_event_handler(struct team_handle *th) {
- nl_recvmsgs_default(th->nl_cli.sock_event);
- int err;
- err = nl_recvmsgs_default(th->nl_cli.sock_event);
- err = -nl2syserr(err);
- /* libnl thinks ENOBUFS and ENOMEM are same. Hope it was ENOBUFS. */
- if (err == -ENOMEM) {
warn(th, "Lost link notifications from kernel.");
/* There's no way to know what events were lost and no
way to get them again. Refresh all. */
err = get_ifinfo_list(th);
- }
- if (err)
return err;
Good. Thanks!
return check_call_change_handlers(th, TEAM_IFINFO_CHANGE); }
diff --git a/libteam/team_private.h b/libteam/team_private.h index a07632f..a5eb0be 100644 --- a/libteam/team_private.h +++ b/libteam/team_private.h @@ -115,6 +115,9 @@ int ifinfo_link_with_port(struct team_handle *th, uint32_t ifindex, int ifinfo_link(struct team_handle *th, uint32_t ifindex, struct team_ifinfo **p_ifinfo); void ifinfo_unlink(struct team_ifinfo *ifinfo); +void ifinfo_clear_changed(struct team_handle *th); +void ifinfo_destroy_removed(struct team_handle *th); +int get_ifinfo_list(struct team_handle *th); int get_options_handler(struct nl_msg *msg, void *arg); int option_list_alloc(struct team_handle *th); int option_list_init(struct team_handle *th); -- 1.7.9.5
libteam@lists.fedorahosted.org