diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 0335285f3918..49935d5bb5c6 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -230,6 +230,14 @@ tcp_base_mss - INTEGER Path MTU discovery (MTU probing). If MTU probing is enabled, this is the initial MSS used by the connection. +tcp_min_snd_mss - INTEGER + TCP SYN and SYNACK messages usually advertise an ADVMSS option, + as described in RFC 1122 and RFC 6691. + If this ADVMSS option is smaller than tcp_min_snd_mss, + it is silently capped to tcp_min_snd_mss. + + Default : 48 (at least 8 bytes of payload per segment) + tcp_congestion_control - STRING Set the congestion control algorithm to be used for new connections. The algorithm "reno" is always available, but diff --git a/Makefile b/Makefile index 584af2e57735..f34cb9225255 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 9 -SUBLEVEL = 181 +SUBLEVEL = 182 EXTRAVERSION = NAME = Roaring Lionus diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d0c3615f9050..7f517458c64f 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -433,4 +433,7 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } +int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, + int shiftlen); + #endif /* _LINUX_TCP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7adf4386ac8f..bf619a67ec03 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -94,6 +94,7 @@ struct netns_ipv4 { #endif int sysctl_tcp_mtu_probing; int sysctl_tcp_base_mss; + int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; diff --git a/include/net/tcp.h b/include/net/tcp.h index fed2a78fb8cb..d7047de952f0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -53,6 +53,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER (128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 +#define TCP_MIN_SND_MSS 48 +#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* * Never offer a window over 32767 without using window scaling. Some diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 3442a26d36d9..56e3460d1f9f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -282,6 +282,7 @@ enum LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ + LINUX_MIB_TCPWQUEUETOOBIG, /* TCPWqueueTooBig */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ec48d8eafc7e..8b221398534b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -306,6 +306,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), + SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 85713adf2770..e202babb14d6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -35,6 +35,8 @@ static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; static int tcp_adv_win_scale_min = -31; static int tcp_adv_win_scale_max = 31; +static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; +static int tcp_min_snd_mss_max = 65535; static int ip_ttl_min = 1; static int ip_ttl_max = 255; static int tcp_syn_retries_min = 1; @@ -838,6 +840,15 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "tcp_min_snd_mss", + .data = &init_net.ipv4.sysctl_tcp_min_snd_mss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_min_snd_mss_min, + .extra2 = &tcp_min_snd_mss_max, + }, { .procname = "tcp_probe_threshold", .data = &init_net.ipv4.sysctl_tcp_probe_threshold, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2ededb32b754..ee2822a411f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3307,6 +3307,7 @@ void __init tcp_init(void) unsigned long limit; unsigned int i; + BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e238539c3497..e2e58bc42ba4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1320,7 +1320,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(skb)->seq += shifted; tcp_skb_pcount_add(prev, pcount); - BUG_ON(tcp_skb_pcount(skb) < pcount); + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, @@ -1387,6 +1387,21 @@ static int skb_can_shift(const struct sk_buff *skb) return !skb_headlen(skb) && skb_is_nonlinear(skb); } +int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, + int pcount, int shiftlen) +{ + /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE) + * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need + * to make sure not storing more than 65535 * 8 bytes per skb, + * even if current MSS is bigger. + */ + if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE)) + return 0; + if (unlikely(tcp_skb_pcount(to) + pcount > 65535)) + return 0; + return skb_shift(to, from, shiftlen); +} + /* Try collapsing SACK blocks spanning across multiple skbs to a single * skb. */ @@ -1398,6 +1413,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev; int mss; + int next_pcount; int pcount = 0; int len; int in_sack; @@ -1495,7 +1511,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) goto fallback; - if (!skb_shift(prev, skb, len)) + if (!tcp_skb_shift(prev, skb, pcount, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) goto out; @@ -1514,11 +1530,11 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto out; len = skb->len; - if (skb_shift(prev, skb, len)) { - pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + next_pcount = tcp_skb_pcount(skb); + if (tcp_skb_shift(prev, skb, next_pcount, len)) { + pcount += next_pcount; + tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0); } - out: state->fack_count += pcount; return prev; @@ -2837,9 +2853,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + if (!tp->sacked_out && tp->fackets_out) tp->fackets_out = 0; /* Now state machine starts. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 82c1064ff4aa..848f2c1da8a5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2456,6 +2456,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f35cdd5f2f0..d8c6b833f0ce 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1185,6 +1185,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, if (nsize < 0) nsize = 0; + if (unlikely((sk->sk_wmem_queued >> 1) > sk->sk_sndbuf)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG); + return -ENOMEM; + } + if (skb_unclone(skb, gfp)) return -ENOMEM; @@ -1355,8 +1360,7 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu) mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ - if (mss_now < 48) - mss_now = 48; + mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss); return mss_now; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 69523389f067..d9e364c4863a 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -140,6 +140,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1; mss = min(net->ipv4.sysctl_tcp_base_mss, mss); mss = max(mss, 68 - tp->tcp_header_len); + mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss); icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); }