Commit 3dc43e3e4d0b52197d3205214fe8f162f9e0c334
Committed by
David S. Miller
1 parent
d1a4c0b37c
Exists in
master
and in
6 other branches
per-netns ipv4 sysctl_tcp_mem
This patch allows each network namespace to independently set its own TCP memory pressure thresholds. This patch alone does not buy much: these values still need to be made per group of processes somehow. That is achieved in the patches that follow in this patchset. Signed-off-by: Glauber Costa <glommer@parallels.com> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> CC: David S. Miller <davem@davemloft.net> CC: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 9 changed files with 57 additions and 22 deletions Side-by-side Diff
include/net/netns/ipv4.h
include/net/tcp.h
net/ipv4/af_inet.c
net/ipv4/sysctl_net_ipv4.c
... | ... | @@ -14,6 +14,7 @@ |
14 | 14 | #include <linux/init.h> |
15 | 15 | #include <linux/slab.h> |
16 | 16 | #include <linux/nsproxy.h> |
17 | +#include <linux/swap.h> | |
17 | 18 | #include <net/snmp.h> |
18 | 19 | #include <net/icmp.h> |
19 | 20 | #include <net/ip.h> |
... | ... | @@ -174,6 +175,36 @@ |
174 | 175 | return ret; |
175 | 176 | } |
176 | 177 | |
178 | +static int ipv4_tcp_mem(ctl_table *ctl, int write, | |
179 | + void __user *buffer, size_t *lenp, | |
180 | + loff_t *ppos) | |
181 | +{ | |
182 | + int ret; | |
183 | + unsigned long vec[3]; | |
184 | + struct net *net = current->nsproxy->net_ns; | |
185 | + | |
186 | + ctl_table tmp = { | |
187 | + .data = &vec, | |
188 | + .maxlen = sizeof(vec), | |
189 | + .mode = ctl->mode, | |
190 | + }; | |
191 | + | |
192 | + if (!write) { | |
193 | + ctl->data = &net->ipv4.sysctl_tcp_mem; | |
194 | + return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos); | |
195 | + } | |
196 | + | |
197 | + ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); | |
198 | + if (ret) | |
199 | + return ret; | |
200 | + | |
201 | + net->ipv4.sysctl_tcp_mem[0] = vec[0]; | |
202 | + net->ipv4.sysctl_tcp_mem[1] = vec[1]; | |
203 | + net->ipv4.sysctl_tcp_mem[2] = vec[2]; | |
204 | + | |
205 | + return 0; | |
206 | +} | |
207 | + | |
177 | 208 | static struct ctl_table ipv4_table[] = { |
178 | 209 | { |
179 | 210 | .procname = "tcp_timestamps", |
... | ... | @@ -433,13 +464,6 @@ |
433 | 464 | .proc_handler = proc_dointvec |
434 | 465 | }, |
435 | 466 | { |
436 | - .procname = "tcp_mem", | |
437 | - .data = &sysctl_tcp_mem, | |
438 | - .maxlen = sizeof(sysctl_tcp_mem), | |
439 | - .mode = 0644, | |
440 | - .proc_handler = proc_doulongvec_minmax | |
441 | - }, | |
442 | - { | |
443 | 467 | .procname = "tcp_wmem", |
444 | 468 | .data = &sysctl_tcp_wmem, |
445 | 469 | .maxlen = sizeof(sysctl_tcp_wmem), |
... | ... | @@ -721,6 +745,12 @@ |
721 | 745 | .mode = 0644, |
722 | 746 | .proc_handler = ipv4_ping_group_range, |
723 | 747 | }, |
748 | + { | |
749 | + .procname = "tcp_mem", | |
750 | + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), | |
751 | + .mode = 0644, | |
752 | + .proc_handler = ipv4_tcp_mem, | |
753 | + }, | |
724 | 754 | { } |
725 | 755 | }; |
726 | 756 | |
... | ... | @@ -734,6 +764,7 @@ |
734 | 764 | static __net_init int ipv4_sysctl_init_net(struct net *net) |
735 | 765 | { |
736 | 766 | struct ctl_table *table; |
767 | + unsigned long limit; | |
737 | 768 | |
738 | 769 | table = ipv4_net_table; |
739 | 770 | if (!net_eq(net, &init_net)) { |
... | ... | @@ -768,6 +799,12 @@ |
768 | 799 | net->ipv4.sysctl_ping_group_range[1] = 0; |
769 | 800 | |
770 | 801 | net->ipv4.sysctl_rt_cache_rebuild_count = 4; |
802 | + | |
803 | + limit = nr_free_buffer_pages() / 8; | |
804 | + limit = max(limit, 128UL); | |
805 | + net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; | |
806 | + net->ipv4.sysctl_tcp_mem[1] = limit; | |
807 | + net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; | |
771 | 808 | |
772 | 809 | net->ipv4.ipv4_hdr = register_net_sysctl_table(net, |
773 | 810 | net_ipv4_ctl_path, table); |
net/ipv4/tcp.c
... | ... | @@ -282,11 +282,9 @@ |
282 | 282 | struct percpu_counter tcp_orphan_count; |
283 | 283 | EXPORT_SYMBOL_GPL(tcp_orphan_count); |
284 | 284 | |
285 | -long sysctl_tcp_mem[3] __read_mostly; | |
286 | 285 | int sysctl_tcp_wmem[3] __read_mostly; |
287 | 286 | int sysctl_tcp_rmem[3] __read_mostly; |
288 | 287 | |
289 | -EXPORT_SYMBOL(sysctl_tcp_mem); | |
290 | 288 | EXPORT_SYMBOL(sysctl_tcp_rmem); |
291 | 289 | EXPORT_SYMBOL(sysctl_tcp_wmem); |
292 | 290 | |
293 | 291 | |
... | ... | @@ -3278,14 +3276,9 @@ |
3278 | 3276 | sysctl_tcp_max_orphans = cnt / 2; |
3279 | 3277 | sysctl_max_syn_backlog = max(128, cnt / 256); |
3280 | 3278 | |
3281 | - limit = nr_free_buffer_pages() / 8; | |
3282 | - limit = max(limit, 128UL); | |
3283 | - sysctl_tcp_mem[0] = limit / 4 * 3; | |
3284 | - sysctl_tcp_mem[1] = limit; | |
3285 | - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; | |
3286 | - | |
3287 | 3279 | /* Set per-socket limits to no more than 1/128 the pressure threshold */ |
3288 | - limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); | |
3280 | + limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1]) | |
3281 | + << (PAGE_SHIFT - 7); | |
3289 | 3282 | max_share = min(4UL*1024*1024, limit); |
3290 | 3283 | |
3291 | 3284 | sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; |
net/ipv4/tcp_ipv4.c
... | ... | @@ -2623,7 +2623,6 @@ |
2623 | 2623 | .orphan_count = &tcp_orphan_count, |
2624 | 2624 | .memory_allocated = &tcp_memory_allocated, |
2625 | 2625 | .memory_pressure = &tcp_memory_pressure, |
2626 | - .sysctl_mem = sysctl_tcp_mem, | |
2627 | 2626 | .sysctl_wmem = sysctl_tcp_wmem, |
2628 | 2627 | .sysctl_rmem = sysctl_tcp_rmem, |
2629 | 2628 | .max_header = MAX_TCP_HEADER, |
net/ipv4/tcp_memcontrol.c
1 | 1 | #include <net/tcp.h> |
2 | 2 | #include <net/tcp_memcontrol.h> |
3 | 3 | #include <net/sock.h> |
4 | +#include <net/ip.h> | |
5 | +#include <linux/nsproxy.h> | |
4 | 6 | #include <linux/memcontrol.h> |
5 | 7 | #include <linux/module.h> |
6 | 8 | |
... | ... | @@ -28,6 +30,7 @@ |
28 | 30 | struct tcp_memcontrol *tcp; |
29 | 31 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
30 | 32 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
33 | + struct net *net = current->nsproxy->net_ns; | |
31 | 34 | |
32 | 35 | cg_proto = tcp_prot.proto_cgroup(memcg); |
33 | 36 | if (!cg_proto) |
... | ... | @@ -35,9 +38,9 @@ |
35 | 38 | |
36 | 39 | tcp = tcp_from_cgproto(cg_proto); |
37 | 40 | |
38 | - tcp->tcp_prot_mem[0] = sysctl_tcp_mem[0]; | |
39 | - tcp->tcp_prot_mem[1] = sysctl_tcp_mem[1]; | |
40 | - tcp->tcp_prot_mem[2] = sysctl_tcp_mem[2]; | |
41 | + tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0]; | |
42 | + tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1]; | |
43 | + tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2]; | |
41 | 44 | tcp->tcp_memory_pressure = 0; |
42 | 45 | |
43 | 46 | parent_cg = tcp_prot.proto_cgroup(parent); |
net/ipv6/af_inet6.c
... | ... | @@ -1116,6 +1116,8 @@ |
1116 | 1116 | if (err) |
1117 | 1117 | goto static_sysctl_fail; |
1118 | 1118 | #endif |
1119 | + tcpv6_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem; | |
1120 | + | |
1119 | 1121 | /* |
1120 | 1122 | * ipngwg API draft makes clear that the correct semantics |
1121 | 1123 | * for TCP and UDP is to consider one TCP and UDP instance |
net/ipv6/tcp_ipv6.c
... | ... | @@ -2215,7 +2215,6 @@ |
2215 | 2215 | .memory_allocated = &tcp_memory_allocated, |
2216 | 2216 | .memory_pressure = &tcp_memory_pressure, |
2217 | 2217 | .orphan_count = &tcp_orphan_count, |
2218 | - .sysctl_mem = sysctl_tcp_mem, | |
2219 | 2218 | .sysctl_wmem = sysctl_tcp_wmem, |
2220 | 2219 | .sysctl_rmem = sysctl_tcp_rmem, |
2221 | 2220 | .max_header = MAX_TCP_HEADER, |