Commit bcbde4c0a7556cca72874c5e1efa4dccb5198a2b
Committed by
Simon Horman
1 parent
c16526a7b9
Exists in
smarc-imx_3.14.28_1.0.0_ga
and in
1 other branch
ipvs: make the service replacement more robust
commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added IP_VS_DEST_STATE_REMOVING flag and RCU callback named ip_vs_dest_wait_readers() to keep dests and services after removal for at least a RCU grace period. But we have the following corner cases: - we can not reuse the same dest if its service is removed while IP_VS_DEST_STATE_REMOVING is still set because another dest removal in the first grace period can not extend this period. It can happen when ipvsadm -C && ipvsadm -R is used. - dest->svc can be replaced but ip_vs_in_stats() and ip_vs_out_stats() have no explicit read memory barriers when accessing dest->svc. It can happen that dest->svc was just freed (replaced) while we use it to update the stats. We solve the problems as follows: - IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start will remember when for first time after deletion we noticed dest->refcnt=0. Later, the connections can grab a reference while in RCU grace period but if refcnt becomes 0 we can safely free the dest and its svc. - dest->svc becomes RCU pointer. As result, we add explicit RCU locking in ip_vs_in_stats() and ip_vs_out_stats(). - __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it now can free the service immediately or after a RCU grace period. dest->svc is not set to NULL anymore. As result, unlinked dests and their services are freed always after IP_VS_DEST_TRASH_PERIOD period, unused services are freed after a RCU grace period. Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Simon Horman <horms@verge.net.au>
Showing 3 changed files with 47 additions and 58 deletions Side-by-side Diff
include/net/ip_vs.h
... | ... | @@ -723,8 +723,6 @@ |
723 | 723 | struct rcu_head rcu_head; |
724 | 724 | }; |
725 | 725 | |
726 | -/* In grace period after removing */ | |
727 | -#define IP_VS_DEST_STATE_REMOVING 0x01 | |
728 | 726 | /* |
729 | 727 | * The real server destination forwarding entry |
730 | 728 | * with ip address, port number, and so on. |
... | ... | @@ -742,7 +740,7 @@ |
742 | 740 | |
743 | 741 | atomic_t refcnt; /* reference counter */ |
744 | 742 | struct ip_vs_stats stats; /* statistics */ |
745 | - unsigned long state; /* state flags */ | |
743 | + unsigned long idle_start; /* start time, jiffies */ | |
746 | 744 | |
747 | 745 | /* connection counters and thresholds */ |
748 | 746 | atomic_t activeconns; /* active connections */ |
749 | 747 | |
... | ... | @@ -756,14 +754,13 @@ |
756 | 754 | struct ip_vs_dest_dst __rcu *dest_dst; /* cached dst info */ |
757 | 755 | |
758 | 756 | /* for virtual service */ |
759 | - struct ip_vs_service *svc; /* service it belongs to */ | |
757 | + struct ip_vs_service __rcu *svc; /* service it belongs to */ | |
760 | 758 | __u16 protocol; /* which protocol (TCP/UDP) */ |
761 | 759 | __be16 vport; /* virtual port number */ |
762 | 760 | union nf_inet_addr vaddr; /* virtual IP address */ |
763 | 761 | __u32 vfwmark; /* firewall mark of service */ |
764 | 762 | |
765 | 763 | struct list_head t_list; /* in dest_trash */ |
766 | - struct rcu_head rcu_head; | |
767 | 764 | unsigned int in_rs_table:1; /* we are in rs_table */ |
768 | 765 | }; |
769 | 766 |
net/netfilter/ipvs/ip_vs_core.c
... | ... | @@ -116,6 +116,7 @@ |
116 | 116 | |
117 | 117 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
118 | 118 | struct ip_vs_cpu_stats *s; |
119 | + struct ip_vs_service *svc; | |
119 | 120 | |
120 | 121 | s = this_cpu_ptr(dest->stats.cpustats); |
121 | 122 | s->ustats.inpkts++; |
122 | 123 | |
... | ... | @@ -123,11 +124,14 @@ |
123 | 124 | s->ustats.inbytes += skb->len; |
124 | 125 | u64_stats_update_end(&s->syncp); |
125 | 126 | |
126 | - s = this_cpu_ptr(dest->svc->stats.cpustats); | |
127 | + rcu_read_lock(); | |
128 | + svc = rcu_dereference(dest->svc); | |
129 | + s = this_cpu_ptr(svc->stats.cpustats); | |
127 | 130 | s->ustats.inpkts++; |
128 | 131 | u64_stats_update_begin(&s->syncp); |
129 | 132 | s->ustats.inbytes += skb->len; |
130 | 133 | u64_stats_update_end(&s->syncp); |
134 | + rcu_read_unlock(); | |
131 | 135 | |
132 | 136 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); |
133 | 137 | s->ustats.inpkts++; |
... | ... | @@ -146,6 +150,7 @@ |
146 | 150 | |
147 | 151 | if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { |
148 | 152 | struct ip_vs_cpu_stats *s; |
153 | + struct ip_vs_service *svc; | |
149 | 154 | |
150 | 155 | s = this_cpu_ptr(dest->stats.cpustats); |
151 | 156 | s->ustats.outpkts++; |
152 | 157 | |
... | ... | @@ -153,11 +158,14 @@ |
153 | 158 | s->ustats.outbytes += skb->len; |
154 | 159 | u64_stats_update_end(&s->syncp); |
155 | 160 | |
156 | - s = this_cpu_ptr(dest->svc->stats.cpustats); | |
161 | + rcu_read_lock(); | |
162 | + svc = rcu_dereference(dest->svc); | |
163 | + s = this_cpu_ptr(svc->stats.cpustats); | |
157 | 164 | s->ustats.outpkts++; |
158 | 165 | u64_stats_update_begin(&s->syncp); |
159 | 166 | s->ustats.outbytes += skb->len; |
160 | 167 | u64_stats_update_end(&s->syncp); |
168 | + rcu_read_unlock(); | |
161 | 169 | |
162 | 170 | s = this_cpu_ptr(ipvs->tot_stats.cpustats); |
163 | 171 | s->ustats.outpkts++; |
net/netfilter/ipvs/ip_vs_ctl.c
... | ... | @@ -460,7 +460,7 @@ |
460 | 460 | __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) |
461 | 461 | { |
462 | 462 | atomic_inc(&svc->refcnt); |
463 | - dest->svc = svc; | |
463 | + rcu_assign_pointer(dest->svc, svc); | |
464 | 464 | } |
465 | 465 | |
466 | 466 | static void ip_vs_service_free(struct ip_vs_service *svc) |
467 | 467 | |
468 | 468 | |
469 | 469 | |
... | ... | @@ -470,18 +470,25 @@ |
470 | 470 | kfree(svc); |
471 | 471 | } |
472 | 472 | |
473 | -static void | |
474 | -__ip_vs_unbind_svc(struct ip_vs_dest *dest) | |
473 | +static void ip_vs_service_rcu_free(struct rcu_head *head) | |
475 | 474 | { |
476 | - struct ip_vs_service *svc = dest->svc; | |
475 | + struct ip_vs_service *svc; | |
477 | 476 | |
478 | - dest->svc = NULL; | |
477 | + svc = container_of(head, struct ip_vs_service, rcu_head); | |
478 | + ip_vs_service_free(svc); | |
479 | +} | |
480 | + | |
481 | +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) | |
482 | +{ | |
479 | 483 | if (atomic_dec_and_test(&svc->refcnt)) { |
480 | 484 | IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", |
481 | 485 | svc->fwmark, |
482 | 486 | IP_VS_DBG_ADDR(svc->af, &svc->addr), |
483 | 487 | ntohs(svc->port)); |
484 | - ip_vs_service_free(svc); | |
488 | + if (do_delay) | |
489 | + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); | |
490 | + else | |
491 | + ip_vs_service_free(svc); | |
485 | 492 | } |
486 | 493 | } |
487 | 494 | |
... | ... | @@ -667,11 +674,6 @@ |
667 | 674 | IP_VS_DBG_ADDR(svc->af, &dest->addr), |
668 | 675 | ntohs(dest->port), |
669 | 676 | atomic_read(&dest->refcnt)); |
670 | - /* We can not reuse dest while in grace period | |
671 | - * because conns still can use dest->svc | |
672 | - */ | |
673 | - if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) | |
674 | - continue; | |
675 | 677 | if (dest->af == svc->af && |
676 | 678 | ip_vs_addr_equal(svc->af, &dest->addr, daddr) && |
677 | 679 | dest->port == dport && |
678 | 680 | |
... | ... | @@ -697,8 +699,10 @@ |
697 | 699 | |
698 | 700 | static void ip_vs_dest_free(struct ip_vs_dest *dest) |
699 | 701 | { |
702 | + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); | |
703 | + | |
700 | 704 | __ip_vs_dst_cache_reset(dest); |
701 | - __ip_vs_unbind_svc(dest); | |
705 | + __ip_vs_svc_put(svc, false); | |
702 | 706 | free_percpu(dest->stats.cpustats); |
703 | 707 | kfree(dest); |
704 | 708 | } |
... | ... | @@ -771,6 +775,7 @@ |
771 | 775 | struct ip_vs_dest_user_kern *udest, int add) |
772 | 776 | { |
773 | 777 | struct netns_ipvs *ipvs = net_ipvs(svc->net); |
778 | + struct ip_vs_service *old_svc; | |
774 | 779 | struct ip_vs_scheduler *sched; |
775 | 780 | int conn_flags; |
776 | 781 | |
777 | 782 | |
778 | 783 | |
... | ... | @@ -792,13 +797,14 @@ |
792 | 797 | atomic_set(&dest->conn_flags, conn_flags); |
793 | 798 | |
794 | 799 | /* bind the service */ |
795 | - if (!dest->svc) { | |
800 | + old_svc = rcu_dereference_protected(dest->svc, 1); | |
801 | + if (!old_svc) { | |
796 | 802 | __ip_vs_bind_svc(dest, svc); |
797 | 803 | } else { |
798 | - if (dest->svc != svc) { | |
799 | - __ip_vs_unbind_svc(dest); | |
804 | + if (old_svc != svc) { | |
800 | 805 | ip_vs_zero_stats(&dest->stats); |
801 | 806 | __ip_vs_bind_svc(dest, svc); |
807 | + __ip_vs_svc_put(old_svc, true); | |
802 | 808 | } |
803 | 809 | } |
804 | 810 | |
... | ... | @@ -998,16 +1004,6 @@ |
998 | 1004 | return 0; |
999 | 1005 | } |
1000 | 1006 | |
1001 | -static void ip_vs_dest_wait_readers(struct rcu_head *head) | |
1002 | -{ | |
1003 | - struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest, | |
1004 | - rcu_head); | |
1005 | - | |
1006 | - /* End of grace period after unlinking */ | |
1007 | - clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); | |
1008 | -} | |
1009 | - | |
1010 | - | |
1011 | 1007 | /* |
1012 | 1008 | * Delete a destination (must be already unlinked from the service) |
1013 | 1009 | */ |
1014 | 1010 | |
1015 | 1011 | |
... | ... | @@ -1023,20 +1019,16 @@ |
1023 | 1019 | */ |
1024 | 1020 | ip_vs_rs_unhash(dest); |
1025 | 1021 | |
1026 | - if (!cleanup) { | |
1027 | - set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state); | |
1028 | - call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers); | |
1029 | - } | |
1030 | - | |
1031 | 1022 | spin_lock_bh(&ipvs->dest_trash_lock); |
1032 | 1023 | IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", |
1033 | 1024 | IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), |
1034 | 1025 | atomic_read(&dest->refcnt)); |
1035 | 1026 | if (list_empty(&ipvs->dest_trash) && !cleanup) |
1036 | 1027 | mod_timer(&ipvs->dest_trash_timer, |
1037 | - jiffies + IP_VS_DEST_TRASH_PERIOD); | |
1028 | + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); | |
1038 | 1029 | /* dest lives in trash without reference */ |
1039 | 1030 | list_add(&dest->t_list, &ipvs->dest_trash); |
1031 | + dest->idle_start = 0; | |
1040 | 1032 | spin_unlock_bh(&ipvs->dest_trash_lock); |
1041 | 1033 | ip_vs_dest_put(dest); |
1042 | 1034 | } |
1043 | 1035 | |
1044 | 1036 | |
1045 | 1037 | |
1046 | 1038 | |
... | ... | @@ -1108,24 +1100,30 @@ |
1108 | 1100 | struct net *net = (struct net *) data; |
1109 | 1101 | struct netns_ipvs *ipvs = net_ipvs(net); |
1110 | 1102 | struct ip_vs_dest *dest, *next; |
1103 | + unsigned long now = jiffies; | |
1111 | 1104 | |
1112 | 1105 | spin_lock(&ipvs->dest_trash_lock); |
1113 | 1106 | list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { |
1114 | - /* Skip if dest is in grace period */ | |
1115 | - if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state)) | |
1116 | - continue; | |
1117 | 1107 | if (atomic_read(&dest->refcnt) > 0) |
1118 | 1108 | continue; |
1109 | + if (dest->idle_start) { | |
1110 | + if (time_before(now, dest->idle_start + | |
1111 | + IP_VS_DEST_TRASH_PERIOD)) | |
1112 | + continue; | |
1113 | + } else { | |
1114 | + dest->idle_start = max(1UL, now); | |
1115 | + continue; | |
1116 | + } | |
1119 | 1117 | IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n", |
1120 | 1118 | dest->vfwmark, |
1121 | - IP_VS_DBG_ADDR(dest->svc->af, &dest->addr), | |
1119 | + IP_VS_DBG_ADDR(dest->af, &dest->addr), | |
1122 | 1120 | ntohs(dest->port)); |
1123 | 1121 | list_del(&dest->t_list); |
1124 | 1122 | ip_vs_dest_free(dest); |
1125 | 1123 | } |
1126 | 1124 | if (!list_empty(&ipvs->dest_trash)) |
1127 | 1125 | mod_timer(&ipvs->dest_trash_timer, |
1128 | - jiffies + IP_VS_DEST_TRASH_PERIOD); | |
1126 | + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); | |
1129 | 1127 | spin_unlock(&ipvs->dest_trash_lock); |
1130 | 1128 | } |
1131 | 1129 | |
... | ... | @@ -1320,14 +1318,6 @@ |
1320 | 1318 | return ret; |
1321 | 1319 | } |
1322 | 1320 | |
1323 | -static void ip_vs_service_rcu_free(struct rcu_head *head) | |
1324 | -{ | |
1325 | - struct ip_vs_service *svc; | |
1326 | - | |
1327 | - svc = container_of(head, struct ip_vs_service, rcu_head); | |
1328 | - ip_vs_service_free(svc); | |
1329 | -} | |
1330 | - | |
1331 | 1321 | /* |
1332 | 1322 | * Delete a service from the service list |
1333 | 1323 | * - The service must be unlinked, unlocked and not referenced! |
... | ... | @@ -1376,13 +1366,7 @@ |
1376 | 1366 | /* |
1377 | 1367 | * Free the service if nobody refers to it |
1378 | 1368 | */ |
1379 | - if (atomic_dec_and_test(&svc->refcnt)) { | |
1380 | - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", | |
1381 | - svc->fwmark, | |
1382 | - IP_VS_DBG_ADDR(svc->af, &svc->addr), | |
1383 | - ntohs(svc->port)); | |
1384 | - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); | |
1385 | - } | |
1369 | + __ip_vs_svc_put(svc, true); | |
1386 | 1370 | |
1387 | 1371 | /* decrease the module use count */ |
1388 | 1372 | ip_vs_use_count_dec(); |