Commit bcbde4c0a7556cca72874c5e1efa4dccb5198a2b

Authored by Julian Anastasov
Committed by Simon Horman
1 parent c16526a7b9

ipvs: make the service replacement more robust

commit 578bc3ef1e473a ("ipvs: reorganize dest trash") added
IP_VS_DEST_STATE_REMOVING flag and RCU callback named
ip_vs_dest_wait_readers() to keep dests and services after
removal for at least a RCU grace period. But we have the
following corner cases:

- we can not reuse the same dest if its service is removed
while IP_VS_DEST_STATE_REMOVING is still set because another dest
removal in the first grace period can not extend this period.
It can happen when ipvsadm -C && ipvsadm -R is used.

- dest->svc can be replaced but ip_vs_in_stats() and
ip_vs_out_stats() have no explicit read memory barriers
when accessing dest->svc. It can happen that dest->svc
was just freed (replaced) while we use it to update
the stats.

We solve the problems as follows:

- IP_VS_DEST_STATE_REMOVING is removed and we ensure a fixed
idle period for the dest (IP_VS_DEST_TRASH_PERIOD). idle_start
will remember when for first time after deletion we noticed
dest->refcnt=0. Later, the connections can grab a reference
while in RCU grace period but if refcnt becomes 0 we can
safely free the dest and its svc.

- dest->svc becomes RCU pointer. As result, we add explicit
RCU locking in ip_vs_in_stats() and ip_vs_out_stats().

- __ip_vs_unbind_svc is renamed to __ip_vs_svc_put(), it
now can free the service immediately or after a RCU grace
period. dest->svc is not set to NULL anymore.

	As result, unlinked dests and their services are
freed always after IP_VS_DEST_TRASH_PERIOD period, unused
services are freed after a RCU grace period.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>

Showing 3 changed files with 47 additions and 58 deletions Side-by-side Diff

... ... @@ -723,8 +723,6 @@
723 723 struct rcu_head rcu_head;
724 724 };
725 725  
726   -/* In grace period after removing */
727   -#define IP_VS_DEST_STATE_REMOVING 0x01
728 726 /*
729 727 * The real server destination forwarding entry
730 728 * with ip address, port number, and so on.
... ... @@ -742,7 +740,7 @@
742 740  
743 741 atomic_t refcnt; /* reference counter */
744 742 struct ip_vs_stats stats; /* statistics */
745   - unsigned long state; /* state flags */
  743 + unsigned long idle_start; /* start time, jiffies */
746 744  
747 745 /* connection counters and thresholds */
748 746 atomic_t activeconns; /* active connections */
749 747  
... ... @@ -756,14 +754,13 @@
756 754 struct ip_vs_dest_dst __rcu *dest_dst; /* cached dst info */
757 755  
758 756 /* for virtual service */
759   - struct ip_vs_service *svc; /* service it belongs to */
  757 + struct ip_vs_service __rcu *svc; /* service it belongs to */
760 758 __u16 protocol; /* which protocol (TCP/UDP) */
761 759 __be16 vport; /* virtual port number */
762 760 union nf_inet_addr vaddr; /* virtual IP address */
763 761 __u32 vfwmark; /* firewall mark of service */
764 762  
765 763 struct list_head t_list; /* in dest_trash */
766   - struct rcu_head rcu_head;
767 764 unsigned int in_rs_table:1; /* we are in rs_table */
768 765 };
769 766  
net/netfilter/ipvs/ip_vs_core.c
... ... @@ -116,6 +116,7 @@
116 116  
117 117 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
118 118 struct ip_vs_cpu_stats *s;
  119 + struct ip_vs_service *svc;
119 120  
120 121 s = this_cpu_ptr(dest->stats.cpustats);
121 122 s->ustats.inpkts++;
122 123  
... ... @@ -123,11 +124,14 @@
123 124 s->ustats.inbytes += skb->len;
124 125 u64_stats_update_end(&s->syncp);
125 126  
126   - s = this_cpu_ptr(dest->svc->stats.cpustats);
  127 + rcu_read_lock();
  128 + svc = rcu_dereference(dest->svc);
  129 + s = this_cpu_ptr(svc->stats.cpustats);
127 130 s->ustats.inpkts++;
128 131 u64_stats_update_begin(&s->syncp);
129 132 s->ustats.inbytes += skb->len;
130 133 u64_stats_update_end(&s->syncp);
  134 + rcu_read_unlock();
131 135  
132 136 s = this_cpu_ptr(ipvs->tot_stats.cpustats);
133 137 s->ustats.inpkts++;
... ... @@ -146,6 +150,7 @@
146 150  
147 151 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
148 152 struct ip_vs_cpu_stats *s;
  153 + struct ip_vs_service *svc;
149 154  
150 155 s = this_cpu_ptr(dest->stats.cpustats);
151 156 s->ustats.outpkts++;
152 157  
... ... @@ -153,11 +158,14 @@
153 158 s->ustats.outbytes += skb->len;
154 159 u64_stats_update_end(&s->syncp);
155 160  
156   - s = this_cpu_ptr(dest->svc->stats.cpustats);
  161 + rcu_read_lock();
  162 + svc = rcu_dereference(dest->svc);
  163 + s = this_cpu_ptr(svc->stats.cpustats);
157 164 s->ustats.outpkts++;
158 165 u64_stats_update_begin(&s->syncp);
159 166 s->ustats.outbytes += skb->len;
160 167 u64_stats_update_end(&s->syncp);
  168 + rcu_read_unlock();
161 169  
162 170 s = this_cpu_ptr(ipvs->tot_stats.cpustats);
163 171 s->ustats.outpkts++;
net/netfilter/ipvs/ip_vs_ctl.c
... ... @@ -460,7 +460,7 @@
460 460 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
461 461 {
462 462 atomic_inc(&svc->refcnt);
463   - dest->svc = svc;
  463 + rcu_assign_pointer(dest->svc, svc);
464 464 }
465 465  
466 466 static void ip_vs_service_free(struct ip_vs_service *svc)
467 467  
468 468  
469 469  
... ... @@ -470,18 +470,25 @@
470 470 kfree(svc);
471 471 }
472 472  
473   -static void
474   -__ip_vs_unbind_svc(struct ip_vs_dest *dest)
  473 +static void ip_vs_service_rcu_free(struct rcu_head *head)
475 474 {
476   - struct ip_vs_service *svc = dest->svc;
  475 + struct ip_vs_service *svc;
477 476  
478   - dest->svc = NULL;
  477 + svc = container_of(head, struct ip_vs_service, rcu_head);
  478 + ip_vs_service_free(svc);
  479 +}
  480 +
  481 +static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
  482 +{
479 483 if (atomic_dec_and_test(&svc->refcnt)) {
480 484 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
481 485 svc->fwmark,
482 486 IP_VS_DBG_ADDR(svc->af, &svc->addr),
483 487 ntohs(svc->port));
484   - ip_vs_service_free(svc);
  488 + if (do_delay)
  489 + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
  490 + else
  491 + ip_vs_service_free(svc);
485 492 }
486 493 }
487 494  
... ... @@ -667,11 +674,6 @@
667 674 IP_VS_DBG_ADDR(svc->af, &dest->addr),
668 675 ntohs(dest->port),
669 676 atomic_read(&dest->refcnt));
670   - /* We can not reuse dest while in grace period
671   - * because conns still can use dest->svc
672   - */
673   - if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
674   - continue;
675 677 if (dest->af == svc->af &&
676 678 ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
677 679 dest->port == dport &&
678 680  
... ... @@ -697,8 +699,10 @@
697 699  
698 700 static void ip_vs_dest_free(struct ip_vs_dest *dest)
699 701 {
  702 + struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
  703 +
700 704 __ip_vs_dst_cache_reset(dest);
701   - __ip_vs_unbind_svc(dest);
  705 + __ip_vs_svc_put(svc, false);
702 706 free_percpu(dest->stats.cpustats);
703 707 kfree(dest);
704 708 }
... ... @@ -771,6 +775,7 @@
771 775 struct ip_vs_dest_user_kern *udest, int add)
772 776 {
773 777 struct netns_ipvs *ipvs = net_ipvs(svc->net);
  778 + struct ip_vs_service *old_svc;
774 779 struct ip_vs_scheduler *sched;
775 780 int conn_flags;
776 781  
777 782  
778 783  
... ... @@ -792,13 +797,14 @@
792 797 atomic_set(&dest->conn_flags, conn_flags);
793 798  
794 799 /* bind the service */
795   - if (!dest->svc) {
  800 + old_svc = rcu_dereference_protected(dest->svc, 1);
  801 + if (!old_svc) {
796 802 __ip_vs_bind_svc(dest, svc);
797 803 } else {
798   - if (dest->svc != svc) {
799   - __ip_vs_unbind_svc(dest);
  804 + if (old_svc != svc) {
800 805 ip_vs_zero_stats(&dest->stats);
801 806 __ip_vs_bind_svc(dest, svc);
  807 + __ip_vs_svc_put(old_svc, true);
802 808 }
803 809 }
804 810  
... ... @@ -998,16 +1004,6 @@
998 1004 return 0;
999 1005 }
1000 1006  
1001   -static void ip_vs_dest_wait_readers(struct rcu_head *head)
1002   -{
1003   - struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
1004   - rcu_head);
1005   -
1006   - /* End of grace period after unlinking */
1007   - clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1008   -}
1009   -
1010   -
1011 1007 /*
1012 1008 * Delete a destination (must be already unlinked from the service)
1013 1009 */
1014 1010  
1015 1011  
... ... @@ -1023,20 +1019,16 @@
1023 1019 */
1024 1020 ip_vs_rs_unhash(dest);
1025 1021  
1026   - if (!cleanup) {
1027   - set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1028   - call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
1029   - }
1030   -
1031 1022 spin_lock_bh(&ipvs->dest_trash_lock);
1032 1023 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1033 1024 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1034 1025 atomic_read(&dest->refcnt));
1035 1026 if (list_empty(&ipvs->dest_trash) && !cleanup)
1036 1027 mod_timer(&ipvs->dest_trash_timer,
1037   - jiffies + IP_VS_DEST_TRASH_PERIOD);
  1028 + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1038 1029 /* dest lives in trash without reference */
1039 1030 list_add(&dest->t_list, &ipvs->dest_trash);
  1031 + dest->idle_start = 0;
1040 1032 spin_unlock_bh(&ipvs->dest_trash_lock);
1041 1033 ip_vs_dest_put(dest);
1042 1034 }
1043 1035  
1044 1036  
1045 1037  
1046 1038  
... ... @@ -1108,24 +1100,30 @@
1108 1100 struct net *net = (struct net *) data;
1109 1101 struct netns_ipvs *ipvs = net_ipvs(net);
1110 1102 struct ip_vs_dest *dest, *next;
  1103 + unsigned long now = jiffies;
1111 1104  
1112 1105 spin_lock(&ipvs->dest_trash_lock);
1113 1106 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1114   - /* Skip if dest is in grace period */
1115   - if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
1116   - continue;
1117 1107 if (atomic_read(&dest->refcnt) > 0)
1118 1108 continue;
  1109 + if (dest->idle_start) {
  1110 + if (time_before(now, dest->idle_start +
  1111 + IP_VS_DEST_TRASH_PERIOD))
  1112 + continue;
  1113 + } else {
  1114 + dest->idle_start = max(1UL, now);
  1115 + continue;
  1116 + }
1119 1117 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1120 1118 dest->vfwmark,
1121   - IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
  1119 + IP_VS_DBG_ADDR(dest->af, &dest->addr),
1122 1120 ntohs(dest->port));
1123 1121 list_del(&dest->t_list);
1124 1122 ip_vs_dest_free(dest);
1125 1123 }
1126 1124 if (!list_empty(&ipvs->dest_trash))
1127 1125 mod_timer(&ipvs->dest_trash_timer,
1128   - jiffies + IP_VS_DEST_TRASH_PERIOD);
  1126 + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1129 1127 spin_unlock(&ipvs->dest_trash_lock);
1130 1128 }
1131 1129  
... ... @@ -1320,14 +1318,6 @@
1320 1318 return ret;
1321 1319 }
1322 1320  
1323   -static void ip_vs_service_rcu_free(struct rcu_head *head)
1324   -{
1325   - struct ip_vs_service *svc;
1326   -
1327   - svc = container_of(head, struct ip_vs_service, rcu_head);
1328   - ip_vs_service_free(svc);
1329   -}
1330   -
1331 1321 /*
1332 1322 * Delete a service from the service list
1333 1323 * - The service must be unlinked, unlocked and not referenced!
... ... @@ -1376,13 +1366,7 @@
1376 1366 /*
1377 1367 * Free the service if nobody refers to it
1378 1368 */
1379   - if (atomic_dec_and_test(&svc->refcnt)) {
1380   - IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
1381   - svc->fwmark,
1382   - IP_VS_DBG_ADDR(svc->af, &svc->addr),
1383   - ntohs(svc->port));
1384   - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
1385   - }
  1369 + __ip_vs_svc_put(svc, true);
1386 1370  
1387 1371 /* decrease the module use count */
1388 1372 ip_vs_use_count_dec();