Commit 83723d60717f8da0f53f91cf42a845ed56c09662

Authored by Eric Dumazet
Committed by Pablo Neira Ayuso
1 parent 45b9f509b7

netfilter: x_tables: don't block BH while reading counters

Using "iptables -L" with a lot of rules have a too big BH latency.
Jesper mentioned ~6 ms and worried of frame drops.

Switch to a per-cpu seqlock scheme, so that taking a snapshot of the
counters doesn't need to block BH (neither on this cpu nor on the other cpus).

This adds two increments of the seqlock sequence per ipt_do_table() call,
which is a reasonable cost for letting "iptables -L" run without blocking
BH processing.
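
As a minimal sketch of the pattern this relies on (hypothetical names such
as ctr_update/ctr_snapshot, assuming only <linux/seqlock.h>; this is not
the patched code itself): the packet path updates the counters under the
per-cpu seqlock as a writer, while the reader snapshots them in a
read_seqbegin()/read_seqretry() loop and never has to disable BH.

#include <linux/types.h>
#include <linux/seqlock.h>

struct counter_pair {
	u64 bcnt;	/* byte count */
	u64 pcnt;	/* packet count */
};

static DEFINE_SEQLOCK(ctr_lock);	/* stands in for xt_info_locks.lock */
static struct counter_pair ctr;

/* hot path: the two sequence increments per update mentioned above */
static void ctr_update(u64 bytes)
{
	write_seqlock(&ctr_lock);
	ctr.bcnt += bytes;
	ctr.pcnt++;
	write_sequnlock(&ctr_lock);
}

/* "iptables -L" path: consistent snapshot without blocking BH */
static void ctr_snapshot(u64 *bcnt, u64 *pcnt)
{
	unsigned int start;

	do {
		start = read_seqbegin(&ctr_lock);
		*bcnt = ctr.bcnt;
		*pcnt = ctr.pcnt;
	} while (read_seqretry(&ctr_lock, start));
}

The reader simply retries if an update raced with it, so the snapshot stays
consistent even though writers never wait. In the actual patch the writer
side is the xt_info_* helpers in x_tables.h and the reader loop is the one
added to get_counters() in the diffs below.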

Reported-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

Showing 5 changed files with 49 additions and 99 deletions

include/linux/netfilter/x_tables.h
... ... @@ -472,7 +472,7 @@
472 472 * necessary for reading the counters.
473 473 */
474 474 struct xt_info_lock {
475   - spinlock_t lock;
  475 + seqlock_t lock;
476 476 unsigned char readers;
477 477 };
478 478 DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
... ... @@ -497,7 +497,7 @@
497 497 local_bh_disable();
498 498 lock = &__get_cpu_var(xt_info_locks);
499 499 if (likely(!lock->readers++))
500   - spin_lock(&lock->lock);
  500 + write_seqlock(&lock->lock);
501 501 }
502 502  
503 503 static inline void xt_info_rdunlock_bh(void)
... ... @@ -505,7 +505,7 @@
505 505 struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
506 506  
507 507 if (likely(!--lock->readers))
508   - spin_unlock(&lock->lock);
  508 + write_sequnlock(&lock->lock);
509 509 local_bh_enable();
510 510 }
511 511  
512 512  
... ... @@ -516,12 +516,12 @@
516 516 */
517 517 static inline void xt_info_wrlock(unsigned int cpu)
518 518 {
519   - spin_lock(&per_cpu(xt_info_locks, cpu).lock);
  519 + write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
520 520 }
521 521  
522 522 static inline void xt_info_wrunlock(unsigned int cpu)
523 523 {
524   - spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
  524 + write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
525 525 }
526 526  
527 527 /*
net/ipv4/netfilter/arp_tables.c
... ... @@ -710,42 +710,25 @@
710 710 struct arpt_entry *iter;
711 711 unsigned int cpu;
712 712 unsigned int i;
713   - unsigned int curcpu = get_cpu();
714 713  
715   - /* Instead of clearing (by a previous call to memset())
716   - * the counters and using adds, we set the counters
717   - * with data used by 'current' CPU
718   - *
719   - * Bottom half has to be disabled to prevent deadlock
720   - * if new softirq were to run and call ipt_do_table
721   - */
722   - local_bh_disable();
723   - i = 0;
724   - xt_entry_foreach(iter, t->entries[curcpu], t->size) {
725   - SET_COUNTER(counters[i], iter->counters.bcnt,
726   - iter->counters.pcnt);
727   - ++i;
728   - }
729   - local_bh_enable();
730   - /* Processing counters from other cpus, we can let bottom half enabled,
731   - * (preemption is disabled)
732   - */
733   -
734 714 for_each_possible_cpu(cpu) {
735   - if (cpu == curcpu)
736   - continue;
  715 + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
  716 +
737 717 i = 0;
738   - local_bh_disable();
739   - xt_info_wrlock(cpu);
740 718 xt_entry_foreach(iter, t->entries[cpu], t->size) {
741   - ADD_COUNTER(counters[i], iter->counters.bcnt,
742   - iter->counters.pcnt);
  719 + u64 bcnt, pcnt;
  720 + unsigned int start;
  721 +
  722 + do {
  723 + start = read_seqbegin(lock);
  724 + bcnt = iter->counters.bcnt;
  725 + pcnt = iter->counters.pcnt;
  726 + } while (read_seqretry(lock, start));
  727 +
  728 + ADD_COUNTER(counters[i], bcnt, pcnt);
743 729 ++i;
744 730 }
745   - xt_info_wrunlock(cpu);
746   - local_bh_enable();
747 731 }
748   - put_cpu();
749 732 }
750 733  
751 734 static struct xt_counters *alloc_counters(const struct xt_table *table)
... ... @@ -759,7 +742,7 @@
759 742 * about).
760 743 */
761 744 countersize = sizeof(struct xt_counters) * private->number;
762   - counters = vmalloc(countersize);
  745 + counters = vzalloc(countersize);
763 746  
764 747 if (counters == NULL)
765 748 return ERR_PTR(-ENOMEM);
... ... @@ -1007,7 +990,7 @@
1007 990 struct arpt_entry *iter;
1008 991  
1009 992 ret = 0;
1010   - counters = vmalloc(num_counters * sizeof(struct xt_counters));
  993 + counters = vzalloc(num_counters * sizeof(struct xt_counters));
1011 994 if (!counters) {
1012 995 ret = -ENOMEM;
1013 996 goto out;
net/ipv4/netfilter/ip_tables.c
... ... @@ -884,42 +884,25 @@
884 884 struct ipt_entry *iter;
885 885 unsigned int cpu;
886 886 unsigned int i;
887   - unsigned int curcpu = get_cpu();
888 887  
889   - /* Instead of clearing (by a previous call to memset())
890   - * the counters and using adds, we set the counters
891   - * with data used by 'current' CPU.
892   - *
893   - * Bottom half has to be disabled to prevent deadlock
894   - * if new softirq were to run and call ipt_do_table
895   - */
896   - local_bh_disable();
897   - i = 0;
898   - xt_entry_foreach(iter, t->entries[curcpu], t->size) {
899   - SET_COUNTER(counters[i], iter->counters.bcnt,
900   - iter->counters.pcnt);
901   - ++i;
902   - }
903   - local_bh_enable();
904   - /* Processing counters from other cpus, we can let bottom half enabled,
905   - * (preemption is disabled)
906   - */
907   -
908 888 for_each_possible_cpu(cpu) {
909   - if (cpu == curcpu)
910   - continue;
  889 + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
  890 +
911 891 i = 0;
912   - local_bh_disable();
913   - xt_info_wrlock(cpu);
914 892 xt_entry_foreach(iter, t->entries[cpu], t->size) {
915   - ADD_COUNTER(counters[i], iter->counters.bcnt,
916   - iter->counters.pcnt);
  893 + u64 bcnt, pcnt;
  894 + unsigned int start;
  895 +
  896 + do {
  897 + start = read_seqbegin(lock);
  898 + bcnt = iter->counters.bcnt;
  899 + pcnt = iter->counters.pcnt;
  900 + } while (read_seqretry(lock, start));
  901 +
  902 + ADD_COUNTER(counters[i], bcnt, pcnt);
917 903 ++i; /* macro does multi eval of i */
918 904 }
919   - xt_info_wrunlock(cpu);
920   - local_bh_enable();
921 905 }
922   - put_cpu();
923 906 }
924 907  
925 908 static struct xt_counters *alloc_counters(const struct xt_table *table)
... ... @@ -932,7 +915,7 @@
932 915 (other than comefrom, which userspace doesn't care
933 916 about). */
934 917 countersize = sizeof(struct xt_counters) * private->number;
935   - counters = vmalloc(countersize);
  918 + counters = vzalloc(countersize);
936 919  
937 920 if (counters == NULL)
938 921 return ERR_PTR(-ENOMEM);
... ... @@ -1203,7 +1186,7 @@
1203 1186 struct ipt_entry *iter;
1204 1187  
1205 1188 ret = 0;
1206   - counters = vmalloc(num_counters * sizeof(struct xt_counters));
  1189 + counters = vzalloc(num_counters * sizeof(struct xt_counters));
1207 1190 if (!counters) {
1208 1191 ret = -ENOMEM;
1209 1192 goto out;
net/ipv6/netfilter/ip6_tables.c
... ... @@ -897,42 +897,25 @@
897 897 struct ip6t_entry *iter;
898 898 unsigned int cpu;
899 899 unsigned int i;
900   - unsigned int curcpu = get_cpu();
901 900  
902   - /* Instead of clearing (by a previous call to memset())
903   - * the counters and using adds, we set the counters
904   - * with data used by 'current' CPU
905   - *
906   - * Bottom half has to be disabled to prevent deadlock
907   - * if new softirq were to run and call ipt_do_table
908   - */
909   - local_bh_disable();
910   - i = 0;
911   - xt_entry_foreach(iter, t->entries[curcpu], t->size) {
912   - SET_COUNTER(counters[i], iter->counters.bcnt,
913   - iter->counters.pcnt);
914   - ++i;
915   - }
916   - local_bh_enable();
917   - /* Processing counters from other cpus, we can let bottom half enabled,
918   - * (preemption is disabled)
919   - */
920   -
921 901 for_each_possible_cpu(cpu) {
922   - if (cpu == curcpu)
923   - continue;
  902 + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
  903 +
924 904 i = 0;
925   - local_bh_disable();
926   - xt_info_wrlock(cpu);
927 905 xt_entry_foreach(iter, t->entries[cpu], t->size) {
928   - ADD_COUNTER(counters[i], iter->counters.bcnt,
929   - iter->counters.pcnt);
  906 + u64 bcnt, pcnt;
  907 + unsigned int start;
  908 +
  909 + do {
  910 + start = read_seqbegin(lock);
  911 + bcnt = iter->counters.bcnt;
  912 + pcnt = iter->counters.pcnt;
  913 + } while (read_seqretry(lock, start));
  914 +
  915 + ADD_COUNTER(counters[i], bcnt, pcnt);
930 916 ++i;
931 917 }
932   - xt_info_wrunlock(cpu);
933   - local_bh_enable();
934 918 }
935   - put_cpu();
936 919 }
937 920  
938 921 static struct xt_counters *alloc_counters(const struct xt_table *table)
... ... @@ -945,7 +928,7 @@
945 928 (other than comefrom, which userspace doesn't care
946 929 about). */
947 930 countersize = sizeof(struct xt_counters) * private->number;
948   - counters = vmalloc(countersize);
  931 + counters = vzalloc(countersize);
949 932  
950 933 if (counters == NULL)
951 934 return ERR_PTR(-ENOMEM);
... ... @@ -1216,7 +1199,7 @@
1216 1199 struct ip6t_entry *iter;
1217 1200  
1218 1201 ret = 0;
1219   - counters = vmalloc(num_counters * sizeof(struct xt_counters));
  1202 + counters = vzalloc(num_counters * sizeof(struct xt_counters));
1220 1203 if (!counters) {
1221 1204 ret = -ENOMEM;
1222 1205 goto out;
net/netfilter/x_tables.c
... ... @@ -1325,7 +1325,8 @@
1325 1325  
1326 1326 for_each_possible_cpu(i) {
1327 1327 struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
1328   - spin_lock_init(&lock->lock);
  1328 +
  1329 + seqlock_init(&lock->lock);
1329 1330 lock->readers = 0;
1330 1331 }
1331 1332