Commit 83723d60717f8da0f53f91cf42a845ed56c09662
Committed by
Pablo Neira Ayuso
1 parent
45b9f509b7
netfilter: x_tables: don't block BH while reading counters
Using "iptables -L" with a lot of rules has too big a BH latency. Jesper mentioned ~6 ms and worried about frame drops. Switch to a per-cpu seqlock scheme, so that taking a snapshot of counters doesn't need to block BH (for this cpu, but also other cpus). This adds two increments of the seqlock sequence per ipt_do_table() call; it's a reasonable cost for allowing "iptables -L" not to block BH processing. Reported-by: Jesper Dangaard Brouer <hawk@comx.dk> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com> CC: Patrick McHardy <kaber@trash.net> Acked-by: Stephen Hemminger <shemminger@vyatta.com> Acked-by: Jesper Dangaard Brouer <hawk@comx.dk> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Showing 5 changed files with 49 additions and 99 deletions Side-by-side Diff
include/linux/netfilter/x_tables.h
... | ... | @@ -472,7 +472,7 @@ |
472 | 472 | * necessary for reading the counters. |
473 | 473 | */ |
474 | 474 | struct xt_info_lock { |
475 | - spinlock_t lock; | |
475 | + seqlock_t lock; | |
476 | 476 | unsigned char readers; |
477 | 477 | }; |
478 | 478 | DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); |
... | ... | @@ -497,7 +497,7 @@ |
497 | 497 | local_bh_disable(); |
498 | 498 | lock = &__get_cpu_var(xt_info_locks); |
499 | 499 | if (likely(!lock->readers++)) |
500 | - spin_lock(&lock->lock); | |
500 | + write_seqlock(&lock->lock); | |
501 | 501 | } |
502 | 502 | |
503 | 503 | static inline void xt_info_rdunlock_bh(void) |
... | ... | @@ -505,7 +505,7 @@ |
505 | 505 | struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); |
506 | 506 | |
507 | 507 | if (likely(!--lock->readers)) |
508 | - spin_unlock(&lock->lock); | |
508 | + write_sequnlock(&lock->lock); | |
509 | 509 | local_bh_enable(); |
510 | 510 | } |
511 | 511 | |
512 | 512 | |
... | ... | @@ -516,12 +516,12 @@ |
516 | 516 | */ |
517 | 517 | static inline void xt_info_wrlock(unsigned int cpu) |
518 | 518 | { |
519 | - spin_lock(&per_cpu(xt_info_locks, cpu).lock); | |
519 | + write_seqlock(&per_cpu(xt_info_locks, cpu).lock); | |
520 | 520 | } |
521 | 521 | |
522 | 522 | static inline void xt_info_wrunlock(unsigned int cpu) |
523 | 523 | { |
524 | - spin_unlock(&per_cpu(xt_info_locks, cpu).lock); | |
524 | + write_sequnlock(&per_cpu(xt_info_locks, cpu).lock); | |
525 | 525 | } |
526 | 526 | |
527 | 527 | /* |
net/ipv4/netfilter/arp_tables.c
... | ... | @@ -710,42 +710,25 @@ |
710 | 710 | struct arpt_entry *iter; |
711 | 711 | unsigned int cpu; |
712 | 712 | unsigned int i; |
713 | - unsigned int curcpu = get_cpu(); | |
714 | 713 | |
715 | - /* Instead of clearing (by a previous call to memset()) | |
716 | - * the counters and using adds, we set the counters | |
717 | - * with data used by 'current' CPU | |
718 | - * | |
719 | - * Bottom half has to be disabled to prevent deadlock | |
720 | - * if new softirq were to run and call ipt_do_table | |
721 | - */ | |
722 | - local_bh_disable(); | |
723 | - i = 0; | |
724 | - xt_entry_foreach(iter, t->entries[curcpu], t->size) { | |
725 | - SET_COUNTER(counters[i], iter->counters.bcnt, | |
726 | - iter->counters.pcnt); | |
727 | - ++i; | |
728 | - } | |
729 | - local_bh_enable(); | |
730 | - /* Processing counters from other cpus, we can let bottom half enabled, | |
731 | - * (preemption is disabled) | |
732 | - */ | |
733 | - | |
734 | 714 | for_each_possible_cpu(cpu) { |
735 | - if (cpu == curcpu) | |
736 | - continue; | |
715 | + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; | |
716 | + | |
737 | 717 | i = 0; |
738 | - local_bh_disable(); | |
739 | - xt_info_wrlock(cpu); | |
740 | 718 | xt_entry_foreach(iter, t->entries[cpu], t->size) { |
741 | - ADD_COUNTER(counters[i], iter->counters.bcnt, | |
742 | - iter->counters.pcnt); | |
719 | + u64 bcnt, pcnt; | |
720 | + unsigned int start; | |
721 | + | |
722 | + do { | |
723 | + start = read_seqbegin(lock); | |
724 | + bcnt = iter->counters.bcnt; | |
725 | + pcnt = iter->counters.pcnt; | |
726 | + } while (read_seqretry(lock, start)); | |
727 | + | |
728 | + ADD_COUNTER(counters[i], bcnt, pcnt); | |
743 | 729 | ++i; |
744 | 730 | } |
745 | - xt_info_wrunlock(cpu); | |
746 | - local_bh_enable(); | |
747 | 731 | } |
748 | - put_cpu(); | |
749 | 732 | } |
750 | 733 | |
751 | 734 | static struct xt_counters *alloc_counters(const struct xt_table *table) |
... | ... | @@ -759,7 +742,7 @@ |
759 | 742 | * about). |
760 | 743 | */ |
761 | 744 | countersize = sizeof(struct xt_counters) * private->number; |
762 | - counters = vmalloc(countersize); | |
745 | + counters = vzalloc(countersize); | |
763 | 746 | |
764 | 747 | if (counters == NULL) |
765 | 748 | return ERR_PTR(-ENOMEM); |
... | ... | @@ -1007,7 +990,7 @@ |
1007 | 990 | struct arpt_entry *iter; |
1008 | 991 | |
1009 | 992 | ret = 0; |
1010 | - counters = vmalloc(num_counters * sizeof(struct xt_counters)); | |
993 | + counters = vzalloc(num_counters * sizeof(struct xt_counters)); | |
1011 | 994 | if (!counters) { |
1012 | 995 | ret = -ENOMEM; |
1013 | 996 | goto out; |
net/ipv4/netfilter/ip_tables.c
... | ... | @@ -884,42 +884,25 @@ |
884 | 884 | struct ipt_entry *iter; |
885 | 885 | unsigned int cpu; |
886 | 886 | unsigned int i; |
887 | - unsigned int curcpu = get_cpu(); | |
888 | 887 | |
889 | - /* Instead of clearing (by a previous call to memset()) | |
890 | - * the counters and using adds, we set the counters | |
891 | - * with data used by 'current' CPU. | |
892 | - * | |
893 | - * Bottom half has to be disabled to prevent deadlock | |
894 | - * if new softirq were to run and call ipt_do_table | |
895 | - */ | |
896 | - local_bh_disable(); | |
897 | - i = 0; | |
898 | - xt_entry_foreach(iter, t->entries[curcpu], t->size) { | |
899 | - SET_COUNTER(counters[i], iter->counters.bcnt, | |
900 | - iter->counters.pcnt); | |
901 | - ++i; | |
902 | - } | |
903 | - local_bh_enable(); | |
904 | - /* Processing counters from other cpus, we can let bottom half enabled, | |
905 | - * (preemption is disabled) | |
906 | - */ | |
907 | - | |
908 | 888 | for_each_possible_cpu(cpu) { |
909 | - if (cpu == curcpu) | |
910 | - continue; | |
889 | + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; | |
890 | + | |
911 | 891 | i = 0; |
912 | - local_bh_disable(); | |
913 | - xt_info_wrlock(cpu); | |
914 | 892 | xt_entry_foreach(iter, t->entries[cpu], t->size) { |
915 | - ADD_COUNTER(counters[i], iter->counters.bcnt, | |
916 | - iter->counters.pcnt); | |
893 | + u64 bcnt, pcnt; | |
894 | + unsigned int start; | |
895 | + | |
896 | + do { | |
897 | + start = read_seqbegin(lock); | |
898 | + bcnt = iter->counters.bcnt; | |
899 | + pcnt = iter->counters.pcnt; | |
900 | + } while (read_seqretry(lock, start)); | |
901 | + | |
902 | + ADD_COUNTER(counters[i], bcnt, pcnt); | |
917 | 903 | ++i; /* macro does multi eval of i */ |
918 | 904 | } |
919 | - xt_info_wrunlock(cpu); | |
920 | - local_bh_enable(); | |
921 | 905 | } |
922 | - put_cpu(); | |
923 | 906 | } |
924 | 907 | |
925 | 908 | static struct xt_counters *alloc_counters(const struct xt_table *table) |
... | ... | @@ -932,7 +915,7 @@ |
932 | 915 | (other than comefrom, which userspace doesn't care |
933 | 916 | about). */ |
934 | 917 | countersize = sizeof(struct xt_counters) * private->number; |
935 | - counters = vmalloc(countersize); | |
918 | + counters = vzalloc(countersize); | |
936 | 919 | |
937 | 920 | if (counters == NULL) |
938 | 921 | return ERR_PTR(-ENOMEM); |
... | ... | @@ -1203,7 +1186,7 @@ |
1203 | 1186 | struct ipt_entry *iter; |
1204 | 1187 | |
1205 | 1188 | ret = 0; |
1206 | - counters = vmalloc(num_counters * sizeof(struct xt_counters)); | |
1189 | + counters = vzalloc(num_counters * sizeof(struct xt_counters)); | |
1207 | 1190 | if (!counters) { |
1208 | 1191 | ret = -ENOMEM; |
1209 | 1192 | goto out; |
net/ipv6/netfilter/ip6_tables.c
... | ... | @@ -897,42 +897,25 @@ |
897 | 897 | struct ip6t_entry *iter; |
898 | 898 | unsigned int cpu; |
899 | 899 | unsigned int i; |
900 | - unsigned int curcpu = get_cpu(); | |
901 | 900 | |
902 | - /* Instead of clearing (by a previous call to memset()) | |
903 | - * the counters and using adds, we set the counters | |
904 | - * with data used by 'current' CPU | |
905 | - * | |
906 | - * Bottom half has to be disabled to prevent deadlock | |
907 | - * if new softirq were to run and call ipt_do_table | |
908 | - */ | |
909 | - local_bh_disable(); | |
910 | - i = 0; | |
911 | - xt_entry_foreach(iter, t->entries[curcpu], t->size) { | |
912 | - SET_COUNTER(counters[i], iter->counters.bcnt, | |
913 | - iter->counters.pcnt); | |
914 | - ++i; | |
915 | - } | |
916 | - local_bh_enable(); | |
917 | - /* Processing counters from other cpus, we can let bottom half enabled, | |
918 | - * (preemption is disabled) | |
919 | - */ | |
920 | - | |
921 | 901 | for_each_possible_cpu(cpu) { |
922 | - if (cpu == curcpu) | |
923 | - continue; | |
902 | + seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock; | |
903 | + | |
924 | 904 | i = 0; |
925 | - local_bh_disable(); | |
926 | - xt_info_wrlock(cpu); | |
927 | 905 | xt_entry_foreach(iter, t->entries[cpu], t->size) { |
928 | - ADD_COUNTER(counters[i], iter->counters.bcnt, | |
929 | - iter->counters.pcnt); | |
906 | + u64 bcnt, pcnt; | |
907 | + unsigned int start; | |
908 | + | |
909 | + do { | |
910 | + start = read_seqbegin(lock); | |
911 | + bcnt = iter->counters.bcnt; | |
912 | + pcnt = iter->counters.pcnt; | |
913 | + } while (read_seqretry(lock, start)); | |
914 | + | |
915 | + ADD_COUNTER(counters[i], bcnt, pcnt); | |
930 | 916 | ++i; |
931 | 917 | } |
932 | - xt_info_wrunlock(cpu); | |
933 | - local_bh_enable(); | |
934 | 918 | } |
935 | - put_cpu(); | |
936 | 919 | } |
937 | 920 | |
938 | 921 | static struct xt_counters *alloc_counters(const struct xt_table *table) |
... | ... | @@ -945,7 +928,7 @@ |
945 | 928 | (other than comefrom, which userspace doesn't care |
946 | 929 | about). */ |
947 | 930 | countersize = sizeof(struct xt_counters) * private->number; |
948 | - counters = vmalloc(countersize); | |
931 | + counters = vzalloc(countersize); | |
949 | 932 | |
950 | 933 | if (counters == NULL) |
951 | 934 | return ERR_PTR(-ENOMEM); |
... | ... | @@ -1216,7 +1199,7 @@ |
1216 | 1199 | struct ip6t_entry *iter; |
1217 | 1200 | |
1218 | 1201 | ret = 0; |
1219 | - counters = vmalloc(num_counters * sizeof(struct xt_counters)); | |
1202 | + counters = vzalloc(num_counters * sizeof(struct xt_counters)); | |
1220 | 1203 | if (!counters) { |
1221 | 1204 | ret = -ENOMEM; |
1222 | 1205 | goto out; |
net/netfilter/x_tables.c