Commit 942e4a2bd680c606af0211e64eb216be2e19bf61
Committed by David S. Miller
1 parent: bf368e4e70
Exists in master and in 39 other branches
netfilter: revised locking for x_tables
The x_tables are organized with a table structure and per-cpu copies of the counters and rules. On older kernels there was a reader/writer lock per table, which was a performance bottleneck. In 2.6.30-rc this was converted to use RCU for the counters/rules, which solved the performance problems for the do_table packet-processing path but made replacing rules much slower because of the required RCU grace period.

This version uses a per-cpu set of spinlocks and counters to allow table processing to proceed without the cache thrashing of a global reader lock, while keeping the same performance for table updates.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
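As an illustration of the scheme described above, here is a minimal sketch (not part of the commit) of the read-side pattern that replaces the old rcu_read_lock_bh() usage, based on the xt_info_rdlock_bh()/xt_info_rdunlock_bh() helpers added in include/linux/netfilter/x_tables.h below; foo_do_table and the rule walk are placeholders:

#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>

static unsigned int foo_do_table(struct sk_buff *skb, struct xt_table *table)
{
	const struct xt_table_info *private;
	const void *table_base;
	unsigned int verdict = NF_DROP;

	xt_info_rdlock_bh();		/* per-cpu lock; safe to re-enter on this cpu */
	private = table->private;	/* plain load, no rcu_dereference() needed */
	table_base = private->entries[smp_processor_id()];

	/* ... walk the rules at table_base and set verdict ... */

	xt_info_rdunlock_bh();		/* drops the lock and re-enables bottom halves */
	return verdict;
}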
Showing 5 changed files with 204 additions and 296 deletions
include/linux/netfilter/x_tables.h
... | ... | @@ -354,9 +354,6 @@ |
354 | 354 | /* What hooks you will enter on */ |
355 | 355 | unsigned int valid_hooks; |
356 | 356 | |
357 | - /* Lock for the curtain */ | |
358 | - struct mutex lock; | |
359 | - | |
360 | 357 | /* Man behind the curtain... */ |
361 | 358 | struct xt_table_info *private; |
362 | 359 | |
... | ... | @@ -434,8 +431,74 @@ |
434 | 431 | |
435 | 432 | extern struct xt_table_info *xt_alloc_table_info(unsigned int size); |
436 | 433 | extern void xt_free_table_info(struct xt_table_info *info); |
437 | -extern void xt_table_entry_swap_rcu(struct xt_table_info *old, | |
438 | - struct xt_table_info *new); | |
434 | + | |
435 | +/* | |
436 | + * Per-CPU spinlock associated with per-cpu table entries, and | |
437 | + * with a counter for the "reading" side that allows a recursive | |
438 | + * reader to avoid taking the lock and deadlocking. | |
439 | + * | |
440 | + * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu. | |
441 | + * It needs to ensure that the rules are not being changed while the packet | |
442 | + * is being processed. In some cases, the read lock will be acquired | |
443 | + * twice on the same CPU; this is okay because of the count. | |
444 | + * | |
445 | + * "writing" is used when reading counters. | |
446 | + * During replace any readers that are using the old tables have to complete | |
447 | + * before freeing the old table. This is handled by the write locking | |
448 | + * necessary for reading the counters. | |
449 | + */ | |
450 | +struct xt_info_lock { | |
451 | + spinlock_t lock; | |
452 | + unsigned char readers; | |
453 | +}; | |
454 | +DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks); | |
455 | + | |
456 | +/* | |
457 | + * Note: we need to ensure that preemption is disabled before acquiring | |
458 | + * the per-cpu-variable, so we do it as a two step process rather than | |
459 | + * using "spin_lock_bh()". | |
460 | + * | |
461 | + * We _also_ need to disable bottom half processing before updating our | |
462 | + * nesting count, to make sure that the only kind of re-entrancy is this | |
463 | + * code being called by itself: since the count+lock is not an atomic | |
464 | + * operation, we can allow no races. | |
465 | + * | |
466 | + * _Only_ that special combination of being per-cpu and never getting | |
467 | + * re-entered asynchronously means that the count is safe. | |
468 | + */ | |
469 | +static inline void xt_info_rdlock_bh(void) | |
470 | +{ | |
471 | + struct xt_info_lock *lock; | |
472 | + | |
473 | + local_bh_disable(); | |
474 | + lock = &__get_cpu_var(xt_info_locks); | |
475 | + if (!lock->readers++) | |
476 | + spin_lock(&lock->lock); | |
477 | +} | |
478 | + | |
479 | +static inline void xt_info_rdunlock_bh(void) | |
480 | +{ | |
481 | + struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks); | |
482 | + | |
483 | + if (!--lock->readers) | |
484 | + spin_unlock(&lock->lock); | |
485 | + local_bh_enable(); | |
486 | +} | |
487 | + | |
488 | +/* | |
489 | + * The "writer" side needs to get exclusive access to the lock, | |
490 | + * regardless of readers. This must be called with bottom half | |
491 | + * processing (and thus also preemption) disabled. | |
492 | + */ | |
493 | +static inline void xt_info_wrlock(unsigned int cpu) | |
494 | +{ | |
495 | + spin_lock(&per_cpu(xt_info_locks, cpu).lock); | |
496 | +} | |
497 | + | |
498 | +static inline void xt_info_wrunlock(unsigned int cpu) | |
499 | +{ | |
500 | + spin_unlock(&per_cpu(xt_info_locks, cpu).lock); | |
501 | +} | |
439 | 502 | |
440 | 503 | /* |
441 | 504 | * This helper is performance critical and must be inlined |
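For reference, a hedged sketch (not part of the commit) of how the write-side helpers defined above are meant to be used when folding per-cpu counters, mirroring the get_counters() changes in the per-protocol files below; foo_fold_counters and the entry iteration are placeholders:

#include <linux/netfilter/x_tables.h>

static void foo_fold_counters(const struct xt_table_info *t,
			      struct xt_counters *counters)
{
	unsigned int cpu, curcpu;

	local_bh_disable();		/* no reader can start on this cpu now */
	curcpu = smp_processor_id();
	/* ... seed counters[] from t->entries[curcpu]; no lock needed locally ... */

	for_each_possible_cpu(cpu) {
		if (cpu == curcpu)
			continue;
		xt_info_wrlock(cpu);	/* waits out any reader running on that cpu */
		/* ... add t->entries[cpu] counters into counters[] ... */
		xt_info_wrunlock(cpu);
	}
	local_bh_enable();
}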
net/ipv4/netfilter/arp_tables.c
... | ... | @@ -253,9 +253,9 @@ |
253 | 253 | indev = in ? in->name : nulldevname; |
254 | 254 | outdev = out ? out->name : nulldevname; |
255 | 255 | |
256 | - rcu_read_lock_bh(); | |
257 | - private = rcu_dereference(table->private); | |
258 | - table_base = rcu_dereference(private->entries[smp_processor_id()]); | |
256 | + xt_info_rdlock_bh(); | |
257 | + private = table->private; | |
258 | + table_base = private->entries[smp_processor_id()]; | |
259 | 259 | |
260 | 260 | e = get_entry(table_base, private->hook_entry[hook]); |
261 | 261 | back = get_entry(table_base, private->underflow[hook]); |
... | ... | @@ -273,6 +273,7 @@ |
273 | 273 | |
274 | 274 | hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + |
275 | 275 | (2 * skb->dev->addr_len); |
276 | + | |
276 | 277 | ADD_COUNTER(e->counters, hdr_len, 1); |
277 | 278 | |
278 | 279 | t = arpt_get_target(e); |
279 | 280 | |
... | ... | @@ -328,9 +329,8 @@ |
328 | 329 | e = (void *)e + e->next_offset; |
329 | 330 | } |
330 | 331 | } while (!hotdrop); |
332 | + xt_info_rdunlock_bh(); | |
331 | 333 | |
332 | - rcu_read_unlock_bh(); | |
333 | - | |
334 | 334 | if (hotdrop) |
335 | 335 | return NF_DROP; |
336 | 336 | else |
337 | 337 | |
... | ... | @@ -711,9 +711,12 @@ |
711 | 711 | /* Instead of clearing (by a previous call to memset()) |
712 | 712 | * the counters and using adds, we set the counters |
713 | 713 | * with data used by 'current' CPU |
714 | - * We dont care about preemption here. | |
714 | + * | |
715 | + * Bottom half has to be disabled to prevent deadlock | |
716 | + * if new softirq were to run and call ipt_do_table | |
715 | 717 | */ |
716 | - curcpu = raw_smp_processor_id(); | |
718 | + local_bh_disable(); | |
719 | + curcpu = smp_processor_id(); | |
717 | 720 | |
718 | 721 | i = 0; |
719 | 722 | ARPT_ENTRY_ITERATE(t->entries[curcpu], |
720 | 723 | |
721 | 724 | |
722 | 725 | |
723 | 726 | |
... | ... | @@ -726,73 +729,22 @@ |
726 | 729 | if (cpu == curcpu) |
727 | 730 | continue; |
728 | 731 | i = 0; |
732 | + xt_info_wrlock(cpu); | |
729 | 733 | ARPT_ENTRY_ITERATE(t->entries[cpu], |
730 | 734 | t->size, |
731 | 735 | add_entry_to_counter, |
732 | 736 | counters, |
733 | 737 | &i); |
738 | + xt_info_wrunlock(cpu); | |
734 | 739 | } |
735 | -} | |
736 | - | |
737 | - | |
738 | -/* We're lazy, and add to the first CPU; overflow works its fey magic | |
739 | - * and everything is OK. */ | |
740 | -static int | |
741 | -add_counter_to_entry(struct arpt_entry *e, | |
742 | - const struct xt_counters addme[], | |
743 | - unsigned int *i) | |
744 | -{ | |
745 | - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | |
746 | - | |
747 | - (*i)++; | |
748 | - return 0; | |
749 | -} | |
750 | - | |
751 | -/* Take values from counters and add them back onto the current cpu */ | |
752 | -static void put_counters(struct xt_table_info *t, | |
753 | - const struct xt_counters counters[]) | |
754 | -{ | |
755 | - unsigned int i, cpu; | |
756 | - | |
757 | - local_bh_disable(); | |
758 | - cpu = smp_processor_id(); | |
759 | - i = 0; | |
760 | - ARPT_ENTRY_ITERATE(t->entries[cpu], | |
761 | - t->size, | |
762 | - add_counter_to_entry, | |
763 | - counters, | |
764 | - &i); | |
765 | 740 | local_bh_enable(); |
766 | 741 | } |
767 | 742 | |
768 | -static inline int | |
769 | -zero_entry_counter(struct arpt_entry *e, void *arg) | |
770 | -{ | |
771 | - e->counters.bcnt = 0; | |
772 | - e->counters.pcnt = 0; | |
773 | - return 0; | |
774 | -} | |
775 | - | |
776 | -static void | |
777 | -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) | |
778 | -{ | |
779 | - unsigned int cpu; | |
780 | - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; | |
781 | - | |
782 | - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | |
783 | - for_each_possible_cpu(cpu) { | |
784 | - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); | |
785 | - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, | |
786 | - zero_entry_counter, NULL); | |
787 | - } | |
788 | -} | |
789 | - | |
790 | 743 | static struct xt_counters *alloc_counters(struct xt_table *table) |
791 | 744 | { |
792 | 745 | unsigned int countersize; |
793 | 746 | struct xt_counters *counters; |
794 | 747 | struct xt_table_info *private = table->private; |
795 | - struct xt_table_info *info; | |
796 | 748 | |
797 | 749 | /* We need atomic snapshot of counters: rest doesn't change |
798 | 750 | * (other than comefrom, which userspace doesn't care |
799 | 751 | |
800 | 752 | |
801 | 753 | |
... | ... | @@ -802,30 +754,11 @@ |
802 | 754 | counters = vmalloc_node(countersize, numa_node_id()); |
803 | 755 | |
804 | 756 | if (counters == NULL) |
805 | - goto nomem; | |
757 | + return ERR_PTR(-ENOMEM); | |
806 | 758 | |
807 | - info = xt_alloc_table_info(private->size); | |
808 | - if (!info) | |
809 | - goto free_counters; | |
759 | + get_counters(private, counters); | |
810 | 760 | |
811 | - clone_counters(info, private); | |
812 | - | |
813 | - mutex_lock(&table->lock); | |
814 | - xt_table_entry_swap_rcu(private, info); | |
815 | - synchronize_net(); /* Wait until smoke has cleared */ | |
816 | - | |
817 | - get_counters(info, counters); | |
818 | - put_counters(private, counters); | |
819 | - mutex_unlock(&table->lock); | |
820 | - | |
821 | - xt_free_table_info(info); | |
822 | - | |
823 | 761 | return counters; |
824 | - | |
825 | - free_counters: | |
826 | - vfree(counters); | |
827 | - nomem: | |
828 | - return ERR_PTR(-ENOMEM); | |
829 | 762 | } |
830 | 763 | |
831 | 764 | static int copy_entries_to_user(unsigned int total_size, |
832 | 765 | |
... | ... | @@ -1094,8 +1027,9 @@ |
1094 | 1027 | (newinfo->number <= oldinfo->initial_entries)) |
1095 | 1028 | module_put(t->me); |
1096 | 1029 | |
1097 | - /* Get the old counters. */ | |
1030 | + /* Get the old counters, and synchronize with replace */ | |
1098 | 1031 | get_counters(oldinfo, counters); |
1032 | + | |
1099 | 1033 | /* Decrease module usage counts and free resource */ |
1100 | 1034 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1101 | 1035 | ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, |
1102 | 1036 | |
... | ... | @@ -1165,10 +1099,23 @@ |
1165 | 1099 | return ret; |
1166 | 1100 | } |
1167 | 1101 | |
1102 | +/* We're lazy, and add to the first CPU; overflow works its fey magic | |
1103 | + * and everything is OK. */ | |
1104 | +static int | |
1105 | +add_counter_to_entry(struct arpt_entry *e, | |
1106 | + const struct xt_counters addme[], | |
1107 | + unsigned int *i) | |
1108 | +{ | |
1109 | + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | |
1110 | + | |
1111 | + (*i)++; | |
1112 | + return 0; | |
1113 | +} | |
1114 | + | |
1168 | 1115 | static int do_add_counters(struct net *net, void __user *user, unsigned int len, |
1169 | 1116 | int compat) |
1170 | 1117 | { |
1171 | - unsigned int i; | |
1118 | + unsigned int i, curcpu; | |
1172 | 1119 | struct xt_counters_info tmp; |
1173 | 1120 | struct xt_counters *paddc; |
1174 | 1121 | unsigned int num_counters; |
1175 | 1122 | |
1176 | 1123 | |
1177 | 1124 | |
1178 | 1125 | |
... | ... | @@ -1224,26 +1171,26 @@ |
1224 | 1171 | goto free; |
1225 | 1172 | } |
1226 | 1173 | |
1227 | - mutex_lock(&t->lock); | |
1174 | + local_bh_disable(); | |
1228 | 1175 | private = t->private; |
1229 | 1176 | if (private->number != num_counters) { |
1230 | 1177 | ret = -EINVAL; |
1231 | 1178 | goto unlock_up_free; |
1232 | 1179 | } |
1233 | 1180 | |
1234 | - preempt_disable(); | |
1235 | 1181 | i = 0; |
1236 | 1182 | /* Choose the copy that is on our node */ |
1237 | - loc_cpu_entry = private->entries[smp_processor_id()]; | |
1183 | + curcpu = smp_processor_id(); | |
1184 | + loc_cpu_entry = private->entries[curcpu]; | |
1185 | + xt_info_wrlock(curcpu); | |
1238 | 1186 | ARPT_ENTRY_ITERATE(loc_cpu_entry, |
1239 | 1187 | private->size, |
1240 | 1188 | add_counter_to_entry, |
1241 | 1189 | paddc, |
1242 | 1190 | &i); |
1243 | - preempt_enable(); | |
1191 | + xt_info_wrunlock(curcpu); | |
1244 | 1192 | unlock_up_free: |
1245 | - mutex_unlock(&t->lock); | |
1246 | - | |
1193 | + local_bh_enable(); | |
1247 | 1194 | xt_table_unlock(t); |
1248 | 1195 | module_put(t->me); |
1249 | 1196 | free: |
net/ipv4/netfilter/ip_tables.c
... | ... | @@ -338,11 +338,10 @@ |
338 | 338 | tgpar.hooknum = hook; |
339 | 339 | |
340 | 340 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
341 | + xt_info_rdlock_bh(); | |
342 | + private = table->private; | |
343 | + table_base = private->entries[smp_processor_id()]; | |
341 | 344 | |
342 | - rcu_read_lock_bh(); | |
343 | - private = rcu_dereference(table->private); | |
344 | - table_base = rcu_dereference(private->entries[smp_processor_id()]); | |
345 | - | |
346 | 345 | e = get_entry(table_base, private->hook_entry[hook]); |
347 | 346 | |
348 | 347 | /* For return from builtin chain */ |
349 | 348 | |
... | ... | @@ -436,9 +435,8 @@ |
436 | 435 | e = (void *)e + e->next_offset; |
437 | 436 | } |
438 | 437 | } while (!hotdrop); |
438 | + xt_info_rdunlock_bh(); | |
439 | 439 | |
440 | - rcu_read_unlock_bh(); | |
441 | - | |
442 | 440 | #ifdef DEBUG_ALLOW_ALL |
443 | 441 | return NF_ACCEPT; |
444 | 442 | #else |
445 | 443 | |
... | ... | @@ -896,10 +894,13 @@ |
896 | 894 | |
897 | 895 | /* Instead of clearing (by a previous call to memset()) |
898 | 896 | * the counters and using adds, we set the counters |
899 | - * with data used by 'current' CPU | |
900 | - * We dont care about preemption here. | |
897 | + * with data used by 'current' CPU. | |
898 | + * | |
899 | + * Bottom half has to be disabled to prevent deadlock | |
900 | + * if new softirq were to run and call ipt_do_table | |
901 | 901 | */ |
902 | - curcpu = raw_smp_processor_id(); | |
902 | + local_bh_disable(); | |
903 | + curcpu = smp_processor_id(); | |
903 | 904 | |
904 | 905 | i = 0; |
905 | 906 | IPT_ENTRY_ITERATE(t->entries[curcpu], |
906 | 907 | |
907 | 908 | |
908 | 909 | |
909 | 910 | |
... | ... | @@ -912,74 +913,22 @@ |
912 | 913 | if (cpu == curcpu) |
913 | 914 | continue; |
914 | 915 | i = 0; |
916 | + xt_info_wrlock(cpu); | |
915 | 917 | IPT_ENTRY_ITERATE(t->entries[cpu], |
916 | 918 | t->size, |
917 | 919 | add_entry_to_counter, |
918 | 920 | counters, |
919 | 921 | &i); |
922 | + xt_info_wrunlock(cpu); | |
920 | 923 | } |
921 | - | |
922 | -} | |
923 | - | |
924 | -/* We're lazy, and add to the first CPU; overflow works its fey magic | |
925 | - * and everything is OK. */ | |
926 | -static int | |
927 | -add_counter_to_entry(struct ipt_entry *e, | |
928 | - const struct xt_counters addme[], | |
929 | - unsigned int *i) | |
930 | -{ | |
931 | - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | |
932 | - | |
933 | - (*i)++; | |
934 | - return 0; | |
935 | -} | |
936 | - | |
937 | -/* Take values from counters and add them back onto the current cpu */ | |
938 | -static void put_counters(struct xt_table_info *t, | |
939 | - const struct xt_counters counters[]) | |
940 | -{ | |
941 | - unsigned int i, cpu; | |
942 | - | |
943 | - local_bh_disable(); | |
944 | - cpu = smp_processor_id(); | |
945 | - i = 0; | |
946 | - IPT_ENTRY_ITERATE(t->entries[cpu], | |
947 | - t->size, | |
948 | - add_counter_to_entry, | |
949 | - counters, | |
950 | - &i); | |
951 | 924 | local_bh_enable(); |
952 | 925 | } |
953 | 926 | |
954 | - | |
955 | -static inline int | |
956 | -zero_entry_counter(struct ipt_entry *e, void *arg) | |
957 | -{ | |
958 | - e->counters.bcnt = 0; | |
959 | - e->counters.pcnt = 0; | |
960 | - return 0; | |
961 | -} | |
962 | - | |
963 | -static void | |
964 | -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) | |
965 | -{ | |
966 | - unsigned int cpu; | |
967 | - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; | |
968 | - | |
969 | - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | |
970 | - for_each_possible_cpu(cpu) { | |
971 | - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); | |
972 | - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, | |
973 | - zero_entry_counter, NULL); | |
974 | - } | |
975 | -} | |
976 | - | |
977 | 927 | static struct xt_counters * alloc_counters(struct xt_table *table) |
978 | 928 | { |
979 | 929 | unsigned int countersize; |
980 | 930 | struct xt_counters *counters; |
981 | 931 | struct xt_table_info *private = table->private; |
982 | - struct xt_table_info *info; | |
983 | 932 | |
984 | 933 | /* We need atomic snapshot of counters: rest doesn't change |
985 | 934 | (other than comefrom, which userspace doesn't care |
986 | 935 | |
987 | 936 | |
988 | 937 | |
... | ... | @@ -988,30 +937,11 @@ |
988 | 937 | counters = vmalloc_node(countersize, numa_node_id()); |
989 | 938 | |
990 | 939 | if (counters == NULL) |
991 | - goto nomem; | |
940 | + return ERR_PTR(-ENOMEM); | |
992 | 941 | |
993 | - info = xt_alloc_table_info(private->size); | |
994 | - if (!info) | |
995 | - goto free_counters; | |
942 | + get_counters(private, counters); | |
996 | 943 | |
997 | - clone_counters(info, private); | |
998 | - | |
999 | - mutex_lock(&table->lock); | |
1000 | - xt_table_entry_swap_rcu(private, info); | |
1001 | - synchronize_net(); /* Wait until smoke has cleared */ | |
1002 | - | |
1003 | - get_counters(info, counters); | |
1004 | - put_counters(private, counters); | |
1005 | - mutex_unlock(&table->lock); | |
1006 | - | |
1007 | - xt_free_table_info(info); | |
1008 | - | |
1009 | 944 | return counters; |
1010 | - | |
1011 | - free_counters: | |
1012 | - vfree(counters); | |
1013 | - nomem: | |
1014 | - return ERR_PTR(-ENOMEM); | |
1015 | 945 | } |
1016 | 946 | |
1017 | 947 | static int |
1018 | 948 | |
... | ... | @@ -1306,8 +1236,9 @@ |
1306 | 1236 | (newinfo->number <= oldinfo->initial_entries)) |
1307 | 1237 | module_put(t->me); |
1308 | 1238 | |
1309 | - /* Get the old counters. */ | |
1239 | + /* Get the old counters, and synchronize with replace */ | |
1310 | 1240 | get_counters(oldinfo, counters); |
1241 | + | |
1311 | 1242 | /* Decrease module usage counts and free resource */ |
1312 | 1243 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1313 | 1244 | IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, |
1314 | 1245 | |
1315 | 1246 | |
... | ... | @@ -1377,11 +1308,23 @@ |
1377 | 1308 | return ret; |
1378 | 1309 | } |
1379 | 1310 | |
1311 | +/* We're lazy, and add to the first CPU; overflow works its fey magic | |
1312 | + * and everything is OK. */ | |
1313 | +static int | |
1314 | +add_counter_to_entry(struct ipt_entry *e, | |
1315 | + const struct xt_counters addme[], | |
1316 | + unsigned int *i) | |
1317 | +{ | |
1318 | + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | |
1380 | 1319 | |
1320 | + (*i)++; | |
1321 | + return 0; | |
1322 | +} | |
1323 | + | |
1381 | 1324 | static int |
1382 | 1325 | do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) |
1383 | 1326 | { |
1384 | - unsigned int i; | |
1327 | + unsigned int i, curcpu; | |
1385 | 1328 | struct xt_counters_info tmp; |
1386 | 1329 | struct xt_counters *paddc; |
1387 | 1330 | unsigned int num_counters; |
1388 | 1331 | |
1389 | 1332 | |
1390 | 1333 | |
1391 | 1334 | |
... | ... | @@ -1437,25 +1380,26 @@ |
1437 | 1380 | goto free; |
1438 | 1381 | } |
1439 | 1382 | |
1440 | - mutex_lock(&t->lock); | |
1383 | + local_bh_disable(); | |
1441 | 1384 | private = t->private; |
1442 | 1385 | if (private->number != num_counters) { |
1443 | 1386 | ret = -EINVAL; |
1444 | 1387 | goto unlock_up_free; |
1445 | 1388 | } |
1446 | 1389 | |
1447 | - preempt_disable(); | |
1448 | 1390 | i = 0; |
1449 | 1391 | /* Choose the copy that is on our node */ |
1450 | - loc_cpu_entry = private->entries[raw_smp_processor_id()]; | |
1392 | + curcpu = smp_processor_id(); | |
1393 | + loc_cpu_entry = private->entries[curcpu]; | |
1394 | + xt_info_wrlock(curcpu); | |
1451 | 1395 | IPT_ENTRY_ITERATE(loc_cpu_entry, |
1452 | 1396 | private->size, |
1453 | 1397 | add_counter_to_entry, |
1454 | 1398 | paddc, |
1455 | 1399 | &i); |
1456 | - preempt_enable(); | |
1400 | + xt_info_wrunlock(curcpu); | |
1457 | 1401 | unlock_up_free: |
1458 | - mutex_unlock(&t->lock); | |
1402 | + local_bh_enable(); | |
1459 | 1403 | xt_table_unlock(t); |
1460 | 1404 | module_put(t->me); |
1461 | 1405 | free: |
net/ipv6/netfilter/ip6_tables.c
... | ... | @@ -365,9 +365,9 @@ |
365 | 365 | |
366 | 366 | IP_NF_ASSERT(table->valid_hooks & (1 << hook)); |
367 | 367 | |
368 | - rcu_read_lock_bh(); | |
369 | - private = rcu_dereference(table->private); | |
370 | - table_base = rcu_dereference(private->entries[smp_processor_id()]); | |
368 | + xt_info_rdlock_bh(); | |
369 | + private = table->private; | |
370 | + table_base = private->entries[smp_processor_id()]; | |
371 | 371 | |
372 | 372 | e = get_entry(table_base, private->hook_entry[hook]); |
373 | 373 | |
... | ... | @@ -466,7 +466,7 @@ |
466 | 466 | #ifdef CONFIG_NETFILTER_DEBUG |
467 | 467 | ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; |
468 | 468 | #endif |
469 | - rcu_read_unlock_bh(); | |
469 | + xt_info_rdunlock_bh(); | |
470 | 470 | |
471 | 471 | #ifdef DEBUG_ALLOW_ALL |
472 | 472 | return NF_ACCEPT; |
473 | 473 | |
... | ... | @@ -926,9 +926,12 @@ |
926 | 926 | /* Instead of clearing (by a previous call to memset()) |
927 | 927 | * the counters and using adds, we set the counters |
928 | 928 | * with data used by 'current' CPU |
929 | - * We dont care about preemption here. | |
929 | + * | |
930 | + * Bottom half has to be disabled to prevent deadlock | |
931 | + * if new softirq were to run and call ipt_do_table | |
930 | 932 | */ |
931 | - curcpu = raw_smp_processor_id(); | |
933 | + local_bh_disable(); | |
934 | + curcpu = smp_processor_id(); | |
932 | 935 | |
933 | 936 | i = 0; |
934 | 937 | IP6T_ENTRY_ITERATE(t->entries[curcpu], |
935 | 938 | |
936 | 939 | |
937 | 940 | |
938 | 941 | |
... | ... | @@ -941,72 +944,22 @@ |
941 | 944 | if (cpu == curcpu) |
942 | 945 | continue; |
943 | 946 | i = 0; |
947 | + xt_info_wrlock(cpu); | |
944 | 948 | IP6T_ENTRY_ITERATE(t->entries[cpu], |
945 | 949 | t->size, |
946 | 950 | add_entry_to_counter, |
947 | 951 | counters, |
948 | 952 | &i); |
953 | + xt_info_wrunlock(cpu); | |
949 | 954 | } |
950 | -} | |
951 | - | |
952 | -/* We're lazy, and add to the first CPU; overflow works its fey magic | |
953 | - * and everything is OK. */ | |
954 | -static int | |
955 | -add_counter_to_entry(struct ip6t_entry *e, | |
956 | - const struct xt_counters addme[], | |
957 | - unsigned int *i) | |
958 | -{ | |
959 | - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | |
960 | - | |
961 | - (*i)++; | |
962 | - return 0; | |
963 | -} | |
964 | - | |
965 | -/* Take values from counters and add them back onto the current cpu */ | |
966 | -static void put_counters(struct xt_table_info *t, | |
967 | - const struct xt_counters counters[]) | |
968 | -{ | |
969 | - unsigned int i, cpu; | |
970 | - | |
971 | - local_bh_disable(); | |
972 | - cpu = smp_processor_id(); | |
973 | - i = 0; | |
974 | - IP6T_ENTRY_ITERATE(t->entries[cpu], | |
975 | - t->size, | |
976 | - add_counter_to_entry, | |
977 | - counters, | |
978 | - &i); | |
979 | 955 | local_bh_enable(); |
980 | 956 | } |
981 | 957 | |
982 | -static inline int | |
983 | -zero_entry_counter(struct ip6t_entry *e, void *arg) | |
984 | -{ | |
985 | - e->counters.bcnt = 0; | |
986 | - e->counters.pcnt = 0; | |
987 | - return 0; | |
988 | -} | |
989 | - | |
990 | -static void | |
991 | -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) | |
992 | -{ | |
993 | - unsigned int cpu; | |
994 | - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; | |
995 | - | |
996 | - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); | |
997 | - for_each_possible_cpu(cpu) { | |
998 | - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); | |
999 | - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, | |
1000 | - zero_entry_counter, NULL); | |
1001 | - } | |
1002 | -} | |
1003 | - | |
1004 | 958 | static struct xt_counters *alloc_counters(struct xt_table *table) |
1005 | 959 | { |
1006 | 960 | unsigned int countersize; |
1007 | 961 | struct xt_counters *counters; |
1008 | 962 | struct xt_table_info *private = table->private; |
1009 | - struct xt_table_info *info; | |
1010 | 963 | |
1011 | 964 | /* We need atomic snapshot of counters: rest doesn't change |
1012 | 965 | (other than comefrom, which userspace doesn't care |
1013 | 966 | |
1014 | 967 | |
1015 | 968 | |
... | ... | @@ -1015,30 +968,11 @@ |
1015 | 968 | counters = vmalloc_node(countersize, numa_node_id()); |
1016 | 969 | |
1017 | 970 | if (counters == NULL) |
1018 | - goto nomem; | |
971 | + return ERR_PTR(-ENOMEM); | |
1019 | 972 | |
1020 | - info = xt_alloc_table_info(private->size); | |
1021 | - if (!info) | |
1022 | - goto free_counters; | |
973 | + get_counters(private, counters); | |
1023 | 974 | |
1024 | - clone_counters(info, private); | |
1025 | - | |
1026 | - mutex_lock(&table->lock); | |
1027 | - xt_table_entry_swap_rcu(private, info); | |
1028 | - synchronize_net(); /* Wait until smoke has cleared */ | |
1029 | - | |
1030 | - get_counters(info, counters); | |
1031 | - put_counters(private, counters); | |
1032 | - mutex_unlock(&table->lock); | |
1033 | - | |
1034 | - xt_free_table_info(info); | |
1035 | - | |
1036 | 975 | return counters; |
1037 | - | |
1038 | - free_counters: | |
1039 | - vfree(counters); | |
1040 | - nomem: | |
1041 | - return ERR_PTR(-ENOMEM); | |
1042 | 976 | } |
1043 | 977 | |
1044 | 978 | static int |
1045 | 979 | |
... | ... | @@ -1334,8 +1268,9 @@ |
1334 | 1268 | (newinfo->number <= oldinfo->initial_entries)) |
1335 | 1269 | module_put(t->me); |
1336 | 1270 | |
1337 | - /* Get the old counters. */ | |
1271 | + /* Get the old counters, and synchronize with replace */ | |
1338 | 1272 | get_counters(oldinfo, counters); |
1273 | + | |
1339 | 1274 | /* Decrease module usage counts and free resource */ |
1340 | 1275 | loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; |
1341 | 1276 | IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, |
1342 | 1277 | |
1343 | 1278 | |
... | ... | @@ -1405,11 +1340,24 @@ |
1405 | 1340 | return ret; |
1406 | 1341 | } |
1407 | 1342 | |
1343 | +/* We're lazy, and add to the first CPU; overflow works its fey magic | |
1344 | + * and everything is OK. */ | |
1408 | 1345 | static int |
1346 | +add_counter_to_entry(struct ip6t_entry *e, | |
1347 | + const struct xt_counters addme[], | |
1348 | + unsigned int *i) | |
1349 | +{ | |
1350 | + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); | |
1351 | + | |
1352 | + (*i)++; | |
1353 | + return 0; | |
1354 | +} | |
1355 | + | |
1356 | +static int | |
1409 | 1357 | do_add_counters(struct net *net, void __user *user, unsigned int len, |
1410 | 1358 | int compat) |
1411 | 1359 | { |
1412 | - unsigned int i; | |
1360 | + unsigned int i, curcpu; | |
1413 | 1361 | struct xt_counters_info tmp; |
1414 | 1362 | struct xt_counters *paddc; |
1415 | 1363 | unsigned int num_counters; |
1416 | 1364 | |
1417 | 1365 | |
1418 | 1366 | |
1419 | 1367 | |
... | ... | @@ -1465,25 +1413,28 @@ |
1465 | 1413 | goto free; |
1466 | 1414 | } |
1467 | 1415 | |
1468 | - mutex_lock(&t->lock); | |
1416 | + | |
1417 | + local_bh_disable(); | |
1469 | 1418 | private = t->private; |
1470 | 1419 | if (private->number != num_counters) { |
1471 | 1420 | ret = -EINVAL; |
1472 | 1421 | goto unlock_up_free; |
1473 | 1422 | } |
1474 | 1423 | |
1475 | - preempt_disable(); | |
1476 | 1424 | i = 0; |
1477 | 1425 | /* Choose the copy that is on our node */ |
1478 | - loc_cpu_entry = private->entries[raw_smp_processor_id()]; | |
1426 | + curcpu = smp_processor_id(); | |
1427 | + xt_info_wrlock(curcpu); | |
1428 | + loc_cpu_entry = private->entries[curcpu]; | |
1479 | 1429 | IP6T_ENTRY_ITERATE(loc_cpu_entry, |
1480 | 1430 | private->size, |
1481 | 1431 | add_counter_to_entry, |
1482 | 1432 | paddc, |
1483 | 1433 | &i); |
1484 | - preempt_enable(); | |
1434 | + xt_info_wrunlock(curcpu); | |
1435 | + | |
1485 | 1436 | unlock_up_free: |
1486 | - mutex_unlock(&t->lock); | |
1437 | + local_bh_enable(); | |
1487 | 1438 | xt_table_unlock(t); |
1488 | 1439 | module_put(t->me); |
1489 | 1440 | free: |
net/netfilter/x_tables.c
... | ... | @@ -625,20 +625,6 @@ |
625 | 625 | } |
626 | 626 | EXPORT_SYMBOL(xt_free_table_info); |
627 | 627 | |
628 | -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, | |
629 | - struct xt_table_info *newinfo) | |
630 | -{ | |
631 | - unsigned int cpu; | |
632 | - | |
633 | - for_each_possible_cpu(cpu) { | |
634 | - void *p = oldinfo->entries[cpu]; | |
635 | - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); | |
636 | - newinfo->entries[cpu] = p; | |
637 | - } | |
638 | - | |
639 | -} | |
640 | -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); | |
641 | - | |
642 | 628 | /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ |
643 | 629 | struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, |
644 | 630 | const char *name) |
645 | 631 | |
646 | 632 | |
647 | 633 | |
648 | 634 | |
649 | 635 | |
650 | 636 | |
... | ... | @@ -676,32 +662,43 @@ |
676 | 662 | EXPORT_SYMBOL_GPL(xt_compat_unlock); |
677 | 663 | #endif |
678 | 664 | |
665 | +DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks); | |
666 | +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); | |
667 | + | |
668 | + | |
679 | 669 | struct xt_table_info * |
680 | 670 | xt_replace_table(struct xt_table *table, |
681 | 671 | unsigned int num_counters, |
682 | 672 | struct xt_table_info *newinfo, |
683 | 673 | int *error) |
684 | 674 | { |
685 | - struct xt_table_info *oldinfo, *private; | |
675 | + struct xt_table_info *private; | |
686 | 676 | |
687 | 677 | /* Do the substitution. */ |
688 | - mutex_lock(&table->lock); | |
678 | + local_bh_disable(); | |
689 | 679 | private = table->private; |
680 | + | |
690 | 681 | /* Check inside lock: is the old number correct? */ |
691 | 682 | if (num_counters != private->number) { |
692 | 683 | duprintf("num_counters != table->private->number (%u/%u)\n", |
693 | 684 | num_counters, private->number); |
694 | - mutex_unlock(&table->lock); | |
685 | + local_bh_enable(); | |
695 | 686 | *error = -EAGAIN; |
696 | 687 | return NULL; |
697 | 688 | } |
698 | - oldinfo = private; | |
699 | - rcu_assign_pointer(table->private, newinfo); | |
700 | - newinfo->initial_entries = oldinfo->initial_entries; | |
701 | - mutex_unlock(&table->lock); | |
702 | 689 | |
703 | - synchronize_net(); | |
704 | - return oldinfo; | |
690 | + table->private = newinfo; | |
691 | + newinfo->initial_entries = private->initial_entries; | |
692 | + | |
693 | + /* | |
694 | + * Even though table entries have now been swapped, other CPU's | |
695 | + * may still be using the old entries. This is okay, because | |
696 | + * resynchronization happens because of the locking done | |
697 | + * during the get_counters() routine. | |
698 | + */ | |
699 | + local_bh_enable(); | |
700 | + | |
701 | + return private; | |
705 | 702 | } |
706 | 703 | EXPORT_SYMBOL_GPL(xt_replace_table); |
707 | 704 | |
... | ... | @@ -734,7 +731,6 @@ |
734 | 731 | |
735 | 732 | /* Simplifies replace_table code. */ |
736 | 733 | table->private = bootstrap; |
737 | - mutex_init(&table->lock); | |
738 | 734 | |
739 | 735 | if (!xt_replace_table(table, 0, newinfo, &ret)) |
740 | 736 | goto unlock; |
... | ... | @@ -1147,7 +1143,14 @@ |
1147 | 1143 | |
1148 | 1144 | static int __init xt_init(void) |
1149 | 1145 | { |
1150 | - int i, rv; | |
1146 | + unsigned int i; | |
1147 | + int rv; | |
1148 | + | |
1149 | + for_each_possible_cpu(i) { | |
1150 | + struct xt_info_lock *lock = &per_cpu(xt_info_locks, i); | |
1151 | + spin_lock_init(&lock->lock); | |
1152 | + lock->readers = 0; | |
1153 | + } | |
1151 | 1154 | |
1152 | 1155 | xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); |
1153 | 1156 | if (!xt) |