Commit 942e4a2bd680c606af0211e64eb216be2e19bf61

Authored by Stephen Hemminger
Committed by David S. Miller
1 parent bf368e4e70

netfilter: revised locking for x_tables

The x_tables are organized with a table structure and per-cpu copies
of the counters and rules. On older kernels there was a reader/writer
lock per table, which was a performance bottleneck. In 2.6.30-rc this
was converted to use RCU for the counters/rules, which solved the
performance problem for packet processing (the *_do_table paths) but made
replacing rules much slower because of the necessary RCU grace period.

This version uses a per-cpu set of spinlocks and counters, so that
table processing can proceed without the cache thrashing of a global
reader lock while keeping the same performance for table updates.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 5 changed files with 204 additions and 296 deletions
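
In the packet-processing fast path (arpt/ipt/ip6t_do_table), the change amounts to swapping the RCU read side for a recursive per-cpu read lock. A sketch of the before/after pattern, assembled from the per-protocol hunks below (not a separate API):

    /* before: RCU read side, table and entries fetched via rcu_dereference() */
    rcu_read_lock_bh();
    private    = rcu_dereference(table->private);
    table_base = rcu_dereference(private->entries[smp_processor_id()]);
    /* ... walk the rules, update per-rule counters ... */
    rcu_read_unlock_bh();

    /* after: per-cpu read lock (recursive via a reader count), plain loads */
    xt_info_rdlock_bh();
    private    = table->private;
    table_base = private->entries[smp_processor_id()];
    /* ... walk the rules, update per-rule counters ... */
    xt_info_rdunlock_bh();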

include/linux/netfilter/x_tables.h
... ... @@ -354,9 +354,6 @@
354 354 /* What hooks you will enter on */
355 355 unsigned int valid_hooks;
356 356  
357   - /* Lock for the curtain */
358   - struct mutex lock;
359   -
360 357 /* Man behind the curtain... */
361 358 struct xt_table_info *private;
362 359  
... ... @@ -434,8 +431,74 @@
434 431  
435 432 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
436 433 extern void xt_free_table_info(struct xt_table_info *info);
437   -extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
438   - struct xt_table_info *new);
  434 +
  435 +/*
  436 + * Per-CPU spinlock associated with per-cpu table entries, and
  437 + * with a counter for the "reading" side that allows a recursive
  438 + * reader to avoid taking the lock and deadlocking.
  439 + *
  440 + * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
  441 + * It needs to ensure that the rules are not being changed while the packet
  442 + * is being processed. In some cases, the read lock will be acquired
  443 + * twice on the same CPU; this is okay because of the count.
  444 + *
  445 + * "writing" is used when reading counters.
  446 + * During replace any readers that are using the old tables have to complete
  447 + * before freeing the old table. This is handled by the write locking
  448 + * necessary for reading the counters.
  449 + */
  450 +struct xt_info_lock {
  451 + spinlock_t lock;
  452 + unsigned char readers;
  453 +};
  454 +DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
  455 +
  456 +/*
  457 + * Note: we need to ensure that preemption is disabled before acquiring
  458 + * the per-cpu-variable, so we do it as a two step process rather than
  459 + * using "spin_lock_bh()".
  460 + *
  461 + * We _also_ need to disable bottom half processing before updating our
  462 + * nesting count, to make sure that the only kind of re-entrancy is this
  463 + * code being called by itself: since the count+lock is not an atomic
  464 + * operation, we can allow no races.
  465 + *
  466 + * _Only_ that special combination of being per-cpu and never getting
  467 + * re-entered asynchronously means that the count is safe.
  468 + */
  469 +static inline void xt_info_rdlock_bh(void)
  470 +{
  471 + struct xt_info_lock *lock;
  472 +
  473 + local_bh_disable();
  474 + lock = &__get_cpu_var(xt_info_locks);
  475 + if (!lock->readers++)
  476 + spin_lock(&lock->lock);
  477 +}
  478 +
  479 +static inline void xt_info_rdunlock_bh(void)
  480 +{
  481 + struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
  482 +
  483 + if (!--lock->readers)
  484 + spin_unlock(&lock->lock);
  485 + local_bh_enable();
  486 +}
  487 +
  488 +/*
  489 + * The "writer" side needs to get exclusive access to the lock,
  490 + * regardless of readers. This must be called with bottom half
  491 + * processing (and thus also preemption) disabled.
  492 + */
  493 +static inline void xt_info_wrlock(unsigned int cpu)
  494 +{
  495 + spin_lock(&per_cpu(xt_info_locks, cpu).lock);
  496 +}
  497 +
  498 +static inline void xt_info_wrunlock(unsigned int cpu)
  499 +{
  500 + spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
  501 +}
439 502  
440 503 /*
441 504 * This helper is performance critical and must be inlined
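
The comments in the header above describe the discipline; condensed into a caller-side sketch (the real callers are the *_do_table, get_counters() and do_add_counters() hunks below; cpu stands for whichever CPU's copy the writer wants to touch):

    /* "reading": per-packet rule traversal, runs per-cpu and may nest
     * (e.g. when a target emits a reply packet that traverses the tables
     * again on the same CPU) */
    xt_info_rdlock_bh();
    /* ... traverse this CPU's copy of the rules ... */
    xt_info_rdunlock_bh();

    /* "writing": reading or adjusting counters; the caller must already
     * have bottom halves disabled, hence no _bh variant */
    local_bh_disable();
    xt_info_wrlock(cpu);        /* excludes any reader on that CPU */
    /* ... read or adjust that CPU's counters ... */
    xt_info_wrunlock(cpu);
    local_bh_enable();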
net/ipv4/netfilter/arp_tables.c
... ... @@ -253,9 +253,9 @@
253 253 indev = in ? in->name : nulldevname;
254 254 outdev = out ? out->name : nulldevname;
255 255  
256   - rcu_read_lock_bh();
257   - private = rcu_dereference(table->private);
258   - table_base = rcu_dereference(private->entries[smp_processor_id()]);
  256 + xt_info_rdlock_bh();
  257 + private = table->private;
  258 + table_base = private->entries[smp_processor_id()];
259 259  
260 260 e = get_entry(table_base, private->hook_entry[hook]);
261 261 back = get_entry(table_base, private->underflow[hook]);
... ... @@ -273,6 +273,7 @@
273 273  
274 274 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
275 275 (2 * skb->dev->addr_len);
  276 +
276 277 ADD_COUNTER(e->counters, hdr_len, 1);
277 278  
278 279 t = arpt_get_target(e);
279 280  
... ... @@ -328,9 +329,8 @@
328 329 e = (void *)e + e->next_offset;
329 330 }
330 331 } while (!hotdrop);
  332 + xt_info_rdunlock_bh();
331 333  
332   - rcu_read_unlock_bh();
333   -
334 334 if (hotdrop)
335 335 return NF_DROP;
336 336 else
337 337  
... ... @@ -711,9 +711,12 @@
711 711 /* Instead of clearing (by a previous call to memset())
712 712 * the counters and using adds, we set the counters
713 713 * with data used by 'current' CPU
714   - * We dont care about preemption here.
  714 + *
  715 + * Bottom half has to be disabled to prevent deadlock
  716 + * if new softirq were to run and call ipt_do_table
715 717 */
716   - curcpu = raw_smp_processor_id();
  718 + local_bh_disable();
  719 + curcpu = smp_processor_id();
717 720  
718 721 i = 0;
719 722 ARPT_ENTRY_ITERATE(t->entries[curcpu],
720 723  
721 724  
722 725  
723 726  
... ... @@ -726,73 +729,22 @@
726 729 if (cpu == curcpu)
727 730 continue;
728 731 i = 0;
  732 + xt_info_wrlock(cpu);
729 733 ARPT_ENTRY_ITERATE(t->entries[cpu],
730 734 t->size,
731 735 add_entry_to_counter,
732 736 counters,
733 737 &i);
  738 + xt_info_wrunlock(cpu);
734 739 }
735   -}
736   -
737   -
738   -/* We're lazy, and add to the first CPU; overflow works its fey magic
739   - * and everything is OK. */
740   -static int
741   -add_counter_to_entry(struct arpt_entry *e,
742   - const struct xt_counters addme[],
743   - unsigned int *i)
744   -{
745   - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
746   -
747   - (*i)++;
748   - return 0;
749   -}
750   -
751   -/* Take values from counters and add them back onto the current cpu */
752   -static void put_counters(struct xt_table_info *t,
753   - const struct xt_counters counters[])
754   -{
755   - unsigned int i, cpu;
756   -
757   - local_bh_disable();
758   - cpu = smp_processor_id();
759   - i = 0;
760   - ARPT_ENTRY_ITERATE(t->entries[cpu],
761   - t->size,
762   - add_counter_to_entry,
763   - counters,
764   - &i);
765 740 local_bh_enable();
766 741 }
767 742  
768   -static inline int
769   -zero_entry_counter(struct arpt_entry *e, void *arg)
770   -{
771   - e->counters.bcnt = 0;
772   - e->counters.pcnt = 0;
773   - return 0;
774   -}
775   -
776   -static void
777   -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
778   -{
779   - unsigned int cpu;
780   - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
781   -
782   - memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
783   - for_each_possible_cpu(cpu) {
784   - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
785   - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
786   - zero_entry_counter, NULL);
787   - }
788   -}
789   -
790 743 static struct xt_counters *alloc_counters(struct xt_table *table)
791 744 {
792 745 unsigned int countersize;
793 746 struct xt_counters *counters;
794 747 struct xt_table_info *private = table->private;
795   - struct xt_table_info *info;
796 748  
797 749 /* We need atomic snapshot of counters: rest doesn't change
798 750 * (other than comefrom, which userspace doesn't care
799 751  
800 752  
801 753  
... ... @@ -802,30 +754,11 @@
802 754 counters = vmalloc_node(countersize, numa_node_id());
803 755  
804 756 if (counters == NULL)
805   - goto nomem;
  757 + return ERR_PTR(-ENOMEM);
806 758  
807   - info = xt_alloc_table_info(private->size);
808   - if (!info)
809   - goto free_counters;
  759 + get_counters(private, counters);
810 760  
811   - clone_counters(info, private);
812   -
813   - mutex_lock(&table->lock);
814   - xt_table_entry_swap_rcu(private, info);
815   - synchronize_net(); /* Wait until smoke has cleared */
816   -
817   - get_counters(info, counters);
818   - put_counters(private, counters);
819   - mutex_unlock(&table->lock);
820   -
821   - xt_free_table_info(info);
822   -
823 761 return counters;
824   -
825   - free_counters:
826   - vfree(counters);
827   - nomem:
828   - return ERR_PTR(-ENOMEM);
829 762 }
830 763  
831 764 static int copy_entries_to_user(unsigned int total_size,
832 765  
... ... @@ -1094,8 +1027,9 @@
1094 1027 (newinfo->number <= oldinfo->initial_entries))
1095 1028 module_put(t->me);
1096 1029  
1097   - /* Get the old counters. */
  1030 + /* Get the old counters, and synchronize with replace */
1098 1031 get_counters(oldinfo, counters);
  1032 +
1099 1033 /* Decrease module usage counts and free resource */
1100 1034 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1101 1035 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
1102 1036  
... ... @@ -1165,10 +1099,23 @@
1165 1099 return ret;
1166 1100 }
1167 1101  
  1102 +/* We're lazy, and add to the first CPU; overflow works its fey magic
  1103 + * and everything is OK. */
  1104 +static int
  1105 +add_counter_to_entry(struct arpt_entry *e,
  1106 + const struct xt_counters addme[],
  1107 + unsigned int *i)
  1108 +{
  1109 + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
  1110 +
  1111 + (*i)++;
  1112 + return 0;
  1113 +}
  1114 +
1168 1115 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1169 1116 int compat)
1170 1117 {
1171   - unsigned int i;
  1118 + unsigned int i, curcpu;
1172 1119 struct xt_counters_info tmp;
1173 1120 struct xt_counters *paddc;
1174 1121 unsigned int num_counters;
1175 1122  
1176 1123  
1177 1124  
1178 1125  
... ... @@ -1224,26 +1171,26 @@
1224 1171 goto free;
1225 1172 }
1226 1173  
1227   - mutex_lock(&t->lock);
  1174 + local_bh_disable();
1228 1175 private = t->private;
1229 1176 if (private->number != num_counters) {
1230 1177 ret = -EINVAL;
1231 1178 goto unlock_up_free;
1232 1179 }
1233 1180  
1234   - preempt_disable();
1235 1181 i = 0;
1236 1182 /* Choose the copy that is on our node */
1237   - loc_cpu_entry = private->entries[smp_processor_id()];
  1183 + curcpu = smp_processor_id();
  1184 + loc_cpu_entry = private->entries[curcpu];
  1185 + xt_info_wrlock(curcpu);
1238 1186 ARPT_ENTRY_ITERATE(loc_cpu_entry,
1239 1187 private->size,
1240 1188 add_counter_to_entry,
1241 1189 paddc,
1242 1190 &i);
1243   - preempt_enable();
  1191 + xt_info_wrunlock(curcpu);
1244 1192 unlock_up_free:
1245   - mutex_unlock(&t->lock);
1246   -
  1193 + local_bh_enable();
1247 1194 xt_table_unlock(t);
1248 1195 module_put(t->me);
1249 1196 free:
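
The get_counters() hunks above only show fragments; assuming the elided context (the pre-existing set_entry_to_counter pass over the local CPU) is unchanged, the resulting flow looks roughly like this:

    static void get_counters(const struct xt_table_info *t,
                             struct xt_counters counters[])
    {
            unsigned int cpu, i, curcpu;

            /* BH off: no softirq can re-enter arpt_do_table on this CPU,
             * so the local copy can be read without taking its lock. */
            local_bh_disable();
            curcpu = smp_processor_id();

            i = 0;
            ARPT_ENTRY_ITERATE(t->entries[curcpu], t->size,
                               set_entry_to_counter, counters, &i);

            for_each_possible_cpu(cpu) {
                    if (cpu == curcpu)
                            continue;
                    i = 0;
                    xt_info_wrlock(cpu);    /* wait out that CPU's readers */
                    ARPT_ENTRY_ITERATE(t->entries[cpu], t->size,
                                       add_entry_to_counter, counters, &i);
                    xt_info_wrunlock(cpu);
            }
            local_bh_enable();
    }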
net/ipv4/netfilter/ip_tables.c
... ... @@ -338,11 +338,10 @@
338 338 tgpar.hooknum = hook;
339 339  
340 340 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
  341 + xt_info_rdlock_bh();
  342 + private = table->private;
  343 + table_base = private->entries[smp_processor_id()];
341 344  
342   - rcu_read_lock_bh();
343   - private = rcu_dereference(table->private);
344   - table_base = rcu_dereference(private->entries[smp_processor_id()]);
345   -
346 345 e = get_entry(table_base, private->hook_entry[hook]);
347 346  
348 347 /* For return from builtin chain */
349 348  
... ... @@ -436,9 +435,8 @@
436 435 e = (void *)e + e->next_offset;
437 436 }
438 437 } while (!hotdrop);
  438 + xt_info_rdunlock_bh();
439 439  
440   - rcu_read_unlock_bh();
441   -
442 440 #ifdef DEBUG_ALLOW_ALL
443 441 return NF_ACCEPT;
444 442 #else
445 443  
... ... @@ -896,10 +894,13 @@
896 894  
897 895 /* Instead of clearing (by a previous call to memset())
898 896 * the counters and using adds, we set the counters
899   - * with data used by 'current' CPU
900   - * We dont care about preemption here.
  897 + * with data used by 'current' CPU.
  898 + *
  899 + * Bottom half has to be disabled to prevent deadlock
  900 + * if new softirq were to run and call ipt_do_table
901 901 */
902   - curcpu = raw_smp_processor_id();
  902 + local_bh_disable();
  903 + curcpu = smp_processor_id();
903 904  
904 905 i = 0;
905 906 IPT_ENTRY_ITERATE(t->entries[curcpu],
906 907  
907 908  
908 909  
909 910  
... ... @@ -912,74 +913,22 @@
912 913 if (cpu == curcpu)
913 914 continue;
914 915 i = 0;
  916 + xt_info_wrlock(cpu);
915 917 IPT_ENTRY_ITERATE(t->entries[cpu],
916 918 t->size,
917 919 add_entry_to_counter,
918 920 counters,
919 921 &i);
  922 + xt_info_wrunlock(cpu);
920 923 }
921   -
922   -}
923   -
924   -/* We're lazy, and add to the first CPU; overflow works its fey magic
925   - * and everything is OK. */
926   -static int
927   -add_counter_to_entry(struct ipt_entry *e,
928   - const struct xt_counters addme[],
929   - unsigned int *i)
930   -{
931   - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
932   -
933   - (*i)++;
934   - return 0;
935   -}
936   -
937   -/* Take values from counters and add them back onto the current cpu */
938   -static void put_counters(struct xt_table_info *t,
939   - const struct xt_counters counters[])
940   -{
941   - unsigned int i, cpu;
942   -
943   - local_bh_disable();
944   - cpu = smp_processor_id();
945   - i = 0;
946   - IPT_ENTRY_ITERATE(t->entries[cpu],
947   - t->size,
948   - add_counter_to_entry,
949   - counters,
950   - &i);
951 924 local_bh_enable();
952 925 }
953 926  
954   -
955   -static inline int
956   -zero_entry_counter(struct ipt_entry *e, void *arg)
957   -{
958   - e->counters.bcnt = 0;
959   - e->counters.pcnt = 0;
960   - return 0;
961   -}
962   -
963   -static void
964   -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
965   -{
966   - unsigned int cpu;
967   - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
968   -
969   - memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
970   - for_each_possible_cpu(cpu) {
971   - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
972   - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
973   - zero_entry_counter, NULL);
974   - }
975   -}
976   -
977 927 static struct xt_counters * alloc_counters(struct xt_table *table)
978 928 {
979 929 unsigned int countersize;
980 930 struct xt_counters *counters;
981 931 struct xt_table_info *private = table->private;
982   - struct xt_table_info *info;
983 932  
984 933 /* We need atomic snapshot of counters: rest doesn't change
985 934 (other than comefrom, which userspace doesn't care
986 935  
987 936  
988 937  
... ... @@ -988,30 +937,11 @@
988 937 counters = vmalloc_node(countersize, numa_node_id());
989 938  
990 939 if (counters == NULL)
991   - goto nomem;
  940 + return ERR_PTR(-ENOMEM);
992 941  
993   - info = xt_alloc_table_info(private->size);
994   - if (!info)
995   - goto free_counters;
  942 + get_counters(private, counters);
996 943  
997   - clone_counters(info, private);
998   -
999   - mutex_lock(&table->lock);
1000   - xt_table_entry_swap_rcu(private, info);
1001   - synchronize_net(); /* Wait until smoke has cleared */
1002   -
1003   - get_counters(info, counters);
1004   - put_counters(private, counters);
1005   - mutex_unlock(&table->lock);
1006   -
1007   - xt_free_table_info(info);
1008   -
1009 944 return counters;
1010   -
1011   - free_counters:
1012   - vfree(counters);
1013   - nomem:
1014   - return ERR_PTR(-ENOMEM);
1015 945 }
1016 946  
1017 947 static int
1018 948  
... ... @@ -1306,8 +1236,9 @@
1306 1236 (newinfo->number <= oldinfo->initial_entries))
1307 1237 module_put(t->me);
1308 1238  
1309   - /* Get the old counters. */
  1239 + /* Get the old counters, and synchronize with replace */
1310 1240 get_counters(oldinfo, counters);
  1241 +
1311 1242 /* Decrease module usage counts and free resource */
1312 1243 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1313 1244 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
1314 1245  
1315 1246  
... ... @@ -1377,11 +1308,23 @@
1377 1308 return ret;
1378 1309 }
1379 1310  
  1311 +/* We're lazy, and add to the first CPU; overflow works its fey magic
  1312 + * and everything is OK. */
  1313 +static int
  1314 +add_counter_to_entry(struct ipt_entry *e,
  1315 + const struct xt_counters addme[],
  1316 + unsigned int *i)
  1317 +{
  1318 + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1380 1319  
  1320 + (*i)++;
  1321 + return 0;
  1322 +}
  1323 +
1381 1324 static int
1382 1325 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
1383 1326 {
1384   - unsigned int i;
  1327 + unsigned int i, curcpu;
1385 1328 struct xt_counters_info tmp;
1386 1329 struct xt_counters *paddc;
1387 1330 unsigned int num_counters;
1388 1331  
1389 1332  
1390 1333  
1391 1334  
... ... @@ -1437,25 +1380,26 @@
1437 1380 goto free;
1438 1381 }
1439 1382  
1440   - mutex_lock(&t->lock);
  1383 + local_bh_disable();
1441 1384 private = t->private;
1442 1385 if (private->number != num_counters) {
1443 1386 ret = -EINVAL;
1444 1387 goto unlock_up_free;
1445 1388 }
1446 1389  
1447   - preempt_disable();
1448 1390 i = 0;
1449 1391 /* Choose the copy that is on our node */
1450   - loc_cpu_entry = private->entries[raw_smp_processor_id()];
  1392 + curcpu = smp_processor_id();
  1393 + loc_cpu_entry = private->entries[curcpu];
  1394 + xt_info_wrlock(curcpu);
1451 1395 IPT_ENTRY_ITERATE(loc_cpu_entry,
1452 1396 private->size,
1453 1397 add_counter_to_entry,
1454 1398 paddc,
1455 1399 &i);
1456   - preempt_enable();
  1400 + xt_info_wrunlock(curcpu);
1457 1401 unlock_up_free:
1458   - mutex_unlock(&t->lock);
  1402 + local_bh_enable();
1459 1403 xt_table_unlock(t);
1460 1404 module_put(t->me);
1461 1405 free:
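
On the update side, do_add_counters() no longer takes the per-table mutex: disabling bottom halves plus the current CPU's write lock is enough, because the user-supplied deltas are only folded into this CPU's copy. Condensed from the hunk above:

    local_bh_disable();
    private = t->private;
    if (private->number != num_counters) {
            ret = -EINVAL;
            goto unlock_up_free;
    }

    i = 0;
    /* Choose the copy that is on our node */
    curcpu = smp_processor_id();
    loc_cpu_entry = private->entries[curcpu];
    xt_info_wrlock(curcpu);     /* keep this CPU's readers out while we add */
    IPT_ENTRY_ITERATE(loc_cpu_entry, private->size,
                      add_counter_to_entry, paddc, &i);
    xt_info_wrunlock(curcpu);
 unlock_up_free:
    local_bh_enable();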
net/ipv6/netfilter/ip6_tables.c
... ... @@ -365,9 +365,9 @@
365 365  
366 366 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
367 367  
368   - rcu_read_lock_bh();
369   - private = rcu_dereference(table->private);
370   - table_base = rcu_dereference(private->entries[smp_processor_id()]);
  368 + xt_info_rdlock_bh();
  369 + private = table->private;
  370 + table_base = private->entries[smp_processor_id()];
371 371  
372 372 e = get_entry(table_base, private->hook_entry[hook]);
373 373  
... ... @@ -466,7 +466,7 @@
466 466 #ifdef CONFIG_NETFILTER_DEBUG
467 467 ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
468 468 #endif
469   - rcu_read_unlock_bh();
  469 + xt_info_rdunlock_bh();
470 470  
471 471 #ifdef DEBUG_ALLOW_ALL
472 472 return NF_ACCEPT;
473 473  
... ... @@ -926,9 +926,12 @@
926 926 /* Instead of clearing (by a previous call to memset())
927 927 * the counters and using adds, we set the counters
928 928 * with data used by 'current' CPU
929   - * We dont care about preemption here.
  929 + *
  930 + * Bottom half has to be disabled to prevent deadlock
  931 + * if new softirq were to run and call ipt_do_table
930 932 */
931   - curcpu = raw_smp_processor_id();
  933 + local_bh_disable();
  934 + curcpu = smp_processor_id();
932 935  
933 936 i = 0;
934 937 IP6T_ENTRY_ITERATE(t->entries[curcpu],
935 938  
936 939  
937 940  
938 941  
... ... @@ -941,72 +944,22 @@
941 944 if (cpu == curcpu)
942 945 continue;
943 946 i = 0;
  947 + xt_info_wrlock(cpu);
944 948 IP6T_ENTRY_ITERATE(t->entries[cpu],
945 949 t->size,
946 950 add_entry_to_counter,
947 951 counters,
948 952 &i);
  953 + xt_info_wrunlock(cpu);
949 954 }
950   -}
951   -
952   -/* We're lazy, and add to the first CPU; overflow works its fey magic
953   - * and everything is OK. */
954   -static int
955   -add_counter_to_entry(struct ip6t_entry *e,
956   - const struct xt_counters addme[],
957   - unsigned int *i)
958   -{
959   - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
960   -
961   - (*i)++;
962   - return 0;
963   -}
964   -
965   -/* Take values from counters and add them back onto the current cpu */
966   -static void put_counters(struct xt_table_info *t,
967   - const struct xt_counters counters[])
968   -{
969   - unsigned int i, cpu;
970   -
971   - local_bh_disable();
972   - cpu = smp_processor_id();
973   - i = 0;
974   - IP6T_ENTRY_ITERATE(t->entries[cpu],
975   - t->size,
976   - add_counter_to_entry,
977   - counters,
978   - &i);
979 955 local_bh_enable();
980 956 }
981 957  
982   -static inline int
983   -zero_entry_counter(struct ip6t_entry *e, void *arg)
984   -{
985   - e->counters.bcnt = 0;
986   - e->counters.pcnt = 0;
987   - return 0;
988   -}
989   -
990   -static void
991   -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
992   -{
993   - unsigned int cpu;
994   - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
995   -
996   - memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
997   - for_each_possible_cpu(cpu) {
998   - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
999   - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
1000   - zero_entry_counter, NULL);
1001   - }
1002   -}
1003   -
1004 958 static struct xt_counters *alloc_counters(struct xt_table *table)
1005 959 {
1006 960 unsigned int countersize;
1007 961 struct xt_counters *counters;
1008 962 struct xt_table_info *private = table->private;
1009   - struct xt_table_info *info;
1010 963  
1011 964 /* We need atomic snapshot of counters: rest doesn't change
1012 965 (other than comefrom, which userspace doesn't care
1013 966  
1014 967  
1015 968  
... ... @@ -1015,30 +968,11 @@
1015 968 counters = vmalloc_node(countersize, numa_node_id());
1016 969  
1017 970 if (counters == NULL)
1018   - goto nomem;
  971 + return ERR_PTR(-ENOMEM);
1019 972  
1020   - info = xt_alloc_table_info(private->size);
1021   - if (!info)
1022   - goto free_counters;
  973 + get_counters(private, counters);
1023 974  
1024   - clone_counters(info, private);
1025   -
1026   - mutex_lock(&table->lock);
1027   - xt_table_entry_swap_rcu(private, info);
1028   - synchronize_net(); /* Wait until smoke has cleared */
1029   -
1030   - get_counters(info, counters);
1031   - put_counters(private, counters);
1032   - mutex_unlock(&table->lock);
1033   -
1034   - xt_free_table_info(info);
1035   -
1036 975 return counters;
1037   -
1038   - free_counters:
1039   - vfree(counters);
1040   - nomem:
1041   - return ERR_PTR(-ENOMEM);
1042 976 }
1043 977  
1044 978 static int
1045 979  
... ... @@ -1334,8 +1268,9 @@
1334 1268 (newinfo->number <= oldinfo->initial_entries))
1335 1269 module_put(t->me);
1336 1270  
1337   - /* Get the old counters. */
  1271 + /* Get the old counters, and synchronize with replace */
1338 1272 get_counters(oldinfo, counters);
  1273 +
1339 1274 /* Decrease module usage counts and free resource */
1340 1275 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1341 1276 IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
1342 1277  
1343 1278  
... ... @@ -1405,11 +1340,24 @@
1405 1340 return ret;
1406 1341 }
1407 1342  
  1343 +/* We're lazy, and add to the first CPU; overflow works its fey magic
  1344 + * and everything is OK. */
1408 1345 static int
  1346 +add_counter_to_entry(struct ip6t_entry *e,
  1347 + const struct xt_counters addme[],
  1348 + unsigned int *i)
  1349 +{
  1350 + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
  1351 +
  1352 + (*i)++;
  1353 + return 0;
  1354 +}
  1355 +
  1356 +static int
1409 1357 do_add_counters(struct net *net, void __user *user, unsigned int len,
1410 1358 int compat)
1411 1359 {
1412   - unsigned int i;
  1360 + unsigned int i, curcpu;
1413 1361 struct xt_counters_info tmp;
1414 1362 struct xt_counters *paddc;
1415 1363 unsigned int num_counters;
1416 1364  
1417 1365  
1418 1366  
1419 1367  
... ... @@ -1465,25 +1413,28 @@
1465 1413 goto free;
1466 1414 }
1467 1415  
1468   - mutex_lock(&t->lock);
  1416 +
  1417 + local_bh_disable();
1469 1418 private = t->private;
1470 1419 if (private->number != num_counters) {
1471 1420 ret = -EINVAL;
1472 1421 goto unlock_up_free;
1473 1422 }
1474 1423  
1475   - preempt_disable();
1476 1424 i = 0;
1477 1425 /* Choose the copy that is on our node */
1478   - loc_cpu_entry = private->entries[raw_smp_processor_id()];
  1426 + curcpu = smp_processor_id();
  1427 + xt_info_wrlock(curcpu);
  1428 + loc_cpu_entry = private->entries[curcpu];
1479 1429 IP6T_ENTRY_ITERATE(loc_cpu_entry,
1480 1430 private->size,
1481 1431 add_counter_to_entry,
1482 1432 paddc,
1483 1433 &i);
1484   - preempt_enable();
  1434 + xt_info_wrunlock(curcpu);
  1435 +
1485 1436 unlock_up_free:
1486   - mutex_unlock(&t->lock);
  1437 + local_bh_enable();
1487 1438 xt_table_unlock(t);
1488 1439 module_put(t->me);
1489 1440 free:
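
With no RCU swap to perform, alloc_counters() collapses to "allocate, snapshot, return"; the consistency of the snapshot now comes from the per-cpu write locks taken inside get_counters(). Reconstructed from the hunk above (the ip6 version; the IPv4 and ARP ones are identical in shape), assuming the elided countersize computation is the usual one:

    static struct xt_counters *alloc_counters(struct xt_table *table)
    {
            unsigned int countersize;
            struct xt_counters *counters;
            struct xt_table_info *private = table->private;

            /* We need an atomic snapshot of the counters; nothing else in
             * the table changes out from under us here. */
            countersize = sizeof(struct xt_counters) * private->number;
            counters = vmalloc_node(countersize, numa_node_id());
            if (counters == NULL)
                    return ERR_PTR(-ENOMEM);

            get_counters(private, counters);

            return counters;
    }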
net/netfilter/x_tables.c
... ... @@ -625,20 +625,6 @@
625 625 }
626 626 EXPORT_SYMBOL(xt_free_table_info);
627 627  
628   -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
629   - struct xt_table_info *newinfo)
630   -{
631   - unsigned int cpu;
632   -
633   - for_each_possible_cpu(cpu) {
634   - void *p = oldinfo->entries[cpu];
635   - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
636   - newinfo->entries[cpu] = p;
637   - }
638   -
639   -}
640   -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
641   -
642 628 /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
643 629 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
644 630 const char *name)
645 631  
646 632  
647 633  
648 634  
649 635  
650 636  
... ... @@ -676,32 +662,43 @@
676 662 EXPORT_SYMBOL_GPL(xt_compat_unlock);
677 663 #endif
678 664  
  665 +DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
  666 +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
  667 +
  668 +
679 669 struct xt_table_info *
680 670 xt_replace_table(struct xt_table *table,
681 671 unsigned int num_counters,
682 672 struct xt_table_info *newinfo,
683 673 int *error)
684 674 {
685   - struct xt_table_info *oldinfo, *private;
  675 + struct xt_table_info *private;
686 676  
687 677 /* Do the substitution. */
688   - mutex_lock(&table->lock);
  678 + local_bh_disable();
689 679 private = table->private;
  680 +
690 681 /* Check inside lock: is the old number correct? */
691 682 if (num_counters != private->number) {
692 683 duprintf("num_counters != table->private->number (%u/%u)\n",
693 684 num_counters, private->number);
694   - mutex_unlock(&table->lock);
  685 + local_bh_enable();
695 686 *error = -EAGAIN;
696 687 return NULL;
697 688 }
698   - oldinfo = private;
699   - rcu_assign_pointer(table->private, newinfo);
700   - newinfo->initial_entries = oldinfo->initial_entries;
701   - mutex_unlock(&table->lock);
702 689  
703   - synchronize_net();
704   - return oldinfo;
  690 + table->private = newinfo;
  691 + newinfo->initial_entries = private->initial_entries;
  692 +
  693 + /*
  694 + * Even though table entries have now been swapped, other CPU's
  695 + * may still be using the old entries. This is okay, because
  696 + * resynchronization happens because of the locking done
  697 + * during the get_counters() routine.
  698 + */
  699 + local_bh_enable();
  700 +
  701 + return private;
705 702 }
706 703 EXPORT_SYMBOL_GPL(xt_replace_table);
707 704  
... ... @@ -734,7 +731,6 @@
734 731  
735 732 /* Simplifies replace_table code. */
736 733 table->private = bootstrap;
737   - mutex_init(&table->lock);
738 734  
739 735 if (!xt_replace_table(table, 0, newinfo, &ret))
740 736 goto unlock;
... ... @@ -1147,7 +1143,14 @@
1147 1143  
1148 1144 static int __init xt_init(void)
1149 1145 {
1150   - int i, rv;
  1146 + unsigned int i;
  1147 + int rv;
  1148 +
  1149 + for_each_possible_cpu(i) {
  1150 + struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
  1151 + spin_lock_init(&lock->lock);
  1152 + lock->readers = 0;
  1153 + }
1151 1154  
1152 1155 xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
1153 1156 if (!xt)
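
The comment in xt_replace_table() above is the heart of the new scheme: the pointer swap itself is done with only bottom halves disabled, and it is the caller's subsequent get_counters() pass, taking every CPU's write lock in turn, that guarantees all readers of the old table have drained before it is freed. A condensed caller-side sketch (the __do_replace() shape seen in the per-protocol hunks; error handling trimmed):

    oldinfo = xt_replace_table(t, num_counters, newinfo, &ret);
    if (!oldinfo)
            goto out;       /* -EAGAIN etc.; label is illustrative only */

    /* Get the old counters, and synchronize with replace: once this has
     * taken and released each CPU's xt_info lock, no CPU can still be
     * walking oldinfo. */
    get_counters(oldinfo, counters);

    /* Only now is it safe to run cleanup_entry over the old rules and
     * call xt_free_table_info(oldinfo). */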