Commit fd5db42254518fbf241dc454e918598fbe494fa2
Committed by: Linus Torvalds
Parent: 2dcb22b346
ipc/sem.c: optimize update_queue() for bulk wakeup calls
The following series of patches tries to fix the spinlock contention reported by Chris Mason - his benchmark exposes problems in the current code:

- In the worst case, the algorithm used by update_queue() is O(N^2). Bulk wake-up calls can enter this worst case. The patch series fixes that.
  Note that the benchmark app doesn't expose this problem; it should be fixed anyway, because real-world apps might do the wake-ups in a different order than perfect FIFO.

- The part of the code that runs within the semaphore array spinlock is significantly larger than necessary. The patch series fixes that. This change is responsible for the main improvement.

- The cacheline with the spinlock is also used for a variable that is read in the hot path (sem_base) and for a variable that is unnecessarily written to multiple times (sem_otime). The last step of the series cacheline-aligns the spinlock.

This patch:

The SysV semaphore code allows multiple operations on the semaphores in an array to be performed as a single atomic operation. After a modification, update_queue() checks which of the waiting tasks can complete.

The algorithm used to identify those tasks is O(N^2) in the worst case. For some cases, it is simple to avoid the O(N^2).

The patch adds detection logic for some of these cases, especially for an array where all sleeping tasks are single-sembuf operations and a multi-sembuf operation is used to wake up multiple tasks. A big database application uses that approach.

The patch also fixes wakeup due to semctl(,,SETALL,) - the initial version of the patch broke that.

[akpm@linux-foundation.org: make do_smart_update() static]
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
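For illustration only (not part of the commit): a minimal userspace sketch of the bulk-wakeup pattern the series targets. One task performs a single multi-sembuf semop() that increments every semaphore in the set, potentially waking many sleepers at once. The semaphore set, the NSEMS size, and the lack of error handling are assumptions of this sketch, not details taken from the patch.

/* Hypothetical waker: one atomic multi-sembuf operation that increments
 * every semaphore in the set.  Each increment may wake a task sleeping
 * in a single-sembuf decrement - the case the patch optimizes. */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

#define NSEMS 16	/* illustrative size, not from the commit */

int main(void)
{
	struct sembuf ops[NSEMS];
	int semid, i;

	semid = semget(IPC_PRIVATE, NSEMS, IPC_CREAT | 0600);

	for (i = 0; i < NSEMS; i++) {
		ops[i].sem_num = i;
		ops[i].sem_op = 1;	/* V operation: increment */
		ops[i].sem_flg = 0;
	}
	semop(semid, ops, NSEMS);	/* one call, many potential wakeups */

	semctl(semid, 0, IPC_RMID);
	return 0;
}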
Showing 1 changed file with 97 additions and 13 deletions
ipc/sem.c
@@ -434,7 +434,70 @@
 	sma->complex_count--;
 }
 
+/** check_restart(sma, q)
+ * @sma: semaphore array
+ * @q: the operation that just completed
+ *
+ * update_queue is O(N^2) when it restarts scanning the whole queue of
+ * waiting operations. Therefore this function checks if the restart is
+ * really necessary. It is called after a previously waiting operation
+ * was completed.
+ */
+static int check_restart(struct sem_array *sma, struct sem_queue *q)
+{
+	struct sem *curr;
+	struct sem_queue *h;
 
+	/* if the operation didn't modify the array, then no restart */
+	if (q->alter == 0)
+		return 0;
+
+	/* pending complex operations are too difficult to analyse */
+	if (sma->complex_count)
+		return 1;
+
+	/* we were a sleeping complex operation. Too difficult */
+	if (q->nsops > 1)
+		return 1;
+
+	curr = sma->sem_base + q->sops[0].sem_num;
+
+	/* No-one waits on this queue */
+	if (list_empty(&curr->sem_pending))
+		return 0;
+
+	/* the new semaphore value */
+	if (curr->semval) {
+		/* It is impossible that someone waits for the new value:
+		 * - q is a previously sleeping simple operation that
+		 *   altered the array. It must be a decrement, because
+		 *   simple increments never sleep.
+		 * - The value is not 0, thus wait-for-zero won't proceed.
+		 * - If there are older (higher priority) decrements
+		 *   in the queue, then they have observed the original
+		 *   semval value and couldn't proceed. The operation
+		 *   decremented to value - thus they won't proceed either.
+		 */
+		BUG_ON(q->sops[0].sem_op >= 0);
+		return 0;
+	}
+	/*
+	 * semval is 0. Check if there are wait-for-zero semops.
+	 * They must be the first entries in the per-semaphore simple queue
+	 */
+	h = list_first_entry(&curr->sem_pending, struct sem_queue, simple_list);
+	BUG_ON(h->nsops != 1);
+	BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
+
+	/* Yes, there is a wait-for-zero semop. Restart */
+	if (h->sops[0].sem_op == 0)
+		return 1;
+
+	/* Again - no-one is waiting for the new value. */
+	return 0;
+}
+
+
 /**
  * update_queue(sma, semnum): Look for tasks that can be completed.
  * @sma: semaphore array.
@@ -469,7 +532,7 @@
 again:
 	walk = pending_list->next;
 	while (walk != pending_list) {
-		int error, alter;
+		int error, restart;
 
 		q = (struct sem_queue *)((char *)walk - offset);
 		walk = walk->next;
@@ -494,22 +557,43 @@
 
 		unlink_queue(sma, q);
 
-		/*
-		 * The next operation that must be checked depends on the type
-		 * of the completed operation:
-		 * - if the operation modified the array, then restart from the
-		 *   head of the queue and check for threads that might be
-		 *   waiting for the new semaphore values.
-		 * - if the operation didn't modify the array, then just
-		 *   continue.
-		 */
-		alter = q->alter;
+		if (error)
+			restart = 0;
+		else
+			restart = check_restart(sma, q);
+
 		wake_up_sem_queue(q, error);
-		if (alter && !error)
+		if (restart)
 			goto again;
 	}
 }
 
+/** do_smart_update(sma, sops, nsops): Optimized update_queue
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ *
+ * do_smart_update() does the required called to update_queue, based on the
+ * actual changes that were performed on the semaphore array.
+ */
+static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops)
+{
+	int i;
+
+	if (sma->complex_count || sops == NULL) {
+		update_queue(sma, -1);
+		return;
+	}
+
+	for (i = 0; i < nsops; i++) {
+		if (sops[i].sem_op > 0 ||
+			(sops[i].sem_op < 0 &&
+				sma->sem_base[sops[i].sem_num].semval == 0))
+			update_queue(sma, sops[i].sem_num);
+	}
+}
+
+
 /* The following counts are associated to each semaphore:
 * semncnt	number of tasks waiting on semval being nonzero
 * semzcnt	number of tasks waiting on semval being zero
@@ -1225,7 +1309,7 @@
 	error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
 	if (error <= 0) {
 		if (alter && error == 0)
-			update_queue(sma, (nsops == 1) ? sops[0].sem_num : -1);
+			do_smart_update(sma, sops, nsops);
 
 		goto out_unlock_free;
 	}
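For completeness, a hedged sketch of the waiter side of the same pattern (semid and sem_index are assumed to come from a setup like the one shown above the diff): each waiting task blocks in a single-sembuf decrement, which is exactly the kind of "simple" queue entry that check_restart() and do_smart_update() can handle without rescanning the whole pending list.

/* Hypothetical waiter: a single-sembuf decrement that sleeps until the
 * bulk waker increments this semaphore. */
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

void wait_on_sem(int semid, unsigned short sem_index)
{
	struct sembuf op;

	op.sem_num = sem_index;
	op.sem_op = -1;		/* P operation: sleeps while semval == 0 */
	op.sem_flg = 0;

	semop(semid, &op, 1);	/* returns once the wakeup arrives */
}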