Commit aadd06e5c56b9ff5117ec77e59eada43dc46e2fc
1 parent
b3cf257623
Exists in
master
and in
39 other branches
[PATCH] splice: fix problems with sys_tee()
Several issues noticed/fixed: - We cannot reliably block in link_pipe() while holding both input and output mutexes. So do preparatory checks before locking down both mutexes and doing the link. - The ipipe->nrbufs vs i check was bad, because we could have dropped the ipipe lock in-between. This causes us to potentially look at unknown buffers if we were racing with someone else reading this pipe. Signed-off-by: Jens Axboe <axboe@suse.de>
Showing 1 changed file with 133 additions and 105 deletions Side-by-side Diff
fs/splice.c
... | ... | @@ -1307,6 +1307,85 @@ |
1307 | 1307 | } |
1308 | 1308 | |
1309 | 1309 | /* |
1310 | + * Make sure there's data to read. Wait for input if we can, otherwise | |
1311 | + * return an appropriate error. | |
1312 | + */ | |
1313 | +static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | |
1314 | +{ | |
1315 | + int ret; | |
1316 | + | |
1317 | + /* | |
1318 | + * Check ->nrbufs without the inode lock first. This function | |
1319 | + * is speculative anyways, so missing one is ok. | |
1320 | + */ | |
1321 | + if (pipe->nrbufs) | |
1322 | + return 0; | |
1323 | + | |
1324 | + ret = 0; | |
1325 | + mutex_lock(&pipe->inode->i_mutex); | |
1326 | + | |
1327 | + while (!pipe->nrbufs) { | |
1328 | + if (signal_pending(current)) { | |
1329 | + ret = -ERESTARTSYS; | |
1330 | + break; | |
1331 | + } | |
1332 | + if (!pipe->writers) | |
1333 | + break; | |
1334 | + if (!pipe->waiting_writers) { | |
1335 | + if (flags & SPLICE_F_NONBLOCK) { | |
1336 | + ret = -EAGAIN; | |
1337 | + break; | |
1338 | + } | |
1339 | + } | |
1340 | + pipe_wait(pipe); | |
1341 | + } | |
1342 | + | |
1343 | + mutex_unlock(&pipe->inode->i_mutex); | |
1344 | + return ret; | |
1345 | +} | |
1346 | + | |
1347 | +/* | |
1348 | + * Make sure there's writeable room. Wait for room if we can, otherwise | |
1349 | + * return an appropriate error. | |
1350 | + */ | |
1351 | +static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | |
1352 | +{ | |
1353 | + int ret; | |
1354 | + | |
1355 | + /* | |
1356 | + * Check ->nrbufs without the inode lock first. This function | |
1357 | + * is speculative anyways, so missing one is ok. | |
1358 | + */ | |
1359 | + if (pipe->nrbufs < PIPE_BUFFERS) | |
1360 | + return 0; | |
1361 | + | |
1362 | + ret = 0; | |
1363 | + mutex_lock(&pipe->inode->i_mutex); | |
1364 | + | |
1365 | + while (pipe->nrbufs >= PIPE_BUFFERS) { | |
1366 | + if (!pipe->readers) { | |
1367 | + send_sig(SIGPIPE, current, 0); | |
1368 | + ret = -EPIPE; | |
1369 | + break; | |
1370 | + } | |
1371 | + if (flags & SPLICE_F_NONBLOCK) { | |
1372 | + ret = -EAGAIN; | |
1373 | + break; | |
1374 | + } | |
1375 | + if (signal_pending(current)) { | |
1376 | + ret = -ERESTARTSYS; | |
1377 | + break; | |
1378 | + } | |
1379 | + pipe->waiting_writers++; | |
1380 | + pipe_wait(pipe); | |
1381 | + pipe->waiting_writers--; | |
1382 | + } | |
1383 | + | |
1384 | + mutex_unlock(&pipe->inode->i_mutex); | |
1385 | + return ret; | |
1386 | +} | |
1387 | + | |
1388 | +/* | |
1310 | 1389 | * Link contents of ipipe to opipe. |
1311 | 1390 | */ |
1312 | 1391 | static int link_pipe(struct pipe_inode_info *ipipe, |
1313 | 1392 | |
1314 | 1393 | |
1315 | 1394 | |
1316 | 1395 | |
1317 | 1396 | |
1318 | 1397 | |
1319 | 1398 | |
1320 | 1399 | |
1321 | 1400 | |
1322 | 1401 | |
1323 | 1402 | |
1324 | 1403 | |
1325 | 1404 | |
1326 | 1405 | |
1327 | 1406 | |
... | ... | @@ -1314,136 +1393,70 @@ |
1314 | 1393 | size_t len, unsigned int flags) |
1315 | 1394 | { |
1316 | 1395 | struct pipe_buffer *ibuf, *obuf; |
1317 | - int ret, do_wakeup, i, ipipe_first; | |
1396 | + int ret = 0, i = 0, nbuf; | |
1318 | 1397 | |
1319 | - ret = do_wakeup = ipipe_first = 0; | |
1320 | - | |
1321 | 1398 | /* |
1322 | 1399 | * Potential ABBA deadlock, work around it by ordering lock |
1323 | 1400 | * grabbing by inode address. Otherwise two different processes |
1324 | 1401 | * could deadlock (one doing tee from A -> B, the other from B -> A). |
1325 | 1402 | */ |
1326 | 1403 | if (ipipe->inode < opipe->inode) { |
1327 | - ipipe_first = 1; | |
1328 | - mutex_lock(&ipipe->inode->i_mutex); | |
1329 | - mutex_lock(&opipe->inode->i_mutex); | |
1404 | + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_PARENT); | |
1405 | + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_CHILD); | |
1330 | 1406 | } else { |
1331 | - mutex_lock(&opipe->inode->i_mutex); | |
1332 | - mutex_lock(&ipipe->inode->i_mutex); | |
1407 | + mutex_lock_nested(&opipe->inode->i_mutex, I_MUTEX_PARENT); | |
1408 | + mutex_lock_nested(&ipipe->inode->i_mutex, I_MUTEX_CHILD); | |
1333 | 1409 | } |
1334 | 1410 | |
1335 | - for (i = 0;; i++) { | |
1411 | + do { | |
1336 | 1412 | if (!opipe->readers) { |
1337 | 1413 | send_sig(SIGPIPE, current, 0); |
1338 | 1414 | if (!ret) |
1339 | 1415 | ret = -EPIPE; |
1340 | 1416 | break; |
1341 | 1417 | } |
1342 | - if (ipipe->nrbufs - i) { | |
1343 | - ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); | |
1344 | 1418 | |
1345 | - /* | |
1346 | - * If we have room, fill this buffer | |
1347 | - */ | |
1348 | - if (opipe->nrbufs < PIPE_BUFFERS) { | |
1349 | - int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); | |
1419 | + /* | |
1420 | + * If we have iterated all input buffers or ran out of | |
1421 | + * output room, break. | |
1422 | + */ | |
1423 | + if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) | |
1424 | + break; | |
1350 | 1425 | |
1351 | - /* | |
1352 | - * Get a reference to this pipe buffer, | |
1353 | - * so we can copy the contents over. | |
1354 | - */ | |
1355 | - ibuf->ops->get(ipipe, ibuf); | |
1426 | + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); | |
1427 | + nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); | |
1356 | 1428 | |
1357 | - obuf = opipe->bufs + nbuf; | |
1358 | - *obuf = *ibuf; | |
1429 | + /* | |
1430 | + * Get a reference to this pipe buffer, | |
1431 | + * so we can copy the contents over. | |
1432 | + */ | |
1433 | + ibuf->ops->get(ipipe, ibuf); | |
1359 | 1434 | |
1360 | - /* | |
1361 | - * Don't inherit the gift flag, we need to | |
1362 | - * prevent multiple steals of this page. | |
1363 | - */ | |
1364 | - obuf->flags &= ~PIPE_BUF_FLAG_GIFT; | |
1435 | + obuf = opipe->bufs + nbuf; | |
1436 | + *obuf = *ibuf; | |
1365 | 1437 | |
1366 | - if (obuf->len > len) | |
1367 | - obuf->len = len; | |
1368 | - | |
1369 | - opipe->nrbufs++; | |
1370 | - do_wakeup = 1; | |
1371 | - ret += obuf->len; | |
1372 | - len -= obuf->len; | |
1373 | - | |
1374 | - if (!len) | |
1375 | - break; | |
1376 | - if (opipe->nrbufs < PIPE_BUFFERS) | |
1377 | - continue; | |
1378 | - } | |
1379 | - | |
1380 | - /* | |
1381 | - * We have input available, but no output room. | |
1382 | - * If we already copied data, return that. If we | |
1383 | - * need to drop the opipe lock, it must be ordered | |
1384 | - * last to avoid deadlocks. | |
1385 | - */ | |
1386 | - if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { | |
1387 | - if (!ret) | |
1388 | - ret = -EAGAIN; | |
1389 | - break; | |
1390 | - } | |
1391 | - if (signal_pending(current)) { | |
1392 | - if (!ret) | |
1393 | - ret = -ERESTARTSYS; | |
1394 | - break; | |
1395 | - } | |
1396 | - if (do_wakeup) { | |
1397 | - smp_mb(); | |
1398 | - if (waitqueue_active(&opipe->wait)) | |
1399 | - wake_up_interruptible(&opipe->wait); | |
1400 | - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); | |
1401 | - do_wakeup = 0; | |
1402 | - } | |
1403 | - | |
1404 | - opipe->waiting_writers++; | |
1405 | - pipe_wait(opipe); | |
1406 | - opipe->waiting_writers--; | |
1407 | - continue; | |
1408 | - } | |
1409 | - | |
1410 | 1438 | /* |
1411 | - * No input buffers, do the usual checks for available | |
1412 | - * writers and blocking and wait if necessary | |
1439 | + * Don't inherit the gift flag, we need to | |
1440 | + * prevent multiple steals of this page. | |
1413 | 1441 | */ |
1414 | - if (!ipipe->writers) | |
1415 | - break; | |
1416 | - if (!ipipe->waiting_writers) { | |
1417 | - if (ret) | |
1418 | - break; | |
1419 | - } | |
1420 | - /* | |
1421 | - * pipe_wait() drops the ipipe mutex. To avoid deadlocks | |
1422 | - * with another process, we can only safely do that if | |
1423 | - * the ipipe lock is ordered last. | |
1424 | - */ | |
1425 | - if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { | |
1426 | - if (!ret) | |
1427 | - ret = -EAGAIN; | |
1428 | - break; | |
1429 | - } | |
1430 | - if (signal_pending(current)) { | |
1431 | - if (!ret) | |
1432 | - ret = -ERESTARTSYS; | |
1433 | - break; | |
1434 | - } | |
1442 | + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; | |
1435 | 1443 | |
1436 | - if (waitqueue_active(&ipipe->wait)) | |
1437 | - wake_up_interruptible_sync(&ipipe->wait); | |
1438 | - kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); | |
1444 | + if (obuf->len > len) | |
1445 | + obuf->len = len; | |
1439 | 1446 | |
1440 | - pipe_wait(ipipe); | |
1441 | - } | |
1447 | + opipe->nrbufs++; | |
1448 | + ret += obuf->len; | |
1449 | + len -= obuf->len; | |
1450 | + i++; | |
1451 | + } while (len); | |
1442 | 1452 | |
1443 | 1453 | mutex_unlock(&ipipe->inode->i_mutex); |
1444 | 1454 | mutex_unlock(&opipe->inode->i_mutex); |
1445 | 1455 | |
1446 | - if (do_wakeup) { | |
1456 | + /* | |
1457 | + * If we put data in the output pipe, wakeup any potential readers. | |
1458 | + */ | |
1459 | + if (ret > 0) { | |
1447 | 1460 | smp_mb(); |
1448 | 1461 | if (waitqueue_active(&opipe->wait)) |
1449 | 1462 | wake_up_interruptible(&opipe->wait); |
1450 | 1463 | |
1451 | 1464 | |
1452 | 1465 | |
... | ... | @@ -1464,14 +1477,29 @@ |
1464 | 1477 | { |
1465 | 1478 | struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; |
1466 | 1479 | struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; |
1480 | + int ret = -EINVAL; | |
1467 | 1481 | |
1468 | 1482 | /* |
1469 | - * Link ipipe to the two output pipes, consuming as we go along. | |
1483 | + * Duplicate the contents of ipipe to opipe without actually | |
1484 | + * copying the data. | |
1470 | 1485 | */ |
1471 | - if (ipipe && opipe) | |
1472 | - return link_pipe(ipipe, opipe, len, flags); | |
1486 | + if (ipipe && opipe && ipipe != opipe) { | |
1487 | + /* | |
1488 | + * Keep going, unless we encounter an error. The ipipe/opipe | |
1489 | + * ordering doesn't really matter. | |
1490 | + */ | |
1491 | + ret = link_ipipe_prep(ipipe, flags); | |
1492 | + if (!ret) { | |
1493 | + ret = link_opipe_prep(opipe, flags); | |
1494 | + if (!ret) { | |
1495 | + ret = link_pipe(ipipe, opipe, len, flags); | |
1496 | + if (!ret && (flags & SPLICE_F_NONBLOCK)) | |
1497 | + ret = -EAGAIN; | |
1498 | + } | |
1499 | + } | |
1500 | + } | |
1473 | 1501 | |
1474 | - return -EINVAL; | |
1502 | + return ret; | |
1475 | 1503 | } |
1476 | 1504 | |
1477 | 1505 | asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) |