Commit 68f8353b480e5f2e136c38a511abdbb88eaa8ce2

Authored by Ilpo Järvinen
Committed by David S. Miller
1 parent fd6dad616d

[TCP]: Rewrite SACK block processing & sack_recv_cache use

Key points of this patch are:

  - When the new SACK information only advances the previously
    discovered highest point, no skb processing is done below
    that point.
  - Cases below the highest point are optimized too, since there
    is no need to always walk up to the highest point (which is
    very likely still present in that SACK). This is not entirely
    true, though, because the fastpath_skb_hint being dropped here
    could previously optimize some of those cases even better;
    whether that is significant is not clear.

Currently the skipping is done by walking the queue. Combined with
an RB-tree, all skipping would become fast regardless of window
size (this can be done incrementally later).
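
To illustrate the skipping, here is a minimal user-space sketch
(the struct seg list, skip_to() and the sequence numbers are made
up for the example; the real helper is tcp_sacktag_skip() operating
on the write queue in the diff below). It advances past segments
that end at or below the skip target, so the cost is linear in the
number of skipped segments, which is what an RB-tree could later
turn into a logarithmic lookup.

#include <stdio.h>

/* Toy stand-in for one skb on the write queue: [seq, end_seq) */
struct seg {
    unsigned int seq;
    unsigned int end_seq;
    struct seg *next;
};

/* Skip segments that lie entirely at or below skip_to_seq and return
 * the first segment that still needs to be looked at (linear walk).
 */
static struct seg *skip_to(struct seg *s, unsigned int skip_to_seq)
{
    /* (int)(a - b) <= 0 is a wrap-safe "a does not come after b" */
    while (s && (int)(s->end_seq - skip_to_seq) <= 0)
        s = s->next;
    return s;
}

int main(void)
{
    struct seg q[4] = {
        { 1000, 2000, &q[1] },
        { 2000, 3000, &q[2] },
        { 3000, 4000, &q[3] },
        { 4000, 5000, NULL  },
    };
    struct seg *s = skip_to(&q[0], 3500);

    printf("resume walk at [%u, %u)\n", s->seq, s->end_seq); /* [3000, 4000) */
    return 0;
}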

Previously, a number of cases in TCP SACK processing failed to
take advantage of the costly stored information in
sack_recv_cache, most importantly for expected events such as
cumulative ACKs and ACKs that report a new hole. Processing such
ACKs resulted in rather long walks, building up latencies (which
easily get nasty when the window is huge). Those latencies are
often completely unnecessary compared with the amount of _new_
information received; usually a cumulative ACK carries no new
information at all, yet TCP walked the whole queue unnecessarily,
potentially taking a number of costly cache misses on the way.

Since the inclusion of highest_sack, there is a lot of information
that is very likely redundant (the SACK fastpath hint state,
fackets_out, highest_sack), though there is no ultimate guarantee
that they will remain in sync the whole time (in all unearthly
scenarios). Take advantage of this knowledge here and drop the
fastpath hint, using direct access to the highest SACKed skb as
a replacement.

Effectively, the "special cased" fastpath is dropped. This change
adds some complexity in order to introduce a "fastpath" with
better coverage, but the added complexity should make TCP behave
in a more cache-friendly way.

The current ACK's SACK blocks are compared against each cached
block individually, and only the ranges that are new are then
scanned by the expensive full walk. For other parts of the write
queue, even when inside a previously known part of the SACK
blocks, a faster skip function is used (if necessary at all). In
addition, whenever possible, TCP fast-forwards to the highest_sack
skb that was made available by an earlier patch. In the typical
case nothing but this fast-forward and the mandatory markings
after it occur, making the access pattern quite similar to the
former fastpath "special case".

DSACKs are a special case that must always be walked.
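
As a rough illustration of that comparison, the stand-alone sketch
below classifies a newly received SACK block against one cached
block the way the rewritten tcp_sacktag_write_queue() loop in the
diff does (head still to do, rest already covered, or tail
remaining). The before()/after() helpers mirror the kernel's
wrap-safe sequence comparisons; classify(), the struct names and
the concrete sequence numbers are invented for the example, and
the real code additionally handles DSACKs and fast-forwards via
highest_sack.

#include <stdio.h>

/* Wrap-safe 32-bit sequence comparisons, same idea as the kernel's
 * before()/after() helpers.
 */
static int before(unsigned int seq1, unsigned int seq2)
{
    return (int)(seq1 - seq2) < 0;
}
#define after(seq2, seq1)   before(seq1, seq2)

struct sack_block { unsigned int start_seq, end_seq; };

/* Report which part of a newly received block still needs the full
 * tagging walk, given one block remembered from the previous ACK.
 */
static void classify(struct sack_block new, struct sack_block cache)
{
    if (!after(new.end_seq, cache.start_seq) ||
        !before(new.start_seq, cache.end_seq)) {
        printf("no overlap: walk [%u, %u) in full\n",
               new.start_seq, new.end_seq);
        return;
    }
    if (before(new.start_seq, cache.start_seq))   /* head todo */
        printf("head todo: walk [%u, %u)\n",
               new.start_seq, cache.start_seq);
    if (!after(new.end_seq, cache.end_seq))       /* already covered */
        printf("rest covered by cache: skip to %u\n", cache.end_seq);
    else                                          /* tail remains todo */
        printf("tail todo: skip to %u, walk [%u, %u)\n",
               cache.end_seq, cache.end_seq, new.end_seq);
}

int main(void)
{
    struct sack_block cache = { 5000, 9000 };  /* block seen in the previous ACK */

    classify((struct sack_block){ 5000, 9500 }, cache);  /* advance-only SACK */
    classify((struct sack_block){ 4000, 9000 }, cache);  /* new head, rest cached */
    classify((struct sack_block){ 6000, 8000 }, cache);  /* fully covered */
    return 0;
}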

The copying from the local array to recv_sack_cache could be more
intelligent w.r.t. DSACKs, which are likely to be there only once,
but that is left to a separate patch.
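
For reference, a tiny stand-alone sketch of how that copying lays
out recv_sack_cache at the end of the rewritten
tcp_sacktag_write_queue() (toy types and made-up values): unused
head entries are zeroed so they can be skipped cheaply on the next
ACK, and this ACK's sorted blocks land at the tail of the cache.

#include <stdio.h>

#define ARRAY_SIZE(x)   (sizeof(x) / sizeof((x)[0]))

struct sack_block { unsigned int start_seq, end_seq; };

int main(void)
{
    struct sack_block recv_sack_cache[4] = {
        { 100, 200 }, { 300, 400 }, { 500, 600 }, { 700, 800 },
    };
    /* Blocks from the current ACK, already sorted by start_seq */
    struct sack_block sp[4] = { { 900, 1000 }, { 1100, 1200 } };
    int used_sacks = 2;
    int i, j;

    /* Clear the head of the cache so it is skipped next time */
    for (i = 0; i < (int)ARRAY_SIZE(recv_sack_cache) - used_sacks; i++) {
        recv_sack_cache[i].start_seq = 0;
        recv_sack_cache[i].end_seq = 0;
    }
    /* Copy this ACK's blocks to the tail of the cache */
    for (j = 0; j < used_sacks; j++)
        recv_sack_cache[i++] = sp[j];

    for (i = 0; i < (int)ARRAY_SIZE(recv_sack_cache); i++)
        printf("cache[%d] = [%u, %u)\n", i,
               recv_sack_cache[i].start_seq, recv_sack_cache[i].end_seq);
    return 0;
}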

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 4 changed files with 172 additions and 117 deletions

include/linux/tcp.h
... ... @@ -343,10 +343,7 @@
343 343 struct sk_buff *scoreboard_skb_hint;
344 344 struct sk_buff *retransmit_skb_hint;
345 345 struct sk_buff *forward_skb_hint;
346   - struct sk_buff *fastpath_skb_hint;
347 346  
348   - int fastpath_cnt_hint; /* Lags behind by current skb's pcount
349   - * compared to respective fackets_out */
350 347 int lost_cnt_hint;
351 348 int retransmit_cnt_hint;
352 349  
include/net/tcp.h
... ... @@ -1081,7 +1081,6 @@
1081 1081 static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp)
1082 1082 {
1083 1083 tcp_clear_retrans_hints_partial(tp);
1084   - tp->fastpath_skb_hint = NULL;
1085 1084 }
1086 1085  
1087 1086 /* MD5 Signature */
net/ipv4/tcp_input.c
... ... @@ -1333,6 +1333,88 @@
1333 1333 return flag;
1334 1334 }
1335 1335  
  1336 +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
  1337 + struct tcp_sack_block *next_dup,
  1338 + u32 start_seq, u32 end_seq,
  1339 + int dup_sack_in, int *fack_count,
  1340 + int *reord, int *flag)
  1341 +{
  1342 + struct tcp_sock *tp = tcp_sk(sk);
  1343 +
  1344 + tcp_for_write_queue_from(skb, sk) {
  1345 + int in_sack = 0;
  1346 + int dup_sack = dup_sack_in;
  1347 +
  1348 + if (skb == tcp_send_head(sk))
  1349 + break;
  1350 +
  1351 + /* queue is in-order => we can short-circuit the walk early */
  1352 + if (!before(TCP_SKB_CB(skb)->seq, end_seq))
  1353 + break;
  1354 +
  1355 + if ((next_dup != NULL) &&
  1356 + before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
  1357 + in_sack = tcp_match_skb_to_sack(sk, skb,
  1358 + next_dup->start_seq,
  1359 + next_dup->end_seq);
  1360 + if (in_sack > 0)
  1361 + dup_sack = 1;
  1362 + }
  1363 +
  1364 + if (in_sack <= 0)
  1365 + in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
  1366 + if (unlikely(in_sack < 0))
  1367 + break;
  1368 +
  1369 + if (in_sack)
  1370 + *flag |= tcp_sacktag_one(skb, tp, reord, dup_sack, *fack_count);
  1371 +
  1372 + *fack_count += tcp_skb_pcount(skb);
  1373 + }
  1374 + return skb;
  1375 +}
  1376 +
  1377 +/* Avoid all extra work that is being done by sacktag while walking in
  1378 + * a normal way
  1379 + */
  1380 +static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
  1381 + u32 skip_to_seq)
  1382 +{
  1383 + tcp_for_write_queue_from(skb, sk) {
  1384 + if (skb == tcp_send_head(sk))
  1385 + break;
  1386 +
  1387 + if (before(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
  1388 + break;
  1389 + }
  1390 + return skb;
  1391 +}
  1392 +
  1393 +static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
  1394 + struct sock *sk,
  1395 + struct tcp_sack_block *next_dup,
  1396 + u32 skip_to_seq,
  1397 + int *fack_count, int *reord,
  1398 + int *flag)
  1399 +{
  1400 + if (next_dup == NULL)
  1401 + return skb;
  1402 +
  1403 + if (before(next_dup->start_seq, skip_to_seq)) {
  1404 + skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
  1405 + tcp_sacktag_walk(skb, sk, NULL,
  1406 + next_dup->start_seq, next_dup->end_seq,
  1407 + 1, fack_count, reord, flag);
  1408 + }
  1409 +
  1410 + return skb;
  1411 +}
  1412 +
  1413 +static int tcp_sack_cache_ok(struct tcp_sock *tp, struct tcp_sack_block *cache)
  1414 +{
  1415 + return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
  1416 +}
  1417 +
1336 1418 static int
1337 1419 tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
1338 1420 {
... ... @@ -1342,16 +1424,16 @@
1342 1424 TCP_SKB_CB(ack_skb)->sacked);
1343 1425 struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
1344 1426 struct tcp_sack_block sp[4];
1345   - struct sk_buff *cached_skb;
  1427 + struct tcp_sack_block *cache;
  1428 + struct sk_buff *skb;
1346 1429 int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
1347 1430 int used_sacks;
1348 1431 int reord = tp->packets_out;
1349 1432 int flag = 0;
1350 1433 int found_dup_sack = 0;
1351   - int cached_fack_count;
1352   - int i;
  1434 + int fack_count;
  1435 + int i, j;
1353 1436 int first_sack_index;
1354   - int force_one_sack;
1355 1437  
1356 1438 if (!tp->sacked_out) {
1357 1439 if (WARN_ON(tp->fackets_out))
... ... @@ -1409,132 +1491,123 @@
1409 1491 used_sacks++;
1410 1492 }
1411 1493  
1412   - /* SACK fastpath:
1413   - * if the only SACK change is the increase of the end_seq of
1414   - * the first block then only apply that SACK block
1415   - * and use retrans queue hinting otherwise slowpath */
1416   - force_one_sack = 1;
1417   - for (i = 0; i < used_sacks; i++) {
1418   - u32 start_seq = sp[i].start_seq;
1419   - u32 end_seq = sp[i].end_seq;
  1494 + /* order SACK blocks to allow in order walk of the retrans queue */
  1495 + for (i = used_sacks - 1; i > 0; i--) {
  1496 + for (j = 0; j < i; j++){
  1497 + if (after(sp[j].start_seq, sp[j+1].start_seq)) {
  1498 + struct tcp_sack_block tmp;
1420 1499  
1421   - if (i == 0) {
1422   - if (tp->recv_sack_cache[i].start_seq != start_seq)
1423   - force_one_sack = 0;
1424   - } else {
1425   - if ((tp->recv_sack_cache[i].start_seq != start_seq) ||
1426   - (tp->recv_sack_cache[i].end_seq != end_seq))
1427   - force_one_sack = 0;
1428   - }
1429   - tp->recv_sack_cache[i].start_seq = start_seq;
1430   - tp->recv_sack_cache[i].end_seq = end_seq;
1431   - }
1432   - /* Clear the rest of the cache sack blocks so they won't match mistakenly. */
1433   - for (; i < ARRAY_SIZE(tp->recv_sack_cache); i++) {
1434   - tp->recv_sack_cache[i].start_seq = 0;
1435   - tp->recv_sack_cache[i].end_seq = 0;
1436   - }
  1500 + tmp = sp[j];
  1501 + sp[j] = sp[j+1];
  1502 + sp[j+1] = tmp;
1437 1503  
1438   - if (force_one_sack)
1439   - used_sacks = 1;
1440   - else {
1441   - int j;
1442   - tp->fastpath_skb_hint = NULL;
1443   -
1444   - /* order SACK blocks to allow in order walk of the retrans queue */
1445   - for (i = used_sacks - 1; i > 0; i--) {
1446   - for (j = 0; j < i; j++){
1447   - if (after(sp[j].start_seq, sp[j+1].start_seq)) {
1448   - struct tcp_sack_block tmp;
1449   -
1450   - tmp = sp[j];
1451   - sp[j] = sp[j+1];
1452   - sp[j+1] = tmp;
1453   -
1454   - /* Track where the first SACK block goes to */
1455   - if (j == first_sack_index)
1456   - first_sack_index = j+1;
1457   - }
1458   -
  1504 + /* Track where the first SACK block goes to */
  1505 + if (j == first_sack_index)
  1506 + first_sack_index = j+1;
1459 1507 }
1460 1508 }
1461 1509 }
1462 1510  
1463   - /* Use SACK fastpath hint if valid */
1464   - cached_skb = tp->fastpath_skb_hint;
1465   - cached_fack_count = tp->fastpath_cnt_hint;
1466   - if (!cached_skb) {
1467   - cached_skb = tcp_write_queue_head(sk);
1468   - cached_fack_count = 0;
  1511 + skb = tcp_write_queue_head(sk);
  1512 + fack_count = 0;
  1513 + i = 0;
  1514 +
  1515 + if (!tp->sacked_out) {
  1516 + /* It's already past, so skip checking against it */
  1517 + cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
  1518 + } else {
  1519 + cache = tp->recv_sack_cache;
  1520 + /* Skip empty blocks in at head of the cache */
  1521 + while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
  1522 + !cache->end_seq)
  1523 + cache++;
1469 1524 }
1470 1525  
1471   - for (i = 0; i < used_sacks; i++) {
1472   - struct sk_buff *skb;
  1526 + while (i < used_sacks) {
1473 1527 u32 start_seq = sp[i].start_seq;
1474 1528 u32 end_seq = sp[i].end_seq;
1475   - int fack_count;
1476 1529 int dup_sack = (found_dup_sack && (i == first_sack_index));
1477   - int next_dup = (found_dup_sack && (i+1 == first_sack_index));
  1530 + struct tcp_sack_block *next_dup = NULL;
1478 1531  
1479   - skb = cached_skb;
1480   - fack_count = cached_fack_count;
  1532 + if (found_dup_sack && ((i + 1) == first_sack_index))
  1533 + next_dup = &sp[i + 1];
1481 1534  
1482 1535 /* Event "B" in the comment above. */
1483 1536 if (after(end_seq, tp->high_seq))
1484 1537 flag |= FLAG_DATA_LOST;
1485 1538  
1486   - tcp_for_write_queue_from(skb, sk) {
1487   - int in_sack = 0;
  1539 + /* Skip too early cached blocks */
  1540 + while (tcp_sack_cache_ok(tp, cache) &&
  1541 + !before(start_seq, cache->end_seq))
  1542 + cache++;
1488 1543  
1489   - if (skb == tcp_send_head(sk))
1490   - break;
  1544 + /* Can skip some work by looking recv_sack_cache? */
  1545 + if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
  1546 + after(end_seq, cache->start_seq)) {
1491 1547  
1492   - cached_skb = skb;
1493   - cached_fack_count = fack_count;
1494   - if (i == first_sack_index) {
1495   - tp->fastpath_skb_hint = skb;
1496   - tp->fastpath_cnt_hint = fack_count;
  1548 + /* Head todo? */
  1549 + if (before(start_seq, cache->start_seq)) {
  1550 + skb = tcp_sacktag_skip(skb, sk, start_seq);
  1551 + skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq,
  1552 + cache->start_seq, dup_sack,
  1553 + &fack_count, &reord, &flag);
1497 1554 }
1498 1555  
1499   - /* The retransmission queue is always in order, so
1500   - * we can short-circuit the walk early.
1501   - */
1502   - if (!before(TCP_SKB_CB(skb)->seq, end_seq))
1503   - break;
  1556 + /* Rest of the block already fully processed? */
  1557 + if (!after(end_seq, cache->end_seq)) {
  1558 + skb = tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq,
  1559 + &fack_count, &reord, &flag);
  1560 + goto advance_sp;
  1561 + }
1504 1562  
1505   - dup_sack = (found_dup_sack && (i == first_sack_index));
  1563 + /* ...tail remains todo... */
  1564 + if (TCP_SKB_CB(tp->highest_sack)->end_seq == cache->end_seq) {
  1565 + /* ...but better entrypoint exists! Check that DSACKs are
  1566 + * properly accounted while skipping here
  1567 + */
  1568 + tcp_maybe_skipping_dsack(skb, sk, next_dup, cache->end_seq,
  1569 + &fack_count, &reord, &flag);
1506 1570  
1507   - /* Due to sorting DSACK may reside within this SACK block! */
1508   - if (next_dup) {
1509   - u32 dup_start = sp[i+1].start_seq;
1510   - u32 dup_end = sp[i+1].end_seq;
1511   -
1512   - if (before(TCP_SKB_CB(skb)->seq, dup_end)) {
1513   - in_sack = tcp_match_skb_to_sack(sk, skb, dup_start, dup_end);
1514   - if (in_sack > 0)
1515   - dup_sack = 1;
1516   - }
  1571 + skb = tcp_write_queue_next(sk, tp->highest_sack);
  1572 + fack_count = tp->fackets_out;
  1573 + cache++;
  1574 + goto walk;
1517 1575 }
1518 1576  
1519   - /* DSACK info lost if out-of-mem, try SACK still */
1520   - if (in_sack <= 0)
1521   - in_sack = tcp_match_skb_to_sack(sk, skb, start_seq, end_seq);
1522   - if (unlikely(in_sack < 0))
1523   - break;
  1577 + skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
  1578 + /* Check overlap against next cached too (past this one already) */
  1579 + cache++;
  1580 + continue;
  1581 + }
1524 1582  
1525   - if (in_sack)
1526   - flag |= tcp_sacktag_one(skb, tp, &reord, dup_sack, fack_count);
1527   -
1528   - fack_count += tcp_skb_pcount(skb);
  1583 + if (!before(start_seq, tcp_highest_sack_seq(tp))) {
  1584 + skb = tcp_write_queue_next(sk, tp->highest_sack);
  1585 + fack_count = tp->fackets_out;
1529 1586 }
  1587 + skb = tcp_sacktag_skip(skb, sk, start_seq);
1530 1588  
  1589 +walk:
  1590 + skb = tcp_sacktag_walk(skb, sk, next_dup, start_seq, end_seq,
  1591 + dup_sack, &fack_count, &reord, &flag);
  1592 +
  1593 +advance_sp:
1531 1594 /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
1532 1595 * due to in-order walk
1533 1596 */
1534 1597 if (after(end_seq, tp->frto_highmark))
1535 1598 flag &= ~FLAG_ONLY_ORIG_SACKED;
  1599 +
  1600 + i++;
1536 1601 }
1537 1602  
  1603 + /* Clear the head of the cache sack blocks so we can skip it next time */
  1604 + for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
  1605 + tp->recv_sack_cache[i].start_seq = 0;
  1606 + tp->recv_sack_cache[i].end_seq = 0;
  1607 + }
  1608 + for (j = 0; j < used_sacks; j++)
  1609 + tp->recv_sack_cache[i++] = sp[j];
  1610 +
1538 1611 flag |= tcp_mark_lost_retrans(sk);
1539 1612  
1540 1613 tcp_verify_left_out(tp);
... ... @@ -2821,9 +2894,7 @@
2821 2894 }
2822 2895  
2823 2896 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
2824   - /* hint's skb might be NULL but we don't need to care */
2825   - tp->fastpath_cnt_hint -= min_t(u32, pkts_acked,
2826   - tp->fastpath_cnt_hint);
  2897 +
2827 2898 if (ca_ops->pkts_acked) {
2828 2899 s32 rtt_us = -1;
2829 2900  
net/ipv4/tcp_output.c
... ... @@ -653,9 +653,7 @@
653 653 }
654 654  
655 655 /* When a modification to fackets out becomes necessary, we need to check
656   - * skb is counted to fackets_out or not. Another important thing is to
657   - * tweak SACK fastpath hint too as it would overwrite all changes unless
658   - * hint is also changed.
  656 + * skb is counted to fackets_out or not.
659 657 */
660 658 static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb,
661 659 int decr)
... ... @@ -667,11 +665,6 @@
667 665  
668 666 if (!before(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
669 667 tp->fackets_out -= decr;
670   -
671   - /* cnt_hint is "off-by-one" compared with fackets_out (see sacktag) */
672   - if (tp->fastpath_skb_hint != NULL &&
673   - after(TCP_SKB_CB(tp->fastpath_skb_hint)->seq, TCP_SKB_CB(skb)->seq))
674   - tp->fastpath_cnt_hint -= decr;
675 668 }
676 669  
677 670 /* Function to create two new TCP segments. Shrinks the given segment
... ... @@ -1753,11 +1746,6 @@
1753 1746  
1754 1747 /* changed transmit queue under us so clear hints */
1755 1748 tcp_clear_retrans_hints_partial(tp);
1756   - /* manually tune sacktag skb hint */
1757   - if (tp->fastpath_skb_hint == next_skb) {
1758   - tp->fastpath_skb_hint = skb;
1759   - tp->fastpath_cnt_hint -= tcp_skb_pcount(skb);
1760   - }
1761 1749  
1762 1750 sk_stream_free_skb(sk, next_skb);
1763 1751 }