Commit 0403e3827788d878163f9ef0541b748b0f88ca5d

Authored by Dan Williams
1 parent f9dd213437

dmaengine: add fence support

Some engines optimize operation by reading ahead in the descriptor chain
such that descriptor2 may start execution before descriptor1 completes.
If descriptor2 depends on the result from descriptor1 then a fence is
required (on descriptor2) to disable this optimization.  The async_tx
API could infer such dependencies implicitly from the 'depend_tx'
parameter, but that would over-constrain chains in which 'depend_tx'
only specifies a completion order rather than a data dependency.  So,
provide an ASYNC_TX_FENCE flag to explicitly identify data
dependencies.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
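
A minimal caller-side sketch of the new flag (illustration only, not part of
this commit; copy_then_xor() and its page arguments are invented).  It follows
the pattern used by the raid5/raid6 call sites below: the operation whose
result the next operation consumes is submitted with ASYNC_TX_FENCE, and the
consumer is merely ordered after it via the depend_tx argument of
init_async_submit().

#include <linux/async_tx.h>

/* Copy 'src' into 'dest', then xor 'other' into that fresh copy. */
static struct dma_async_tx_descriptor *
copy_then_xor(struct page *dest, struct page *src, struct page *other)
{
        struct dma_async_tx_descriptor *tx;
        struct async_submit_ctl submit;
        struct page *srcs[2];

        /* descriptor1 (producer): fenced because the xor reads its output */
        init_async_submit(&submit, ASYNC_TX_FENCE, NULL, NULL, NULL, NULL);
        tx = async_memcpy(dest, src, 0, 0, PAGE_SIZE, &submit);

        /* descriptor2 (consumer): ordered after tx, computes dest ^= other */
        srcs[0] = dest;
        srcs[1] = other;
        init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL, NULL);
        return async_xor(dest, srcs, 0, 2, PAGE_SIZE, &submit);
}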

Showing 8 changed files with 79 additions and 41 deletions

crypto/async_tx/async_memcpy.c
... ... @@ -52,9 +52,12 @@
52 52  
53 53 if (device) {
54 54 dma_addr_t dma_dest, dma_src;
55   - unsigned long dma_prep_flags;
  55 + unsigned long dma_prep_flags = 0;
56 56  
57   - dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
  57 + if (submit->cb_fn)
  58 + dma_prep_flags |= DMA_PREP_INTERRUPT;
  59 + if (submit->flags & ASYNC_TX_FENCE)
  60 + dma_prep_flags |= DMA_PREP_FENCE;
58 61 dma_dest = dma_map_page(device->dev, dest, dest_offset, len,
59 62 DMA_FROM_DEVICE);
60 63  
crypto/async_tx/async_memset.c
... ... @@ -49,9 +49,12 @@
49 49  
50 50 if (device) {
51 51 dma_addr_t dma_dest;
52   - unsigned long dma_prep_flags;
  52 + unsigned long dma_prep_flags = 0;
53 53  
54   - dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
  54 + if (submit->cb_fn)
  55 + dma_prep_flags |= DMA_PREP_INTERRUPT;
  56 + if (submit->flags & ASYNC_TX_FENCE)
  57 + dma_prep_flags |= DMA_PREP_FENCE;
55 58 dma_dest = dma_map_page(device->dev, dest, offset, len,
56 59 DMA_FROM_DEVICE);
57 60  
crypto/async_tx/async_pq.c
... ... @@ -101,6 +101,7 @@
101 101 */
102 102 if (src_cnt > pq_src_cnt) {
103 103 submit->flags &= ~ASYNC_TX_ACK;
  104 + submit->flags |= ASYNC_TX_FENCE;
104 105 dma_flags |= DMA_COMPL_SKIP_DEST_UNMAP;
105 106 submit->cb_fn = NULL;
106 107 submit->cb_param = NULL;
... ... @@ -111,6 +112,8 @@
111 112 if (cb_fn_orig)
112 113 dma_flags |= DMA_PREP_INTERRUPT;
113 114 }
  115 + if (submit->flags & ASYNC_TX_FENCE)
  116 + dma_flags |= DMA_PREP_FENCE;
114 117  
115 118 /* Since we have clobbered the src_list we are committed
116 119 * to doing this asynchronously. Drivers force forward
... ... @@ -282,6 +285,8 @@
282 285 dma_flags |= DMA_PREP_PQ_DISABLE_P;
283 286 if (!Q(blocks, disks))
284 287 dma_flags |= DMA_PREP_PQ_DISABLE_Q;
  288 + if (submit->flags & ASYNC_TX_FENCE)
  289 + dma_flags |= DMA_PREP_FENCE;
285 290 for (i = 0; i < disks; i++)
286 291 if (likely(blocks[i])) {
287 292 BUG_ON(is_raid6_zero_block(blocks[i]));
crypto/async_tx/async_raid6_recov.c
... ... @@ -44,6 +44,8 @@
44 44 struct dma_async_tx_descriptor *tx;
45 45 enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
46 46  
  47 + if (submit->flags & ASYNC_TX_FENCE)
  48 + dma_flags |= DMA_PREP_FENCE;
47 49 dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
48 50 dma_src[0] = dma_map_page(dev, srcs[0], 0, len, DMA_TO_DEVICE);
49 51 dma_src[1] = dma_map_page(dev, srcs[1], 0, len, DMA_TO_DEVICE);
... ... @@ -89,6 +91,8 @@
89 91 struct dma_async_tx_descriptor *tx;
90 92 enum dma_ctrl_flags dma_flags = DMA_PREP_PQ_DISABLE_P;
91 93  
  94 + if (submit->flags & ASYNC_TX_FENCE)
  95 + dma_flags |= DMA_PREP_FENCE;
92 96 dma_dest[1] = dma_map_page(dev, dest, 0, len, DMA_BIDIRECTIONAL);
93 97 dma_src[0] = dma_map_page(dev, src, 0, len, DMA_TO_DEVICE);
94 98 tx = dma->device_prep_dma_pq(chan, dma_dest, dma_src, 1, &coef,
... ... @@ -138,7 +142,7 @@
138 142 srcs[1] = q;
139 143 coef[0] = raid6_gfexi[failb-faila];
140 144 coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
141   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  145 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
142 146 tx = async_sum_product(b, srcs, coef, bytes, submit);
143 147  
144 148 /* Dy = P+Pxy+Dx */
... ... @@ -188,23 +192,23 @@
188 192 dp = blocks[faila];
189 193 dq = blocks[failb];
190 194  
191   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  195 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
192 196 tx = async_memcpy(dp, g, 0, 0, bytes, submit);
193   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  197 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
194 198 tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
195 199  
196 200 /* compute P + Pxy */
197 201 srcs[0] = dp;
198 202 srcs[1] = p;
199   - init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
200   - scribble);
  203 + init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
  204 + NULL, NULL, scribble);
201 205 tx = async_xor(dp, srcs, 0, 2, bytes, submit);
202 206  
203 207 /* compute Q + Qxy */
204 208 srcs[0] = dq;
205 209 srcs[1] = q;
206   - init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
207   - scribble);
  210 + init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
  211 + NULL, NULL, scribble);
208 212 tx = async_xor(dq, srcs, 0, 2, bytes, submit);
209 213  
210 214 /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
... ... @@ -212,7 +216,7 @@
212 216 srcs[1] = dq;
213 217 coef[0] = raid6_gfexi[failb-faila];
214 218 coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
215   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  219 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
216 220 tx = async_sum_product(dq, srcs, coef, bytes, submit);
217 221  
218 222 /* Dy = P+Pxy+Dx */
... ... @@ -252,7 +256,7 @@
252 256 blocks[failb] = (void *)raid6_empty_zero_page;
253 257 blocks[disks-1] = dq;
254 258  
255   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  259 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
256 260 tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
257 261  
258 262 /* Restore pointer table */
... ... @@ -264,15 +268,15 @@
264 268 /* compute P + Pxy */
265 269 srcs[0] = dp;
266 270 srcs[1] = p;
267   - init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
268   - scribble);
  271 + init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
  272 + NULL, NULL, scribble);
269 273 tx = async_xor(dp, srcs, 0, 2, bytes, submit);
270 274  
271 275 /* compute Q + Qxy */
272 276 srcs[0] = dq;
273 277 srcs[1] = q;
274   - init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
275   - scribble);
  278 + init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
  279 + NULL, NULL, scribble);
276 280 tx = async_xor(dq, srcs, 0, 2, bytes, submit);
277 281  
278 282 /* Dx = A*(P+Pxy) + B*(Q+Qxy) */
... ... @@ -280,7 +284,7 @@
280 284 srcs[1] = dq;
281 285 coef[0] = raid6_gfexi[failb-faila];
282 286 coef[1] = raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]];
283   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  287 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
284 288 tx = async_sum_product(dq, srcs, coef, bytes, submit);
285 289  
286 290 /* Dy = P+Pxy+Dx */
... ... @@ -407,13 +411,16 @@
407 411 int good = faila == 0 ? 1 : 0;
408 412 struct page *g = blocks[good];
409 413  
410   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  414 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
  415 + scribble);
411 416 tx = async_memcpy(p, g, 0, 0, bytes, submit);
412 417  
413   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  418 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
  419 + scribble);
414 420 tx = async_mult(dq, g, raid6_gfexp[good], bytes, submit);
415 421 } else {
416   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  422 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL,
  423 + scribble);
417 424 tx = async_gen_syndrome(blocks, 0, disks, bytes, submit);
418 425 }
419 426  
... ... @@ -426,11 +433,11 @@
426 433  
427 434 srcs[0] = dq;
428 435 srcs[1] = q;
429   - init_async_submit(submit, ASYNC_TX_XOR_DROP_DST, tx, NULL, NULL,
430   - scribble);
  436 + init_async_submit(submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
  437 + NULL, NULL, scribble);
431 438 tx = async_xor(dq, srcs, 0, 2, bytes, submit);
432 439  
433   - init_async_submit(submit, 0, tx, NULL, NULL, scribble);
  440 + init_async_submit(submit, ASYNC_TX_FENCE, tx, NULL, NULL, scribble);
434 441 tx = async_mult(dq, dq, coef, bytes, submit);
435 442  
436 443 srcs[0] = p;
crypto/async_tx/async_xor.c
... ... @@ -69,6 +69,7 @@
69 69 */
70 70 if (src_cnt > xor_src_cnt) {
71 71 submit->flags &= ~ASYNC_TX_ACK;
  72 + submit->flags |= ASYNC_TX_FENCE;
72 73 dma_flags = DMA_COMPL_SKIP_DEST_UNMAP;
73 74 submit->cb_fn = NULL;
74 75 submit->cb_param = NULL;
... ... @@ -78,7 +79,8 @@
78 79 }
79 80 if (submit->cb_fn)
80 81 dma_flags |= DMA_PREP_INTERRUPT;
81   -
  82 + if (submit->flags & ASYNC_TX_FENCE)
  83 + dma_flags |= DMA_PREP_FENCE;
82 84 /* Since we have clobbered the src_list we are committed
83 85 * to doing this asynchronously. Drivers force forward progress
84 86 * in case they can not provide a descriptor
... ... @@ -264,12 +266,15 @@
264 266 dma_src = (dma_addr_t *) src_list;
265 267  
266 268 if (dma_src && device && src_cnt <= device->max_xor) {
267   - unsigned long dma_prep_flags;
  269 + unsigned long dma_prep_flags = 0;
268 270 int i;
269 271  
270 272 pr_debug("%s: (async) len: %zu\n", __func__, len);
271 273  
272   - dma_prep_flags = submit->cb_fn ? DMA_PREP_INTERRUPT : 0;
  274 + if (submit->cb_fn)
  275 + dma_prep_flags |= DMA_PREP_INTERRUPT;
  276 + if (submit->flags & ASYNC_TX_FENCE)
  277 + dma_prep_flags |= DMA_PREP_FENCE;
273 278 for (i = 0; i < src_cnt; i++)
274 279 dma_src[i] = dma_map_page(device->dev, src_list[i],
275 280 offset, len, DMA_TO_DEVICE);
drivers/md/raid5.c
... ... @@ -502,13 +502,17 @@
502 502 int i;
503 503 int page_offset;
504 504 struct async_submit_ctl submit;
  505 + enum async_tx_flags flags = 0;
505 506  
506 507 if (bio->bi_sector >= sector)
507 508 page_offset = (signed)(bio->bi_sector - sector) * 512;
508 509 else
509 510 page_offset = (signed)(sector - bio->bi_sector) * -512;
510 511  
511   - init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
  512 + if (frombio)
  513 + flags |= ASYNC_TX_FENCE;
  514 + init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
  515 +
512 516 bio_for_each_segment(bvl, bio, i) {
513 517 int len = bio_iovec_idx(bio, i)->bv_len;
514 518 int clen;
... ... @@ -685,7 +689,7 @@
685 689  
686 690 atomic_inc(&sh->count);
687 691  
688   - init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
  692 + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
689 693 ops_complete_compute, sh, to_addr_conv(sh, percpu));
690 694 if (unlikely(count == 1))
691 695 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
... ... @@ -763,7 +767,8 @@
763 767 count = set_syndrome_sources(blocks, sh);
764 768 blocks[count] = NULL; /* regenerating p is not necessary */
765 769 BUG_ON(blocks[count+1] != dest); /* q should already be set */
766   - init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
  770 + init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
  771 + ops_complete_compute, sh,
767 772 to_addr_conv(sh, percpu));
768 773 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
769 774 } else {
... ... @@ -775,8 +780,8 @@
775 780 blocks[count++] = sh->dev[i].page;
776 781 }
777 782  
778   - init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
779   - ops_complete_compute, sh,
  783 + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
  784 + NULL, ops_complete_compute, sh,
780 785 to_addr_conv(sh, percpu));
781 786 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
782 787 }
... ... @@ -837,8 +842,9 @@
837 842 /* Q disk is one of the missing disks */
838 843 if (faila == syndrome_disks) {
839 844 /* Missing P+Q, just recompute */
840   - init_async_submit(&submit, 0, NULL, ops_complete_compute,
841   - sh, to_addr_conv(sh, percpu));
  845 + init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
  846 + ops_complete_compute, sh,
  847 + to_addr_conv(sh, percpu));
842 848 return async_gen_syndrome(blocks, 0, count+2,
843 849 STRIPE_SIZE, &submit);
844 850 } else {
... ... @@ -859,21 +865,24 @@
859 865 blocks[count++] = sh->dev[i].page;
860 866 }
861 867 dest = sh->dev[data_target].page;
862   - init_async_submit(&submit, ASYNC_TX_XOR_ZERO_DST, NULL,
863   - NULL, NULL, to_addr_conv(sh, percpu));
  868 + init_async_submit(&submit,
  869 + ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
  870 + NULL, NULL, NULL,
  871 + to_addr_conv(sh, percpu));
864 872 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
865 873 &submit);
866 874  
867 875 count = set_syndrome_sources(blocks, sh);
868   - init_async_submit(&submit, 0, tx, ops_complete_compute,
869   - sh, to_addr_conv(sh, percpu));
  876 + init_async_submit(&submit, ASYNC_TX_FENCE, tx,
  877 + ops_complete_compute, sh,
  878 + to_addr_conv(sh, percpu));
870 879 return async_gen_syndrome(blocks, 0, count+2,
871 880 STRIPE_SIZE, &submit);
872 881 }
873 882 }
874 883  
875   - init_async_submit(&submit, 0, NULL, ops_complete_compute, sh,
876   - to_addr_conv(sh, percpu));
  884 + init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute,
  885 + sh, to_addr_conv(sh, percpu));
877 886 if (failb == syndrome_disks) {
878 887 /* We're missing D+P. */
879 888 return async_raid6_datap_recov(syndrome_disks+2, STRIPE_SIZE,
... ... @@ -916,7 +925,7 @@
916 925 xor_srcs[count++] = dev->page;
917 926 }
918 927  
919   - init_async_submit(&submit, ASYNC_TX_XOR_DROP_DST, tx,
  928 + init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
920 929 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
921 930 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
922 931  
include/linux/async_tx.h
... ... @@ -58,11 +58,14 @@
58 58 * array.
59 59 * @ASYNC_TX_ACK: immediately ack the descriptor, precludes setting up a
60 60 * dependency chain
  61 + * @ASYNC_TX_FENCE: specify that the next operation in the dependency
  62 + * chain uses this operation's result as an input
61 63 */
62 64 enum async_tx_flags {
63 65 ASYNC_TX_XOR_ZERO_DST = (1 << 0),
64 66 ASYNC_TX_XOR_DROP_DST = (1 << 1),
65 67 ASYNC_TX_ACK = (1 << 2),
  68 + ASYNC_TX_FENCE = (1 << 3),
66 69 };
67 70  
68 71 /**
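
By contrast, when depend_tx only enforces a completion order and the second
operation never reads the first operation's output, the fence is left out.
A hypothetical sketch (function and callback names invented, not from this
commit):

#include <linux/async_tx.h>

/* Two copies into unrelated pages; 'done_cb' runs once the chain finishes. */
static struct dma_async_tx_descriptor *
two_independent_copies(struct page *dest_a, struct page *src_a,
                       struct page *dest_b, struct page *src_b,
                       dma_async_tx_callback done_cb, void *done_arg)
{
        struct dma_async_tx_descriptor *tx;
        struct async_submit_ctl submit;

        /* no fence: the second copy does not use the first copy's result */
        init_async_submit(&submit, 0, NULL, NULL, NULL, NULL);
        tx = async_memcpy(dest_a, src_a, 0, 0, PAGE_SIZE, &submit);

        /* depend_tx keeps the completion order without fencing read-ahead */
        init_async_submit(&submit, 0, tx, done_cb, done_arg, NULL);
        return async_memcpy(dest_b, src_b, 0, 0, PAGE_SIZE, &submit);
}
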
include/linux/dmaengine.h
... ... @@ -87,6 +87,8 @@
87 87 * @DMA_PREP_CONTINUE - indicate to a driver that it is reusing buffers as
88 88 * sources that were the result of a previous operation, in the case of a PQ
89 89 * operation it continues the calculation with new sources
  90 + * @DMA_PREP_FENCE - tell the driver that subsequent operations depend
  91 + * on the result of this operation
90 92 */
91 93 enum dma_ctrl_flags {
92 94 DMA_PREP_INTERRUPT = (1 << 0),
... ... @@ -98,6 +100,7 @@
98 100 DMA_PREP_PQ_DISABLE_P = (1 << 6),
99 101 DMA_PREP_PQ_DISABLE_Q = (1 << 7),
100 102 DMA_PREP_CONTINUE = (1 << 8),
  103 + DMA_PREP_FENCE = (1 << 9),
101 104 };
102 105  
103 106 /**
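
On the driver side, DMA_PREP_FENCE is consumed by the device_prep_dma_*
routines.  The sketch below is hypothetical: the descriptor layout and bit
names are invented, and real drivers such as ioatdma define their own formats.
It only shows where the flag would be folded into a hardware control word.

#include <linux/dmaengine.h>

/* Hypothetical hardware descriptor; not taken from any real driver. */
struct example_hw_desc {
        u64 ctl;        /* control word read by the engine */
        u64 src;
        u64 dst;
        u64 len;
};

#define EXAMPLE_CTL_INT         (1ULL << 0)     /* interrupt on completion */
#define EXAMPLE_CTL_FENCE       (1ULL << 4)     /* no read-ahead past this op */

static void example_set_ctl(struct example_hw_desc *hw,
                            enum dma_ctrl_flags flags)
{
        hw->ctl = 0;
        if (flags & DMA_PREP_INTERRUPT)
                hw->ctl |= EXAMPLE_CTL_INT;
        if (flags & DMA_PREP_FENCE)
                hw->ctl |= EXAMPLE_CTL_FENCE;
}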