Commit b45305fce5bb1abec263fcff9d81ebecd6306ede

Authored by Daniel Vetter
1 parent 6547fbdbff

drm/i915: Implement workaround for broken CS tlb on i830/845

Now that Chris Wilson demonstrated that the key for stability on early
gen 2 is to simply _never_ exchange the physical backing storage of
batch buffers, I've taken a stab at a kernel solution. Doesn't look too
nefarious imho, now that I don't try to be too clever for my own good
any more.

v2: After discussing the various techniques, we've decided to always blit
batches on the suspect devices, but allow userspace to opt out of the
kernel workaround and assume full responsibility for providing coherent
batches. The principal reason is that avoiding the blit does improve
performance in a few key microbenchmarks and also in cairo-trace
replays.
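
For illustration only: a client that wants the opt-out would first probe
the new parameter and set the execbuf flag only when it can keep its
batches pinned and coherent by itself. A minimal sketch against the libdrm
ioctl wrapper (helper name invented for illustration, error handling
elided; fd is an open i915 DRM file descriptor):

    #include <xf86drm.h>
    #include <i915_drm.h>

    static int has_pinned_batches(int fd)
    {
            drm_i915_getparam_t gp;
            int value = 0;

            gp.param = I915_PARAM_HAS_PINNED_BATCHES;
            gp.value = &value;

            /* Older kernels reject the param; treat that as unsupported. */
            if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
                    return 0;

            return value;
    }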

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
[danvet:
- Drop the hunk which uses HAS_BROKEN_CS_TLB to implement the ring
  wrap w/a. Suggested by Chris Wilson.
- Also add the ACTHD check from Chris Wilson for the error state
  dumping, so that we still catch batches when userspace opts out of
  the w/a.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

Showing 7 changed files with 100 additions and 8 deletions

drivers/gpu/drm/i915/i915_dma.c
... ... @@ -989,6 +989,9 @@
989 989 case I915_PARAM_HAS_SECURE_BATCHES:
990 990 value = capable(CAP_SYS_ADMIN);
991 991 break;
  992 + case I915_PARAM_HAS_PINNED_BATCHES:
  993 + value = 1;
  994 + break;
992 995 default:
993 996 DRM_DEBUG_DRIVER("Unknown parameter %d\n",
994 997 param->param);
drivers/gpu/drm/i915/i915_drv.h
... ... @@ -1100,6 +1100,7 @@
1100 1100 */
1101 1101 atomic_t pending_flip;
1102 1102 };
  1103 +#define to_gem_object(obj) (&((struct drm_i915_gem_object *)(obj))->base)
1103 1104  
1104 1105 #define to_intel_bo(x) container_of(x, struct drm_i915_gem_object, base)
1105 1106  
... ... @@ -1198,6 +1199,9 @@
1198 1199  
1199 1200 #define HAS_OVERLAY(dev) (INTEL_INFO(dev)->has_overlay)
1200 1201 #define OVERLAY_NEEDS_PHYSICAL(dev) (INTEL_INFO(dev)->overlay_needs_physical)
  1202 +
  1203 +/* Early gen2 parts have a totally busted CS tlb and require pinned batches. */
  1204 +#define HAS_BROKEN_CS_TLB(dev) (IS_I830(dev) || IS_845G(dev))
1201 1205  
1202 1206 /* With the 945 and later, Y tiling got adjusted so that it was 32 128-byte
1203 1207 * rows, which changed the alignment requirements and fence programming.
drivers/gpu/drm/i915/i915_gem_execbuffer.c
... ... @@ -808,6 +808,8 @@
808 808  
809 809 flags |= I915_DISPATCH_SECURE;
810 810 }
  811 + if (args->flags & I915_EXEC_IS_PINNED)
  812 + flags |= I915_DISPATCH_PINNED;
811 813  
812 814 switch (args->flags & I915_EXEC_RING_MASK) {
813 815 case I915_EXEC_DEFAULT:
drivers/gpu/drm/i915/i915_irq.c
... ... @@ -1087,6 +1087,18 @@
1087 1087 if (!ring->get_seqno)
1088 1088 return NULL;
1089 1089  
  1090 + if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
  1091 + u32 acthd = I915_READ(ACTHD);
  1092 +
  1093 + if (WARN_ON(ring->id != RCS))
  1094 + return NULL;
  1095 +
  1096 + obj = ring->private;
  1097 + if (acthd >= obj->gtt_offset &&
  1098 + acthd < obj->gtt_offset + obj->base.size)
  1099 + return i915_error_object_create(dev_priv, obj);
  1100 + }
  1101 +
1090 1102 seqno = ring->get_seqno(ring, false);
1091 1103 list_for_each_entry(obj, &dev_priv->mm.active_list, mm_list) {
1092 1104 if (obj->ring != ring)
drivers/gpu/drm/i915/intel_ringbuffer.c
... ... @@ -547,9 +547,14 @@
547 547  
548 548 static void render_ring_cleanup(struct intel_ring_buffer *ring)
549 549 {
  550 + struct drm_device *dev = ring->dev;
  551 +
550 552 if (!ring->private)
551 553 return;
552 554  
  555 + if (HAS_BROKEN_CS_TLB(dev))
  556 + drm_gem_object_unreference(to_gem_object(ring->private));
  557 +
553 558 cleanup_pipe_control(ring);
554 559 }
555 560  
... ... @@ -969,6 +974,8 @@
969 974 return 0;
970 975 }
971 976  
  977 +/* Just a userspace ABI convention to limit the w/a batch bo to a reasonable size */
  978 +#define I830_BATCH_LIMIT (256*1024)
972 979 static int
973 980 i830_dispatch_execbuffer(struct intel_ring_buffer *ring,
974 981 u32 offset, u32 len,
975 982  
976 983  
... ... @@ -976,16 +983,48 @@
976 983 {
977 984 int ret;
978 985  
979   - ret = intel_ring_begin(ring, 4);
980   - if (ret)
981   - return ret;
  986 + if (flags & I915_DISPATCH_PINNED) {
  987 + ret = intel_ring_begin(ring, 4);
  988 + if (ret)
  989 + return ret;
982 990  
983   - intel_ring_emit(ring, MI_BATCH_BUFFER);
984   - intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE));
985   - intel_ring_emit(ring, offset + len - 8);
986   - intel_ring_emit(ring, 0);
987   - intel_ring_advance(ring);
  991 + intel_ring_emit(ring, MI_BATCH_BUFFER);
  992 + intel_ring_emit(ring, offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE));
  993 + intel_ring_emit(ring, offset + len - 8);
  994 + intel_ring_emit(ring, MI_NOOP);
  995 + intel_ring_advance(ring);
  996 + } else {
  997 + struct drm_i915_gem_object *obj = ring->private;
  998 + u32 cs_offset = obj->gtt_offset;
988 999  
  1000 + if (len > I830_BATCH_LIMIT)
  1001 + return -ENOSPC;
  1002 +
  1003 + ret = intel_ring_begin(ring, 9+3);
  1004 + if (ret)
  1005 + return ret;
  1006 + /* Blit the batch (which now has all relocs applied) to the stable batch
  1007 + * scratch bo area (so that the CS never stumbles over its tlb
  1008 + * invalidation bug) ... */
  1009 + intel_ring_emit(ring, XY_SRC_COPY_BLT_CMD |
  1010 + XY_SRC_COPY_BLT_WRITE_ALPHA |
  1011 + XY_SRC_COPY_BLT_WRITE_RGB);
  1012 + intel_ring_emit(ring, BLT_DEPTH_32 | BLT_ROP_GXCOPY | 4096);
  1013 + intel_ring_emit(ring, 0);
  1014 + intel_ring_emit(ring, (DIV_ROUND_UP(len, 4096) << 16) | 1024);
  1015 + intel_ring_emit(ring, cs_offset);
  1016 + intel_ring_emit(ring, 0);
  1017 + intel_ring_emit(ring, 4096);
  1018 + intel_ring_emit(ring, offset);
  1019 + intel_ring_emit(ring, MI_FLUSH);
  1020 +
  1021 + /* ... and execute it. */
  1022 + intel_ring_emit(ring, MI_BATCH_BUFFER);
  1023 + intel_ring_emit(ring, cs_offset | (flags & I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE));
  1024 + intel_ring_emit(ring, cs_offset + len - 8);
  1025 + intel_ring_advance(ring);
  1026 + }
  1027 +
989 1028 return 0;
990 1029 }
991 1030  
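
The blit geometry above is worth spelling out: the copy is issued as a
32bpp XY_SRC_COPY_BLT with a 4096-byte pitch and a width of 1024 pixels,
so each row covers exactly one 4 KiB page of the batch, and
DIV_ROUND_UP(len, 4096) rows cover the whole (page-rounded) batch. A
hypothetical sanity-check helper, with names invented for illustration:

    #define I830_BATCH_LIMIT (256*1024)
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    /* One 32bpp row of 1024 pixels = 4096 bytes of batch. */
    static unsigned int i830_wa_blit_rows(unsigned int len)
    {
            unsigned int rows = DIV_ROUND_UP(len, 4096);

            /* len > I830_BATCH_LIMIT is rejected with -ENOSPC above, so
             * at most 64 rows are ever blitted and the copy always fits
             * the I830_BATCH_LIMIT-sized scratch bo allocated at init. */
            return rows;
    }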
... ... @@ -1595,6 +1634,27 @@
1595 1634 ring->dispatch_execbuffer = i915_dispatch_execbuffer;
1596 1635 ring->init = init_render_ring;
1597 1636 ring->cleanup = render_ring_cleanup;
  1637 +
  1638 + /* Workaround batchbuffer to combat CS tlb bug. */
  1639 + if (HAS_BROKEN_CS_TLB(dev)) {
  1640 + struct drm_i915_gem_object *obj;
  1641 + int ret;
  1642 +
  1643 + obj = i915_gem_alloc_object(dev, I830_BATCH_LIMIT);
  1644 + if (obj == NULL) {
  1645 + DRM_ERROR("Failed to allocate batch bo\n");
  1646 + return -ENOMEM;
  1647 + }
  1648 +
  1649 + ret = i915_gem_object_pin(obj, 0, true, false);
  1650 + if (ret != 0) {
  1651 + drm_gem_object_unreference(&obj->base);
  1652 + DRM_ERROR("Failed to pin batch bo\n");
  1653 + return ret;
  1654 + }
  1655 +
  1656 + ring->private = obj;
  1657 + }
1598 1658  
1599 1659 return intel_init_ring_buffer(dev, ring);
1600 1660 }
drivers/gpu/drm/i915/intel_ringbuffer.h
... ... @@ -94,6 +94,7 @@
94 94 u32 offset, u32 length,
95 95 unsigned flags);
96 96 #define I915_DISPATCH_SECURE 0x1
  97 +#define I915_DISPATCH_PINNED 0x2
97 98 void (*cleanup)(struct intel_ring_buffer *ring);
98 99 int (*sync_to)(struct intel_ring_buffer *ring,
99 100 struct intel_ring_buffer *to,
include/uapi/drm/i915_drm.h
... ... @@ -307,6 +307,7 @@
307 307 #define I915_PARAM_HAS_PRIME_VMAP_FLUSH 21
308 308 #define I915_PARAM_RSVD_FOR_FUTURE_USE 22
309 309 #define I915_PARAM_HAS_SECURE_BATCHES 23
  310 +#define I915_PARAM_HAS_PINNED_BATCHES 24
310 311  
311 312 typedef struct drm_i915_getparam {
312 313 int param;
... ... @@ -676,6 +677,15 @@
676 677 * DRM_ROOT_ONLY | DRM_MASTER processes.
677 678 */
678 679 #define I915_EXEC_SECURE (1<<9)
  680 +
  681 +/** Inform the kernel that the batch is and will always be pinned. This
  682 + * negates the requirement for a workaround to be performed to avoid
  683 + * an incoherent CS (such as can be found on 830/845). If this flag is
  684 + * not passed, the kernel will endeavour to make sure the batch is
  685 + * coherent with the CS before execution. If this flag is passed,
  686 + * userspace assumes the responsibility for ensuring the same.
  687 + */
  688 +#define I915_EXEC_IS_PINNED (1<<10)
679 689  
680 690 #define I915_EXEC_CONTEXT_ID_MASK (0xffffffff)
681 691 #define i915_execbuffer2_set_context_id(eb2, context) \
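
Putting the two uapi pieces together, a hedged sketch of the per-execbuf
opt-out, reusing the has_pinned_batches() probe from above (struct setup
abbreviated; the exec object list is assumed to be built by the caller as
usual, and <string.h>/<stdint.h> are needed on top of the DRM headers):

    static void exec_batch(int fd, struct drm_i915_gem_exec_object2 *exec_objects,
                           unsigned int count, unsigned int batch_len)
    {
            struct drm_i915_gem_execbuffer2 execbuf;

            memset(&execbuf, 0, sizeof(execbuf));
            execbuf.buffers_ptr = (uintptr_t)exec_objects;
            execbuf.buffer_count = count;
            execbuf.batch_len = batch_len;
            execbuf.flags = I915_EXEC_RENDER;

            /* Only valid when the batch bo is pinned and its backing
             * storage never changes; otherwise let the kernel blit the
             * batch into the stable scratch bo. */
            if (has_pinned_batches(fd))
                    execbuf.flags |= I915_EXEC_IS_PINNED;

            drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
    }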