Blame view

kernel/rseq.c 10.3 KB
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
  // SPDX-License-Identifier: GPL-2.0+
  /*
   * Restartable sequences system call
   *
   * Copyright (C) 2015, Google, Inc.,
   * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com>
   * Copyright (C) 2015-2018, EfficiOS Inc.,
   * Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   */
  
  #include <linux/sched.h>
  #include <linux/uaccess.h>
  #include <linux/syscalls.h>
  #include <linux/rseq.h>
  #include <linux/types.h>
  #include <asm/ptrace.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/rseq.h>
  
  /*
   * Both critical-section flags that inhibit restart on migration and on
   * preemption, combined so they can be tested together as one mask.
   */
  #define RSEQ_CS_PREEMPT_MIGRATE_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE | \
  				       RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT)
  
  /*
   *
   * Restartable sequences are a lightweight interface that allows
   * user-level code to be executed atomically relative to scheduler
   * preemption and signal delivery. Typically used for implementing
   * per-cpu operations.
   *
   * It allows user-space to perform update operations on per-cpu data
   * without requiring heavy-weight atomic operations.
   *
   * Detailed algorithm of rseq user-space assembly sequences:
   *
   *                     init(rseq_cs)
   *                     cpu = TLS->rseq::cpu_id_start
   *   [1]               TLS->rseq::rseq_cs = rseq_cs
   *   [start_ip]        ----------------------------
   *   [2]               if (cpu != TLS->rseq::cpu_id)
   *                             goto abort_ip;
   *   [3]               <last_instruction_in_cs>
   *   [post_commit_ip]  ----------------------------
   *
   *   The address of jump target abort_ip must be outside the critical
   *   region, i.e.:
   *
   *     [abort_ip] < [start_ip]  || [abort_ip] >= [post_commit_ip]
   *
   *   Steps [2]-[3] (inclusive) need to be a sequence of instructions in
   *   userspace that can handle being interrupted between any of those
   *   instructions, and then resumed to the abort_ip.
   *
   *   1.  Userspace stores the address of the struct rseq_cs assembly
   *       block descriptor into the rseq_cs field of the registered
   *       struct rseq TLS area. This update is performed through a single
   *       store within the inline assembly instruction sequence.
   *       [start_ip]
   *
   *   2.  Userspace tests whether the current cpu_id field matches
   *       the cpu number loaded before start_ip, branching to abort_ip
   *       in case of a mismatch.
   *
   *       If the sequence is preempted or interrupted by a signal
   *       at or after start_ip and before post_commit_ip, then the kernel
   *       clears TLS->rseq::rseq_cs, and sets the user-space return
   *       ip to abort_ip before returning to user-space, so the preempted
   *       execution resumes at abort_ip.
   *
   *   3.  Userspace critical section final instruction before
   *       post_commit_ip is the commit. The critical section is
   *       self-terminating.
   *       [post_commit_ip]
   *
   *   4.  <success>
   *
   *   On failure at [2], or if interrupted by preempt or signal delivery
   *   between [1] and [3]:
   *
   *       [abort_ip]
   *   F1. <failure>
   */
  
  static int rseq_update_cpu_id(struct task_struct *t)
  {
  	u32 cpu_id = raw_smp_processor_id();
60af388d2   Eric Dumazet   rseq: Optimize rs...
87
  	struct rseq __user *rseq = t->rseq;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
88

60af388d2   Eric Dumazet   rseq: Optimize rs...
89
90
91
92
93
  	if (!user_write_access_begin(rseq, sizeof(*rseq)))
  		goto efault;
  	unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
  	unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
  	user_write_access_end();
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
94
95
  	trace_rseq_update(t);
  	return 0;
60af388d2   Eric Dumazet   rseq: Optimize rs...
96
97
98
99
100
  
  efault_end:
  	user_write_access_end();
  efault:
  	return -EFAULT;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
101
102
103
104
105
106
107
108
109
  }
  
  static int rseq_reset_rseq_cpu_id(struct task_struct *t)
  {
  	u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
  
  	/*
  	 * Reset cpu_id_start to its initial state (0).
  	 */
8f2817701   Mathieu Desnoyers   rseq: Use get_use...
110
  	if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
111
112
113
114
115
116
  		return -EFAULT;
  	/*
  	 * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
  	 * in after unregistration can figure out that rseq needs to be
  	 * registered again.
  	 */
8f2817701   Mathieu Desnoyers   rseq: Use get_use...
117
  	if (put_user(cpu_id, &t->rseq->cpu_id))
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
118
119
120
121
122
123
124
  		return -EFAULT;
  	return 0;
  }
  
  static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
  {
  	struct rseq_cs __user *urseq_cs;
ec9c82e03   Mathieu Desnoyers   rseq: uapi: Decla...
125
  	u64 ptr;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
126
127
128
  	u32 __user *usig;
  	u32 sig;
  	int ret;
5e0ccd4a3   Eric Dumazet   rseq: Optimise rs...
129
  #ifdef CONFIG_64BIT
3bb11f3f6   Mathieu Desnoyers   rseq: Remove brok...
130
  	if (get_user(ptr, &t->rseq->rseq_cs))
5e0ccd4a3   Eric Dumazet   rseq: Optimise rs...
131
132
  		return -EFAULT;
  #else
3bb11f3f6   Mathieu Desnoyers   rseq: Remove brok...
133
  	if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
ec9c82e03   Mathieu Desnoyers   rseq: uapi: Decla...
134
  		return -EFAULT;
5e0ccd4a3   Eric Dumazet   rseq: Optimise rs...
135
  #endif
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
136
137
138
139
  	if (!ptr) {
  		memset(rseq_cs, 0, sizeof(*rseq_cs));
  		return 0;
  	}
ec9c82e03   Mathieu Desnoyers   rseq: uapi: Decla...
140
141
142
  	if (ptr >= TASK_SIZE)
  		return -EINVAL;
  	urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
143
144
  	if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
  		return -EFAULT;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
145

e96d71359   Mathieu Desnoyers   rseq: Use __u64 f...
146
147
148
149
150
151
152
153
  	if (rseq_cs->start_ip >= TASK_SIZE ||
  	    rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
  	    rseq_cs->abort_ip >= TASK_SIZE ||
  	    rseq_cs->version > 0)
  		return -EINVAL;
  	/* Check for overflow. */
  	if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
  		return -EINVAL;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
154
155
156
  	/* Ensure that abort_ip is not in the critical section. */
  	if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
  		return -EINVAL;
e96d71359   Mathieu Desnoyers   rseq: Use __u64 f...
157
  	usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
158
159
160
161
162
163
164
165
166
  	ret = get_user(sig, usig);
  	if (ret)
  		return ret;
  
  	if (current->rseq_sig != sig) {
  		printk_ratelimited(KERN_WARNING
  			"Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).
  ",
  			sig, current->rseq_sig, current->pid, usig);
e96d71359   Mathieu Desnoyers   rseq: Use __u64 f...
167
  		return -EINVAL;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
168
169
170
171
172
173
174
175
176
177
  	}
  	return 0;
  }
  
  /*
   * Decide whether the critical section of @t has to be restarted.
   *
   * The flags word read from the user-space rseq area is OR-ed with
   * @cs_flags (the flags of the critical-section descriptor). The task's
   * rseq_event_mask is then read and cleared, and compared against those
   * flags.
   *
   * Returns 1 when at least one recorded event is not inhibited by the
   * flags, 0 when none is, -EINVAL on an invalid flag combination, or the
   * get_user() error code when the user-space flags cannot be read.
   */
  static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
  {
  	u32 flags, event_mask;
  	int ret;
  
  	/* Get thread flags. */
  	ret = get_user(flags, &t->rseq->flags);
  	if (ret)
  		return ret;
  
  	/* Take critical section flags into account. */
  	flags |= cs_flags;
  
  	/*
  	 * Restart on signal can only be inhibited when restart on
  	 * preempt and restart on migrate are inhibited too. Otherwise,
  	 * a preempted signal handler could fail to restart the prior
  	 * execution context on sigreturn.
  	 */
  	if (unlikely((flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) &&
  		     (flags & RSEQ_CS_PREEMPT_MIGRATE_FLAGS) !=
  		     RSEQ_CS_PREEMPT_MIGRATE_FLAGS))
  		return -EINVAL;
  
  	/*
  	 * Load and clear event mask atomically with respect to
  	 * scheduler preemption.
  	 */
  	preempt_disable();
  	event_mask = t->rseq_event_mask;
  	t->rseq_event_mask = 0;
  	preempt_enable();
  
  	return !!(event_mask & ~flags);
  }
  
  static int clear_rseq_cs(struct task_struct *t)
  {
  	/*
  	 * The rseq_cs field is set to NULL on preemption or signal
  	 * delivery on top of rseq assembly block, as well as on top
  	 * of code outside of the rseq assembly block. This performs
  	 * a lazy clear of the rseq_cs field.
  	 *
0fb9a1abc   Mathieu Desnoyers   rseq: uapi: Updat...
216
  	 * Set rseq_cs to NULL.
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
217
  	 */
5e0ccd4a3   Eric Dumazet   rseq: Optimise rs...
218
  #ifdef CONFIG_64BIT
3bb11f3f6   Mathieu Desnoyers   rseq: Remove brok...
219
  	return put_user(0UL, &t->rseq->rseq_cs);
5e0ccd4a3   Eric Dumazet   rseq: Optimise rs...
220
  #else
3bb11f3f6   Mathieu Desnoyers   rseq: Remove brok...
221
  	if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
ec9c82e03   Mathieu Desnoyers   rseq: uapi: Decla...
222
223
  		return -EFAULT;
  	return 0;
5e0ccd4a3   Eric Dumazet   rseq: Optimise rs...
224
  #endif
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
  }
  
  /*
   * Unsigned comparison will be true when ip >= start_ip, and when
   * ip < start_ip + post_commit_offset.
   */
  static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
  {
  	return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
  }
  
  static int rseq_ip_fixup(struct pt_regs *regs)
  {
  	unsigned long ip = instruction_pointer(regs);
  	struct task_struct *t = current;
  	struct rseq_cs rseq_cs;
  	int ret;
  
  	ret = rseq_get_rseq_cs(t, &rseq_cs);
  	if (ret)
  		return ret;
  
  	/*
  	 * Handle potentially not being within a critical section.
  	 * If not nested over a rseq critical section, restart is useless.
  	 * Clear the rseq_cs pointer and return.
  	 */
  	if (!in_rseq_cs(ip, &rseq_cs))
  		return clear_rseq_cs(t);
  	ret = rseq_need_restart(t, rseq_cs.flags);
  	if (ret <= 0)
  		return ret;
  	ret = clear_rseq_cs(t);
  	if (ret)
  		return ret;
  	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
  			    rseq_cs.abort_ip);
  	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
  	return 0;
  }
  
  /*
   * This resume handler must always be executed between any of:
   * - preemption,
   * - signal delivery,
   * and return to user-space.
   *
bff9504bf   Mathieu Desnoyers   rseq: Clean up co...
272
   * This is how we can ensure that the entire rseq critical section
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
273
274
275
276
   * will issue the commit instruction only if executed atomically with
   * respect to other threads scheduled on the same CPU, and with respect
   * to signal handlers.
   */
784e0300f   Will Deacon   rseq: Avoid infin...
277
  void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
278
279
  {
  	struct task_struct *t = current;
784e0300f   Will Deacon   rseq: Avoid infin...
280
  	int ret, sig;
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
281
282
283
  
  	if (unlikely(t->flags & PF_EXITING))
  		return;
8646e5363   Sean Christopherson   KVM: rseq: Update...
284
285
286
287
288
289
290
291
292
293
294
  
  	/*
  	 * regs is NULL if and only if the caller is in a syscall path.  Skip
  	 * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
  	 * kill a misbehaving userspace on debug kernels.
  	 */
  	if (regs) {
  		ret = rseq_ip_fixup(regs);
  		if (unlikely(ret < 0))
  			goto error;
  	}
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
295
296
297
298
299
  	if (unlikely(rseq_update_cpu_id(t)))
  		goto error;
  	return;
  
  error:
784e0300f   Will Deacon   rseq: Avoid infin...
300
  	sig = ksig ? ksig->sig : 0;
cb44c9a0a   Eric W. Biederman   signal: Remove ta...
301
  	force_sigsegv(sig);
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
  }
  
  #ifdef CONFIG_DEBUG_RSEQ
  
  /*
   * Terminate the process if a syscall is issued within a restartable
   * sequence.
   */
  void rseq_syscall(struct pt_regs *regs)
  {
  	unsigned long ip = instruction_pointer(regs);
  	struct task_struct *t = current;
  	struct rseq_cs rseq_cs;
  
  	if (!t->rseq)
  		return;
0ed960515   Eric Dumazet   rseq: Remove redu...
318
  	if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
3cf5d076f   Eric W. Biederman   signal: Remove ta...
319
  		force_sig(SIGSEGV);
d7822b1e2   Mathieu Desnoyers   rseq: Introduce r...
320
321
322
323
324
325
326
327
328
329
330
331
332
  }
  
  #endif
  
  /*
   * sys_rseq - setup restartable sequences for caller thread.
   *
   * With RSEQ_FLAG_UNREGISTER set in @flags, the current registration is
   * removed: @rseq, @rseq_len and @sig must match what was registered, the
   * user-space cpu_id fields are reset, and the task's rseq pointer and
   * signature are cleared.
   *
   * With @flags == 0, @rseq is registered for the current thread together
   * with @sig, and a notify-resume is requested for the task.
   *
   * Returns 0 on success, or:
   *   -EINVAL  unknown flags, wrong @rseq_len, misaligned @rseq, or @rseq
   *            differing from the registered address;
   *   -EPERM   @sig differs from the registered signature;
   *   -EBUSY   the same area is already registered;
   *   -EFAULT  @rseq fails access_ok(), or resetting the cpu_id fields on
   *            unregistration faults.
   */
  SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
  		int, flags, u32, sig)
  {
  	int ret;
  
  	if (flags & RSEQ_FLAG_UNREGISTER) {
  		if (flags & ~RSEQ_FLAG_UNREGISTER)
  			return -EINVAL;
  		/* Unregister rseq for current thread. */
  		if (current->rseq != rseq || !current->rseq)
  			return -EINVAL;
  		if (rseq_len != sizeof(*rseq))
  			return -EINVAL;
  		if (current->rseq_sig != sig)
  			return -EPERM;
  		ret = rseq_reset_rseq_cpu_id(current);
  		if (ret)
  			return ret;
  		current->rseq = NULL;
  		current->rseq_sig = 0;
  		return 0;
  	}
  
  	if (unlikely(flags))
  		return -EINVAL;
  
  	if (current->rseq) {
  		/*
  		 * If rseq is already registered, check whether
  		 * the provided address differs from the prior
  		 * one.
  		 */
  		if (current->rseq != rseq || rseq_len != sizeof(*rseq))
  			return -EINVAL;
  		if (current->rseq_sig != sig)
  			return -EPERM;
  		/* Already registered. */
  		return -EBUSY;
  	}
  
  	/*
  	 * If there was no rseq previously registered,
  	 * ensure the provided rseq is properly aligned and valid.
  	 */
  	if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) ||
  	    rseq_len != sizeof(*rseq))
  		return -EINVAL;
  	if (!access_ok(rseq, rseq_len))
  		return -EFAULT;
  	current->rseq = rseq;
  	current->rseq_sig = sig;
  	/*
  	 * If rseq was previously inactive, and has just been
  	 * registered, ensure the cpu_id_start and cpu_id fields
  	 * are updated before returning to user-space.
  	 */
  	rseq_set_notify_resume(current);
  
  	return 0;
  }