Eric Lee / smarc-ti-linux-kernel

1

/*

1

/*

2

* linux/kernel/exit.c

2

* linux/kernel/exit.c

3

*

3

*

4

5

*/

5

*/

6

7

#include <linux/mm.h>

7

#include <linux/mm.h>

8

#include <linux/slab.h>

8

#include <linux/slab.h>

9

#include <linux/interrupt.h>

9

#include <linux/interrupt.h>

10

#include <linux/module.h>

10

#include <linux/module.h>

11

#include <linux/capability.h>

11

#include <linux/capability.h>

12

#include <linux/completion.h>

12

#include <linux/completion.h>

13

#include <linux/personality.h>

13

#include <linux/personality.h>

14

#include <linux/tty.h>

14

#include <linux/tty.h>

15

#include <linux/iocontext.h>

15

#include <linux/iocontext.h>

16

#include <linux/key.h>

16

#include <linux/key.h>

17

#include <linux/security.h>

17

#include <linux/security.h>

18

#include <linux/cpu.h>

18

#include <linux/cpu.h>

19

#include <linux/acct.h>

19

#include <linux/acct.h>

20

#include <linux/tsacct_kern.h>

20

#include <linux/tsacct_kern.h>

21

#include <linux/file.h>

21

#include <linux/file.h>

22

#include <linux/fdtable.h>

22

#include <linux/fdtable.h>

23

#include <linux/freezer.h>

23

#include <linux/freezer.h>

24

#include <linux/binfmts.h>

24

#include <linux/binfmts.h>

25

#include <linux/nsproxy.h>

25

#include <linux/nsproxy.h>

26

#include <linux/pid_namespace.h>

26

#include <linux/pid_namespace.h>

27

#include <linux/ptrace.h>

27

#include <linux/ptrace.h>

28

#include <linux/profile.h>

28

#include <linux/profile.h>

29

#include <linux/mount.h>

29

#include <linux/mount.h>

30

#include <linux/proc_fs.h>

30

#include <linux/proc_fs.h>

31

#include <linux/kthread.h>

31

#include <linux/kthread.h>

32

#include <linux/mempolicy.h>

32

#include <linux/mempolicy.h>

33

#include <linux/taskstats_kern.h>

33

#include <linux/taskstats_kern.h>

34

#include <linux/delayacct.h>

34

#include <linux/delayacct.h>

35

#include <linux/cgroup.h>

35

#include <linux/cgroup.h>

36

#include <linux/syscalls.h>

36

#include <linux/syscalls.h>

37

#include <linux/signal.h>

37

#include <linux/signal.h>

38

#include <linux/posix-timers.h>

38

#include <linux/posix-timers.h>

39

#include <linux/cn_proc.h>

39

#include <linux/cn_proc.h>

40

#include <linux/mutex.h>

40

#include <linux/mutex.h>

41

#include <linux/futex.h>

41

#include <linux/futex.h>

42

#include <linux/pipe_fs_i.h>

42

#include <linux/pipe_fs_i.h>

43

#include <linux/audit.h> /* for audit_free() */

43

#include <linux/audit.h> /* for audit_free() */

44

#include <linux/resource.h>

44

#include <linux/resource.h>

45

#include <linux/blkdev.h>

45

#include <linux/blkdev.h>

46

#include <linux/task_io_accounting_ops.h>

46

#include <linux/task_io_accounting_ops.h>

47

#include <linux/tracehook.h>

47

#include <linux/tracehook.h>

48

#include <linux/fs_struct.h>

48

#include <linux/fs_struct.h>

49

#include <linux/init_task.h>

49

#include <linux/init_task.h>

50

#include <linux/perf_event.h>

50

#include <linux/perf_event.h>

51

#include <trace/events/sched.h>

51

#include <trace/events/sched.h>

52

#include <linux/hw_breakpoint.h>

52

#include <linux/hw_breakpoint.h>

53

#include <linux/oom.h>

53

#include <linux/oom.h>

54

#include <linux/writeback.h>

54

#include <linux/writeback.h>

55

#include <linux/shm.h>

55

#include <linux/shm.h>

56

57

#include <asm/uaccess.h>

57

#include <asm/uaccess.h>

58

#include <asm/unistd.h>

58

#include <asm/unistd.h>

59

#include <asm/pgtable.h>

59

#include <asm/pgtable.h>

60

#include <asm/mmu_context.h>

60

#include <asm/mmu_context.h>

61

62

/* Forward declaration; exit_mm() is defined further down in this file. */
static void exit_mm(struct task_struct *tsk);

63

64

/*
 * __unhash_process - remove a task from the pid links and task lists.
 * @p:          task being removed
 * @group_dead: true when the whole thread group is going away with @p
 *
 * Drops the global thread count and detaches @p's PIDTYPE_PID link.  When
 * @group_dead is set, the PGID/SID links, the ->tasks and ->sibling list
 * entries and the per-cpu process count are dropped as well.  The
 * ->thread_group and ->thread_node entries are removed unconditionally.
 */
static void __unhash_process(struct task_struct *p, bool group_dead)
{
	nr_threads--;
	detach_pid(p, PIDTYPE_PID);

	if (group_dead) {
		/* Group-wide links only go away together with the group. */
		detach_pid(p, PIDTYPE_PGID);
		detach_pid(p, PIDTYPE_SID);

		list_del_rcu(&p->tasks);
		list_del_init(&p->sibling);
		__this_cpu_dec(process_counts);
	}

	list_del_rcu(&p->thread_group);
	list_del_rcu(&p->thread_node);
}

79

80

/*

80

/*

81

* This function expects the tasklist_lock write-locked.

81

* This function expects the tasklist_lock write-locked.

82

*/

82

*/

83

static void __exit_signal(struct task_struct *tsk)

83

static void __exit_signal(struct task_struct *tsk)

84

{

84

{

85

struct signal_struct *sig = tsk->signal;

85

struct signal_struct *sig = tsk->signal;

86

bool group_dead = thread_group_leader(tsk);

86

bool group_dead = thread_group_leader(tsk);

87

struct sighand_struct *sighand;

87

struct sighand_struct *sighand;

88

struct tty_struct *uninitialized_var(tty);

88

struct tty_struct *uninitialized_var(tty);

89

cputime_t utime, stime;

89

cputime_t utime, stime;

90

91

sighand = rcu_dereference_check(tsk->sighand,

91

sighand = rcu_dereference_check(tsk->sighand,

92

lockdep_tasklist_lock_is_held());

92

lockdep_tasklist_lock_is_held());

93

spin_lock(&sighand->siglock);

93

spin_lock(&sighand->siglock);

94

95

posix_cpu_timers_exit(tsk);

95

posix_cpu_timers_exit(tsk);

96

if (group_dead) {

96

if (group_dead) {

97

posix_cpu_timers_exit_group(tsk);

97

posix_cpu_timers_exit_group(tsk);

98

tty = sig->tty;

98

tty = sig->tty;

99

sig->tty = NULL;

99

sig->tty = NULL;

100

} else {

100

} else {

101

/*

101

/*

102

* This can only happen if the caller is de_thread().

102

* This can only happen if the caller is de_thread().

103

* FIXME: this is the temporary hack, we should teach

103

* FIXME: this is the temporary hack, we should teach

104

* posix-cpu-timers to handle this case correctly.

104

* posix-cpu-timers to handle this case correctly.

105

*/

105

*/

106

if (unlikely(has_group_leader_pid(tsk)))

106

if (unlikely(has_group_leader_pid(tsk)))

107

posix_cpu_timers_exit_group(tsk);

107

posix_cpu_timers_exit_group(tsk);

108

109

/*

109

/*

110

* If there is any task waiting for the group exit

110

* If there is any task waiting for the group exit

111

* then notify it:

111

* then notify it:

112

*/

112

*/

113

if (sig->notify_count > 0 && !--sig->notify_count)

113

if (sig->notify_count > 0 && !--sig->notify_count)

114

wake_up_process(sig->group_exit_task);

114

wake_up_process(sig->group_exit_task);

115

116

if (tsk == sig->curr_target)

116

if (tsk == sig->curr_target)

117

sig->curr_target = next_thread(tsk);

117

sig->curr_target = next_thread(tsk);

118

}

118

}

119

120

/*

120

/*

121

* Accumulate here the counters for all threads but the group leader

121

* Accumulate here the counters for all threads but the group leader

122

* as they die, so they can be added into the process-wide totals

122

* as they die, so they can be added into the process-wide totals

123

* when those are taken. The group leader stays around as a zombie as

123

* when those are taken. The group leader stays around as a zombie as

124

* long as there are other threads. When it gets reaped, the exit.c

124

* long as there are other threads. When it gets reaped, the exit.c

125

* code will add its counts into these totals. We won't ever get here

125

* code will add its counts into these totals. We won't ever get here

126

* for the group leader, since it will have been the last reference on

126

* for the group leader, since it will have been the last reference on

127

* the signal_struct.

127

* the signal_struct.

128

*/

128

*/

129

task_cputime(tsk, &utime, &stime);

129

task_cputime(tsk, &utime, &stime);

130

write_seqlock(&sig->stats_lock);

130

write_seqlock(&sig->stats_lock);

131

sig->utime += utime;

131

sig->utime += utime;

132

sig->stime += stime;

132

sig->stime += stime;

133

sig->gtime += task_gtime(tsk);

133

sig->gtime += task_gtime(tsk);

134

sig->min_flt += tsk->min_flt;

134

sig->min_flt += tsk->min_flt;

135

sig->maj_flt += tsk->maj_flt;

135

sig->maj_flt += tsk->maj_flt;

136

sig->nvcsw += tsk->nvcsw;

136

sig->nvcsw += tsk->nvcsw;

137

sig->nivcsw += tsk->nivcsw;

137

sig->nivcsw += tsk->nivcsw;

138

sig->inblock += task_io_get_inblock(tsk);

138

sig->inblock += task_io_get_inblock(tsk);

139

sig->oublock += task_io_get_oublock(tsk);

139

sig->oublock += task_io_get_oublock(tsk);

140

task_io_accounting_add(&sig->ioac, &tsk->ioac);

140

task_io_accounting_add(&sig->ioac, &tsk->ioac);

141

sig->sum_sched_runtime += tsk->se.sum_exec_runtime;

141

sig->sum_sched_runtime += tsk->se.sum_exec_runtime;

142

sig->nr_threads--;

142

sig->nr_threads--;

143

__unhash_process(tsk, group_dead);

143

__unhash_process(tsk, group_dead);

144

write_sequnlock(&sig->stats_lock);

144

write_sequnlock(&sig->stats_lock);

145

146

/*

146

/*

147

* Do this under ->siglock, we can race with another thread

147

* Do this under ->siglock, we can race with another thread

148

* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.

148

* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.

149

*/

149

*/

150

flush_sigqueue(&tsk->pending);

150

flush_sigqueue(&tsk->pending);

151

tsk->sighand = NULL;

151

tsk->sighand = NULL;

152

spin_unlock(&sighand->siglock);

152

spin_unlock(&sighand->siglock);

153

154

__cleanup_sighand(sighand);

154

__cleanup_sighand(sighand);

155

clear_tsk_thread_flag(tsk, TIF_SIGPENDING);

155

clear_tsk_thread_flag(tsk, TIF_SIGPENDING);

156

if (group_dead) {

156

if (group_dead) {

157

flush_sigqueue(&sig->shared_pending);

157

flush_sigqueue(&sig->shared_pending);

158

tty_kref_put(tty);

158

tty_kref_put(tty);

159

}

159

}

160

}

160

}

161

162

static void delayed_put_task_struct(struct rcu_head *rhp)

162

static void delayed_put_task_struct(struct rcu_head *rhp)

163

{

163

{

164

struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

164

struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);

165

166

perf_event_delayed_put(tsk);

166

perf_event_delayed_put(tsk);

167

trace_sched_process_free(tsk);

167

trace_sched_process_free(tsk);

168

put_task_struct(tsk);

168

put_task_struct(tsk);

169

}

169

}

170

171

172

void release_task(struct task_struct *p)

172

void release_task(struct task_struct *p)

173

{

173

{

174

struct task_struct *leader;

174

struct task_struct *leader;

175

int zap_leader;

175

int zap_leader;

176

repeat:

176

repeat:

177

/* don't need to get the RCU readlock here - the process is dead and

177

/* don't need to get the RCU readlock here - the process is dead and

178

* can't be modifying its own credentials. But shut RCU-lockdep up */

178

* can't be modifying its own credentials. But shut RCU-lockdep up */

179

rcu_read_lock();

179

rcu_read_lock();

180

atomic_dec(&__task_cred(p)->user->processes);

180

atomic_dec(&__task_cred(p)->user->processes);

181

rcu_read_unlock();

181

rcu_read_unlock();

182

183

proc_flush_task(p);

183

proc_flush_task(p);

184

185

write_lock_irq(&tasklist_lock);

185

write_lock_irq(&tasklist_lock);

186

ptrace_release_task(p);

186

ptrace_release_task(p);

187

__exit_signal(p);

187

__exit_signal(p);

188

189

/*

189

/*

190

* If we are the last non-leader member of the thread

190

* If we are the last non-leader member of the thread

191

* group, and the leader is zombie, then notify the

191

* group, and the leader is zombie, then notify the

192

* group leader's parent process. (if it wants notification.)

192

* group leader's parent process. (if it wants notification.)

193

*/

193

*/

194

zap_leader = 0;

194

zap_leader = 0;

195

leader = p->group_leader;

195

leader = p->group_leader;

196

if (leader != p && thread_group_empty(leader)

196

if (leader != p && thread_group_empty(leader)

197

&& leader->exit_state == EXIT_ZOMBIE) {

197

&& leader->exit_state == EXIT_ZOMBIE) {

198

/*

198

/*

199

* If we were the last child thread and the leader has

199

* If we were the last child thread and the leader has

200

* exited already, and the leader's parent ignores SIGCHLD,

200

* exited already, and the leader's parent ignores SIGCHLD,

201

* then we are the one who should release the leader.

201

* then we are the one who should release the leader.

202

*/

202

*/

203

zap_leader = do_notify_parent(leader, leader->exit_signal);

203

zap_leader = do_notify_parent(leader, leader->exit_signal);

204

if (zap_leader)

204

if (zap_leader)

205

leader->exit_state = EXIT_DEAD;

205

leader->exit_state = EXIT_DEAD;

206

}

206

}

207

208

write_unlock_irq(&tasklist_lock);

208

write_unlock_irq(&tasklist_lock);

209

release_thread(p);

209

release_thread(p);

210

call_rcu(&p->rcu, delayed_put_task_struct);

210

call_rcu(&p->rcu, delayed_put_task_struct);

211

212

p = leader;

212

p = leader;

213

if (unlikely(zap_leader))

213

if (unlikely(zap_leader))

214

goto repeat;

214

goto repeat;

215

}

215

}

216

217

/*

217

/*

218

* This checks not only the pgrp, but falls back on the pid if no

218

* This checks not only the pgrp, but falls back on the pid if no

219

* satisfactory pgrp is found. I dunno - gdb doesn't work correctly

219

* satisfactory pgrp is found. I dunno - gdb doesn't work correctly

220

* without this...

220

* without this...

221

*

221

*

222

* The caller must hold rcu lock or the tasklist lock.

222

* The caller must hold rcu lock or the tasklist lock.

223

*/

223

*/

224

struct pid *session_of_pgrp(struct pid *pgrp)

224

struct pid *session_of_pgrp(struct pid *pgrp)

225

{

225

{

226

struct task_struct *p;

226

struct task_struct *p;

227

struct pid *sid = NULL;

227

struct pid *sid = NULL;

228

229

p = pid_task(pgrp, PIDTYPE_PGID);

229

p = pid_task(pgrp, PIDTYPE_PGID);

230

if (p == NULL)

230

if (p == NULL)

231

p = pid_task(pgrp, PIDTYPE_PID);

231

p = pid_task(pgrp, PIDTYPE_PID);

232

if (p != NULL)

232

if (p != NULL)

233

sid = task_session(p);

233

sid = task_session(p);

234

235

return sid;

235

return sid;

236

}

236

}

237

238

/*

238

/*

239

* Determine if a process group is "orphaned", according to the POSIX

239

* Determine if a process group is "orphaned", according to the POSIX

240

* definition in 2.2.2.52. Orphaned process groups are not to be affected

240

* definition in 2.2.2.52. Orphaned process groups are not to be affected

241

* by terminal-generated stop signals. Newly orphaned process groups are

241

* by terminal-generated stop signals. Newly orphaned process groups are

242

* to receive a SIGHUP and a SIGCONT.

242

* to receive a SIGHUP and a SIGCONT.

243

*

243

*

244

* "I ask you, have you ever known what it is to be an orphan?"

244

* "I ask you, have you ever known what it is to be an orphan?"

245

*/

245

*/

246

static int will_become_orphaned_pgrp(struct pid *pgrp,

246

static int will_become_orphaned_pgrp(struct pid *pgrp,

247

struct task_struct *ignored_task)

247

struct task_struct *ignored_task)

248

{

248

{

249

struct task_struct *p;

249

struct task_struct *p;

250

251

do_each_pid_task(pgrp, PIDTYPE_PGID, p) {

251

do_each_pid_task(pgrp, PIDTYPE_PGID, p) {

252

if ((p == ignored_task) ||

252

if ((p == ignored_task) ||

253

(p->exit_state && thread_group_empty(p)) ||

253

(p->exit_state && thread_group_empty(p)) ||

254

is_global_init(p->real_parent))

254

is_global_init(p->real_parent))

255

continue;

255

continue;

256

257

if (task_pgrp(p->real_parent) != pgrp &&

257

if (task_pgrp(p->real_parent) != pgrp &&

258

task_session(p->real_parent) == task_session(p))

258

task_session(p->real_parent) == task_session(p))

259

return 0;

259

return 0;

260

} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

260

} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

261

262

return 1;

262

return 1;

263

}

263

}

264

265

int is_current_pgrp_orphaned(void)

265

int is_current_pgrp_orphaned(void)

266

{

266

{

267

int retval;

267

int retval;

268

269

read_lock(&tasklist_lock);

269

read_lock(&tasklist_lock);

270

retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);

270

retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);

271

read_unlock(&tasklist_lock);

271

read_unlock(&tasklist_lock);

272

273

return retval;

273

return retval;

274

}

274

}

275

276

static bool has_stopped_jobs(struct pid *pgrp)

276

static bool has_stopped_jobs(struct pid *pgrp)

277

{

277

{

278

struct task_struct *p;

278

struct task_struct *p;

279

280

do_each_pid_task(pgrp, PIDTYPE_PGID, p) {

280

do_each_pid_task(pgrp, PIDTYPE_PGID, p) {

281

if (p->signal->flags & SIGNAL_STOP_STOPPED)

281

if (p->signal->flags & SIGNAL_STOP_STOPPED)

282

return true;

282

return true;

283

} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

283

} while_each_pid_task(pgrp, PIDTYPE_PGID, p);

284

285

return false;

285

return false;

286

}

286

}

287

288

/*

288

/*

289

* Check to see if any process groups have become orphaned as

289

* Check to see if any process groups have become orphaned as

290

* a result of our exiting, and if they have any stopped jobs,

290

* a result of our exiting, and if they have any stopped jobs,

291

* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)

291

* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)

292

*/

292

*/

293

static void

293

static void

294

kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)

294

kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)

295

{

295

{

296

struct pid *pgrp = task_pgrp(tsk);

296

struct pid *pgrp = task_pgrp(tsk);

297

struct task_struct *ignored_task = tsk;

297

struct task_struct *ignored_task = tsk;

298

299

if (!parent)

299

if (!parent)

300

/* exit: our father is in a different pgrp than

300

/* exit: our father is in a different pgrp than

301

* we are and we were the only connection outside.

301

* we are and we were the only connection outside.

302

*/

302

*/

303

parent = tsk->real_parent;

303

parent = tsk->real_parent;

304

else

304

else

305

/* reparent: our child is in a different pgrp than

305

/* reparent: our child is in a different pgrp than

306

* we are, and it was the only connection outside.

306

* we are, and it was the only connection outside.

307

*/

307

*/

308

ignored_task = NULL;

308

ignored_task = NULL;

309

310

if (task_pgrp(parent) != pgrp &&

310

if (task_pgrp(parent) != pgrp &&

311

task_session(parent) == task_session(tsk) &&

311

task_session(parent) == task_session(tsk) &&

312

will_become_orphaned_pgrp(pgrp, ignored_task) &&

312

will_become_orphaned_pgrp(pgrp, ignored_task) &&

313

has_stopped_jobs(pgrp)) {

313

has_stopped_jobs(pgrp)) {

314

__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);

314

__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);

315

__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);

315

__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);

316

}

316

}

317

}

317

}

318

319

#ifdef CONFIG_MEMCG
/*
 * mm_update_next_owner - pick a new mm->owner when the owner goes away.
 * @mm: address space whose owner may need replacing
 *
 * A task is exiting.  If it owned this mm, find a new owner for the mm.
 * Candidates are searched among current's children, then its siblings,
 * then every thread in the system; when none is found mm->owner is set
 * to NULL.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
	struct task_struct *self = current;
	struct task_struct *proc, *cand;

retry:
	/*
	 * If the exiting or execing task is not the owner, it's
	 * someone else's problem.
	 */
	if (mm->owner != self)
		return;

	/*
	 * The current owner is exiting/execing and there are no other
	 * candidates.  Do not leave the mm pointing to a possibly
	 * freed task structure.
	 */
	if (atomic_read(&mm->mm_users) <= 1) {
		mm->owner = NULL;
		return;
	}

	read_lock(&tasklist_lock);

	/* Search in the children */
	list_for_each_entry(cand, &self->children, sibling) {
		if (cand->mm == mm)
			goto assign_new_owner;
	}

	/* Search in the siblings */
	list_for_each_entry(cand, &self->real_parent->children, sibling) {
		if (cand->mm == mm)
			goto assign_new_owner;
	}

	/* Search through everything else, we should not get here often. */
	for_each_process(proc) {
		if (proc->flags & PF_KTHREAD)
			continue;
		for_each_thread(proc, cand) {
			if (cand->mm == mm)
				goto assign_new_owner;
			/* A thread with some other mm ends this group's scan. */
			if (cand->mm)
				break;
		}
	}
	read_unlock(&tasklist_lock);

	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
	 */
	mm->owner = NULL;
	return;

assign_new_owner:
	BUG_ON(cand == self);
	get_task_struct(cand);

	/*
	 * The task_lock protects cand->mm from changing.
	 * We always want mm->owner->mm == mm
	 */
	task_lock(cand);

	/*
	 * Delay read_unlock() till we have the task_lock()
	 * to ensure that the candidate does not slip away underneath us
	 */
	read_unlock(&tasklist_lock);

	if (cand->mm != mm) {
		/* The candidate changed mm before we locked it: start over. */
		task_unlock(cand);
		put_task_struct(cand);
		goto retry;
	}

	mm->owner = cand;
	task_unlock(cand);
	put_task_struct(cand);
}
#endif /* CONFIG_MEMCG */

406

407

/*

407

/*

408

* Turn us into a lazy TLB process if we

408

* Turn us into a lazy TLB process if we

409

* aren't already..

409

* aren't already..

410

*/

410

*/

411

static void exit_mm(struct task_struct *tsk)

411

static void exit_mm(struct task_struct *tsk)

412

{

412

{

413

struct mm_struct *mm = tsk->mm;

413

struct mm_struct *mm = tsk->mm;

414

struct core_state *core_state;

414

struct core_state *core_state;

415

416

mm_release(tsk, mm);

416

mm_release(tsk, mm);

417

if (!mm)

417

if (!mm)

418

return;

418

return;

419

sync_mm_rss(mm);

419

sync_mm_rss(mm);

420

/*

420

/*

421

* Serialize with any possible pending coredump.

421

* Serialize with any possible pending coredump.

422

* We must hold mmap_sem around checking core_state

422

* We must hold mmap_sem around checking core_state

423

* and clearing tsk->mm. The core-inducing thread

423

* and clearing tsk->mm. The core-inducing thread

424

* will increment ->nr_threads for each thread in the

424

* will increment ->nr_threads for each thread in the

425

* group with ->mm != NULL.

425

* group with ->mm != NULL.

426

*/

426

*/

427

down_read(&mm->mmap_sem);

427

down_read(&mm->mmap_sem);

428

core_state = mm->core_state;

428

core_state = mm->core_state;

429

if (core_state) {

429

if (core_state) {

430

struct core_thread self;

430

struct core_thread self;

431

432

up_read(&mm->mmap_sem);

432

up_read(&mm->mmap_sem);

433

434

self.task = tsk;

434

self.task = tsk;

435

self.next = xchg(&core_state->dumper.next, &self);

435

self.next = xchg(&core_state->dumper.next, &self);

436

/*

436

/*

437

* Implies mb(), the result of xchg() must be visible

437

* Implies mb(), the result of xchg() must be visible

438

* to core_state->dumper.

438

* to core_state->dumper.

439

*/

439

*/

440

if (atomic_dec_and_test(&core_state->nr_threads))

440

if (atomic_dec_and_test(&core_state->nr_threads))

441

complete(&core_state->startup);

441

complete(&core_state->startup);

442

443

for (;;) {

443

for (;;) {

444

set_task_state(tsk, TASK_UNINTERRUPTIBLE);

444

set_task_state(tsk, TASK_UNINTERRUPTIBLE);

445

if (!self.task) /* see coredump_finish() */

445

if (!self.task) /* see coredump_finish() */

446

break;

446

break;

447

freezable_schedule();

447

freezable_schedule();

448

}

448

}

449

__set_task_state(tsk, TASK_RUNNING);

449

__set_task_state(tsk, TASK_RUNNING);

450

down_read(&mm->mmap_sem);

450

down_read(&mm->mmap_sem);

451

}

451

}

452

atomic_inc(&mm->mm_count);

452

atomic_inc(&mm->mm_count);

453

BUG_ON(mm != tsk->active_mm);

453

BUG_ON(mm != tsk->active_mm);

454

/* more a memory barrier than a real lock */

454

/* more a memory barrier than a real lock */

455

task_lock(tsk);

455

task_lock(tsk);

456

tsk->mm = NULL;

456

tsk->mm = NULL;

457

up_read(&mm->mmap_sem);

457

up_read(&mm->mmap_sem);

458

enter_lazy_tlb(mm, current);

458

enter_lazy_tlb(mm, current);

459

task_unlock(tsk);

459

task_unlock(tsk);

460

mm_update_next_owner(mm);

460

mm_update_next_owner(mm);

461

mmput(mm);

461

mmput(mm);

462

clear_thread_flag(TIF_MEMDIE);

462

clear_thread_flag(TIF_MEMDIE);

463

}

463

}

464

465

/*

465

/*

466

* When we die, we re-parent all our children, and try to:

466

* When we die, we re-parent all our children, and try to:

467

* 1. give them to another thread in our thread group, if such a member exists

467

* 1. give them to another thread in our thread group, if such a member exists

468

* 2. give it to the first ancestor process which prctl'd itself as a

468

* 2. give it to the first ancestor process which prctl'd itself as a

469

* child_subreaper for its children (like a service manager)

469

* child_subreaper for its children (like a service manager)

470

* 3. give it to the init process (PID 1) in our pid namespace

470

* 3. give it to the init process (PID 1) in our pid namespace

471

*/

471

*/

472

static struct task_struct *find_new_reaper(struct task_struct *father)

472

static struct task_struct *find_new_reaper(struct task_struct *father)

473

__releases(&tasklist_lock)

473

__releases(&tasklist_lock)

474

__acquires(&tasklist_lock)

474

__acquires(&tasklist_lock)

475

{

475

{

476

struct pid_namespace *pid_ns = task_active_pid_ns(father);

476

struct pid_namespace *pid_ns = task_active_pid_ns(father);

477

struct task_struct *thread;

477

struct task_struct *thread;

478

479

thread = father;

479

thread = father;

480

while_each_thread(father, thread) {

480

while_each_thread(father, thread) {

481

if (thread->flags & PF_EXITING)

481

if (thread->flags & PF_EXITING)

482

continue;

482

continue;

483

if (unlikely(pid_ns->child_reaper == father))

483

if (unlikely(pid_ns->child_reaper == father))

484

pid_ns->child_reaper = thread;

484

pid_ns->child_reaper = thread;

485

return thread;

485

return thread;

486

}

486

}

487

488

if (unlikely(pid_ns->child_reaper == father)) {

488

if (unlikely(pid_ns->child_reaper == father)) {

489

write_unlock_irq(&tasklist_lock);

489

write_unlock_irq(&tasklist_lock);

490

if (unlikely(pid_ns == &init_pid_ns)) {

490

if (unlikely(pid_ns == &init_pid_ns)) {

491

panic("Attempted to kill init! exitcode=0x%08x\n",

491

panic("Attempted to kill init! exitcode=0x%08x\n",

492

father->signal->group_exit_code ?:

492

father->signal->group_exit_code ?:

493

father->exit_code);

493

father->exit_code);

494

}

494

}

495

496

zap_pid_ns_processes(pid_ns);

496

zap_pid_ns_processes(pid_ns);

497

write_lock_irq(&tasklist_lock);

497

write_lock_irq(&tasklist_lock);

498

} else if (father->signal->has_child_subreaper) {

498

} else if (father->signal->has_child_subreaper) {

499

struct task_struct *reaper;

499

struct task_struct *reaper;

500

501

/*

501

/*

502

* Find the first ancestor marked as child_subreaper.

502

* Find the first ancestor marked as child_subreaper.

503

* Note that the code below checks same_thread_group(reaper,

503

* Note that the code below checks same_thread_group(reaper,

504

* pid_ns->child_reaper). This is what we need to DTRT in a

504

* pid_ns->child_reaper). This is what we need to DTRT in a

505

* PID namespace. However we still need the check above, see

505

* PID namespace. However we still need the check above, see

506

* http://marc.info/?l=linux-kernel&m=131385460420380

506

* http://marc.info/?l=linux-kernel&m=131385460420380

507

*/

507

*/

508

for (reaper = father->real_parent;

508

for (reaper = father->real_parent;

509

reaper != &init_task;

509

reaper != &init_task;

510

reaper = reaper->real_parent) {

510

reaper = reaper->real_parent) {

511

if (same_thread_group(reaper, pid_ns->child_reaper))

511

if (same_thread_group(reaper, pid_ns->child_reaper))

512

break;

512

break;

513

if (!reaper->signal->is_child_subreaper)

513

if (!reaper->signal->is_child_subreaper)

514

continue;

514

continue;

515

thread = reaper;

515

thread = reaper;

516

do {

516

do {

517

if (!(thread->flags & PF_EXITING))

517

if (!(thread->flags & PF_EXITING))

518

return reaper;

518

return reaper;

519

} while_each_thread(reaper, thread);

519

} while_each_thread(reaper, thread);

520

}

520

}

521

}

521

}

522

523

return pid_ns->child_reaper;

523

return pid_ns->child_reaper;

524

}

524

}

525

526

/*

526

/*

527

* Any that need to be release_task'd are put on the @dead list.

527

* Any that need to be release_task'd are put on the @dead list.

528

*/

528

*/

529

static void reparent_leader(struct task_struct *father, struct task_struct *p,

529

static void reparent_leader(struct task_struct *father, struct task_struct *p,

530

struct list_head *dead)

530

struct list_head *dead)

531

{

531

{

532

list_move_tail(&p->sibling, &p->real_parent->children);

532

list_move_tail(&p->sibling, &p->real_parent->children);

533

534

if (p->exit_state == EXIT_DEAD)

534

if (p->exit_state == EXIT_DEAD)

535

return;

535

return;

536

/*

536

/*

537

* If this is a threaded reparent there is no need to

537

* If this is a threaded reparent there is no need to

538

* notify anyone anything has happened.

538

* notify anyone anything has happened.

539

*/

539

*/

540

if (same_thread_group(p->real_parent, father))

540

if (same_thread_group(p->real_parent, father))

541

return;

541

return;

542

543

/* We don't want people slaying init. */

543

/* We don't want people slaying init. */

544

p->exit_signal = SIGCHLD;

544

p->exit_signal = SIGCHLD;

545

546

/* If it has exited notify the new parent about this child's death. */

546

/* If it has exited notify the new parent about this child's death. */

547

if (!p->ptrace &&

547

if (!p->ptrace &&

548

p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {

548

p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {

549

if (do_notify_parent(p, p->exit_signal)) {

549

if (do_notify_parent(p, p->exit_signal)) {

550

p->exit_state = EXIT_DEAD;

550

p->exit_state = EXIT_DEAD;

551

list_move_tail(&p->sibling, dead);

551

list_move_tail(&p->sibling, dead);

552

}

552

}

553

}

553

}

554

555

kill_orphaned_pgrp(p, father);

555

kill_orphaned_pgrp(p, father);

556

}

556

}

557

558

static void forget_original_parent(struct task_struct *father)

558

static void forget_original_parent(struct task_struct *father)

559

{

559

{

560

struct task_struct *p, *n, *reaper;

560

struct task_struct *p, *n, *reaper;

561

LIST_HEAD(dead_children);

561

LIST_HEAD(dead_children);

562

563

write_lock_irq(&tasklist_lock);

563

write_lock_irq(&tasklist_lock);

564

/*

564

/*

565

* Note that exit_ptrace() and find_new_reaper() might

565

* Note that exit_ptrace() and find_new_reaper() might

566

* drop tasklist_lock and reacquire it.

566

* drop tasklist_lock and reacquire it.

567

*/

567

*/

568

exit_ptrace(father);

568

exit_ptrace(father);

569

reaper = find_new_reaper(father);

569

reaper = find_new_reaper(father);

570

571

list_for_each_entry_safe(p, n, &father->children, sibling) {

571

list_for_each_entry_safe(p, n, &father->children, sibling) {

572

struct task_struct *t = p;

572

struct task_struct *t = p;

573

574

do {

574

do {

575

t->real_parent = reaper;

575

t->real_parent = reaper;

576

if (t->parent == father) {

576

if (t->parent == father) {

577

BUG_ON(t->ptrace);

577

BUG_ON(t->ptrace);

578

t->parent = t->real_parent;

578

t->parent = t->real_parent;

579

}

579

}

580

if (t->pdeath_signal)

580

if (t->pdeath_signal)

581

group_send_sig_info(t->pdeath_signal,

581

group_send_sig_info(t->pdeath_signal,

582

SEND_SIG_NOINFO, t);

582

SEND_SIG_NOINFO, t);

583

} while_each_thread(p, t);

583

} while_each_thread(p, t);

584

reparent_leader(father, p, &dead_children);

584

reparent_leader(father, p, &dead_children);

585

}

585

}

586

write_unlock_irq(&tasklist_lock);

586

write_unlock_irq(&tasklist_lock);

587

588

BUG_ON(!list_empty(&father->children));

588

BUG_ON(!list_empty(&father->children));

589

590

list_for_each_entry_safe(p, n, &dead_children, sibling) {

590

list_for_each_entry_safe(p, n, &dead_children, sibling) {

591

list_del_init(&p->sibling);

591

list_del_init(&p->sibling);

592

release_task(p);

592

release_task(p);

593

}

593

}

594

}

594

}

595

596

/*

596

/*

597

* Send signals to all our closest relatives so that they know

597

* Send signals to all our closest relatives so that they know

598

* to properly mourn us..

598

* to properly mourn us..

599

*/

599

*/

600

static void exit_notify(struct task_struct *tsk, int group_dead)

600

static void exit_notify(struct task_struct *tsk, int group_dead)

601

{

601

{

602

bool autoreap;

602

bool autoreap;

603

604

/*

604

/*

605

* This does two things:

605

* This does two things:

606

*

606

*

607

* A. Make init inherit all the child processes

607

* A. Make init inherit all the child processes

608

* B. Check to see if any process groups have become orphaned

608

* B. Check to see if any process groups have become orphaned

609

* as a result of our exiting, and if they have any stopped

609

* as a result of our exiting, and if they have any stopped

610

* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)

610

* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)

611

*/

611

*/

612

forget_original_parent(tsk);

612

forget_original_parent(tsk);

613

614

write_lock_irq(&tasklist_lock);

614

write_lock_irq(&tasklist_lock);

615

if (group_dead)

615

if (group_dead)

616

kill_orphaned_pgrp(tsk->group_leader, NULL);

616

kill_orphaned_pgrp(tsk->group_leader, NULL);

617

618

if (unlikely(tsk->ptrace)) {

618

if (unlikely(tsk->ptrace)) {

619

int sig = thread_group_leader(tsk) &&

619

int sig = thread_group_leader(tsk) &&

620

thread_group_empty(tsk) &&

620

thread_group_empty(tsk) &&

621

!ptrace_reparented(tsk) ?

621

!ptrace_reparented(tsk) ?

622

tsk->exit_signal : SIGCHLD;

622

tsk->exit_signal : SIGCHLD;

623

autoreap = do_notify_parent(tsk, sig);

623

autoreap = do_notify_parent(tsk, sig);

624

} else if (thread_group_leader(tsk)) {

624

} else if (thread_group_leader(tsk)) {

625

autoreap = thread_group_empty(tsk) &&

625

autoreap = thread_group_empty(tsk) &&

626

do_notify_parent(tsk, tsk->exit_signal);

626

do_notify_parent(tsk, tsk->exit_signal);

627

} else {

627

} else {

628

autoreap = true;

628

autoreap = true;

629

}

629

}

630

631

tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;

631

tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;

632

633

/* mt-exec, de_thread() is waiting for group leader */

633

/* mt-exec, de_thread() is waiting for group leader */

634

if (unlikely(tsk->signal->notify_count < 0))

634

if (unlikely(tsk->signal->notify_count < 0))

635

wake_up_process(tsk->signal->group_exit_task);

635

wake_up_process(tsk->signal->group_exit_task);

636

write_unlock_irq(&tasklist_lock);

636

write_unlock_irq(&tasklist_lock);

637

638

/* If the process is dead, release it - nobody will wait for it */

638

/* If the process is dead, release it - nobody will wait for it */

639

if (autoreap)

639

if (autoreap)

640

release_task(tsk);

640

release_task(tsk);

641

}

641

}

642

643

#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Report the current task when it has left less unused stack than any task
 * checked before it, and remember that amount as the new low-water mark.
 */
static void check_stack_usage(void)
{
	static DEFINE_SPINLOCK(low_water_lock);
	static int lowest_to_date = THREAD_SIZE;
	unsigned long unused = stack_not_used(current);

	/* Cheap unlocked test first; re-checked under the lock below. */
	if (unused < lowest_to_date) {
		spin_lock(&low_water_lock);
		if (unused < lowest_to_date) {
			pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
				current->comm, task_pid_nr(current), unused);
			lowest_to_date = unused;
		}
		spin_unlock(&low_water_lock);
	}
}
#else
/* Stack usage debugging is compiled out: nothing to check. */
static inline void check_stack_usage(void)
{
}
#endif

666

667

void do_exit(long code)

667

void do_exit(long code)

668

{

668

{

669

struct task_struct *tsk = current;

669

struct task_struct *tsk = current;

670

int group_dead;

670

int group_dead;

671

TASKS_RCU(int tasks_rcu_i);

671

TASKS_RCU(int tasks_rcu_i);

672

673

profile_task_exit(tsk);

673

profile_task_exit(tsk);

674

675

WARN_ON(blk_needs_flush_plug(tsk));

675

WARN_ON(blk_needs_flush_plug(tsk));

676

677

if (unlikely(in_interrupt()))

677

if (unlikely(in_interrupt()))

678

panic("Aiee, killing interrupt handler!");

678

panic("Aiee, killing interrupt handler!");

679

if (unlikely(!tsk->pid))

679

if (unlikely(!tsk->pid))

680

panic("Attempted to kill the idle task!");

680

panic("Attempted to kill the idle task!");

681

682

/*

682

/*

683

* If do_exit is called because this processes oopsed, it's possible

683

* If do_exit is called because this processes oopsed, it's possible

684

* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before

684

* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before

685

* continuing. Amongst other possible reasons, this is to prevent

685

* continuing. Amongst other possible reasons, this is to prevent

686

* mm_release()->clear_child_tid() from writing to a user-controlled

686

* mm_release()->clear_child_tid() from writing to a user-controlled

687

* kernel address.

687

* kernel address.

688

*/

688

*/

689

set_fs(USER_DS);

689

set_fs(USER_DS);

690

691

ptrace_event(PTRACE_EVENT_EXIT, code);

691

ptrace_event(PTRACE_EVENT_EXIT, code);

692

693

validate_creds_for_do_exit(tsk);

693

validate_creds_for_do_exit(tsk);

694

695

/*

695

/*

696

* We're taking recursive faults here in do_exit. Safest is to just

696

* We're taking recursive faults here in do_exit. Safest is to just

697

* leave this task alone and wait for reboot.

697

* leave this task alone and wait for reboot.

698

*/

698

*/

699

if (unlikely(tsk->flags & PF_EXITING)) {

699

if (unlikely(tsk->flags & PF_EXITING)) {

700

pr_alert("Fixing recursive fault but reboot is needed!\n");

700

pr_alert("Fixing recursive fault but reboot is needed!\n");

701

/*

701

/*

702

* We can do this unlocked here. The futex code uses

702

* We can do this unlocked here. The futex code uses

703

* this flag just to verify whether the pi state

703

* this flag just to verify whether the pi state

704

* cleanup has been done or not. In the worst case it

704

* cleanup has been done or not. In the worst case it

705

* loops once more. We pretend that the cleanup was

705

* loops once more. We pretend that the cleanup was

706

* done as there is no way to return. Either the

706

* done as there is no way to return. Either the

707

* OWNER_DIED bit is set by now or we push the blocked

707

* OWNER_DIED bit is set by now or we push the blocked

708

* task into the wait for ever nirwana as well.

708

* task into the wait for ever nirwana as well.

709

*/

709

*/

710

tsk->flags |= PF_EXITPIDONE;

710

tsk->flags |= PF_EXITPIDONE;

711

set_current_state(TASK_UNINTERRUPTIBLE);

711

set_current_state(TASK_UNINTERRUPTIBLE);

712

schedule();

712

schedule();

713

}

713

}

714

715

exit_signals(tsk); /* sets PF_EXITING */

715

exit_signals(tsk); /* sets PF_EXITING */

716

/*

716

/*

717

* tsk->flags are checked in the futex code to protect against

717

* tsk->flags are checked in the futex code to protect against

718

* an exiting task cleaning up the robust pi futexes.

718

* an exiting task cleaning up the robust pi futexes.

719

*/

719

*/

720

smp_mb();

720

smp_mb();

721

raw_spin_unlock_wait(&tsk->pi_lock);

721

raw_spin_unlock_wait(&tsk->pi_lock);

722

723

if (unlikely(in_atomic()))

723

if (unlikely(in_atomic()))

724

pr_info("note: %s[%d] exited with preempt_count %d\n",

724

pr_info("note: %s[%d] exited with preempt_count %d\n",

725

current->comm, task_pid_nr(current),

725

current->comm, task_pid_nr(current),

726

preempt_count());

726

preempt_count());

727

728

acct_update_integrals(tsk);

728

acct_update_integrals(tsk);

729

/* sync mm's RSS info before statistics gathering */

729

/* sync mm's RSS info before statistics gathering */

730

if (tsk->mm)

730

if (tsk->mm)

731

sync_mm_rss(tsk->mm);

731

sync_mm_rss(tsk->mm);

732

group_dead = atomic_dec_and_test(&tsk->signal->live);

732

group_dead = atomic_dec_and_test(&tsk->signal->live);

733

if (group_dead) {

733

if (group_dead) {

734

hrtimer_cancel(&tsk->signal->real_timer);

734

hrtimer_cancel(&tsk->signal->real_timer);

735

exit_itimers(tsk->signal);

735

exit_itimers(tsk->signal);

736

if (tsk->mm)

736

if (tsk->mm)

737

setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);

737

setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);

738

}

738

}

739

acct_collect(code, group_dead);

739

acct_collect(code, group_dead);

740

if (group_dead)

740

if (group_dead)

741

tty_audit_exit();

741

tty_audit_exit();

742

audit_free(tsk);

742

audit_free(tsk);

743

744

tsk->exit_code = code;

744

tsk->exit_code = code;

745

taskstats_exit(tsk, group_dead);

745

taskstats_exit(tsk, group_dead);

746

747

exit_mm(tsk);

747

exit_mm(tsk);

748

749

if (group_dead)

749

if (group_dead)

750

acct_process();

750

acct_process();

751

trace_sched_process_exit(tsk);

751

trace_sched_process_exit(tsk);

752

753

exit_sem(tsk);

753

exit_sem(tsk);

754

exit_shm(tsk);

754

exit_shm(tsk);

755

exit_files(tsk);

755

exit_files(tsk);

756

exit_fs(tsk);

756

exit_fs(tsk);

757

if (group_dead)

757

if (group_dead)

758

disassociate_ctty(1);

758

disassociate_ctty(1);

759

exit_task_namespaces(tsk);

759

exit_task_namespaces(tsk);

760

exit_task_work(tsk);

760

exit_task_work(tsk);

761

exit_thread();

761

exit_thread();

762

763

/*

763

/*

764

* Flush inherited counters to the parent - before the parent

764

* Flush inherited counters to the parent - before the parent

765

* gets woken up by child-exit notifications.

765

* gets woken up by child-exit notifications.

766

*

766

*

767

* because of cgroup mode, must be called before cgroup_exit()

767

* because of cgroup mode, must be called before cgroup_exit()

768

*/

768

*/

769

perf_event_exit_task(tsk);

769

perf_event_exit_task(tsk);

770

771

cgroup_exit(tsk);

771

cgroup_exit(tsk);

772

773

module_put(task_thread_info(tsk)->exec_domain->module);

773

module_put(task_thread_info(tsk)->exec_domain->module);

774

775

/*

775

/*

776

* FIXME: do that only when needed, using sched_exit tracepoint

776

* FIXME: do that only when needed, using sched_exit tracepoint

777

*/

777

*/

778

flush_ptrace_hw_breakpoint(tsk);

778

flush_ptrace_hw_breakpoint(tsk);

779

780

TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));

780

TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));

781

exit_notify(tsk, group_dead);

781

exit_notify(tsk, group_dead);

782

proc_exit_connector(tsk);

782

proc_exit_connector(tsk);

783

#ifdef CONFIG_NUMA

783

#ifdef CONFIG_NUMA

784

task_lock(tsk);

784

task_lock(tsk);

785

mpol_put(tsk->mempolicy);

785

mpol_put(tsk->mempolicy);

786

tsk->mempolicy = NULL;

786

tsk->mempolicy = NULL;

787

task_unlock(tsk);

787

task_unlock(tsk);

788

#endif

788

#endif

789

#ifdef CONFIG_FUTEX

789

#ifdef CONFIG_FUTEX

790

if (unlikely(current->pi_state_cache))

790

if (unlikely(current->pi_state_cache))

791

kfree(current->pi_state_cache);

791

kfree(current->pi_state_cache);

792

#endif

792

#endif

793

/*

793

/*

794

* Make sure we are holding no locks:

794

* Make sure we are holding no locks:

795

*/

795

*/

796

debug_check_no_locks_held();

796

debug_check_no_locks_held();

797

/*

797

/*

798

* We can do this unlocked here. The futex code uses this flag

798

* We can do this unlocked here. The futex code uses this flag

799

* just to verify whether the pi state cleanup has been done

799

* just to verify whether the pi state cleanup has been done

800

* or not. In the worst case it loops once more.

800

* or not. In the worst case it loops once more.

801

*/

801

*/

802

tsk->flags |= PF_EXITPIDONE;

802

tsk->flags |= PF_EXITPIDONE;

803

804

if (tsk->io_context)

804

if (tsk->io_context)

805

exit_io_context(tsk);

805

exit_io_context(tsk);

806

807

if (tsk->splice_pipe)

807

if (tsk->splice_pipe)

808

free_pipe_info(tsk->splice_pipe);

808

free_pipe_info(tsk->splice_pipe);

809

810

if (tsk->task_frag.page)

810

if (tsk->task_frag.page)

811

put_page(tsk->task_frag.page);

811

put_page(tsk->task_frag.page);

812

813

validate_creds_for_do_exit(tsk);

813

validate_creds_for_do_exit(tsk);

814

815

check_stack_usage();

815

check_stack_usage();

816

preempt_disable();

816

preempt_disable();

817

if (tsk->nr_dirtied)

817

if (tsk->nr_dirtied)

818

__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);

818

__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);

819

exit_rcu();

819

exit_rcu();

820

TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));

820

TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));

821

822

/*

822

/*

823

* The setting of TASK_RUNNING by try_to_wake_up() may be delayed

823

* The setting of TASK_RUNNING by try_to_wake_up() may be delayed

824

* when the following two conditions become true.

824

* when the following two conditions become true.

825

* - There is race condition of mmap_sem (It is acquired by

825

* - There is race condition of mmap_sem (It is acquired by

826

* exit_mm()), and

826

* exit_mm()), and

827

* - SMI occurs before setting TASK_RUNINNG.

827

* - SMI occurs before setting TASK_RUNINNG.

828

* (or hypervisor of virtual machine switches to other guest)

828

* (or hypervisor of virtual machine switches to other guest)

829

* As a result, we may become TASK_RUNNING after becoming TASK_DEAD

829

* As a result, we may become TASK_RUNNING after becoming TASK_DEAD

830

*

830

*

831

* To avoid it, we have to wait for releasing tsk->pi_lock which

831

* To avoid it, we have to wait for releasing tsk->pi_lock which

832

* is held by try_to_wake_up()

832

* is held by try_to_wake_up()

833

*/

833

*/

834

smp_mb();

834

smp_mb();

835

raw_spin_unlock_wait(&tsk->pi_lock);

835

raw_spin_unlock_wait(&tsk->pi_lock);

836

837

/* causes final put_task_struct in finish_task_switch(). */

837

/* causes final put_task_struct in finish_task_switch(). */

838

tsk->state = TASK_DEAD;

838

tsk->state = TASK_DEAD;

839

tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */

839

tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */

840

schedule();

840

schedule();

841

BUG();

841

BUG();

842

/* Avoid "noreturn function does return". */

842

/* Avoid "noreturn function does return". */

843

for (;;)

843

for (;;)

844

cpu_relax(); /* For when BUG is null */

844

cpu_relax(); /* For when BUG is null */

845

}

845

}

846

EXPORT_SYMBOL_GPL(do_exit);

846

EXPORT_SYMBOL_GPL(do_exit);

847

848

void complete_and_exit(struct completion *comp, long code)

848

void complete_and_exit(struct completion *comp, long code)

849

{

849

{

850

if (comp)

850

if (comp)

851

complete(comp);

851

complete(comp);

852

853

do_exit(code);

853

do_exit(code);

854

}

854

}

855

EXPORT_SYMBOL(complete_and_exit);

855

EXPORT_SYMBOL(complete_and_exit);

856

857

/*
 * exit system call: the low 8 bits of @error_code are placed in bits 8-15
 * of the code handed to do_exit().
 */
SYSCALL_DEFINE1(exit, int, error_code)
{
	int status = error_code & 0xff;

	do_exit(status << 8);
}

861

862

/*

862

/*

863

* Take down every thread in the group. This is called by fatal signals

863

* Take down every thread in the group. This is called by fatal signals

864

* as well as by sys_exit_group (below).

864

* as well as by sys_exit_group (below).

865

*/

865

*/

866

void

866

void

867

do_group_exit(int exit_code)

867

do_group_exit(int exit_code)

868

{

868

{

869

struct signal_struct *sig = current->signal;

869

struct signal_struct *sig = current->signal;

870

871

BUG_ON(exit_code & 0x80); /* core dumps don't get here */

871

BUG_ON(exit_code & 0x80); /* core dumps don't get here */

872

873

if (signal_group_exit(sig))

873

if (signal_group_exit(sig))

874

exit_code = sig->group_exit_code;

874

exit_code = sig->group_exit_code;

875

else if (!thread_group_empty(current)) {

875

else if (!thread_group_empty(current)) {

876

struct sighand_struct *const sighand = current->sighand;

876

struct sighand_struct *const sighand = current->sighand;

877

878

spin_lock_irq(&sighand->siglock);

878

spin_lock_irq(&sighand->siglock);

879

if (signal_group_exit(sig))

879

if (signal_group_exit(sig))

880

/* Another thread got here before we took the lock. */

880

/* Another thread got here before we took the lock. */

881

exit_code = sig->group_exit_code;

881

exit_code = sig->group_exit_code;

882

else {

882

else {

883

sig->group_exit_code = exit_code;

883

sig->group_exit_code = exit_code;

884

sig->flags = SIGNAL_GROUP_EXIT;

884

sig->flags = SIGNAL_GROUP_EXIT;

885

zap_other_threads(current);

885

zap_other_threads(current);

886

}

886

}

887

spin_unlock_irq(&sighand->siglock);

887

spin_unlock_irq(&sighand->siglock);

888

}

888

}

889

890

do_exit(exit_code);

890

do_exit(exit_code);

891

/* NOTREACHED */

891

/* NOTREACHED */

892

}

892

}

893

894

/*

894

/*

895

* this kills every thread in the thread group. Note that any externally

895

* this kills every thread in the thread group. Note that any externally

896

* wait4()-ing process will get the correct exit code - even if this

896

* wait4()-ing process will get the correct exit code - even if this

897

* thread is not the thread group leader.

897

* thread is not the thread group leader.

898

*/

898

*/

899

SYSCALL_DEFINE1(exit_group, int, error_code)

899

SYSCALL_DEFINE1(exit_group, int, error_code)

900

{

900

{

901

do_group_exit((error_code & 0xff) << 8);

901

do_group_exit((error_code & 0xff) << 8);

902

/* NOTREACHED */

902

/* NOTREACHED */

903

return 0;

903

return 0;

904

}

904

}

905

906

struct wait_opts {

906

struct wait_opts {

907

enum pid_type wo_type;

907

enum pid_type wo_type;

908

int wo_flags;

908

int wo_flags;

909

struct pid *wo_pid;

909

struct pid *wo_pid;

910

911

struct siginfo __user *wo_info;

911

struct siginfo __user *wo_info;

912

int __user *wo_stat;

912

int __user *wo_stat;

913

struct rusage __user *wo_rusage;

913

struct rusage __user *wo_rusage;

914

915

wait_queue_t child_wait;

915

wait_queue_t child_wait;

916

int notask_error;

916

int notask_error;

917

};

917

};

918

919

static inline

919

static inline

920

struct pid *task_pid_type(struct task_struct *task, enum pid_type type)

920

struct pid *task_pid_type(struct task_struct *task, enum pid_type type)

921

{

921

{

922

if (type != PIDTYPE_PID)

922

if (type != PIDTYPE_PID)

923

task = task->group_leader;

923

task = task->group_leader;

924

return task->pids[type].pid;

924

return task->pids[type].pid;

925

}

925

}

926

927

static int eligible_pid(struct wait_opts *wo, struct task_struct *p)

927

static int eligible_pid(struct wait_opts *wo, struct task_struct *p)

928

{

928

{

929

return wo->wo_type == PIDTYPE_MAX ||

929

return wo->wo_type == PIDTYPE_MAX ||

930

task_pid_type(p, wo->wo_type) == wo->wo_pid;

930

task_pid_type(p, wo->wo_type) == wo->wo_pid;

931

}

931

}

932

933

static int eligible_child(struct wait_opts *wo, struct task_struct *p)

933

static int eligible_child(struct wait_opts *wo, struct task_struct *p)

934

{

934

{

935

if (!eligible_pid(wo, p))

935

if (!eligible_pid(wo, p))

936

return 0;

936

return 0;

937

/* Wait for all children (clone and not) if __WALL is set;

937

/* Wait for all children (clone and not) if __WALL is set;

938

* otherwise, wait for clone children *only* if __WCLONE is

938

* otherwise, wait for clone children *only* if __WCLONE is

939

* set; otherwise, wait for non-clone children *only*. (Note:

939

* set; otherwise, wait for non-clone children *only*. (Note:

940

* A "clone" child here is one that reports to its parent

940

* A "clone" child here is one that reports to its parent

941

* using a signal other than SIGCHLD.) */

941

* using a signal other than SIGCHLD.) */

942

if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))

942

if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))

943

&& !(wo->wo_flags & __WALL))

943

&& !(wo->wo_flags & __WALL))

944

return 0;

944

return 0;

945

946

return 1;

946

return 1;

947

}

947

}

948

949

static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,

949

static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,

950

pid_t pid, uid_t uid, int why, int status)

950

pid_t pid, uid_t uid, int why, int status)

951

{

951

{

952

struct siginfo __user *infop;

952

struct siginfo __user *infop;

953

int retval = wo->wo_rusage

953

int retval = wo->wo_rusage

954

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

954

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

955

956

put_task_struct(p);

956

put_task_struct(p);

957

infop = wo->wo_info;

957

infop = wo->wo_info;

958

if (infop) {

958

if (infop) {

959

if (!retval)

959

if (!retval)

960

retval = put_user(SIGCHLD, &infop->si_signo);

960

retval = put_user(SIGCHLD, &infop->si_signo);

961

if (!retval)

961

if (!retval)

962

retval = put_user(0, &infop->si_errno);

962

retval = put_user(0, &infop->si_errno);

963

if (!retval)

963

if (!retval)

964

retval = put_user((short)why, &infop->si_code);

964

retval = put_user((short)why, &infop->si_code);

965

if (!retval)

965

if (!retval)

966

retval = put_user(pid, &infop->si_pid);

966

retval = put_user(pid, &infop->si_pid);

967

if (!retval)

967

if (!retval)

968

retval = put_user(uid, &infop->si_uid);

968

retval = put_user(uid, &infop->si_uid);

969

if (!retval)

969

if (!retval)

970

retval = put_user(status, &infop->si_status);

970

retval = put_user(status, &infop->si_status);

971

}

971

}

972

if (!retval)

972

if (!retval)

973

retval = pid;

973

retval = pid;

974

return retval;

974

return retval;

975

}

975

}

976

977

/*

977

/*

978

* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold

978

* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold

979

* read_lock(&tasklist_lock) on entry. If we return zero, we still hold

979

* read_lock(&tasklist_lock) on entry. If we return zero, we still hold

980

* the lock and this task is uninteresting. If we return nonzero, we have

980

* the lock and this task is uninteresting. If we return nonzero, we have

981

* released the lock and the system call should return.

981

* released the lock and the system call should return.

982

*/

982

*/

983

static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)

983

static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)

984

{

984

{

985

unsigned long state;

985

unsigned long state;

986

int retval, status, traced;

986

int retval, status, traced;

987

pid_t pid = task_pid_vnr(p);

987

pid_t pid = task_pid_vnr(p);

988

uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));

988

uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));

989

struct siginfo __user *infop;

989

struct siginfo __user *infop;

990

991

if (!likely(wo->wo_flags & WEXITED))

991

if (!likely(wo->wo_flags & WEXITED))

992

return 0;

992

return 0;

993

994

if (unlikely(wo->wo_flags & WNOWAIT)) {

994

if (unlikely(wo->wo_flags & WNOWAIT)) {

995

int exit_code = p->exit_code;

995

int exit_code = p->exit_code;

996

int why;

996

int why;

997

998

get_task_struct(p);

998

get_task_struct(p);

999

read_unlock(&tasklist_lock);

999

read_unlock(&tasklist_lock);

1000

if ((exit_code & 0x7f) == 0) {

1000

if ((exit_code & 0x7f) == 0) {

1001

why = CLD_EXITED;

1001

why = CLD_EXITED;

1002

status = exit_code >> 8;

1002

status = exit_code >> 8;

1003

} else {

1003

} else {

1004

why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;

1004

why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;

1005

status = exit_code & 0x7f;

1005

status = exit_code & 0x7f;

1006

}

1006

}

1007

return wait_noreap_copyout(wo, p, pid, uid, why, status);

1007

return wait_noreap_copyout(wo, p, pid, uid, why, status);

1008

}

1008

}

1009

1010

traced = ptrace_reparented(p);

1010

traced = ptrace_reparented(p);

1011

/*

1011

/*

1012

* Move the task's state to DEAD/TRACE, only one thread can do this.

1012

* Move the task's state to DEAD/TRACE, only one thread can do this.

1013

*/

1013

*/

1014

state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;

1014

state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;

1015

if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)

1015

if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)

1016

return 0;

1016

return 0;

1017

/*

1017

/*

1018

* It can be ptraced but not reparented, check

1018

* It can be ptraced but not reparented, check

1019

* thread_group_leader() to filter out sub-threads.

1019

* thread_group_leader() to filter out sub-threads.

1020

*/

1020

*/

1021

if (likely(!traced) && thread_group_leader(p)) {

1021

if (likely(!traced) && thread_group_leader(p)) {

1022

struct signal_struct *psig;

1022

struct signal_struct *psig;

1023

struct signal_struct *sig;

1023

struct signal_struct *sig;

1024

unsigned long maxrss;

1024

unsigned long maxrss;

1025

cputime_t tgutime, tgstime;

1025

cputime_t tgutime, tgstime;

1026

1027

/*

1027

/*

1028

* The resource counters for the group leader are in its

1028

* The resource counters for the group leader are in its

1029

* own task_struct. Those for dead threads in the group

1029

* own task_struct. Those for dead threads in the group

1030

* are in its signal_struct, as are those for the child

1030

* are in its signal_struct, as are those for the child

1031

* processes it has previously reaped. All these

1031

* processes it has previously reaped. All these

1032

* accumulate in the parent's signal_struct c* fields.

1032

* accumulate in the parent's signal_struct c* fields.

1033

*

1033

*

1034

* We don't bother to take a lock here to protect these

1034

* We don't bother to take a lock here to protect these

1035

* p->signal fields, because they are only touched by

1035

* p->signal fields, because they are only touched by

1036

* __exit_signal, which runs with tasklist_lock

1036

* __exit_signal, which runs with tasklist_lock

1037

* write-locked anyway, and so is excluded here. We do

1037

* write-locked anyway, and so is excluded here. We do

1038

* need to protect the access to parent->signal fields,

1038

* need to protect the access to parent->signal fields,

1039

* as other threads in the parent group can be right

1039

* as other threads in the parent group can be right

1040

* here reaping other children at the same time.

1040

* here reaping other children at the same time.

1041

*

1041

*

1042

* We use thread_group_cputime_adjusted() to get times for

1042

* We use thread_group_cputime_adjusted() to get times for

1043

* the thread group, which consolidates times for all threads

1043

* the thread group, which consolidates times for all threads

1044

* in the group including the group leader.

1044

* in the group including the group leader.

1045

*/

1045

*/

1046

thread_group_cputime_adjusted(p, &tgutime, &tgstime);

1046

thread_group_cputime_adjusted(p, &tgutime, &tgstime);

1047

spin_lock_irq(&p->real_parent->sighand->siglock);

1047

spin_lock_irq(&p->real_parent->sighand->siglock);

1048

psig = p->real_parent->signal;

1048

psig = p->real_parent->signal;

1049

sig = p->signal;

1049

sig = p->signal;

1050

write_seqlock(&psig->stats_lock);

1050

write_seqlock(&psig->stats_lock);

1051

psig->cutime += tgutime + sig->cutime;

1051

psig->cutime += tgutime + sig->cutime;

1052

psig->cstime += tgstime + sig->cstime;

1052

psig->cstime += tgstime + sig->cstime;

1053

psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;

1053

psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;

1054

psig->cmin_flt +=

1054

psig->cmin_flt +=

1055

p->min_flt + sig->min_flt + sig->cmin_flt;

1055

p->min_flt + sig->min_flt + sig->cmin_flt;

1056

psig->cmaj_flt +=

1056

psig->cmaj_flt +=

1057

p->maj_flt + sig->maj_flt + sig->cmaj_flt;

1057

p->maj_flt + sig->maj_flt + sig->cmaj_flt;

1058

psig->cnvcsw +=

1058

psig->cnvcsw +=

1059

p->nvcsw + sig->nvcsw + sig->cnvcsw;

1059

p->nvcsw + sig->nvcsw + sig->cnvcsw;

1060

psig->cnivcsw +=

1060

psig->cnivcsw +=

1061

p->nivcsw + sig->nivcsw + sig->cnivcsw;

1061

p->nivcsw + sig->nivcsw + sig->cnivcsw;

1062

psig->cinblock +=

1062

psig->cinblock +=

1063

task_io_get_inblock(p) +

1063

task_io_get_inblock(p) +

1064

sig->inblock + sig->cinblock;

1064

sig->inblock + sig->cinblock;

1065

psig->coublock +=

1065

psig->coublock +=

1066

task_io_get_oublock(p) +

1066

task_io_get_oublock(p) +

1067

sig->oublock + sig->coublock;

1067

sig->oublock + sig->coublock;

1068

maxrss = max(sig->maxrss, sig->cmaxrss);

1068

maxrss = max(sig->maxrss, sig->cmaxrss);

1069

if (psig->cmaxrss < maxrss)

1069

if (psig->cmaxrss < maxrss)

1070

psig->cmaxrss = maxrss;

1070

psig->cmaxrss = maxrss;

1071

task_io_accounting_add(&psig->ioac, &p->ioac);

1071

task_io_accounting_add(&psig->ioac, &p->ioac);

1072

task_io_accounting_add(&psig->ioac, &sig->ioac);

1072

task_io_accounting_add(&psig->ioac, &sig->ioac);

1073

write_sequnlock(&psig->stats_lock);

1073

write_sequnlock(&psig->stats_lock);

1074

spin_unlock_irq(&p->real_parent->sighand->siglock);

1074

spin_unlock_irq(&p->real_parent->sighand->siglock);

1075

}

1075

}

1076

1077

/*

1077

/*

1078

* Now we are sure this task is interesting, and no other

1078

* Now we are sure this task is interesting, and no other

1079

* thread can reap it because we its state == DEAD/TRACE.

1079

* thread can reap it because we its state == DEAD/TRACE.

1080

*/

1080

*/

1081

read_unlock(&tasklist_lock);

1081

read_unlock(&tasklist_lock);

1082

1083

retval = wo->wo_rusage

1083

retval = wo->wo_rusage

1084

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

1084

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

1085

status = (p->signal->flags & SIGNAL_GROUP_EXIT)

1085

status = (p->signal->flags & SIGNAL_GROUP_EXIT)

1086

? p->signal->group_exit_code : p->exit_code;

1086

? p->signal->group_exit_code : p->exit_code;

1087

if (!retval && wo->wo_stat)

1087

if (!retval && wo->wo_stat)

1088

retval = put_user(status, wo->wo_stat);

1088

retval = put_user(status, wo->wo_stat);

1089

1090

infop = wo->wo_info;

1090

infop = wo->wo_info;

1091

if (!retval && infop)

1091

if (!retval && infop)

1092

retval = put_user(SIGCHLD, &infop->si_signo);

1092

retval = put_user(SIGCHLD, &infop->si_signo);

1093

if (!retval && infop)

1093

if (!retval && infop)

1094

retval = put_user(0, &infop->si_errno);

1094

retval = put_user(0, &infop->si_errno);

1095

if (!retval && infop) {

1095

if (!retval && infop) {

1096

int why;

1096

int why;

1097

1098

if ((status & 0x7f) == 0) {

1098

if ((status & 0x7f) == 0) {

1099

why = CLD_EXITED;

1099

why = CLD_EXITED;

1100

status >>= 8;

1100

status >>= 8;

1101

} else {

1101

} else {

1102

why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;

1102

why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;

1103

status &= 0x7f;

1103

status &= 0x7f;

1104

}

1104

}

1105

retval = put_user((short)why, &infop->si_code);

1105

retval = put_user((short)why, &infop->si_code);

1106

if (!retval)

1106

if (!retval)

1107

retval = put_user(status, &infop->si_status);

1107

retval = put_user(status, &infop->si_status);

1108

}

1108

}

1109

if (!retval && infop)

1109

if (!retval && infop)

1110

retval = put_user(pid, &infop->si_pid);

1110

retval = put_user(pid, &infop->si_pid);

1111

if (!retval && infop)

1111

if (!retval && infop)

1112

retval = put_user(uid, &infop->si_uid);

1112

retval = put_user(uid, &infop->si_uid);

1113

if (!retval)

1113

if (!retval)

1114

retval = pid;

1114

retval = pid;

1115

1116

if (state == EXIT_TRACE) {

1116

if (state == EXIT_TRACE) {

1117

write_lock_irq(&tasklist_lock);

1117

write_lock_irq(&tasklist_lock);

1118

/* We dropped tasklist, ptracer could die and untrace */

1118

/* We dropped tasklist, ptracer could die and untrace */

1119

ptrace_unlink(p);

1119

ptrace_unlink(p);

1120

1121

/* If parent wants a zombie, don't release it now */

1121

/* If parent wants a zombie, don't release it now */

1122

state = EXIT_ZOMBIE;

1122

state = EXIT_ZOMBIE;

1123

if (do_notify_parent(p, p->exit_signal))

1123

if (do_notify_parent(p, p->exit_signal))

1124

state = EXIT_DEAD;

1124

state = EXIT_DEAD;

1125

p->exit_state = state;

1125

p->exit_state = state;

1126

write_unlock_irq(&tasklist_lock);

1126

write_unlock_irq(&tasklist_lock);

1127

}

1127

}

1128

if (state == EXIT_DEAD)

1128

if (state == EXIT_DEAD)

1129

release_task(p);

1129

release_task(p);

1130

1131

return retval;

1131

return retval;

1132

}

1132

}

1133

1134

static int *task_stopped_code(struct task_struct *p, bool ptrace)

1134

static int *task_stopped_code(struct task_struct *p, bool ptrace)

1135

{

1135

{

1136

if (ptrace) {

1136

if (ptrace) {

1137

if (task_is_stopped_or_traced(p) &&

1137

if (task_is_stopped_or_traced(p) &&

1138

!(p->jobctl & JOBCTL_LISTENING))

1138

!(p->jobctl & JOBCTL_LISTENING))

1139

return &p->exit_code;

1139

return &p->exit_code;

1140

} else {

1140

} else {

1141

if (p->signal->flags & SIGNAL_STOP_STOPPED)

1141

if (p->signal->flags & SIGNAL_STOP_STOPPED)

1142

return &p->signal->group_exit_code;

1142

return &p->signal->group_exit_code;

1143

}

1143

}

1144

return NULL;

1144

return NULL;

1145

}

1145

}

1146

1147

/**

1147

/**

1148

* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED

1148

* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED

1149

* @wo: wait options

1149

* @wo: wait options

1150

* @ptrace: is the wait for ptrace

1150

* @ptrace: is the wait for ptrace

1151

* @p: task to wait for

1151

* @p: task to wait for

1152

*

1152

*

1153

* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.

1153

* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.

1154

*

1154

*

1155

* CONTEXT:

1155

* CONTEXT:

1156

* read_lock(&tasklist_lock), which is released if return value is

1156

* read_lock(&tasklist_lock), which is released if return value is

1157

* non-zero. Also, grabs and releases @p->sighand->siglock.

1157

* non-zero. Also, grabs and releases @p->sighand->siglock.

1158

*

1158

*

1159

* RETURNS:

1159

* RETURNS:

1160

* 0 if wait condition didn't exist and search for other wait conditions

1160

* 0 if wait condition didn't exist and search for other wait conditions

1161

* should continue. Non-zero return, -errno on failure and @p's pid on

1161

* should continue. Non-zero return, -errno on failure and @p's pid on

1162

* success, implies that tasklist_lock is released and wait condition

1162

* success, implies that tasklist_lock is released and wait condition

1163

* search should terminate.

1163

* search should terminate.

1164

*/

1164

*/

1165

static int wait_task_stopped(struct wait_opts *wo,

1165

static int wait_task_stopped(struct wait_opts *wo,

1166

int ptrace, struct task_struct *p)

1166

int ptrace, struct task_struct *p)

1167

{

1167

{

1168

struct siginfo __user *infop;

1168

struct siginfo __user *infop;

1169

int retval, exit_code, *p_code, why;

1169

int retval, exit_code, *p_code, why;

1170

uid_t uid = 0; /* unneeded, required by compiler */

1170

uid_t uid = 0; /* unneeded, required by compiler */

1171

pid_t pid;

1171

pid_t pid;

1172

1173

/*

1173

/*

1174

* Traditionally we see ptrace'd stopped tasks regardless of options.

1174

* Traditionally we see ptrace'd stopped tasks regardless of options.

1175

*/

1175

*/

1176

if (!ptrace && !(wo->wo_flags & WUNTRACED))

1176

if (!ptrace && !(wo->wo_flags & WUNTRACED))

1177

return 0;

1177

return 0;

1178

1179

if (!task_stopped_code(p, ptrace))

1179

if (!task_stopped_code(p, ptrace))

1180

return 0;

1180

return 0;

1181

1182

exit_code = 0;

1182

exit_code = 0;

1183

spin_lock_irq(&p->sighand->siglock);

1183

spin_lock_irq(&p->sighand->siglock);

1184

1185

p_code = task_stopped_code(p, ptrace);

1185

p_code = task_stopped_code(p, ptrace);

1186

if (unlikely(!p_code))

1186

if (unlikely(!p_code))

1187

goto unlock_sig;

1187

goto unlock_sig;

1188

1189

exit_code = *p_code;

1189

exit_code = *p_code;

1190

if (!exit_code)

1190

if (!exit_code)

1191

goto unlock_sig;

1191

goto unlock_sig;

1192

1193

if (!unlikely(wo->wo_flags & WNOWAIT))

1193

if (!unlikely(wo->wo_flags & WNOWAIT))

1194

*p_code = 0;

1194

*p_code = 0;

1195

1196

uid = from_kuid_munged(current_user_ns(), task_uid(p));

1196

uid = from_kuid_munged(current_user_ns(), task_uid(p));

1197

unlock_sig:

1197

unlock_sig:

1198

spin_unlock_irq(&p->sighand->siglock);

1198

spin_unlock_irq(&p->sighand->siglock);

1199

if (!exit_code)

1199

if (!exit_code)

1200

return 0;

1200

return 0;

1201

1202

/*

1202

/*

1203

* Now we are pretty sure this task is interesting.

1203

* Now we are pretty sure this task is interesting.

1204

* Make sure it doesn't get reaped out from under us while we

1204

* Make sure it doesn't get reaped out from under us while we

1205

* give up the lock and then examine it below. We don't want to

1205

* give up the lock and then examine it below. We don't want to

1206

* keep holding onto the tasklist_lock while we call getrusage and

1206

* keep holding onto the tasklist_lock while we call getrusage and

1207

* possibly take page faults for user memory.

1207

* possibly take page faults for user memory.

1208

*/

1208

*/

1209

get_task_struct(p);

1209

get_task_struct(p);

1210

pid = task_pid_vnr(p);

1210

pid = task_pid_vnr(p);

1211

why = ptrace ? CLD_TRAPPED : CLD_STOPPED;

1211

why = ptrace ? CLD_TRAPPED : CLD_STOPPED;

1212

read_unlock(&tasklist_lock);

1212

read_unlock(&tasklist_lock);

1213

1214

if (unlikely(wo->wo_flags & WNOWAIT))

1214

if (unlikely(wo->wo_flags & WNOWAIT))

1215

return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);

1215

return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);

1216

1217

retval = wo->wo_rusage

1217

retval = wo->wo_rusage

1218

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

1218

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

1219

if (!retval && wo->wo_stat)

1219

if (!retval && wo->wo_stat)

1220

retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);

1220

retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);

1221

1222

infop = wo->wo_info;

1222

infop = wo->wo_info;

1223

if (!retval && infop)

1223

if (!retval && infop)

1224

retval = put_user(SIGCHLD, &infop->si_signo);

1224

retval = put_user(SIGCHLD, &infop->si_signo);

1225

if (!retval && infop)

1225

if (!retval && infop)

1226

retval = put_user(0, &infop->si_errno);

1226

retval = put_user(0, &infop->si_errno);

1227

if (!retval && infop)

1227

if (!retval && infop)

1228

retval = put_user((short)why, &infop->si_code);

1228

retval = put_user((short)why, &infop->si_code);

1229

if (!retval && infop)

1229

if (!retval && infop)

1230

retval = put_user(exit_code, &infop->si_status);

1230

retval = put_user(exit_code, &infop->si_status);

1231

if (!retval && infop)

1231

if (!retval && infop)

1232

retval = put_user(pid, &infop->si_pid);

1232

retval = put_user(pid, &infop->si_pid);

1233

if (!retval && infop)

1233

if (!retval && infop)

1234

retval = put_user(uid, &infop->si_uid);

1234

retval = put_user(uid, &infop->si_uid);

1235

if (!retval)

1235

if (!retval)

1236

retval = pid;

1236

retval = pid;

1237

put_task_struct(p);

1237

put_task_struct(p);

1238

1239

BUG_ON(!retval);

1239

BUG_ON(!retval);

1240

return retval;

1240

return retval;

1241

}

1241

}

1242

1243

/*

1243

/*

1244

* Handle do_wait work for one task in a live, non-stopped state.

1244

* Handle do_wait work for one task in a live, non-stopped state.

1245

* read_lock(&tasklist_lock) on entry. If we return zero, we still hold

1245

* read_lock(&tasklist_lock) on entry. If we return zero, we still hold

1246

* the lock and this task is uninteresting. If we return nonzero, we have

1246

* the lock and this task is uninteresting. If we return nonzero, we have

1247

* released the lock and the system call should return.

1247

* released the lock and the system call should return.

1248

*/

1248

*/

1249

static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)

1249

static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)

1250

{

1250

{

1251

int retval;

1251

int retval;

1252

pid_t pid;

1252

pid_t pid;

1253

uid_t uid;

1253

uid_t uid;

1254

1255

if (!unlikely(wo->wo_flags & WCONTINUED))

1255

if (!unlikely(wo->wo_flags & WCONTINUED))

1256

return 0;

1256

return 0;

1257

1258

if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))

1258

if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))

1259

return 0;

1259

return 0;

1260

1261

spin_lock_irq(&p->sighand->siglock);

1261

spin_lock_irq(&p->sighand->siglock);

1262

/* Re-check with the lock held. */

1262

/* Re-check with the lock held. */

1263

if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {

1263

if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {

1264

spin_unlock_irq(&p->sighand->siglock);

1264

spin_unlock_irq(&p->sighand->siglock);

1265

return 0;

1265

return 0;

1266

}

1266

}

1267

if (!unlikely(wo->wo_flags & WNOWAIT))

1267

if (!unlikely(wo->wo_flags & WNOWAIT))

1268

p->signal->flags &= ~SIGNAL_STOP_CONTINUED;

1268

p->signal->flags &= ~SIGNAL_STOP_CONTINUED;

1269

uid = from_kuid_munged(current_user_ns(), task_uid(p));

1269

uid = from_kuid_munged(current_user_ns(), task_uid(p));

1270

spin_unlock_irq(&p->sighand->siglock);

1270

spin_unlock_irq(&p->sighand->siglock);

1271

1272

pid = task_pid_vnr(p);

1272

pid = task_pid_vnr(p);

1273

get_task_struct(p);

1273

get_task_struct(p);

1274

read_unlock(&tasklist_lock);

1274

read_unlock(&tasklist_lock);

1275

1276

if (!wo->wo_info) {

1276

if (!wo->wo_info) {

1277

retval = wo->wo_rusage

1277

retval = wo->wo_rusage

1278

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

1278

? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;

1279

put_task_struct(p);

1279

put_task_struct(p);

1280

if (!retval && wo->wo_stat)

1280

if (!retval && wo->wo_stat)

1281

retval = put_user(0xffff, wo->wo_stat);

1281

retval = put_user(0xffff, wo->wo_stat);

1282

if (!retval)

1282

if (!retval)

1283

retval = pid;

1283

retval = pid;

1284

} else {

1284

} else {

1285

retval = wait_noreap_copyout(wo, p, pid, uid,

1285

retval = wait_noreap_copyout(wo, p, pid, uid,

1286

CLD_CONTINUED, SIGCONT);

1286

CLD_CONTINUED, SIGCONT);

1287

BUG_ON(retval == 0);

1287

BUG_ON(retval == 0);

1288

}

1288

}

1289

1290

return retval;

1290

return retval;

1291

}

1291

}

1292

1293

/*

1293

/*

1294

* Consider @p for a wait by @parent.

1294

* Consider @p for a wait by @parent.

1295

*

1295

*

1296

* -ECHILD should be in ->notask_error before the first call.

1296

* -ECHILD should be in ->notask_error before the first call.

1297

* Returns nonzero for a final return, when we have unlocked tasklist_lock.

1297

* Returns nonzero for a final return, when we have unlocked tasklist_lock.

1298

* Returns zero if the search for a child should continue;

1298

* Returns zero if the search for a child should continue;

1299

* then ->notask_error is 0 if @p is an eligible child,

1299

* then ->notask_error is 0 if @p is an eligible child,

1300

* or another error from security_task_wait(), or still -ECHILD.

1300

* or another error from security_task_wait(), or still -ECHILD.

1301

*/

1301

*/

1302

static int wait_consider_task(struct wait_opts *wo, int ptrace,

1302

static int wait_consider_task(struct wait_opts *wo, int ptrace,

1303

struct task_struct *p)

1303

struct task_struct *p)

1304

{

1304

{

1305

/*

1306

* We can race with wait_task_zombie() from another thread.

1307

* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition

1308

* can't confuse the checks below.

1309

*/

1310

int exit_state = ACCESS_ONCE(p->exit_state);

1305

int ret;

1311

int ret;

1306

1312

1307

if (unlikely(p->exit_state == EXIT_DEAD))

1313

if (unlikely(exit_state == EXIT_DEAD))

1308

return 0;

1314

return 0;

1309

1315

1310

ret = eligible_child(wo, p);

1316

ret = eligible_child(wo, p);

1311

if (!ret)

1317

if (!ret)

1312

return ret;

1318

return ret;

1313

1319

1314

ret = security_task_wait(p);

1320

ret = security_task_wait(p);

1315

if (unlikely(ret < 0)) {

1321

if (unlikely(ret < 0)) {

1316

/*

1322

/*

1317

* If we have not yet seen any eligible child,

1323

* If we have not yet seen any eligible child,

1318

* then let this error code replace -ECHILD.

1324

* then let this error code replace -ECHILD.

1319

* A permission error will give the user a clue

1325

* A permission error will give the user a clue

1320

* to look for security policy problems, rather

1326

* to look for security policy problems, rather

1321

* than for mysterious wait bugs.

1327

* than for mysterious wait bugs.

1322

*/

1328

*/

1323

if (wo->notask_error)

1329

if (wo->notask_error)

1324

wo->notask_error = ret;

1330

wo->notask_error = ret;

1325

return 0;

1331

return 0;

1326

}

1332

}

1327

1333

1328

if (unlikely(p->exit_state == EXIT_TRACE)) {

1334

if (unlikely(exit_state == EXIT_TRACE)) {

1329

/*

1335

/*

1330

* ptrace == 0 means we are the natural parent. In this case

1336

* ptrace == 0 means we are the natural parent. In this case

1331

* we should clear notask_error, debugger will notify us.

1337

* we should clear notask_error, debugger will notify us.

1332

*/

1338

*/

1333

if (likely(!ptrace))

1339

if (likely(!ptrace))

1334

wo->notask_error = 0;

1340

wo->notask_error = 0;

1335

return 0;

1341

return 0;

1336

}

1342

}

1337

1343

1338

if (likely(!ptrace) && unlikely(p->ptrace)) {

1344

if (likely(!ptrace) && unlikely(p->ptrace)) {

1339

/*

1345

/*

1340

* If it is traced by its real parent's group, just pretend

1346

* If it is traced by its real parent's group, just pretend

1341

* the caller is ptrace_do_wait() and reap this child if it

1347

* the caller is ptrace_do_wait() and reap this child if it

1342

* is zombie.

1348

* is zombie.

1343

*

1349

*

1344

* This also hides group stop state from real parent; otherwise

1350

* This also hides group stop state from real parent; otherwise

1345

* a single stop can be reported twice as group and ptrace stop.

1351

* a single stop can be reported twice as group and ptrace stop.

1346

* If a ptracer wants to distinguish these two events for its

1352

* If a ptracer wants to distinguish these two events for its

1347

* own children it should create a separate process which takes

1353

* own children it should create a separate process which takes

1348

* the role of real parent.

1354

* the role of real parent.

1349

*/

1355

*/

1350

if (!ptrace_reparented(p))

1356

if (!ptrace_reparented(p))

1351

ptrace = 1;

1357

ptrace = 1;

1352

}

1358

}

1353

1359

1354

/* slay zombie? */

1360

/* slay zombie? */

1355

if (p->exit_state == EXIT_ZOMBIE) {

1361

if (exit_state == EXIT_ZOMBIE) {

1356

/* we don't reap group leaders with subthreads */

1362

/* we don't reap group leaders with subthreads */

1357

if (!delay_group_leader(p)) {

1363

if (!delay_group_leader(p)) {

1358

/*

1364

/*

1359

* A zombie ptracee is only visible to its ptracer.

1365

* A zombie ptracee is only visible to its ptracer.

1360

* Notification and reaping will be cascaded to the

1366

* Notification and reaping will be cascaded to the

1361

* real parent when the ptracer detaches.

1367

* real parent when the ptracer detaches.

1362

*/

1368

*/

1363

if (unlikely(ptrace) || likely(!p->ptrace))

1369

if (unlikely(ptrace) || likely(!p->ptrace))

1364

return wait_task_zombie(wo, p);

1370

return wait_task_zombie(wo, p);

1365

}

1371

}

1366

1372

1367

/*

1373

/*

1368

* Allow access to stopped/continued state via zombie by

1374

* Allow access to stopped/continued state via zombie by

1369

* falling through. Clearing of notask_error is complex.

1375

* falling through. Clearing of notask_error is complex.

1370

*

1376

*

1371

* When !@ptrace:

1377

* When !@ptrace:

1372

*

1378

*

1373

* If WEXITED is set, notask_error should naturally be

1379

* If WEXITED is set, notask_error should naturally be

1374

* cleared. If not, subset of WSTOPPED|WCONTINUED is set,

1380

* cleared. If not, subset of WSTOPPED|WCONTINUED is set,

1375

* so, if there are live subthreads, there are events to

1381

* so, if there are live subthreads, there are events to

1376

* wait for. If all subthreads are dead, it's still safe

1382

* wait for. If all subthreads are dead, it's still safe

1377

* to clear - this function will be called again in finite

1383

* to clear - this function will be called again in finite

1378

* amount time once all the subthreads are released and

1384

* amount time once all the subthreads are released and

1379

* will then return without clearing.

1385

* will then return without clearing.

1380

*

1386

*

1381

* When @ptrace:

1387

* When @ptrace:

1382

*

1388

*

1383

* Stopped state is per-task and thus can't change once the

1389

* Stopped state is per-task and thus can't change once the

1384

* target task dies. Only continued and exited can happen.

1390

* target task dies. Only continued and exited can happen.

1385

* Clear notask_error if WCONTINUED | WEXITED.

1391

* Clear notask_error if WCONTINUED | WEXITED.

1386

*/

1392

*/

1387

if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))

1393

if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))

1388

wo->notask_error = 0;

1394

wo->notask_error = 0;

1389

} else {

1395

} else {

1390

/*

1396

/*

1391

* @p is alive and it's gonna stop, continue or exit, so

1397

* @p is alive and it's gonna stop, continue or exit, so

1392

* there always is something to wait for.

1398

* there always is something to wait for.

1393

*/

1399

*/

1394

wo->notask_error = 0;

1400

wo->notask_error = 0;

1395

}

1401

}

1396

1402

1397

/*

1403

/*

1398

* Wait for stopped. Depending on @ptrace, different stopped state

1404

* Wait for stopped. Depending on @ptrace, different stopped state

1399

* is used and the two don't interact with each other.

1405

* is used and the two don't interact with each other.

1400

*/

1406

*/

1401

ret = wait_task_stopped(wo, ptrace, p);

1407

ret = wait_task_stopped(wo, ptrace, p);

1402

if (ret)

1408

if (ret)

1403

return ret;

1409

return ret;

1404

1410

1405

/*

1411

/*

1406

* Wait for continued. There's only one continued state and the

1412

* Wait for continued. There's only one continued state and the

1407

* ptracer can consume it which can confuse the real parent. Don't

1413

* ptracer can consume it which can confuse the real parent. Don't

1408

* use WCONTINUED from ptracer. You don't need or want it.

1414

* use WCONTINUED from ptracer. You don't need or want it.

1409

*/

1415

*/

1410

return wait_task_continued(wo, p);

1416

return wait_task_continued(wo, p);

1411

}

1417

}

1412

1418

1413

/*

1419

/*

1414

* Do the work of do_wait() for one thread in the group, @tsk.

1420

* Do the work of do_wait() for one thread in the group, @tsk.

1415

*

1421

*

1416

* -ECHILD should be in ->notask_error before the first call.

1422

* -ECHILD should be in ->notask_error before the first call.

1417

* Returns nonzero for a final return, when we have unlocked tasklist_lock.

1423

* Returns nonzero for a final return, when we have unlocked tasklist_lock.

1418

* Returns zero if the search for a child should continue; then

1424

* Returns zero if the search for a child should continue; then

1419

* ->notask_error is 0 if there were any eligible children,

1425

* ->notask_error is 0 if there were any eligible children,

1420

* or another error from security_task_wait(), or still -ECHILD.

1426

* or another error from security_task_wait(), or still -ECHILD.

1421

*/

1427

*/

1422

static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)

1428

static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)

1423

{

1429

{

1424

struct task_struct *p;

1430

struct task_struct *p;

1425

1431

1426

list_for_each_entry(p, &tsk->children, sibling) {

1432

list_for_each_entry(p, &tsk->children, sibling) {

1427

int ret = wait_consider_task(wo, 0, p);

1433

int ret = wait_consider_task(wo, 0, p);

1428

1434

1429

if (ret)

1435

if (ret)

1430

return ret;

1436

return ret;

1431

}

1437

}

1432

1438

1433

return 0;

1439

return 0;

1434

}

1440

}

1435

1441

1436

static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)

1442

static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)

1437

{

1443

{

1438

struct task_struct *p;

1444

struct task_struct *p;

1439

1445

1440

list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {

1446

list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {

1441

int ret = wait_consider_task(wo, 1, p);

1447

int ret = wait_consider_task(wo, 1, p);

1442

1448

1443

if (ret)

1449

if (ret)

1444

return ret;

1450

return ret;

1445

}

1451

}

1446

1452

1447

return 0;

1453

return 0;

1448

}

1454

}

1449

1455

1450

/*
 * Wake-up filter installed on the wait_chldexit queue.
 *
 * @wait: queue entry embedded in a struct wait_opts (->child_wait)
 * @mode: passed through to default_wake_function()
 * @sync: passed through to default_wake_function()
 * @key:  the task the wake-up is about
 *
 * Returns 0 without waking anything when the task in @key fails
 * eligible_pid(), or when __WNOTHREAD is set and the task's ->parent is
 * not the waiter recorded in @wait->private.  Otherwise returns the
 * result of default_wake_function().
 */
static int child_wait_callback(wait_queue_t *wait, unsigned mode,
				int sync, void *key)
{
	struct wait_opts *opts = container_of(wait, struct wait_opts,
						child_wait);
	struct task_struct *child = key;

	if (!eligible_pid(opts, child) ||
	    ((opts->wo_flags & __WNOTHREAD) && wait->private != child->parent))
		return 0;

	return default_wake_function(wait, mode, sync, key);
}

1465

1471

1466

void __wake_up_parent(struct task_struct *p, struct task_struct *parent)

1472

void __wake_up_parent(struct task_struct *p, struct task_struct *parent)

1467

{

1473

{

1468

__wake_up_sync_key(&parent->signal->wait_chldexit,

1474

__wake_up_sync_key(&parent->signal->wait_chldexit,

1469

TASK_INTERRUPTIBLE, 1, p);

1475

TASK_INTERRUPTIBLE, 1, p);

1470

}

1476

}

1471

1477

1472

static long do_wait(struct wait_opts *wo)

1478

static long do_wait(struct wait_opts *wo)

1473

{

1479

{

1474

struct task_struct *tsk;

1480

struct task_struct *tsk;

1475

int retval;

1481

int retval;

1476

1482

1477

trace_sched_process_wait(wo->wo_pid);

1483

trace_sched_process_wait(wo->wo_pid);

1478

1484

1479

init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);

1485

init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);

1480

wo->child_wait.private = current;

1486

wo->child_wait.private = current;

1481

add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

1487

add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

1482

repeat:

1488

repeat:

1483

/*

1489

/*

1484

* If there is nothing that can match our criteria just get out.

1490

* If there is nothing that can match our criteria just get out.

1485

* We will clear ->notask_error to zero if we see any child that

1491

* We will clear ->notask_error to zero if we see any child that

1486

* might later match our criteria, even if we are not able to reap

1492

* might later match our criteria, even if we are not able to reap

1487

* it yet.

1493

* it yet.

1488

*/

1494

*/

1489

wo->notask_error = -ECHILD;

1495

wo->notask_error = -ECHILD;

1490

if ((wo->wo_type < PIDTYPE_MAX) &&

1496

if ((wo->wo_type < PIDTYPE_MAX) &&

1491

(!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))

1497

(!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))

1492

goto notask;

1498

goto notask;

1493

1499

1494

set_current_state(TASK_INTERRUPTIBLE);

1500

set_current_state(TASK_INTERRUPTIBLE);

1495

read_lock(&tasklist_lock);

1501

read_lock(&tasklist_lock);

1496

tsk = current;

1502

tsk = current;

1497

do {

1503

do {

1498

retval = do_wait_thread(wo, tsk);

1504

retval = do_wait_thread(wo, tsk);

1499

if (retval)

1505

if (retval)

1500

goto end;

1506

goto end;

1501

1507

1502

retval = ptrace_do_wait(wo, tsk);

1508

retval = ptrace_do_wait(wo, tsk);

1503

if (retval)

1509

if (retval)

1504

goto end;

1510

goto end;

1505

1511

1506

if (wo->wo_flags & __WNOTHREAD)

1512

if (wo->wo_flags & __WNOTHREAD)

1507

break;

1513

break;

1508

} while_each_thread(current, tsk);

1514

} while_each_thread(current, tsk);

1509

read_unlock(&tasklist_lock);

1515

read_unlock(&tasklist_lock);

1510

1516

1511

notask:

1517

notask:

1512

retval = wo->notask_error;

1518

retval = wo->notask_error;

1513

if (!retval && !(wo->wo_flags & WNOHANG)) {

1519

if (!retval && !(wo->wo_flags & WNOHANG)) {

1514

retval = -ERESTARTSYS;

1520

retval = -ERESTARTSYS;

1515

if (!signal_pending(current)) {

1521

if (!signal_pending(current)) {

1516

schedule();

1522

schedule();

1517

goto repeat;

1523

goto repeat;

1518

}

1524

}

1519

}

1525

}

1520

end:

1526

end:

1521

__set_current_state(TASK_RUNNING);

1527

__set_current_state(TASK_RUNNING);

1522

remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

1528

remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

1523

return retval;

1529

return retval;

1524

}

1530

}

1525

1531

1526

SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,

1532

SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,

1527

infop, int, options, struct rusage __user *, ru)

1533

infop, int, options, struct rusage __user *, ru)

1528

{

1534

{

1529

struct wait_opts wo;

1535

struct wait_opts wo;

1530

struct pid *pid = NULL;

1536

struct pid *pid = NULL;

1531

enum pid_type type;

1537

enum pid_type type;

1532

long ret;

1538

long ret;

1533

1539

1534

if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))

1540

if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))

1535

return -EINVAL;

1541

return -EINVAL;

1536

if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))

1542

if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))

1537

return -EINVAL;

1543

return -EINVAL;

1538

1544

1539

switch (which) {

1545

switch (which) {

1540

case P_ALL:

1546

case P_ALL:

1541

type = PIDTYPE_MAX;

1547

type = PIDTYPE_MAX;

1542

break;

1548

break;

1543

case P_PID:

1549

case P_PID:

1544

type = PIDTYPE_PID;

1550

type = PIDTYPE_PID;

1545

if (upid <= 0)

1551

if (upid <= 0)

1546

return -EINVAL;

1552

return -EINVAL;

1547

break;

1553

break;

1548

case P_PGID:

1554

case P_PGID:

1549

type = PIDTYPE_PGID;

1555

type = PIDTYPE_PGID;

1550

if (upid <= 0)

1556

if (upid <= 0)

1551

return -EINVAL;

1557

return -EINVAL;

1552

break;

1558

break;

1553

default:

1559

default:

1554

return -EINVAL;

1560

return -EINVAL;

1555

}

1561

}

1556

1562

1557

if (type < PIDTYPE_MAX)

1563

if (type < PIDTYPE_MAX)

1558

pid = find_get_pid(upid);

1564

pid = find_get_pid(upid);

1559

1565

1560

wo.wo_type = type;

1566

wo.wo_type = type;

1561

wo.wo_pid = pid;

1567

wo.wo_pid = pid;

1562

wo.wo_flags = options;

1568

wo.wo_flags = options;

1563

wo.wo_info = infop;

1569

wo.wo_info = infop;

1564

wo.wo_stat = NULL;

1570

wo.wo_stat = NULL;

1565

wo.wo_rusage = ru;

1571

wo.wo_rusage = ru;

1566

ret = do_wait(&wo);

1572

ret = do_wait(&wo);

1567

1573

1568

if (ret > 0) {

1574

if (ret > 0) {

1569

ret = 0;

1575

ret = 0;

1570

} else if (infop) {

1576

} else if (infop) {

1571

/*

1577

/*

1572

* For a WNOHANG return, clear out all the fields

1578

* For a WNOHANG return, clear out all the fields

1573

* we would set so the user can easily tell the

1579

* we would set so the user can easily tell the

1574

* difference.

1580

* difference.

1575

*/

1581

*/

1576

if (!ret)

1582

if (!ret)

1577

ret = put_user(0, &infop->si_signo);

1583

ret = put_user(0, &infop->si_signo);

1578

if (!ret)

1584

if (!ret)

1579

ret = put_user(0, &infop->si_errno);

1585

ret = put_user(0, &infop->si_errno);

1580

if (!ret)

1586

if (!ret)

1581

ret = put_user(0, &infop->si_code);

1587

ret = put_user(0, &infop->si_code);

1582

if (!ret)

1588

if (!ret)

1583

ret = put_user(0, &infop->si_pid);

1589

ret = put_user(0, &infop->si_pid);

1584

if (!ret)

1590

if (!ret)

1585

ret = put_user(0, &infop->si_uid);

1591

ret = put_user(0, &infop->si_uid);

1586

if (!ret)

1592

if (!ret)

1587

ret = put_user(0, &infop->si_status);

1593

ret = put_user(0, &infop->si_status);

1588

}

1594

}

1589

1595

1590

put_pid(pid);

1596

put_pid(pid);

1591

return ret;

1597

return ret;

1592

}

1598

}

1593

1599

1594

SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,

1600

SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,

1595

int, options, struct rusage __user *, ru)

1601

int, options, struct rusage __user *, ru)

1596

{

1602

{

1597

struct wait_opts wo;

1603

struct wait_opts wo;

1598

struct pid *pid = NULL;

1604

struct pid *pid = NULL;

1599

enum pid_type type;

1605

enum pid_type type;

1600

long ret;

1606

long ret;

1601

1607

1602

if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|

1608

if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|

1603

__WNOTHREAD|__WCLONE|__WALL))

1609

__WNOTHREAD|__WCLONE|__WALL))

1604

return -EINVAL;

1610

return -EINVAL;

1605

1611

1606

if (upid == -1)

1612

if (upid == -1)

1607

type = PIDTYPE_MAX;

1613

type = PIDTYPE_MAX;

1608

else if (upid < 0) {

1614

else if (upid < 0) {

1609

type = PIDTYPE_PGID;

1615

type = PIDTYPE_PGID;

1610

pid = find_get_pid(-upid);

1616

pid = find_get_pid(-upid);

1611

} else if (upid == 0) {

1617

} else if (upid == 0) {

1612

type = PIDTYPE_PGID;

1618

type = PIDTYPE_PGID;

1613

pid = get_task_pid(current, PIDTYPE_PGID);

1619

pid = get_task_pid(current, PIDTYPE_PGID);

1614

} else /* upid > 0 */ {

1620

} else /* upid > 0 */ {

1615

type = PIDTYPE_PID;

1621

type = PIDTYPE_PID;

1616

pid = find_get_pid(upid);

1622

pid = find_get_pid(upid);

1617

}

1623

}

1618

1624

1619

wo.wo_type = type;

1625

wo.wo_type = type;

1620

wo.wo_pid = pid;

1626

wo.wo_pid = pid;

1621

wo.wo_flags = options | WEXITED;

1627

wo.wo_flags = options | WEXITED;

1622

wo.wo_info = NULL;

1628

wo.wo_info = NULL;

1623

wo.wo_stat = stat_addr;

1629

wo.wo_stat = stat_addr;

1624

wo.wo_rusage = ru;

1630

wo.wo_rusage = ru;

1625

ret = do_wait(&wo);

1631

ret = do_wait(&wo);

1626

put_pid(pid);

1632

put_pid(pid);

1627

1633

1628

return ret;

1634

return ret;

1629

}

1635

}

1630

1636

1631

#ifdef __ARCH_WANT_SYS_WAITPID

1637

#ifdef __ARCH_WANT_SYS_WAITPID

1632

1638

1633

/*

1639

/*

1634

* sys_waitpid() remains for compatibility. waitpid() should be

1640

* sys_waitpid() remains for compatibility. waitpid() should be

1635

* implemented by calling sys_wait4() from libc.a.

1641

* implemented by calling sys_wait4() from libc.a.

1636

*/

1642

*/

1637

SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)

1643

SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)

1638

{

1644

{

1639

return sys_wait4(pid, stat_addr, options, NULL);

1645

return sys_wait4(pid, stat_addr, options, NULL);

1640

}

1646

}

1641

1647

1642

#endif

1648

#endif

1643

1649

GITLAB

exit: fix race between wait_consider_task() and wait_task_zombie()

 /*
  *  linux/kernel/exit.c
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  */
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/personality.h>
 #include <linux/tty.h>
 #include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/freezer.h>
 #include <linux/binfmts.h>
 #include <linux/nsproxy.h>
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/kthread.h>
 #include <linux/mempolicy.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
 #include <linux/cgroup.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
 #include <linux/posix-timers.h>
 #include <linux/cn_proc.h>
 #include <linux/mutex.h>
 #include <linux/futex.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/audit.h> /* for audit_free() */
 #include <linux/resource.h>
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
 #include <linux/writeback.h>
 #include <linux/shm.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 static void exit_mm(struct task_struct *tsk);
 /*
  * Unlink task @p from the pid links and task lists.
  *
  * Unconditionally decrements nr_threads, detaches the PIDTYPE_PID link and
  * removes @p from its thread_group and thread_node lists.  When @group_dead
  * is true it also detaches the PGID and SID links, removes @p from the
  * ->tasks and ->sibling lists and decrements this CPU's process_counts.
  *
  * The only caller visible in this file is __exit_signal(), which expects
  * tasklist_lock to be write-locked.
  */
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
 	nr_threads--;
 	detach_pid(p, PIDTYPE_PID);
 	if (group_dead) {
 		/* Process-wide links are only dropped together with the group. */
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
 		list_del_rcu(&p->tasks);
 		list_del_init(&p->sibling);
 		__this_cpu_dec(process_counts);
 	}
 	list_del_rcu(&p->thread_group);
 	list_del_rcu(&p->thread_node);
 }
 /*
  * Detach an exiting task from its signal state and unhash it.
  *
  * With ->siglock held this runs the posix-cpu-timer exit hooks, folds the
  * accounting counters of @tsk into the shared signal_struct (under
  * sig->stats_lock), unhashes the task via __unhash_process(), flushes the
  * task's private pending signals and clears tsk->sighand.  After the lock
  * is dropped the sighand reference is cleaned up, TIF_SIGPENDING is cleared
  * and, if @tsk is the thread group leader, the shared pending queue is
  * flushed and the tty reference taken from sig->tty is put.
  *
  * This function expects the tasklist_lock write-locked.
  */
 static void __exit_signal(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	bool group_dead = thread_group_leader(tsk);
 	struct sighand_struct *sighand;
 	/* Only assigned and only used on the group_dead paths below. */
 	struct tty_struct *uninitialized_var(tty);
 	cputime_t utime, stime;
 	sighand = rcu_dereference_check(tsk->sighand,
 					lockdep_tasklist_lock_is_held());
 	spin_lock(&sighand->siglock);
 	posix_cpu_timers_exit(tsk);
 	if (group_dead) {
 		posix_cpu_timers_exit_group(tsk);
 		/* Steal the tty pointer; it is put once ->siglock is dropped. */
 		tty = sig->tty;
 		sig->tty = NULL;
 	} else {
 		/*
 		 * This can only happen if the caller is de_thread().
 		 * FIXME: this is the temporary hack, we should teach
 		 * posix-cpu-timers to handle this case correctly.
 		 */
 		if (unlikely(has_group_leader_pid(tsk)))
 			posix_cpu_timers_exit_group(tsk);
 		/*
 		 * If there is any task waiting for the group exit
 		 * then notify it:
 		 */
 		if (sig->notify_count > 0 && !--sig->notify_count)
 			wake_up_process(sig->group_exit_task);
 		/* Do not leave curr_target pointing at the departing thread. */
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 	}
 	/*
 	 * Accumulate here the counters for all threads but the group leader
 	 * as they die, so they can be added into the process-wide totals
 	 * when those are taken.  The group leader stays around as a zombie as
 	 * long as there are other threads.  When it gets reaped, the exit.c
 	 * code will add its counts into these totals.  We won't ever get here
 	 * for the group leader, since it will have been the last reference on
 	 * the signal_struct.
 	 */
 	task_cputime(tsk, &utime, &stime);
 	write_seqlock(&sig->stats_lock);
 	sig->utime += utime;
 	sig->stime += stime;
 	sig->gtime += task_gtime(tsk);
 	sig->min_flt += tsk->min_flt;
 	sig->maj_flt += tsk->maj_flt;
 	sig->nvcsw += tsk->nvcsw;
 	sig->nivcsw += tsk->nivcsw;
 	sig->inblock += task_io_get_inblock(tsk);
 	sig->oublock += task_io_get_oublock(tsk);
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
 	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 	sig->nr_threads--;
 	/* The unhash happens inside the stats_lock write section. */
 	__unhash_process(tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
 	/*
 	 * Do this under ->siglock, we can race with another thread
 	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
 	 */
 	flush_sigqueue(&tsk->pending);
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
 	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
 	if (group_dead) {
 		flush_sigqueue(&sig->shared_pending);
 		tty_kref_put(tty);
 	}
 }
 /*
  * RCU callback queued by release_task() via call_rcu().
  *
  * Recovers the task that embeds @rhp, runs the perf and tracepoint hooks
  * for it and finally drops a reference with put_task_struct().
  */
 static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *task;
 	task = container_of(rhp, struct task_struct, rcu);
 	perf_event_delayed_put(task);
 	trace_sched_process_free(task);
 	put_task_struct(task);
 }
 /*
  * Release a dead task @p.
  *
  * Decrements the process count of the task's user, calls
  * proc_flush_task(), and then, with tasklist_lock write-locked, runs
  * ptrace_release_task() and __exit_signal().  Afterwards release_thread()
  * is called and the final put is deferred to delayed_put_task_struct()
  * through call_rcu().
  *
  * If @p was the last non-leader thread of a group whose leader is already
  * a zombie, the leader's parent is notified; when do_notify_parent()
  * returns non-zero the leader is marked EXIT_DEAD and released as well by
  * looping back to "repeat" with p = leader.
  */
 void release_task(struct task_struct *p)
 {
 	struct task_struct *leader;
 	int zap_leader;
 repeat:
 	/* don't need to get the RCU readlock here - the process is dead and
 	 * can't be modifying its own credentials. But shut RCU-lockdep up */
 	rcu_read_lock();
 	atomic_dec(&__task_cred(p)->user->processes);
 	rcu_read_unlock();
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
 	__exit_signal(p);
 	/*
 	 * If we are the last non-leader member of the thread
 	 * group, and the leader is zombie, then notify the
 	 * group leader's parent process. (if it wants notification.)
 	 */
 	zap_leader = 0;
 	leader = p->group_leader;
 	if (leader != p && thread_group_empty(leader)
 			&& leader->exit_state == EXIT_ZOMBIE) {
 		/*
 		 * If we were the last child thread and the leader has
 		 * exited already, and the leader's parent ignores SIGCHLD,
 		 * then we are the one who should release the leader.
 		 */
 		zap_leader = do_notify_parent(leader, leader->exit_signal);
 		if (zap_leader)
 			leader->exit_state = EXIT_DEAD;
 	}
 	write_unlock_irq(&tasklist_lock);
 	release_thread(p);
 	call_rcu(&p->rcu, delayed_put_task_struct);
 	/* Switch to the leader; only acted upon when zap_leader is set. */
 	p = leader;
 	if (unlikely(zap_leader))
 		goto repeat;
 }
 /*
  * Return the session of the process group @pgrp.
  *
  * The session is taken from a task attached to @pgrp as PIDTYPE_PGID.  If
  * no such task exists the lookup falls back to a task attached as
  * PIDTYPE_PID (I dunno - gdb doesn't work correctly without this...).
  * Returns NULL when neither lookup yields a task.
  *
  * The caller must hold rcu lock or the tasklist lock.
  */
 struct pid *session_of_pgrp(struct pid *pgrp)
 {
 	struct task_struct *member = pid_task(pgrp, PIDTYPE_PGID);
 	if (!member)
 		member = pid_task(pgrp, PIDTYPE_PID);
 	if (!member)
 		return NULL;
 	return task_session(member);
 }
 /*
  * Determine if a process group is "orphaned", according to the POSIX
  * definition in 2.2.2.52.  Orphaned process groups are not to be affected
  * by terminal-generated stop signals.  Newly orphaned process groups are
  * to receive a SIGHUP and a SIGCONT.
  *
  * "I ask you, have you ever known what it is to be an orphan?"
  *
  * @pgrp:         process group to examine
  * @ignored_task: task to leave out of the check, or NULL
  *
  * Returns 0 as soon as a member is found whose real_parent is in a
  * different process group but in the same session; returns 1 otherwise.
  * Both callers in this file run with tasklist_lock held.
  */
 static int will_become_orphaned_pgrp(struct pid *pgrp,
 					struct task_struct *ignored_task)
 {
 	struct task_struct *p;
 	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
 		/*
 		 * Skip the ignored task, tasks that have an exit_state set
 		 * and no other threads left, and children of global init.
 		 */
 		if ((p == ignored_task) ||
 		    (p->exit_state && thread_group_empty(p)) ||
 		    is_global_init(p->real_parent))
 			continue;
 		if (task_pgrp(p->real_parent) != pgrp &&
 		    task_session(p->real_parent) == task_session(p))
 			return 0;
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
 	return 1;
 }
 /*
  * Report whether the process group of the current task is orphaned.
  *
  * Evaluates will_become_orphaned_pgrp() for current's process group, with
  * no task ignored, while holding tasklist_lock for reading, and returns
  * its result.
  */
 int is_current_pgrp_orphaned(void)
 {
 	struct pid *pgrp;
 	int orphaned;
 	read_lock(&tasklist_lock);
 	pgrp = task_pgrp(current);
 	orphaned = will_become_orphaned_pgrp(pgrp, NULL);
 	read_unlock(&tasklist_lock);
 	return orphaned;
 }
 /*
  * Return true if any task in process group @pgrp has SIGNAL_STOP_STOPPED
  * set in its signal flags, false otherwise.
  */
 static bool has_stopped_jobs(struct pid *pgrp)
 {
 	struct task_struct *member;
 	do_each_pid_task(pgrp, PIDTYPE_PGID, member) {
 		const struct signal_struct *sig = member->signal;
 		if (sig->flags & SIGNAL_STOP_STOPPED)
 			return true;
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, member);
 	return false;
 }
 /*
  * Check to see if any process groups have become orphaned as
  * a result of our exiting, and if they have any stopped jobs,
  * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
  *
  * @tsk:    task whose process group is examined
  * @parent: NULL on the exit path (tsk->real_parent is used and @tsk itself
  *          is ignored during the orphan check); non-NULL on the reparent
  *          path (no task is ignored).
  */
 static void
 kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 {
 	struct pid *pgrp = task_pgrp(tsk);
 	struct task_struct *skip = NULL;
 	if (!parent) {
 		/* exit: our father is in a different pgrp than
 		 * we are and we were the only connection outside.
 		 */
 		parent = tsk->real_parent;
 		skip = tsk;
 	}
 	/* else reparent: our child is in a different pgrp than
 	 * we are, and it was the only connection outside.
 	 */
 	if (task_pgrp(parent) == pgrp)
 		return;
 	if (task_session(parent) != task_session(tsk))
 		return;
 	if (!will_become_orphaned_pgrp(pgrp, skip))
 		return;
 	if (!has_stopped_jobs(pgrp))
 		return;
 	__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 	__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 }
 #ifdef CONFIG_MEMCG
 /*
  * A task is exiting.   If it owned this mm, find a new owner for the mm.
  *
  * Does nothing unless current is mm->owner.  Candidates are searched in
  * this order: current's children, the children of current's real_parent,
  * then every thread of every non-kthread process.  The first task whose
  * ->mm equals @mm becomes the owner; if none is found, or mm_users shows
  * no other user, mm->owner is set to NULL.
  */
 void mm_update_next_owner(struct mm_struct *mm)
 {
 	struct task_struct *c, *g, *p = current;
 retry:
 	/*
 	 * If the exiting or execing task is not the owner, it's
 	 * someone else's problem.
 	 */
 	if (mm->owner != p)
 		return;
 	/*
 	 * The current owner is exiting/execing and there are no other
 	 * candidates.  Do not leave the mm pointing to a possibly
 	 * freed task structure.
 	 */
 	if (atomic_read(&mm->mm_users) <= 1) {
 		mm->owner = NULL;
 		return;
 	}
 	read_lock(&tasklist_lock);
 	/*
 	 * Search in the children
 	 */
 	list_for_each_entry(c, &p->children, sibling) {
 		if (c->mm == mm)
 			goto assign_new_owner;
 	}
 	/*
 	 * Search in the siblings
 	 */
 	list_for_each_entry(c, &p->real_parent->children, sibling) {
 		if (c->mm == mm)
 			goto assign_new_owner;
 	}
 	/*
 	 * Search through everything else, we should not get here often.
 	 */
 	for_each_process(g) {
 		if (g->flags & PF_KTHREAD)
 			continue;
 		for_each_thread(g, c) {
 			if (c->mm == mm)
 				goto assign_new_owner;
 			/* A thread with a different mm ends the scan of this process. */
 			if (c->mm)
 				break;
 		}
 	}
 	read_unlock(&tasklist_lock);
 	/*
 	 * We found no owner yet mm_users > 1: this implies that we are
 	 * most likely racing with swapoff (try_to_unuse()) or /proc or
 	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 	 */
 	mm->owner = NULL;
 	return;
 assign_new_owner:
 	/* Reached with tasklist_lock still read-locked. */
 	BUG_ON(c == p);
 	get_task_struct(c);
 	/*
 	 * The task_lock protects c->mm from changing.
 	 * We always want mm->owner->mm == mm
 	 */
 	task_lock(c);
 	/*
 	 * Delay read_unlock() till we have the task_lock()
 	 * to ensure that c does not slip away underneath us
 	 */
 	read_unlock(&tasklist_lock);
 	/* The candidate changed its mm before we locked it: start over. */
 	if (c->mm != mm) {
 		task_unlock(c);
 		put_task_struct(c);
 		goto retry;
 	}
 	mm->owner = c;
 	task_unlock(c);
 	put_task_struct(c);
 }
 #endif /* CONFIG_MEMCG */
 /*
  * Turn us into a lazy TLB process if we
  * aren't already..
  *
  * Calls mm_release() and returns early when @tsk has no mm.  Otherwise,
  * if a core dump is pending on the mm (mm->core_state set), the task
  * links itself into the dumper's list and sleeps until self.task is
  * cleared.  It then clears tsk->mm under task_lock(), calls
  * enter_lazy_tlb(), hands mm ownership on via mm_update_next_owner() and
  * drops the mm with mmput().  TIF_MEMDIE is cleared at the end.
  */
 static void exit_mm(struct task_struct *tsk)
 {
 	struct mm_struct *mm = tsk->mm;
 	struct core_state *core_state;
 	mm_release(tsk, mm);
 	if (!mm)
 		return;
 	sync_mm_rss(mm);
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
 	 * and clearing tsk->mm.  The core-inducing thread
 	 * will increment ->nr_threads for each thread in the
 	 * group with ->mm != NULL.
 	 */
 	down_read(&mm->mmap_sem);
 	core_state = mm->core_state;
 	if (core_state) {
 		struct core_thread self;
 		up_read(&mm->mmap_sem);
 		self.task = tsk;
 		self.next = xchg(&core_state->dumper.next, &self);
 		/*
 		 * Implies mb(), the result of xchg() must be visible
 		 * to core_state->dumper.
 		 */
 		if (atomic_dec_and_test(&core_state->nr_threads))
 			complete(&core_state->startup);
 		/* Sleep until self.task has been cleared. */
 		for (;;) {
 			set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 			if (!self.task) /* see coredump_finish() */
 				break;
 			freezable_schedule();
 		}
 		__set_task_state(tsk, TASK_RUNNING);
 		/* Retake mmap_sem so tsk->mm is cleared under it below. */
 		down_read(&mm->mmap_sem);
 	}
 	atomic_inc(&mm->mm_count);
 	BUG_ON(mm != tsk->active_mm);
 	/* more a memory barrier than a real lock */
 	task_lock(tsk);
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
 	enter_lazy_tlb(mm, current);
 	task_unlock(tsk);
 	mm_update_next_owner(mm);
 	mmput(mm);
 	clear_thread_flag(TIF_MEMDIE);
 }
 /*
  * When we die, we re-parent all our children, and try to:
  * 1. give them to another thread in our thread group, if such a member exists
  * 2. give it to the first ancestor process which prctl'd itself as a
  *    child_subreaper for its children (like a service manager)
  * 3. give it to the init process (PID 1) in our pid namespace
  *
  * Returns the task that becomes the new parent of @father's children.
  *
  * Called with tasklist_lock write-locked (see forget_original_parent()).
  * If @father is the child_reaper of its pid namespace and has no live
  * sibling thread, the lock is dropped around zap_pid_ns_processes() and
  * retaken; for the initial pid namespace this case panics instead.
  */
 static struct task_struct *find_new_reaper(struct task_struct *father)
 	__releases(&tasklist_lock)
 	__acquires(&tasklist_lock)
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(father);
 	struct task_struct *thread;
 	thread = father;
 	/* Case 1: pick the first other thread in the group that is not exiting. */
 	while_each_thread(father, thread) {
 		if (thread->flags & PF_EXITING)
 			continue;
 		/* Hand the namespace reaper role over to the surviving thread. */
 		if (unlikely(pid_ns->child_reaper == father))
 			pid_ns->child_reaper = thread;
 		return thread;
 	}
 	if (unlikely(pid_ns->child_reaper == father)) {
 		write_unlock_irq(&tasklist_lock);
 		if (unlikely(pid_ns == &init_pid_ns)) {
 			panic("Attempted to kill init! exitcode=0x%08x\n",
 				father->signal->group_exit_code ?:
 					father->exit_code);
 		}
 		zap_pid_ns_processes(pid_ns);
 		write_lock_irq(&tasklist_lock);
 	} else if (father->signal->has_child_subreaper) {
 		struct task_struct *reaper;
 		/*
 		 * Find the first ancestor marked as child_subreaper.
 		 * Note that the code below checks same_thread_group(reaper,
 		 * pid_ns->child_reaper).  This is what we need to DTRT in a
 		 * PID namespace. However we still need the check above, see
 		 * http://marc.info/?l=linux-kernel&m=131385460420380
 		 */
 		for (reaper = father->real_parent;
 		     reaper != &init_task;
 		     reaper = reaper->real_parent) {
 			if (same_thread_group(reaper, pid_ns->child_reaper))
 				break;
 			if (!reaper->signal->is_child_subreaper)
 				continue;
 			/* Use this subreaper only if it still has a non-exiting thread. */
 			thread = reaper;
 			do {
 				if (!(thread->flags & PF_EXITING))
 					return reaper;
 			} while_each_thread(reaper, thread);
 		}
 	}
 	/* Case 3: fall back to the reaper of the pid namespace. */
 	return pid_ns->child_reaper;
 }
 /*
  * Move the thread group leader @p onto the children list of its (already
  * updated) real_parent after @father has died.
  *
  * Unless this is a reparent within @father's own thread group, the exit
  * signal of @p is reset to SIGCHLD, the new parent is notified if @p is an
  * untraced zombie with no other threads, and the process group of @p is
  * checked for having become orphaned.
  *
  * Any that need to be release_task'd are put on the @dead list.
  */
 static void reparent_leader(struct task_struct *father, struct task_struct *p,
 				struct list_head *dead)
 {
 	list_move_tail(&p->sibling, &p->real_parent->children);
 	if (p->exit_state == EXIT_DEAD)
 		return;
 	/*
 	 * If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
 	 */
 	if (same_thread_group(p->real_parent, father))
 		return;
 	/* We don't want people slaying init. */
 	p->exit_signal = SIGCHLD;
 	/* If it has exited notify the new parent about this child's death. */
 	if (!p->ptrace &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 		/* A non-zero return means @p must be released by the caller. */
 		if (do_notify_parent(p, p->exit_signal)) {
 			p->exit_state = EXIT_DEAD;
 			list_move_tail(&p->sibling, dead);
 		}
 	}
 	kill_orphaned_pgrp(p, father);
 }
 /*
  * Re-parent all children of the exiting task @father.
  *
  * With tasklist_lock write-locked, picks a new reaper through
  * find_new_reaper(), points real_parent of every thread of every child at
  * it (and ->parent too when it still referred to @father), delivers any
  * requested pdeath_signal and re-links each child's leader with
  * reparent_leader().  Children that reparent_leader() moved onto the local
  * dead_children list are released after the lock has been dropped.
  */
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper;
 	LIST_HEAD(dead_children);
 	write_lock_irq(&tasklist_lock);
 	/*
 	 * Note that exit_ptrace() and find_new_reaper() might
 	 * drop tasklist_lock and reacquire it.
 	 */
 	exit_ptrace(father);
 	reaper = find_new_reaper(father);
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
 		struct task_struct *t = p;
 		/* Update every thread of the child, not just its leader. */
 		do {
 			t->real_parent = reaper;
 			if (t->parent == father) {
 				BUG_ON(t->ptrace);
 				t->parent = t->real_parent;
 			}
 			if (t->pdeath_signal)
 				group_send_sig_info(t->pdeath_signal,
 						    SEND_SIG_NOINFO, t);
 		} while_each_thread(p, t);
 		reparent_leader(father, p, &dead_children);
 	}
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
 	/* release_task() takes tasklist_lock itself, so run it unlocked. */
 	list_for_each_entry_safe(p, n, &dead_children, sibling) {
 		list_del_init(&p->sibling);
 		release_task(p);
 	}
 }
 /*
  * Send signals to all our closest relatives so that they know
  * to properly mourn us..
  *
  * @tsk:        the exiting task
  * @group_dead: non-zero when the whole thread group is gone (in do_exit()
  *              this is the result of dropping signal->live to zero)
  *
  * Sets tsk->exit_state to EXIT_DEAD when the task is to be auto-reaped and
  * to EXIT_ZOMBIE otherwise; an auto-reaped task is released here.
  */
 static void exit_notify(struct task_struct *tsk, int group_dead)
 {
 	bool autoreap;
 	/*
 	 * This does two things:
 	 *
 	 * A.  Make init inherit all the child processes
 	 * B.  Check to see if any process groups have become orphaned
 	 *	as a result of our exiting, and if they have any stopped
 	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 	 */
 	forget_original_parent(tsk);
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 	if (unlikely(tsk->ptrace)) {
 		/*
 		 * A traced task reports its own exit_signal only when it is a
 		 * lone group leader that was not reparented by ptrace;
 		 * otherwise SIGCHLD is used.
 		 */
 		int sig = thread_group_leader(tsk) &&
 				thread_group_empty(tsk) &&
 				!ptrace_reparented(tsk) ?
 			tsk->exit_signal : SIGCHLD;
 		autoreap = do_notify_parent(tsk, sig);
 	} else if (thread_group_leader(tsk)) {
 		/* A leader with live sub-threads is neither notified nor reaped yet. */
 		autoreap = thread_group_empty(tsk) &&
 			do_notify_parent(tsk, tsk->exit_signal);
 	} else {
 		/* Untraced non-leader threads are always reaped here. */
 		autoreap = true;
 	}
 	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
 		wake_up_process(tsk->signal->group_exit_task);
 	write_unlock_irq(&tasklist_lock);
 	/* If the process is dead, release it - nobody will wait for it */
 	if (autoreap)
 		release_task(tsk);
 }
 #ifdef CONFIG_DEBUG_STACK_USAGE
 /*
  * Track the smallest amount of unused stack seen so far and print a
  * warning each time the current task sets a new low.
  *
  * The first comparison is done without the lock as a cheap filter; the
  * value is compared again under low_water_lock before it is updated.
  */
 static void check_stack_usage(void)
 {
 	static DEFINE_SPINLOCK(low_water_lock);
 	static int lowest_to_date = THREAD_SIZE;
 	unsigned long free;
 	free = stack_not_used(current);
 	if (free >= lowest_to_date)
 		return;
 	spin_lock(&low_water_lock);
 	if (free < lowest_to_date) {
 		pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
 			current->comm, task_pid_nr(current), free);
 		lowest_to_date = free;
 	}
 	spin_unlock(&low_water_lock);
 }
 #else
 /* Without CONFIG_DEBUG_STACK_USAGE the check is an empty stub. */
 static inline void check_stack_usage(void) {}
 #endif
 /*
  * Terminate the current task with exit code @code.  Does not return.
  *
  * In order: sanity checks (interrupt context and idle task panic), marks
  * the task PF_EXITING via exit_signals(), collects accounting, records
  * tsk->exit_code, and then tears down the mm, semaphores, shm, files, fs,
  * namespaces, task work, thread state, perf events and cgroup state.
  * exit_notify() then informs relatives and sets the exit state.  After the
  * remaining per-task resources are dropped the task sets TASK_DEAD and
  * calls schedule() for the last time.
  */
 void do_exit(long code)
 {
 	struct task_struct *tsk = current;
 	int group_dead;
 	TASKS_RCU(int tasks_rcu_i);
 	profile_task_exit(tsk);
 	WARN_ON(blk_needs_flush_plug(tsk));
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
 	/*
 	 * If do_exit is called because this process oopsed, it's possible
 	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
 	 * continuing. Amongst other possible reasons, this is to prevent
 	 * mm_release()->clear_child_tid() from writing to a user-controlled
 	 * kernel address.
 	 */
 	set_fs(USER_DS);
 	ptrace_event(PTRACE_EVENT_EXIT, code);
 	validate_creds_for_do_exit(tsk);
 	/*
 	 * We're taking recursive faults here in do_exit. Safest is to just
 	 * leave this task alone and wait for reboot.
 	 */
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		pr_alert("Fixing recursive fault but reboot is needed!\n");
 		/*
 		 * We can do this unlocked here. The futex code uses
 		 * this flag just to verify whether the pi state
 		 * cleanup has been done or not. In the worst case it
 		 * loops once more. We pretend that the cleanup was
 		 * done as there is no way to return. Either the
 		 * OWNER_DIED bit is set by now or we push the blocked
 		 * task into the wait for ever nirvana as well.
 		 */
 		tsk->flags |= PF_EXITPIDONE;
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
 	 * tsk->flags are checked in the futex code to protect against
 	 * an exiting task cleaning up the robust pi futexes.
 	 */
 	smp_mb();
 	raw_spin_unlock_wait(&tsk->pi_lock);
 	if (unlikely(in_atomic()))
 		pr_info("note: %s[%d] exited with preempt_count %d\n",
 			current->comm, task_pid_nr(current),
 			preempt_count());
 	acct_update_integrals(tsk);
 	/* sync mm's RSS info before statistics gathering */
 	if (tsk->mm)
 		sync_mm_rss(tsk->mm);
 	/* True only for the task that drops signal->live to zero. */
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
 		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 		if (tsk->mm)
 			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
 	}
 	acct_collect(code, group_dead);
 	if (group_dead)
 		tty_audit_exit();
 	audit_free(tsk);
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 	exit_mm(tsk);
 	if (group_dead)
 		acct_process();
 	trace_sched_process_exit(tsk);
 	exit_sem(tsk);
 	exit_shm(tsk);
 	exit_files(tsk);
 	exit_fs(tsk);
 	if (group_dead)
 		disassociate_ctty(1);
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread();
 	/*
 	 * Flush inherited counters to the parent - before the parent
 	 * gets woken up by child-exit notifications.
 	 *
 	 * because of cgroup mode, must be called before cgroup_exit()
 	 */
 	perf_event_exit_task(tsk);
 	cgroup_exit(tsk);
 	module_put(task_thread_info(tsk)->exec_domain->module);
 	/*
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
 	flush_ptrace_hw_breakpoint(tsk);
 	TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
 	exit_notify(tsk, group_dead);
 	proc_exit_connector(tsk);
 #ifdef CONFIG_NUMA
 	task_lock(tsk);
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 	task_unlock(tsk);
 #endif
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
 		kfree(current->pi_state_cache);
 #endif
 	/*
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held();
 	/*
 	 * We can do this unlocked here. The futex code uses this flag
 	 * just to verify whether the pi state cleanup has been done
 	 * or not. In the worst case it loops once more.
 	 */
 	tsk->flags |= PF_EXITPIDONE;
 	if (tsk->io_context)
 		exit_io_context(tsk);
 	if (tsk->splice_pipe)
 		free_pipe_info(tsk->splice_pipe);
 	if (tsk->task_frag.page)
 		put_page(tsk->task_frag.page);
 	validate_creds_for_do_exit(tsk);
 	check_stack_usage();
 	preempt_disable();
 	if (tsk->nr_dirtied)
 		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
 	/*
 	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
 	 * when the following two conditions become true.
 	 *   - There is a race condition on mmap_sem (it is acquired by
 	 *     exit_mm()), and
 	 *   - SMI occurs before setting TASK_RUNNING.
 	 *     (or hypervisor of virtual machine switches to other guest)
 	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
 	 *
 	 * To avoid it, we have to wait for releasing tsk->pi_lock which
 	 * is held by try_to_wake_up()
 	 */
 	smp_mb();
 	raw_spin_unlock_wait(&tsk->pi_lock);
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
 	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
 	for (;;)
 		cpu_relax();	/* For when BUG is null */
 }
 EXPORT_SYMBOL_GPL(do_exit);
 void complete_and_exit(struct completion *comp, long code)
 {
 	if (comp)
 		complete(comp);
 	do_exit(code);
 }
 EXPORT_SYMBOL(complete_and_exit);
 /*
  * exit(2): terminate the calling task.  Only the low 8 bits of
  * @error_code are kept; they are placed in bits 8-15 of the code handed
  * to do_exit().
  */
 SYSCALL_DEFINE1(exit, int, error_code)
 {
 	int status = (error_code & 0xff) << 8;
 	do_exit(status);
 }
 /*
  * Take down every thread in the group.  This is called by fatal signals
  * as well as by sys_exit_group (below).
  *
  * If a group exit is already in progress its recorded group_exit_code
  * replaces @exit_code.  Otherwise, when other threads exist, this thread
  * records @exit_code, sets SIGNAL_GROUP_EXIT and zaps the other threads
  * under ->siglock.  In every case the caller then exits via do_exit().
  */
 void
 do_group_exit(int exit_code)
 {
 	struct signal_struct *sig = current->signal;
 	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
 	if (signal_group_exit(sig))
 		exit_code = sig->group_exit_code;
 	else if (!thread_group_empty(current)) {
 		struct sighand_struct *const sighand = current->sighand;
 		spin_lock_irq(&sighand->siglock);
 		/* Re-check under the lock: the unlocked test above can race. */
 		if (signal_group_exit(sig))
 			/* Another thread got here before we took the lock.  */
 			exit_code = sig->group_exit_code;
 		else {
 			sig->group_exit_code = exit_code;
 			sig->flags = SIGNAL_GROUP_EXIT;
 			zap_other_threads(current);
 		}
 		spin_unlock_irq(&sighand->siglock);
 	}
 	do_exit(exit_code);
 	/* NOTREACHED */
 }
 /*
  * exit_group(2): kill every thread in the caller's thread group.  Any
  * process wait4()-ing on the group from outside sees the correct exit
  * code even when the caller is not the thread group leader.
  */
 SYSCALL_DEFINE1(exit_group, int, error_code)
 {
 	int status = (error_code & 0xff) << 8;
 	do_group_exit(status);
 	/* NOTREACHED */
 	return 0;
 }
 /*
  * Parameters and scratch state for one wait request.  Filled in by the
  * wait4()/waitid() entry points and consumed by do_wait() and its
  * helpers.
  */
 struct wait_opts {
 	/* pid type to match against; PIDTYPE_MAX matches every task */
 	enum pid_type		wo_type;
 	/* W* / __W* option bits supplied by the caller */
 	int			wo_flags;
 	/* pid to match when wo_type is not PIDTYPE_MAX (may be NULL) */
 	struct pid		*wo_pid;
 	/* user siginfo to fill in, or NULL */
 	struct siginfo __user	*wo_info;
 	/* user location for the wait status word, or NULL */
 	int __user		*wo_stat;
 	/* user rusage buffer, or NULL */
 	struct rusage __user	*wo_rusage;
 	/* wait queue entry that do_wait() queues on ->wait_chldexit */
 	wait_queue_t		child_wait;
 	/* value do_wait() returns when no task produced a result */
 	int			notask_error;
 };
 /*
  * Return the struct pid of kind @type for @task.  Every type other
  * than PIDTYPE_PID is looked up on the thread group leader.
  */
 static inline
 struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 {
 	struct task_struct *owner;
 	owner = (type == PIDTYPE_PID) ? task : task->group_leader;
 	return owner->pids[type].pid;
 }
 /*
  * Does @p match the pid selection in @wo?  PIDTYPE_MAX matches every
  * task; otherwise @p's pid of the requested type must equal wo_pid.
  */
 static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
 	if (wo->wo_type == PIDTYPE_MAX)
 		return 1;
 	return task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 /*
  * Decide whether @p is a task this wait request may report on.
  *
  * @p must pass eligible_pid().  After that:
  *   - __WALL accepts both clone and non-clone children;
  *   - otherwise __WCLONE accepts clone children only;
  *   - otherwise only non-clone children are accepted.
  * A "clone" child is one whose exit_signal is not SIGCHLD.
  *
  * Returns 1 when eligible, 0 when not.
  */
 static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 {
 	int is_clone;
 	int wants_clone;
 	if (!eligible_pid(wo, p))
 		return 0;
 	if (wo->wo_flags & __WALL)
 		return 1;
 	is_clone = (p->exit_signal != SIGCHLD);
 	wants_clone = !!(wo->wo_flags & __WCLONE);
 	return is_clone == wants_clone;
 }
 /*
  * Copy wait results for @p to userspace and drop one reference on @p.
  *
  * Fills wo->wo_rusage when set, then the siginfo fields of wo->wo_info
  * when set, stopping at the first copy that fails.  The reference on
  * @p is dropped right after the rusage step regardless of its result.
  *
  * Returns @pid when every step succeeded, otherwise the first error.
  */
 static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
 				pid_t pid, uid_t uid, int why, int status)
 {
 	struct siginfo __user *info;
 	int err = 0;
 	if (wo->wo_rusage)
 		err = getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
 	put_task_struct(p);
 	info = wo->wo_info;
 	if (err || !info)
 		goto out;
 	err = put_user(SIGCHLD, &info->si_signo);
 	if (err)
 		goto out;
 	err = put_user(0, &info->si_errno);
 	if (err)
 		goto out;
 	err = put_user((short)why, &info->si_code);
 	if (err)
 		goto out;
 	err = put_user(pid, &info->si_pid);
 	if (err)
 		goto out;
 	err = put_user(uid, &info->si_uid);
 	if (err)
 		goto out;
 	err = put_user(status, &info->si_status);
 out:
 	return err ? err : pid;
 }
 /*
  * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
  * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  *
  * A nonzero return is @p's pid (as seen from the caller's pid namespace)
  * on success or a negative errno from one of the user copies.
  */
 static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
 {
 	unsigned long state;
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
 	struct siginfo __user *infop;
 	/* Zombies are only reported to callers that asked for WEXITED. */
 	if (!likely(wo->wo_flags & WEXITED))
 		return 0;
 	/*
 	 * WNOWAIT: report the exit status without changing exit_state.
 	 * A reference is taken so the task stays valid once tasklist_lock
 	 * is dropped; wait_noreap_copyout() releases it.
 	 */
 	if (unlikely(wo->wo_flags & WNOWAIT)) {
 		int exit_code = p->exit_code;
 		int why;
 		get_task_struct(p);
 		read_unlock(&tasklist_lock);
 		/* Low 7 bits clear: normal exit, code lives in bits 8 and up. */
 		if ((exit_code & 0x7f) == 0) {
 			why = CLD_EXITED;
 			status = exit_code >> 8;
 		} else {
 			/* Bit 0x80 selects CLD_DUMPED over CLD_KILLED. */
 			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
 			status = exit_code & 0x7f;
 		}
 		return wait_noreap_copyout(wo, p, pid, uid, why, status);
 	}
 	traced = ptrace_reparented(p);
 	/*
 	 * Move the task's state to DEAD/TRACE, only one thread can do this.
 	 */
 	state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD;
 	/* Losing the cmpxchg means another waiter already claimed @p. */
 	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
 		return 0;
 	/*
 	 * It can be ptraced but not reparented, check
 	 * thread_group_leader() to filter out sub-threads.
 	 */
 	if (likely(!traced) && thread_group_leader(p)) {
 		struct signal_struct *psig;
 		struct signal_struct *sig;
 		unsigned long maxrss;
 		cputime_t tgutime, tgstime;
 		/*
 		 * The resource counters for the group leader are in its
 		 * own task_struct.  Those for dead threads in the group
 		 * are in its signal_struct, as are those for the child
 		 * processes it has previously reaped.  All these
 		 * accumulate in the parent's signal_struct c* fields.
 		 *
 		 * We don't bother to take a lock here to protect these
 		 * p->signal fields, because they are only touched by
 		 * __exit_signal, which runs with tasklist_lock
 		 * write-locked anyway, and so is excluded here.  We do
 		 * need to protect the access to parent->signal fields,
 		 * as other threads in the parent group can be right
 		 * here reaping other children at the same time.
 		 *
 		 * We use thread_group_cputime_adjusted() to get times for
 		 * the thread group, which consolidates times for all threads
 		 * in the group including the group leader.
 		 */
 		thread_group_cputime_adjusted(p, &tgutime, &tgstime);
 		spin_lock_irq(&p->real_parent->sighand->siglock);
 		psig = p->real_parent->signal;
 		sig = p->signal;
 		write_seqlock(&psig->stats_lock);
 		psig->cutime += tgutime + sig->cutime;
 		psig->cstime += tgstime + sig->cstime;
 		psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
 		psig->cmin_flt +=
 			p->min_flt + sig->min_flt + sig->cmin_flt;
 		psig->cmaj_flt +=
 			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
 		psig->cnvcsw +=
 			p->nvcsw + sig->nvcsw + sig->cnvcsw;
 		psig->cnivcsw +=
 			p->nivcsw + sig->nivcsw + sig->cnivcsw;
 		psig->cinblock +=
 			task_io_get_inblock(p) +
 			sig->inblock + sig->cinblock;
 		psig->coublock +=
 			task_io_get_oublock(p) +
 			sig->oublock + sig->coublock;
 		/* cmaxrss is a high-water mark, not a sum. */
 		maxrss = max(sig->maxrss, sig->cmaxrss);
 		if (psig->cmaxrss < maxrss)
 			psig->cmaxrss = maxrss;
 		task_io_accounting_add(&psig->ioac, &p->ioac);
 		task_io_accounting_add(&psig->ioac, &sig->ioac);
 		write_sequnlock(&psig->stats_lock);
 		spin_unlock_irq(&p->real_parent->sighand->siglock);
 	}
 	/*
 	 * Now we are sure this task is interesting, and no other
 	 * thread can reap it because we set its state to DEAD/TRACE.
 	 */
 	read_unlock(&tasklist_lock);
 	retval = wo->wo_rusage
 		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 	/* A group exit reports the group-wide code, not the task's own. */
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
 	/* Each copy below is skipped once an earlier step has failed. */
 	if (!retval && wo->wo_stat)
 		retval = put_user(status, wo->wo_stat);
 	infop = wo->wo_info;
 	if (!retval && infop)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval && infop)
 		retval = put_user(0, &infop->si_errno);
 	if (!retval && infop) {
 		int why;
 		/* Same status decoding as in the WNOWAIT path above. */
 		if ((status & 0x7f) == 0) {
 			why = CLD_EXITED;
 			status >>= 8;
 		} else {
 			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
 			status &= 0x7f;
 		}
 		retval = put_user((short)why, &infop->si_code);
 		if (!retval)
 			retval = put_user(status, &infop->si_status);
 	}
 	if (!retval && infop)
 		retval = put_user(pid, &infop->si_pid);
 	if (!retval && infop)
 		retval = put_user(uid, &infop->si_uid);
 	if (!retval)
 		retval = pid;
 	if (state == EXIT_TRACE) {
 		write_lock_irq(&tasklist_lock);
 		/* We dropped tasklist, ptracer could die and untrace */
 		ptrace_unlink(p);
 		/* If parent wants a zombie, don't release it now */
 		state = EXIT_ZOMBIE;
 		if (do_notify_parent(p, p->exit_signal))
 			state = EXIT_DEAD;
 		p->exit_state = state;
 		write_unlock_irq(&tasklist_lock);
 	}
 	/* Only a task that ended up EXIT_DEAD is released here. */
 	if (state == EXIT_DEAD)
 		release_task(p);
 	return retval;
 }
 /*
  * Return a pointer to the stop code to report for @p, or NULL when @p
  * is not in the relevant stopped state.
  *
  * For a ptrace wait the code is p->exit_code, available while @p is
  * stopped or traced and JOBCTL_LISTENING is clear.  Otherwise it is
  * the signal struct's group_exit_code, available while
  * SIGNAL_STOP_STOPPED is set.
  */
 static int *task_stopped_code(struct task_struct *p, bool ptrace)
 {
 	if (!ptrace) {
 		if (p->signal->flags & SIGNAL_STOP_STOPPED)
 			return &p->signal->group_exit_code;
 		return NULL;
 	}
 	if (!task_is_stopped_or_traced(p))
 		return NULL;
 	if (p->jobctl & JOBCTL_LISTENING)
 		return NULL;
 	return &p->exit_code;
 }
 /**
  * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
  * @wo: wait options
  * @ptrace: is the wait for ptrace
  * @p: task to wait for
  *
  * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
  *
  * CONTEXT:
  * read_lock(&tasklist_lock), which is released if return value is
  * non-zero.  Also, grabs and releases @p->sighand->siglock.
  *
  * RETURNS:
  * 0 if wait condition didn't exist and search for other wait conditions
  * should continue.  Non-zero return, -errno on failure and @p's pid on
  * success, implies that tasklist_lock is released and wait condition
  * search should terminate.
  */
 static int wait_task_stopped(struct wait_opts *wo,
 				int ptrace, struct task_struct *p)
 {
 	struct siginfo __user *infop;
 	int retval, exit_code, *p_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 	/*
 	 * Traditionally we see ptrace'd stopped tasks regardless of options.
 	 */
 	if (!ptrace && !(wo->wo_flags & WUNTRACED))
 		return 0;
 	/* Cheap check without siglock; repeated under the lock below. */
 	if (!task_stopped_code(p, ptrace))
 		return 0;
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 	p_code = task_stopped_code(p, ptrace);
 	if (unlikely(!p_code))
 		goto unlock_sig;
 	exit_code = *p_code;
 	/* A zero code means there is nothing to report. */
 	if (!exit_code)
 		goto unlock_sig;
 	/* Consume the stop code unless the caller passed WNOWAIT. */
 	if (!unlikely(wo->wo_flags & WNOWAIT))
 		*p_code = 0;
 	uid = from_kuid_munged(current_user_ns(), task_uid(p));
 unlock_sig:
 	spin_unlock_irq(&p->sighand->siglock);
 	if (!exit_code)
 		return 0;
 	/*
 	 * Now we are pretty sure this task is interesting.
 	 * Make sure it doesn't get reaped out from under us while we
 	 * give up the lock and then examine it below.  We don't want to
 	 * keep holding onto the tasklist_lock while we call getrusage and
 	 * possibly take page faults for user memory.
 	 */
 	get_task_struct(p);
 	pid = task_pid_vnr(p);
 	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 	/* wait_noreap_copyout() drops the reference taken above. */
 	if (unlikely(wo->wo_flags & WNOWAIT))
 		return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
 	/* Each copy below is skipped once an earlier step has failed. */
 	retval = wo->wo_rusage
 		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 	/* Status word: stop code in bits 8 and up, 0x7f in the low byte. */
 	if (!retval && wo->wo_stat)
 		retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
 	infop = wo->wo_info;
 	if (!retval && infop)
 		retval = put_user(SIGCHLD, &infop->si_signo);
 	if (!retval && infop)
 		retval = put_user(0, &infop->si_errno);
 	if (!retval && infop)
 		retval = put_user((short)why, &infop->si_code);
 	if (!retval && infop)
 		retval = put_user(exit_code, &infop->si_status);
 	if (!retval && infop)
 		retval = put_user(pid, &infop->si_pid);
 	if (!retval && infop)
 		retval = put_user(uid, &infop->si_uid);
 	if (!retval)
 		retval = pid;
 	put_task_struct(p);
 	/*
 	 * Returning zero here would tell the caller that tasklist_lock is
 	 * still held, which is no longer true.
 	 */
 	BUG_ON(!retval);
 	return retval;
 }
 /*
  * Handle do_wait work for one task in a live, non-stopped state.
  * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  *
  * Only reports anything when the caller asked for WCONTINUED and the
  * task's signal struct has SIGNAL_STOP_CONTINUED set.
  */
 static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 {
 	int retval;
 	pid_t pid;
 	uid_t uid;
 	if (!unlikely(wo->wo_flags & WCONTINUED))
 		return 0;
 	/* Cheap check without siglock; repeated under the lock below. */
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
 		return 0;
 	spin_lock_irq(&p->sighand->siglock);
 	/* Re-check with the lock held.  */
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
 	/* Consume the continued event unless the caller passed WNOWAIT. */
 	if (!unlikely(wo->wo_flags & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	uid = from_kuid_munged(current_user_ns(), task_uid(p));
 	spin_unlock_irq(&p->sighand->siglock);
 	pid = task_pid_vnr(p);
 	/* Hold a reference so @p stays valid after tasklist_lock is dropped. */
 	get_task_struct(p);
 	read_unlock(&tasklist_lock);
 	if (!wo->wo_info) {
 		/* No siginfo requested: report rusage and status word 0xffff. */
 		retval = wo->wo_rusage
 			? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
 		put_task_struct(p);
 		if (!retval && wo->wo_stat)
 			retval = put_user(0xffff, wo->wo_stat);
 		if (!retval)
 			retval = pid;
 	} else {
 		/* wait_noreap_copyout() drops the reference taken above. */
 		retval = wait_noreap_copyout(wo, p, pid, uid,
 					     CLD_CONTINUED, SIGCONT);
 		BUG_ON(retval == 0);
 	}
 	return retval;
 }
 /*
  * Consider @p for a wait by @parent.
  *
  * -ECHILD should be in ->notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue;
  * then ->notask_error is 0 if @p is an eligible child,
  * or another error from security_task_wait(), or still -ECHILD.
  */
 static int wait_consider_task(struct wait_opts *wo, int ptrace,
 				struct task_struct *p)
 {
+	/*
+	 * We can race with wait_task_zombie() from another thread.
+	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
+	 * can't confuse the checks below.
+	 */
+	int exit_state = ACCESS_ONCE(p->exit_state);
 	int ret;
-	if (unlikely(p->exit_state == EXIT_DEAD))
+	if (unlikely(exit_state == EXIT_DEAD))
 		return 0;
 	ret = eligible_child(wo, p);
 	if (!ret)
 		return ret;
 	ret = security_task_wait(p);
 	if (unlikely(ret < 0)) {
 		/*
 		 * If we have not yet seen any eligible child,
 		 * then let this error code replace -ECHILD.
 		 * A permission error will give the user a clue
 		 * to look for security policy problems, rather
 		 * than for mysterious wait bugs.
 		 */
 		if (wo->notask_error)
 			wo->notask_error = ret;
 		return 0;
 	}
-	if (unlikely(p->exit_state == EXIT_TRACE)) {
+	if (unlikely(exit_state == EXIT_TRACE)) {
 		/*
 		 * ptrace == 0 means we are the natural parent. In this case
 		 * we should clear notask_error, debugger will notify us.
 		 */
 		if (likely(!ptrace))
 			wo->notask_error = 0;
 		return 0;
 	}
 	if (likely(!ptrace) && unlikely(p->ptrace)) {
 		/*
 		 * If it is traced by its real parent's group, just pretend
 		 * the caller is ptrace_do_wait() and reap this child if it
 		 * is zombie.
 		 *
 		 * This also hides group stop state from real parent; otherwise
 		 * a single stop can be reported twice as group and ptrace stop.
 		 * If a ptracer wants to distinguish these two events for its
 		 * own children it should create a separate process which takes
 		 * the role of real parent.
 		 */
 		if (!ptrace_reparented(p))
 			ptrace = 1;
 	}
 	/* slay zombie? */
-	if (p->exit_state == EXIT_ZOMBIE) {
+	if (exit_state == EXIT_ZOMBIE) {
 		/* we don't reap group leaders with subthreads */
 		if (!delay_group_leader(p)) {
 			/*
 			 * A zombie ptracee is only visible to its ptracer.
 			 * Notification and reaping will be cascaded to the
 			 * real parent when the ptracer detaches.
 			 */
 			if (unlikely(ptrace) || likely(!p->ptrace))
 				return wait_task_zombie(wo, p);
 		}
 		/*
 		 * Allow access to stopped/continued state via zombie by
 		 * falling through.  Clearing of notask_error is complex.
 		 *
 		 * When !@ptrace:
 		 *
 		 * If WEXITED is set, notask_error should naturally be
 		 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
 		 * so, if there are live subthreads, there are events to
 		 * wait for.  If all subthreads are dead, it's still safe
 		 * to clear - this function will be called again in finite
 		 * amount time once all the subthreads are released and
 		 * will then return without clearing.
 		 *
 		 * When @ptrace:
 		 *
 		 * Stopped state is per-task and thus can't change once the
 		 * target task dies.  Only continued and exited can happen.
 		 * Clear notask_error if WCONTINUED | WEXITED.
 		 */
 		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
 			wo->notask_error = 0;
 	} else {
 		/*
 		 * @p is alive and it's gonna stop, continue or exit, so
 		 * there always is something to wait for.
 		 */
 		wo->notask_error = 0;
 	}
 	/*
 	 * Wait for stopped.  Depending on @ptrace, different stopped state
 	 * is used and the two don't interact with each other.
 	 */
 	ret = wait_task_stopped(wo, ptrace, p);
 	if (ret)
 		return ret;
 	/*
 	 * Wait for continued.  There's only one continued state and the
 	 * ptracer can consume it which can confuse the real parent.  Don't
 	 * use WCONTINUED from ptracer.  You don't need or want it.
 	 */
 	return wait_task_continued(wo, p);
 }
 /*
  * Do the work of do_wait() for one thread in the group, @tsk, by
  * offering each entry on @tsk's children list to wait_consider_task().
  *
  * -ECHILD should be in ->notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then
  * ->notask_error is 0 if there were any eligible children,
  * or another error from security_task_wait(), or still -ECHILD.
  */
 static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
 {
 	struct task_struct *child;
 	int ret = 0;
 	list_for_each_entry(child, &tsk->children, sibling) {
 		ret = wait_consider_task(wo, 0, child);
 		if (ret)
 			break;
 	}
 	return ret;
 }
 /*
  * Counterpart of do_wait_thread() for tracees: offer each entry on
  * @tsk's ptraced list to wait_consider_task() with ptrace set.
  * Stops at, and returns, the first nonzero result; returns zero when
  * the whole list was walked without one.
  */
 static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
 {
 	struct task_struct *tracee;
 	int ret = 0;
 	list_for_each_entry(tracee, &tsk->ptraced, ptrace_entry) {
 		ret = wait_consider_task(wo, 1, tracee);
 		if (ret)
 			break;
 	}
 	return ret;
 }
 /*
  * Wake function installed on the wait_opts queue entry.  @key is the
  * task the wakeup is about.  The waiter is woken only when that task
  * matches the pid selection and, under __WNOTHREAD, only when its
  * parent is the waiting thread itself.  Returns 0 when the wakeup is
  * filtered out, otherwise the result of default_wake_function().
  */
 static int child_wait_callback(wait_queue_t *wait, unsigned mode,
 				int sync, void *key)
 {
 	struct task_struct *p = key;
 	struct wait_opts *wo = container_of(wait, struct wait_opts,
 						child_wait);
 	if (eligible_pid(wo, p) &&
 	    (!(wo->wo_flags & __WNOTHREAD) || wait->private == p->parent))
 		return default_wake_function(wait, mode, sync, key);
 	return 0;
 }
 void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
 {
 	__wake_up_sync_key(&parent->signal->wait_chldexit,
 				TASK_INTERRUPTIBLE, 1, p);
 }
 /*
  * Common back end of the wait4()/waitid() system calls.
  *
  * Queues wo->child_wait on the caller's wait_chldexit queue, then
  * repeatedly scans the children and ptraced lists of every thread in
  * the caller's group (only the caller itself under __WNOTHREAD) until a
  * helper reports a result, nothing eligible is left, WNOHANG applies,
  * or a signal is pending.
  *
  * Returns the nonzero value produced by a wait_* helper, or
  * ->notask_error when nothing was reported (0 under WNOHANG with
  * eligible tasks present), or -ERESTARTSYS when interrupted by a signal.
  */
 static long do_wait(struct wait_opts *wo)
 {
 	struct task_struct *tsk;
 	int retval;
 	trace_sched_process_wait(wo->wo_pid);
 	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
 	wo->child_wait.private = current;
 	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 repeat:
 	/*
 	 * If there is nothing that can match our criteria just get out.
 	 * We will clear ->notask_error to zero if we see any child that
 	 * might later match our criteria, even if we are not able to reap
 	 * it yet.
 	 */
 	wo->notask_error = -ECHILD;
 	/* A specific pid was requested but no task is attached to it. */
 	if ((wo->wo_type < PIDTYPE_MAX) &&
 	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
 		goto notask;
 	set_current_state(TASK_INTERRUPTIBLE);
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
 		/* A nonzero result means the helper already dropped tasklist_lock. */
 		retval = do_wait_thread(wo, tsk);
 		if (retval)
 			goto end;
 		retval = ptrace_do_wait(wo, tsk);
 		if (retval)
 			goto end;
 		if (wo->wo_flags & __WNOTHREAD)
 			break;
 	} while_each_thread(current, tsk);
 	read_unlock(&tasklist_lock);
 notask:
 	retval = wo->notask_error;
 	/* Something may become waitable later: sleep and rescan. */
 	if (!retval && !(wo->wo_flags & WNOHANG)) {
 		retval = -ERESTARTSYS;
 		if (!signal_pending(current)) {
 			schedule();
 			goto repeat;
 		}
 	}
 end:
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
 	return retval;
 }
 /*
  * waitid(2): wait for a state change selected by @which / @upid.
  *
  * @options may only contain WNOHANG, WNOWAIT, WEXITED, WSTOPPED and
  * WCONTINUED, and must contain at least one of the last three.
  * P_ALL matches any task; P_PID and P_PGID require @upid > 0.
  *
  * Returns 0 when do_wait() reported a task, -EINVAL for bad arguments,
  * otherwise do_wait()'s result.  When that result is 0 and @infop is
  * set, the siginfo fields are zeroed so the caller can tell that
  * nothing was reported; a failure of that copy is returned instead.
  */
 SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 		infop, int, options, struct rusage __user *, ru)
 {
 	struct wait_opts wo;
 	struct pid *pid = NULL;
 	enum pid_type type;
 	long ret;
 	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
 		return -EINVAL;
 	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
 		return -EINVAL;
 	switch (which) {
 	case P_ALL:
 		type = PIDTYPE_MAX;
 		break;
 	case P_PID:
 		type = PIDTYPE_PID;
 		break;
 	case P_PGID:
 		type = PIDTYPE_PGID;
 		break;
 	default:
 		return -EINVAL;
 	}
 	/* Both id-based selectors need a strictly positive id. */
 	if (type < PIDTYPE_MAX) {
 		if (upid <= 0)
 			return -EINVAL;
 		pid = find_get_pid(upid);
 	}
 	wo.wo_type	= type;
 	wo.wo_pid	= pid;
 	wo.wo_flags	= options;
 	wo.wo_info	= infop;
 	wo.wo_stat	= NULL;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
 	if (ret > 0) {
 		ret = 0;
 	} else if (!ret && infop) {
 		/*
 		 * For a WNOHANG return, clear out all the fields
 		 * we would set so the user can easily tell the
 		 * difference.
 		 */
 		ret = put_user(0, &infop->si_signo);
 		if (!ret)
 			ret = put_user(0, &infop->si_errno);
 		if (!ret)
 			ret = put_user(0, &infop->si_code);
 		if (!ret)
 			ret = put_user(0, &infop->si_pid);
 		if (!ret)
 			ret = put_user(0, &infop->si_uid);
 		if (!ret)
 			ret = put_user(0, &infop->si_status);
 	}
 	put_pid(pid);
 	return ret;
 }
 /*
  * wait4(2): wait for a state change in a task selected by @upid.
  *
  *   @upid == -1   match any pid
  *   @upid <  -1   match process group -@upid
  *   @upid ==  0   match the caller's own process group
  *   @upid >   0   match the task whose pid is @upid
  *
  * WEXITED is always added to @options before calling do_wait(), whose
  * result is returned unchanged.
  *
  * Returns -EINVAL when @options contains bits other than WNOHANG,
  * WUNTRACED, WCONTINUED, __WNOTHREAD, __WCLONE and __WALL, and -ESRCH
  * when @upid is INT_MIN (no such process group can exist, and negating
  * INT_MIN is undefined behaviour).
  */
 SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		int, options, struct rusage __user *, ru)
 {
 	struct wait_opts wo;
 	struct pid *pid = NULL;
 	enum pid_type type;
 	long ret;
 	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
 			__WNOTHREAD|__WCLONE|__WALL))
 		return -EINVAL;
 	/* -INT_MIN is not defined; reject it before negating below. */
 	if (upid == INT_MIN)
 		return -ESRCH;
 	if (upid == -1)
 		type = PIDTYPE_MAX;
 	else if (upid < 0) {
 		type = PIDTYPE_PGID;
 		pid = find_get_pid(-upid);
 	} else if (upid == 0) {
 		type = PIDTYPE_PGID;
 		pid = get_task_pid(current, PIDTYPE_PGID);
 	} else /* upid > 0 */ {
 		type = PIDTYPE_PID;
 		pid = find_get_pid(upid);
 	}
 	wo.wo_type	= type;
 	wo.wo_pid	= pid;
 	wo.wo_flags	= options | WEXITED;
 	wo.wo_info	= NULL;
 	wo.wo_stat	= stat_addr;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
 	put_pid(pid);
 	return ret;
 }
 #ifdef __ARCH_WANT_SYS_WAITPID
 /*
  * Legacy waitpid() entry point, kept for compatibility only: it is
  * wait4() without an rusage buffer.  libc should implement waitpid()
  * on top of sys_wait4() instead.
  */
 SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
 {
 	long ret = sys_wait4(pid, stat_addr, options, NULL);
 	return ret;
 }
 #endif