Commit 93a9eb39fad1b5fc9077776caa3af207883b254d

Authored by Naoya Horiguchi
Committed by Linus Torvalds
1 parent 0193ed8225

hwpoison: fix hugetlbfs/thp precheck in hwpoison_user_mappings()

A recent fix from Chen Yucong, commit 0bc1f8b0682c ("hwpoison: fix the
handling path of the victimized page frame that belong to non-LRU"),
rejects going into the unmapping operation for hugetlbfs/thp pages, which
results in failing error containment on such pages.  This patch fixes it.

With this patch, the hwpoison functional tests in the mce-test testsuite pass.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Chen Yucong <slaoub@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file, mm/memory-failure.c, with 7 additions and 1 deletion.
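Background on the bug (an annotation; the reasoning below is inferred from the code, not quoted from the commit): hugetlb pages are never placed on the LRU lists, and for a transparent huge page only the head page carries PG_lru, so testing !PageLRU(p) on the raw poisoned page made hwpoison_user_mappings() return early for such pages, before try_to_unmap() and the SIGBUS logic could run. A minimal annotated sketch of the old and new prechecks:

	/*
	 * Old precheck: any page without PG_lru bailed out. A hugetlb
	 * page never has PG_lru set, and a THP tail page does not
	 * either (only the head page does), so user-mapped huge pages
	 * were never unmapped and their users never signalled.
	 */
	if (PageReserved(p) || PageSlab(p) || !PageLRU(p))
		return SWAP_SUCCESS;

	/*
	 * New precheck: still skip kernel pages, but test LRU
	 * membership on the compound head and accept hugetlb pages
	 * explicitly, so huge pages fall through to the unmapping and
	 * kill logic.
	 */
	if (PageReserved(p) || PageSlab(p))
		return SWAP_SUCCESS;
	if (!(PageLRU(hpage) || PageHuge(p)))
		return SWAP_SUCCESS;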

883 /* 883 /*
884 * Do all that is necessary to remove user space mappings. Unmap 884 * Do all that is necessary to remove user space mappings. Unmap
885 * the pages and send SIGBUS to the processes if the data was dirty. 885 * the pages and send SIGBUS to the processes if the data was dirty.
886 */ 886 */
887 static int hwpoison_user_mappings(struct page *p, unsigned long pfn, 887 static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
888 int trapno, int flags, struct page **hpagep) 888 int trapno, int flags, struct page **hpagep)
889 { 889 {
890 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; 890 enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
891 struct address_space *mapping; 891 struct address_space *mapping;
892 LIST_HEAD(tokill); 892 LIST_HEAD(tokill);
893 int ret; 893 int ret;
894 int kill = 1, forcekill; 894 int kill = 1, forcekill;
895 struct page *hpage = *hpagep; 895 struct page *hpage = *hpagep;
896 struct page *ppage; 896 struct page *ppage;
897 897
898 if (PageReserved(p) || PageSlab(p) || !PageLRU(p)) 898 /*
899 * Here we are interested only in user-mapped pages, so skip any
900 * other types of pages.
901 */
902 if (PageReserved(p) || PageSlab(p))
903 return SWAP_SUCCESS;
904 if (!(PageLRU(hpage) || PageHuge(p)))
899 return SWAP_SUCCESS; 905 return SWAP_SUCCESS;
900 906
901 /* 907 /*
902 * This check implies we don't kill processes if their pages 908 * This check implies we don't kill processes if their pages
903 * are in the swap cache early. Those are always late kills. 909 * are in the swap cache early. Those are always late kills.
904 */ 910 */
905 if (!page_mapped(hpage)) 911 if (!page_mapped(hpage))
906 return SWAP_SUCCESS; 912 return SWAP_SUCCESS;
907 913
908 if (PageKsm(p)) 914 if (PageKsm(p))
909 return SWAP_FAIL; 915 return SWAP_FAIL;
910 916
911 if (PageSwapCache(p)) { 917 if (PageSwapCache(p)) {
912 printk(KERN_ERR 918 printk(KERN_ERR
913 "MCE %#lx: keeping poisoned page in swap cache\n", pfn); 919 "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
914 ttu |= TTU_IGNORE_HWPOISON; 920 ttu |= TTU_IGNORE_HWPOISON;
915 } 921 }
916 922
917 /* 923 /*
918 * Propagate the dirty bit from PTEs to struct page first, because we 924 * Propagate the dirty bit from PTEs to struct page first, because we
919 * need this to decide if we should kill or just drop the page. 925 * need this to decide if we should kill or just drop the page.
920 * XXX: the dirty test could be racy: set_page_dirty() may not always 926 * XXX: the dirty test could be racy: set_page_dirty() may not always
921 * be called inside page lock (it's recommended but not enforced). 927 * be called inside page lock (it's recommended but not enforced).
922 */ 928 */
923 mapping = page_mapping(hpage); 929 mapping = page_mapping(hpage);
924 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && 930 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
925 mapping_cap_writeback_dirty(mapping)) { 931 mapping_cap_writeback_dirty(mapping)) {
926 if (page_mkclean(hpage)) { 932 if (page_mkclean(hpage)) {
927 SetPageDirty(hpage); 933 SetPageDirty(hpage);
928 } else { 934 } else {
929 kill = 0; 935 kill = 0;
930 ttu |= TTU_IGNORE_HWPOISON; 936 ttu |= TTU_IGNORE_HWPOISON;
931 printk(KERN_INFO 937 printk(KERN_INFO
932 "MCE %#lx: corrupted page was clean: dropped without side effects\n", 938 "MCE %#lx: corrupted page was clean: dropped without side effects\n",
933 pfn); 939 pfn);
934 } 940 }
935 } 941 }
936 942
937 /* 943 /*
938 * ppage: poisoned page 944 * ppage: poisoned page
939 	 * if p is a regular (4k) page, 945 	 * if p is a regular (4k) page,
940 	 * ppage == the real poisoned page; 946 	 * ppage == the real poisoned page;
941 	 * else (p is hugetlb or THP) ppage == the head page. 947 	 * else (p is hugetlb or THP) ppage == the head page.
942 */ 948 */
943 ppage = hpage; 949 ppage = hpage;
944 950
945 if (PageTransHuge(hpage)) { 951 if (PageTransHuge(hpage)) {
946 /* 952 /*
947 * Verify that this isn't a hugetlbfs head page, the check for 953 * Verify that this isn't a hugetlbfs head page, the check for
948 		 * PageAnon is just to avoid tripping a split_huge_page 954 		 * PageAnon is just to avoid tripping a split_huge_page
949 		 * internal debug check, as split_huge_page refuses to deal with 955 		 * internal debug check, as split_huge_page refuses to deal with
950 		 * anything that isn't an anon page. PageAnon can't go away from 956 		 * anything that isn't an anon page. PageAnon can't go away from
951 		 * under us because we hold a refcount on the hpage; without a 957 		 * under us because we hold a refcount on the hpage; without a
952 		 * refcount on the hpage, split_huge_page can't be safely called 958 		 * refcount on the hpage, split_huge_page can't be safely called
953 		 * in the first place, and having a refcount on the tail isn't 959 		 * in the first place, and having a refcount on the tail isn't
954 		 * enough to be safe. 960 		 * enough to be safe.
955 */ 961 */
956 if (!PageHuge(hpage) && PageAnon(hpage)) { 962 if (!PageHuge(hpage) && PageAnon(hpage)) {
957 if (unlikely(split_huge_page(hpage))) { 963 if (unlikely(split_huge_page(hpage))) {
958 /* 964 /*
959 				 * FIXME: if splitting the THP fails, it is 965 				 * FIXME: if splitting the THP fails, it is
960 				 * better to stop the following operation rather 966 				 * better to stop the following operation rather
961 				 * than cause a panic by unmapping. The system might 967 				 * than cause a panic by unmapping. The system might
962 * survive if the page is freed later. 968 * survive if the page is freed later.
963 */ 969 */
964 printk(KERN_INFO 970 printk(KERN_INFO
965 "MCE %#lx: failed to split THP\n", pfn); 971 "MCE %#lx: failed to split THP\n", pfn);
966 972
967 BUG_ON(!PageHWPoison(p)); 973 BUG_ON(!PageHWPoison(p));
968 return SWAP_FAIL; 974 return SWAP_FAIL;
969 } 975 }
970 /* 976 /*
971 * We pinned the head page for hwpoison handling, 977 * We pinned the head page for hwpoison handling,
972 		 * now that we have split the thp we are interested in 978 		 * now that we have split the thp we are interested in
973 		 * the hwpoisoned raw page, so move the refcount 979 		 * the hwpoisoned raw page, so move the refcount
974 		 * to it. Similarly, the page lock is shifted. 980 		 * to it. Similarly, the page lock is shifted.
975 */ 981 */
976 if (hpage != p) { 982 if (hpage != p) {
977 if (!(flags & MF_COUNT_INCREASED)) { 983 if (!(flags & MF_COUNT_INCREASED)) {
978 put_page(hpage); 984 put_page(hpage);
979 get_page(p); 985 get_page(p);
980 } 986 }
981 lock_page(p); 987 lock_page(p);
982 unlock_page(hpage); 988 unlock_page(hpage);
983 *hpagep = p; 989 *hpagep = p;
984 } 990 }
985 /* THP is split, so ppage should be the real poisoned page. */ 991 /* THP is split, so ppage should be the real poisoned page. */
986 ppage = p; 992 ppage = p;
987 } 993 }
988 } 994 }
989 995
990 /* 996 /*
991 * First collect all the processes that have the page 997 * First collect all the processes that have the page
992 * mapped in dirty form. This has to be done before try_to_unmap, 998 * mapped in dirty form. This has to be done before try_to_unmap,
993 * because ttu takes the rmap data structures down. 999 * because ttu takes the rmap data structures down.
994 * 1000 *
995 * Error handling: We ignore errors here because 1001 * Error handling: We ignore errors here because
996 * there's nothing that can be done. 1002 * there's nothing that can be done.
997 */ 1003 */
998 if (kill) 1004 if (kill)
999 collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); 1005 collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
1000 1006
1001 ret = try_to_unmap(ppage, ttu); 1007 ret = try_to_unmap(ppage, ttu);
1002 if (ret != SWAP_SUCCESS) 1008 if (ret != SWAP_SUCCESS)
1003 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 1009 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
1004 pfn, page_mapcount(ppage)); 1010 pfn, page_mapcount(ppage));
1005 1011
1006 /* 1012 /*
1007 * Now that the dirty bit has been propagated to the 1013 * Now that the dirty bit has been propagated to the
1008 * struct page and all unmaps done we can decide if 1014 * struct page and all unmaps done we can decide if
1009 * killing is needed or not. Only kill when the page 1015 * killing is needed or not. Only kill when the page
1010 * was dirty or the process is not restartable, 1016 * was dirty or the process is not restartable,
1011 * otherwise the tokill list is merely 1017 * otherwise the tokill list is merely
1012 * freed. When there was a problem unmapping earlier 1018 * freed. When there was a problem unmapping earlier
1013 	 * use a more forceful uncatchable kill to prevent 1019 	 * use a more forceful uncatchable kill to prevent
1014 * any accesses to the poisoned memory. 1020 * any accesses to the poisoned memory.
1015 */ 1021 */
1016 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); 1022 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
1017 kill_procs(&tokill, forcekill, trapno, 1023 kill_procs(&tokill, forcekill, trapno,
1018 ret != SWAP_SUCCESS, p, pfn, flags); 1024 ret != SWAP_SUCCESS, p, pfn, flags);
1019 1025
1020 return ret; 1026 return ret;
1021 } 1027 }
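
The hunk at the top of hwpoison_user_mappings() is the substance of this commit: the old one-line precheck bailed out on any !PageLRU(p) page, which wrongly skipped the unmapping step for hugetlb pages (never on the LRU) and could do the same for thp subpages, so error containment failed for them. Below is a minimal userspace model of the corrected predicate; the struct and its fields are illustrative stand-ins, not real page flags.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the page states the precheck inspects. */
struct model_page {
	bool reserved, slab;	/* kernel-owned pages, never user-mapped */
	bool lru_head;		/* PageLRU() on the compound head */
	bool huge;		/* hugetlb page */
};

/* Mirrors the fixed check: skip reserved/slab, then require LRU head or hugetlb. */
static bool proceeds_to_unmap(const struct model_page *p)
{
	if (p->reserved || p->slab)
		return false;
	return p->lru_head || p->huge;
}

int main(void)
{
	struct model_page thp = { .lru_head = true };	/* thp: head is on the LRU */
	struct model_page hugetlb = { .huge = true };	/* hugetlb: never on the LRU */

	/* Both print 1 with the fixed check; the old check rejected hugetlb. */
	printf("thp: %d hugetlb: %d\n",
	       proceeds_to_unmap(&thp), proceeds_to_unmap(&hugetlb));
	return 0;
}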
1022 1028
1023 static void set_page_hwpoison_huge_page(struct page *hpage) 1029 static void set_page_hwpoison_huge_page(struct page *hpage)
1024 { 1030 {
1025 int i; 1031 int i;
1026 int nr_pages = 1 << compound_order(hpage); 1032 int nr_pages = 1 << compound_order(hpage);
1027 for (i = 0; i < nr_pages; i++) 1033 for (i = 0; i < nr_pages; i++)
1028 SetPageHWPoison(hpage + i); 1034 SetPageHWPoison(hpage + i);
1029 } 1035 }
1030 1036
1031 static void clear_page_hwpoison_huge_page(struct page *hpage) 1037 static void clear_page_hwpoison_huge_page(struct page *hpage)
1032 { 1038 {
1033 int i; 1039 int i;
1034 int nr_pages = 1 << compound_order(hpage); 1040 int nr_pages = 1 << compound_order(hpage);
1035 for (i = 0; i < nr_pages; i++) 1041 for (i = 0; i < nr_pages; i++)
1036 ClearPageHWPoison(hpage + i); 1042 ClearPageHWPoison(hpage + i);
1037 } 1043 }
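
Both helpers above walk every subpage of the compound page: on x86, a 2MB hugetlb page has compound_order(hpage) == 9, so 1 << 9 == 512 struct pages get PG_hwpoison set or cleared, and a 1GB hugepage touches 1 << 18 of them.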
1038 1044
1039 /** 1045 /**
1040 * memory_failure - Handle memory failure of a page. 1046 * memory_failure - Handle memory failure of a page.
1041 * @pfn: Page Number of the corrupted page 1047 * @pfn: Page Number of the corrupted page
1042 * @trapno: Trap number reported in the signal to user space. 1048 * @trapno: Trap number reported in the signal to user space.
1043 * @flags: fine tune action taken 1049 * @flags: fine tune action taken
1044 * 1050 *
1045 * This function is called by the low level machine check code 1051 * This function is called by the low level machine check code
1046 * of an architecture when it detects hardware memory corruption 1052 * of an architecture when it detects hardware memory corruption
1047 * of a page. It tries its best to recover, which includes 1053 * of a page. It tries its best to recover, which includes
1048 * dropping pages, killing processes etc. 1054 * dropping pages, killing processes etc.
1049 * 1055 *
1050 * The function is primarily of use for corruptions that 1056 * The function is primarily of use for corruptions that
1051 * happen outside the current execution context (e.g. when 1057 * happen outside the current execution context (e.g. when
1052 * detected by a background scrubber) 1058 * detected by a background scrubber)
1053 * 1059 *
1054 * Must run in process context (e.g. a work queue) with interrupts 1060 * Must run in process context (e.g. a work queue) with interrupts
1055 * enabled and no spinlocks held. 1061 * enabled and no spinlocks held.
1056 */ 1062 */
1057 int memory_failure(unsigned long pfn, int trapno, int flags) 1063 int memory_failure(unsigned long pfn, int trapno, int flags)
1058 { 1064 {
1059 struct page_state *ps; 1065 struct page_state *ps;
1060 struct page *p; 1066 struct page *p;
1061 struct page *hpage; 1067 struct page *hpage;
1062 int res; 1068 int res;
1063 unsigned int nr_pages; 1069 unsigned int nr_pages;
1064 unsigned long page_flags; 1070 unsigned long page_flags;
1065 1071
1066 if (!sysctl_memory_failure_recovery) 1072 if (!sysctl_memory_failure_recovery)
1067 panic("Memory failure from trap %d on page %lx", trapno, pfn); 1073 panic("Memory failure from trap %d on page %lx", trapno, pfn);
1068 1074
1069 if (!pfn_valid(pfn)) { 1075 if (!pfn_valid(pfn)) {
1070 printk(KERN_ERR 1076 printk(KERN_ERR
1071 "MCE %#lx: memory outside kernel control\n", 1077 "MCE %#lx: memory outside kernel control\n",
1072 pfn); 1078 pfn);
1073 return -ENXIO; 1079 return -ENXIO;
1074 } 1080 }
1075 1081
1076 p = pfn_to_page(pfn); 1082 p = pfn_to_page(pfn);
1077 hpage = compound_head(p); 1083 hpage = compound_head(p);
1078 if (TestSetPageHWPoison(p)) { 1084 if (TestSetPageHWPoison(p)) {
1079 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 1085 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
1080 return 0; 1086 return 0;
1081 } 1087 }
1082 1088
1083 /* 1089 /*
1084 * Currently errors on hugetlbfs pages are measured in hugepage units, 1090 * Currently errors on hugetlbfs pages are measured in hugepage units,
1085 * so nr_pages should be 1 << compound_order. OTOH when errors are on 1091 * so nr_pages should be 1 << compound_order. OTOH when errors are on
1086 * transparent hugepages, they are supposed to be split and error 1092 * transparent hugepages, they are supposed to be split and error
1087 * measurement is done in normal page units. So nr_pages should be one 1093 * measurement is done in normal page units. So nr_pages should be one
1088 * in this case. 1094 * in this case.
1089 */ 1095 */
1090 if (PageHuge(p)) 1096 if (PageHuge(p))
1091 nr_pages = 1 << compound_order(hpage); 1097 nr_pages = 1 << compound_order(hpage);
1092 else /* normal page or thp */ 1098 else /* normal page or thp */
1093 nr_pages = 1; 1099 nr_pages = 1;
1094 atomic_long_add(nr_pages, &num_poisoned_pages); 1100 atomic_long_add(nr_pages, &num_poisoned_pages);
1095 1101
1096 /* 1102 /*
1097 	 * We neither need to nor can do anything about count=0 pages. 1103 	 * We neither need to nor can do anything about count=0 pages.
1098 	 * 1) it's a free page, and therefore in safe hands: 1104 	 * 1) it's a free page, and therefore in safe hands:
1099 * prep_new_page() will be the gate keeper. 1105 * prep_new_page() will be the gate keeper.
1100 * 2) it's a free hugepage, which is also safe: 1106 * 2) it's a free hugepage, which is also safe:
1101 * an affected hugepage will be dequeued from hugepage freelist, 1107 * an affected hugepage will be dequeued from hugepage freelist,
1102 * so there's no concern about reusing it ever after. 1108 * so there's no concern about reusing it ever after.
1103 * 3) it's part of a non-compound high order page. 1109 * 3) it's part of a non-compound high order page.
1104 	 *    Implies some kernel user: we cannot stop them from 1110 	 *    Implies some kernel user: we cannot stop them from
1105 	 *    reading/writing the page; let's pray that the page has been 1111 	 *    reading/writing the page; let's pray that the page has been
1106 * used and will be freed some time later. 1112 * used and will be freed some time later.
1107 * In fact it's dangerous to directly bump up page count from 0, 1113 * In fact it's dangerous to directly bump up page count from 0,
1108 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 1114 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
1109 */ 1115 */
1110 if (!(flags & MF_COUNT_INCREASED) && 1116 if (!(flags & MF_COUNT_INCREASED) &&
1111 !get_page_unless_zero(hpage)) { 1117 !get_page_unless_zero(hpage)) {
1112 if (is_free_buddy_page(p)) { 1118 if (is_free_buddy_page(p)) {
1113 action_result(pfn, "free buddy", DELAYED); 1119 action_result(pfn, "free buddy", DELAYED);
1114 return 0; 1120 return 0;
1115 } else if (PageHuge(hpage)) { 1121 } else if (PageHuge(hpage)) {
1116 /* 1122 /*
1117 * Check "filter hit" and "race with other subpage." 1123 * Check "filter hit" and "race with other subpage."
1118 */ 1124 */
1119 lock_page(hpage); 1125 lock_page(hpage);
1120 if (PageHWPoison(hpage)) { 1126 if (PageHWPoison(hpage)) {
1121 if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) 1127 if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
1122 || (p != hpage && TestSetPageHWPoison(hpage))) { 1128 || (p != hpage && TestSetPageHWPoison(hpage))) {
1123 atomic_long_sub(nr_pages, &num_poisoned_pages); 1129 atomic_long_sub(nr_pages, &num_poisoned_pages);
1124 unlock_page(hpage); 1130 unlock_page(hpage);
1125 return 0; 1131 return 0;
1126 } 1132 }
1127 } 1133 }
1128 set_page_hwpoison_huge_page(hpage); 1134 set_page_hwpoison_huge_page(hpage);
1129 res = dequeue_hwpoisoned_huge_page(hpage); 1135 res = dequeue_hwpoisoned_huge_page(hpage);
1130 action_result(pfn, "free huge", 1136 action_result(pfn, "free huge",
1131 res ? IGNORED : DELAYED); 1137 res ? IGNORED : DELAYED);
1132 unlock_page(hpage); 1138 unlock_page(hpage);
1133 return res; 1139 return res;
1134 } else { 1140 } else {
1135 action_result(pfn, "high order kernel", IGNORED); 1141 action_result(pfn, "high order kernel", IGNORED);
1136 return -EBUSY; 1142 return -EBUSY;
1137 } 1143 }
1138 } 1144 }
1139 1145
1140 /* 1146 /*
1141 * We ignore non-LRU pages for good reasons. 1147 * We ignore non-LRU pages for good reasons.
1142 * - PG_locked is only well defined for LRU pages and a few others 1148 * - PG_locked is only well defined for LRU pages and a few others
1143 * - to avoid races with __set_page_locked() 1149 * - to avoid races with __set_page_locked()
1144 * - to avoid races with __SetPageSlab*() (and more non-atomic ops) 1150 * - to avoid races with __SetPageSlab*() (and more non-atomic ops)
1145 * The check (unnecessarily) ignores LRU pages being isolated and 1151 * The check (unnecessarily) ignores LRU pages being isolated and
1146 * walked by the page reclaim code, however that's not a big loss. 1152 * walked by the page reclaim code, however that's not a big loss.
1147 */ 1153 */
1148 if (!PageHuge(p) && !PageTransTail(p)) { 1154 if (!PageHuge(p) && !PageTransTail(p)) {
1149 if (!PageLRU(p)) 1155 if (!PageLRU(p))
1150 shake_page(p, 0); 1156 shake_page(p, 0);
1151 if (!PageLRU(p)) { 1157 if (!PageLRU(p)) {
1152 /* 1158 /*
1153 * shake_page could have turned it free. 1159 * shake_page could have turned it free.
1154 */ 1160 */
1155 if (is_free_buddy_page(p)) { 1161 if (is_free_buddy_page(p)) {
1156 if (flags & MF_COUNT_INCREASED) 1162 if (flags & MF_COUNT_INCREASED)
1157 action_result(pfn, "free buddy", DELAYED); 1163 action_result(pfn, "free buddy", DELAYED);
1158 else 1164 else
1159 action_result(pfn, "free buddy, 2nd try", DELAYED); 1165 action_result(pfn, "free buddy, 2nd try", DELAYED);
1160 return 0; 1166 return 0;
1161 } 1167 }
1162 } 1168 }
1163 } 1169 }
1164 1170
1165 lock_page(hpage); 1171 lock_page(hpage);
1166 1172
1167 /* 1173 /*
1168 * We use page flags to determine what action should be taken, but 1174 * We use page flags to determine what action should be taken, but
1169 * the flags can be modified by the error containment action. One 1175 * the flags can be modified by the error containment action. One
1170 * example is an mlocked page, where PG_mlocked is cleared by 1176 * example is an mlocked page, where PG_mlocked is cleared by
1171 * page_remove_rmap() in try_to_unmap_one(). So to determine page status 1177 * page_remove_rmap() in try_to_unmap_one(). So to determine page status
1172 * correctly, we save a copy of the page flags at this time. 1178 * correctly, we save a copy of the page flags at this time.
1173 */ 1179 */
1174 page_flags = p->flags; 1180 page_flags = p->flags;
1175 1181
1176 /* 1182 /*
1177 	 * unpoison always clears PG_hwpoison inside the page lock 1183 	 * unpoison always clears PG_hwpoison inside the page lock
1178 */ 1184 */
1179 if (!PageHWPoison(p)) { 1185 if (!PageHWPoison(p)) {
1180 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); 1186 printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
1181 atomic_long_sub(nr_pages, &num_poisoned_pages); 1187 atomic_long_sub(nr_pages, &num_poisoned_pages);
1182 put_page(hpage); 1188 put_page(hpage);
1183 res = 0; 1189 res = 0;
1184 goto out; 1190 goto out;
1185 } 1191 }
1186 if (hwpoison_filter(p)) { 1192 if (hwpoison_filter(p)) {
1187 if (TestClearPageHWPoison(p)) 1193 if (TestClearPageHWPoison(p))
1188 atomic_long_sub(nr_pages, &num_poisoned_pages); 1194 atomic_long_sub(nr_pages, &num_poisoned_pages);
1189 unlock_page(hpage); 1195 unlock_page(hpage);
1190 put_page(hpage); 1196 put_page(hpage);
1191 return 0; 1197 return 0;
1192 } 1198 }
1193 1199
1194 if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) 1200 if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
1195 goto identify_page_state; 1201 goto identify_page_state;
1196 1202
1197 /* 1203 /*
1198 	 * For an error on a tail page, we should set PG_hwpoison 1204 	 * For an error on a tail page, we should set PG_hwpoison
1199 * on the head page to show that the hugepage is hwpoisoned 1205 * on the head page to show that the hugepage is hwpoisoned
1200 */ 1206 */
1201 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1207 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1202 action_result(pfn, "hugepage already hardware poisoned", 1208 action_result(pfn, "hugepage already hardware poisoned",
1203 IGNORED); 1209 IGNORED);
1204 unlock_page(hpage); 1210 unlock_page(hpage);
1205 put_page(hpage); 1211 put_page(hpage);
1206 return 0; 1212 return 0;
1207 } 1213 }
1208 /* 1214 /*
1209 * Set PG_hwpoison on all pages in an error hugepage, 1215 * Set PG_hwpoison on all pages in an error hugepage,
1210 	 * because containment is done in hugepage units for now. 1216 	 * because containment is done in hugepage units for now.
1211 * Since we have done TestSetPageHWPoison() for the head page with 1217 * Since we have done TestSetPageHWPoison() for the head page with
1212 * page lock held, we can safely set PG_hwpoison bits on tail pages. 1218 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1213 */ 1219 */
1214 if (PageHuge(p)) 1220 if (PageHuge(p))
1215 set_page_hwpoison_huge_page(hpage); 1221 set_page_hwpoison_huge_page(hpage);
1216 1222
1217 /* 1223 /*
1218 * It's very difficult to mess with pages currently under IO 1224 * It's very difficult to mess with pages currently under IO
1219 * and in many cases impossible, so we just avoid it here. 1225 * and in many cases impossible, so we just avoid it here.
1220 */ 1226 */
1221 wait_on_page_writeback(p); 1227 wait_on_page_writeback(p);
1222 1228
1223 /* 1229 /*
1224 * Now take care of user space mappings. 1230 * Now take care of user space mappings.
1225 * Abort on fail: __delete_from_page_cache() assumes unmapped page. 1231 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1226 * 1232 *
1227 	 * When the raw error page is a thp tail page, hpage points to the raw 1233 	 * When the raw error page is a thp tail page, hpage points to the raw
1228 * page after thp split. 1234 * page after thp split.
1229 */ 1235 */
1230 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) 1236 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1231 != SWAP_SUCCESS) { 1237 != SWAP_SUCCESS) {
1232 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1238 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
1233 res = -EBUSY; 1239 res = -EBUSY;
1234 goto out; 1240 goto out;
1235 } 1241 }
1236 1242
1237 /* 1243 /*
1238 * Torn down by someone else? 1244 * Torn down by someone else?
1239 */ 1245 */
1240 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1246 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1241 action_result(pfn, "already truncated LRU", IGNORED); 1247 action_result(pfn, "already truncated LRU", IGNORED);
1242 res = -EBUSY; 1248 res = -EBUSY;
1243 goto out; 1249 goto out;
1244 } 1250 }
1245 1251
1246 identify_page_state: 1252 identify_page_state:
1247 res = -EBUSY; 1253 res = -EBUSY;
1248 /* 1254 /*
1249 * The first check uses the current page flags which may not have any 1255 * The first check uses the current page flags which may not have any
1250 	 * relevant information. The second check with the saved page flags is 1256 	 * relevant information. The second check with the saved page flags is
1251 * carried out only if the first check can't determine the page status. 1257 * carried out only if the first check can't determine the page status.
1252 */ 1258 */
1253 for (ps = error_states;; ps++) 1259 for (ps = error_states;; ps++)
1254 if ((p->flags & ps->mask) == ps->res) 1260 if ((p->flags & ps->mask) == ps->res)
1255 break; 1261 break;
1256 1262
1257 page_flags |= (p->flags & (1UL << PG_dirty)); 1263 page_flags |= (p->flags & (1UL << PG_dirty));
1258 1264
1259 if (!ps->mask) 1265 if (!ps->mask)
1260 for (ps = error_states;; ps++) 1266 for (ps = error_states;; ps++)
1261 if ((page_flags & ps->mask) == ps->res) 1267 if ((page_flags & ps->mask) == ps->res)
1262 break; 1268 break;
1263 res = page_action(ps, p, pfn); 1269 res = page_action(ps, p, pfn);
1264 out: 1270 out:
1265 unlock_page(hpage); 1271 unlock_page(hpage);
1266 return res; 1272 return res;
1267 } 1273 }
1268 EXPORT_SYMBOL_GPL(memory_failure); 1274 EXPORT_SYMBOL_GPL(memory_failure);
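
The error_states matching near the end of memory_failure() is a two-pass linear scan: the table is ordered most-specific-first and terminated by a catch-all entry whose mask is zero, so both loops always terminate, and the flags saved before unmapping are consulted only when the current flags hit the catch-all. A small self-contained model of that lookup, using a hypothetical three-entry table in place of the kernel's:

#include <stdio.h>

struct state { unsigned long mask, res; const char *msg; };

/* Hypothetical table; like the kernel's, it ends in a mask == 0 catch-all. */
static const struct state states[] = {
	{ 0x5, 0x5, "dirty lru" },
	{ 0x5, 0x1, "clean lru" },
	{ 0x0, 0x0, "unknown page state" },
};

static const struct state *match(unsigned long flags)
{
	const struct state *ps;

	for (ps = states; ; ps++)
		if ((flags & ps->mask) == ps->res)
			break;
	return ps;
}

int main(void)
{
	unsigned long cur_flags = 0x0;		/* flags after unmapping */
	unsigned long saved_flags = 0x5;	/* copy taken before unmapping */
	const struct state *ps = match(cur_flags);

	if (!ps->mask)	/* first pass hit the catch-all: retry with the saved copy */
		ps = match(saved_flags);
	printf("%s\n", ps->msg);	/* prints "dirty lru" */
	return 0;
}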
1269 1275
1270 #define MEMORY_FAILURE_FIFO_ORDER 4 1276 #define MEMORY_FAILURE_FIFO_ORDER 4
1271 #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) 1277 #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
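
That is, each CPU buffers up to 1 << 4 == 16 pending entries; when the kfifo is full, memory_failure_queue() below drops the report with a pr_err().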
1272 1278
1273 struct memory_failure_entry { 1279 struct memory_failure_entry {
1274 unsigned long pfn; 1280 unsigned long pfn;
1275 int trapno; 1281 int trapno;
1276 int flags; 1282 int flags;
1277 }; 1283 };
1278 1284
1279 struct memory_failure_cpu { 1285 struct memory_failure_cpu {
1280 DECLARE_KFIFO(fifo, struct memory_failure_entry, 1286 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1281 MEMORY_FAILURE_FIFO_SIZE); 1287 MEMORY_FAILURE_FIFO_SIZE);
1282 spinlock_t lock; 1288 spinlock_t lock;
1283 struct work_struct work; 1289 struct work_struct work;
1284 }; 1290 };
1285 1291
1286 static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); 1292 static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1287 1293
1288 /** 1294 /**
1289 * memory_failure_queue - Schedule handling memory failure of a page. 1295 * memory_failure_queue - Schedule handling memory failure of a page.
1290 * @pfn: Page Number of the corrupted page 1296 * @pfn: Page Number of the corrupted page
1291 * @trapno: Trap number reported in the signal to user space. 1297 * @trapno: Trap number reported in the signal to user space.
1292 * @flags: Flags for memory failure handling 1298 * @flags: Flags for memory failure handling
1293 * 1299 *
1294 * This function is called by the low level hardware error handler 1300 * This function is called by the low level hardware error handler
1295 * when it detects hardware memory corruption of a page. It schedules 1301 * when it detects hardware memory corruption of a page. It schedules
1296 * the recovery of the error page, including dropping pages, killing 1302 * the recovery of the error page, including dropping pages, killing
1297 * processes etc. 1303 * processes etc.
1298 * 1304 *
1299 * The function is primarily of use for corruptions that 1305 * The function is primarily of use for corruptions that
1300 * happen outside the current execution context (e.g. when 1306 * happen outside the current execution context (e.g. when
1301 * detected by a background scrubber) 1307 * detected by a background scrubber)
1302 * 1308 *
1303 * Can run in IRQ context. 1309 * Can run in IRQ context.
1304 */ 1310 */
1305 void memory_failure_queue(unsigned long pfn, int trapno, int flags) 1311 void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1306 { 1312 {
1307 struct memory_failure_cpu *mf_cpu; 1313 struct memory_failure_cpu *mf_cpu;
1308 unsigned long proc_flags; 1314 unsigned long proc_flags;
1309 struct memory_failure_entry entry = { 1315 struct memory_failure_entry entry = {
1310 .pfn = pfn, 1316 .pfn = pfn,
1311 .trapno = trapno, 1317 .trapno = trapno,
1312 .flags = flags, 1318 .flags = flags,
1313 }; 1319 };
1314 1320
1315 mf_cpu = &get_cpu_var(memory_failure_cpu); 1321 mf_cpu = &get_cpu_var(memory_failure_cpu);
1316 spin_lock_irqsave(&mf_cpu->lock, proc_flags); 1322 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1317 if (kfifo_put(&mf_cpu->fifo, entry)) 1323 if (kfifo_put(&mf_cpu->fifo, entry))
1318 schedule_work_on(smp_processor_id(), &mf_cpu->work); 1324 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1319 else 1325 else
1320 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", 1326 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1321 pfn); 1327 pfn);
1322 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 1328 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1323 put_cpu_var(memory_failure_cpu); 1329 put_cpu_var(memory_failure_cpu);
1324 } 1330 }
1325 EXPORT_SYMBOL_GPL(memory_failure_queue); 1331 EXPORT_SYMBOL_GPL(memory_failure_queue);
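
A hedged usage sketch, not part of this patch: a consumer of hardware error records running in IRQ context (an APEI/GHES-style handler, say) could defer handling of a bad pfn as below. The helper name and the trapno value of 0 are illustrative.

static void report_corrupted_pfn(unsigned long pfn, bool action_required)
{
	int flags = action_required ? MF_ACTION_REQUIRED : 0;

	/* Safe in IRQ context; the per-cpu work item queued above runs
	 * memory_failure() later in process context. */
	memory_failure_queue(pfn, 0 /* trapno */, flags);
}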
1326 1332
1327 static void memory_failure_work_func(struct work_struct *work) 1333 static void memory_failure_work_func(struct work_struct *work)
1328 { 1334 {
1329 struct memory_failure_cpu *mf_cpu; 1335 struct memory_failure_cpu *mf_cpu;
1330 struct memory_failure_entry entry = { 0, }; 1336 struct memory_failure_entry entry = { 0, };
1331 unsigned long proc_flags; 1337 unsigned long proc_flags;
1332 int gotten; 1338 int gotten;
1333 1339
1334 mf_cpu = this_cpu_ptr(&memory_failure_cpu); 1340 mf_cpu = this_cpu_ptr(&memory_failure_cpu);
1335 for (;;) { 1341 for (;;) {
1336 spin_lock_irqsave(&mf_cpu->lock, proc_flags); 1342 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1337 gotten = kfifo_get(&mf_cpu->fifo, &entry); 1343 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1338 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); 1344 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1339 if (!gotten) 1345 if (!gotten)
1340 break; 1346 break;
1341 if (entry.flags & MF_SOFT_OFFLINE) 1347 if (entry.flags & MF_SOFT_OFFLINE)
1342 soft_offline_page(pfn_to_page(entry.pfn), entry.flags); 1348 soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
1343 else 1349 else
1344 memory_failure(entry.pfn, entry.trapno, entry.flags); 1350 memory_failure(entry.pfn, entry.trapno, entry.flags);
1345 } 1351 }
1346 } 1352 }
1347 1353
1348 static int __init memory_failure_init(void) 1354 static int __init memory_failure_init(void)
1349 { 1355 {
1350 struct memory_failure_cpu *mf_cpu; 1356 struct memory_failure_cpu *mf_cpu;
1351 int cpu; 1357 int cpu;
1352 1358
1353 for_each_possible_cpu(cpu) { 1359 for_each_possible_cpu(cpu) {
1354 mf_cpu = &per_cpu(memory_failure_cpu, cpu); 1360 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1355 spin_lock_init(&mf_cpu->lock); 1361 spin_lock_init(&mf_cpu->lock);
1356 INIT_KFIFO(mf_cpu->fifo); 1362 INIT_KFIFO(mf_cpu->fifo);
1357 INIT_WORK(&mf_cpu->work, memory_failure_work_func); 1363 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1358 } 1364 }
1359 1365
1360 return 0; 1366 return 0;
1361 } 1367 }
1362 core_initcall(memory_failure_init); 1368 core_initcall(memory_failure_init);
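
core_initcall() places this setup early in boot, so each CPU's lock, kfifo and work item are ready before error reports are normally queued.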
1363 1369
1364 /** 1370 /**
1365 * unpoison_memory - Unpoison a previously poisoned page 1371 * unpoison_memory - Unpoison a previously poisoned page
1366 * @pfn: Page number of the to be unpoisoned page 1372 * @pfn: Page number of the to be unpoisoned page
1367 * 1373 *
1368 * Software-unpoison a page that has been poisoned by 1374 * Software-unpoison a page that has been poisoned by
1369 * memory_failure() earlier. 1375 * memory_failure() earlier.
1370 * 1376 *
1371 * This is done only at the software level, so it only works 1377 * This is done only at the software level, so it only works
1372 * for Linux-injected failures, not real hardware failures. 1378 * for Linux-injected failures, not real hardware failures.
1373 * 1379 *
1374 * Returns 0 for success, otherwise -errno. 1380 * Returns 0 for success, otherwise -errno.
1375 */ 1381 */
1376 int unpoison_memory(unsigned long pfn) 1382 int unpoison_memory(unsigned long pfn)
1377 { 1383 {
1378 struct page *page; 1384 struct page *page;
1379 struct page *p; 1385 struct page *p;
1380 int freeit = 0; 1386 int freeit = 0;
1381 unsigned int nr_pages; 1387 unsigned int nr_pages;
1382 1388
1383 if (!pfn_valid(pfn)) 1389 if (!pfn_valid(pfn))
1384 return -ENXIO; 1390 return -ENXIO;
1385 1391
1386 p = pfn_to_page(pfn); 1392 p = pfn_to_page(pfn);
1387 page = compound_head(p); 1393 page = compound_head(p);
1388 1394
1389 if (!PageHWPoison(p)) { 1395 if (!PageHWPoison(p)) {
1390 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); 1396 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1391 return 0; 1397 return 0;
1392 } 1398 }
1393 1399
1394 /* 1400 /*
1395 * unpoison_memory() can encounter thp only when the thp is being 1401 * unpoison_memory() can encounter thp only when the thp is being
1396 	 * handled by memory_failure() and the page lock is not held yet. 1402 	 * handled by memory_failure() and the page lock is not held yet.
1397 * In such case, we yield to memory_failure() and make unpoison fail. 1403 * In such case, we yield to memory_failure() and make unpoison fail.
1398 */ 1404 */
1399 if (!PageHuge(page) && PageTransHuge(page)) { 1405 if (!PageHuge(page) && PageTransHuge(page)) {
1400 pr_info("MCE: Memory failure is now running on %#lx\n", pfn); 1406 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1401 return 0; 1407 return 0;
1402 } 1408 }
1403 1409
1404 nr_pages = 1 << compound_order(page); 1410 nr_pages = 1 << compound_order(page);
1405 1411
1406 if (!get_page_unless_zero(page)) { 1412 if (!get_page_unless_zero(page)) {
1407 /* 1413 /*
1408 		 * Since a HWPoisoned hugepage should have a non-zero refcount, 1414 		 * Since a HWPoisoned hugepage should have a non-zero refcount,
1409 		 * a race between memory failure and unpoison seems to have happened. 1415 		 * a race between memory failure and unpoison seems to have happened.
1410 * In such case unpoison fails and memory failure runs 1416 * In such case unpoison fails and memory failure runs
1411 * to the end. 1417 * to the end.
1412 */ 1418 */
1413 if (PageHuge(page)) { 1419 if (PageHuge(page)) {
1414 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); 1420 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1415 return 0; 1421 return 0;
1416 } 1422 }
1417 if (TestClearPageHWPoison(p)) 1423 if (TestClearPageHWPoison(p))
1418 atomic_long_dec(&num_poisoned_pages); 1424 atomic_long_dec(&num_poisoned_pages);
1419 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1425 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1420 return 0; 1426 return 0;
1421 } 1427 }
1422 1428
1423 lock_page(page); 1429 lock_page(page);
1424 /* 1430 /*
1425 * This test is racy because PG_hwpoison is set outside of page lock. 1431 * This test is racy because PG_hwpoison is set outside of page lock.
1426 * That's acceptable because that won't trigger kernel panic. Instead, 1432 * That's acceptable because that won't trigger kernel panic. Instead,
1427 * the PG_hwpoison page will be caught and isolated on the entrance to 1433 * the PG_hwpoison page will be caught and isolated on the entrance to
1428 * the free buddy page pool. 1434 * the free buddy page pool.
1429 */ 1435 */
1430 if (TestClearPageHWPoison(page)) { 1436 if (TestClearPageHWPoison(page)) {
1431 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); 1437 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1432 atomic_long_sub(nr_pages, &num_poisoned_pages); 1438 atomic_long_sub(nr_pages, &num_poisoned_pages);
1433 freeit = 1; 1439 freeit = 1;
1434 if (PageHuge(page)) 1440 if (PageHuge(page))
1435 clear_page_hwpoison_huge_page(page); 1441 clear_page_hwpoison_huge_page(page);
1436 } 1442 }
1437 unlock_page(page); 1443 unlock_page(page);
1438 1444
1439 put_page(page); 1445 put_page(page);
1440 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) 1446 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1441 put_page(page); 1447 put_page(page);
1442 1448
1443 return 0; 1449 return 0;
1444 } 1450 }
1445 EXPORT_SYMBOL(unpoison_memory); 1451 EXPORT_SYMBOL(unpoison_memory);
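
In this tree unpoison_memory() is expected to be driven by the hwpoison-inject debugfs interface (writing a pfn to /sys/kernel/debug/hwpoison/unpoison-pfn), undoing a previously software-injected poisoning; as the kernel-doc above says, real hardware corruption cannot be undone this way.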
1446 1452
1447 static struct page *new_page(struct page *p, unsigned long private, int **x) 1453 static struct page *new_page(struct page *p, unsigned long private, int **x)
1448 { 1454 {
1449 int nid = page_to_nid(p); 1455 int nid = page_to_nid(p);
1450 if (PageHuge(p)) 1456 if (PageHuge(p))
1451 return alloc_huge_page_node(page_hstate(compound_head(p)), 1457 return alloc_huge_page_node(page_hstate(compound_head(p)),
1452 nid); 1458 nid);
1453 else 1459 else
1454 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1460 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1455 } 1461 }
1456 1462
1457 /* 1463 /*
1458 * Safely get reference count of an arbitrary page. 1464 * Safely get reference count of an arbitrary page.
1459 * Returns 0 for a free page, -EIO for a zero refcount page 1465 * Returns 0 for a free page, -EIO for a zero refcount page
1460 * that is not free, and 1 for any other page type. 1466 * that is not free, and 1 for any other page type.
1461 * For 1 the page is returned with increased page count, otherwise not. 1467 * For 1 the page is returned with increased page count, otherwise not.
1462 */ 1468 */
1463 static int __get_any_page(struct page *p, unsigned long pfn, int flags) 1469 static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1464 { 1470 {
1465 int ret; 1471 int ret;
1466 1472
1467 if (flags & MF_COUNT_INCREASED) 1473 if (flags & MF_COUNT_INCREASED)
1468 return 1; 1474 return 1;
1469 1475
1470 /* 1476 /*
1471 * When the target page is a free hugepage, just remove it 1477 * When the target page is a free hugepage, just remove it
1472 	 * from the free hugepage list. 1478 	 * from the free hugepage list.
1473 */ 1479 */
1474 if (!get_page_unless_zero(compound_head(p))) { 1480 if (!get_page_unless_zero(compound_head(p))) {
1475 if (PageHuge(p)) { 1481 if (PageHuge(p)) {
1476 pr_info("%s: %#lx free huge page\n", __func__, pfn); 1482 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1477 ret = 0; 1483 ret = 0;
1478 } else if (is_free_buddy_page(p)) { 1484 } else if (is_free_buddy_page(p)) {
1479 pr_info("%s: %#lx free buddy page\n", __func__, pfn); 1485 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1480 ret = 0; 1486 ret = 0;
1481 } else { 1487 } else {
1482 pr_info("%s: %#lx: unknown zero refcount page type %lx\n", 1488 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1483 __func__, pfn, p->flags); 1489 __func__, pfn, p->flags);
1484 ret = -EIO; 1490 ret = -EIO;
1485 } 1491 }
1486 } else { 1492 } else {
1487 /* Not a free page */ 1493 /* Not a free page */
1488 ret = 1; 1494 ret = 1;
1489 } 1495 }
1490 return ret; 1496 return ret;
1491 } 1497 }
1492 1498
1493 static int get_any_page(struct page *page, unsigned long pfn, int flags) 1499 static int get_any_page(struct page *page, unsigned long pfn, int flags)
1494 { 1500 {
1495 int ret = __get_any_page(page, pfn, flags); 1501 int ret = __get_any_page(page, pfn, flags);
1496 1502
1497 if (ret == 1 && !PageHuge(page) && !PageLRU(page)) { 1503 if (ret == 1 && !PageHuge(page) && !PageLRU(page)) {
1498 /* 1504 /*
1499 * Try to free it. 1505 * Try to free it.
1500 */ 1506 */
1501 put_page(page); 1507 put_page(page);
1502 shake_page(page, 1); 1508 shake_page(page, 1);
1503 1509
1504 /* 1510 /*
1505 * Did it turn free? 1511 * Did it turn free?
1506 */ 1512 */
1507 ret = __get_any_page(page, pfn, 0); 1513 ret = __get_any_page(page, pfn, 0);
1508 if (!PageLRU(page)) { 1514 if (!PageLRU(page)) {
1509 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1515 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1510 pfn, page->flags); 1516 pfn, page->flags);
1511 return -EIO; 1517 return -EIO;
1512 } 1518 }
1513 } 1519 }
1514 return ret; 1520 return ret;
1515 } 1521 }
1516 1522
1517 static int soft_offline_huge_page(struct page *page, int flags) 1523 static int soft_offline_huge_page(struct page *page, int flags)
1518 { 1524 {
1519 int ret; 1525 int ret;
1520 unsigned long pfn = page_to_pfn(page); 1526 unsigned long pfn = page_to_pfn(page);
1521 struct page *hpage = compound_head(page); 1527 struct page *hpage = compound_head(page);
1522 LIST_HEAD(pagelist); 1528 LIST_HEAD(pagelist);
1523 1529
1524 /* 1530 /*
1525 * This double-check of PageHWPoison is to avoid the race with 1531 * This double-check of PageHWPoison is to avoid the race with
1526 * memory_failure(). See also comment in __soft_offline_page(). 1532 * memory_failure(). See also comment in __soft_offline_page().
1527 */ 1533 */
1528 lock_page(hpage); 1534 lock_page(hpage);
1529 if (PageHWPoison(hpage)) { 1535 if (PageHWPoison(hpage)) {
1530 unlock_page(hpage); 1536 unlock_page(hpage);
1531 put_page(hpage); 1537 put_page(hpage);
1532 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); 1538 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1533 return -EBUSY; 1539 return -EBUSY;
1534 } 1540 }
1535 unlock_page(hpage); 1541 unlock_page(hpage);
1536 1542
1537 /* Keep page count to indicate a given hugepage is isolated. */ 1543 /* Keep page count to indicate a given hugepage is isolated. */
1538 list_move(&hpage->lru, &pagelist); 1544 list_move(&hpage->lru, &pagelist);
1539 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1545 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1540 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1546 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1541 if (ret) { 1547 if (ret) {
1542 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1548 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1543 pfn, ret, page->flags); 1549 pfn, ret, page->flags);
1544 /* 1550 /*
1545 * We know that soft_offline_huge_page() tries to migrate 1551 * We know that soft_offline_huge_page() tries to migrate
1546 * only one hugepage pointed to by hpage, so we need not 1552 * only one hugepage pointed to by hpage, so we need not
1547 * run through the pagelist here. 1553 * run through the pagelist here.
1548 */ 1554 */
1549 putback_active_hugepage(hpage); 1555 putback_active_hugepage(hpage);
1550 if (ret > 0) 1556 if (ret > 0)
1551 ret = -EIO; 1557 ret = -EIO;
1552 } else { 1558 } else {
1553 /* overcommit hugetlb page will be freed to buddy */ 1559 /* overcommit hugetlb page will be freed to buddy */
1554 if (PageHuge(page)) { 1560 if (PageHuge(page)) {
1555 set_page_hwpoison_huge_page(hpage); 1561 set_page_hwpoison_huge_page(hpage);
1556 dequeue_hwpoisoned_huge_page(hpage); 1562 dequeue_hwpoisoned_huge_page(hpage);
1557 atomic_long_add(1 << compound_order(hpage), 1563 atomic_long_add(1 << compound_order(hpage),
1558 &num_poisoned_pages); 1564 &num_poisoned_pages);
1559 } else { 1565 } else {
1560 SetPageHWPoison(page); 1566 SetPageHWPoison(page);
1561 atomic_long_inc(&num_poisoned_pages); 1567 atomic_long_inc(&num_poisoned_pages);
1562 } 1568 }
1563 } 1569 }
1564 return ret; 1570 return ret;
1565 } 1571 }
1566 1572
1567 static int __soft_offline_page(struct page *page, int flags) 1573 static int __soft_offline_page(struct page *page, int flags)
1568 { 1574 {
1569 int ret; 1575 int ret;
1570 unsigned long pfn = page_to_pfn(page); 1576 unsigned long pfn = page_to_pfn(page);
1571 1577
1572 /* 1578 /*
1573 * Check PageHWPoison again inside page lock because PageHWPoison 1579 * Check PageHWPoison again inside page lock because PageHWPoison
1574 * is set by memory_failure() outside page lock. Note that 1580 * is set by memory_failure() outside page lock. Note that
1575 * memory_failure() also double-checks PageHWPoison inside page lock, 1581 * memory_failure() also double-checks PageHWPoison inside page lock,
1576 * so there's no race between soft_offline_page() and memory_failure(). 1582 * so there's no race between soft_offline_page() and memory_failure().
1577 */ 1583 */
1578 lock_page(page); 1584 lock_page(page);
1579 wait_on_page_writeback(page); 1585 wait_on_page_writeback(page);
1580 if (PageHWPoison(page)) { 1586 if (PageHWPoison(page)) {
1581 unlock_page(page); 1587 unlock_page(page);
1582 put_page(page); 1588 put_page(page);
1583 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1589 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1584 return -EBUSY; 1590 return -EBUSY;
1585 } 1591 }
1586 /* 1592 /*
1587 * Try to invalidate first. This should work for 1593 * Try to invalidate first. This should work for
1588 	 * non-dirty unmapped page cache pages. 1594 	 * non-dirty unmapped page cache pages.
1589 */ 1595 */
1590 ret = invalidate_inode_page(page); 1596 ret = invalidate_inode_page(page);
1591 unlock_page(page); 1597 unlock_page(page);
1592 /* 1598 /*
1593 	 * RED-PEN: it would be better to keep it isolated here, but we 1599 	 * RED-PEN: it would be better to keep it isolated here, but we
1594 * would need to fix isolation locking first. 1600 * would need to fix isolation locking first.
1595 */ 1601 */
1596 if (ret == 1) { 1602 if (ret == 1) {
1597 put_page(page); 1603 put_page(page);
1598 pr_info("soft_offline: %#lx: invalidated\n", pfn); 1604 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1599 SetPageHWPoison(page); 1605 SetPageHWPoison(page);
1600 atomic_long_inc(&num_poisoned_pages); 1606 atomic_long_inc(&num_poisoned_pages);
1601 return 0; 1607 return 0;
1602 } 1608 }
1603 1609
1604 /* 1610 /*
1605 * Simple invalidation didn't work. 1611 * Simple invalidation didn't work.
1606 * Try to migrate to a new page instead. migrate.c 1612 * Try to migrate to a new page instead. migrate.c
1607 * handles a large number of cases for us. 1613 * handles a large number of cases for us.
1608 */ 1614 */
1609 ret = isolate_lru_page(page); 1615 ret = isolate_lru_page(page);
1610 /* 1616 /*
1611 	 * Drop the page reference which came from get_any_page(); a 1617 	 * Drop the page reference which came from get_any_page(); a
1612 	 * successful isolate_lru_page() already took another one. 1618 	 * successful isolate_lru_page() already took another one.
1613 */ 1619 */
1614 put_page(page); 1620 put_page(page);
1615 if (!ret) { 1621 if (!ret) {
1616 LIST_HEAD(pagelist); 1622 LIST_HEAD(pagelist);
1617 inc_zone_page_state(page, NR_ISOLATED_ANON + 1623 inc_zone_page_state(page, NR_ISOLATED_ANON +
1618 page_is_file_cache(page)); 1624 page_is_file_cache(page));
1619 list_add(&page->lru, &pagelist); 1625 list_add(&page->lru, &pagelist);
1620 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1626 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1621 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1627 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1622 if (ret) { 1628 if (ret) {
1623 if (!list_empty(&pagelist)) { 1629 if (!list_empty(&pagelist)) {
1624 list_del(&page->lru); 1630 list_del(&page->lru);
1625 dec_zone_page_state(page, NR_ISOLATED_ANON + 1631 dec_zone_page_state(page, NR_ISOLATED_ANON +
1626 page_is_file_cache(page)); 1632 page_is_file_cache(page));
1627 putback_lru_page(page); 1633 putback_lru_page(page);
1628 } 1634 }
1629 1635
1630 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1636 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1631 pfn, ret, page->flags); 1637 pfn, ret, page->flags);
1632 if (ret > 0) 1638 if (ret > 0)
1633 ret = -EIO; 1639 ret = -EIO;
1634 } else { 1640 } else {
1635 /* 1641 /*
1636 * After page migration succeeds, the source page can 1642 * After page migration succeeds, the source page can
1637 			 * be trapped in a pagevec and actual freeing is delayed. 1643 			 * be trapped in a pagevec and actual freeing is delayed.
1638 			 * Freeing code works differently based on PG_hwpoison, 1644 			 * Freeing code works differently based on PG_hwpoison,
1639 			 * so there's a race. We need to make sure that the 1645 			 * so there's a race. We need to make sure that the
1640 			 * source page is freed back to buddy before 1646 			 * source page is freed back to buddy before
1641 			 * setting PG_hwpoison. 1647 			 * setting PG_hwpoison.
1642 */ 1648 */
1643 if (!is_free_buddy_page(page)) 1649 if (!is_free_buddy_page(page))
1644 lru_add_drain_all(); 1650 lru_add_drain_all();
1645 if (!is_free_buddy_page(page)) 1651 if (!is_free_buddy_page(page))
1646 drain_all_pages(); 1652 drain_all_pages();
1647 SetPageHWPoison(page); 1653 SetPageHWPoison(page);
1648 if (!is_free_buddy_page(page)) 1654 if (!is_free_buddy_page(page))
1649 pr_info("soft offline: %#lx: page leaked\n", 1655 pr_info("soft offline: %#lx: page leaked\n",
1650 pfn); 1656 pfn);
1651 atomic_long_inc(&num_poisoned_pages); 1657 atomic_long_inc(&num_poisoned_pages);
1652 } 1658 }
1653 } else { 1659 } else {
1654 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1660 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1655 pfn, ret, page_count(page), page->flags); 1661 pfn, ret, page_count(page), page->flags);
1656 } 1662 }
1657 return ret; 1663 return ret;
1658 } 1664 }
1659 1665
1660 /** 1666 /**
1661 * soft_offline_page - Soft offline a page. 1667 * soft_offline_page - Soft offline a page.
1662 * @page: page to offline 1668 * @page: page to offline
1663 * @flags: flags. Same as memory_failure(). 1669 * @flags: flags. Same as memory_failure().
1664 * 1670 *
1665 * Returns 0 on success, otherwise negated errno. 1671 * Returns 0 on success, otherwise negated errno.
1666 * 1672 *
1667 * Soft offline a page, by migration or invalidation, 1673 * Soft offline a page, by migration or invalidation,
1668 * without killing anything. This is for the case when 1674 * without killing anything. This is for the case when
1669 * a page is not corrupted yet (so it's still valid to access), 1675 * a page is not corrupted yet (so it's still valid to access),
1670 * but has had a number of corrected errors and is better taken 1676 * but has had a number of corrected errors and is better taken
1671 * out. 1677 * out.
1672 * 1678 *
1673 * The actual policy on when to do that is maintained by 1679 * The actual policy on when to do that is maintained by
1674 * user space. 1680 * user space.
1675 * 1681 *
1676 * This should never impact any application or cause data loss, 1682 * This should never impact any application or cause data loss,
1677 * but it might take some time. 1683 * but it might take some time.
1678 * 1684 *
1679 * This is not a 100% solution for all memory, but tries to be 1685 * This is not a 100% solution for all memory, but tries to be
1680 * ``good enough'' for the majority of memory. 1686 * ``good enough'' for the majority of memory.
1681 */ 1687 */
1682 int soft_offline_page(struct page *page, int flags) 1688 int soft_offline_page(struct page *page, int flags)
1683 { 1689 {
1684 int ret; 1690 int ret;
1685 unsigned long pfn = page_to_pfn(page); 1691 unsigned long pfn = page_to_pfn(page);
1686 struct page *hpage = compound_head(page); 1692 struct page *hpage = compound_head(page);
1687 1693
1688 if (PageHWPoison(page)) { 1694 if (PageHWPoison(page)) {
1689 pr_info("soft offline: %#lx page already poisoned\n", pfn); 1695 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1690 return -EBUSY; 1696 return -EBUSY;
1691 } 1697 }
1692 if (!PageHuge(page) && PageTransHuge(hpage)) { 1698 if (!PageHuge(page) && PageTransHuge(hpage)) {
1693 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { 1699 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1694 pr_info("soft offline: %#lx: failed to split THP\n", 1700 pr_info("soft offline: %#lx: failed to split THP\n",
1695 pfn); 1701 pfn);
1696 return -EBUSY; 1702 return -EBUSY;
1697 } 1703 }
1698 } 1704 }
1699 1705
1700 get_online_mems(); 1706 get_online_mems();
1701 1707
1702 /* 1708 /*
1703 * Isolate the page, so that it doesn't get reallocated if it 1709 * Isolate the page, so that it doesn't get reallocated if it
1704 * was free. This flag should be kept set until the source page 1710 * was free. This flag should be kept set until the source page
1705 * is freed and PG_hwpoison on it is set. 1711 * is freed and PG_hwpoison on it is set.
1706 */ 1712 */
1707 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 1713 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
1708 set_migratetype_isolate(page, true); 1714 set_migratetype_isolate(page, true);
1709 1715
1710 ret = get_any_page(page, pfn, flags); 1716 ret = get_any_page(page, pfn, flags);
1711 put_online_mems(); 1717 put_online_mems();
1712 if (ret > 0) { /* for in-use pages */ 1718 if (ret > 0) { /* for in-use pages */
1713 if (PageHuge(page)) 1719 if (PageHuge(page))
1714 ret = soft_offline_huge_page(page, flags); 1720 ret = soft_offline_huge_page(page, flags);
1715 else 1721 else
1716 ret = __soft_offline_page(page, flags); 1722 ret = __soft_offline_page(page, flags);
1717 } else if (ret == 0) { /* for free pages */ 1723 } else if (ret == 0) { /* for free pages */
1718 if (PageHuge(page)) { 1724 if (PageHuge(page)) {
1719 set_page_hwpoison_huge_page(hpage); 1725 set_page_hwpoison_huge_page(hpage);
1720 dequeue_hwpoisoned_huge_page(hpage); 1726 dequeue_hwpoisoned_huge_page(hpage);
1721 atomic_long_add(1 << compound_order(hpage), 1727 atomic_long_add(1 << compound_order(hpage),
1722 &num_poisoned_pages); 1728 &num_poisoned_pages);
1723 } else { 1729 } else {
1724 SetPageHWPoison(page); 1730 SetPageHWPoison(page);
1725 atomic_long_inc(&num_poisoned_pages); 1731 atomic_long_inc(&num_poisoned_pages);
1726 } 1732 }
1727 } 1733 }
1728 unset_migratetype_isolate(page, MIGRATE_MOVABLE); 1734 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1729 return ret; 1735 return ret;
1730 } 1736 }
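
A hedged in-kernel usage sketch (illustrative; user space normally drives this through the memory sysfs soft_offline_page attribute). Since no extra reference is taken on the page here, MF_COUNT_INCREASED must not be passed in flags.

static int soft_offline_pfn(unsigned long pfn)
{
	if (!pfn_valid(pfn))
		return -ENXIO;
	/* No extra page reference held, so don't pass MF_COUNT_INCREASED. */
	return soft_offline_page(pfn_to_page(pfn), 0);
}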
1731 1737