Commit d4c54919ed86302094c0ca7d48a8cbd4ee753e92

Authored by Naoya Horiguchi
Committed by Linus Torvalds
1 parent d54d14bfb4

mm: add !pte_present() check on existing hugetlb_entry callbacks

The page table walker doesn't check non-present hugetlb entries in the
common path, so hugetlb_entry() callbacks must check for them.  The
reason for this behavior is that some callers want to handle such
entries in their own way.

[ I think that reason is bogus, btw - it should just do what the regular
  code does, which is to call the "pte_hole()" function for such hugetlb
  entries  - Linus]

However, some callers don't check it now, which causes unpredictable
results, for example when we have a race between migrating a hugepage
and reading /proc/pid/numa_maps.  This patch fixes it by adding
!pte_present checks to the buggy callbacks.

This bug has existed for years and became visible with the introduction
of hugepage migration.

ChangeLog v2:
- fix if condition (check !pte_present() instead of pte_present())

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ Backported to 3.15.  Signed-off-by: Josh Boyer <jwboyer@fedoraproject.org> ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
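
[ Editor's note: a minimal, illustrative sketch of the kind of guard the
  changelog describes.  This is NOT a hunk from this commit (the actual
  changed lines fall outside the portion of the file shown below); the
  callback name and body are hypothetical, but the signature follows the
  mm_walk hugetlb_entry() prototype of this kernel era. ]

	/*
	 * Illustrative sketch only -- not part of this commit.
	 * A hugetlb_entry() callback bails out on a non-present entry
	 * (e.g. one under migration) instead of dereferencing it.
	 */
	static int example_hugetlb_entry(pte_t *pte, unsigned long hmask,
			unsigned long addr, unsigned long end,
			struct mm_walk *walk)
	{
		pte_t huge_pte = huge_ptep_get(pte);
		struct page *page;

		/* Non-present entries have no stable page behind them;
		 * skip them rather than call pte_page() on them. */
		if (!pte_present(huge_pte))
			return 0;

		page = pte_page(huge_pte);
		/* ... gather per-page statistics from 'page' as before ... */
		return 0;
	}
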

Showing 2 changed files with 6 additions and 2 deletions

fs/proc/task_mmu.c

#include <linux/mm.h>
#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mmu_notifier.h>

#include <asm/elf.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include "internal.h"

void task_mem(struct seq_file *m, struct mm_struct *mm)
{
	unsigned long data, text, lib, swap;
	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;

	/*
	 * Note: to minimize their overhead, mm maintains hiwater_vm and
	 * hiwater_rss only when about to *lower* total_vm or rss. Any
	 * collector of these hiwater stats must therefore get total_vm
	 * and rss too, which will usually be the higher. Barriers? not
	 * worth the effort, such snapshots can always be inconsistent.
	 */
	hiwater_vm = total_vm = mm->total_vm;
	if (hiwater_vm < mm->hiwater_vm)
		hiwater_vm = mm->hiwater_vm;
	hiwater_rss = total_rss = get_mm_rss(mm);
	if (hiwater_rss < mm->hiwater_rss)
		hiwater_rss = mm->hiwater_rss;

	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
	swap = get_mm_counter(mm, MM_SWAPENTS);
	seq_printf(m,
		"VmPeak:\t%8lu kB\n"
		"VmSize:\t%8lu kB\n"
		"VmLck:\t%8lu kB\n"
		"VmPin:\t%8lu kB\n"
		"VmHWM:\t%8lu kB\n"
		"VmRSS:\t%8lu kB\n"
		"VmData:\t%8lu kB\n"
		"VmStk:\t%8lu kB\n"
		"VmExe:\t%8lu kB\n"
		"VmLib:\t%8lu kB\n"
		"VmPTE:\t%8lu kB\n"
		"VmSwap:\t%8lu kB\n",
		hiwater_vm << (PAGE_SHIFT-10),
		total_vm << (PAGE_SHIFT-10),
		mm->locked_vm << (PAGE_SHIFT-10),
		mm->pinned_vm << (PAGE_SHIFT-10),
		hiwater_rss << (PAGE_SHIFT-10),
		total_rss << (PAGE_SHIFT-10),
		data << (PAGE_SHIFT-10),
		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
		(PTRS_PER_PTE * sizeof(pte_t) *
		 atomic_long_read(&mm->nr_ptes)) >> 10,
		swap << (PAGE_SHIFT-10));
}

unsigned long task_vsize(struct mm_struct *mm)
{
	return PAGE_SIZE * mm->total_vm;
}

unsigned long task_statm(struct mm_struct *mm,
			 unsigned long *shared, unsigned long *text,
			 unsigned long *data, unsigned long *resident)
{
	*shared = get_mm_counter(mm, MM_FILEPAGES);
	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
								>> PAGE_SHIFT;
	*data = mm->total_vm - mm->shared_vm;
	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
	return mm->total_vm;
}

#ifdef CONFIG_NUMA
/*
 * These functions are for numa_maps but called in generic **maps seq_file
 * ->start(), ->stop() ops.
 *
 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
 * Each mempolicy object is controlled by reference counting. The problem here
 * is how to avoid accessing dead mempolicy object.
 *
 * Because we're holding mmap_sem while reading seq_file, it's safe to access
 * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
 *
 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
 * gurantee the task never exits under us. But taking task_lock() around
 * get_vma_plicy() causes lock order problem.
 *
 * To access task->mempolicy without lock, we hold a reference count of an
 * object pointed by task->mempolicy and remember it. This will guarantee
 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
 */
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
	struct task_struct *task = priv->task;

	task_lock(task);
	priv->task_mempolicy = task->mempolicy;
	mpol_get(priv->task_mempolicy);
	task_unlock(task);
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
	mpol_put(priv->task_mempolicy);
}
#else
static void hold_task_mempolicy(struct proc_maps_private *priv)
{
}
static void release_task_mempolicy(struct proc_maps_private *priv)
{
}
#endif

static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
	if (vma && vma != priv->tail_vma) {
		struct mm_struct *mm = vma->vm_mm;
		release_task_mempolicy(priv);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
}

static void *m_start(struct seq_file *m, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	unsigned long last_addr = m->version;
	struct mm_struct *mm;
	struct vm_area_struct *vma, *tail_vma = NULL;
	loff_t l = *pos;

	/* Clear the per syscall fields in priv */
	priv->task = NULL;
	priv->tail_vma = NULL;

	/*
	 * We remember last_addr rather than next_addr to hit with
	 * vmacache most of the time. We have zero last_addr at
	 * the beginning and also after lseek. We will have -1 last_addr
	 * after the end of the vmas.
	 */

	if (last_addr == -1UL)
		return NULL;

	priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
	if (!priv->task)
		return ERR_PTR(-ESRCH);

	mm = mm_access(priv->task, PTRACE_MODE_READ);
	if (!mm || IS_ERR(mm))
		return mm;
	down_read(&mm->mmap_sem);

	tail_vma = get_gate_vma(priv->task->mm);
	priv->tail_vma = tail_vma;
	hold_task_mempolicy(priv);
	/* Start with last addr hint */
	vma = find_vma(mm, last_addr);
	if (last_addr && vma) {
		vma = vma->vm_next;
		goto out;
	}

	/*
	 * Check the vma index is within the range and do
	 * sequential scan until m_index.
	 */
	vma = NULL;
	if ((unsigned long)l < mm->map_count) {
		vma = mm->mmap;
		while (l-- && vma)
			vma = vma->vm_next;
		goto out;
	}

	if (l != mm->map_count)
		tail_vma = NULL; /* After gate vma */

out:
	if (vma)
		return vma;

	release_task_mempolicy(priv);
	/* End of vmas has been reached */
	m->version = (tail_vma != NULL)? 0: -1UL;
	up_read(&mm->mmap_sem);
	mmput(mm);
	return tail_vma;
}

static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;
	struct vm_area_struct *tail_vma = priv->tail_vma;

	(*pos)++;
	if (vma && (vma != tail_vma) && vma->vm_next)
		return vma->vm_next;
	vma_stop(priv, vma);
	return (vma != tail_vma)? tail_vma: NULL;
}

static void m_stop(struct seq_file *m, void *v)
{
	struct proc_maps_private *priv = m->private;
	struct vm_area_struct *vma = v;

	if (!IS_ERR(vma))
		vma_stop(priv, vma);
	if (priv->task)
		put_task_struct(priv->task);
}

static int do_maps_open(struct inode *inode, struct file *file,
			const struct seq_operations *ops)
{
	struct proc_maps_private *priv;
	int ret = -ENOMEM;
	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (priv) {
		priv->pid = proc_pid(inode);
		ret = seq_open(file, ops);
		if (!ret) {
			struct seq_file *m = file->private_data;
			m->private = priv;
		} else {
			kfree(priv);
		}
	}
	return ret;
}

static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	vm_flags_t flags = vma->vm_flags;
	unsigned long ino = 0;
	unsigned long long pgoff = 0;
	unsigned long start, end;
	dev_t dev = 0;
	const char *name = NULL;

	if (file) {
		struct inode *inode = file_inode(vma->vm_file);
		dev = inode->i_sb->s_dev;
		ino = inode->i_ino;
		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
	}

	/* We don't show the stack guard page in /proc/maps */
	start = vma->vm_start;
	if (stack_guard_page_start(vma, start))
		start += PAGE_SIZE;
	end = vma->vm_end;
	if (stack_guard_page_end(vma, end))
		end -= PAGE_SIZE;

	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
			start,
			end,
			flags & VM_READ ? 'r' : '-',
			flags & VM_WRITE ? 'w' : '-',
			flags & VM_EXEC ? 'x' : '-',
			flags & VM_MAYSHARE ? 's' : 'p',
			pgoff,
			MAJOR(dev), MINOR(dev), ino);

	/*
	 * Print the dentry name for named mappings, and a
	 * special [heap] marker for the heap:
	 */
	if (file) {
		seq_pad(m, ' ');
		seq_path(m, &file->f_path, "\n");
		goto done;
	}

	name = arch_vma_name(vma);
	if (!name) {
		pid_t tid;

		if (!mm) {
			name = "[vdso]";
			goto done;
		}

		if (vma->vm_start <= mm->brk &&
		    vma->vm_end >= mm->start_brk) {
			name = "[heap]";
			goto done;
		}

		tid = vm_is_stack(task, vma, is_pid);

		if (tid != 0) {
			/*
			 * Thread stack in /proc/PID/task/TID/maps or
			 * the main process stack.
			 */
			if (!is_pid || (vma->vm_start <= mm->start_stack &&
			    vma->vm_end >= mm->start_stack)) {
				name = "[stack]";
			} else {
				/* Thread stack in /proc/PID/maps */
				seq_pad(m, ' ');
				seq_printf(m, "[stack:%d]", tid);
			}
		}
	}

done:
	if (name) {
		seq_pad(m, ' ');
		seq_puts(m, name);
	}
	seq_putc(m, '\n');
}

static int show_map(struct seq_file *m, void *v, int is_pid)
{
	struct vm_area_struct *vma = v;
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;

	show_map_vma(m, vma, is_pid);

	if (m->count < m->size) /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 1);
}

static int show_tid_map(struct seq_file *m, void *v)
{
	return show_map(m, v, 0);
}

static const struct seq_operations proc_pid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_map
};

static const struct seq_operations proc_tid_maps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_map
};

static int pid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_maps_op);
}

static int tid_maps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_maps_op);
}

const struct file_operations proc_pid_maps_operations = {
	.open		= pid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_maps_operations = {
	.open		= tid_maps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

/*
 * Proportional Set Size(PSS): my share of RSS.
 *
 * PSS of a process is the count of pages it has in memory, where each
 * page is divided by the number of processes sharing it. So if a
 * process has 1000 pages all to itself, and 1000 shared with one other
 * process, its PSS will be 1500.
 *
 * To keep (accumulated) division errors low, we adopt a 64bit
 * fixed-point pss counter to minimize division errors. So (pss >>
 * PSS_SHIFT) would be the real byte count.
 *
 * A shift of 12 before division means (assuming 4K page size):
 * 	- 1M 3-user-pages add up to 8KB errors;
 * 	- supports mapcount up to 2^24, or 16M;
 * 	- supports PSS up to 2^52 bytes, or 4PB.
 */
#define PSS_SHIFT 12

#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
	struct vm_area_struct *vma;
	unsigned long resident;
	unsigned long shared_clean;
	unsigned long shared_dirty;
	unsigned long private_clean;
	unsigned long private_dirty;
	unsigned long referenced;
	unsigned long anonymous;
	unsigned long anonymous_thp;
	unsigned long swap;
	unsigned long nonlinear;
	u64 pss;
};


static void smaps_pte_entry(pte_t ptent, unsigned long addr,
		unsigned long ptent_size, struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	pgoff_t pgoff = linear_page_index(vma, addr);
	struct page *page = NULL;
	int mapcount;

	if (pte_present(ptent)) {
		page = vm_normal_page(vma, addr, ptent);
	} else if (is_swap_pte(ptent)) {
		swp_entry_t swpent = pte_to_swp_entry(ptent);

		if (!non_swap_entry(swpent))
			mss->swap += ptent_size;
		else if (is_migration_entry(swpent))
			page = migration_entry_to_page(swpent);
	} else if (pte_file(ptent)) {
		if (pte_to_pgoff(ptent) != pgoff)
			mss->nonlinear += ptent_size;
	}

	if (!page)
		return;

	if (PageAnon(page))
		mss->anonymous += ptent_size;

	if (page->index != pgoff)
		mss->nonlinear += ptent_size;

	mss->resident += ptent_size;
	/* Accumulate the size in pages that have been accessed. */
	if (pte_young(ptent) || PageReferenced(page))
		mss->referenced += ptent_size;
	mapcount = page_mapcount(page);
	if (mapcount >= 2) {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->shared_dirty += ptent_size;
		else
			mss->shared_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
	} else {
		if (pte_dirty(ptent) || PageDirty(page))
			mss->private_dirty += ptent_size;
		else
			mss->private_clean += ptent_size;
		mss->pss += (ptent_size << PSS_SHIFT);
	}
}

static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			   struct mm_walk *walk)
{
	struct mem_size_stats *mss = walk->private;
	struct vm_area_struct *vma = mss->vma;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
		spin_unlock(ptl);
		mss->anonymous_thp += HPAGE_PMD_SIZE;
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	/*
	 * The mmap_sem held all the way back in m_start() is what
	 * keeps khugepaged out of here and from collapsing things
	 * in here.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
	/*
	 * Don't forget to update Documentation/ on changes.
	 */
	static const char mnemonics[BITS_PER_LONG][2] = {
		/*
		 * In case if we meet a flag we don't know about.
		 */
		[0 ... (BITS_PER_LONG-1)] = "??",

		[ilog2(VM_READ)]	= "rd",
		[ilog2(VM_WRITE)]	= "wr",
		[ilog2(VM_EXEC)]	= "ex",
		[ilog2(VM_SHARED)]	= "sh",
		[ilog2(VM_MAYREAD)]	= "mr",
		[ilog2(VM_MAYWRITE)]	= "mw",
		[ilog2(VM_MAYEXEC)]	= "me",
		[ilog2(VM_MAYSHARE)]	= "ms",
		[ilog2(VM_GROWSDOWN)]	= "gd",
		[ilog2(VM_PFNMAP)]	= "pf",
		[ilog2(VM_DENYWRITE)]	= "dw",
		[ilog2(VM_LOCKED)]	= "lo",
		[ilog2(VM_IO)]		= "io",
		[ilog2(VM_SEQ_READ)]	= "sr",
		[ilog2(VM_RAND_READ)]	= "rr",
		[ilog2(VM_DONTCOPY)]	= "dc",
		[ilog2(VM_DONTEXPAND)]	= "de",
		[ilog2(VM_ACCOUNT)]	= "ac",
		[ilog2(VM_NORESERVE)]	= "nr",
		[ilog2(VM_HUGETLB)]	= "ht",
		[ilog2(VM_NONLINEAR)]	= "nl",
		[ilog2(VM_ARCH_1)]	= "ar",
		[ilog2(VM_DONTDUMP)]	= "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
		[ilog2(VM_SOFTDIRTY)]	= "sd",
#endif
		[ilog2(VM_MIXEDMAP)]	= "mm",
		[ilog2(VM_HUGEPAGE)]	= "hg",
		[ilog2(VM_NOHUGEPAGE)]	= "nh",
		[ilog2(VM_MERGEABLE)]	= "mg",
	};
	size_t i;

	seq_puts(m, "VmFlags: ");
	for (i = 0; i < BITS_PER_LONG; i++) {
		if (vma->vm_flags & (1UL << i)) {
			seq_printf(m, "%c%c ",
				   mnemonics[i][0], mnemonics[i][1]);
		}
	}
	seq_putc(m, '\n');
}

static int show_smap(struct seq_file *m, void *v, int is_pid)
{
	struct proc_maps_private *priv = m->private;
	struct task_struct *task = priv->task;
	struct vm_area_struct *vma = v;
	struct mem_size_stats mss;
	struct mm_walk smaps_walk = {
		.pmd_entry = smaps_pte_range,
		.mm = vma->vm_mm,
		.private = &mss,
	};

	memset(&mss, 0, sizeof mss);
	mss.vma = vma;
	/* mmap_sem is held in m_start */
	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);

	show_map_vma(m, vma, is_pid);

	seq_printf(m,
		   "Size: %8lu kB\n"
		   "Rss: %8lu kB\n"
		   "Pss: %8lu kB\n"
		   "Shared_Clean: %8lu kB\n"
		   "Shared_Dirty: %8lu kB\n"
		   "Private_Clean: %8lu kB\n"
		   "Private_Dirty: %8lu kB\n"
		   "Referenced: %8lu kB\n"
		   "Anonymous: %8lu kB\n"
		   "AnonHugePages: %8lu kB\n"
		   "Swap: %8lu kB\n"
		   "KernelPageSize: %8lu kB\n"
		   "MMUPageSize: %8lu kB\n"
		   "Locked: %8lu kB\n",
		   (vma->vm_end - vma->vm_start) >> 10,
		   mss.resident >> 10,
		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
		   mss.shared_clean >> 10,
		   mss.shared_dirty >> 10,
		   mss.private_clean >> 10,
		   mss.private_dirty >> 10,
		   mss.referenced >> 10,
		   mss.anonymous >> 10,
		   mss.anonymous_thp >> 10,
		   mss.swap >> 10,
		   vma_kernel_pagesize(vma) >> 10,
		   vma_mmu_pagesize(vma) >> 10,
		   (vma->vm_flags & VM_LOCKED) ?
			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);

	if (vma->vm_flags & VM_NONLINEAR)
		seq_printf(m, "Nonlinear: %8lu kB\n",
				mss.nonlinear >> 10);

	show_smap_vma_flags(m, vma);

	if (m->count < m->size) /* vma is copied successfully */
		m->version = (vma != get_gate_vma(task->mm))
			? vma->vm_start : 0;
	return 0;
}

static int show_pid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 1);
}

static int show_tid_smap(struct seq_file *m, void *v)
{
	return show_smap(m, v, 0);
}

static const struct seq_operations proc_pid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_pid_smap
};

static const struct seq_operations proc_tid_smaps_op = {
	.start	= m_start,
	.next	= m_next,
	.stop	= m_stop,
	.show	= show_tid_smap
};

static int pid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_pid_smaps_op);
}

static int tid_smaps_open(struct inode *inode, struct file *file)
{
	return do_maps_open(inode, file, &proc_tid_smaps_op);
}

const struct file_operations proc_pid_smaps_operations = {
	.open		= pid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

const struct file_operations proc_tid_smaps_operations = {
	.open		= tid_smaps_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};

/*
 * We do not want to have constant page-shift bits sitting in
 * pagemap entries and are about to reuse them some time soon.
 *
 * Here's the "migration strategy":
 * 1. when the system boots these bits remain what they are,
 *    but a warning about future change is printed in log;
 * 2. once anyone clears soft-dirty bits via clear_refs file,
 *    these flag is set to denote, that user is aware of the
 *    new API and those page-shift bits change their meaning.
 *    The respective warning is printed in dmesg;
 * 3. In a couple of releases we will remove all the mentions
 *    of page-shift in pagemap entries.
 */

static bool soft_dirty_cleared __read_mostly;

enum clear_refs_types {
	CLEAR_REFS_ALL = 1,
	CLEAR_REFS_ANON,
	CLEAR_REFS_MAPPED,
	CLEAR_REFS_SOFT_DIRTY,
	CLEAR_REFS_LAST,
};

struct clear_refs_private {
	struct vm_area_struct *vma;
	enum clear_refs_types type;
};

static inline void clear_soft_dirty(struct vm_area_struct *vma,
		unsigned long addr, pte_t *pte)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
	/*
	 * The soft-dirty tracker uses #PF-s to catch writes
	 * to pages, so write-protect the pte as well. See the
	 * Documentation/vm/soft-dirty.txt for full description
	 * of how soft-dirty works.
	 */
	pte_t ptent = *pte;

	if (pte_present(ptent)) {
		ptent = pte_wrprotect(ptent);
		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
	} else if (is_swap_pte(ptent)) {
		ptent = pte_swp_clear_soft_dirty(ptent);
	} else if (pte_file(ptent)) {
		ptent = pte_file_clear_soft_dirty(ptent);
	}

	if (vma->vm_flags & VM_SOFTDIRTY)
		vma->vm_flags &= ~VM_SOFTDIRTY;

	set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}

static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)
{
	struct clear_refs_private *cp = walk->private;
	struct vm_area_struct *vma = cp->vma;
	pte_t *pte, ptent;
	spinlock_t *ptl;
	struct page *page;

	split_huge_page_pmd(vma, addr, pmd);
	if (pmd_trans_unstable(pmd))
		return 0;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			clear_soft_dirty(vma, addr, pte);
			continue;
		}

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/* Clear accessed and referenced bits. */
		ptep_test_and_clear_young(vma, addr, pte);
		ClearPageReferenced(page);
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();
	return 0;
}

static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum clear_refs_types type;
	int itype;
	int rv;

	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	rv = kstrtoint(strstrip(buffer), 10, &itype);
	if (rv < 0)
		return rv;
	type = (enum clear_refs_types)itype;
	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
		return -EINVAL;

	if (type == CLEAR_REFS_SOFT_DIRTY) {
		soft_dirty_cleared = true;
		pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
			     "See the linux/Documentation/vm/pagemap.txt for details.\n");
	}

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	mm = get_task_mm(task);
	if (mm) {
		struct clear_refs_private cp = {
			.type = type,
		};
		struct mm_walk clear_refs_walk = {
			.pmd_entry = clear_refs_pte_range,
			.mm = mm,
			.private = &cp,
		};
		down_read(&mm->mmap_sem);
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_start(mm, 0, -1);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			cp.vma = vma;
			if (is_vm_hugetlb_page(vma))
				continue;
			/*
			 * Writing 1 to /proc/pid/clear_refs affects all pages.
			 *
			 * Writing 2 to /proc/pid/clear_refs only affects
			 * Anonymous pages.
			 *
			 * Writing 3 to /proc/pid/clear_refs only affects file
			 * mapped pages.
			 */
			if (type == CLEAR_REFS_ANON && vma->vm_file)
				continue;
			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
				continue;
			walk_page_range(vma->vm_start, vma->vm_end,
					&clear_refs_walk);
		}
		if (type == CLEAR_REFS_SOFT_DIRTY)
			mmu_notifier_invalidate_range_end(mm, 0, -1);
		flush_tlb_mm(mm);
		up_read(&mm->mmap_sem);
		mmput(mm);
	}
	put_task_struct(task);

	return count;
}

const struct file_operations proc_clear_refs_operations = {
	.write		= clear_refs_write,
	.llseek		= noop_llseek,
};

typedef struct {
	u64 pme;
} pagemap_entry_t;

struct pagemapread {
	int pos, len;	/* units: PM_ENTRY_BYTES, not bytes */
	pagemap_entry_t *buffer;
	bool v2;
};

#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
#define PAGEMAP_WALK_MASK	(PMD_MASK)

#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
#define PM_STATUS_BITS		3
#define PM_STATUS_OFFSET	(64 - PM_STATUS_BITS)
#define PM_STATUS_MASK		(((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr)		(((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS		6
#define PM_PSHIFT_OFFSET	(PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK		(((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define __PM_PSHIFT(x)		(((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK		((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)
/* in "new" pagemap pshift bits are occupied with more status bits */
#define PM_STATUS2(v2, x)	(__PM_PSHIFT(v2 ? x : PAGE_SHIFT))

#define __PM_SOFT_DIRTY		(1LL)
#define PM_PRESENT		PM_STATUS(4LL)
#define PM_SWAP			PM_STATUS(2LL)
#define PM_FILE			PM_STATUS(1LL)
#define PM_NOT_PRESENT(v2)	PM_STATUS2(v2, 0)
#define PM_END_OF_BUFFER	1

static inline pagemap_entry_t make_pme(u64 val)
{
	return (pagemap_entry_t) { .pme = val };
}

static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
			  struct pagemapread *pm)
{
	pm->buffer[pm->pos++] = *pme;
	if (pm->pos >= pm->len)
		return PM_END_OF_BUFFER;
	return 0;
}

static int pagemap_pte_hole(unsigned long start, unsigned long end,
				struct mm_walk *walk)
{
	struct pagemapread *pm = walk->private;
	unsigned long addr;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		err = add_to_pagemap(addr, &pme, pm);
		if (err)
			break;
	}
	return err;
}

static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
	u64 frame, flags;
	struct page *page = NULL;
	int flags2 = 0;

	if (pte_present(pte)) {
		frame = pte_pfn(pte);
		flags = PM_PRESENT;
		page = vm_normal_page(vma, addr, pte);
		if (pte_soft_dirty(pte))
			flags2 |= __PM_SOFT_DIRTY;
	} else if (is_swap_pte(pte)) {
		swp_entry_t entry;
		if (pte_swp_soft_dirty(pte))
			flags2 |= __PM_SOFT_DIRTY;
		entry = pte_to_swp_entry(pte);
		frame = swp_type(entry) |
			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
		flags = PM_SWAP;
		if (is_migration_entry(entry))
			page = migration_entry_to_page(entry);
	} else {
		if (vma->vm_flags & VM_SOFTDIRTY)
			flags2 |= __PM_SOFT_DIRTY;
		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
		return;
	}

	if (page && !PageAnon(page))
		flags |= PM_FILE;
	if ((vma->vm_flags & VM_SOFTDIRTY))
		flags2 |= __PM_SOFT_DIRTY;

	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		pmd_t pmd, int offset, int pmd_flags2)
{
	/*
	 * Currently pmd for thp is always present because thp can not be
	 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
	 * This if-check is just to prepare for future implementation.
	 */
	if (pmd_present(pmd))
		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
	else
		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
}
#else
static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
		pmd_t pmd, int offset, int pmd_flags2)
{
}
#endif

static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     struct mm_walk *walk)
{
	struct vm_area_struct *vma;
	struct pagemapread *pm = walk->private;
	spinlock_t *ptl;
	pte_t *pte;
	int err = 0;
	pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));

	/* find the first VMA at or above 'addr' */
	vma = find_vma(walk->mm, addr);
	if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
		int pmd_flags2;

		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
			pmd_flags2 = __PM_SOFT_DIRTY;
		else
			pmd_flags2 = 0;

		for (; addr != end; addr += PAGE_SIZE) {
			unsigned long offset;

			offset = (addr & ~PAGEMAP_WALK_MASK) >>
					PAGE_SHIFT;
			thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
			err = add_to_pagemap(addr, &pme, pm);
			if (err)
				break;
		}
		spin_unlock(ptl);
		return err;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	for (; addr != end; addr += PAGE_SIZE) {
		int flags2;

		/* check to see if we've left 'vma' behind
		 * and need a new, higher one */
		if (vma && (addr >= vma->vm_end)) {
			vma = find_vma(walk->mm, addr);
			if (vma && (vma->vm_flags & VM_SOFTDIRTY))
				flags2 = __PM_SOFT_DIRTY;
1035 else 1035 else
1036 flags2 = 0; 1036 flags2 = 0;
1037 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1037 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1038 } 1038 }
1039 1039
1040 /* check that 'vma' actually covers this address, 1040 /* check that 'vma' actually covers this address,
1041 * and that it isn't a huge page vma */ 1041 * and that it isn't a huge page vma */
1042 if (vma && (vma->vm_start <= addr) && 1042 if (vma && (vma->vm_start <= addr) &&
1043 !is_vm_hugetlb_page(vma)) { 1043 !is_vm_hugetlb_page(vma)) {
1044 pte = pte_offset_map(pmd, addr); 1044 pte = pte_offset_map(pmd, addr);
1045 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1045 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1046 /* unmap before userspace copy */ 1046 /* unmap before userspace copy */
1047 pte_unmap(pte); 1047 pte_unmap(pte);
1048 } 1048 }
1049 err = add_to_pagemap(addr, &pme, pm); 1049 err = add_to_pagemap(addr, &pme, pm);
1050 if (err) 1050 if (err)
1051 return err; 1051 return err;
1052 } 1052 }
1053 1053
1054 cond_resched(); 1054 cond_resched();
1055 1055
1056 return err; 1056 return err;
1057 } 1057 }
1058 1058
1059 #ifdef CONFIG_HUGETLB_PAGE 1059 #ifdef CONFIG_HUGETLB_PAGE
1060 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1060 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1061 pte_t pte, int offset, int flags2) 1061 pte_t pte, int offset, int flags2)
1062 { 1062 {
1063 if (pte_present(pte)) 1063 if (pte_present(pte))
1064 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | 1064 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
1065 PM_STATUS2(pm->v2, flags2) | 1065 PM_STATUS2(pm->v2, flags2) |
1066 PM_PRESENT); 1066 PM_PRESENT);
1067 else 1067 else
1068 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | 1068 *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
1069 PM_STATUS2(pm->v2, flags2)); 1069 PM_STATUS2(pm->v2, flags2));
1070 } 1070 }
1071 1071
1072 /* This function walks within one hugetlb entry in a single call */ 1072 /* This function walks within one hugetlb entry in a single call */
1073 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, 1073 static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1074 unsigned long addr, unsigned long end, 1074 unsigned long addr, unsigned long end,
1075 struct mm_walk *walk) 1075 struct mm_walk *walk)
1076 { 1076 {
1077 struct pagemapread *pm = walk->private; 1077 struct pagemapread *pm = walk->private;
1078 struct vm_area_struct *vma; 1078 struct vm_area_struct *vma;
1079 int err = 0; 1079 int err = 0;
1080 int flags2; 1080 int flags2;
1081 pagemap_entry_t pme; 1081 pagemap_entry_t pme;
1082 1082
1083 vma = find_vma(walk->mm, addr); 1083 vma = find_vma(walk->mm, addr);
1084 WARN_ON_ONCE(!vma); 1084 WARN_ON_ONCE(!vma);
1085 1085
1086 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1086 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1087 flags2 = __PM_SOFT_DIRTY; 1087 flags2 = __PM_SOFT_DIRTY;
1088 else 1088 else
1089 flags2 = 0; 1089 flags2 = 0;
1090 1090
1091 for (; addr != end; addr += PAGE_SIZE) { 1091 for (; addr != end; addr += PAGE_SIZE) {
1092 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1092 int offset = (addr & ~hmask) >> PAGE_SHIFT;
1093 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); 1093 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
1094 err = add_to_pagemap(addr, &pme, pm); 1094 err = add_to_pagemap(addr, &pme, pm);
1095 if (err) 1095 if (err)
1096 return err; 1096 return err;
1097 } 1097 }
1098 1098
1099 cond_resched(); 1099 cond_resched();
1100 1100
1101 return err; 1101 return err;
1102 } 1102 }
1103 #endif /* HUGETLB_PAGE */ 1103 #endif /* HUGETLB_PAGE */
1104 1104
1105 /* 1105 /*
1106 * /proc/pid/pagemap - an array mapping virtual pages to pfns 1106 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1107 * 1107 *
1108 * For each page in the address space, this file contains one 64-bit entry 1108 * For each page in the address space, this file contains one 64-bit entry
1109 * consisting of the following: 1109 * consisting of the following:
1110 * 1110 *
1111 * Bits 0-54 page frame number (PFN) if present 1111 * Bits 0-54 page frame number (PFN) if present
1112 * Bits 0-4 swap type if swapped 1112 * Bits 0-4 swap type if swapped
1113 * Bits 5-54 swap offset if swapped 1113 * Bits 5-54 swap offset if swapped
1114 * Bits 55-60 page shift (page size = 1<<page shift) 1114 * Bits 55-60 page shift (page size = 1<<page shift)
1115 * Bit 61 page is file-page or shared-anon 1115 * Bit 61 page is file-page or shared-anon
1116 * Bit 62 page swapped 1116 * Bit 62 page swapped
1117 * Bit 63 page present 1117 * Bit 63 page present
1118 * 1118 *
1119 * If the page is not present but in swap, then the PFN contains an 1119 * If the page is not present but in swap, then the PFN contains an
1120 * encoding of the swap file number and the page's offset into the 1120 * encoding of the swap file number and the page's offset into the
1121 * swap. Unmapped pages return a null PFN. This allows determining 1121 * swap. Unmapped pages return a null PFN. This allows determining
1122 * precisely which pages are mapped (or in swap) and comparing mapped 1122 * precisely which pages are mapped (or in swap) and comparing mapped
1123 * pages between processes. 1123 * pages between processes.
1124 * 1124 *
1125 * Efficient users of this interface will use /proc/pid/maps to 1125 * Efficient users of this interface will use /proc/pid/maps to
1126 * determine which areas of memory are actually mapped and llseek to 1126 * determine which areas of memory are actually mapped and llseek to
1127 * skip over unmapped regions. 1127 * skip over unmapped regions.
1128 */ 1128 */
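As a side note to the layout documented above, a user-space reader can decode one pagemap entry directly from /proc/<pid>/pagemap. The following is a minimal sketch, not kernel code; the helper name dump_pagemap_entry() and the PAGEMAP_* masks are made up here from the bit layout in the comment, and error handling is kept to a minimum.

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>

#define PAGEMAP_PFN_MASK	((1ULL << 55) - 1)	/* bits 0-54: PFN (or swap bits) */
#define PAGEMAP_FILE_SHARED	(1ULL << 61)		/* file-page or shared-anon */
#define PAGEMAP_SWAPPED		(1ULL << 62)
#define PAGEMAP_PRESENT		(1ULL << 63)

/* Hypothetical helper: print the pagemap entry covering 'vaddr' of 'pid'. */
int dump_pagemap_entry(pid_t pid, unsigned long vaddr)
{
	char path[64];
	uint64_t pme;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = (off_t)(vaddr / psize) * sizeof(pme);
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	if (pread(fd, &pme, sizeof(pme), off) != (ssize_t)sizeof(pme)) {
		close(fd);
		return -1;
	}
	close(fd);

	if (pme & PAGEMAP_PRESENT)
		printf("present pfn=0x%llx%s\n",
		       (unsigned long long)(pme & PAGEMAP_PFN_MASK),
		       (pme & PAGEMAP_FILE_SHARED) ? " (file/shared-anon)" : "");
	else if (pme & PAGEMAP_SWAPPED)
		printf("swapped type=%llu offset=0x%llx\n",
		       (unsigned long long)(pme & 0x1f),		/* bits 0-4 */
		       (unsigned long long)((pme >> 5) & ((1ULL << 50) - 1)));	/* bits 5-54 */
	else
		printf("not present\n");
	return 0;
}

Compiled into a small tool, a caller might pass getpid() and the address of a local variable to check whether its own stack page is resident, which mirrors the "use /proc/pid/maps, then llseek" usage pattern described in the comment.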
1129 static ssize_t pagemap_read(struct file *file, char __user *buf, 1129 static ssize_t pagemap_read(struct file *file, char __user *buf,
1130 size_t count, loff_t *ppos) 1130 size_t count, loff_t *ppos)
1131 { 1131 {
1132 struct task_struct *task = get_proc_task(file_inode(file)); 1132 struct task_struct *task = get_proc_task(file_inode(file));
1133 struct mm_struct *mm; 1133 struct mm_struct *mm;
1134 struct pagemapread pm; 1134 struct pagemapread pm;
1135 int ret = -ESRCH; 1135 int ret = -ESRCH;
1136 struct mm_walk pagemap_walk = {}; 1136 struct mm_walk pagemap_walk = {};
1137 unsigned long src; 1137 unsigned long src;
1138 unsigned long svpfn; 1138 unsigned long svpfn;
1139 unsigned long start_vaddr; 1139 unsigned long start_vaddr;
1140 unsigned long end_vaddr; 1140 unsigned long end_vaddr;
1141 int copied = 0; 1141 int copied = 0;
1142 1142
1143 if (!task) 1143 if (!task)
1144 goto out; 1144 goto out;
1145 1145
1146 ret = -EINVAL; 1146 ret = -EINVAL;
1147 /* file position must be aligned */ 1147 /* file position must be aligned */
1148 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 1148 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1149 goto out_task; 1149 goto out_task;
1150 1150
1151 ret = 0; 1151 ret = 0;
1152 if (!count) 1152 if (!count)
1153 goto out_task; 1153 goto out_task;
1154 1154
1155 pm.v2 = soft_dirty_cleared; 1155 pm.v2 = soft_dirty_cleared;
1156 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1156 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1157 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); 1157 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1158 ret = -ENOMEM; 1158 ret = -ENOMEM;
1159 if (!pm.buffer) 1159 if (!pm.buffer)
1160 goto out_task; 1160 goto out_task;
1161 1161
1162 mm = mm_access(task, PTRACE_MODE_READ); 1162 mm = mm_access(task, PTRACE_MODE_READ);
1163 ret = PTR_ERR(mm); 1163 ret = PTR_ERR(mm);
1164 if (!mm || IS_ERR(mm)) 1164 if (!mm || IS_ERR(mm))
1165 goto out_free; 1165 goto out_free;
1166 1166
1167 pagemap_walk.pmd_entry = pagemap_pte_range; 1167 pagemap_walk.pmd_entry = pagemap_pte_range;
1168 pagemap_walk.pte_hole = pagemap_pte_hole; 1168 pagemap_walk.pte_hole = pagemap_pte_hole;
1169 #ifdef CONFIG_HUGETLB_PAGE 1169 #ifdef CONFIG_HUGETLB_PAGE
1170 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 1170 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
1171 #endif 1171 #endif
1172 pagemap_walk.mm = mm; 1172 pagemap_walk.mm = mm;
1173 pagemap_walk.private = &pm; 1173 pagemap_walk.private = &pm;
1174 1174
1175 src = *ppos; 1175 src = *ppos;
1176 svpfn = src / PM_ENTRY_BYTES; 1176 svpfn = src / PM_ENTRY_BYTES;
1177 start_vaddr = svpfn << PAGE_SHIFT; 1177 start_vaddr = svpfn << PAGE_SHIFT;
1178 end_vaddr = TASK_SIZE_OF(task); 1178 end_vaddr = TASK_SIZE_OF(task);
1179 1179
1180 /* watch out for wraparound */ 1180 /* watch out for wraparound */
1181 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) 1181 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
1182 start_vaddr = end_vaddr; 1182 start_vaddr = end_vaddr;
1183 1183
1184 /* 1184 /*
1185 * The odds are that this will stop walking way 1185 * The odds are that this will stop walking way
1186 * before end_vaddr, because the length of the 1186 * before end_vaddr, because the length of the
1187 * user buffer is tracked in "pm", and the walk 1187 * user buffer is tracked in "pm", and the walk
1188 * will stop when we hit the end of the buffer. 1188 * will stop when we hit the end of the buffer.
1189 */ 1189 */
1190 ret = 0; 1190 ret = 0;
1191 while (count && (start_vaddr < end_vaddr)) { 1191 while (count && (start_vaddr < end_vaddr)) {
1192 int len; 1192 int len;
1193 unsigned long end; 1193 unsigned long end;
1194 1194
1195 pm.pos = 0; 1195 pm.pos = 0;
1196 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; 1196 end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1197 /* overflow ? */ 1197 /* overflow ? */
1198 if (end < start_vaddr || end > end_vaddr) 1198 if (end < start_vaddr || end > end_vaddr)
1199 end = end_vaddr; 1199 end = end_vaddr;
1200 down_read(&mm->mmap_sem); 1200 down_read(&mm->mmap_sem);
1201 ret = walk_page_range(start_vaddr, end, &pagemap_walk); 1201 ret = walk_page_range(start_vaddr, end, &pagemap_walk);
1202 up_read(&mm->mmap_sem); 1202 up_read(&mm->mmap_sem);
1203 start_vaddr = end; 1203 start_vaddr = end;
1204 1204
1205 len = min(count, PM_ENTRY_BYTES * pm.pos); 1205 len = min(count, PM_ENTRY_BYTES * pm.pos);
1206 if (copy_to_user(buf, pm.buffer, len)) { 1206 if (copy_to_user(buf, pm.buffer, len)) {
1207 ret = -EFAULT; 1207 ret = -EFAULT;
1208 goto out_mm; 1208 goto out_mm;
1209 } 1209 }
1210 copied += len; 1210 copied += len;
1211 buf += len; 1211 buf += len;
1212 count -= len; 1212 count -= len;
1213 } 1213 }
1214 *ppos += copied; 1214 *ppos += copied;
1215 if (!ret || ret == PM_END_OF_BUFFER) 1215 if (!ret || ret == PM_END_OF_BUFFER)
1216 ret = copied; 1216 ret = copied;
1217 1217
1218 out_mm: 1218 out_mm:
1219 mmput(mm); 1219 mmput(mm);
1220 out_free: 1220 out_free:
1221 kfree(pm.buffer); 1221 kfree(pm.buffer);
1222 out_task: 1222 out_task:
1223 put_task_struct(task); 1223 put_task_struct(task);
1224 out: 1224 out:
1225 return ret; 1225 return ret;
1226 } 1226 }
1227 1227
1228 static int pagemap_open(struct inode *inode, struct file *file) 1228 static int pagemap_open(struct inode *inode, struct file *file)
1229 { 1229 {
1230 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " 1230 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
1231 "to stop being page-shift some time soon. See the " 1231 "to stop being page-shift some time soon. See the "
1232 "linux/Documentation/vm/pagemap.txt for details.\n"); 1232 "linux/Documentation/vm/pagemap.txt for details.\n");
1233 return 0; 1233 return 0;
1234 } 1234 }
1235 1235
1236 const struct file_operations proc_pagemap_operations = { 1236 const struct file_operations proc_pagemap_operations = {
1237 .llseek = mem_lseek, /* borrow this */ 1237 .llseek = mem_lseek, /* borrow this */
1238 .read = pagemap_read, 1238 .read = pagemap_read,
1239 .open = pagemap_open, 1239 .open = pagemap_open,
1240 }; 1240 };
1241 #endif /* CONFIG_PROC_PAGE_MONITOR */ 1241 #endif /* CONFIG_PROC_PAGE_MONITOR */
1242 1242
1243 #ifdef CONFIG_NUMA 1243 #ifdef CONFIG_NUMA
1244 1244
1245 struct numa_maps { 1245 struct numa_maps {
1246 struct vm_area_struct *vma; 1246 struct vm_area_struct *vma;
1247 unsigned long pages; 1247 unsigned long pages;
1248 unsigned long anon; 1248 unsigned long anon;
1249 unsigned long active; 1249 unsigned long active;
1250 unsigned long writeback; 1250 unsigned long writeback;
1251 unsigned long mapcount_max; 1251 unsigned long mapcount_max;
1252 unsigned long dirty; 1252 unsigned long dirty;
1253 unsigned long swapcache; 1253 unsigned long swapcache;
1254 unsigned long node[MAX_NUMNODES]; 1254 unsigned long node[MAX_NUMNODES];
1255 }; 1255 };
1256 1256
1257 struct numa_maps_private { 1257 struct numa_maps_private {
1258 struct proc_maps_private proc_maps; 1258 struct proc_maps_private proc_maps;
1259 struct numa_maps md; 1259 struct numa_maps md;
1260 }; 1260 };
1261 1261
1262 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, 1262 static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1263 unsigned long nr_pages) 1263 unsigned long nr_pages)
1264 { 1264 {
1265 int count = page_mapcount(page); 1265 int count = page_mapcount(page);
1266 1266
1267 md->pages += nr_pages; 1267 md->pages += nr_pages;
1268 if (pte_dirty || PageDirty(page)) 1268 if (pte_dirty || PageDirty(page))
1269 md->dirty += nr_pages; 1269 md->dirty += nr_pages;
1270 1270
1271 if (PageSwapCache(page)) 1271 if (PageSwapCache(page))
1272 md->swapcache += nr_pages; 1272 md->swapcache += nr_pages;
1273 1273
1274 if (PageActive(page) || PageUnevictable(page)) 1274 if (PageActive(page) || PageUnevictable(page))
1275 md->active += nr_pages; 1275 md->active += nr_pages;
1276 1276
1277 if (PageWriteback(page)) 1277 if (PageWriteback(page))
1278 md->writeback += nr_pages; 1278 md->writeback += nr_pages;
1279 1279
1280 if (PageAnon(page)) 1280 if (PageAnon(page))
1281 md->anon += nr_pages; 1281 md->anon += nr_pages;
1282 1282
1283 if (count > md->mapcount_max) 1283 if (count > md->mapcount_max)
1284 md->mapcount_max = count; 1284 md->mapcount_max = count;
1285 1285
1286 md->node[page_to_nid(page)] += nr_pages; 1286 md->node[page_to_nid(page)] += nr_pages;
1287 } 1287 }
1288 1288
1289 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, 1289 static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1290 unsigned long addr) 1290 unsigned long addr)
1291 { 1291 {
1292 struct page *page; 1292 struct page *page;
1293 int nid; 1293 int nid;
1294 1294
1295 if (!pte_present(pte)) 1295 if (!pte_present(pte))
1296 return NULL; 1296 return NULL;
1297 1297
1298 page = vm_normal_page(vma, addr, pte); 1298 page = vm_normal_page(vma, addr, pte);
1299 if (!page) 1299 if (!page)
1300 return NULL; 1300 return NULL;
1301 1301
1302 if (PageReserved(page)) 1302 if (PageReserved(page))
1303 return NULL; 1303 return NULL;
1304 1304
1305 nid = page_to_nid(page); 1305 nid = page_to_nid(page);
1306 if (!node_isset(nid, node_states[N_MEMORY])) 1306 if (!node_isset(nid, node_states[N_MEMORY]))
1307 return NULL; 1307 return NULL;
1308 1308
1309 return page; 1309 return page;
1310 } 1310 }
1311 1311
1312 static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1312 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1313 unsigned long end, struct mm_walk *walk) 1313 unsigned long end, struct mm_walk *walk)
1314 { 1314 {
1315 struct numa_maps *md; 1315 struct numa_maps *md;
1316 spinlock_t *ptl; 1316 spinlock_t *ptl;
1317 pte_t *orig_pte; 1317 pte_t *orig_pte;
1318 pte_t *pte; 1318 pte_t *pte;
1319 1319
1320 md = walk->private; 1320 md = walk->private;
1321 1321
1322 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { 1322 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1323 pte_t huge_pte = *(pte_t *)pmd; 1323 pte_t huge_pte = *(pte_t *)pmd;
1324 struct page *page; 1324 struct page *page;
1325 1325
1326 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1326 page = can_gather_numa_stats(huge_pte, md->vma, addr);
1327 if (page) 1327 if (page)
1328 gather_stats(page, md, pte_dirty(huge_pte), 1328 gather_stats(page, md, pte_dirty(huge_pte),
1329 HPAGE_PMD_SIZE/PAGE_SIZE); 1329 HPAGE_PMD_SIZE/PAGE_SIZE);
1330 spin_unlock(ptl); 1330 spin_unlock(ptl);
1331 return 0; 1331 return 0;
1332 } 1332 }
1333 1333
1334 if (pmd_trans_unstable(pmd)) 1334 if (pmd_trans_unstable(pmd))
1335 return 0; 1335 return 0;
1336 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1336 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1337 do { 1337 do {
1338 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1338 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
1339 if (!page) 1339 if (!page)
1340 continue; 1340 continue;
1341 gather_stats(page, md, pte_dirty(*pte), 1); 1341 gather_stats(page, md, pte_dirty(*pte), 1);
1342 1342
1343 } while (pte++, addr += PAGE_SIZE, addr != end); 1343 } while (pte++, addr += PAGE_SIZE, addr != end);
1344 pte_unmap_unlock(orig_pte, ptl); 1344 pte_unmap_unlock(orig_pte, ptl);
1345 return 0; 1345 return 0;
1346 } 1346 }
1347 #ifdef CONFIG_HUGETLB_PAGE 1347 #ifdef CONFIG_HUGETLB_PAGE
1348 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1348 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1349 unsigned long addr, unsigned long end, struct mm_walk *walk) 1349 unsigned long addr, unsigned long end, struct mm_walk *walk)
1350 { 1350 {
1351 struct numa_maps *md; 1351 struct numa_maps *md;
1352 struct page *page; 1352 struct page *page;
1353 1353
1354 if (pte_none(*pte)) 1354 if (!pte_present(*pte))
1355 return 0; 1355 return 0;
1356 1356
1357 page = pte_page(*pte); 1357 page = pte_page(*pte);
1358 if (!page) 1358 if (!page)
1359 return 0; 1359 return 0;
1360 1360
1361 md = walk->private; 1361 md = walk->private;
1362 gather_stats(page, md, pte_dirty(*pte), 1); 1362 gather_stats(page, md, pte_dirty(*pte), 1);
1363 return 0; 1363 return 0;
1364 } 1364 }
1365 1365
1366 #else 1366 #else
1367 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1367 static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1368 unsigned long addr, unsigned long end, struct mm_walk *walk) 1368 unsigned long addr, unsigned long end, struct mm_walk *walk)
1369 { 1369 {
1370 return 0; 1370 return 0;
1371 } 1371 }
1372 #endif 1372 #endif
1373 1373
1374 /* 1374 /*
1375 * Display pages allocated per node and memory policy via /proc. 1375 * Display pages allocated per node and memory policy via /proc.
1376 */ 1376 */
1377 static int show_numa_map(struct seq_file *m, void *v, int is_pid) 1377 static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1378 { 1378 {
1379 struct numa_maps_private *numa_priv = m->private; 1379 struct numa_maps_private *numa_priv = m->private;
1380 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 1380 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1381 struct vm_area_struct *vma = v; 1381 struct vm_area_struct *vma = v;
1382 struct numa_maps *md = &numa_priv->md; 1382 struct numa_maps *md = &numa_priv->md;
1383 struct file *file = vma->vm_file; 1383 struct file *file = vma->vm_file;
1384 struct task_struct *task = proc_priv->task; 1384 struct task_struct *task = proc_priv->task;
1385 struct mm_struct *mm = vma->vm_mm; 1385 struct mm_struct *mm = vma->vm_mm;
1386 struct mm_walk walk = {}; 1386 struct mm_walk walk = {};
1387 struct mempolicy *pol; 1387 struct mempolicy *pol;
1388 char buffer[64]; 1388 char buffer[64];
1389 int nid; 1389 int nid;
1390 1390
1391 if (!mm) 1391 if (!mm)
1392 return 0; 1392 return 0;
1393 1393
1394 /* Ensure we start with an empty set of numa_maps statistics. */ 1394 /* Ensure we start with an empty set of numa_maps statistics. */
1395 memset(md, 0, sizeof(*md)); 1395 memset(md, 0, sizeof(*md));
1396 1396
1397 md->vma = vma; 1397 md->vma = vma;
1398 1398
1399 walk.hugetlb_entry = gather_hugetbl_stats; 1399 walk.hugetlb_entry = gather_hugetbl_stats;
1400 walk.pmd_entry = gather_pte_stats; 1400 walk.pmd_entry = gather_pte_stats;
1401 walk.private = md; 1401 walk.private = md;
1402 walk.mm = mm; 1402 walk.mm = mm;
1403 1403
1404 pol = get_vma_policy(task, vma, vma->vm_start); 1404 pol = get_vma_policy(task, vma, vma->vm_start);
1405 mpol_to_str(buffer, sizeof(buffer), pol); 1405 mpol_to_str(buffer, sizeof(buffer), pol);
1406 mpol_cond_put(pol); 1406 mpol_cond_put(pol);
1407 1407
1408 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1408 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1409 1409
1410 if (file) { 1410 if (file) {
1411 seq_printf(m, " file="); 1411 seq_printf(m, " file=");
1412 seq_path(m, &file->f_path, "\n\t= "); 1412 seq_path(m, &file->f_path, "\n\t= ");
1413 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1413 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1414 seq_printf(m, " heap"); 1414 seq_printf(m, " heap");
1415 } else { 1415 } else {
1416 pid_t tid = vm_is_stack(task, vma, is_pid); 1416 pid_t tid = vm_is_stack(task, vma, is_pid);
1417 if (tid != 0) { 1417 if (tid != 0) {
1418 /* 1418 /*
1419 * Thread stack in /proc/PID/task/TID/maps or 1419 * Thread stack in /proc/PID/task/TID/maps or
1420 * the main process stack. 1420 * the main process stack.
1421 */ 1421 */
1422 if (!is_pid || (vma->vm_start <= mm->start_stack && 1422 if (!is_pid || (vma->vm_start <= mm->start_stack &&
1423 vma->vm_end >= mm->start_stack)) 1423 vma->vm_end >= mm->start_stack))
1424 seq_printf(m, " stack"); 1424 seq_printf(m, " stack");
1425 else 1425 else
1426 seq_printf(m, " stack:%d", tid); 1426 seq_printf(m, " stack:%d", tid);
1427 } 1427 }
1428 } 1428 }
1429 1429
1430 if (is_vm_hugetlb_page(vma)) 1430 if (is_vm_hugetlb_page(vma))
1431 seq_printf(m, " huge"); 1431 seq_printf(m, " huge");
1432 1432
1433 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1433 walk_page_range(vma->vm_start, vma->vm_end, &walk);
1434 1434
1435 if (!md->pages) 1435 if (!md->pages)
1436 goto out; 1436 goto out;
1437 1437
1438 if (md->anon) 1438 if (md->anon)
1439 seq_printf(m, " anon=%lu", md->anon); 1439 seq_printf(m, " anon=%lu", md->anon);
1440 1440
1441 if (md->dirty) 1441 if (md->dirty)
1442 seq_printf(m, " dirty=%lu", md->dirty); 1442 seq_printf(m, " dirty=%lu", md->dirty);
1443 1443
1444 if (md->pages != md->anon && md->pages != md->dirty) 1444 if (md->pages != md->anon && md->pages != md->dirty)
1445 seq_printf(m, " mapped=%lu", md->pages); 1445 seq_printf(m, " mapped=%lu", md->pages);
1446 1446
1447 if (md->mapcount_max > 1) 1447 if (md->mapcount_max > 1)
1448 seq_printf(m, " mapmax=%lu", md->mapcount_max); 1448 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1449 1449
1450 if (md->swapcache) 1450 if (md->swapcache)
1451 seq_printf(m, " swapcache=%lu", md->swapcache); 1451 seq_printf(m, " swapcache=%lu", md->swapcache);
1452 1452
1453 if (md->active < md->pages && !is_vm_hugetlb_page(vma)) 1453 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1454 seq_printf(m, " active=%lu", md->active); 1454 seq_printf(m, " active=%lu", md->active);
1455 1455
1456 if (md->writeback) 1456 if (md->writeback)
1457 seq_printf(m, " writeback=%lu", md->writeback); 1457 seq_printf(m, " writeback=%lu", md->writeback);
1458 1458
1459 for_each_node_state(nid, N_MEMORY) 1459 for_each_node_state(nid, N_MEMORY)
1460 if (md->node[nid]) 1460 if (md->node[nid])
1461 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1461 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1462 out: 1462 out:
1463 seq_putc(m, '\n'); 1463 seq_putc(m, '\n');
1464 1464
1465 if (m->count < m->size) 1465 if (m->count < m->size)
1466 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; 1466 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1467 return 0; 1467 return 0;
1468 } 1468 }
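For orientation, the seq_printf() calls above assemble one /proc/<pid>/numa_maps line per VMA. With purely illustrative numbers (not measured on any real system), such lines look roughly like:

7f63aa400000 default file=/usr/lib64/libc-2.18.so mapped=123 mapmax=35 active=110 N0=80 N1=43
7f63aab00000 interleave:0-1 anon=512 dirty=512 N0=256 N1=256

The leading policy string comes from mpol_to_str(), and the trailing N<nid>= counters are the per-node totals gathered into md->node[] by the page walk.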
1469 1469
1470 static int show_pid_numa_map(struct seq_file *m, void *v) 1470 static int show_pid_numa_map(struct seq_file *m, void *v)
1471 { 1471 {
1472 return show_numa_map(m, v, 1); 1472 return show_numa_map(m, v, 1);
1473 } 1473 }
1474 1474
1475 static int show_tid_numa_map(struct seq_file *m, void *v) 1475 static int show_tid_numa_map(struct seq_file *m, void *v)
1476 { 1476 {
1477 return show_numa_map(m, v, 0); 1477 return show_numa_map(m, v, 0);
1478 } 1478 }
1479 1479
1480 static const struct seq_operations proc_pid_numa_maps_op = { 1480 static const struct seq_operations proc_pid_numa_maps_op = {
1481 .start = m_start, 1481 .start = m_start,
1482 .next = m_next, 1482 .next = m_next,
1483 .stop = m_stop, 1483 .stop = m_stop,
1484 .show = show_pid_numa_map, 1484 .show = show_pid_numa_map,
1485 }; 1485 };
1486 1486
1487 static const struct seq_operations proc_tid_numa_maps_op = { 1487 static const struct seq_operations proc_tid_numa_maps_op = {
1488 .start = m_start, 1488 .start = m_start,
1489 .next = m_next, 1489 .next = m_next,
1490 .stop = m_stop, 1490 .stop = m_stop,
1491 .show = show_tid_numa_map, 1491 .show = show_tid_numa_map,
1492 }; 1492 };
1493 1493
1494 static int numa_maps_open(struct inode *inode, struct file *file, 1494 static int numa_maps_open(struct inode *inode, struct file *file,
1495 const struct seq_operations *ops) 1495 const struct seq_operations *ops)
1496 { 1496 {
1497 struct numa_maps_private *priv; 1497 struct numa_maps_private *priv;
1498 int ret = -ENOMEM; 1498 int ret = -ENOMEM;
1499 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1499 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1500 if (priv) { 1500 if (priv) {
1501 priv->proc_maps.pid = proc_pid(inode); 1501 priv->proc_maps.pid = proc_pid(inode);
1502 ret = seq_open(file, ops); 1502 ret = seq_open(file, ops);
1503 if (!ret) { 1503 if (!ret) {
1504 struct seq_file *m = file->private_data; 1504 struct seq_file *m = file->private_data;
1505 m->private = priv; 1505 m->private = priv;
1506 } else { 1506 } else {
1507 kfree(priv); 1507 kfree(priv);
1508 } 1508 }
1509 } 1509 }
1510 return ret; 1510 return ret;
1511 } 1511 }
1512 1512
1513 static int pid_numa_maps_open(struct inode *inode, struct file *file) 1513 static int pid_numa_maps_open(struct inode *inode, struct file *file)
1514 { 1514 {
1515 return numa_maps_open(inode, file, &proc_pid_numa_maps_op); 1515 return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1516 } 1516 }
1517 1517
1518 static int tid_numa_maps_open(struct inode *inode, struct file *file) 1518 static int tid_numa_maps_open(struct inode *inode, struct file *file)
1519 { 1519 {
1520 return numa_maps_open(inode, file, &proc_tid_numa_maps_op); 1520 return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1521 } 1521 }
1522 1522
1523 const struct file_operations proc_pid_numa_maps_operations = { 1523 const struct file_operations proc_pid_numa_maps_operations = {
1524 .open = pid_numa_maps_open, 1524 .open = pid_numa_maps_open,
1525 .read = seq_read, 1525 .read = seq_read,
1526 .llseek = seq_lseek, 1526 .llseek = seq_lseek,
1527 .release = seq_release_private, 1527 .release = seq_release_private,
1528 }; 1528 };
1529 1529
1530 const struct file_operations proc_tid_numa_maps_operations = { 1530 const struct file_operations proc_tid_numa_maps_operations = {
1531 .open = tid_numa_maps_open, 1531 .open = tid_numa_maps_open,
1532 .read = seq_read, 1532 .read = seq_read,
1533 .llseek = seq_lseek, 1533 .llseek = seq_lseek,
1534 .release = seq_release_private, 1534 .release = seq_release_private,
1535 }; 1535 };
1536 #endif /* CONFIG_NUMA */ 1536 #endif /* CONFIG_NUMA */
1537 1537
1 /* 1 /*
2 * Simple NUMA memory policy for the Linux kernel. 2 * Simple NUMA memory policy for the Linux kernel.
3 * 3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs. 4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. 5 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
6 * Subject to the GNU Public License, version 2. 6 * Subject to the GNU Public License, version 2.
7 * 7 *
8 * NUMA policy allows the user to give hints in which node(s) memory should 8 * NUMA policy allows the user to give hints in which node(s) memory should
9 * be allocated. 9 * be allocated.
10 * 10 *
11 * Support four policies per VMA and per process: 11 * Support four policies per VMA and per process:
12 * 12 *
13 * The VMA policy has priority over the process policy for a page fault. 13 * The VMA policy has priority over the process policy for a page fault.
14 * 14 *
15 * interleave Allocate memory interleaved over a set of nodes, 15 * interleave Allocate memory interleaved over a set of nodes,
16 * with normal fallback if it fails. 16 * with normal fallback if it fails.
17 * For VMA based allocations this interleaves based on the 17 * For VMA based allocations this interleaves based on the
18 * offset into the backing object or offset into the mapping 18 * offset into the backing object or offset into the mapping
19 * for anonymous memory. For process policy a process counter 19 * for anonymous memory. For process policy a process counter
20 * is used. 20 * is used.
21 * 21 *
22 * bind Only allocate memory on a specific set of nodes, 22 * bind Only allocate memory on a specific set of nodes,
23 * no fallback. 23 * no fallback.
24 * FIXME: memory is allocated starting with the first node 24 * FIXME: memory is allocated starting with the first node
25 * to the last. It would be better if bind would truly restrict 25 * to the last. It would be better if bind would truly restrict
26 * the allocation to memory nodes instead 26 * the allocation to memory nodes instead
27 * 27 *
28 * preferred Try a specific node first before normal fallback. 28 * preferred Try a specific node first before normal fallback.
29 * As a special case NUMA_NO_NODE here means do the allocation 29 * As a special case NUMA_NO_NODE here means do the allocation
30 * on the local CPU. This is normally identical to default, 30 * on the local CPU. This is normally identical to default,
31 * but useful to set in a VMA when you have a non default 31 * but useful to set in a VMA when you have a non default
32 * process policy. 32 * process policy.
33 * 33 *
34 * default Allocate on the local node first, or when on a VMA 34 * default Allocate on the local node first, or when on a VMA
35 * use the process policy. This is what Linux always did 35 * use the process policy. This is what Linux always did
36 * in a NUMA aware kernel and still does by, ahem, default. 36 * in a NUMA aware kernel and still does by, ahem, default.
37 * 37 *
38 * The process policy is applied for most non interrupt memory allocations 38 * The process policy is applied for most non interrupt memory allocations
39 * in that process' context. Interrupts ignore the policies and always 39 * in that process' context. Interrupts ignore the policies and always
40 * try to allocate on the local CPU. The VMA policy is only applied for memory 40 * try to allocate on the local CPU. The VMA policy is only applied for memory
41 * allocations for a VMA in the VM. 41 * allocations for a VMA in the VM.
42 * 42 *
43 * Currently there are a few corner cases in swapping where the policy 43 * Currently there are a few corner cases in swapping where the policy
44 * is not applied, but the majority should be handled. When process policy 44 * is not applied, but the majority should be handled. When process policy
45 * is used it is not remembered over swap outs/swap ins. 45 * is used it is not remembered over swap outs/swap ins.
46 * 46 *
47 * Only the highest zone in the zone hierarchy gets policied. Allocations 47 * Only the highest zone in the zone hierarchy gets policied. Allocations
48 * requesting a lower zone just use default policy. This implies that 48 * requesting a lower zone just use default policy. This implies that
49 * on systems with highmem, kernel lowmem allocations don't get policied. 49 * on systems with highmem, kernel lowmem allocations don't get policied.
50 * Same with GFP_DMA allocations. 50 * Same with GFP_DMA allocations.
51 * 51 *
52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between 52 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
53 * all users and remembered even when nobody has memory mapped. 53 * all users and remembered even when nobody has memory mapped.
54 */ 54 */
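As a user-space illustration of the policies described above (not part of this file), the set_mempolicy(2) wrapper from libnuma's <numaif.h> can request interleaving across nodes 0 and 1 for subsequent allocations. This is only a sketch and assumes a machine with at least two memory nodes; link with -lnuma.

#include <numaif.h>		/* set_mempolicy(), MPOL_INTERLEAVE */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	/* Nodes are encoded as a bitmask; bit N selects node N. */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	/* maxnode: number of bits in nodemask the kernel should examine. */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
		perror("set_mempolicy");
		return 1;
	}

	/* Pages faulted in from here on are interleaved over nodes 0 and 1,
	 * which then shows up as "interleave:0-1" in /proc/self/numa_maps. */
	char *buf = malloc(16 << 20);
	if (buf)
		memset(buf, 0, 16 << 20);
	free(buf);
	return 0;
}

The same mask and mode arguments work with mbind(2) when the policy should apply to one mapping rather than to the whole task.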
55 55
56 /* Notebook: 56 /* Notebook:
57 fix mmap readahead to honour policy and enable policy for any page cache 57 fix mmap readahead to honour policy and enable policy for any page cache
58 object 58 object
59 statistics for bigpages 59 statistics for bigpages
60 global policy for page cache? currently it uses process policy. Requires 60 global policy for page cache? currently it uses process policy. Requires
61 first item above. 61 first item above.
62 handle mremap for shared memory (currently ignored for the policy) 62 handle mremap for shared memory (currently ignored for the policy)
63 grows down? 63 grows down?
64 make bind policy root only? It can trigger oom much faster and the 64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that. 65 kernel is not always grateful with that.
66 */ 66 */
67 67
68 #include <linux/mempolicy.h> 68 #include <linux/mempolicy.h>
69 #include <linux/mm.h> 69 #include <linux/mm.h>
70 #include <linux/highmem.h> 70 #include <linux/highmem.h>
71 #include <linux/hugetlb.h> 71 #include <linux/hugetlb.h>
72 #include <linux/kernel.h> 72 #include <linux/kernel.h>
73 #include <linux/sched.h> 73 #include <linux/sched.h>
74 #include <linux/nodemask.h> 74 #include <linux/nodemask.h>
75 #include <linux/cpuset.h> 75 #include <linux/cpuset.h>
76 #include <linux/slab.h> 76 #include <linux/slab.h>
77 #include <linux/string.h> 77 #include <linux/string.h>
78 #include <linux/export.h> 78 #include <linux/export.h>
79 #include <linux/nsproxy.h> 79 #include <linux/nsproxy.h>
80 #include <linux/interrupt.h> 80 #include <linux/interrupt.h>
81 #include <linux/init.h> 81 #include <linux/init.h>
82 #include <linux/compat.h> 82 #include <linux/compat.h>
83 #include <linux/swap.h> 83 #include <linux/swap.h>
84 #include <linux/seq_file.h> 84 #include <linux/seq_file.h>
85 #include <linux/proc_fs.h> 85 #include <linux/proc_fs.h>
86 #include <linux/migrate.h> 86 #include <linux/migrate.h>
87 #include <linux/ksm.h> 87 #include <linux/ksm.h>
88 #include <linux/rmap.h> 88 #include <linux/rmap.h>
89 #include <linux/security.h> 89 #include <linux/security.h>
90 #include <linux/syscalls.h> 90 #include <linux/syscalls.h>
91 #include <linux/ctype.h> 91 #include <linux/ctype.h>
92 #include <linux/mm_inline.h> 92 #include <linux/mm_inline.h>
93 #include <linux/mmu_notifier.h> 93 #include <linux/mmu_notifier.h>
94 94
95 #include <asm/tlbflush.h> 95 #include <asm/tlbflush.h>
96 #include <asm/uaccess.h> 96 #include <asm/uaccess.h>
97 #include <linux/random.h> 97 #include <linux/random.h>
98 98
99 #include "internal.h" 99 #include "internal.h"
100 100
101 /* Internal flags */ 101 /* Internal flags */
102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for contiguous vmas */ 102 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for contiguous vmas */
103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 103 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
104 104
105 static struct kmem_cache *policy_cache; 105 static struct kmem_cache *policy_cache;
106 static struct kmem_cache *sn_cache; 106 static struct kmem_cache *sn_cache;
107 107
108 /* Highest zone. A specific allocation for a zone below that is not 108 /* Highest zone. A specific allocation for a zone below that is not
109 policied. */ 109 policied. */
110 enum zone_type policy_zone = 0; 110 enum zone_type policy_zone = 0;
111 111
112 /* 112 /*
113 * run-time system-wide default policy => local allocation 113 * run-time system-wide default policy => local allocation
114 */ 114 */
115 static struct mempolicy default_policy = { 115 static struct mempolicy default_policy = {
116 .refcnt = ATOMIC_INIT(1), /* never free it */ 116 .refcnt = ATOMIC_INIT(1), /* never free it */
117 .mode = MPOL_PREFERRED, 117 .mode = MPOL_PREFERRED,
118 .flags = MPOL_F_LOCAL, 118 .flags = MPOL_F_LOCAL,
119 }; 119 };
120 120
121 static struct mempolicy preferred_node_policy[MAX_NUMNODES]; 121 static struct mempolicy preferred_node_policy[MAX_NUMNODES];
122 122
123 static struct mempolicy *get_task_policy(struct task_struct *p) 123 static struct mempolicy *get_task_policy(struct task_struct *p)
124 { 124 {
125 struct mempolicy *pol = p->mempolicy; 125 struct mempolicy *pol = p->mempolicy;
126 126
127 if (!pol) { 127 if (!pol) {
128 int node = numa_node_id(); 128 int node = numa_node_id();
129 129
130 if (node != NUMA_NO_NODE) { 130 if (node != NUMA_NO_NODE) {
131 pol = &preferred_node_policy[node]; 131 pol = &preferred_node_policy[node];
132 /* 132 /*
133 * preferred_node_policy is not initialised early in 133 * preferred_node_policy is not initialised early in
134 * boot 134 * boot
135 */ 135 */
136 if (!pol->mode) 136 if (!pol->mode)
137 pol = NULL; 137 pol = NULL;
138 } 138 }
139 } 139 }
140 140
141 return pol; 141 return pol;
142 } 142 }
143 143
144 static const struct mempolicy_operations { 144 static const struct mempolicy_operations {
145 int (*create)(struct mempolicy *pol, const nodemask_t *nodes); 145 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
146 /* 146 /*
147 * If the read-side task has no lock to protect task->mempolicy, the write-side 147 * If the read-side task has no lock to protect task->mempolicy, the write-side
148 * task will rebind task->mempolicy in two steps. The first step is 148 * task will rebind task->mempolicy in two steps. The first step is
149 * setting all the newly allowed nodes, and the second step is cleaning all the 149 * setting all the newly allowed nodes, and the second step is cleaning all the
150 * disallowed nodes. In this way, we can avoid finding no node to alloc 150 * disallowed nodes. In this way, we can avoid finding no node to alloc
151 * page. 151 * page.
152 * If we have a lock to protect task->mempolicy in read-side, we do 152 * If we have a lock to protect task->mempolicy in read-side, we do
153 * rebind directly. 153 * rebind directly.
154 * 154 *
155 * step: 155 * step:
156 * MPOL_REBIND_ONCE - do rebind work at once 156 * MPOL_REBIND_ONCE - do rebind work at once
157 * MPOL_REBIND_STEP1 - set all the newly allowed nodes 157 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
158 * MPOL_REBIND_STEP2 - clean all the disallowed nodes 158 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
159 */ 159 */
160 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, 160 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
161 enum mpol_rebind_step step); 161 enum mpol_rebind_step step);
162 } mpol_ops[MPOL_MAX]; 162 } mpol_ops[MPOL_MAX];
163 163
164 /* Check that the nodemask contains at least one populated zone */ 164 /* Check that the nodemask contains at least one populated zone */
165 static int is_valid_nodemask(const nodemask_t *nodemask) 165 static int is_valid_nodemask(const nodemask_t *nodemask)
166 { 166 {
167 return nodes_intersects(*nodemask, node_states[N_MEMORY]); 167 return nodes_intersects(*nodemask, node_states[N_MEMORY]);
168 } 168 }
169 169
170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol) 170 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
171 { 171 {
172 return pol->flags & MPOL_MODE_FLAGS; 172 return pol->flags & MPOL_MODE_FLAGS;
173 } 173 }
174 174
175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, 175 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
176 const nodemask_t *rel) 176 const nodemask_t *rel)
177 { 177 {
178 nodemask_t tmp; 178 nodemask_t tmp;
179 nodes_fold(tmp, *orig, nodes_weight(*rel)); 179 nodes_fold(tmp, *orig, nodes_weight(*rel));
180 nodes_onto(*ret, tmp, *rel); 180 nodes_onto(*ret, tmp, *rel);
181 } 181 }
182 182
183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) 183 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
184 { 184 {
185 if (nodes_empty(*nodes)) 185 if (nodes_empty(*nodes))
186 return -EINVAL; 186 return -EINVAL;
187 pol->v.nodes = *nodes; 187 pol->v.nodes = *nodes;
188 return 0; 188 return 0;
189 } 189 }
190 190
191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) 191 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
192 { 192 {
193 if (!nodes) 193 if (!nodes)
194 pol->flags |= MPOL_F_LOCAL; /* local allocation */ 194 pol->flags |= MPOL_F_LOCAL; /* local allocation */
195 else if (nodes_empty(*nodes)) 195 else if (nodes_empty(*nodes))
196 return -EINVAL; /* no allowed nodes */ 196 return -EINVAL; /* no allowed nodes */
197 else 197 else
198 pol->v.preferred_node = first_node(*nodes); 198 pol->v.preferred_node = first_node(*nodes);
199 return 0; 199 return 0;
200 } 200 }
201 201
202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) 202 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
203 { 203 {
204 if (!is_valid_nodemask(nodes)) 204 if (!is_valid_nodemask(nodes))
205 return -EINVAL; 205 return -EINVAL;
206 pol->v.nodes = *nodes; 206 pol->v.nodes = *nodes;
207 return 0; 207 return 0;
208 } 208 }
209 209
210 /* 210 /*
211 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if 211 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
212 * any, for the new policy. mpol_new() has already validated the nodes 212 * any, for the new policy. mpol_new() has already validated the nodes
213 * parameter with respect to the policy mode and flags. But, we need to 213 * parameter with respect to the policy mode and flags. But, we need to
214 * handle an empty nodemask with MPOL_PREFERRED here. 214 * handle an empty nodemask with MPOL_PREFERRED here.
215 * 215 *
216 * Must be called holding task's alloc_lock to protect task's mems_allowed 216 * Must be called holding task's alloc_lock to protect task's mems_allowed
217 * and mempolicy. May also be called holding the mmap_semaphore for write. 217 * and mempolicy. May also be called holding the mmap_semaphore for write.
218 */ 218 */
219 static int mpol_set_nodemask(struct mempolicy *pol, 219 static int mpol_set_nodemask(struct mempolicy *pol,
220 const nodemask_t *nodes, struct nodemask_scratch *nsc) 220 const nodemask_t *nodes, struct nodemask_scratch *nsc)
221 { 221 {
222 int ret; 222 int ret;
223 223
224 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ 224 /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
225 if (pol == NULL) 225 if (pol == NULL)
226 return 0; 226 return 0;
227 /* Check N_MEMORY */ 227 /* Check N_MEMORY */
228 nodes_and(nsc->mask1, 228 nodes_and(nsc->mask1,
229 cpuset_current_mems_allowed, node_states[N_MEMORY]); 229 cpuset_current_mems_allowed, node_states[N_MEMORY]);
230 230
231 VM_BUG_ON(!nodes); 231 VM_BUG_ON(!nodes);
232 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) 232 if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
233 nodes = NULL; /* explicit local allocation */ 233 nodes = NULL; /* explicit local allocation */
234 else { 234 else {
235 if (pol->flags & MPOL_F_RELATIVE_NODES) 235 if (pol->flags & MPOL_F_RELATIVE_NODES)
236 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); 236 mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
237 else 237 else
238 nodes_and(nsc->mask2, *nodes, nsc->mask1); 238 nodes_and(nsc->mask2, *nodes, nsc->mask1);
239 239
240 if (mpol_store_user_nodemask(pol)) 240 if (mpol_store_user_nodemask(pol))
241 pol->w.user_nodemask = *nodes; 241 pol->w.user_nodemask = *nodes;
242 else 242 else
243 pol->w.cpuset_mems_allowed = 243 pol->w.cpuset_mems_allowed =
244 cpuset_current_mems_allowed; 244 cpuset_current_mems_allowed;
245 } 245 }
246 246
247 if (nodes) 247 if (nodes)
248 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); 248 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
249 else 249 else
250 ret = mpol_ops[pol->mode].create(pol, NULL); 250 ret = mpol_ops[pol->mode].create(pol, NULL);
251 return ret; 251 return ret;
252 } 252 }
253 253
254 /* 254 /*
255 * This function just creates a new policy, does some checks and simple 255 * This function just creates a new policy, does some checks and simple
256 * initialization. You must invoke mpol_set_nodemask() to set nodes. 256 * initialization. You must invoke mpol_set_nodemask() to set nodes.
257 */ 257 */
258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, 258 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
259 nodemask_t *nodes) 259 nodemask_t *nodes)
260 { 260 {
261 struct mempolicy *policy; 261 struct mempolicy *policy;
262 262
263 pr_debug("setting mode %d flags %d nodes[0] %lx\n", 263 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
264 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); 264 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
265 265
266 if (mode == MPOL_DEFAULT) { 266 if (mode == MPOL_DEFAULT) {
267 if (nodes && !nodes_empty(*nodes)) 267 if (nodes && !nodes_empty(*nodes))
268 return ERR_PTR(-EINVAL); 268 return ERR_PTR(-EINVAL);
269 return NULL; 269 return NULL;
270 } 270 }
271 VM_BUG_ON(!nodes); 271 VM_BUG_ON(!nodes);
272 272
273 /* 273 /*
274 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or 274 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
275 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). 275 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
276 * All other modes require a valid pointer to a non-empty nodemask. 276 * All other modes require a valid pointer to a non-empty nodemask.
277 */ 277 */
278 if (mode == MPOL_PREFERRED) { 278 if (mode == MPOL_PREFERRED) {
279 if (nodes_empty(*nodes)) { 279 if (nodes_empty(*nodes)) {
280 if (((flags & MPOL_F_STATIC_NODES) || 280 if (((flags & MPOL_F_STATIC_NODES) ||
281 (flags & MPOL_F_RELATIVE_NODES))) 281 (flags & MPOL_F_RELATIVE_NODES)))
282 return ERR_PTR(-EINVAL); 282 return ERR_PTR(-EINVAL);
283 } 283 }
284 } else if (mode == MPOL_LOCAL) { 284 } else if (mode == MPOL_LOCAL) {
285 if (!nodes_empty(*nodes)) 285 if (!nodes_empty(*nodes))
286 return ERR_PTR(-EINVAL); 286 return ERR_PTR(-EINVAL);
287 mode = MPOL_PREFERRED; 287 mode = MPOL_PREFERRED;
288 } else if (nodes_empty(*nodes)) 288 } else if (nodes_empty(*nodes))
289 return ERR_PTR(-EINVAL); 289 return ERR_PTR(-EINVAL);
290 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 290 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
291 if (!policy) 291 if (!policy)
292 return ERR_PTR(-ENOMEM); 292 return ERR_PTR(-ENOMEM);
293 atomic_set(&policy->refcnt, 1); 293 atomic_set(&policy->refcnt, 1);
294 policy->mode = mode; 294 policy->mode = mode;
295 policy->flags = flags; 295 policy->flags = flags;
296 296
297 return policy; 297 return policy;
298 } 298 }
299 299
300 /* Slow path of a mpol destructor. */ 300 /* Slow path of a mpol destructor. */
301 void __mpol_put(struct mempolicy *p) 301 void __mpol_put(struct mempolicy *p)
302 { 302 {
303 if (!atomic_dec_and_test(&p->refcnt)) 303 if (!atomic_dec_and_test(&p->refcnt))
304 return; 304 return;
305 kmem_cache_free(policy_cache, p); 305 kmem_cache_free(policy_cache, p);
306 } 306 }
307 307
308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, 308 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
309 enum mpol_rebind_step step) 309 enum mpol_rebind_step step)
310 { 310 {
311 } 311 }
312 312
313 /* 313 /*
314 * step: 314 * step:
315 * MPOL_REBIND_ONCE - do rebind work at once 315 * MPOL_REBIND_ONCE - do rebind work at once
316 * MPOL_REBIND_STEP1 - set all the newly allowed nodes 316 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
317 * MPOL_REBIND_STEP2 - clean all the disallowed nodes 317 * MPOL_REBIND_STEP2 - clean all the disallowed nodes
318 */ 318 */
319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, 319 static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
320 enum mpol_rebind_step step) 320 enum mpol_rebind_step step)
321 { 321 {
322 nodemask_t tmp; 322 nodemask_t tmp;
323 323
324 if (pol->flags & MPOL_F_STATIC_NODES) 324 if (pol->flags & MPOL_F_STATIC_NODES)
325 nodes_and(tmp, pol->w.user_nodemask, *nodes); 325 nodes_and(tmp, pol->w.user_nodemask, *nodes);
326 else if (pol->flags & MPOL_F_RELATIVE_NODES) 326 else if (pol->flags & MPOL_F_RELATIVE_NODES)
327 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 327 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
328 else { 328 else {
329 /* 329 /*
330 * if step == 1, we use ->w.cpuset_mems_allowed to cache the 330 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
331 * result 331 * result
332 */ 332 */
333 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { 333 if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
334 nodes_remap(tmp, pol->v.nodes, 334 nodes_remap(tmp, pol->v.nodes,
335 pol->w.cpuset_mems_allowed, *nodes); 335 pol->w.cpuset_mems_allowed, *nodes);
336 pol->w.cpuset_mems_allowed = step ? tmp : *nodes; 336 pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
337 } else if (step == MPOL_REBIND_STEP2) { 337 } else if (step == MPOL_REBIND_STEP2) {
338 tmp = pol->w.cpuset_mems_allowed; 338 tmp = pol->w.cpuset_mems_allowed;
339 pol->w.cpuset_mems_allowed = *nodes; 339 pol->w.cpuset_mems_allowed = *nodes;
340 } else 340 } else
341 BUG(); 341 BUG();
342 } 342 }
343 343
344 if (nodes_empty(tmp)) 344 if (nodes_empty(tmp))
345 tmp = *nodes; 345 tmp = *nodes;
346 346
347 if (step == MPOL_REBIND_STEP1) 347 if (step == MPOL_REBIND_STEP1)
348 nodes_or(pol->v.nodes, pol->v.nodes, tmp); 348 nodes_or(pol->v.nodes, pol->v.nodes, tmp);
349 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) 349 else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
350 pol->v.nodes = tmp; 350 pol->v.nodes = tmp;
351 else 351 else
352 BUG(); 352 BUG();
353 353
354 if (!node_isset(current->il_next, tmp)) { 354 if (!node_isset(current->il_next, tmp)) {
355 current->il_next = next_node(current->il_next, tmp); 355 current->il_next = next_node(current->il_next, tmp);
356 if (current->il_next >= MAX_NUMNODES) 356 if (current->il_next >= MAX_NUMNODES)
357 current->il_next = first_node(tmp); 357 current->il_next = first_node(tmp);
358 if (current->il_next >= MAX_NUMNODES) 358 if (current->il_next >= MAX_NUMNODES)
359 current->il_next = numa_node_id(); 359 current->il_next = numa_node_id();
360 } 360 }
361 } 361 }
362 362
363 static void mpol_rebind_preferred(struct mempolicy *pol, 363 static void mpol_rebind_preferred(struct mempolicy *pol,
364 const nodemask_t *nodes, 364 const nodemask_t *nodes,
365 enum mpol_rebind_step step) 365 enum mpol_rebind_step step)
366 { 366 {
367 nodemask_t tmp; 367 nodemask_t tmp;
368 368
369 if (pol->flags & MPOL_F_STATIC_NODES) { 369 if (pol->flags & MPOL_F_STATIC_NODES) {
370 int node = first_node(pol->w.user_nodemask); 370 int node = first_node(pol->w.user_nodemask);
371 371
372 if (node_isset(node, *nodes)) { 372 if (node_isset(node, *nodes)) {
373 pol->v.preferred_node = node; 373 pol->v.preferred_node = node;
374 pol->flags &= ~MPOL_F_LOCAL; 374 pol->flags &= ~MPOL_F_LOCAL;
375 } else 375 } else
376 pol->flags |= MPOL_F_LOCAL; 376 pol->flags |= MPOL_F_LOCAL;
377 } else if (pol->flags & MPOL_F_RELATIVE_NODES) { 377 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
378 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); 378 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
379 pol->v.preferred_node = first_node(tmp); 379 pol->v.preferred_node = first_node(tmp);
380 } else if (!(pol->flags & MPOL_F_LOCAL)) { 380 } else if (!(pol->flags & MPOL_F_LOCAL)) {
381 pol->v.preferred_node = node_remap(pol->v.preferred_node, 381 pol->v.preferred_node = node_remap(pol->v.preferred_node,
382 pol->w.cpuset_mems_allowed, 382 pol->w.cpuset_mems_allowed,
383 *nodes); 383 *nodes);
384 pol->w.cpuset_mems_allowed = *nodes; 384 pol->w.cpuset_mems_allowed = *nodes;
385 } 385 }
386 } 386 }
387 387
388 /* 388 /*
389 * mpol_rebind_policy - Migrate a policy to a different set of nodes 389 * mpol_rebind_policy - Migrate a policy to a different set of nodes
390 * 390 *
391 * If the read-side task has no lock to protect task->mempolicy, the write-side 391 * If the read-side task has no lock to protect task->mempolicy, the write-side
392 * task will rebind task->mempolicy in two steps. The first step is 392 * task will rebind task->mempolicy in two steps. The first step is
393 * setting all the newly allowed nodes, and the second step is cleaning all the 393 * setting all the newly allowed nodes, and the second step is cleaning all the
394 * disallowed nodes. In this way, we can avoid finding no node to alloc 394 * disallowed nodes. In this way, we can avoid finding no node to alloc
395 * page. 395 * page.
396 * If we have a lock to protect task->mempolicy in read-side, we do 396 * If we have a lock to protect task->mempolicy in read-side, we do
397 * rebind directly. 397 * rebind directly.
398 * 398 *
399 * step: 399 * step:
400 * MPOL_REBIND_ONCE - do rebind work at once 400 * MPOL_REBIND_ONCE - do rebind work at once
401 * MPOL_REBIND_STEP1 - set all the newly allowed nodes 401 * MPOL_REBIND_STEP1 - set all the newly allowed nodes
402 * MPOL_REBIND_STEP2 - clear all the disallowed nodes 402 * MPOL_REBIND_STEP2 - clear all the disallowed nodes
403 */ 403 */
404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, 404 static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
405 enum mpol_rebind_step step) 405 enum mpol_rebind_step step)
406 { 406 {
407 if (!pol) 407 if (!pol)
408 return; 408 return;
409 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && 409 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
410 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 410 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
411 return; 411 return;
412 412
413 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) 413 if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
414 return; 414 return;
415 415
416 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) 416 if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
417 BUG(); 417 BUG();
418 418
419 if (step == MPOL_REBIND_STEP1) 419 if (step == MPOL_REBIND_STEP1)
420 pol->flags |= MPOL_F_REBINDING; 420 pol->flags |= MPOL_F_REBINDING;
421 else if (step == MPOL_REBIND_STEP2) 421 else if (step == MPOL_REBIND_STEP2)
422 pol->flags &= ~MPOL_F_REBINDING; 422 pol->flags &= ~MPOL_F_REBINDING;
423 else if (step >= MPOL_REBIND_NSTEP) 423 else if (step >= MPOL_REBIND_NSTEP)
424 BUG(); 424 BUG();
425 425
426 mpol_ops[pol->mode].rebind(pol, newmask, step); 426 mpol_ops[pol->mode].rebind(pol, newmask, step);
427 } 427 }
428 428
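To make the two-step protocol described above concrete, here is a minimal stand-alone sketch (not part of this patch; plain unsigned long bitmasks and made-up node numbers stand in for nodemask_t and the real remapping) showing why a concurrent reader between the two steps never observes an empty mask:

#include <stdio.h>

int main(void)
{
	unsigned long policy  = 0x3;	/* policy currently covers nodes {0,1} */
	unsigned long newmask = 0xc;	/* the cpuset now allows nodes {2,3}   */

	/* MPOL_REBIND_STEP1: OR in the newly allowed nodes, cf. nodes_or()
	 * in mpol_rebind_nodemask().  A reader that looks at 'policy' here
	 * sees {0,1,2,3} - never an empty mask. */
	policy |= newmask;

	/* MPOL_REBIND_STEP2: clear the now-disallowed nodes. */
	policy &= newmask;

	printf("final mask: %#lx\n", policy);	/* 0xc, i.e. nodes {2,3} */
	return 0;
}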
429 /* 429 /*
430 * Wrapper for mpol_rebind_policy() that just requires task 430 * Wrapper for mpol_rebind_policy() that just requires task
431 * pointer, and updates task mempolicy. 431 * pointer, and updates task mempolicy.
432 * 432 *
433 * Called with task's alloc_lock held. 433 * Called with task's alloc_lock held.
434 */ 434 */
435 435
436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, 436 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
437 enum mpol_rebind_step step) 437 enum mpol_rebind_step step)
438 { 438 {
439 mpol_rebind_policy(tsk->mempolicy, new, step); 439 mpol_rebind_policy(tsk->mempolicy, new, step);
440 } 440 }
441 441
442 /* 442 /*
443 * Rebind each vma in mm to new nodemask. 443 * Rebind each vma in mm to new nodemask.
444 * 444 *
445 * Call holding a reference to mm. Takes mm->mmap_sem during call. 445 * Call holding a reference to mm. Takes mm->mmap_sem during call.
446 */ 446 */
447 447
448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 448 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
449 { 449 {
450 struct vm_area_struct *vma; 450 struct vm_area_struct *vma;
451 451
452 down_write(&mm->mmap_sem); 452 down_write(&mm->mmap_sem);
453 for (vma = mm->mmap; vma; vma = vma->vm_next) 453 for (vma = mm->mmap; vma; vma = vma->vm_next)
454 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); 454 mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
455 up_write(&mm->mmap_sem); 455 up_write(&mm->mmap_sem);
456 } 456 }
457 457
458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { 458 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
459 [MPOL_DEFAULT] = { 459 [MPOL_DEFAULT] = {
460 .rebind = mpol_rebind_default, 460 .rebind = mpol_rebind_default,
461 }, 461 },
462 [MPOL_INTERLEAVE] = { 462 [MPOL_INTERLEAVE] = {
463 .create = mpol_new_interleave, 463 .create = mpol_new_interleave,
464 .rebind = mpol_rebind_nodemask, 464 .rebind = mpol_rebind_nodemask,
465 }, 465 },
466 [MPOL_PREFERRED] = { 466 [MPOL_PREFERRED] = {
467 .create = mpol_new_preferred, 467 .create = mpol_new_preferred,
468 .rebind = mpol_rebind_preferred, 468 .rebind = mpol_rebind_preferred,
469 }, 469 },
470 [MPOL_BIND] = { 470 [MPOL_BIND] = {
471 .create = mpol_new_bind, 471 .create = mpol_new_bind,
472 .rebind = mpol_rebind_nodemask, 472 .rebind = mpol_rebind_nodemask,
473 }, 473 },
474 }; 474 };
475 475
476 static void migrate_page_add(struct page *page, struct list_head *pagelist, 476 static void migrate_page_add(struct page *page, struct list_head *pagelist,
477 unsigned long flags); 477 unsigned long flags);
478 478
479 /* 479 /*
480 * Scan through the pages, checking whether they meet the given conditions, 480 * Scan through the pages, checking whether they meet the given conditions,
481 * and move them to the pagelist if they do. 481 * and move them to the pagelist if they do.
482 */ 482 */
483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 483 static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
484 unsigned long addr, unsigned long end, 484 unsigned long addr, unsigned long end,
485 const nodemask_t *nodes, unsigned long flags, 485 const nodemask_t *nodes, unsigned long flags,
486 void *private) 486 void *private)
487 { 487 {
488 pte_t *orig_pte; 488 pte_t *orig_pte;
489 pte_t *pte; 489 pte_t *pte;
490 spinlock_t *ptl; 490 spinlock_t *ptl;
491 491
492 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 492 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
493 do { 493 do {
494 struct page *page; 494 struct page *page;
495 int nid; 495 int nid;
496 496
497 if (!pte_present(*pte)) 497 if (!pte_present(*pte))
498 continue; 498 continue;
499 page = vm_normal_page(vma, addr, *pte); 499 page = vm_normal_page(vma, addr, *pte);
500 if (!page) 500 if (!page)
501 continue; 501 continue;
502 /* 502 /*
503 * vm_normal_page() filters out zero pages, but there might 503 * vm_normal_page() filters out zero pages, but there might
504 * still be PageReserved pages to skip, perhaps in a VDSO. 504 * still be PageReserved pages to skip, perhaps in a VDSO.
505 */ 505 */
506 if (PageReserved(page)) 506 if (PageReserved(page))
507 continue; 507 continue;
508 nid = page_to_nid(page); 508 nid = page_to_nid(page);
509 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 509 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
510 continue; 510 continue;
511 511
512 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 512 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
513 migrate_page_add(page, private, flags); 513 migrate_page_add(page, private, flags);
514 else 514 else
515 break; 515 break;
516 } while (pte++, addr += PAGE_SIZE, addr != end); 516 } while (pte++, addr += PAGE_SIZE, addr != end);
517 pte_unmap_unlock(orig_pte, ptl); 517 pte_unmap_unlock(orig_pte, ptl);
518 return addr != end; 518 return addr != end;
519 } 519 }
520 520
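One subtlety in the node check above: do_mbind() further down passes MPOL_MF_INVERT while migrate_to_node() does not, so the single comparison serves both callers. Spelled out as an annotation (not new kernel code):

	/*
	 * node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)
	 *
	 * mbind path (MPOL_MF_INVERT set, *nodes = requested nodes):
	 *   page already on a requested node -> skip it
	 *   page on any other node           -> queue it for migration
	 * migrate_to_node path (flag clear, *nodes = the source node):
	 *   page on the source node          -> queue it for migration
	 *   page on any other node           -> skip it
	 */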
521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, 521 static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, 522 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 void *private) 523 void *private)
524 { 524 {
525 #ifdef CONFIG_HUGETLB_PAGE 525 #ifdef CONFIG_HUGETLB_PAGE
526 int nid; 526 int nid;
527 struct page *page; 527 struct page *page;
528 spinlock_t *ptl; 528 spinlock_t *ptl;
529 pte_t entry;
529 530
530 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); 531 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
531 page = pte_page(huge_ptep_get((pte_t *)pmd)); 532 entry = huge_ptep_get((pte_t *)pmd);
533 if (!pte_present(entry))
534 goto unlock;
535 page = pte_page(entry);
532 nid = page_to_nid(page); 536 nid = page_to_nid(page);
533 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 537 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
534 goto unlock; 538 goto unlock;
535 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ 539 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
536 if (flags & (MPOL_MF_MOVE_ALL) || 540 if (flags & (MPOL_MF_MOVE_ALL) ||
537 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) 541 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
538 isolate_huge_page(page, private); 542 isolate_huge_page(page, private);
539 unlock: 543 unlock:
540 spin_unlock(ptl); 544 spin_unlock(ptl);
541 #else 545 #else
542 BUG(); 546 BUG();
543 #endif 547 #endif
544 } 548 }
545 549
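The hunk above is the substance of this fix on the mempolicy side: when racing with hugepage migration the hugetlb entry can be non-present (for example a migration entry), and handing it to pte_page() produces an unpredictable result, as the commit message notes. Condensed from the new lines, the guarded pattern a hugetlb callback should follow is:

	ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
	entry = huge_ptep_get((pte_t *)pmd);
	if (!pte_present(entry))	/* e.g. under migration: just skip it */
		goto unlock;
	page = pte_page(entry);		/* safe only for a present entry */
	/* ... per-page work ... */
unlock:
	spin_unlock(ptl);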
546 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, 550 static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
547 unsigned long addr, unsigned long end, 551 unsigned long addr, unsigned long end,
548 const nodemask_t *nodes, unsigned long flags, 552 const nodemask_t *nodes, unsigned long flags,
549 void *private) 553 void *private)
550 { 554 {
551 pmd_t *pmd; 555 pmd_t *pmd;
552 unsigned long next; 556 unsigned long next;
553 557
554 pmd = pmd_offset(pud, addr); 558 pmd = pmd_offset(pud, addr);
555 do { 559 do {
556 next = pmd_addr_end(addr, end); 560 next = pmd_addr_end(addr, end);
557 if (!pmd_present(*pmd)) 561 if (!pmd_present(*pmd))
558 continue; 562 continue;
559 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { 563 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
560 queue_pages_hugetlb_pmd_range(vma, pmd, nodes, 564 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
561 flags, private); 565 flags, private);
562 continue; 566 continue;
563 } 567 }
564 split_huge_page_pmd(vma, addr, pmd); 568 split_huge_page_pmd(vma, addr, pmd);
565 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 569 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
566 continue; 570 continue;
567 if (queue_pages_pte_range(vma, pmd, addr, next, nodes, 571 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
568 flags, private)) 572 flags, private))
569 return -EIO; 573 return -EIO;
570 } while (pmd++, addr = next, addr != end); 574 } while (pmd++, addr = next, addr != end);
571 return 0; 575 return 0;
572 } 576 }
573 577
574 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 578 static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
575 unsigned long addr, unsigned long end, 579 unsigned long addr, unsigned long end,
576 const nodemask_t *nodes, unsigned long flags, 580 const nodemask_t *nodes, unsigned long flags,
577 void *private) 581 void *private)
578 { 582 {
579 pud_t *pud; 583 pud_t *pud;
580 unsigned long next; 584 unsigned long next;
581 585
582 pud = pud_offset(pgd, addr); 586 pud = pud_offset(pgd, addr);
583 do { 587 do {
584 next = pud_addr_end(addr, end); 588 next = pud_addr_end(addr, end);
585 if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) 589 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
586 continue; 590 continue;
587 if (pud_none_or_clear_bad(pud)) 591 if (pud_none_or_clear_bad(pud))
588 continue; 592 continue;
589 if (queue_pages_pmd_range(vma, pud, addr, next, nodes, 593 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
590 flags, private)) 594 flags, private))
591 return -EIO; 595 return -EIO;
592 } while (pud++, addr = next, addr != end); 596 } while (pud++, addr = next, addr != end);
593 return 0; 597 return 0;
594 } 598 }
595 599
596 static inline int queue_pages_pgd_range(struct vm_area_struct *vma, 600 static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
597 unsigned long addr, unsigned long end, 601 unsigned long addr, unsigned long end,
598 const nodemask_t *nodes, unsigned long flags, 602 const nodemask_t *nodes, unsigned long flags,
599 void *private) 603 void *private)
600 { 604 {
601 pgd_t *pgd; 605 pgd_t *pgd;
602 unsigned long next; 606 unsigned long next;
603 607
604 pgd = pgd_offset(vma->vm_mm, addr); 608 pgd = pgd_offset(vma->vm_mm, addr);
605 do { 609 do {
606 next = pgd_addr_end(addr, end); 610 next = pgd_addr_end(addr, end);
607 if (pgd_none_or_clear_bad(pgd)) 611 if (pgd_none_or_clear_bad(pgd))
608 continue; 612 continue;
609 if (queue_pages_pud_range(vma, pgd, addr, next, nodes, 613 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
610 flags, private)) 614 flags, private))
611 return -EIO; 615 return -EIO;
612 } while (pgd++, addr = next, addr != end); 616 } while (pgd++, addr = next, addr != end);
613 return 0; 617 return 0;
614 } 618 }
615 619
616 #ifdef CONFIG_NUMA_BALANCING 620 #ifdef CONFIG_NUMA_BALANCING
617 /* 621 /*
618 * This is used to mark a range of virtual addresses as inaccessible. 622 * This is used to mark a range of virtual addresses as inaccessible.
619 * These are later cleared by a NUMA hinting fault. Depending on these 623 * These are later cleared by a NUMA hinting fault. Depending on these
620 * faults, pages may be migrated for better NUMA placement. 624 * faults, pages may be migrated for better NUMA placement.
621 * 625 *
622 * This is assuming that NUMA faults are handled using PROT_NONE. If 626 * This is assuming that NUMA faults are handled using PROT_NONE. If
623 * an architecture makes a different choice, it will need further 627 * an architecture makes a different choice, it will need further
624 * changes to the core. 628 * changes to the core.
625 */ 629 */
626 unsigned long change_prot_numa(struct vm_area_struct *vma, 630 unsigned long change_prot_numa(struct vm_area_struct *vma,
627 unsigned long addr, unsigned long end) 631 unsigned long addr, unsigned long end)
628 { 632 {
629 int nr_updated; 633 int nr_updated;
630 634
631 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 635 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
632 if (nr_updated) 636 if (nr_updated)
633 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 637 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
634 638
635 return nr_updated; 639 return nr_updated;
636 } 640 }
637 #else 641 #else
638 static unsigned long change_prot_numa(struct vm_area_struct *vma, 642 static unsigned long change_prot_numa(struct vm_area_struct *vma,
639 unsigned long addr, unsigned long end) 643 unsigned long addr, unsigned long end)
640 { 644 {
641 return 0; 645 return 0;
642 } 646 }
643 #endif /* CONFIG_NUMA_BALANCING */ 647 #endif /* CONFIG_NUMA_BALANCING */
644 648
645 /* 649 /*
646 * Walk through page tables and collect pages to be migrated. 650 * Walk through page tables and collect pages to be migrated.
647 * 651 *
648 * If the pages found in a given range are on a set of nodes (determined 652 * If the pages found in a given range are on a set of nodes (determined
649 * by @nodes and @flags), they are isolated and queued to the pagelist 653 * by @nodes and @flags), they are isolated and queued to the pagelist
650 * passed via @private. 654 * passed via @private.
651 */ 655 */
652 static struct vm_area_struct * 656 static struct vm_area_struct *
653 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 657 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
654 const nodemask_t *nodes, unsigned long flags, void *private) 658 const nodemask_t *nodes, unsigned long flags, void *private)
655 { 659 {
656 int err; 660 int err;
657 struct vm_area_struct *first, *vma, *prev; 661 struct vm_area_struct *first, *vma, *prev;
658 662
659 663
660 first = find_vma(mm, start); 664 first = find_vma(mm, start);
661 if (!first) 665 if (!first)
662 return ERR_PTR(-EFAULT); 666 return ERR_PTR(-EFAULT);
663 prev = NULL; 667 prev = NULL;
664 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 668 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
665 unsigned long endvma = vma->vm_end; 669 unsigned long endvma = vma->vm_end;
666 670
667 if (endvma > end) 671 if (endvma > end)
668 endvma = end; 672 endvma = end;
669 if (vma->vm_start > start) 673 if (vma->vm_start > start)
670 start = vma->vm_start; 674 start = vma->vm_start;
671 675
672 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 676 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
673 if (!vma->vm_next && vma->vm_end < end) 677 if (!vma->vm_next && vma->vm_end < end)
674 return ERR_PTR(-EFAULT); 678 return ERR_PTR(-EFAULT);
675 if (prev && prev->vm_end < vma->vm_start) 679 if (prev && prev->vm_end < vma->vm_start)
676 return ERR_PTR(-EFAULT); 680 return ERR_PTR(-EFAULT);
677 } 681 }
678 682
679 if (flags & MPOL_MF_LAZY) { 683 if (flags & MPOL_MF_LAZY) {
680 change_prot_numa(vma, start, endvma); 684 change_prot_numa(vma, start, endvma);
681 goto next; 685 goto next;
682 } 686 }
683 687
684 if ((flags & MPOL_MF_STRICT) || 688 if ((flags & MPOL_MF_STRICT) ||
685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 689 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 vma_migratable(vma))) { 690 vma_migratable(vma))) {
687 691
688 err = queue_pages_pgd_range(vma, start, endvma, nodes, 692 err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 flags, private); 693 flags, private);
690 if (err) { 694 if (err) {
691 first = ERR_PTR(err); 695 first = ERR_PTR(err);
692 break; 696 break;
693 } 697 }
694 } 698 }
695 next: 699 next:
696 prev = vma; 700 prev = vma;
697 } 701 }
698 return first; 702 return first;
699 } 703 }
700 704
701 /* 705 /*
702 * Apply policy to a single VMA 706 * Apply policy to a single VMA
703 * This must be called with the mmap_sem held for writing. 707 * This must be called with the mmap_sem held for writing.
704 */ 708 */
705 static int vma_replace_policy(struct vm_area_struct *vma, 709 static int vma_replace_policy(struct vm_area_struct *vma,
706 struct mempolicy *pol) 710 struct mempolicy *pol)
707 { 711 {
708 int err; 712 int err;
709 struct mempolicy *old; 713 struct mempolicy *old;
710 struct mempolicy *new; 714 struct mempolicy *new;
711 715
712 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", 716 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
713 vma->vm_start, vma->vm_end, vma->vm_pgoff, 717 vma->vm_start, vma->vm_end, vma->vm_pgoff,
714 vma->vm_ops, vma->vm_file, 718 vma->vm_ops, vma->vm_file,
715 vma->vm_ops ? vma->vm_ops->set_policy : NULL); 719 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
716 720
717 new = mpol_dup(pol); 721 new = mpol_dup(pol);
718 if (IS_ERR(new)) 722 if (IS_ERR(new))
719 return PTR_ERR(new); 723 return PTR_ERR(new);
720 724
721 if (vma->vm_ops && vma->vm_ops->set_policy) { 725 if (vma->vm_ops && vma->vm_ops->set_policy) {
722 err = vma->vm_ops->set_policy(vma, new); 726 err = vma->vm_ops->set_policy(vma, new);
723 if (err) 727 if (err)
724 goto err_out; 728 goto err_out;
725 } 729 }
726 730
727 old = vma->vm_policy; 731 old = vma->vm_policy;
728 vma->vm_policy = new; /* protected by mmap_sem */ 732 vma->vm_policy = new; /* protected by mmap_sem */
729 mpol_put(old); 733 mpol_put(old);
730 734
731 return 0; 735 return 0;
732 err_out: 736 err_out:
733 mpol_put(new); 737 mpol_put(new);
734 return err; 738 return err;
735 } 739 }
736 740
737 /* Step 2: apply policy to a range and do splits. */ 741 /* Step 2: apply policy to a range and do splits. */
738 static int mbind_range(struct mm_struct *mm, unsigned long start, 742 static int mbind_range(struct mm_struct *mm, unsigned long start,
739 unsigned long end, struct mempolicy *new_pol) 743 unsigned long end, struct mempolicy *new_pol)
740 { 744 {
741 struct vm_area_struct *next; 745 struct vm_area_struct *next;
742 struct vm_area_struct *prev; 746 struct vm_area_struct *prev;
743 struct vm_area_struct *vma; 747 struct vm_area_struct *vma;
744 int err = 0; 748 int err = 0;
745 pgoff_t pgoff; 749 pgoff_t pgoff;
746 unsigned long vmstart; 750 unsigned long vmstart;
747 unsigned long vmend; 751 unsigned long vmend;
748 752
749 vma = find_vma(mm, start); 753 vma = find_vma(mm, start);
750 if (!vma || vma->vm_start > start) 754 if (!vma || vma->vm_start > start)
751 return -EFAULT; 755 return -EFAULT;
752 756
753 prev = vma->vm_prev; 757 prev = vma->vm_prev;
754 if (start > vma->vm_start) 758 if (start > vma->vm_start)
755 prev = vma; 759 prev = vma;
756 760
757 for (; vma && vma->vm_start < end; prev = vma, vma = next) { 761 for (; vma && vma->vm_start < end; prev = vma, vma = next) {
758 next = vma->vm_next; 762 next = vma->vm_next;
759 vmstart = max(start, vma->vm_start); 763 vmstart = max(start, vma->vm_start);
760 vmend = min(end, vma->vm_end); 764 vmend = min(end, vma->vm_end);
761 765
762 if (mpol_equal(vma_policy(vma), new_pol)) 766 if (mpol_equal(vma_policy(vma), new_pol))
763 continue; 767 continue;
764 768
765 pgoff = vma->vm_pgoff + 769 pgoff = vma->vm_pgoff +
766 ((vmstart - vma->vm_start) >> PAGE_SHIFT); 770 ((vmstart - vma->vm_start) >> PAGE_SHIFT);
767 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 771 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
768 vma->anon_vma, vma->vm_file, pgoff, 772 vma->anon_vma, vma->vm_file, pgoff,
769 new_pol); 773 new_pol);
770 if (prev) { 774 if (prev) {
771 vma = prev; 775 vma = prev;
772 next = vma->vm_next; 776 next = vma->vm_next;
773 if (mpol_equal(vma_policy(vma), new_pol)) 777 if (mpol_equal(vma_policy(vma), new_pol))
774 continue; 778 continue;
775 /* vma_merge() joined vma && vma->next, case 8 */ 779 /* vma_merge() joined vma && vma->next, case 8 */
776 goto replace; 780 goto replace;
777 } 781 }
778 if (vma->vm_start != vmstart) { 782 if (vma->vm_start != vmstart) {
779 err = split_vma(vma->vm_mm, vma, vmstart, 1); 783 err = split_vma(vma->vm_mm, vma, vmstart, 1);
780 if (err) 784 if (err)
781 goto out; 785 goto out;
782 } 786 }
783 if (vma->vm_end != vmend) { 787 if (vma->vm_end != vmend) {
784 err = split_vma(vma->vm_mm, vma, vmend, 0); 788 err = split_vma(vma->vm_mm, vma, vmend, 0);
785 if (err) 789 if (err)
786 goto out; 790 goto out;
787 } 791 }
788 replace: 792 replace:
789 err = vma_replace_policy(vma, new_pol); 793 err = vma_replace_policy(vma, new_pol);
790 if (err) 794 if (err)
791 goto out; 795 goto out;
792 } 796 }
793 797
794 out: 798 out:
795 return err; 799 return err;
796 } 800 }
797 801
798 /* Set the process memory policy */ 802 /* Set the process memory policy */
799 static long do_set_mempolicy(unsigned short mode, unsigned short flags, 803 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
800 nodemask_t *nodes) 804 nodemask_t *nodes)
801 { 805 {
802 struct mempolicy *new, *old; 806 struct mempolicy *new, *old;
803 struct mm_struct *mm = current->mm; 807 struct mm_struct *mm = current->mm;
804 NODEMASK_SCRATCH(scratch); 808 NODEMASK_SCRATCH(scratch);
805 int ret; 809 int ret;
806 810
807 if (!scratch) 811 if (!scratch)
808 return -ENOMEM; 812 return -ENOMEM;
809 813
810 new = mpol_new(mode, flags, nodes); 814 new = mpol_new(mode, flags, nodes);
811 if (IS_ERR(new)) { 815 if (IS_ERR(new)) {
812 ret = PTR_ERR(new); 816 ret = PTR_ERR(new);
813 goto out; 817 goto out;
814 } 818 }
815 /* 819 /*
816 * prevent changing our mempolicy while show_numa_maps() 820 * prevent changing our mempolicy while show_numa_maps()
817 * is using it. 821 * is using it.
818 * Note: do_set_mempolicy() can be called at init time 822 * Note: do_set_mempolicy() can be called at init time
819 * with no 'mm'. 823 * with no 'mm'.
820 */ 824 */
821 if (mm) 825 if (mm)
822 down_write(&mm->mmap_sem); 826 down_write(&mm->mmap_sem);
823 task_lock(current); 827 task_lock(current);
824 ret = mpol_set_nodemask(new, nodes, scratch); 828 ret = mpol_set_nodemask(new, nodes, scratch);
825 if (ret) { 829 if (ret) {
826 task_unlock(current); 830 task_unlock(current);
827 if (mm) 831 if (mm)
828 up_write(&mm->mmap_sem); 832 up_write(&mm->mmap_sem);
829 mpol_put(new); 833 mpol_put(new);
830 goto out; 834 goto out;
831 } 835 }
832 old = current->mempolicy; 836 old = current->mempolicy;
833 current->mempolicy = new; 837 current->mempolicy = new;
834 if (new && new->mode == MPOL_INTERLEAVE && 838 if (new && new->mode == MPOL_INTERLEAVE &&
835 nodes_weight(new->v.nodes)) 839 nodes_weight(new->v.nodes))
836 current->il_next = first_node(new->v.nodes); 840 current->il_next = first_node(new->v.nodes);
837 task_unlock(current); 841 task_unlock(current);
838 if (mm) 842 if (mm)
839 up_write(&mm->mmap_sem); 843 up_write(&mm->mmap_sem);
840 844
841 mpol_put(old); 845 mpol_put(old);
842 ret = 0; 846 ret = 0;
843 out: 847 out:
844 NODEMASK_SCRATCH_FREE(scratch); 848 NODEMASK_SCRATCH_FREE(scratch);
845 return ret; 849 return ret;
846 } 850 }
847 851
848 /* 852 /*
849 * Return nodemask for policy for get_mempolicy() query 853 * Return nodemask for policy for get_mempolicy() query
850 * 854 *
851 * Called with task's alloc_lock held 855 * Called with task's alloc_lock held
852 */ 856 */
853 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) 857 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
854 { 858 {
855 nodes_clear(*nodes); 859 nodes_clear(*nodes);
856 if (p == &default_policy) 860 if (p == &default_policy)
857 return; 861 return;
858 862
859 switch (p->mode) { 863 switch (p->mode) {
860 case MPOL_BIND: 864 case MPOL_BIND:
861 /* Fall through */ 865 /* Fall through */
862 case MPOL_INTERLEAVE: 866 case MPOL_INTERLEAVE:
863 *nodes = p->v.nodes; 867 *nodes = p->v.nodes;
864 break; 868 break;
865 case MPOL_PREFERRED: 869 case MPOL_PREFERRED:
866 if (!(p->flags & MPOL_F_LOCAL)) 870 if (!(p->flags & MPOL_F_LOCAL))
867 node_set(p->v.preferred_node, *nodes); 871 node_set(p->v.preferred_node, *nodes);
868 /* else return empty node mask for local allocation */ 872 /* else return empty node mask for local allocation */
869 break; 873 break;
870 default: 874 default:
871 BUG(); 875 BUG();
872 } 876 }
873 } 877 }
874 878
875 static int lookup_node(struct mm_struct *mm, unsigned long addr) 879 static int lookup_node(struct mm_struct *mm, unsigned long addr)
876 { 880 {
877 struct page *p; 881 struct page *p;
878 int err; 882 int err;
879 883
880 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); 884 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
881 if (err >= 0) { 885 if (err >= 0) {
882 err = page_to_nid(p); 886 err = page_to_nid(p);
883 put_page(p); 887 put_page(p);
884 } 888 }
885 return err; 889 return err;
886 } 890 }
887 891
888 /* Retrieve NUMA policy */ 892 /* Retrieve NUMA policy */
889 static long do_get_mempolicy(int *policy, nodemask_t *nmask, 893 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
890 unsigned long addr, unsigned long flags) 894 unsigned long addr, unsigned long flags)
891 { 895 {
892 int err; 896 int err;
893 struct mm_struct *mm = current->mm; 897 struct mm_struct *mm = current->mm;
894 struct vm_area_struct *vma = NULL; 898 struct vm_area_struct *vma = NULL;
895 struct mempolicy *pol = current->mempolicy; 899 struct mempolicy *pol = current->mempolicy;
896 900
897 if (flags & 901 if (flags &
898 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) 902 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
899 return -EINVAL; 903 return -EINVAL;
900 904
901 if (flags & MPOL_F_MEMS_ALLOWED) { 905 if (flags & MPOL_F_MEMS_ALLOWED) {
902 if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) 906 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
903 return -EINVAL; 907 return -EINVAL;
904 *policy = 0; /* just so it's initialized */ 908 *policy = 0; /* just so it's initialized */
905 task_lock(current); 909 task_lock(current);
906 *nmask = cpuset_current_mems_allowed; 910 *nmask = cpuset_current_mems_allowed;
907 task_unlock(current); 911 task_unlock(current);
908 return 0; 912 return 0;
909 } 913 }
910 914
911 if (flags & MPOL_F_ADDR) { 915 if (flags & MPOL_F_ADDR) {
912 /* 916 /*
913 * Do NOT fall back to task policy if the 917 * Do NOT fall back to task policy if the
914 * vma/shared policy at addr is NULL. We 918 * vma/shared policy at addr is NULL. We
915 * want to return MPOL_DEFAULT in this case. 919 * want to return MPOL_DEFAULT in this case.
916 */ 920 */
917 down_read(&mm->mmap_sem); 921 down_read(&mm->mmap_sem);
918 vma = find_vma_intersection(mm, addr, addr+1); 922 vma = find_vma_intersection(mm, addr, addr+1);
919 if (!vma) { 923 if (!vma) {
920 up_read(&mm->mmap_sem); 924 up_read(&mm->mmap_sem);
921 return -EFAULT; 925 return -EFAULT;
922 } 926 }
923 if (vma->vm_ops && vma->vm_ops->get_policy) 927 if (vma->vm_ops && vma->vm_ops->get_policy)
924 pol = vma->vm_ops->get_policy(vma, addr); 928 pol = vma->vm_ops->get_policy(vma, addr);
925 else 929 else
926 pol = vma->vm_policy; 930 pol = vma->vm_policy;
927 } else if (addr) 931 } else if (addr)
928 return -EINVAL; 932 return -EINVAL;
929 933
930 if (!pol) 934 if (!pol)
931 pol = &default_policy; /* indicates default behavior */ 935 pol = &default_policy; /* indicates default behavior */
932 936
933 if (flags & MPOL_F_NODE) { 937 if (flags & MPOL_F_NODE) {
934 if (flags & MPOL_F_ADDR) { 938 if (flags & MPOL_F_ADDR) {
935 err = lookup_node(mm, addr); 939 err = lookup_node(mm, addr);
936 if (err < 0) 940 if (err < 0)
937 goto out; 941 goto out;
938 *policy = err; 942 *policy = err;
939 } else if (pol == current->mempolicy && 943 } else if (pol == current->mempolicy &&
940 pol->mode == MPOL_INTERLEAVE) { 944 pol->mode == MPOL_INTERLEAVE) {
941 *policy = current->il_next; 945 *policy = current->il_next;
942 } else { 946 } else {
943 err = -EINVAL; 947 err = -EINVAL;
944 goto out; 948 goto out;
945 } 949 }
946 } else { 950 } else {
947 *policy = pol == &default_policy ? MPOL_DEFAULT : 951 *policy = pol == &default_policy ? MPOL_DEFAULT :
948 pol->mode; 952 pol->mode;
949 /* 953 /*
950 * Internal mempolicy flags must be masked off before exposing 954 * Internal mempolicy flags must be masked off before exposing
951 * the policy to userspace. 955 * the policy to userspace.
952 */ 956 */
953 *policy |= (pol->flags & MPOL_MODE_FLAGS); 957 *policy |= (pol->flags & MPOL_MODE_FLAGS);
954 } 958 }
955 959
956 if (vma) { 960 if (vma) {
957 up_read(&current->mm->mmap_sem); 961 up_read(&current->mm->mmap_sem);
958 vma = NULL; 962 vma = NULL;
959 } 963 }
960 964
961 err = 0; 965 err = 0;
962 if (nmask) { 966 if (nmask) {
963 if (mpol_store_user_nodemask(pol)) { 967 if (mpol_store_user_nodemask(pol)) {
964 *nmask = pol->w.user_nodemask; 968 *nmask = pol->w.user_nodemask;
965 } else { 969 } else {
966 task_lock(current); 970 task_lock(current);
967 get_policy_nodemask(pol, nmask); 971 get_policy_nodemask(pol, nmask);
968 task_unlock(current); 972 task_unlock(current);
969 } 973 }
970 } 974 }
971 975
972 out: 976 out:
973 mpol_cond_put(pol); 977 mpol_cond_put(pol);
974 if (vma) 978 if (vma)
975 up_read(&current->mm->mmap_sem); 979 up_read(&current->mm->mmap_sem);
976 return err; 980 return err;
977 } 981 }
978 982
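do_get_mempolicy() above is what backs the get_mempolicy(2) system call. A minimal user-space sketch (using the libnuma <numaif.h> wrappers, link with -lnuma) that asks which node backs a freshly touched page:

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int node = -1;
	char *p = malloc(4096);

	if (!p)
		return 1;
	p[0] = 1;	/* touch the page so it is actually allocated */

	/* MPOL_F_NODE | MPOL_F_ADDR: return the node holding 'addr' */
	if (get_mempolicy(&node, NULL, 0, p, MPOL_F_NODE | MPOL_F_ADDR))
		perror("get_mempolicy");
	else
		printf("page at %p sits on node %d\n", (void *)p, node);

	free(p);
	return 0;
}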
979 #ifdef CONFIG_MIGRATION 983 #ifdef CONFIG_MIGRATION
980 /* 984 /*
981 * page migration 985 * page migration
982 */ 986 */
983 static void migrate_page_add(struct page *page, struct list_head *pagelist, 987 static void migrate_page_add(struct page *page, struct list_head *pagelist,
984 unsigned long flags) 988 unsigned long flags)
985 { 989 {
986 /* 990 /*
987 * Avoid migrating a page that is shared with others. 991 * Avoid migrating a page that is shared with others.
988 */ 992 */
989 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 993 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
990 if (!isolate_lru_page(page)) { 994 if (!isolate_lru_page(page)) {
991 list_add_tail(&page->lru, pagelist); 995 list_add_tail(&page->lru, pagelist);
992 inc_zone_page_state(page, NR_ISOLATED_ANON + 996 inc_zone_page_state(page, NR_ISOLATED_ANON +
993 page_is_file_cache(page)); 997 page_is_file_cache(page));
994 } 998 }
995 } 999 }
996 } 1000 }
997 1001
998 static struct page *new_node_page(struct page *page, unsigned long node, int **x) 1002 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
999 { 1003 {
1000 if (PageHuge(page)) 1004 if (PageHuge(page))
1001 return alloc_huge_page_node(page_hstate(compound_head(page)), 1005 return alloc_huge_page_node(page_hstate(compound_head(page)),
1002 node); 1006 node);
1003 else 1007 else
1004 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 1008 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
1005 } 1009 }
1006 1010
1007 /* 1011 /*
1008 * Migrate pages from one node to a target node. 1012 * Migrate pages from one node to a target node.
1009 * Returns error or the number of pages not migrated. 1013 * Returns error or the number of pages not migrated.
1010 */ 1014 */
1011 static int migrate_to_node(struct mm_struct *mm, int source, int dest, 1015 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1012 int flags) 1016 int flags)
1013 { 1017 {
1014 nodemask_t nmask; 1018 nodemask_t nmask;
1015 LIST_HEAD(pagelist); 1019 LIST_HEAD(pagelist);
1016 int err = 0; 1020 int err = 0;
1017 1021
1018 nodes_clear(nmask); 1022 nodes_clear(nmask);
1019 node_set(source, nmask); 1023 node_set(source, nmask);
1020 1024
1021 /* 1025 /*
1022 * This does not "check" the range but isolates all pages that 1026 * This does not "check" the range but isolates all pages that
1023 * need migration. Between passing in the full user address 1027 * need migration. Between passing in the full user address
1024 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. 1028 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1025 */ 1029 */
1026 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1030 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1027 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 1031 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1028 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1032 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1029 1033
1030 if (!list_empty(&pagelist)) { 1034 if (!list_empty(&pagelist)) {
1031 err = migrate_pages(&pagelist, new_node_page, dest, 1035 err = migrate_pages(&pagelist, new_node_page, dest,
1032 MIGRATE_SYNC, MR_SYSCALL); 1036 MIGRATE_SYNC, MR_SYSCALL);
1033 if (err) 1037 if (err)
1034 putback_movable_pages(&pagelist); 1038 putback_movable_pages(&pagelist);
1035 } 1039 }
1036 1040
1037 return err; 1041 return err;
1038 } 1042 }
1039 1043
1040 /* 1044 /*
1041 * Move pages between the two nodesets so as to preserve the physical 1045 * Move pages between the two nodesets so as to preserve the physical
1042 * layout as much as possible. 1046 * layout as much as possible.
1043 * 1047 *
1044 * Returns the number of pages that could not be moved. 1048 * Returns the number of pages that could not be moved.
1045 */ 1049 */
1046 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1050 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1047 const nodemask_t *to, int flags) 1051 const nodemask_t *to, int flags)
1048 { 1052 {
1049 int busy = 0; 1053 int busy = 0;
1050 int err; 1054 int err;
1051 nodemask_t tmp; 1055 nodemask_t tmp;
1052 1056
1053 err = migrate_prep(); 1057 err = migrate_prep();
1054 if (err) 1058 if (err)
1055 return err; 1059 return err;
1056 1060
1057 down_read(&mm->mmap_sem); 1061 down_read(&mm->mmap_sem);
1058 1062
1059 err = migrate_vmas(mm, from, to, flags); 1063 err = migrate_vmas(mm, from, to, flags);
1060 if (err) 1064 if (err)
1061 goto out; 1065 goto out;
1062 1066
1063 /* 1067 /*
1064 * Find a 'source' bit set in 'tmp' whose corresponding 'dest' 1068 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1065 * bit in 'to' is not also set in 'tmp'. Clear the found 'source' 1069 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
1066 * bit in 'tmp', and return that <source, dest> pair for migration. 1070 * bit in 'tmp', and return that <source, dest> pair for migration.
1067 * The pair of nodemasks 'to' and 'from' define the map. 1071 * The pair of nodemasks 'to' and 'from' define the map.
1068 * 1072 *
1069 * If no pair of bits is found that way, fallback to picking some 1073 * If no pair of bits is found that way, fallback to picking some
1070 * pair of 'source' and 'dest' bits that are not the same. If the 1074 * pair of 'source' and 'dest' bits that are not the same. If the
1071 * 'source' and 'dest' bits are the same, this represents a node 1075 * 'source' and 'dest' bits are the same, this represents a node
1072 * that will be migrating to itself, so no pages need move. 1076 * that will be migrating to itself, so no pages need move.
1073 * 1077 *
1074 * If no bits are left in 'tmp', or if all remaining bits left 1078 * If no bits are left in 'tmp', or if all remaining bits left
1075 * in 'tmp' correspond to the same bit in 'to', return false 1079 * in 'tmp' correspond to the same bit in 'to', return false
1076 * (nothing left to migrate). 1080 * (nothing left to migrate).
1077 * 1081 *
1078 * This lets us pick a pair of nodes to migrate between, such that 1082 * This lets us pick a pair of nodes to migrate between, such that
1079 * if possible the dest node is not already occupied by some other 1083 * if possible the dest node is not already occupied by some other
1080 * source node, minimizing the risk of overloading the memory on a 1084 * source node, minimizing the risk of overloading the memory on a
1081 * node that would happen if we migrated incoming memory to a node 1085 * node that would happen if we migrated incoming memory to a node
1082 * before migrating outgoing memory sourced from that same node. 1086 * before migrating outgoing memory sourced from that same node.
1083 * 1087 *
1084 * A single scan of tmp is sufficient. As we go, we remember the 1088 * A single scan of tmp is sufficient. As we go, we remember the
1085 * most recent <s, d> pair that moved (s != d). If we find a pair 1089 * most recent <s, d> pair that moved (s != d). If we find a pair
1086 * that not only moved, but what's better, moved to an empty slot 1090 * that not only moved, but what's better, moved to an empty slot
1087 * (d is not set in tmp), then we break out then, with that pair. 1091 * (d is not set in tmp), then we break out then, with that pair.
1088 * Otherwise when we finish scanning from_tmp, we at least have the 1092 * Otherwise when we finish scanning from_tmp, we at least have the
1089 * most recent <s, d> pair that moved. If we get all the way through 1093 * most recent <s, d> pair that moved. If we get all the way through
1090 * the scan of tmp without finding any node that moved, much less 1094 * the scan of tmp without finding any node that moved, much less
1091 * moved to an empty node, then there is nothing left worth migrating. 1095 * moved to an empty node, then there is nothing left worth migrating.
1092 */ 1096 */
1093 1097
1094 tmp = *from; 1098 tmp = *from;
1095 while (!nodes_empty(tmp)) { 1099 while (!nodes_empty(tmp)) {
1096 int s,d; 1100 int s,d;
1097 int source = NUMA_NO_NODE; 1101 int source = NUMA_NO_NODE;
1098 int dest = 0; 1102 int dest = 0;
1099 1103
1100 for_each_node_mask(s, tmp) { 1104 for_each_node_mask(s, tmp) {
1101 1105
1102 /* 1106 /*
1103 * do_migrate_pages() tries to maintain the relative 1107 * do_migrate_pages() tries to maintain the relative
1104 * node relationship of the pages established between 1108 * node relationship of the pages established between
1105 * threads and memory areas. 1109 * threads and memory areas.
1106 * 1110 *
1107 * However, if the number of source nodes is not equal to 1111 * However, if the number of source nodes is not equal to
1108 * the number of destination nodes, we cannot preserve 1112 * the number of destination nodes, we cannot preserve
1109 * this node-relative relationship. In that case, skip 1113 * this node-relative relationship. In that case, skip
1110 * copying memory from a node that is in the destination 1114 * copying memory from a node that is in the destination
1111 * mask. 1115 * mask.
1112 * 1116 *
1113 * Example: [2,3,4] -> [3,4,5] moves everything. 1117 * Example: [2,3,4] -> [3,4,5] moves everything.
1114 * [0-7] -> [3,4,5] moves only 0,1,2,6,7. 1118 * [0-7] -> [3,4,5] moves only 0,1,2,6,7.
1115 */ 1119 */
1116 1120
1117 if ((nodes_weight(*from) != nodes_weight(*to)) && 1121 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1118 (node_isset(s, *to))) 1122 (node_isset(s, *to)))
1119 continue; 1123 continue;
1120 1124
1121 d = node_remap(s, *from, *to); 1125 d = node_remap(s, *from, *to);
1122 if (s == d) 1126 if (s == d)
1123 continue; 1127 continue;
1124 1128
1125 source = s; /* Node moved. Memorize */ 1129 source = s; /* Node moved. Memorize */
1126 dest = d; 1130 dest = d;
1127 1131
1128 /* dest not in remaining from nodes? */ 1132 /* dest not in remaining from nodes? */
1129 if (!node_isset(dest, tmp)) 1133 if (!node_isset(dest, tmp))
1130 break; 1134 break;
1131 } 1135 }
1132 if (source == NUMA_NO_NODE) 1136 if (source == NUMA_NO_NODE)
1133 break; 1137 break;
1134 1138
1135 node_clear(source, tmp); 1139 node_clear(source, tmp);
1136 err = migrate_to_node(mm, source, dest, flags); 1140 err = migrate_to_node(mm, source, dest, flags);
1137 if (err > 0) 1141 if (err > 0)
1138 busy += err; 1142 busy += err;
1139 if (err < 0) 1143 if (err < 0)
1140 break; 1144 break;
1141 } 1145 }
1142 out: 1146 out:
1143 up_read(&mm->mmap_sem); 1147 up_read(&mm->mmap_sem);
1144 if (err < 0) 1148 if (err < 0)
1145 return err; 1149 return err;
1146 return busy; 1150 return busy;
1147 1151
1148 } 1152 }
1149 1153
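The pair-selection rule in the comment above is easiest to see with numbers. A small stand-alone sketch (plain bitmasks instead of nodemask_t, nothing kernel-specific) that reproduces the documented example [0-7] -> [3,4,5]:

#include <stdio.h>

int main(void)
{
	unsigned long from = 0xffUL;	/* source nodes 0-7        */
	unsigned long to   = 0x38UL;	/* destination nodes 3,4,5 */
	int wf = __builtin_popcountl(from);
	int wt = __builtin_popcountl(to);
	int s;

	for (s = 0; s < 8; s++) {
		if (!(from & (1UL << s)))
			continue;
		/* mirrors: weights differ && node_isset(s, *to) -> skip s */
		if (wf != wt && (to & (1UL << s)))
			continue;
		printf("node %d gets migrated\n", s);
	}
	return 0;	/* prints nodes 0, 1, 2, 6 and 7 */
}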
1150 /* 1154 /*
1151 * Allocate a new page for page migration based on vma policy. 1155 * Allocate a new page for page migration based on vma policy.
1152 * Start assuming that page is mapped by vma pointed to by @private. 1156 * Start assuming that page is mapped by vma pointed to by @private.
1153 * Search forward from there, if not. N.B., this assumes that the 1157 * Search forward from there, if not. N.B., this assumes that the
1154 * list of pages handed to migrate_pages()--which is how we get here-- 1158 * list of pages handed to migrate_pages()--which is how we get here--
1155 * is in virtual address order. 1159 * is in virtual address order.
1156 */ 1160 */
1157 static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1161 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1158 { 1162 {
1159 struct vm_area_struct *vma = (struct vm_area_struct *)private; 1163 struct vm_area_struct *vma = (struct vm_area_struct *)private;
1160 unsigned long uninitialized_var(address); 1164 unsigned long uninitialized_var(address);
1161 1165
1162 while (vma) { 1166 while (vma) {
1163 address = page_address_in_vma(page, vma); 1167 address = page_address_in_vma(page, vma);
1164 if (address != -EFAULT) 1168 if (address != -EFAULT)
1165 break; 1169 break;
1166 vma = vma->vm_next; 1170 vma = vma->vm_next;
1167 } 1171 }
1168 1172
1169 if (PageHuge(page)) { 1173 if (PageHuge(page)) {
1170 BUG_ON(!vma); 1174 BUG_ON(!vma);
1171 return alloc_huge_page_noerr(vma, address, 1); 1175 return alloc_huge_page_noerr(vma, address, 1);
1172 } 1176 }
1173 /* 1177 /*
1174 * if !vma, alloc_page_vma() will use task or system default policy 1178 * if !vma, alloc_page_vma() will use task or system default policy
1175 */ 1179 */
1176 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1180 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1177 } 1181 }
1178 #else 1182 #else
1179 1183
1180 static void migrate_page_add(struct page *page, struct list_head *pagelist, 1184 static void migrate_page_add(struct page *page, struct list_head *pagelist,
1181 unsigned long flags) 1185 unsigned long flags)
1182 { 1186 {
1183 } 1187 }
1184 1188
1185 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, 1189 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1186 const nodemask_t *to, int flags) 1190 const nodemask_t *to, int flags)
1187 { 1191 {
1188 return -ENOSYS; 1192 return -ENOSYS;
1189 } 1193 }
1190 1194
1191 static struct page *new_vma_page(struct page *page, unsigned long private, int **x) 1195 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
1192 { 1196 {
1193 return NULL; 1197 return NULL;
1194 } 1198 }
1195 #endif 1199 #endif
1196 1200
1197 static long do_mbind(unsigned long start, unsigned long len, 1201 static long do_mbind(unsigned long start, unsigned long len,
1198 unsigned short mode, unsigned short mode_flags, 1202 unsigned short mode, unsigned short mode_flags,
1199 nodemask_t *nmask, unsigned long flags) 1203 nodemask_t *nmask, unsigned long flags)
1200 { 1204 {
1201 struct vm_area_struct *vma; 1205 struct vm_area_struct *vma;
1202 struct mm_struct *mm = current->mm; 1206 struct mm_struct *mm = current->mm;
1203 struct mempolicy *new; 1207 struct mempolicy *new;
1204 unsigned long end; 1208 unsigned long end;
1205 int err; 1209 int err;
1206 LIST_HEAD(pagelist); 1210 LIST_HEAD(pagelist);
1207 1211
1208 if (flags & ~(unsigned long)MPOL_MF_VALID) 1212 if (flags & ~(unsigned long)MPOL_MF_VALID)
1209 return -EINVAL; 1213 return -EINVAL;
1210 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 1214 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1211 return -EPERM; 1215 return -EPERM;
1212 1216
1213 if (start & ~PAGE_MASK) 1217 if (start & ~PAGE_MASK)
1214 return -EINVAL; 1218 return -EINVAL;
1215 1219
1216 if (mode == MPOL_DEFAULT) 1220 if (mode == MPOL_DEFAULT)
1217 flags &= ~MPOL_MF_STRICT; 1221 flags &= ~MPOL_MF_STRICT;
1218 1222
1219 len = (len + PAGE_SIZE - 1) & PAGE_MASK; 1223 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1220 end = start + len; 1224 end = start + len;
1221 1225
1222 if (end < start) 1226 if (end < start)
1223 return -EINVAL; 1227 return -EINVAL;
1224 if (end == start) 1228 if (end == start)
1225 return 0; 1229 return 0;
1226 1230
1227 new = mpol_new(mode, mode_flags, nmask); 1231 new = mpol_new(mode, mode_flags, nmask);
1228 if (IS_ERR(new)) 1232 if (IS_ERR(new))
1229 return PTR_ERR(new); 1233 return PTR_ERR(new);
1230 1234
1231 if (flags & MPOL_MF_LAZY) 1235 if (flags & MPOL_MF_LAZY)
1232 new->flags |= MPOL_F_MOF; 1236 new->flags |= MPOL_F_MOF;
1233 1237
1234 /* 1238 /*
1235 * If we are using the default policy then operation 1239 * If we are using the default policy then operation
1236 * on discontinuous address spaces is okay after all 1240 * on discontinuous address spaces is okay after all
1237 */ 1241 */
1238 if (!new) 1242 if (!new)
1239 flags |= MPOL_MF_DISCONTIG_OK; 1243 flags |= MPOL_MF_DISCONTIG_OK;
1240 1244
1241 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", 1245 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1242 start, start + len, mode, mode_flags, 1246 start, start + len, mode, mode_flags,
1243 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); 1247 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1244 1248
1245 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 1249 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1246 1250
1247 err = migrate_prep(); 1251 err = migrate_prep();
1248 if (err) 1252 if (err)
1249 goto mpol_out; 1253 goto mpol_out;
1250 } 1254 }
1251 { 1255 {
1252 NODEMASK_SCRATCH(scratch); 1256 NODEMASK_SCRATCH(scratch);
1253 if (scratch) { 1257 if (scratch) {
1254 down_write(&mm->mmap_sem); 1258 down_write(&mm->mmap_sem);
1255 task_lock(current); 1259 task_lock(current);
1256 err = mpol_set_nodemask(new, nmask, scratch); 1260 err = mpol_set_nodemask(new, nmask, scratch);
1257 task_unlock(current); 1261 task_unlock(current);
1258 if (err) 1262 if (err)
1259 up_write(&mm->mmap_sem); 1263 up_write(&mm->mmap_sem);
1260 } else 1264 } else
1261 err = -ENOMEM; 1265 err = -ENOMEM;
1262 NODEMASK_SCRATCH_FREE(scratch); 1266 NODEMASK_SCRATCH_FREE(scratch);
1263 } 1267 }
1264 if (err) 1268 if (err)
1265 goto mpol_out; 1269 goto mpol_out;
1266 1270
1267 vma = queue_pages_range(mm, start, end, nmask, 1271 vma = queue_pages_range(mm, start, end, nmask,
1268 flags | MPOL_MF_INVERT, &pagelist); 1272 flags | MPOL_MF_INVERT, &pagelist);
1269 1273
1270 err = PTR_ERR(vma); /* maybe ... */ 1274 err = PTR_ERR(vma); /* maybe ... */
1271 if (!IS_ERR(vma)) 1275 if (!IS_ERR(vma))
1272 err = mbind_range(mm, start, end, new); 1276 err = mbind_range(mm, start, end, new);
1273 1277
1274 if (!err) { 1278 if (!err) {
1275 int nr_failed = 0; 1279 int nr_failed = 0;
1276 1280
1277 if (!list_empty(&pagelist)) { 1281 if (!list_empty(&pagelist)) {
1278 WARN_ON_ONCE(flags & MPOL_MF_LAZY); 1282 WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1279 nr_failed = migrate_pages(&pagelist, new_vma_page, 1283 nr_failed = migrate_pages(&pagelist, new_vma_page,
1280 (unsigned long)vma, 1284 (unsigned long)vma,
1281 MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1285 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1282 if (nr_failed) 1286 if (nr_failed)
1283 putback_movable_pages(&pagelist); 1287 putback_movable_pages(&pagelist);
1284 } 1288 }
1285 1289
1286 if (nr_failed && (flags & MPOL_MF_STRICT)) 1290 if (nr_failed && (flags & MPOL_MF_STRICT))
1287 err = -EIO; 1291 err = -EIO;
1288 } else 1292 } else
1289 putback_movable_pages(&pagelist); 1293 putback_movable_pages(&pagelist);
1290 1294
1291 up_write(&mm->mmap_sem); 1295 up_write(&mm->mmap_sem);
1292 mpol_out: 1296 mpol_out:
1293 mpol_put(new); 1297 mpol_put(new);
1294 return err; 1298 return err;
1295 } 1299 }
1296 1300
1297 /* 1301 /*
1298 * User space interface with variable sized bitmaps for nodelists. 1302 * User space interface with variable sized bitmaps for nodelists.
1299 */ 1303 */
1300 1304
1301 /* Copy a node mask from user space. */ 1305 /* Copy a node mask from user space. */
1302 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, 1306 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1303 unsigned long maxnode) 1307 unsigned long maxnode)
1304 { 1308 {
1305 unsigned long k; 1309 unsigned long k;
1306 unsigned long nlongs; 1310 unsigned long nlongs;
1307 unsigned long endmask; 1311 unsigned long endmask;
1308 1312
1309 --maxnode; 1313 --maxnode;
1310 nodes_clear(*nodes); 1314 nodes_clear(*nodes);
1311 if (maxnode == 0 || !nmask) 1315 if (maxnode == 0 || !nmask)
1312 return 0; 1316 return 0;
1313 if (maxnode > PAGE_SIZE*BITS_PER_BYTE) 1317 if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1314 return -EINVAL; 1318 return -EINVAL;
1315 1319
1316 nlongs = BITS_TO_LONGS(maxnode); 1320 nlongs = BITS_TO_LONGS(maxnode);
1317 if ((maxnode % BITS_PER_LONG) == 0) 1321 if ((maxnode % BITS_PER_LONG) == 0)
1318 endmask = ~0UL; 1322 endmask = ~0UL;
1319 else 1323 else
1320 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; 1324 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1321 1325
1322 /* When the user specifies more nodes than supported, just check 1326 /* When the user specifies more nodes than supported, just check
1323 that the unsupported part is all zero. */ 1327 that the unsupported part is all zero. */
1324 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { 1328 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1325 if (nlongs > PAGE_SIZE/sizeof(long)) 1329 if (nlongs > PAGE_SIZE/sizeof(long))
1326 return -EINVAL; 1330 return -EINVAL;
1327 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { 1331 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1328 unsigned long t; 1332 unsigned long t;
1329 if (get_user(t, nmask + k)) 1333 if (get_user(t, nmask + k))
1330 return -EFAULT; 1334 return -EFAULT;
1331 if (k == nlongs - 1) { 1335 if (k == nlongs - 1) {
1332 if (t & endmask) 1336 if (t & endmask)
1333 return -EINVAL; 1337 return -EINVAL;
1334 } else if (t) 1338 } else if (t)
1335 return -EINVAL; 1339 return -EINVAL;
1336 } 1340 }
1337 nlongs = BITS_TO_LONGS(MAX_NUMNODES); 1341 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1338 endmask = ~0UL; 1342 endmask = ~0UL;
1339 } 1343 }
1340 1344
1341 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) 1345 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1342 return -EFAULT; 1346 return -EFAULT;
1343 nodes_addr(*nodes)[nlongs-1] &= endmask; 1347 nodes_addr(*nodes)[nlongs-1] &= endmask;
1344 return 0; 1348 return 0;
1345 } 1349 }
1346 1350
1347 /* Copy a kernel node mask to user space */ 1351 /* Copy a kernel node mask to user space */
1348 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, 1352 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1349 nodemask_t *nodes) 1353 nodemask_t *nodes)
1350 { 1354 {
1351 unsigned long copy = ALIGN(maxnode-1, 64) / 8; 1355 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1352 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); 1356 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1353 1357
1354 if (copy > nbytes) { 1358 if (copy > nbytes) {
1355 if (copy > PAGE_SIZE) 1359 if (copy > PAGE_SIZE)
1356 return -EINVAL; 1360 return -EINVAL;
1357 if (clear_user((char __user *)mask + nbytes, copy - nbytes)) 1361 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1358 return -EFAULT; 1362 return -EFAULT;
1359 copy = nbytes; 1363 copy = nbytes;
1360 } 1364 }
1361 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1365 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1362 } 1366 }
1363 1367
1364 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, 1368 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1365 unsigned long, mode, unsigned long __user *, nmask, 1369 unsigned long, mode, unsigned long __user *, nmask,
1366 unsigned long, maxnode, unsigned, flags) 1370 unsigned long, maxnode, unsigned, flags)
1367 { 1371 {
1368 nodemask_t nodes; 1372 nodemask_t nodes;
1369 int err; 1373 int err;
1370 unsigned short mode_flags; 1374 unsigned short mode_flags;
1371 1375
1372 mode_flags = mode & MPOL_MODE_FLAGS; 1376 mode_flags = mode & MPOL_MODE_FLAGS;
1373 mode &= ~MPOL_MODE_FLAGS; 1377 mode &= ~MPOL_MODE_FLAGS;
1374 if (mode >= MPOL_MAX) 1378 if (mode >= MPOL_MAX)
1375 return -EINVAL; 1379 return -EINVAL;
1376 if ((mode_flags & MPOL_F_STATIC_NODES) && 1380 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1377 (mode_flags & MPOL_F_RELATIVE_NODES)) 1381 (mode_flags & MPOL_F_RELATIVE_NODES))
1378 return -EINVAL; 1382 return -EINVAL;
1379 err = get_nodes(&nodes, nmask, maxnode); 1383 err = get_nodes(&nodes, nmask, maxnode);
1380 if (err) 1384 if (err)
1381 return err; 1385 return err;
1382 return do_mbind(start, len, mode, mode_flags, &nodes, flags); 1386 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1383 } 1387 }
1384 1388
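get_nodes() above parses exactly the raw bitmap that user space hands to these system calls. A minimal sketch of the calling side of mbind(2) (using the libnuma <numaif.h> wrapper, link with -lnuma; assumes nodes 0 and 1 exist):

#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 20;				/* 2 MB */
	unsigned long nodes = (1UL << 0) | (1UL << 1);	/* nodes {0,1} */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* bind the range to nodes 0-1 and move any misplaced pages */
	if (mbind(p, len, MPOL_BIND, &nodes, 8 * sizeof(nodes), MPOL_MF_MOVE))
		perror("mbind");
	return 0;
}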
1385 /* Set the process memory policy */ 1389 /* Set the process memory policy */
1386 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, 1390 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1387 unsigned long, maxnode) 1391 unsigned long, maxnode)
1388 { 1392 {
1389 int err; 1393 int err;
1390 nodemask_t nodes; 1394 nodemask_t nodes;
1391 unsigned short flags; 1395 unsigned short flags;
1392 1396
1393 flags = mode & MPOL_MODE_FLAGS; 1397 flags = mode & MPOL_MODE_FLAGS;
1394 mode &= ~MPOL_MODE_FLAGS; 1398 mode &= ~MPOL_MODE_FLAGS;
1395 if ((unsigned int)mode >= MPOL_MAX) 1399 if ((unsigned int)mode >= MPOL_MAX)
1396 return -EINVAL; 1400 return -EINVAL;
1397 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) 1401 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1398 return -EINVAL; 1402 return -EINVAL;
1399 err = get_nodes(&nodes, nmask, maxnode); 1403 err = get_nodes(&nodes, nmask, maxnode);
1400 if (err) 1404 if (err)
1401 return err; 1405 return err;
1402 return do_set_mempolicy(mode, flags, &nodes); 1406 return do_set_mempolicy(mode, flags, &nodes);
1403 } 1407 }
1404 1408
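The task-wide counterpart, again via <numaif.h> (assumes at least two nodes): interleave all future allocations of the calling task across nodes 0 and 1, which lands in do_set_mempolicy() above and primes current->il_next:

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long nodes = (1UL << 0) | (1UL << 1);

	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, 8 * sizeof(nodes)))
		perror("set_mempolicy");
	return 0;
}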
1405 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, 1409 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1406 const unsigned long __user *, old_nodes, 1410 const unsigned long __user *, old_nodes,
1407 const unsigned long __user *, new_nodes) 1411 const unsigned long __user *, new_nodes)
1408 { 1412 {
1409 const struct cred *cred = current_cred(), *tcred; 1413 const struct cred *cred = current_cred(), *tcred;
1410 struct mm_struct *mm = NULL; 1414 struct mm_struct *mm = NULL;
1411 struct task_struct *task; 1415 struct task_struct *task;
1412 nodemask_t task_nodes; 1416 nodemask_t task_nodes;
1413 int err; 1417 int err;
1414 nodemask_t *old; 1418 nodemask_t *old;
1415 nodemask_t *new; 1419 nodemask_t *new;
1416 NODEMASK_SCRATCH(scratch); 1420 NODEMASK_SCRATCH(scratch);
1417 1421
1418 if (!scratch) 1422 if (!scratch)
1419 return -ENOMEM; 1423 return -ENOMEM;
1420 1424
1421 old = &scratch->mask1; 1425 old = &scratch->mask1;
1422 new = &scratch->mask2; 1426 new = &scratch->mask2;
1423 1427
1424 err = get_nodes(old, old_nodes, maxnode); 1428 err = get_nodes(old, old_nodes, maxnode);
1425 if (err) 1429 if (err)
1426 goto out; 1430 goto out;
1427 1431
1428 err = get_nodes(new, new_nodes, maxnode); 1432 err = get_nodes(new, new_nodes, maxnode);
1429 if (err) 1433 if (err)
1430 goto out; 1434 goto out;
1431 1435
1432 /* Find the mm_struct */ 1436 /* Find the mm_struct */
1433 rcu_read_lock(); 1437 rcu_read_lock();
1434 task = pid ? find_task_by_vpid(pid) : current; 1438 task = pid ? find_task_by_vpid(pid) : current;
1435 if (!task) { 1439 if (!task) {
1436 rcu_read_unlock(); 1440 rcu_read_unlock();
1437 err = -ESRCH; 1441 err = -ESRCH;
1438 goto out; 1442 goto out;
1439 } 1443 }
1440 get_task_struct(task); 1444 get_task_struct(task);
1441 1445
1442 err = -EINVAL; 1446 err = -EINVAL;
1443 1447
1444 /* 1448 /*
1445 * Check if this process has the right to modify the specified 1449 * Check if this process has the right to modify the specified
1446 * process. The right exists if the process has administrative 1450 * process. The right exists if the process has administrative
1447 * capabilities, superuser privileges or the same 1451 * capabilities, superuser privileges or the same
1448 * userid as the target process. 1452 * userid as the target process.
1449 */ 1453 */
1450 tcred = __task_cred(task); 1454 tcred = __task_cred(task);
1451 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && 1455 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1452 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && 1456 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1453 !capable(CAP_SYS_NICE)) { 1457 !capable(CAP_SYS_NICE)) {
1454 rcu_read_unlock(); 1458 rcu_read_unlock();
1455 err = -EPERM; 1459 err = -EPERM;
1456 goto out_put; 1460 goto out_put;
1457 } 1461 }
1458 rcu_read_unlock(); 1462 rcu_read_unlock();
1459 1463
1460 task_nodes = cpuset_mems_allowed(task); 1464 task_nodes = cpuset_mems_allowed(task);
1461 /* Is the user allowed to access the target nodes? */ 1465 /* Is the user allowed to access the target nodes? */
1462 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1466 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1463 err = -EPERM; 1467 err = -EPERM;
1464 goto out_put; 1468 goto out_put;
1465 } 1469 }
1466 1470
1467 if (!nodes_subset(*new, node_states[N_MEMORY])) { 1471 if (!nodes_subset(*new, node_states[N_MEMORY])) {
1468 err = -EINVAL; 1472 err = -EINVAL;
1469 goto out_put; 1473 goto out_put;
1470 } 1474 }
1471 1475
1472 err = security_task_movememory(task); 1476 err = security_task_movememory(task);
1473 if (err) 1477 if (err)
1474 goto out_put; 1478 goto out_put;
1475 1479
1476 mm = get_task_mm(task); 1480 mm = get_task_mm(task);
1477 put_task_struct(task); 1481 put_task_struct(task);
1478 1482
1479 if (!mm) { 1483 if (!mm) {
1480 err = -EINVAL; 1484 err = -EINVAL;
1481 goto out; 1485 goto out;
1482 } 1486 }
1483 1487
1484 err = do_migrate_pages(mm, old, new, 1488 err = do_migrate_pages(mm, old, new,
1485 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1489 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1486 1490
1487 mmput(mm); 1491 mmput(mm);
1488 out: 1492 out:
1489 NODEMASK_SCRATCH_FREE(scratch); 1493 NODEMASK_SCRATCH_FREE(scratch);
1490 1494
1491 return err; 1495 return err;
1492 1496
1493 out_put: 1497 out_put:
1494 put_task_struct(task); 1498 put_task_struct(task);
1495 goto out; 1499 goto out;
1496 1500
1497 } 1501 }
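/*
 * Illustrative userspace sketch (not part of this file): driving the
 * migrate_pages() handler above to move a target process's pages from
 * node 0 to node 1.  Assumes libnuma's <numaif.h> wrapper (link with
 * -lnuma); the caller needs CAP_SYS_NICE or a matching uid, as checked
 * in the handler.
 */
#include <numaif.h>
#include <stdio.h>
#include <sys/types.h>

static int move_to_node1(pid_t pid)
{
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* destination: node 1 */
	long rc;

	/* maxnode counts bits; one unsigned long covers nodes 0..62 here */
	rc = migrate_pages(pid, 8 * sizeof(unsigned long),
			   &old_nodes, &new_nodes);
	if (rc < 0)
		perror("migrate_pages");
	return rc < 0 ? -1 : 0;
}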
1498 1502
1499 1503
1500 /* Retrieve NUMA policy */ 1504 /* Retrieve NUMA policy */
1501 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1505 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1502 unsigned long __user *, nmask, unsigned long, maxnode, 1506 unsigned long __user *, nmask, unsigned long, maxnode,
1503 unsigned long, addr, unsigned long, flags) 1507 unsigned long, addr, unsigned long, flags)
1504 { 1508 {
1505 int err; 1509 int err;
1506 int uninitialized_var(pval); 1510 int uninitialized_var(pval);
1507 nodemask_t nodes; 1511 nodemask_t nodes;
1508 1512
1509 if (nmask != NULL && maxnode < MAX_NUMNODES) 1513 if (nmask != NULL && maxnode < MAX_NUMNODES)
1510 return -EINVAL; 1514 return -EINVAL;
1511 1515
1512 err = do_get_mempolicy(&pval, &nodes, addr, flags); 1516 err = do_get_mempolicy(&pval, &nodes, addr, flags);
1513 1517
1514 if (err) 1518 if (err)
1515 return err; 1519 return err;
1516 1520
1517 if (policy && put_user(pval, policy)) 1521 if (policy && put_user(pval, policy))
1518 return -EFAULT; 1522 return -EFAULT;
1519 1523
1520 if (nmask) 1524 if (nmask)
1521 err = copy_nodes_to_user(nmask, maxnode, &nodes); 1525 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1522 1526
1523 return err; 1527 return err;
1524 } 1528 }
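/*
 * Illustrative userspace sketch (not part of this file): setting an
 * interleave policy across nodes 0 and 1 and reading the mode back
 * through the set_mempolicy()/get_mempolicy() handlers above.  Assumes
 * libnuma's <numaif.h> wrappers (link with -lnuma).
 */
#include <numaif.h>
#include <stdio.h>

static int interleave_nodes_0_1(void)
{
	unsigned long nodes = (1UL << 0) | (1UL << 1);
	unsigned long maxnode = 8 * sizeof(unsigned long);
	int mode = -1;

	if (set_mempolicy(MPOL_INTERLEAVE, &nodes, maxnode) < 0) {
		perror("set_mempolicy");
		return -1;
	}
	/* NULL nmask and flags of 0 query the calling task's policy mode */
	if (get_mempolicy(&mode, NULL, 0, NULL, 0) < 0) {
		perror("get_mempolicy");
		return -1;
	}
	printf("interleave active: %d\n", mode == MPOL_INTERLEAVE);
	return 0;
}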
1525 1529
1526 #ifdef CONFIG_COMPAT 1530 #ifdef CONFIG_COMPAT
1527 1531
1528 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, 1532 COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1529 compat_ulong_t __user *, nmask, 1533 compat_ulong_t __user *, nmask,
1530 compat_ulong_t, maxnode, 1534 compat_ulong_t, maxnode,
1531 compat_ulong_t, addr, compat_ulong_t, flags) 1535 compat_ulong_t, addr, compat_ulong_t, flags)
1532 { 1536 {
1533 long err; 1537 long err;
1534 unsigned long __user *nm = NULL; 1538 unsigned long __user *nm = NULL;
1535 unsigned long nr_bits, alloc_size; 1539 unsigned long nr_bits, alloc_size;
1536 DECLARE_BITMAP(bm, MAX_NUMNODES); 1540 DECLARE_BITMAP(bm, MAX_NUMNODES);
1537 1541
1538 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1542 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1539 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1543 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1540 1544
1541 if (nmask) 1545 if (nmask)
1542 nm = compat_alloc_user_space(alloc_size); 1546 nm = compat_alloc_user_space(alloc_size);
1543 1547
1544 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1548 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1545 1549
1546 if (!err && nmask) { 1550 if (!err && nmask) {
1547 unsigned long copy_size; 1551 unsigned long copy_size;
1548 copy_size = min_t(unsigned long, sizeof(bm), alloc_size); 1552 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1549 err = copy_from_user(bm, nm, copy_size); 1553 err = copy_from_user(bm, nm, copy_size);
1550 /* ensure entire bitmap is zeroed */ 1554 /* ensure entire bitmap is zeroed */
1551 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1555 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1552 err |= compat_put_bitmap(nmask, bm, nr_bits); 1556 err |= compat_put_bitmap(nmask, bm, nr_bits);
1553 } 1557 }
1554 1558
1555 return err; 1559 return err;
1556 } 1560 }
1557 1561
1558 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, 1562 COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1559 compat_ulong_t, maxnode) 1563 compat_ulong_t, maxnode)
1560 { 1564 {
1561 long err = 0; 1565 long err = 0;
1562 unsigned long __user *nm = NULL; 1566 unsigned long __user *nm = NULL;
1563 unsigned long nr_bits, alloc_size; 1567 unsigned long nr_bits, alloc_size;
1564 DECLARE_BITMAP(bm, MAX_NUMNODES); 1568 DECLARE_BITMAP(bm, MAX_NUMNODES);
1565 1569
1566 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1570 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1567 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1571 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1568 1572
1569 if (nmask) { 1573 if (nmask) {
1570 err = compat_get_bitmap(bm, nmask, nr_bits); 1574 err = compat_get_bitmap(bm, nmask, nr_bits);
1571 nm = compat_alloc_user_space(alloc_size); 1575 nm = compat_alloc_user_space(alloc_size);
1572 err |= copy_to_user(nm, bm, alloc_size); 1576 err |= copy_to_user(nm, bm, alloc_size);
1573 } 1577 }
1574 1578
1575 if (err) 1579 if (err)
1576 return -EFAULT; 1580 return -EFAULT;
1577 1581
1578 return sys_set_mempolicy(mode, nm, nr_bits+1); 1582 return sys_set_mempolicy(mode, nm, nr_bits+1);
1579 } 1583 }
1580 1584
1581 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, 1585 COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1582 compat_ulong_t, mode, compat_ulong_t __user *, nmask, 1586 compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1583 compat_ulong_t, maxnode, compat_ulong_t, flags) 1587 compat_ulong_t, maxnode, compat_ulong_t, flags)
1584 { 1588 {
1585 long err = 0; 1589 long err = 0;
1586 unsigned long __user *nm = NULL; 1590 unsigned long __user *nm = NULL;
1587 unsigned long nr_bits, alloc_size; 1591 unsigned long nr_bits, alloc_size;
1588 nodemask_t bm; 1592 nodemask_t bm;
1589 1593
1590 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); 1594 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1591 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; 1595 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1592 1596
1593 if (nmask) { 1597 if (nmask) {
1594 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); 1598 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1595 nm = compat_alloc_user_space(alloc_size); 1599 nm = compat_alloc_user_space(alloc_size);
1596 err |= copy_to_user(nm, nodes_addr(bm), alloc_size); 1600 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1597 } 1601 }
1598 1602
1599 if (err) 1603 if (err)
1600 return -EFAULT; 1604 return -EFAULT;
1601 1605
1602 return sys_mbind(start, len, mode, nm, nr_bits+1, flags); 1606 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1603 } 1607 }
1604 1608
1605 #endif 1609 #endif
1606 1610
1607 /* 1611 /*
1608 * get_vma_policy(@task, @vma, @addr) 1612 * get_vma_policy(@task, @vma, @addr)
1609 * @task - task for fallback if vma policy == default 1613 * @task - task for fallback if vma policy == default
1610 * @vma - virtual memory area whose policy is sought 1614 * @vma - virtual memory area whose policy is sought
1611 * @addr - address in @vma for shared policy lookup 1615 * @addr - address in @vma for shared policy lookup
1612 * 1616 *
1613 * Returns effective policy for a VMA at specified address. 1617 * Returns effective policy for a VMA at specified address.
1614 * Falls back to @task or system default policy, as necessary. 1618 * Falls back to @task or system default policy, as necessary.
1615 * Current or other task's task mempolicy and non-shared vma policies must be 1619 * Current or other task's task mempolicy and non-shared vma policies must be
1616 * protected by task_lock(task) by the caller. 1620 * protected by task_lock(task) by the caller.
1617 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference 1621 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1618 * count--added by the get_policy() vm_op, as appropriate--to protect against 1622 * count--added by the get_policy() vm_op, as appropriate--to protect against
1619 * freeing by another task. It is the caller's responsibility to free the 1623 * freeing by another task. It is the caller's responsibility to free the
1620 * extra reference for shared policies. 1624 * extra reference for shared policies.
1621 */ 1625 */
1622 struct mempolicy *get_vma_policy(struct task_struct *task, 1626 struct mempolicy *get_vma_policy(struct task_struct *task,
1623 struct vm_area_struct *vma, unsigned long addr) 1627 struct vm_area_struct *vma, unsigned long addr)
1624 { 1628 {
1625 struct mempolicy *pol = get_task_policy(task); 1629 struct mempolicy *pol = get_task_policy(task);
1626 1630
1627 if (vma) { 1631 if (vma) {
1628 if (vma->vm_ops && vma->vm_ops->get_policy) { 1632 if (vma->vm_ops && vma->vm_ops->get_policy) {
1629 struct mempolicy *vpol = vma->vm_ops->get_policy(vma, 1633 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1630 addr); 1634 addr);
1631 if (vpol) 1635 if (vpol)
1632 pol = vpol; 1636 pol = vpol;
1633 } else if (vma->vm_policy) { 1637 } else if (vma->vm_policy) {
1634 pol = vma->vm_policy; 1638 pol = vma->vm_policy;
1635 1639
1636 /* 1640 /*
1637 * shmem_alloc_page() passes MPOL_F_SHARED policy with 1641 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1638 * a pseudo vma whose vma->vm_ops=NULL. Take a reference 1642 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1639 * count on these policies which will be dropped by 1643 * count on these policies which will be dropped by
1640 * mpol_cond_put() later 1644 * mpol_cond_put() later
1641 */ 1645 */
1642 if (mpol_needs_cond_ref(pol)) 1646 if (mpol_needs_cond_ref(pol))
1643 mpol_get(pol); 1647 mpol_get(pol);
1644 } 1648 }
1645 } 1649 }
1646 if (!pol) 1650 if (!pol)
1647 pol = &default_policy; 1651 pol = &default_policy;
1648 return pol; 1652 return pol;
1649 } 1653 }
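/*
 * Illustrative sketch (not part of this file) of the reference rule
 * documented above: a caller that may be handed a shared policy
 * (MPOL_F_SHARED, e.g. from shmem) must drop the extra reference with
 * mpol_cond_put() once it is done, as mpol_misplaced() below does.
 */
static unsigned short vma_policy_mode(struct vm_area_struct *vma,
				      unsigned long addr)
{
	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	unsigned short mode = pol->mode;

	mpol_cond_put(pol);	/* no-op unless pol carries MPOL_F_SHARED */
	return mode;
}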
1650 1654
1651 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) 1655 bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
1652 { 1656 {
1653 struct mempolicy *pol = get_task_policy(task); 1657 struct mempolicy *pol = get_task_policy(task);
1654 if (vma) { 1658 if (vma) {
1655 if (vma->vm_ops && vma->vm_ops->get_policy) { 1659 if (vma->vm_ops && vma->vm_ops->get_policy) {
1656 bool ret = false; 1660 bool ret = false;
1657 1661
1658 pol = vma->vm_ops->get_policy(vma, vma->vm_start); 1662 pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1659 if (pol && (pol->flags & MPOL_F_MOF)) 1663 if (pol && (pol->flags & MPOL_F_MOF))
1660 ret = true; 1664 ret = true;
1661 mpol_cond_put(pol); 1665 mpol_cond_put(pol);
1662 1666
1663 return ret; 1667 return ret;
1664 } else if (vma->vm_policy) { 1668 } else if (vma->vm_policy) {
1665 pol = vma->vm_policy; 1669 pol = vma->vm_policy;
1666 } 1670 }
1667 } 1671 }
1668 1672
1669 if (!pol) 1673 if (!pol)
1670 return default_policy.flags & MPOL_F_MOF; 1674 return default_policy.flags & MPOL_F_MOF;
1671 1675
1672 return pol->flags & MPOL_F_MOF; 1676 return pol->flags & MPOL_F_MOF;
1673 } 1677 }
1674 1678
1675 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) 1679 static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1676 { 1680 {
1677 enum zone_type dynamic_policy_zone = policy_zone; 1681 enum zone_type dynamic_policy_zone = policy_zone;
1678 1682
1679 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); 1683 BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1680 1684
1681 /* 1685 /*
1682 * if policy->v.nodes has movable memory only, 1686 * if policy->v.nodes has movable memory only,
1683 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. 1687 * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
1684 * 1688 *
1685 * policy->v.nodes intersects with node_states[N_MEMORY], 1689 * policy->v.nodes intersects with node_states[N_MEMORY],
1686 * so if the following test fails, it implies 1690 * so if the following test fails, it implies
1687 * policy->v.nodes has movable memory only. 1691 * policy->v.nodes has movable memory only.
1688 */ 1692 */
1689 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) 1693 if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1690 dynamic_policy_zone = ZONE_MOVABLE; 1694 dynamic_policy_zone = ZONE_MOVABLE;
1691 1695
1692 return zone >= dynamic_policy_zone; 1696 return zone >= dynamic_policy_zone;
1693 } 1697 }
1694 1698
1695 /* 1699 /*
1696 * Return a nodemask representing a mempolicy for filtering nodes for 1700 * Return a nodemask representing a mempolicy for filtering nodes for
1697 * page allocation 1701 * page allocation
1698 */ 1702 */
1699 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) 1703 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1700 { 1704 {
1701 /* Lower zones don't get a nodemask applied for MPOL_BIND */ 1705 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1702 if (unlikely(policy->mode == MPOL_BIND) && 1706 if (unlikely(policy->mode == MPOL_BIND) &&
1703 apply_policy_zone(policy, gfp_zone(gfp)) && 1707 apply_policy_zone(policy, gfp_zone(gfp)) &&
1704 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) 1708 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1705 return &policy->v.nodes; 1709 return &policy->v.nodes;
1706 1710
1707 return NULL; 1711 return NULL;
1708 } 1712 }
1709 1713
1710 /* Return a zonelist indicated by gfp for node representing a mempolicy */ 1714 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1711 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, 1715 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1712 int nd) 1716 int nd)
1713 { 1717 {
1714 switch (policy->mode) { 1718 switch (policy->mode) {
1715 case MPOL_PREFERRED: 1719 case MPOL_PREFERRED:
1716 if (!(policy->flags & MPOL_F_LOCAL)) 1720 if (!(policy->flags & MPOL_F_LOCAL))
1717 nd = policy->v.preferred_node; 1721 nd = policy->v.preferred_node;
1718 break; 1722 break;
1719 case MPOL_BIND: 1723 case MPOL_BIND:
1720 /* 1724 /*
1721 * Normally, MPOL_BIND allocations are node-local within the 1725 * Normally, MPOL_BIND allocations are node-local within the
1722 * allowed nodemask. However, if __GFP_THISNODE is set and the 1726 * allowed nodemask. However, if __GFP_THISNODE is set and the
1723 * current node isn't part of the mask, we use the zonelist for 1727 * current node isn't part of the mask, we use the zonelist for
1724 * the first node in the mask instead. 1728 * the first node in the mask instead.
1725 */ 1729 */
1726 if (unlikely(gfp & __GFP_THISNODE) && 1730 if (unlikely(gfp & __GFP_THISNODE) &&
1727 unlikely(!node_isset(nd, policy->v.nodes))) 1731 unlikely(!node_isset(nd, policy->v.nodes)))
1728 nd = first_node(policy->v.nodes); 1732 nd = first_node(policy->v.nodes);
1729 break; 1733 break;
1730 default: 1734 default:
1731 BUG(); 1735 BUG();
1732 } 1736 }
1733 return node_zonelist(nd, gfp); 1737 return node_zonelist(nd, gfp);
1734 } 1738 }
1735 1739
1736 /* Do dynamic interleaving for a process */ 1740 /* Do dynamic interleaving for a process */
1737 static unsigned interleave_nodes(struct mempolicy *policy) 1741 static unsigned interleave_nodes(struct mempolicy *policy)
1738 { 1742 {
1739 unsigned nid, next; 1743 unsigned nid, next;
1740 struct task_struct *me = current; 1744 struct task_struct *me = current;
1741 1745
1742 nid = me->il_next; 1746 nid = me->il_next;
1743 next = next_node(nid, policy->v.nodes); 1747 next = next_node(nid, policy->v.nodes);
1744 if (next >= MAX_NUMNODES) 1748 if (next >= MAX_NUMNODES)
1745 next = first_node(policy->v.nodes); 1749 next = first_node(policy->v.nodes);
1746 if (next < MAX_NUMNODES) 1750 if (next < MAX_NUMNODES)
1747 me->il_next = next; 1751 me->il_next = next;
1748 return nid; 1752 return nid;
1749 } 1753 }
1750 1754
1751 /* 1755 /*
1752 * Depending on the memory policy provide a node from which to allocate the 1756 * Depending on the memory policy provide a node from which to allocate the
1753 * next slab entry. 1757 * next slab entry.
1754 */ 1758 */
1755 unsigned int mempolicy_slab_node(void) 1759 unsigned int mempolicy_slab_node(void)
1756 { 1760 {
1757 struct mempolicy *policy; 1761 struct mempolicy *policy;
1758 int node = numa_mem_id(); 1762 int node = numa_mem_id();
1759 1763
1760 if (in_interrupt()) 1764 if (in_interrupt())
1761 return node; 1765 return node;
1762 1766
1763 policy = current->mempolicy; 1767 policy = current->mempolicy;
1764 if (!policy || policy->flags & MPOL_F_LOCAL) 1768 if (!policy || policy->flags & MPOL_F_LOCAL)
1765 return node; 1769 return node;
1766 1770
1767 switch (policy->mode) { 1771 switch (policy->mode) {
1768 case MPOL_PREFERRED: 1772 case MPOL_PREFERRED:
1769 /* 1773 /*
1770 * handled MPOL_F_LOCAL above 1774 * handled MPOL_F_LOCAL above
1771 */ 1775 */
1772 return policy->v.preferred_node; 1776 return policy->v.preferred_node;
1773 1777
1774 case MPOL_INTERLEAVE: 1778 case MPOL_INTERLEAVE:
1775 return interleave_nodes(policy); 1779 return interleave_nodes(policy);
1776 1780
1777 case MPOL_BIND: { 1781 case MPOL_BIND: {
1778 /* 1782 /*
1779 * Follow bind policy behavior and start allocation at the 1783 * Follow bind policy behavior and start allocation at the
1780 * first node. 1784 * first node.
1781 */ 1785 */
1782 struct zonelist *zonelist; 1786 struct zonelist *zonelist;
1783 struct zone *zone; 1787 struct zone *zone;
1784 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); 1788 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1785 zonelist = &NODE_DATA(node)->node_zonelists[0]; 1789 zonelist = &NODE_DATA(node)->node_zonelists[0];
1786 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1790 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1787 &policy->v.nodes, 1791 &policy->v.nodes,
1788 &zone); 1792 &zone);
1789 return zone ? zone->node : node; 1793 return zone ? zone->node : node;
1790 } 1794 }
1791 1795
1792 default: 1796 default:
1793 BUG(); 1797 BUG();
1794 } 1798 }
1795 } 1799 }
1796 1800
1797 /* Do static interleaving for a VMA with known offset. */ 1801 /* Do static interleaving for a VMA with known offset. */
1798 static unsigned offset_il_node(struct mempolicy *pol, 1802 static unsigned offset_il_node(struct mempolicy *pol,
1799 struct vm_area_struct *vma, unsigned long off) 1803 struct vm_area_struct *vma, unsigned long off)
1800 { 1804 {
1801 unsigned nnodes = nodes_weight(pol->v.nodes); 1805 unsigned nnodes = nodes_weight(pol->v.nodes);
1802 unsigned target; 1806 unsigned target;
1803 int c; 1807 int c;
1804 int nid = NUMA_NO_NODE; 1808 int nid = NUMA_NO_NODE;
1805 1809
1806 if (!nnodes) 1810 if (!nnodes)
1807 return numa_node_id(); 1811 return numa_node_id();
1808 target = (unsigned int)off % nnodes; 1812 target = (unsigned int)off % nnodes;
1809 c = 0; 1813 c = 0;
1810 do { 1814 do {
1811 nid = next_node(nid, pol->v.nodes); 1815 nid = next_node(nid, pol->v.nodes);
1812 c++; 1816 c++;
1813 } while (c <= target); 1817 } while (c <= target);
1814 return nid; 1818 return nid;
1815 } 1819 }
1816 1820
1817 /* Determine a node number for interleave */ 1821 /* Determine a node number for interleave */
1818 static inline unsigned interleave_nid(struct mempolicy *pol, 1822 static inline unsigned interleave_nid(struct mempolicy *pol,
1819 struct vm_area_struct *vma, unsigned long addr, int shift) 1823 struct vm_area_struct *vma, unsigned long addr, int shift)
1820 { 1824 {
1821 if (vma) { 1825 if (vma) {
1822 unsigned long off; 1826 unsigned long off;
1823 1827
1824 /* 1828 /*
1825 * for small pages, there is no difference between 1829 * for small pages, there is no difference between
1826 * shift and PAGE_SHIFT, so the bit-shift is safe. 1830 * shift and PAGE_SHIFT, so the bit-shift is safe.
1827 * for huge pages, since vm_pgoff is in units of small 1831 * for huge pages, since vm_pgoff is in units of small
1828 * pages, we need to shift off the always 0 bits to get 1832 * pages, we need to shift off the always 0 bits to get
1829 * a useful offset. 1833 * a useful offset.
1830 */ 1834 */
1831 BUG_ON(shift < PAGE_SHIFT); 1835 BUG_ON(shift < PAGE_SHIFT);
1832 off = vma->vm_pgoff >> (shift - PAGE_SHIFT); 1836 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1833 off += (addr - vma->vm_start) >> shift; 1837 off += (addr - vma->vm_start) >> shift;
1834 return offset_il_node(pol, vma, off); 1838 return offset_il_node(pol, vma, off);
1835 } else 1839 } else
1836 return interleave_nodes(pol); 1840 return interleave_nodes(pol);
1837 } 1841 }
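/*
 * Illustrative sketch (not part of this file): the static interleave above
 * boils down to "pick the (off % nnodes)-th node of the allowed set", with
 * off measured in units of the mapping's page size.  For an allowed set of
 * {0, 2, 3}, offsets 0,1,2,3,4 map to nodes 0,2,3,0,2.  An array-based
 * version of the same selection:
 */
static unsigned pick_interleave_node(const unsigned *allowed, unsigned nnodes,
				     unsigned long off)
{
	/* offset_il_node() walks the nodemask instead of indexing an array */
	return allowed[off % nnodes];
}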
1838 1842
1839 /* 1843 /*
1840 * Return the bit number of a random bit set in the nodemask. 1844 * Return the bit number of a random bit set in the nodemask.
1841 * (returns NUMA_NO_NODE if nodemask is empty) 1845 * (returns NUMA_NO_NODE if nodemask is empty)
1842 */ 1846 */
1843 int node_random(const nodemask_t *maskp) 1847 int node_random(const nodemask_t *maskp)
1844 { 1848 {
1845 int w, bit = NUMA_NO_NODE; 1849 int w, bit = NUMA_NO_NODE;
1846 1850
1847 w = nodes_weight(*maskp); 1851 w = nodes_weight(*maskp);
1848 if (w) 1852 if (w)
1849 bit = bitmap_ord_to_pos(maskp->bits, 1853 bit = bitmap_ord_to_pos(maskp->bits,
1850 get_random_int() % w, MAX_NUMNODES); 1854 get_random_int() % w, MAX_NUMNODES);
1851 return bit; 1855 return bit;
1852 } 1856 }
1853 1857
1854 #ifdef CONFIG_HUGETLBFS 1858 #ifdef CONFIG_HUGETLBFS
1855 /* 1859 /*
1856 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1860 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1857 * @vma = virtual memory area whose policy is sought 1861 * @vma = virtual memory area whose policy is sought
1858 * @addr = address in @vma for shared policy lookup and interleave policy 1862 * @addr = address in @vma for shared policy lookup and interleave policy
1859 * @gfp_flags = for requested zone 1863 * @gfp_flags = for requested zone
1860 * @mpol = pointer to mempolicy pointer for reference counted mempolicy 1864 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1861 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask 1865 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1862 * 1866 *
1863 * Returns a zonelist suitable for a huge page allocation and a pointer 1867 * Returns a zonelist suitable for a huge page allocation and a pointer
1864 * to the struct mempolicy for conditional unref after allocation. 1868 * to the struct mempolicy for conditional unref after allocation.
1865 * If the effective policy is 'BIND', returns a pointer to the mempolicy's 1869 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1866 * @nodemask for filtering the zonelist. 1870 * @nodemask for filtering the zonelist.
1867 * 1871 *
1868 * Must be protected by read_mems_allowed_begin() 1872 * Must be protected by read_mems_allowed_begin()
1869 */ 1873 */
1870 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1874 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1871 gfp_t gfp_flags, struct mempolicy **mpol, 1875 gfp_t gfp_flags, struct mempolicy **mpol,
1872 nodemask_t **nodemask) 1876 nodemask_t **nodemask)
1873 { 1877 {
1874 struct zonelist *zl; 1878 struct zonelist *zl;
1875 1879
1876 *mpol = get_vma_policy(current, vma, addr); 1880 *mpol = get_vma_policy(current, vma, addr);
1877 *nodemask = NULL; /* assume !MPOL_BIND */ 1881 *nodemask = NULL; /* assume !MPOL_BIND */
1878 1882
1879 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1883 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1880 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1884 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1881 huge_page_shift(hstate_vma(vma))), gfp_flags); 1885 huge_page_shift(hstate_vma(vma))), gfp_flags);
1882 } else { 1886 } else {
1883 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); 1887 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1884 if ((*mpol)->mode == MPOL_BIND) 1888 if ((*mpol)->mode == MPOL_BIND)
1885 *nodemask = &(*mpol)->v.nodes; 1889 *nodemask = &(*mpol)->v.nodes;
1886 } 1890 }
1887 return zl; 1891 return zl;
1888 } 1892 }
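/*
 * Illustrative sketch (not part of this file) of the contract documented
 * above, loosely modelled on the hugetlb fault path: walk the returned
 * zonelist, honour *nodemask when the policy was MPOL_BIND, and drop the
 * conditional policy reference when done.  The GFP flags are only an
 * example.
 */
static void huge_zonelist_usage(struct vm_area_struct *vma, unsigned long addr)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zl;

	zl = huge_zonelist(vma, addr, GFP_HIGHUSER_MOVABLE, &mpol, &nodemask);
	/* ... pick a zone from zl, skipping nodes outside *nodemask ... */
	mpol_cond_put(mpol);	/* pairs with the reference taken above */
}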
1889 1893
1890 /* 1894 /*
1891 * init_nodemask_of_mempolicy 1895 * init_nodemask_of_mempolicy
1892 * 1896 *
1893 * If the current task's mempolicy is "default" [NULL], return 'false' 1897 * If the current task's mempolicy is "default" [NULL], return 'false'
1894 * to indicate default policy. Otherwise, extract the policy nodemask 1898 * to indicate default policy. Otherwise, extract the policy nodemask
1895 * for 'bind' or 'interleave' policy into the argument nodemask, or 1899 * for 'bind' or 'interleave' policy into the argument nodemask, or
1896 * initialize the argument nodemask to contain the single node for 1900 * initialize the argument nodemask to contain the single node for
1897 * 'preferred' or 'local' policy and return 'true' to indicate presence 1901 * 'preferred' or 'local' policy and return 'true' to indicate presence
1898 * of non-default mempolicy. 1902 * of non-default mempolicy.
1899 * 1903 *
1900 * We don't bother with reference counting the mempolicy [mpol_get/put] 1904 * We don't bother with reference counting the mempolicy [mpol_get/put]
1901 * because the current task is examining its own mempolicy and a task's 1905 * because the current task is examining its own mempolicy and a task's
1902 * mempolicy is only ever changed by the task itself. 1906 * mempolicy is only ever changed by the task itself.
1903 * 1907 *
1904 * N.B., it is the caller's responsibility to free a returned nodemask. 1908 * N.B., it is the caller's responsibility to free a returned nodemask.
1905 */ 1909 */
1906 bool init_nodemask_of_mempolicy(nodemask_t *mask) 1910 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1907 { 1911 {
1908 struct mempolicy *mempolicy; 1912 struct mempolicy *mempolicy;
1909 int nid; 1913 int nid;
1910 1914
1911 if (!(mask && current->mempolicy)) 1915 if (!(mask && current->mempolicy))
1912 return false; 1916 return false;
1913 1917
1914 task_lock(current); 1918 task_lock(current);
1915 mempolicy = current->mempolicy; 1919 mempolicy = current->mempolicy;
1916 switch (mempolicy->mode) { 1920 switch (mempolicy->mode) {
1917 case MPOL_PREFERRED: 1921 case MPOL_PREFERRED:
1918 if (mempolicy->flags & MPOL_F_LOCAL) 1922 if (mempolicy->flags & MPOL_F_LOCAL)
1919 nid = numa_node_id(); 1923 nid = numa_node_id();
1920 else 1924 else
1921 nid = mempolicy->v.preferred_node; 1925 nid = mempolicy->v.preferred_node;
1922 init_nodemask_of_node(mask, nid); 1926 init_nodemask_of_node(mask, nid);
1923 break; 1927 break;
1924 1928
1925 case MPOL_BIND: 1929 case MPOL_BIND:
1926 /* Fall through */ 1930 /* Fall through */
1927 case MPOL_INTERLEAVE: 1931 case MPOL_INTERLEAVE:
1928 *mask = mempolicy->v.nodes; 1932 *mask = mempolicy->v.nodes;
1929 break; 1933 break;
1930 1934
1931 default: 1935 default:
1932 BUG(); 1936 BUG();
1933 } 1937 }
1934 task_unlock(current); 1938 task_unlock(current);
1935 1939
1936 return true; 1940 return true;
1937 } 1941 }
1938 #endif 1942 #endif
1939 1943
1940 /* 1944 /*
1941 * mempolicy_nodemask_intersects 1945 * mempolicy_nodemask_intersects
1942 * 1946 *
1943 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default 1947 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1944 * policy. Otherwise, check for intersection between mask and the policy 1948 * policy. Otherwise, check for intersection between mask and the policy
1945 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local' 1949 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1946 * policy, always return true since it may allocate elsewhere on fallback. 1950 * policy, always return true since it may allocate elsewhere on fallback.
1947 * 1951 *
1948 * Takes task_lock(tsk) to prevent freeing of its mempolicy. 1952 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1949 */ 1953 */
1950 bool mempolicy_nodemask_intersects(struct task_struct *tsk, 1954 bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1951 const nodemask_t *mask) 1955 const nodemask_t *mask)
1952 { 1956 {
1953 struct mempolicy *mempolicy; 1957 struct mempolicy *mempolicy;
1954 bool ret = true; 1958 bool ret = true;
1955 1959
1956 if (!mask) 1960 if (!mask)
1957 return ret; 1961 return ret;
1958 task_lock(tsk); 1962 task_lock(tsk);
1959 mempolicy = tsk->mempolicy; 1963 mempolicy = tsk->mempolicy;
1960 if (!mempolicy) 1964 if (!mempolicy)
1961 goto out; 1965 goto out;
1962 1966
1963 switch (mempolicy->mode) { 1967 switch (mempolicy->mode) {
1964 case MPOL_PREFERRED: 1968 case MPOL_PREFERRED:
1965 /* 1969 /*
1966 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to 1970 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1967 * allocate from; they may fall back to other nodes when OOM. 1971 * allocate from; they may fall back to other nodes when OOM.
1968 * Thus, it's possible for tsk to have allocated memory from 1972 * Thus, it's possible for tsk to have allocated memory from
1969 * nodes in mask. 1973 * nodes in mask.
1970 */ 1974 */
1971 break; 1975 break;
1972 case MPOL_BIND: 1976 case MPOL_BIND:
1973 case MPOL_INTERLEAVE: 1977 case MPOL_INTERLEAVE:
1974 ret = nodes_intersects(mempolicy->v.nodes, *mask); 1978 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1975 break; 1979 break;
1976 default: 1980 default:
1977 BUG(); 1981 BUG();
1978 } 1982 }
1979 out: 1983 out:
1980 task_unlock(tsk); 1984 task_unlock(tsk);
1981 return ret; 1985 return ret;
1982 } 1986 }
1983 1987
1984 /* Allocate a page in interleaved policy. 1988 /* Allocate a page in interleaved policy.
1985 Own path because it needs to do special accounting. */ 1989 Own path because it needs to do special accounting. */
1986 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1990 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1987 unsigned nid) 1991 unsigned nid)
1988 { 1992 {
1989 struct zonelist *zl; 1993 struct zonelist *zl;
1990 struct page *page; 1994 struct page *page;
1991 1995
1992 zl = node_zonelist(nid, gfp); 1996 zl = node_zonelist(nid, gfp);
1993 page = __alloc_pages(gfp, order, zl); 1997 page = __alloc_pages(gfp, order, zl);
1994 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) 1998 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1995 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); 1999 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1996 return page; 2000 return page;
1997 } 2001 }
1998 2002
1999 /** 2003 /**
2000 * alloc_pages_vma - Allocate a page for a VMA. 2004 * alloc_pages_vma - Allocate a page for a VMA.
2001 * 2005 *
2002 * @gfp: 2006 * @gfp:
2003 * %GFP_USER user allocation. 2007 * %GFP_USER user allocation.
2004 * %GFP_KERNEL kernel allocations, 2008 * %GFP_KERNEL kernel allocations,
2005 * %GFP_HIGHMEM highmem/user allocations, 2009 * %GFP_HIGHMEM highmem/user allocations,
2006 * %GFP_FS allocation should not call back into a file system. 2010 * %GFP_FS allocation should not call back into a file system.
2007 * %GFP_ATOMIC don't sleep. 2011 * %GFP_ATOMIC don't sleep.
2008 * 2012 *
2009 * @order:Order of the GFP allocation. 2013 * @order:Order of the GFP allocation.
2010 * @vma: Pointer to VMA or NULL if not available. 2014 * @vma: Pointer to VMA or NULL if not available.
2011 * @addr: Virtual Address of the allocation. Must be inside the VMA. 2015 * @addr: Virtual Address of the allocation. Must be inside the VMA.
2012 * 2016 *
2013 * This function allocates a page from the kernel page pool and applies 2017 * This function allocates a page from the kernel page pool and applies
2014 * a NUMA policy associated with the VMA or the current process. 2018 * a NUMA policy associated with the VMA or the current process.
2015 * When VMA is not NULL caller must hold down_read on the mmap_sem of the 2019 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
2016 * mm_struct of the VMA to prevent it from going away. Should be used for 2020 * mm_struct of the VMA to prevent it from going away. Should be used for
2017 * all allocations for pages that will be mapped into 2021 * all allocations for pages that will be mapped into
2018 * user space. Returns NULL when no page can be allocated. 2022 * user space. Returns NULL when no page can be allocated.
2019 * 2023 *
2020 * Should be called with the mmap_sem of the vma held. 2024 * Should be called with the mmap_sem of the vma held.
2021 */ 2025 */
2022 struct page * 2026 struct page *
2023 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 2027 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2024 unsigned long addr, int node) 2028 unsigned long addr, int node)
2025 { 2029 {
2026 struct mempolicy *pol; 2030 struct mempolicy *pol;
2027 struct page *page; 2031 struct page *page;
2028 unsigned int cpuset_mems_cookie; 2032 unsigned int cpuset_mems_cookie;
2029 2033
2030 retry_cpuset: 2034 retry_cpuset:
2031 pol = get_vma_policy(current, vma, addr); 2035 pol = get_vma_policy(current, vma, addr);
2032 cpuset_mems_cookie = read_mems_allowed_begin(); 2036 cpuset_mems_cookie = read_mems_allowed_begin();
2033 2037
2034 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 2038 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
2035 unsigned nid; 2039 unsigned nid;
2036 2040
2037 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 2041 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2038 mpol_cond_put(pol); 2042 mpol_cond_put(pol);
2039 page = alloc_page_interleave(gfp, order, nid); 2043 page = alloc_page_interleave(gfp, order, nid);
2040 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2044 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2041 goto retry_cpuset; 2045 goto retry_cpuset;
2042 2046
2043 return page; 2047 return page;
2044 } 2048 }
2045 page = __alloc_pages_nodemask(gfp, order, 2049 page = __alloc_pages_nodemask(gfp, order,
2046 policy_zonelist(gfp, pol, node), 2050 policy_zonelist(gfp, pol, node),
2047 policy_nodemask(gfp, pol)); 2051 policy_nodemask(gfp, pol));
2048 if (unlikely(mpol_needs_cond_ref(pol))) 2052 if (unlikely(mpol_needs_cond_ref(pol)))
2049 __mpol_put(pol); 2053 __mpol_put(pol);
2050 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2054 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2051 goto retry_cpuset; 2055 goto retry_cpuset;
2052 return page; 2056 return page;
2053 } 2057 }
2054 2058
2055 /** 2059 /**
2056 * alloc_pages_current - Allocate pages. 2060 * alloc_pages_current - Allocate pages.
2057 * 2061 *
2058 * @gfp: 2062 * @gfp:
2059 * %GFP_USER user allocation, 2063 * %GFP_USER user allocation,
2060 * %GFP_KERNEL kernel allocation, 2064 * %GFP_KERNEL kernel allocation,
2061 * %GFP_HIGHMEM highmem allocation, 2065 * %GFP_HIGHMEM highmem allocation,
2062 * %GFP_FS don't call back into a file system. 2066 * %GFP_FS don't call back into a file system.
2063 * %GFP_ATOMIC don't sleep. 2067 * %GFP_ATOMIC don't sleep.
2064 * @order: Power of two of allocation size in pages. 0 is a single page. 2068 * @order: Power of two of allocation size in pages. 0 is a single page.
2065 * 2069 *
2066 * Allocate a page from the kernel page pool. When not in 2070 * Allocate a page from the kernel page pool. When not in
2067 * interrupt context, apply the current process's NUMA policy. 2071 * interrupt context, apply the current process's NUMA policy.
2068 * Returns NULL when no page can be allocated. 2072 * Returns NULL when no page can be allocated.
2069 * 2073 *
2070 * Don't call cpuset_update_task_memory_state() unless 2074 * Don't call cpuset_update_task_memory_state() unless
2071 * 1) it's ok to take cpuset_sem (can WAIT), and 2075 * 1) it's ok to take cpuset_sem (can WAIT), and
2072 * 2) allocating for current task (not interrupt). 2076 * 2) allocating for current task (not interrupt).
2073 */ 2077 */
2074 struct page *alloc_pages_current(gfp_t gfp, unsigned order) 2078 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2075 { 2079 {
2076 struct mempolicy *pol = get_task_policy(current); 2080 struct mempolicy *pol = get_task_policy(current);
2077 struct page *page; 2081 struct page *page;
2078 unsigned int cpuset_mems_cookie; 2082 unsigned int cpuset_mems_cookie;
2079 2083
2080 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 2084 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
2081 pol = &default_policy; 2085 pol = &default_policy;
2082 2086
2083 retry_cpuset: 2087 retry_cpuset:
2084 cpuset_mems_cookie = read_mems_allowed_begin(); 2088 cpuset_mems_cookie = read_mems_allowed_begin();
2085 2089
2086 /* 2090 /*
2087 * No reference counting needed for current->mempolicy 2091 * No reference counting needed for current->mempolicy
2088 * nor system default_policy 2092 * nor system default_policy
2089 */ 2093 */
2090 if (pol->mode == MPOL_INTERLEAVE) 2094 if (pol->mode == MPOL_INTERLEAVE)
2091 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 2095 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2092 else 2096 else
2093 page = __alloc_pages_nodemask(gfp, order, 2097 page = __alloc_pages_nodemask(gfp, order,
2094 policy_zonelist(gfp, pol, numa_node_id()), 2098 policy_zonelist(gfp, pol, numa_node_id()),
2095 policy_nodemask(gfp, pol)); 2099 policy_nodemask(gfp, pol));
2096 2100
2097 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2101 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2098 goto retry_cpuset; 2102 goto retry_cpuset;
2099 2103
2100 return page; 2104 return page;
2101 } 2105 }
2102 EXPORT_SYMBOL(alloc_pages_current); 2106 EXPORT_SYMBOL(alloc_pages_current);
2103 2107
2104 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) 2108 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2105 { 2109 {
2106 struct mempolicy *pol = mpol_dup(vma_policy(src)); 2110 struct mempolicy *pol = mpol_dup(vma_policy(src));
2107 2111
2108 if (IS_ERR(pol)) 2112 if (IS_ERR(pol))
2109 return PTR_ERR(pol); 2113 return PTR_ERR(pol);
2110 dst->vm_policy = pol; 2114 dst->vm_policy = pol;
2111 return 0; 2115 return 0;
2112 } 2116 }
2113 2117
2114 /* 2118 /*
2115 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2119 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2116 * rebinds the mempolicy it is copying by calling mpol_rebind_policy() 2120 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2117 * with the mems_allowed returned by cpuset_mems_allowed(). This 2121 * with the mems_allowed returned by cpuset_mems_allowed(). This
2118 * keeps mempolicies cpuset relative after its cpuset moves. See 2122 * keeps mempolicies cpuset relative after its cpuset moves. See
2119 * further kernel/cpuset.c update_nodemask(). 2123 * further kernel/cpuset.c update_nodemask().
2120 * 2124 *
2121 * current's mempolicy may be rebound by another task (the task that changes 2125 * current's mempolicy may be rebound by another task (the task that changes
2122 * the cpuset's mems), so we needn't do rebind work for the current task. 2126 * the cpuset's mems), so we needn't do rebind work for the current task.
2123 */ 2127 */
2124 2128
2125 /* Slow path of a mempolicy duplicate */ 2129 /* Slow path of a mempolicy duplicate */
2126 struct mempolicy *__mpol_dup(struct mempolicy *old) 2130 struct mempolicy *__mpol_dup(struct mempolicy *old)
2127 { 2131 {
2128 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2132 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2129 2133
2130 if (!new) 2134 if (!new)
2131 return ERR_PTR(-ENOMEM); 2135 return ERR_PTR(-ENOMEM);
2132 2136
2133 /* task's mempolicy is protected by alloc_lock */ 2137 /* task's mempolicy is protected by alloc_lock */
2134 if (old == current->mempolicy) { 2138 if (old == current->mempolicy) {
2135 task_lock(current); 2139 task_lock(current);
2136 *new = *old; 2140 *new = *old;
2137 task_unlock(current); 2141 task_unlock(current);
2138 } else 2142 } else
2139 *new = *old; 2143 *new = *old;
2140 2144
2141 rcu_read_lock(); 2145 rcu_read_lock();
2142 if (current_cpuset_is_being_rebound()) { 2146 if (current_cpuset_is_being_rebound()) {
2143 nodemask_t mems = cpuset_mems_allowed(current); 2147 nodemask_t mems = cpuset_mems_allowed(current);
2144 if (new->flags & MPOL_F_REBINDING) 2148 if (new->flags & MPOL_F_REBINDING)
2145 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); 2149 mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2146 else 2150 else
2147 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); 2151 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2148 } 2152 }
2149 rcu_read_unlock(); 2153 rcu_read_unlock();
2150 atomic_set(&new->refcnt, 1); 2154 atomic_set(&new->refcnt, 1);
2151 return new; 2155 return new;
2152 } 2156 }
2153 2157
2154 /* Slow path of a mempolicy comparison */ 2158 /* Slow path of a mempolicy comparison */
2155 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2159 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2156 { 2160 {
2157 if (!a || !b) 2161 if (!a || !b)
2158 return false; 2162 return false;
2159 if (a->mode != b->mode) 2163 if (a->mode != b->mode)
2160 return false; 2164 return false;
2161 if (a->flags != b->flags) 2165 if (a->flags != b->flags)
2162 return false; 2166 return false;
2163 if (mpol_store_user_nodemask(a)) 2167 if (mpol_store_user_nodemask(a))
2164 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 2168 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2165 return false; 2169 return false;
2166 2170
2167 switch (a->mode) { 2171 switch (a->mode) {
2168 case MPOL_BIND: 2172 case MPOL_BIND:
2169 /* Fall through */ 2173 /* Fall through */
2170 case MPOL_INTERLEAVE: 2174 case MPOL_INTERLEAVE:
2171 return !!nodes_equal(a->v.nodes, b->v.nodes); 2175 return !!nodes_equal(a->v.nodes, b->v.nodes);
2172 case MPOL_PREFERRED: 2176 case MPOL_PREFERRED:
2173 return a->v.preferred_node == b->v.preferred_node; 2177 return a->v.preferred_node == b->v.preferred_node;
2174 default: 2178 default:
2175 BUG(); 2179 BUG();
2176 return false; 2180 return false;
2177 } 2181 }
2178 } 2182 }
2179 2183
2180 /* 2184 /*
2181 * Shared memory backing store policy support. 2185 * Shared memory backing store policy support.
2182 * 2186 *
2183 * Remember policies even when nobody has shared memory mapped. 2187 * Remember policies even when nobody has shared memory mapped.
2184 * The policies are kept in Red-Black tree linked from the inode. 2188 * The policies are kept in Red-Black tree linked from the inode.
2185 * They are protected by the sp->lock spinlock, which should be held 2189 * They are protected by the sp->lock spinlock, which should be held
2186 * for any accesses to the tree. 2190 * for any accesses to the tree.
2187 */ 2191 */
2188 2192
2189 /* lookup first element intersecting start-end */ 2193 /* lookup first element intersecting start-end */
2190 /* Caller holds sp->lock */ 2194 /* Caller holds sp->lock */
2191 static struct sp_node * 2195 static struct sp_node *
2192 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2196 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2193 { 2197 {
2194 struct rb_node *n = sp->root.rb_node; 2198 struct rb_node *n = sp->root.rb_node;
2195 2199
2196 while (n) { 2200 while (n) {
2197 struct sp_node *p = rb_entry(n, struct sp_node, nd); 2201 struct sp_node *p = rb_entry(n, struct sp_node, nd);
2198 2202
2199 if (start >= p->end) 2203 if (start >= p->end)
2200 n = n->rb_right; 2204 n = n->rb_right;
2201 else if (end <= p->start) 2205 else if (end <= p->start)
2202 n = n->rb_left; 2206 n = n->rb_left;
2203 else 2207 else
2204 break; 2208 break;
2205 } 2209 }
2206 if (!n) 2210 if (!n)
2207 return NULL; 2211 return NULL;
2208 for (;;) { 2212 for (;;) {
2209 struct sp_node *w = NULL; 2213 struct sp_node *w = NULL;
2210 struct rb_node *prev = rb_prev(n); 2214 struct rb_node *prev = rb_prev(n);
2211 if (!prev) 2215 if (!prev)
2212 break; 2216 break;
2213 w = rb_entry(prev, struct sp_node, nd); 2217 w = rb_entry(prev, struct sp_node, nd);
2214 if (w->end <= start) 2218 if (w->end <= start)
2215 break; 2219 break;
2216 n = prev; 2220 n = prev;
2217 } 2221 }
2218 return rb_entry(n, struct sp_node, nd); 2222 return rb_entry(n, struct sp_node, nd);
2219 } 2223 }
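/*
 * Illustrative sketch (not part of this file): the descent above uses the
 * standard half-open interval test.  [start, end) and [p->start, p->end)
 * intersect exactly when neither lies wholly before the other:
 */
static bool sp_ranges_overlap(unsigned long start, unsigned long end,
			      unsigned long p_start, unsigned long p_end)
{
	return start < p_end && end > p_start;
}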
2220 2224
2221 /* Insert a new shared policy into the list. */ 2225 /* Insert a new shared policy into the list. */
2222 /* Caller holds sp->lock */ 2226 /* Caller holds sp->lock */
2223 static void sp_insert(struct shared_policy *sp, struct sp_node *new) 2227 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2224 { 2228 {
2225 struct rb_node **p = &sp->root.rb_node; 2229 struct rb_node **p = &sp->root.rb_node;
2226 struct rb_node *parent = NULL; 2230 struct rb_node *parent = NULL;
2227 struct sp_node *nd; 2231 struct sp_node *nd;
2228 2232
2229 while (*p) { 2233 while (*p) {
2230 parent = *p; 2234 parent = *p;
2231 nd = rb_entry(parent, struct sp_node, nd); 2235 nd = rb_entry(parent, struct sp_node, nd);
2232 if (new->start < nd->start) 2236 if (new->start < nd->start)
2233 p = &(*p)->rb_left; 2237 p = &(*p)->rb_left;
2234 else if (new->end > nd->end) 2238 else if (new->end > nd->end)
2235 p = &(*p)->rb_right; 2239 p = &(*p)->rb_right;
2236 else 2240 else
2237 BUG(); 2241 BUG();
2238 } 2242 }
2239 rb_link_node(&new->nd, parent, p); 2243 rb_link_node(&new->nd, parent, p);
2240 rb_insert_color(&new->nd, &sp->root); 2244 rb_insert_color(&new->nd, &sp->root);
2241 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, 2245 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2242 new->policy ? new->policy->mode : 0); 2246 new->policy ? new->policy->mode : 0);
2243 } 2247 }
2244 2248
2245 /* Find shared policy intersecting idx */ 2249 /* Find shared policy intersecting idx */
2246 struct mempolicy * 2250 struct mempolicy *
2247 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) 2251 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2248 { 2252 {
2249 struct mempolicy *pol = NULL; 2253 struct mempolicy *pol = NULL;
2250 struct sp_node *sn; 2254 struct sp_node *sn;
2251 2255
2252 if (!sp->root.rb_node) 2256 if (!sp->root.rb_node)
2253 return NULL; 2257 return NULL;
2254 spin_lock(&sp->lock); 2258 spin_lock(&sp->lock);
2255 sn = sp_lookup(sp, idx, idx+1); 2259 sn = sp_lookup(sp, idx, idx+1);
2256 if (sn) { 2260 if (sn) {
2257 mpol_get(sn->policy); 2261 mpol_get(sn->policy);
2258 pol = sn->policy; 2262 pol = sn->policy;
2259 } 2263 }
2260 spin_unlock(&sp->lock); 2264 spin_unlock(&sp->lock);
2261 return pol; 2265 return pol;
2262 } 2266 }
2263 2267
2264 static void sp_free(struct sp_node *n) 2268 static void sp_free(struct sp_node *n)
2265 { 2269 {
2266 mpol_put(n->policy); 2270 mpol_put(n->policy);
2267 kmem_cache_free(sn_cache, n); 2271 kmem_cache_free(sn_cache, n);
2268 } 2272 }
2269 2273
2270 /** 2274 /**
2271 * mpol_misplaced - check whether current page node is valid in policy 2275 * mpol_misplaced - check whether current page node is valid in policy
2272 * 2276 *
2273 * @page - page to be checked 2277 * @page - page to be checked
2274 * @vma - vm area where page mapped 2278 * @vma - vm area where page mapped
2275 * @addr - virtual address where page mapped 2279 * @addr - virtual address where page mapped
2276 * 2280 *
2277 * Look up the current policy node id for vma,addr and compare it to the page's 2281 * Look up the current policy node id for vma,addr and compare it to the page's
2278 * node id. 2282 * node id.
2279 * 2283 *
2280 * Returns: 2284 * Returns:
2281 * -1 - not misplaced, page is in the right node 2285 * -1 - not misplaced, page is in the right node
2282 * node - node id where the page should be 2286 * node - node id where the page should be
2283 * 2287 *
2284 * Policy determination "mimics" alloc_page_vma(). 2288 * Policy determination "mimics" alloc_page_vma().
2285 * Called from fault path where we know the vma and faulting address. 2289 * Called from fault path where we know the vma and faulting address.
2286 */ 2290 */
2287 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) 2291 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2288 { 2292 {
2289 struct mempolicy *pol; 2293 struct mempolicy *pol;
2290 struct zone *zone; 2294 struct zone *zone;
2291 int curnid = page_to_nid(page); 2295 int curnid = page_to_nid(page);
2292 unsigned long pgoff; 2296 unsigned long pgoff;
2293 int thiscpu = raw_smp_processor_id(); 2297 int thiscpu = raw_smp_processor_id();
2294 int thisnid = cpu_to_node(thiscpu); 2298 int thisnid = cpu_to_node(thiscpu);
2295 int polnid = -1; 2299 int polnid = -1;
2296 int ret = -1; 2300 int ret = -1;
2297 2301
2298 BUG_ON(!vma); 2302 BUG_ON(!vma);
2299 2303
2300 pol = get_vma_policy(current, vma, addr); 2304 pol = get_vma_policy(current, vma, addr);
2301 if (!(pol->flags & MPOL_F_MOF)) 2305 if (!(pol->flags & MPOL_F_MOF))
2302 goto out; 2306 goto out;
2303 2307
2304 switch (pol->mode) { 2308 switch (pol->mode) {
2305 case MPOL_INTERLEAVE: 2309 case MPOL_INTERLEAVE:
2306 BUG_ON(addr >= vma->vm_end); 2310 BUG_ON(addr >= vma->vm_end);
2307 BUG_ON(addr < vma->vm_start); 2311 BUG_ON(addr < vma->vm_start);
2308 2312
2309 pgoff = vma->vm_pgoff; 2313 pgoff = vma->vm_pgoff;
2310 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; 2314 pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2311 polnid = offset_il_node(pol, vma, pgoff); 2315 polnid = offset_il_node(pol, vma, pgoff);
2312 break; 2316 break;
2313 2317
2314 case MPOL_PREFERRED: 2318 case MPOL_PREFERRED:
2315 if (pol->flags & MPOL_F_LOCAL) 2319 if (pol->flags & MPOL_F_LOCAL)
2316 polnid = numa_node_id(); 2320 polnid = numa_node_id();
2317 else 2321 else
2318 polnid = pol->v.preferred_node; 2322 polnid = pol->v.preferred_node;
2319 break; 2323 break;
2320 2324
2321 case MPOL_BIND: 2325 case MPOL_BIND:
2322 /* 2326 /*
2323 * allows binding to multiple nodes. 2327 * allows binding to multiple nodes.
2324 * use current page if in policy nodemask, 2328 * use current page if in policy nodemask,
2325 * else select nearest allowed node, if any. 2329 * else select nearest allowed node, if any.
2326 * If no allowed nodes, use current [!misplaced]. 2330 * If no allowed nodes, use current [!misplaced].
2327 */ 2331 */
2328 if (node_isset(curnid, pol->v.nodes)) 2332 if (node_isset(curnid, pol->v.nodes))
2329 goto out; 2333 goto out;
2330 (void)first_zones_zonelist( 2334 (void)first_zones_zonelist(
2331 node_zonelist(numa_node_id(), GFP_HIGHUSER), 2335 node_zonelist(numa_node_id(), GFP_HIGHUSER),
2332 gfp_zone(GFP_HIGHUSER), 2336 gfp_zone(GFP_HIGHUSER),
2333 &pol->v.nodes, &zone); 2337 &pol->v.nodes, &zone);
2334 polnid = zone->node; 2338 polnid = zone->node;
2335 break; 2339 break;
2336 2340
2337 default: 2341 default:
2338 BUG(); 2342 BUG();
2339 } 2343 }
2340 2344
2341 /* Migrate the page towards the node whose CPU is referencing it */ 2345 /* Migrate the page towards the node whose CPU is referencing it */
2342 if (pol->flags & MPOL_F_MORON) { 2346 if (pol->flags & MPOL_F_MORON) {
2343 polnid = thisnid; 2347 polnid = thisnid;
2344 2348
2345 if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) 2349 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2346 goto out; 2350 goto out;
2347 } 2351 }
2348 2352
2349 if (curnid != polnid) 2353 if (curnid != polnid)
2350 ret = polnid; 2354 ret = polnid;
2351 out: 2355 out:
2352 mpol_cond_put(pol); 2356 mpol_cond_put(pol);
2353 2357
2354 return ret; 2358 return ret;
2355 } 2359 }
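/*
 * Illustrative sketch (not part of this file) of the return convention
 * documented above, loosely modelled on the NUMA hinting fault path in
 * mm/memory.c: -1 means the page may stay where it is, otherwise the
 * returned node is handed to the migration code.
 */
static int numa_hint_fault(struct page *page, struct vm_area_struct *vma,
			   unsigned long addr)
{
	int target_nid = mpol_misplaced(page, vma, addr);

	if (target_nid == -1)		/* already on an allowed node */
		return 0;
	return migrate_misplaced_page(page, vma, target_nid);
}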
2356 2360
2357 static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2361 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2358 { 2362 {
2359 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2363 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2360 rb_erase(&n->nd, &sp->root); 2364 rb_erase(&n->nd, &sp->root);
2361 sp_free(n); 2365 sp_free(n);
2362 } 2366 }
2363 2367
2364 static void sp_node_init(struct sp_node *node, unsigned long start, 2368 static void sp_node_init(struct sp_node *node, unsigned long start,
2365 unsigned long end, struct mempolicy *pol) 2369 unsigned long end, struct mempolicy *pol)
2366 { 2370 {
2367 node->start = start; 2371 node->start = start;
2368 node->end = end; 2372 node->end = end;
2369 node->policy = pol; 2373 node->policy = pol;
2370 } 2374 }
2371 2375
2372 static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2376 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2373 struct mempolicy *pol) 2377 struct mempolicy *pol)
2374 { 2378 {
2375 struct sp_node *n; 2379 struct sp_node *n;
2376 struct mempolicy *newpol; 2380 struct mempolicy *newpol;
2377 2381
2378 n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2382 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2379 if (!n) 2383 if (!n)
2380 return NULL; 2384 return NULL;
2381 2385
2382 newpol = mpol_dup(pol); 2386 newpol = mpol_dup(pol);
2383 if (IS_ERR(newpol)) { 2387 if (IS_ERR(newpol)) {
2384 kmem_cache_free(sn_cache, n); 2388 kmem_cache_free(sn_cache, n);
2385 return NULL; 2389 return NULL;
2386 } 2390 }
2387 newpol->flags |= MPOL_F_SHARED; 2391 newpol->flags |= MPOL_F_SHARED;
2388 sp_node_init(n, start, end, newpol); 2392 sp_node_init(n, start, end, newpol);
2389 2393
2390 return n; 2394 return n;
2391 } 2395 }
2392 2396
2393 /* Replace a policy range. */ 2397 /* Replace a policy range. */
2394 static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2398 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2395 unsigned long end, struct sp_node *new) 2399 unsigned long end, struct sp_node *new)
2396 { 2400 {
2397 struct sp_node *n; 2401 struct sp_node *n;
2398 struct sp_node *n_new = NULL; 2402 struct sp_node *n_new = NULL;
2399 struct mempolicy *mpol_new = NULL; 2403 struct mempolicy *mpol_new = NULL;
2400 int ret = 0; 2404 int ret = 0;
2401 2405
2402 restart: 2406 restart:
2403 spin_lock(&sp->lock); 2407 spin_lock(&sp->lock);
2404 n = sp_lookup(sp, start, end); 2408 n = sp_lookup(sp, start, end);
2405 /* Take care of old policies in the same range. */ 2409 /* Take care of old policies in the same range. */
2406 while (n && n->start < end) { 2410 while (n && n->start < end) {
2407 struct rb_node *next = rb_next(&n->nd); 2411 struct rb_node *next = rb_next(&n->nd);
2408 if (n->start >= start) { 2412 if (n->start >= start) {
2409 if (n->end <= end) 2413 if (n->end <= end)
2410 sp_delete(sp, n); 2414 sp_delete(sp, n);
2411 else 2415 else
2412 n->start = end; 2416 n->start = end;
2413 } else { 2417 } else {
2414 /* Old policy spanning whole new range. */ 2418 /* Old policy spanning whole new range. */
2415 if (n->end > end) { 2419 if (n->end > end) {
2416 if (!n_new) 2420 if (!n_new)
2417 goto alloc_new; 2421 goto alloc_new;
2418 2422
2419 *mpol_new = *n->policy; 2423 *mpol_new = *n->policy;
2420 atomic_set(&mpol_new->refcnt, 1); 2424 atomic_set(&mpol_new->refcnt, 1);
2421 sp_node_init(n_new, end, n->end, mpol_new); 2425 sp_node_init(n_new, end, n->end, mpol_new);
2422 n->end = start; 2426 n->end = start;
2423 sp_insert(sp, n_new); 2427 sp_insert(sp, n_new);
2424 n_new = NULL; 2428 n_new = NULL;
2425 mpol_new = NULL; 2429 mpol_new = NULL;
2426 break; 2430 break;
2427 } else 2431 } else
2428 n->end = start; 2432 n->end = start;
2429 } 2433 }
2430 if (!next) 2434 if (!next)
2431 break; 2435 break;
2432 n = rb_entry(next, struct sp_node, nd); 2436 n = rb_entry(next, struct sp_node, nd);
2433 } 2437 }
2434 if (new) 2438 if (new)
2435 sp_insert(sp, new); 2439 sp_insert(sp, new);
2436 spin_unlock(&sp->lock); 2440 spin_unlock(&sp->lock);
2437 ret = 0; 2441 ret = 0;
2438 2442
2439 err_out: 2443 err_out:
2440 if (mpol_new) 2444 if (mpol_new)
2441 mpol_put(mpol_new); 2445 mpol_put(mpol_new);
2442 if (n_new) 2446 if (n_new)
2443 kmem_cache_free(sn_cache, n_new); 2447 kmem_cache_free(sn_cache, n_new);
2444 2448
2445 return ret; 2449 return ret;
2446 2450
2447 alloc_new: 2451 alloc_new:
2448 spin_unlock(&sp->lock); 2452 spin_unlock(&sp->lock);
2449 ret = -ENOMEM; 2453 ret = -ENOMEM;
2450 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2454 n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2451 if (!n_new) 2455 if (!n_new)
2452 goto err_out; 2456 goto err_out;
2453 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 2457 mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2454 if (!mpol_new) 2458 if (!mpol_new)
2455 goto err_out; 2459 goto err_out;
2456 goto restart; 2460 goto restart;
2457 } 2461 }
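
The restart/alloc_new dance in shared_policy_replace() exists because sp->lock is a spinlock while kmem_cache_alloc(GFP_KERNEL) may sleep: when a split node is needed, the lock is dropped, both objects are preallocated, and the whole lookup is redone because the tree may have changed in the meantime. Below is a minimal userspace sketch of the same drop-lock/allocate/retry shape, using pthreads and hypothetical names (need_spare, the 64-byte allocation) purely for illustration:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
static int need_spare = 1;	/* pretend the first pass finds a range that must be split */

static int replace_range(long start, long end)
{
	void *spare = NULL;

restart:
	pthread_mutex_lock(&tree_lock);
	/* walk the tree; if an old range fully spans [start, end) we need one
	 * extra node to split it, and that must not be allocated under the lock */
	if (need_spare && !spare) {
		pthread_mutex_unlock(&tree_lock);
		spare = malloc(64);	/* may block: done with the lock dropped */
		if (!spare)
			return -1;	/* mirrors the -ENOMEM path via err_out */
		goto restart;		/* the tree may have changed meanwhile */
	}
	/* ... trim/split the old ranges and insert the new one here ... */
	pthread_mutex_unlock(&tree_lock);
	free(spare);			/* unused preallocations are freed, as err_out does */
	printf("replaced [%ld, %ld)\n", start, end);
	return 0;
}

int main(void)
{
	return replace_range(0, 4096) ? 1 : 0;
}

The property mirrored here is that nothing decided while the lock was dropped is trusted: the lookup restarts from scratch, and leftover preallocations are released on the way out.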
2458 2462
2459 /** 2463 /**
2460 * mpol_shared_policy_init - initialize shared policy for inode 2464 * mpol_shared_policy_init - initialize shared policy for inode
2461 * @sp: pointer to inode shared policy 2465 * @sp: pointer to inode shared policy
2462 * @mpol: struct mempolicy to install 2466 * @mpol: struct mempolicy to install
2463 * 2467 *
2464 * Install non-NULL @mpol in inode's shared policy rb-tree. 2468 * Install non-NULL @mpol in inode's shared policy rb-tree.
2465 * On entry, the current task has a reference on a non-NULL @mpol. 2469 * On entry, the current task has a reference on a non-NULL @mpol.
2466 * This must be released on exit. 2470 * This must be released on exit.
2467 * This is called during get_inode(), so we can use GFP_KERNEL. 2471 * This is called during get_inode(), so we can use GFP_KERNEL.
2468 */ 2472 */
2469 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) 2473 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2470 { 2474 {
2471 int ret; 2475 int ret;
2472 2476
2473 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2477 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2474 spin_lock_init(&sp->lock); 2478 spin_lock_init(&sp->lock);
2475 2479
2476 if (mpol) { 2480 if (mpol) {
2477 struct vm_area_struct pvma; 2481 struct vm_area_struct pvma;
2478 struct mempolicy *new; 2482 struct mempolicy *new;
2479 NODEMASK_SCRATCH(scratch); 2483 NODEMASK_SCRATCH(scratch);
2480 2484
2481 if (!scratch) 2485 if (!scratch)
2482 goto put_mpol; 2486 goto put_mpol;
2483 /* contextualize the tmpfs mount point mempolicy */ 2487 /* contextualize the tmpfs mount point mempolicy */
2484 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); 2488 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2485 if (IS_ERR(new)) 2489 if (IS_ERR(new))
2486 goto free_scratch; /* no valid nodemask intersection */ 2490 goto free_scratch; /* no valid nodemask intersection */
2487 2491
2488 task_lock(current); 2492 task_lock(current);
2489 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); 2493 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2490 task_unlock(current); 2494 task_unlock(current);
2491 if (ret) 2495 if (ret)
2492 goto put_new; 2496 goto put_new;
2493 2497
2494 /* Create pseudo-vma that contains just the policy */ 2498 /* Create pseudo-vma that contains just the policy */
2495 memset(&pvma, 0, sizeof(struct vm_area_struct)); 2499 memset(&pvma, 0, sizeof(struct vm_area_struct));
2496 pvma.vm_end = TASK_SIZE; /* policy covers entire file */ 2500 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
2497 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 2501 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2498 2502
2499 put_new: 2503 put_new:
2500 mpol_put(new); /* drop initial ref */ 2504 mpol_put(new); /* drop initial ref */
2501 free_scratch: 2505 free_scratch:
2502 NODEMASK_SCRATCH_FREE(scratch); 2506 NODEMASK_SCRATCH_FREE(scratch);
2503 put_mpol: 2507 put_mpol:
2504 mpol_put(mpol); /* drop our incoming ref on sb mpol */ 2508 mpol_put(mpol); /* drop our incoming ref on sb mpol */
2505 } 2509 }
2506 } 2510 }
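
A caller such as a tmpfs-style filesystem is expected to hand over its reference on the mount-time policy here; mpol_shared_policy_init() drops that reference itself (the put_mpol label above) and also accepts NULL, leaving the tree empty. The following is a hedged, kernel-context sketch of such a caller; my_inode_info and my_fs_init_inode_policy are hypothetical names:

/* Hypothetical caller sketch: a filesystem passes a reference on the
 * superblock's mempolicy to the new inode's shared-policy tree.  A NULL
 * policy is fine too: the rb-tree then stays empty (default policy). */
struct my_inode_info {
	struct shared_policy policy;	/* per-inode policy rb-tree */
	/* ... other per-inode state ... */
};

static void my_fs_init_inode_policy(struct my_inode_info *info,
				    struct mempolicy *mount_mpol)
{
	if (mount_mpol)
		mpol_get(mount_mpol);	/* take the reference that init will drop */
	mpol_shared_policy_init(&info->policy, mount_mpol);
}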
2507 2511
2508 int mpol_set_shared_policy(struct shared_policy *info, 2512 int mpol_set_shared_policy(struct shared_policy *info,
2509 struct vm_area_struct *vma, struct mempolicy *npol) 2513 struct vm_area_struct *vma, struct mempolicy *npol)
2510 { 2514 {
2511 int err; 2515 int err;
2512 struct sp_node *new = NULL; 2516 struct sp_node *new = NULL;
2513 unsigned long sz = vma_pages(vma); 2517 unsigned long sz = vma_pages(vma);
2514 2518
2515 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", 2519 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2516 vma->vm_pgoff, 2520 vma->vm_pgoff,
2517 sz, npol ? npol->mode : -1, 2521 sz, npol ? npol->mode : -1,
2518 npol ? npol->flags : -1, 2522 npol ? npol->flags : -1,
2519 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); 2523 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2520 2524
2521 if (npol) { 2525 if (npol) {
2522 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); 2526 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2523 if (!new) 2527 if (!new)
2524 return -ENOMEM; 2528 return -ENOMEM;
2525 } 2529 }
2526 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2530 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2527 if (err && new) 2531 if (err && new)
2528 sp_free(new); 2532 sp_free(new);
2529 return err; 2533 return err;
2530 } 2534 }
2531 2535
2532 /* Free a backing policy store on inode delete. */ 2536 /* Free a backing policy store on inode delete. */
2533 void mpol_free_shared_policy(struct shared_policy *p) 2537 void mpol_free_shared_policy(struct shared_policy *p)
2534 { 2538 {
2535 struct sp_node *n; 2539 struct sp_node *n;
2536 struct rb_node *next; 2540 struct rb_node *next;
2537 2541
2538 if (!p->root.rb_node) 2542 if (!p->root.rb_node)
2539 return; 2543 return;
2540 spin_lock(&p->lock); 2544 spin_lock(&p->lock);
2541 next = rb_first(&p->root); 2545 next = rb_first(&p->root);
2542 while (next) { 2546 while (next) {
2543 n = rb_entry(next, struct sp_node, nd); 2547 n = rb_entry(next, struct sp_node, nd);
2544 next = rb_next(&n->nd); 2548 next = rb_next(&n->nd);
2545 sp_delete(p, n); 2549 sp_delete(p, n);
2546 } 2550 }
2547 spin_unlock(&p->lock); 2551 spin_unlock(&p->lock);
2548 } 2552 }
2549 2553
2550 #ifdef CONFIG_NUMA_BALANCING 2554 #ifdef CONFIG_NUMA_BALANCING
2551 static int __initdata numabalancing_override; 2555 static int __initdata numabalancing_override;
2552 2556
2553 static void __init check_numabalancing_enable(void) 2557 static void __init check_numabalancing_enable(void)
2554 { 2558 {
2555 bool numabalancing_default = false; 2559 bool numabalancing_default = false;
2556 2560
2557 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) 2561 if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2558 numabalancing_default = true; 2562 numabalancing_default = true;
2559 2563
2560 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ 2564 /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2561 if (numabalancing_override) 2565 if (numabalancing_override)
2562 set_numabalancing_state(numabalancing_override == 1); 2566 set_numabalancing_state(numabalancing_override == 1);
2563 2567
2564 if (nr_node_ids > 1 && !numabalancing_override) { 2568 if (nr_node_ids > 1 && !numabalancing_override) {
2565 pr_info("%s automatic NUMA balancing. " 2569 pr_info("%s automatic NUMA balancing. "
2566 "Configure with numa_balancing= or the " 2570 "Configure with numa_balancing= or the "
2567 "kernel.numa_balancing sysctl", 2571 "kernel.numa_balancing sysctl",
2568 numabalancing_default ? "Enabling" : "Disabling"); 2572 numabalancing_default ? "Enabling" : "Disabling");
2569 set_numabalancing_state(numabalancing_default); 2573 set_numabalancing_state(numabalancing_default);
2570 } 2574 }
2571 } 2575 }
2572 2576
2573 static int __init setup_numabalancing(char *str) 2577 static int __init setup_numabalancing(char *str)
2574 { 2578 {
2575 int ret = 0; 2579 int ret = 0;
2576 if (!str) 2580 if (!str)
2577 goto out; 2581 goto out;
2578 2582
2579 if (!strcmp(str, "enable")) { 2583 if (!strcmp(str, "enable")) {
2580 numabalancing_override = 1; 2584 numabalancing_override = 1;
2581 ret = 1; 2585 ret = 1;
2582 } else if (!strcmp(str, "disable")) { 2586 } else if (!strcmp(str, "disable")) {
2583 numabalancing_override = -1; 2587 numabalancing_override = -1;
2584 ret = 1; 2588 ret = 1;
2585 } 2589 }
2586 out: 2590 out:
2587 if (!ret) 2591 if (!ret)
2588 pr_warn("Unable to parse numa_balancing=\n"); 2592 pr_warn("Unable to parse numa_balancing=\n");
2589 2593
2590 return ret; 2594 return ret;
2591 } 2595 }
2592 __setup("numa_balancing=", setup_numabalancing); 2596 __setup("numa_balancing=", setup_numabalancing);
2593 #else 2597 #else
2594 static inline void __init check_numabalancing_enable(void) 2598 static inline void __init check_numabalancing_enable(void)
2595 { 2599 {
2596 } 2600 }
2597 #endif /* CONFIG_NUMA_BALANCING */ 2601 #endif /* CONFIG_NUMA_BALANCING */
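
The boot parameter above is one of two ways to flip the same switch; the state it controls is also reachable through the kernel.numa_balancing sysctl that the pr_info message points at. As a small userspace illustration (the helper name is hypothetical), toggling it at runtime looks like this:

#include <stdio.h>

/* Toggle automatic NUMA balancing at runtime via the sysctl mentioned by
 * check_numabalancing_enable().  Returns 0 on success. */
static int set_numa_balancing(int enable)
{
	FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

	if (!f)
		return -1;	/* no CONFIG_NUMA_BALANCING, or insufficient privileges */
	fprintf(f, "%d\n", enable ? 1 : 0);
	return fclose(f);
}

int main(void)
{
	return set_numa_balancing(0) ? 1 : 0;	/* disable, like numa_balancing=disable at boot */
}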
2598 2602
2599 /* assumes fs == KERNEL_DS */ 2603 /* assumes fs == KERNEL_DS */
2600 void __init numa_policy_init(void) 2604 void __init numa_policy_init(void)
2601 { 2605 {
2602 nodemask_t interleave_nodes; 2606 nodemask_t interleave_nodes;
2603 unsigned long largest = 0; 2607 unsigned long largest = 0;
2604 int nid, prefer = 0; 2608 int nid, prefer = 0;
2605 2609
2606 policy_cache = kmem_cache_create("numa_policy", 2610 policy_cache = kmem_cache_create("numa_policy",
2607 sizeof(struct mempolicy), 2611 sizeof(struct mempolicy),
2608 0, SLAB_PANIC, NULL); 2612 0, SLAB_PANIC, NULL);
2609 2613
2610 sn_cache = kmem_cache_create("shared_policy_node", 2614 sn_cache = kmem_cache_create("shared_policy_node",
2611 sizeof(struct sp_node), 2615 sizeof(struct sp_node),
2612 0, SLAB_PANIC, NULL); 2616 0, SLAB_PANIC, NULL);
2613 2617
2614 for_each_node(nid) { 2618 for_each_node(nid) {
2615 preferred_node_policy[nid] = (struct mempolicy) { 2619 preferred_node_policy[nid] = (struct mempolicy) {
2616 .refcnt = ATOMIC_INIT(1), 2620 .refcnt = ATOMIC_INIT(1),
2617 .mode = MPOL_PREFERRED, 2621 .mode = MPOL_PREFERRED,
2618 .flags = MPOL_F_MOF | MPOL_F_MORON, 2622 .flags = MPOL_F_MOF | MPOL_F_MORON,
2619 .v = { .preferred_node = nid, }, 2623 .v = { .preferred_node = nid, },
2620 }; 2624 };
2621 } 2625 }
2622 2626
2623 /* 2627 /*
2624 * Set interleaving policy for system init. Interleaving is only 2628 * Set interleaving policy for system init. Interleaving is only
2625 * enabled across suitably sized nodes (default is >= 16MB), or 2629 * enabled across suitably sized nodes (default is >= 16MB), or
2626 * fall back to the largest node if they're all smaller. 2630 * fall back to the largest node if they're all smaller.
2627 */ 2631 */
2628 nodes_clear(interleave_nodes); 2632 nodes_clear(interleave_nodes);
2629 for_each_node_state(nid, N_MEMORY) { 2633 for_each_node_state(nid, N_MEMORY) {
2630 unsigned long total_pages = node_present_pages(nid); 2634 unsigned long total_pages = node_present_pages(nid);
2631 2635
2632 /* Preserve the largest node */ 2636 /* Preserve the largest node */
2633 if (largest < total_pages) { 2637 if (largest < total_pages) {
2634 largest = total_pages; 2638 largest = total_pages;
2635 prefer = nid; 2639 prefer = nid;
2636 } 2640 }
2637 2641
2638 /* Interleave this node? */ 2642 /* Interleave this node? */
2639 if ((total_pages << PAGE_SHIFT) >= (16 << 20)) 2643 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2640 node_set(nid, interleave_nodes); 2644 node_set(nid, interleave_nodes);
2641 } 2645 }
2642 2646
2643 /* All too small, use the largest */ 2647 /* All too small, use the largest */
2644 if (unlikely(nodes_empty(interleave_nodes))) 2648 if (unlikely(nodes_empty(interleave_nodes)))
2645 node_set(prefer, interleave_nodes); 2649 node_set(prefer, interleave_nodes);
2646 2650
2647 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) 2651 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2648 printk("numa_policy_init: interleaving failed\n"); 2652 printk("numa_policy_init: interleaving failed\n");
2649 2653
2650 check_numabalancing_enable(); 2654 check_numabalancing_enable();
2651 } 2655 }
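
The interleave threshold above is expressed in bytes: a node joins the boot-time interleave set only if node_present_pages(nid) << PAGE_SHIFT is at least 16 MiB. Assuming the common 4 KiB page size (PAGE_SHIFT == 12), that works out to 4096 present pages per node, as this trivial userspace check shows:

#include <stdio.h>

int main(void)
{
	const unsigned long page_shift = 12;		/* assumed 4 KiB pages */
	const unsigned long threshold = 16UL << 20;	/* 16 MiB, as in numa_policy_init() */

	/* minimum node_present_pages() for a node to be interleaved at boot */
	printf("min present pages per node: %lu\n", threshold >> page_shift);	/* 4096 */
	return 0;
}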
2652 2656
2653 /* Reset policy of current process to default */ 2657 /* Reset policy of current process to default */
2654 void numa_default_policy(void) 2658 void numa_default_policy(void)
2655 { 2659 {
2656 do_set_mempolicy(MPOL_DEFAULT, 0, NULL); 2660 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2657 } 2661 }
2658 2662
2659 /* 2663 /*
2660 * Parse and format mempolicy from/to strings 2664 * Parse and format mempolicy from/to strings
2661 */ 2665 */
2662 2666
2663 /* 2667 /*
2664 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. 2668 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2665 */ 2669 */
2666 static const char * const policy_modes[] = 2670 static const char * const policy_modes[] =
2667 { 2671 {
2668 [MPOL_DEFAULT] = "default", 2672 [MPOL_DEFAULT] = "default",
2669 [MPOL_PREFERRED] = "prefer", 2673 [MPOL_PREFERRED] = "prefer",
2670 [MPOL_BIND] = "bind", 2674 [MPOL_BIND] = "bind",
2671 [MPOL_INTERLEAVE] = "interleave", 2675 [MPOL_INTERLEAVE] = "interleave",
2672 [MPOL_LOCAL] = "local", 2676 [MPOL_LOCAL] = "local",
2673 }; 2677 };
2674 2678
2675 2679
2676 #ifdef CONFIG_TMPFS 2680 #ifdef CONFIG_TMPFS
2677 /** 2681 /**
2678 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. 2682 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2679 * @str: string containing mempolicy to parse 2683 * @str: string containing mempolicy to parse
2680 * @mpol: pointer to struct mempolicy pointer, returned on success. 2684 * @mpol: pointer to struct mempolicy pointer, returned on success.
2681 * 2685 *
2682 * Format of input: 2686 * Format of input:
2683 * <mode>[=<flags>][:<nodelist>] 2687 * <mode>[=<flags>][:<nodelist>]
2684 * 2688 *
2685 * On success, returns 0, else 1 2689 * On success, returns 0, else 1
2686 */ 2690 */
2687 int mpol_parse_str(char *str, struct mempolicy **mpol) 2691 int mpol_parse_str(char *str, struct mempolicy **mpol)
2688 { 2692 {
2689 struct mempolicy *new = NULL; 2693 struct mempolicy *new = NULL;
2690 unsigned short mode; 2694 unsigned short mode;
2691 unsigned short mode_flags; 2695 unsigned short mode_flags;
2692 nodemask_t nodes; 2696 nodemask_t nodes;
2693 char *nodelist = strchr(str, ':'); 2697 char *nodelist = strchr(str, ':');
2694 char *flags = strchr(str, '='); 2698 char *flags = strchr(str, '=');
2695 int err = 1; 2699 int err = 1;
2696 2700
2697 if (nodelist) { 2701 if (nodelist) {
2698 /* NUL-terminate mode or flags string */ 2702 /* NUL-terminate mode or flags string */
2699 *nodelist++ = '\0'; 2703 *nodelist++ = '\0';
2700 if (nodelist_parse(nodelist, nodes)) 2704 if (nodelist_parse(nodelist, nodes))
2701 goto out; 2705 goto out;
2702 if (!nodes_subset(nodes, node_states[N_MEMORY])) 2706 if (!nodes_subset(nodes, node_states[N_MEMORY]))
2703 goto out; 2707 goto out;
2704 } else 2708 } else
2705 nodes_clear(nodes); 2709 nodes_clear(nodes);
2706 2710
2707 if (flags) 2711 if (flags)
2708 *flags++ = '\0'; /* terminate mode string */ 2712 *flags++ = '\0'; /* terminate mode string */
2709 2713
2710 for (mode = 0; mode < MPOL_MAX; mode++) { 2714 for (mode = 0; mode < MPOL_MAX; mode++) {
2711 if (!strcmp(str, policy_modes[mode])) { 2715 if (!strcmp(str, policy_modes[mode])) {
2712 break; 2716 break;
2713 } 2717 }
2714 } 2718 }
2715 if (mode >= MPOL_MAX) 2719 if (mode >= MPOL_MAX)
2716 goto out; 2720 goto out;
2717 2721
2718 switch (mode) { 2722 switch (mode) {
2719 case MPOL_PREFERRED: 2723 case MPOL_PREFERRED:
2720 /* 2724 /*
2721 * Insist on a nodelist of one node only 2725 * Insist on a nodelist of one node only
2722 */ 2726 */
2723 if (nodelist) { 2727 if (nodelist) {
2724 char *rest = nodelist; 2728 char *rest = nodelist;
2725 while (isdigit(*rest)) 2729 while (isdigit(*rest))
2726 rest++; 2730 rest++;
2727 if (*rest) 2731 if (*rest)
2728 goto out; 2732 goto out;
2729 } 2733 }
2730 break; 2734 break;
2731 case MPOL_INTERLEAVE: 2735 case MPOL_INTERLEAVE:
2732 /* 2736 /*
2733 * Default to online nodes with memory if no nodelist 2737 * Default to online nodes with memory if no nodelist
2734 */ 2738 */
2735 if (!nodelist) 2739 if (!nodelist)
2736 nodes = node_states[N_MEMORY]; 2740 nodes = node_states[N_MEMORY];
2737 break; 2741 break;
2738 case MPOL_LOCAL: 2742 case MPOL_LOCAL:
2739 /* 2743 /*
2740 * Don't allow a nodelist; mpol_new() checks flags 2744 * Don't allow a nodelist; mpol_new() checks flags
2741 */ 2745 */
2742 if (nodelist) 2746 if (nodelist)
2743 goto out; 2747 goto out;
2744 mode = MPOL_PREFERRED; 2748 mode = MPOL_PREFERRED;
2745 break; 2749 break;
2746 case MPOL_DEFAULT: 2750 case MPOL_DEFAULT:
2747 /* 2751 /*
2748 * Insist on an empty nodelist 2752 * Insist on an empty nodelist
2749 */ 2753 */
2750 if (!nodelist) 2754 if (!nodelist)
2751 err = 0; 2755 err = 0;
2752 goto out; 2756 goto out;
2753 case MPOL_BIND: 2757 case MPOL_BIND:
2754 /* 2758 /*
2755 * Insist on a nodelist 2759 * Insist on a nodelist
2756 */ 2760 */
2757 if (!nodelist) 2761 if (!nodelist)
2758 goto out; 2762 goto out;
2759 } 2763 }
2760 2764
2761 mode_flags = 0; 2765 mode_flags = 0;
2762 if (flags) { 2766 if (flags) {
2763 /* 2767 /*
2764 * Currently, we only support two mutually exclusive 2768 * Currently, we only support two mutually exclusive
2765 * mode flags. 2769 * mode flags.
2766 */ 2770 */
2767 if (!strcmp(flags, "static")) 2771 if (!strcmp(flags, "static"))
2768 mode_flags |= MPOL_F_STATIC_NODES; 2772 mode_flags |= MPOL_F_STATIC_NODES;
2769 else if (!strcmp(flags, "relative")) 2773 else if (!strcmp(flags, "relative"))
2770 mode_flags |= MPOL_F_RELATIVE_NODES; 2774 mode_flags |= MPOL_F_RELATIVE_NODES;
2771 else 2775 else
2772 goto out; 2776 goto out;
2773 } 2777 }
2774 2778
2775 new = mpol_new(mode, mode_flags, &nodes); 2779 new = mpol_new(mode, mode_flags, &nodes);
2776 if (IS_ERR(new)) 2780 if (IS_ERR(new))
2777 goto out; 2781 goto out;
2778 2782
2779 /* 2783 /*
2780 * Save nodes for mpol_to_str() to show the tmpfs mount options 2784 * Save nodes for mpol_to_str() to show the tmpfs mount options
2781 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. 2785 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2782 */ 2786 */
2783 if (mode != MPOL_PREFERRED) 2787 if (mode != MPOL_PREFERRED)
2784 new->v.nodes = nodes; 2788 new->v.nodes = nodes;
2785 else if (nodelist) 2789 else if (nodelist)
2786 new->v.preferred_node = first_node(nodes); 2790 new->v.preferred_node = first_node(nodes);
2787 else 2791 else
2788 new->flags |= MPOL_F_LOCAL; 2792 new->flags |= MPOL_F_LOCAL;
2789 2793
2790 /* 2794 /*
2791 * Save nodes for contextualization: this will be used to "clone" 2795 * Save nodes for contextualization: this will be used to "clone"
2792 * the mempolicy in a specific context [cpuset] at a later time. 2796 * the mempolicy in a specific context [cpuset] at a later time.
2793 */ 2797 */
2794 new->w.user_nodemask = nodes; 2798 new->w.user_nodemask = nodes;
2795 2799
2796 err = 0; 2800 err = 0;
2797 2801
2798 out: 2802 out:
2799 /* Restore string for error message */ 2803 /* Restore string for error message */
2800 if (nodelist) 2804 if (nodelist)
2801 *--nodelist = ':'; 2805 *--nodelist = ':';
2802 if (flags) 2806 if (flags)
2803 *--flags = '='; 2807 *--flags = '=';
2804 if (!err) 2808 if (!err)
2805 *mpol = new; 2809 *mpol = new;
2806 return err; 2810 return err;
2807 } 2811 }
2808 #endif /* CONFIG_TMPFS */ 2812 #endif /* CONFIG_TMPFS */
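
mpol_parse_str() first cuts the option string at ':' and '=' and only then matches the mode name against policy_modes[]. A self-contained userspace sketch of just that tokenization step (a toy: no nodemask parsing, no validation) makes the "<mode>[=<flags>][:<nodelist>]" format concrete:

#include <stdio.h>
#include <string.h>

/* Toy re-implementation of the "<mode>[=<flags>][:<nodelist>]" split done by
 * mpol_parse_str(): locate ':' and '=' first, then NUL-terminate the pieces. */
static void split_mpol(char *str)
{
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');

	if (nodelist)
		*nodelist++ = '\0';	/* terminates the mode or flags part */
	if (flags)
		*flags++ = '\0';	/* terminates the mode part */

	printf("mode=%s flags=%s nodelist=%s\n",
	       str, flags ? flags : "-", nodelist ? nodelist : "-");
}

int main(void)
{
	char a[] = "interleave:0-3";
	char b[] = "bind=static:0,2";
	char c[] = "local";

	split_mpol(a);	/* mode=interleave flags=- nodelist=0-3 */
	split_mpol(b);	/* mode=bind flags=static nodelist=0,2 */
	split_mpol(c);	/* mode=local flags=- nodelist=- */
	return 0;
}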
2809 2813
2810 /** 2814 /**
2811 * mpol_to_str - format a mempolicy structure for printing 2815 * mpol_to_str - format a mempolicy structure for printing
2812 * @buffer: to contain formatted mempolicy string 2816 * @buffer: to contain formatted mempolicy string
2813 * @maxlen: length of @buffer 2817 * @maxlen: length of @buffer
2814 * @pol: pointer to mempolicy to be formatted 2818 * @pol: pointer to mempolicy to be formatted
2815 * 2819 *
2816 * Convert @pol into a string. If @buffer is too short, truncate the string. 2820 * Convert @pol into a string. If @buffer is too short, truncate the string.
2817 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the 2821 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
2818 * longest flag, "relative", and to display at least a few node ids. 2822 * longest flag, "relative", and to display at least a few node ids.
2819 */ 2823 */
2820 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 2824 void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2821 { 2825 {
2822 char *p = buffer; 2826 char *p = buffer;
2823 nodemask_t nodes = NODE_MASK_NONE; 2827 nodemask_t nodes = NODE_MASK_NONE;
2824 unsigned short mode = MPOL_DEFAULT; 2828 unsigned short mode = MPOL_DEFAULT;
2825 unsigned short flags = 0; 2829 unsigned short flags = 0;
2826 2830
2827 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { 2831 if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2828 mode = pol->mode; 2832 mode = pol->mode;
2829 flags = pol->flags; 2833 flags = pol->flags;
2830 } 2834 }
2831 2835
2832 switch (mode) { 2836 switch (mode) {
2833 case MPOL_DEFAULT: 2837 case MPOL_DEFAULT:
2834 break; 2838 break;
2835 case MPOL_PREFERRED: 2839 case MPOL_PREFERRED:
2836 if (flags & MPOL_F_LOCAL) 2840 if (flags & MPOL_F_LOCAL)
2837 mode = MPOL_LOCAL; 2841 mode = MPOL_LOCAL;
2838 else 2842 else
2839 node_set(pol->v.preferred_node, nodes); 2843 node_set(pol->v.preferred_node, nodes);
2840 break; 2844 break;
2841 case MPOL_BIND: 2845 case MPOL_BIND:
2842 case MPOL_INTERLEAVE: 2846 case MPOL_INTERLEAVE:
2843 nodes = pol->v.nodes; 2847 nodes = pol->v.nodes;
2844 break; 2848 break;
2845 default: 2849 default:
2846 WARN_ON_ONCE(1); 2850 WARN_ON_ONCE(1);
2847 snprintf(p, maxlen, "unknown"); 2851 snprintf(p, maxlen, "unknown");
2848 return; 2852 return;
2849 } 2853 }
2850 2854
2851 p += snprintf(p, maxlen, "%s", policy_modes[mode]); 2855 p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2852 2856
2853 if (flags & MPOL_MODE_FLAGS) { 2857 if (flags & MPOL_MODE_FLAGS) {
2854 p += snprintf(p, buffer + maxlen - p, "="); 2858 p += snprintf(p, buffer + maxlen - p, "=");
2855 2859
2856 /* 2860 /*
2857 * Currently, the only defined flags are mutually exclusive 2861 * Currently, the only defined flags are mutually exclusive
2858 */ 2862 */
2859 if (flags & MPOL_F_STATIC_NODES) 2863 if (flags & MPOL_F_STATIC_NODES)
2860 p += snprintf(p, buffer + maxlen - p, "static"); 2864 p += snprintf(p, buffer + maxlen - p, "static");
2861 else if (flags & MPOL_F_RELATIVE_NODES) 2865 else if (flags & MPOL_F_RELATIVE_NODES)
2862 p += snprintf(p, buffer + maxlen - p, "relative"); 2866 p += snprintf(p, buffer + maxlen - p, "relative");
2863 } 2867 }
2864 2868
2865 if (!nodes_empty(nodes)) { 2869 if (!nodes_empty(nodes)) {
2866 p += snprintf(p, buffer + maxlen - p, ":"); 2870 p += snprintf(p, buffer + maxlen - p, ":");
2867 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 2871 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2868 } 2872 }
2869 } 2873 }
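
Going the other way, mpol_to_str() emits the same "<mode>[=<flags>][:<nodelist>]" form, e.g. "default", "local", "prefer:1", "bind=static:0-3" or "interleave:0-7". A hedged kernel-context sketch of a typical caller follows; the policy pointer is assumed to come from wherever the caller already holds one, much as /proc/<pid>/numa_maps does:

/* Sketch only: format a mempolicy into a seq_file, in the spirit of
 * show_numa_map().  'pol' is whatever struct mempolicy * the caller holds;
 * 64 bytes comfortably exceeds the recommended minimum of 32. */
static void show_policy(struct seq_file *m, struct mempolicy *pol)
{
	char buffer[64];

	mpol_to_str(buffer, sizeof(buffer), pol);
	seq_printf(m, "policy=%s\n", buffer);
}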
2870 2874