Commit 87946a72283be3de936adc754b7007df7d3e6aeb

Authored by Daisuke Nishimura
Committed by Linus Torvalds
1 parent 90254a6583

memcg: move charge of file pages

This patch adds support for moving the charge of file pages, which includes
normal files, tmpfs files and swap entries of tmpfs files.  It is enabled by
setting bit 1 of <target cgroup>/memory.move_charge_at_immigrate.

Unlike the case of anonymous pages, file pages (and swaps) in the range
mmapped by the task will be moved even if the task hasn't faulted them in
yet, i.e.  they might not be part of the task's "RSS" but of another task's
"RSS" that maps the same file.  The mapcount of the page is also ignored
(the page can be moved even if page_mapcount(page) > 1).  So the conditions
a page/swap must meet to be moved are that it lies in the range mmapped by
the target task and that it is charged to the old cgroup.
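
As an illustration, here is a minimal userspace sketch of migrating a task
with both bit 0 (anonymous) and bit 1 (file) enabled; the mount point and
cgroup name are assumptions for the example, not part of this patch:

	#include <stdio.h>
	#include <stdlib.h>

	/* Hypothetical helper: write a string to a cgroup control file. */
	static void write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			exit(EXIT_FAILURE);
		}
		fputs(val, f);
		fclose(f);
	}

	int main(int argc, char **argv)
	{
		/* Assumed mount point/group of the memory controller. */
		const char *cg = "/cgroups/memory/dst";
		char path[256];

		if (argc != 2) {
			fprintf(stderr, "usage: %s <pid>\n", argv[0]);
			return 1;
		}
		/* 3 == bit 0 (anon) | bit 1 (file): move both charge types. */
		snprintf(path, sizeof(path),
			 "%s/memory.move_charge_at_immigrate", cg);
		write_str(path, "3");
		/* Attaching the task to the cgroup triggers the move. */
		snprintf(path, sizeof(path), "%s/tasks", cg);
		write_str(path, argv[1]);
		return 0;
	}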

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: fix warning]
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 125 additions and 18 deletions

Documentation/cgroups/memory.txt
... ... @@ -454,21 +454,27 @@
454 454 8.2 Type of charges which can be moved
455 455  
456 456 Each bit of move_charge_at_immigrate has its own meaning about what type of
457   -charges should be moved.
  457 +charges should be moved. But in any case, it must be noted that the account of
  458 +a page or a swap can be moved only when it is charged to the task's current
  459 +(old) memory cgroup.
458 460  
459 461 bit | what type of charges would be moved ?
460 462 -----+------------------------------------------------------------------------
461 463 0 | A charge of an anonymous page(or swap of it) used by the target task.
462 464 | Those pages and swaps must be used only by the target task. You must
463 465 | enable Swap Extension(see 2.4) to enable move of swap charges.
  466 + -----+------------------------------------------------------------------------
  467 + 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory)
  468 + | and swaps of tmpfs file) mmapped by the target task. Unlike the case of
  469 + | anonymous pages, file pages (and swaps) in the range mmapped by the task
  470 + | will be moved even if the task hasn't faulted them in, i.e. they might
  471 + | not be the task's "RSS", but another task's "RSS" that maps the same
  472 + | file. The mapcount of the page is ignored (the page can be moved even
  473 + | if page_mapcount(page) > 1). You must enable Swap Extension (see 2.4)
  474 + | to enable move of swap charges.
464 475  
465   -Note: Those pages and swaps must be charged to the old cgroup.
466   -Note: More type of pages(e.g. file cache, shmem,) will be supported by other
467   - bits in future.
468   -
469 476 8.3 TODO
470 477  
471   -- Add support for other types of pages(e.g. file cache, shmem, etc.).
472 478 - Implement madvise(2) to let users decide the vma to be moved or not to be
473 479 moved.
474 480 - All of moving charge operations are done under cgroup_mutex. It's not good
include/linux/swap.h
... ... @@ -282,6 +282,11 @@
282 282 extern int shmem_unuse(swp_entry_t entry, struct page *page);
283 283 #endif /* CONFIG_MMU */
284 284  
  285 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
  286 +extern void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
  287 + struct page **pagep, swp_entry_t *ent);
  288 +#endif
  289 +
285 290 extern void swap_unplug_io_fn(struct backing_dev_info *, struct page *);
286 291  
287 292 #ifdef CONFIG_SWAP
mm/memcontrol.c
... ... @@ -250,6 +250,7 @@
250 250 */
251 251 enum move_type {
252 252 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
  253 + MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
253 254 NR_MOVE_TYPE,
254 255 };
255 256  
... ... @@ -272,6 +273,12 @@
272 273 &mc.to->move_charge_at_immigrate);
273 274 }
274 275  
  276 +static bool move_file(void)
  277 +{
  278 + return test_bit(MOVE_CHARGE_TYPE_FILE,
  279 + &mc.to->move_charge_at_immigrate);
  280 +}
  281 +
275 282 /*
276 283 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
277 284 * limit reclaim to prevent infinite loops, if they ever occur.
... ... @@ -4179,11 +4186,8 @@
4179 4186 /* we don't move shared anon */
4180 4187 if (!move_anon() || page_mapcount(page) > 2)
4181 4188 return NULL;
4182   - } else
4183   - /*
4184   - * TODO: We don't move charges of file(including shmem/tmpfs)
4185   - * pages for now.
4186   - */
  4189 + } else if (!move_file())
  4190 + /* we ignore mapcount for file pages */
4187 4191 return NULL;
4188 4192 if (!get_page_unless_zero(page))
4189 4193 return NULL;
... ... @@ -4212,6 +4216,39 @@
4212 4216 return page;
4213 4217 }
4214 4218  
  4219 +static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  4220 + unsigned long addr, pte_t ptent, swp_entry_t *entry)
  4221 +{
  4222 + struct page *page = NULL;
  4223 + struct inode *inode;
  4224 + struct address_space *mapping;
  4225 + pgoff_t pgoff;
  4226 +
  4227 + if (!vma->vm_file) /* anonymous vma */
  4228 + return NULL;
  4229 + if (!move_file())
  4230 + return NULL;
  4231 +
  4232 + inode = vma->vm_file->f_path.dentry->d_inode;
  4233 + mapping = vma->vm_file->f_mapping;
  4234 + if (pte_none(ptent))
  4235 + pgoff = linear_page_index(vma, addr);
  4236 + else /* pte_file(ptent) is true */
  4237 + pgoff = pte_to_pgoff(ptent);
  4238 +
  4239 + /* page is moved even if it's not RSS of this task(page-faulted). */
  4240 + if (!mapping_cap_swap_backed(mapping)) { /* normal file */
  4241 + page = find_get_page(mapping, pgoff);
  4242 + } else { /* shmem/tmpfs file. we should take account of swap too. */
  4243 + swp_entry_t ent;
  4244 + mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
  4245 + if (do_swap_account)
  4246 + entry->val = ent.val;
  4247 + }
  4248 +
  4249 + return page;
  4250 +}
  4251 +
4215 4252 static int is_target_pte_for_mc(struct vm_area_struct *vma,
4216 4253 unsigned long addr, pte_t ptent, union mc_target *target)
4217 4254 {
... ... @@ -4224,7 +4261,8 @@
4224 4261 page = mc_handle_present_pte(vma, addr, ptent);
4225 4262 else if (is_swap_pte(ptent))
4226 4263 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4227   - /* TODO: handle swap of shmes/tmpfs */
  4264 + else if (pte_none(ptent) || pte_file(ptent))
  4265 + page = mc_handle_file_pte(vma, addr, ptent, &ent);
4228 4266  
4229 4267 if (!page && !ent.val)
4230 4268 return 0;
... ... @@ -4285,9 +4323,6 @@
4285 4323 };
4286 4324 if (is_vm_hugetlb_page(vma))
4287 4325 continue;
4288   - /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4289   - if (vma->vm_flags & VM_SHARED)
4290   - continue;
4291 4326 walk_page_range(vma->vm_start, vma->vm_end,
4292 4327 &mem_cgroup_count_precharge_walk);
4293 4328 }
... ... @@ -4483,9 +4518,6 @@
4483 4518 .private = vma,
4484 4519 };
4485 4520 if (is_vm_hugetlb_page(vma))
4486   - continue;
4487   - /* TODO: We don't move charges of shmem/tmpfs pages for now. */
4488   - if (vma->vm_flags & VM_SHARED)
4489 4521 continue;
4490 4522 ret = walk_page_range(vma->vm_start, vma->vm_end,
4491 4523 &mem_cgroup_move_charge_walk);
mm/shmem.c
... ... @@ -2559,6 +2559,45 @@
2559 2559 return error;
2560 2560 }
2561 2561  
  2562 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
  2563 +/**
  2564 + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
  2565 + * @inode: the inode to be searched
  2566 + * @pgoff: the offset to be searched
  2567 + * @pagep: the pointer for the found page to be stored
  2568 + * @ent: the pointer for the found swap entry to be stored
  2569 + *
  2570 + * If a page is found, its refcount is incremented. The caller must handle
  2571 + * that refcount.
  2572 + */
  2573 +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
  2574 + struct page **pagep, swp_entry_t *ent)
  2575 +{
  2576 + swp_entry_t entry = { .val = 0 }, *ptr;
  2577 + struct page *page = NULL;
  2578 + struct shmem_inode_info *info = SHMEM_I(inode);
  2579 +
  2580 + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
  2581 + goto out;
  2582 +
  2583 + spin_lock(&info->lock);
  2584 + ptr = shmem_swp_entry(info, pgoff, NULL);
  2585 +#ifdef CONFIG_SWAP
  2586 + if (ptr && ptr->val) {
  2587 + entry.val = ptr->val;
  2588 + page = find_get_page(&swapper_space, entry.val);
  2589 + } else
  2590 +#endif
  2591 + page = find_get_page(inode->i_mapping, pgoff);
  2592 + if (ptr)
  2593 + shmem_swp_unmap(ptr);
  2594 + spin_unlock(&info->lock);
  2595 +out:
  2596 + *pagep = page;
  2597 + *ent = entry;
  2598 +}
  2599 +#endif
  2600 +
2562 2601 #else /* !CONFIG_SHMEM */
2563 2602  
2564 2603 /*
... ... @@ -2597,6 +2636,31 @@
2597 2636 {
2598 2637 return 0;
2599 2638 }
  2639 +
  2640 +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
  2641 +/**
  2642 + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
  2643 + * @inode: the inode to be searched
  2644 + * @pgoff: the offset to be searched
  2645 + * @pagep: the pointer for the found page to be stored
  2646 + * @ent: the pointer for the found swap entry to be stored
  2647 + *
  2648 + * If a page is found, its refcount is incremented. The caller must handle
  2649 + * that refcount.
  2650 + */
  2651 +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
  2652 + struct page **pagep, swp_entry_t *ent)
  2653 +{
  2654 + struct page *page = NULL;
  2655 +
  2656 + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
  2657 + goto out;
  2658 + page = find_get_page(inode->i_mapping, pgoff);
  2659 +out:
  2660 + *pagep = page;
  2661 + *ent = (swp_entry_t){ .val = 0 };
  2662 +}
  2663 +#endif
2600 2664  
2601 2665 #define shmem_vm_ops generic_file_vm_ops
2602 2666 #define shmem_file_operations ramfs_file_operations
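
For reference, a hypothetical in-kernel caller (not part of this commit,
and only a sketch assuming kernel context) illustrating the refcount
contract documented above: when mem_cgroup_get_shmem_target() returns a
page, the reference taken by find_get_page() must be dropped by the caller:

	static void inspect_shmem_slot(struct inode *inode, pgoff_t pgoff)
	{
		struct page *page;
		swp_entry_t ent;

		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
		if (page) {
			/* ... examine the page (e.g. its page_cgroup) ... */
			put_page(page);		/* balance find_get_page() */
		} else if (ent.val) {
			/* Offset is swapped out; ent names the swap slot. */
		}
	}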