Commit 71fe804b6d56d6a7aed680e096901434cef6a2c3

Authored by Lee Schermerhorn
Committed by Linus Torvalds
1 parent 3f226aa1cb

mempolicy: use struct mempolicy pointer in shmem_sb_info

This patch replaces the mempolicy mode, mode_flags, and nodemask in the
shmem_sb_info struct with a struct mempolicy pointer, initialized to NULL.
This removes dependency on the details of mempolicy from shmem.c and hugetlbfs
inode.c and simplifies the interfaces.

mpol_parse_str() in mempolicy.c is changed to return, via a pointer to a
pointer arg, a struct mempolicy pointer on success.  For MPOL_DEFAULT, the
returned pointer is NULL.  Further, mpol_parse_str() now takes a 'no_context'
argument that causes the input nodemask to be stored in the w.user_nodemask of
the created mempolicy for use when the mempolicy is installed in a tmpfs inode
shared policy tree.  At that time, any cpuset contextualization is applied to
the original input nodemask.  This preserves the previous behavior where the
input nodemask was stored in the superblock.  We can think of the returned
mempolicy as "context free".

Because mpol_parse_str() is now calling mpol_new(), we can remove from
mpol_parse_str() the semantic checks that mpol_new() already performs.

Add 'no_context' parameter to mpol_to_str() to specify that it should format
the nodemask in w.user_nodemask for 'bind' and 'interleave' policies.

Change mpol_shared_policy_init() to take a pointer to a "context free" struct
mempolicy and to create a new, "contextualized" mempolicy using the mode,
mode_flags and user_nodemask from the input mempolicy.

  Note: we know that the mempolicy passed to mpol_to_str() or
  mpol_shared_policy_init() from a tmpfs superblock is "context free".  This
  is currently the only instance thereof.  However, if we found more uses for
  this concept, and introduced any ambiguity as to whether a mempolicy was
  context free or not, we could add another internal mode flag to identify
  context free mempolicies.  Then, we could remove the 'no_context' argument
  from mpol_to_str().

Add shmem_get_sbmpol() to return a reference-counted superblock mempolicy,
if one exists, to pass to mpol_shared_policy_init().  The reference must be
taken under the sb stat_lock to prevent races with replacement of the mpol
by remount.  mpol_shared_policy_init() drops this reference before returning.

[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: another build fix]
[akpm@linux-foundation.org: yet another build fix]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 134 additions and 95 deletions Side-by-side Diff

fs/hugetlbfs/inode.c
... ... @@ -504,7 +504,7 @@
504 504 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
505 505 INIT_LIST_HEAD(&inode->i_mapping->private_list);
506 506 info = HUGETLBFS_I(inode);
507   - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0, NULL);
  507 + mpol_shared_policy_init(&info->policy, NULL);
508 508 switch (mode & S_IFMT) {
509 509 default:
510 510 init_special_inode(inode, mode, dev);
include/linux/mempolicy.h
... ... @@ -182,8 +182,7 @@
182 182 spinlock_t lock;
183 183 };
184 184  
185   -void mpol_shared_policy_init(struct shared_policy *info, unsigned short mode,
186   - unsigned short flags, nodemask_t *nodes);
  185 +void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
187 186 int mpol_set_shared_policy(struct shared_policy *info,
188 187 struct vm_area_struct *vma,
189 188 struct mempolicy *new);
190 189  
... ... @@ -216,10 +215,10 @@
216 215  
217 216  
218 217 #ifdef CONFIG_TMPFS
219   -extern int mpol_parse_str(char *str, unsigned short *mode,
220   - unsigned short *mode_flags, nodemask_t *policy_nodes);
  218 +extern int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context);
221 219  
222   -extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
  220 +extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
  221 + int no_context);
223 222 #endif
224 223 #else
225 224  
... ... @@ -262,8 +261,8 @@
262 261 return -EINVAL;
263 262 }
264 263  
265   -static inline void mpol_shared_policy_init(struct shared_policy *info,
266   - unsigned short mode, unsigned short flags, nodemask_t *nodes)
  264 +static inline void mpol_shared_policy_init(struct shared_policy *sp,
  265 + struct mempolicy *mpol)
267 266 {
268 267 }
269 268  
270 269  
271 270  
... ... @@ -322,13 +321,14 @@
322 321 }
323 322  
324 323 #ifdef CONFIG_TMPFS
325   -static inline int mpol_parse_str(char *value, unsigned short *policy,
326   - unsigned short flags, nodemask_t *policy_nodes)
  324 +static inline int mpol_parse_str(char *str, struct mempolicy **mpol,
  325 + int no_context)
327 326 {
328   - return 1;
  327 + return 1; /* error */
329 328 }
330 329  
331   -static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
  330 +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
  331 + int no_context)
332 332 {
333 333 return 0;
334 334 }
include/linux/shmem_fs.h
... ... @@ -34,9 +34,7 @@
34 34 uid_t uid; /* Mount uid for root directory */
35 35 gid_t gid; /* Mount gid for root directory */
36 36 mode_t mode; /* Mount mode for root directory */
37   - unsigned short policy; /* Default NUMA memory alloc policy */
38   - unsigned short flags; /* Optional mempolicy flags */
39   - nodemask_t policy_nodes; /* nodemask for preferred and bind */
  37 + struct mempolicy *mpol; /* default memory policy for mappings */
40 38 };
41 39  
42 40 static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
mm/mempolicy.c
... ... @@ -1828,27 +1828,35 @@
1828 1828 return 0;
1829 1829 }
1830 1830  
1831   -void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1832   - unsigned short flags, nodemask_t *policy_nodes)
  1831 +/**
  1832 + * mpol_shared_policy_init - initialize shared policy for inode
  1833 + * @sp: pointer to inode shared policy
  1834 + * @mpol: struct mempolicy to install
  1835 + *
  1836 + * Install non-NULL @mpol in inode's shared policy rb-tree.
  1837 + * On entry, the current task has a reference on a non-NULL @mpol.
  1838 + * This must be released on exit.
  1839 + */
  1840 +void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1833 1841 {
1834   - info->root = RB_ROOT;
1835   - spin_lock_init(&info->lock);
  1842 + sp->root = RB_ROOT; /* empty tree == default mempolicy */
  1843 + spin_lock_init(&sp->lock);
1836 1844  
1837   - if (policy != MPOL_DEFAULT) {
1838   - struct mempolicy *newpol;
  1845 + if (mpol) {
  1846 + struct vm_area_struct pvma;
  1847 + struct mempolicy *new;
1839 1848  
1840   - /* Falls back to NULL policy [MPOL_DEFAULT] on any error */
1841   - newpol = mpol_new(policy, flags, policy_nodes);
1842   - if (!IS_ERR(newpol)) {
1843   - /* Create pseudo-vma that contains just the policy */
1844   - struct vm_area_struct pvma;
  1849 + /* contextualize the tmpfs mount point mempolicy */
  1850 + new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
  1851 + mpol_put(mpol); /* drop our ref on sb mpol */
  1852 + if (IS_ERR(new))
  1853 + return; /* no valid nodemask intersection */
1845 1854  
1846   - memset(&pvma, 0, sizeof(struct vm_area_struct));
1847   - /* Policy covers entire file */
1848   - pvma.vm_end = TASK_SIZE;
1849   - mpol_set_shared_policy(info, &pvma, newpol);
1850   - mpol_put(newpol);
1851   - }
  1855 + /* Create pseudo-vma that contains just the policy */
  1856 + memset(&pvma, 0, sizeof(struct vm_area_struct));
  1857 + pvma.vm_end = TASK_SIZE; /* policy covers entire file */
  1858 + mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
  1859 + mpol_put(new); /* drop initial ref */
1852 1860 }
1853 1861 }
1854 1862  
1855 1863  
1856 1864  
1857 1865  
... ... @@ -1962,18 +1970,27 @@
1962 1970 /**
1963 1971 * mpol_parse_str - parse string to mempolicy
1964 1972 * @str: string containing mempolicy to parse
1965   - * @mode: pointer to returned policy mode
1966   - * @mode_flags: pointer to returned flags
1967   - * @policy_nodes: pointer to returned nodemask
  1973 + * @mpol: pointer to struct mempolicy pointer, returned on success.
  1974 + * @no_context: flag whether to "contextualize" the mempolicy
1968 1975 *
1969 1976 * Format of input:
1970 1977 * <mode>[=<flags>][:<nodelist>]
1971 1978 *
1972   - * Currently only used for tmpfs/shmem mount options
  1979 + * if @no_context is true, save the input nodemask in w.user_nodemask in
  1980 + * the returned mempolicy. This will be used to "clone" the mempolicy in
  1981 + * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
  1982 + * mount option. Note that if 'static' or 'relative' mode flags were
  1983 + * specified, the input nodemask will already have been saved. Saving
  1984 + * it again is redundant, but safe.
  1985 + *
  1986 + * On success, returns 0, else 1
1973 1987 */
1974   -int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags,
1975   - nodemask_t *policy_nodes)
  1988 +int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1976 1989 {
  1990 + struct mempolicy *new = NULL;
  1991 + unsigned short uninitialized_var(mode);
  1992 + unsigned short uninitialized_var(mode_flags);
  1993 + nodemask_t nodes;
1977 1994 char *nodelist = strchr(str, ':');
1978 1995 char *flags = strchr(str, '=');
1979 1996 int i;
1980 1997  
1981 1998  
1982 1999  
1983 2000  
1984 2001  
... ... @@ -1982,26 +1999,30 @@
1982 1999 if (nodelist) {
1983 2000 /* NUL-terminate mode or flags string */
1984 2001 *nodelist++ = '\0';
1985   - if (nodelist_parse(nodelist, *policy_nodes))
  2002 + if (nodelist_parse(nodelist, nodes))
1986 2003 goto out;
1987   - if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
  2004 + if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
1988 2005 goto out;
1989   - }
  2006 + } else
  2007 + nodes_clear(nodes);
  2008 +
1990 2009 if (flags)
1991 2010 *flags++ = '\0'; /* terminate mode string */
1992 2011  
1993 2012 for (i = 0; i <= MPOL_LOCAL; i++) {
1994 2013 if (!strcmp(str, policy_types[i])) {
1995   - *mode = i;
  2014 + mode = i;
1996 2015 break;
1997 2016 }
1998 2017 }
1999 2018 if (i > MPOL_LOCAL)
2000 2019 goto out;
2001 2020  
2002   - switch (*mode) {
  2021 + switch (mode) {
2003 2022 case MPOL_PREFERRED:
2004   - /* Insist on a nodelist of one node only */
  2023 + /*
  2024 + * Insist on a nodelist of one node only
  2025 + */
2005 2026 if (nodelist) {
2006 2027 char *rest = nodelist;
2007 2028 while (isdigit(*rest))
2008 2029  
2009 2030  
2010 2031  
2011 2032  
2012 2033  
2013 2034  
2014 2035  
2015 2036  
2016 2037  
2017 2038  
2018 2039  
2019 2040  
2020 2041  
... ... @@ -2010,63 +2031,73 @@
2010 2031 err = 0;
2011 2032 }
2012 2033 break;
2013   - case MPOL_BIND:
2014   - /* Insist on a nodelist */
2015   - if (nodelist)
2016   - err = 0;
2017   - break;
2018 2034 case MPOL_INTERLEAVE:
2019 2035 /*
2020 2036 * Default to online nodes with memory if no nodelist
2021 2037 */
2022 2038 if (!nodelist)
2023   - *policy_nodes = node_states[N_HIGH_MEMORY];
  2039 + nodes = node_states[N_HIGH_MEMORY];
2024 2040 err = 0;
2025 2041 break;
2026   - default:
  2042 + case MPOL_LOCAL:
2027 2043 /*
2028   - * MPOL_DEFAULT or MPOL_LOCAL
2029   - * Don't allow a nodelist nor flags
  2044 + * Don't allow a nodelist; mpol_new() checks flags
2030 2045 */
2031   - if (!nodelist && !flags)
2032   - err = 0;
2033   - if (*mode == MPOL_DEFAULT)
  2046 + if (nodelist)
2034 2047 goto out;
2035   - /* else MPOL_LOCAL */
2036   - *mode = MPOL_PREFERRED;
2037   - nodes_clear(*policy_nodes);
  2048 + mode = MPOL_PREFERRED;
2038 2049 break;
  2050 +
  2051 + /*
  2052 + * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
  2053 + * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
  2054 + */
2039 2055 }
2040 2056  
2041   - *mode_flags = 0;
  2057 + mode_flags = 0;
2042 2058 if (flags) {
2043 2059 /*
2044 2060 * Currently, we only support two mutually exclusive
2045 2061 * mode flags.
2046 2062 */
2047 2063 if (!strcmp(flags, "static"))
2048   - *mode_flags |= MPOL_F_STATIC_NODES;
  2064 + mode_flags |= MPOL_F_STATIC_NODES;
2049 2065 else if (!strcmp(flags, "relative"))
2050   - *mode_flags |= MPOL_F_RELATIVE_NODES;
  2066 + mode_flags |= MPOL_F_RELATIVE_NODES;
2051 2067 else
2052 2068 err = 1;
2053 2069 }
  2070 +
  2071 + new = mpol_new(mode, mode_flags, &nodes);
  2072 + if (IS_ERR(new))
  2073 + err = 1;
  2074 + else if (no_context)
  2075 + new->w.user_nodemask = nodes; /* save for contextualization */
  2076 +
2054 2077 out:
2055 2078 /* Restore string for error message */
2056 2079 if (nodelist)
2057 2080 *--nodelist = ':';
2058 2081 if (flags)
2059 2082 *--flags = '=';
  2083 + if (!err)
  2084 + *mpol = new;
2060 2085 return err;
2061 2086 }
2062 2087 #endif /* CONFIG_TMPFS */
2063 2088  
2064   -/*
  2089 +/**
  2090 + * mpol_to_str - format a mempolicy structure for printing
  2091 + * @buffer: to contain formatted mempolicy string
  2092 + * @maxlen: length of @buffer
  2093 + * @pol: pointer to mempolicy to be formatted
  2094 + * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
  2095 + *
2065 2096 * Convert a mempolicy into a string.
2066 2097 * Returns the number of characters in buffer (if positive)
2067 2098 * or an error (negative)
2068 2099 */
2069   -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
  2100 +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2070 2101 {
2071 2102 char *p = buffer;
2072 2103 int l;
... ... @@ -2100,7 +2131,10 @@
2100 2131 case MPOL_BIND:
2101 2132 /* Fall through */
2102 2133 case MPOL_INTERLEAVE:
2103   - nodes = pol->v.nodes;
  2134 + if (no_context)
  2135 + nodes = pol->w.user_nodemask;
  2136 + else
  2137 + nodes = pol->v.nodes;
2104 2138 break;
2105 2139  
2106 2140 default:
... ... @@ -2231,7 +2265,7 @@
2231 2265 return 0;
2232 2266  
2233 2267 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2234   - mpol_to_str(buffer, sizeof(buffer), pol);
  2268 + mpol_to_str(buffer, sizeof(buffer), pol, 0);
2235 2269 mpol_cond_put(pol);
2236 2270  
2237 2271 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
mm/shmem.c
... ... @@ -1079,23 +1079,29 @@
1079 1079  
1080 1080 #ifdef CONFIG_NUMA
1081 1081 #ifdef CONFIG_TMPFS
1082   -static void shmem_show_mpol(struct seq_file *seq, unsigned short mode,
1083   - unsigned short flags, const nodemask_t policy_nodes)
  1082 +static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1084 1083 {
1085   - struct mempolicy temp;
1086 1084 char buffer[64];
1087 1085  
1088   - if (mode == MPOL_DEFAULT)
  1086 + if (!mpol || mpol->mode == MPOL_DEFAULT)
1089 1087 return; /* show nothing */
1090 1088  
1091   - temp.mode = mode;
1092   - temp.flags = flags;
1093   - temp.v.nodes = policy_nodes;
  1089 + mpol_to_str(buffer, sizeof(buffer), mpol, 1);
1094 1090  
1095   - mpol_to_str(buffer, sizeof(buffer), &temp);
1096   -
1097 1091 seq_printf(seq, ",mpol=%s", buffer);
1098 1092 }
  1093 +
  1094 +static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
  1095 +{
  1096 + struct mempolicy *mpol = NULL;
  1097 + if (sbinfo->mpol) {
  1098 + spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
  1099 + mpol = sbinfo->mpol;
  1100 + mpol_get(mpol);
  1101 + spin_unlock(&sbinfo->stat_lock);
  1102 + }
  1103 + return mpol;
  1104 +}
1099 1105 #endif /* CONFIG_TMPFS */
1100 1106  
1101 1107 static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
... ... @@ -1135,8 +1141,7 @@
1135 1141 }
1136 1142 #else /* !CONFIG_NUMA */
1137 1143 #ifdef CONFIG_TMPFS
1138   -static inline void shmem_show_mpol(struct seq_file *seq, unsigned short policy,
1139   - unsigned short flags, const nodemask_t policy_nodes)
  1144 +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
1140 1145 {
1141 1146 }
1142 1147 #endif /* CONFIG_TMPFS */
... ... @@ -1154,6 +1159,13 @@
1154 1159 }
1155 1160 #endif /* CONFIG_NUMA */
1156 1161  
  1162 +#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
  1163 +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
  1164 +{
  1165 + return NULL;
  1166 +}
  1167 +#endif
  1168 +
1157 1169 /*
1158 1170 * shmem_getpage - either get the page from swap or allocate a new one
1159 1171 *
... ... @@ -1508,8 +1520,8 @@
1508 1520 case S_IFREG:
1509 1521 inode->i_op = &shmem_inode_operations;
1510 1522 inode->i_fop = &shmem_file_operations;
1511   - mpol_shared_policy_init(&info->policy, sbinfo->policy,
1512   - sbinfo->flags, &sbinfo->policy_nodes);
  1523 + mpol_shared_policy_init(&info->policy,
  1524 + shmem_get_sbmpol(sbinfo));
1513 1525 break;
1514 1526 case S_IFDIR:
1515 1527 inc_nlink(inode);
... ... @@ -1523,8 +1535,7 @@
1523 1535 * Must not load anything in the rbtree,
1524 1536 * mpol_free_shared_policy will not be called.
1525 1537 */
1526   - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 0,
1527   - NULL);
  1538 + mpol_shared_policy_init(&info->policy, NULL);
1528 1539 break;
1529 1540 }
1530 1541 } else
... ... @@ -2139,8 +2150,7 @@
2139 2150 if (*rest)
2140 2151 goto bad_val;
2141 2152 } else if (!strcmp(this_char,"mpol")) {
2142   - if (mpol_parse_str(value, &sbinfo->policy,
2143   - &sbinfo->flags, &sbinfo->policy_nodes))
  2153 + if (mpol_parse_str(value, &sbinfo->mpol, 1))
2144 2154 goto bad_val;
2145 2155 } else {
2146 2156 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
... ... @@ -2191,9 +2201,9 @@
2191 2201 sbinfo->free_blocks = config.max_blocks - blocks;
2192 2202 sbinfo->max_inodes = config.max_inodes;
2193 2203 sbinfo->free_inodes = config.max_inodes - inodes;
2194   - sbinfo->policy = config.policy;
2195   - sbinfo->flags = config.flags;
2196   - sbinfo->policy_nodes = config.policy_nodes;
  2204 +
  2205 + mpol_put(sbinfo->mpol);
  2206 + sbinfo->mpol = config.mpol; /* transfers initial ref */
2197 2207 out:
2198 2208 spin_unlock(&sbinfo->stat_lock);
2199 2209 return error;
... ... @@ -2214,8 +2224,7 @@
2214 2224 seq_printf(seq, ",uid=%u", sbinfo->uid);
2215 2225 if (sbinfo->gid != 0)
2216 2226 seq_printf(seq, ",gid=%u", sbinfo->gid);
2217   - shmem_show_mpol(seq, sbinfo->policy, sbinfo->flags,
2218   - sbinfo->policy_nodes);
  2227 + shmem_show_mpol(seq, sbinfo->mpol);
2219 2228 return 0;
2220 2229 }
2221 2230 #endif /* CONFIG_TMPFS */
... ... @@ -2245,9 +2254,7 @@
2245 2254 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2246 2255 sbinfo->uid = current->fsuid;
2247 2256 sbinfo->gid = current->fsgid;
2248   - sbinfo->policy = MPOL_DEFAULT;
2249   - sbinfo->flags = 0;
2250   - sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
  2257 + sbinfo->mpol = NULL;
2251 2258 sb->s_fs_info = sbinfo;
2252 2259  
2253 2260 #ifdef CONFIG_TMPFS