Commit 1fb9341ac34825aa40354e74d9a2c69df7d2c304
1 parent
0d21b0e347
Exists in
master
and in
20 other branches
module: put modules in list much earlier.
Prarit's excellent bug report: > In recent Fedora releases (F17 & F18) some users have reported seeing > messages similar to > > [ 15.478160] kvm: Could not allocate 304 bytes percpu data > [ 15.478174] PERCPU: allocation failed, size=304 align=32, alloc from > reserved chunk failed > > during system boot. In some cases, users have also reported seeing this > message along with a failed load of other modules. > > What is happening is systemd is loading an instance of the kvm module for > each cpu found (see commit e9bda3b). When the module load occurs the kernel > currently allocates the modules percpu data area prior to checking to see > if the module is already loaded or is in the process of being loaded. If > the module is already loaded, or finishes load, the module loading code > releases the current instance's module's percpu data. Now we have a new state MODULE_STATE_UNFORMED, we can insert the module into the list (and thus guarantee its uniqueness) before we allocate the per-cpu region. Reported-by: Prarit Bhargava <prarit@redhat.com> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Tested-by: Prarit Bhargava <prarit@redhat.com>
Showing 2 changed files with 50 additions and 41 deletions Side-by-side Diff
kernel/module.c
... | ... | @@ -3017,7 +3017,7 @@ |
3017 | 3017 | bool ret; |
3018 | 3018 | |
3019 | 3019 | mutex_lock(&module_mutex); |
3020 | - mod = find_module(name); | |
3020 | + mod = find_module_all(name, true); | |
3021 | 3021 | ret = !mod || mod->state == MODULE_STATE_LIVE |
3022 | 3022 | || mod->state == MODULE_STATE_GOING; |
3023 | 3023 | mutex_unlock(&module_mutex); |
... | ... | @@ -3141,6 +3141,32 @@ |
3141 | 3141 | goto free_copy; |
3142 | 3142 | } |
3143 | 3143 | |
3144 | + /* | |
3145 | + * We try to place it in the list now to make sure it's unique | |
3146 | + * before we dedicate too many resources. In particular, | |
3147 | + * temporary percpu memory exhaustion. | |
3148 | + */ | |
3149 | + mod->state = MODULE_STATE_UNFORMED; | |
3150 | +again: | |
3151 | + mutex_lock(&module_mutex); | |
3152 | + if ((old = find_module_all(mod->name, true)) != NULL) { | |
3153 | + if (old->state == MODULE_STATE_COMING | |
3154 | + || old->state == MODULE_STATE_UNFORMED) { | |
3155 | + /* Wait in case it fails to load. */ | |
3156 | + mutex_unlock(&module_mutex); | |
3157 | + err = wait_event_interruptible(module_wq, | |
3158 | + finished_loading(mod->name)); | |
3159 | + if (err) | |
3160 | + goto free_module; | |
3161 | + goto again; | |
3162 | + } | |
3163 | + err = -EEXIST; | |
3164 | + mutex_unlock(&module_mutex); | |
3165 | + goto free_module; | |
3166 | + } | |
3167 | + list_add_rcu(&mod->list, &modules); | |
3168 | + mutex_unlock(&module_mutex); | |
3169 | + | |
3144 | 3170 | #ifdef CONFIG_MODULE_SIG |
3145 | 3171 | mod->sig_ok = info->sig_ok; |
3146 | 3172 | if (!mod->sig_ok) |
... | ... | @@ -3150,7 +3176,7 @@ |
3150 | 3176 | /* Now module is in final location, initialize linked lists, etc. */ |
3151 | 3177 | err = module_unload_init(mod); |
3152 | 3178 | if (err) |
3153 | - goto free_module; | |
3179 | + goto unlink_mod; | |
3154 | 3180 | |
3155 | 3181 | /* Now we've got everything in the final locations, we can |
3156 | 3182 | * find optional sections. */ |
3157 | 3183 | |
3158 | 3184 | |
3159 | 3185 | |
3160 | 3186 | |
3161 | 3187 | |
3162 | 3188 | |
... | ... | @@ -3185,54 +3211,33 @@ |
3185 | 3211 | goto free_arch_cleanup; |
3186 | 3212 | } |
3187 | 3213 | |
3188 | - /* Mark state as coming so strong_try_module_get() ignores us. */ | |
3189 | - mod->state = MODULE_STATE_COMING; | |
3190 | - | |
3191 | - /* Now sew it into the lists so we can get lockdep and oops | |
3192 | - * info during argument parsing. No one should access us, since | |
3193 | - * strong_try_module_get() will fail. | |
3194 | - * lockdep/oops can run asynchronous, so use the RCU list insertion | |
3195 | - * function to insert in a way safe to concurrent readers. | |
3196 | - * The mutex protects against concurrent writers. | |
3197 | - */ | |
3198 | -again: | |
3199 | - mutex_lock(&module_mutex); | |
3200 | - if ((old = find_module(mod->name)) != NULL) { | |
3201 | - if (old->state == MODULE_STATE_COMING) { | |
3202 | - /* Wait in case it fails to load. */ | |
3203 | - mutex_unlock(&module_mutex); | |
3204 | - err = wait_event_interruptible(module_wq, | |
3205 | - finished_loading(mod->name)); | |
3206 | - if (err) | |
3207 | - goto free_arch_cleanup; | |
3208 | - goto again; | |
3209 | - } | |
3210 | - err = -EEXIST; | |
3211 | - goto unlock; | |
3212 | - } | |
3213 | - | |
3214 | - /* This has to be done once we're sure module name is unique. */ | |
3215 | 3214 | dynamic_debug_setup(info->debug, info->num_debug); |
3216 | 3215 | |
3217 | - /* Find duplicate symbols */ | |
3216 | + mutex_lock(&module_mutex); | |
3217 | + /* Find duplicate symbols (must be called under lock). */ | |
3218 | 3218 | err = verify_export_symbols(mod); |
3219 | 3219 | if (err < 0) |
3220 | - goto ddebug; | |
3220 | + goto ddebug_cleanup; | |
3221 | 3221 | |
3222 | + /* This relies on module_mutex for list integrity. */ | |
3222 | 3223 | module_bug_finalize(info->hdr, info->sechdrs, mod); |
3223 | - list_add_rcu(&mod->list, &modules); | |
3224 | + | |
3225 | + /* Mark state as coming so strong_try_module_get() ignores us, | |
3226 | + * but kallsyms etc. can see us. */ | |
3227 | + mod->state = MODULE_STATE_COMING; | |
3228 | + | |
3224 | 3229 | mutex_unlock(&module_mutex); |
3225 | 3230 | |
3226 | 3231 | /* Module is ready to execute: parsing args may do that. */ |
3227 | 3232 | err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, |
3228 | 3233 | -32768, 32767, &ddebug_dyndbg_module_param_cb); |
3229 | 3234 | if (err < 0) |
3230 | - goto unlink; | |
3235 | + goto bug_cleanup; | |
3231 | 3236 | |
3232 | 3237 | /* Link in to syfs. */ |
3233 | 3238 | err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); |
3234 | 3239 | if (err < 0) |
3235 | - goto unlink; | |
3240 | + goto bug_cleanup; | |
3236 | 3241 | |
3237 | 3242 | /* Get rid of temporary copy. */ |
3238 | 3243 | free_copy(info); |
3239 | 3244 | |
3240 | 3245 | |
3241 | 3246 | |
... | ... | @@ -3242,16 +3247,13 @@ |
3242 | 3247 | |
3243 | 3248 | return do_init_module(mod); |
3244 | 3249 | |
3245 | - unlink: | |
3250 | + bug_cleanup: | |
3251 | + /* module_bug_cleanup needs module_mutex protection */ | |
3246 | 3252 | mutex_lock(&module_mutex); |
3247 | - /* Unlink carefully: kallsyms could be walking list. */ | |
3248 | - list_del_rcu(&mod->list); | |
3249 | 3253 | module_bug_cleanup(mod); |
3250 | - wake_up_all(&module_wq); | |
3251 | - ddebug: | |
3252 | - dynamic_debug_remove(info->debug); | |
3253 | - unlock: | |
3254 | 3254 | mutex_unlock(&module_mutex); |
3255 | + ddebug_cleanup: | |
3256 | + dynamic_debug_remove(info->debug); | |
3255 | 3257 | synchronize_sched(); |
3256 | 3258 | kfree(mod->args); |
3257 | 3259 | free_arch_cleanup: |
... | ... | @@ -3260,6 +3262,12 @@ |
3260 | 3262 | free_modinfo(mod); |
3261 | 3263 | free_unload: |
3262 | 3264 | module_unload_free(mod); |
3265 | + unlink_mod: | |
3266 | + mutex_lock(&module_mutex); | |
3267 | + /* Unlink carefully: kallsyms could be walking list. */ | |
3268 | + list_del_rcu(&mod->list); | |
3269 | + wake_up_all(&module_wq); | |
3270 | + mutex_unlock(&module_mutex); | |
3263 | 3271 | free_module: |
3264 | 3272 | module_deallocate(mod, info); |
3265 | 3273 | free_copy: |