Commit 99fc8d424bc5d803fe92cad56c068fe64e73747a

Authored by Jesse Barnes
Committed by Ingo Molnar
1 parent 03252919b7

x86, 32-bit: trim memory not covered by wb mtrrs

On some machines, buggy BIOSes don't properly setup WB MTRRs to cover all
available RAM, meaning the last few megs (or even gigs) of memory will be
marked uncached.  Since Linux tends to allocate from high memory addresses
first, this causes the machine to be unusably slow as soon as the kernel
starts really using memory (i.e., right around init time).

This patch works around the problem by scanning the MTRRs at boot and
figuring out whether the current end_pfn value (setup by early e820 code)
goes beyond the highest WB MTRR range, and if so, trimming it to match.  A
fairly obnoxious KERN_WARNING is printed too, letting the user know that
not all of their memory is available due to a likely BIOS bug.

Something similar could be done on i386 if needed, but the boot ordering
would be slightly different, since the MTRR code on i386 depends on the
boot_cpu_data structure being setup.

This patch also fixes a bug in the previous version that caused the code to
run on non-Intel machines (AMD machines apparently don't need it, and it is
untested on other non-Intel machines, so it is best kept off for them).

Further enhancements and fixes from:

  Yinghai Lu <Yinghai.Lu@Sun.COM>
  Andi Kleen <ak@suse.de>

Signed-off-by: Jesse Barnes <jesse.barnes@intel.com>
Tested-by: Justin Piszcz <jpiszcz@lucidpixels.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

Showing 8 changed files with 140 additions and 39 deletions Side-by-side Diff

Documentation/kernel-parameters.txt
... ... @@ -570,6 +570,12 @@
570 570 See drivers/char/README.epca and
571 571 Documentation/digiepca.txt.
572 572  
  573 + disable_mtrr_trim [X86-64, Intel only]
  574 + By default the kernel will trim any uncacheable
  575 + memory out of your available memory pool based on
  576 + MTRR settings. This parameter disables that behavior,
  577 + possibly causing your machine to run very slowly.
  578 +
573 579 dmasound= [HW,OSS] Sound subsystem buffers
574 580  
575 581 dscc4.setup= [NET]
arch/x86/kernel/bugs_64.c
... ... @@ -13,7 +13,6 @@
13 13 void __init check_bugs(void)
14 14 {
15 15 identify_cpu(&boot_cpu_data);
16   - mtrr_bp_init();
17 16 #if !defined(CONFIG_SMP)
18 17 printk("CPU: ");
19 18 print_cpu_info(&boot_cpu_data);
arch/x86/kernel/cpu/mtrr/generic.c
... ... @@ -14,7 +14,7 @@
14 14 #include "mtrr.h"
15 15  
16 16 struct mtrr_state {
17   - struct mtrr_var_range *var_ranges;
  17 + struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
18 18 mtrr_type fixed_ranges[NUM_FIXED_RANGES];
19 19 unsigned char enabled;
20 20 unsigned char have_fixed;
... ... @@ -86,12 +86,6 @@
86 86 struct mtrr_var_range *vrs;
87 87 unsigned lo, dummy;
88 88  
89   - if (!mtrr_state.var_ranges) {
90   - mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
91   - GFP_KERNEL);
92   - if (!mtrr_state.var_ranges)
93   - return;
94   - }
95 89 vrs = mtrr_state.var_ranges;
96 90  
97 91 rdmsr(MTRRcap_MSR, lo, dummy);
arch/x86/kernel/cpu/mtrr/if.c
... ... @@ -11,10 +11,6 @@
11 11 #include <asm/mtrr.h>
12 12 #include "mtrr.h"
13 13  
14   -/* RED-PEN: this is accessed without any locking */
15   -extern unsigned int *usage_table;
16   -
17   -
18 14 #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
19 15  
20 16 static const char *const mtrr_strings[MTRR_NUM_TYPES] =
... ... @@ -397,7 +393,7 @@
397 393 for (i = 0; i < max; i++) {
398 394 mtrr_if->get(i, &base, &size, &type);
399 395 if (size == 0)
400   - usage_table[i] = 0;
  396 + mtrr_usage_table[i] = 0;
401 397 else {
402 398 if (size < (0x100000 >> PAGE_SHIFT)) {
403 399 /* less than 1MB */
... ... @@ -411,7 +407,7 @@
411 407 len += seq_printf(seq,
412 408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
413 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
414   - mtrr_attrib_to_str(type), usage_table[i]);
  410 + mtrr_attrib_to_str(type), mtrr_usage_table[i]);
415 411 }
416 412 }
417 413 return 0;
arch/x86/kernel/cpu/mtrr/main.c
... ... @@ -38,8 +38,8 @@
38 38 #include <linux/cpu.h>
39 39 #include <linux/mutex.h>
40 40  
  41 +#include <asm/e820.h>
41 42 #include <asm/mtrr.h>
42   -
43 43 #include <asm/uaccess.h>
44 44 #include <asm/processor.h>
45 45 #include <asm/msr.h>
... ... @@ -47,7 +47,7 @@
47 47  
48 48 u32 num_var_ranges = 0;
49 49  
50   -unsigned int *usage_table;
  50 +unsigned int mtrr_usage_table[MAX_VAR_RANGES];
51 51 static DEFINE_MUTEX(mtrr_mutex);
52 52  
53 53 u64 size_or_mask, size_and_mask;
54 54  
... ... @@ -121,13 +121,8 @@
121 121 int i, max;
122 122  
123 123 max = num_var_ranges;
124   - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
125   - == NULL) {
126   - printk(KERN_ERR "mtrr: could not allocate\n");
127   - return;
128   - }
129 124 for (i = 0; i < max; i++)
130   - usage_table[i] = 1;
  125 + mtrr_usage_table[i] = 1;
131 126 }
132 127  
133 128 struct set_mtrr_data {
... ... @@ -383,7 +378,7 @@
383 378 goto out;
384 379 }
385 380 if (increment)
386   - ++usage_table[i];
  381 + ++mtrr_usage_table[i];
387 382 error = i;
388 383 goto out;
389 384 }
390 385  
391 386  
... ... @@ -391,15 +386,15 @@
391 386 i = mtrr_if->get_free_region(base, size, replace);
392 387 if (i >= 0) {
393 388 set_mtrr(i, base, size, type);
394   - if (likely(replace < 0))
395   - usage_table[i] = 1;
396   - else {
397   - usage_table[i] = usage_table[replace];
  389 + if (likely(replace < 0)) {
  390 + mtrr_usage_table[i] = 1;
  391 + } else {
  392 + mtrr_usage_table[i] = mtrr_usage_table[replace];
398 393 if (increment)
399   - usage_table[i]++;
  394 + mtrr_usage_table[i]++;
400 395 if (unlikely(replace != i)) {
401 396 set_mtrr(replace, 0, 0, 0);
402   - usage_table[replace] = 0;
  397 + mtrr_usage_table[replace] = 0;
403 398 }
404 399 }
405 400 } else
406 401  
... ... @@ -529,11 +524,11 @@
529 524 printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
530 525 goto out;
531 526 }
532   - if (usage_table[reg] < 1) {
  527 + if (mtrr_usage_table[reg] < 1) {
533 528 printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
534 529 goto out;
535 530 }
536   - if (--usage_table[reg] < 1)
  531 + if (--mtrr_usage_table[reg] < 1)
537 532 set_mtrr(reg, 0, 0, 0);
538 533 error = reg;
539 534 out:
540 535  
541 536  
... ... @@ -593,17 +588,12 @@
593 588 unsigned long lsize;
594 589 };
595 590  
596   -static struct mtrr_value * mtrr_state;
  591 +static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
597 592  
598 593 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
599 594 {
600 595 int i;
601   - int size = num_var_ranges * sizeof(struct mtrr_value);
602 596  
603   - mtrr_state = kzalloc(size,GFP_ATOMIC);
604   - if (!mtrr_state)
605   - return -ENOMEM;
606   -
607 597 for (i = 0; i < num_var_ranges; i++) {
608 598 mtrr_if->get(i,
609 599 &mtrr_state[i].lbase,
... ... @@ -624,7 +614,6 @@
624 614 mtrr_state[i].lsize,
625 615 mtrr_state[i].ltype);
626 616 }
627   - kfree(mtrr_state);
628 617 return 0;
629 618 }
630 619  
... ... @@ -635,6 +624,109 @@
635 624 .resume = mtrr_restore,
636 625 };
637 626  
  627 +#ifdef CONFIG_X86_64
  628 +static int disable_mtrr_trim;
  629 +
  630 +static int __init disable_mtrr_trim_setup(char *str)
  631 +{
  632 + disable_mtrr_trim = 1;
  633 + return 0;
  634 +}
  635 +early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
  636 +
  637 +/*
  638 + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB
  639 + * for memory >4GB. Check for that here.
  640 + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't
  641 + * apply to are wrong, but so far we don't know of any such case in the wild.
  642 + */
  643 +#define Tom2Enabled (1U << 21)
  644 +#define Tom2ForceMemTypeWB (1U << 22)
  645 +
  646 +static __init int amd_special_default_mtrr(unsigned long end_pfn)
  647 +{
  648 + u32 l, h;
  649 +
  650 + /* Doesn't apply to memory < 4GB */
  651 + if (end_pfn <= (0xffffffff >> PAGE_SHIFT))
  652 + return 0;
  653 + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
  654 + return 0;
  655 + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
  656 + return 0;
  657 + /* In case some hypervisor doesn't pass SYSCFG through */
  658 + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
  659 + return 0;
  660 + /*
  661 + * Memory between 4GB and top of mem is forced WB by this magic bit.
  662 + * Reserved before K8RevF, but should be zero there.
  663 + */
  664 + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) ==
  665 + (Tom2Enabled | Tom2ForceMemTypeWB))
  666 + return 1;
  667 + return 0;
  668 +}
  669 +
  670 +/**
  671 + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
  672 + *
  673 + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
  674 + * memory configurations. This routine checks that the highest MTRR matches
  675 + * the end of memory, to make sure the MTRRs having a write back type cover
  676 + * all of the memory the kernel is intending to use. If not, it'll trim any
  677 + * memory off the end by adjusting end_pfn, removing it from the kernel's
  678 + * allocation pools, warning the user with an obnoxious message.
  679 + */
  680 +int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
  681 +{
  682 + unsigned long i, base, size, highest_addr = 0, def, dummy;
  683 + mtrr_type type;
  684 + u64 trim_start, trim_size;
  685 +
  686 + /*
  687 + * Make sure we only trim uncachable memory on machines that
  688 + * support the Intel MTRR architecture:
  689 + */
  690 + rdmsr(MTRRdefType_MSR, def, dummy);
  691 + def &= 0xff;
  692 + if (!is_cpu(INTEL) || disable_mtrr_trim || def != MTRR_TYPE_UNCACHABLE)
  693 + return 0;
  694 +
  695 + /* Find highest cached pfn */
  696 + for (i = 0; i < num_var_ranges; i++) {
  697 + mtrr_if->get(i, &base, &size, &type);
  698 + if (type != MTRR_TYPE_WRBACK)
  699 + continue;
  700 + base <<= PAGE_SHIFT;
  701 + size <<= PAGE_SHIFT;
  702 + if (highest_addr < base + size)
  703 + highest_addr = base + size;
  704 + }
  705 +
  706 + if (amd_special_default_mtrr(end_pfn))
  707 + return 0;
  708 +
  709 + if ((highest_addr >> PAGE_SHIFT) < end_pfn) {
  710 + printk(KERN_WARNING "***************\n");
  711 + printk(KERN_WARNING "**** WARNING: likely BIOS bug\n");
  712 + printk(KERN_WARNING "**** MTRRs don't cover all of "
  713 + "memory, trimmed %ld pages\n", end_pfn -
  714 + (highest_addr >> PAGE_SHIFT));
  715 + printk(KERN_WARNING "***************\n");
  716 +
  717 + printk(KERN_INFO "update e820 for mtrr\n");
  718 + trim_start = highest_addr;
  719 + trim_size = end_pfn;
  720 + trim_size <<= PAGE_SHIFT;
  721 + trim_size -= trim_start;
  722 + add_memory_region(trim_start, trim_size, E820_RESERVED);
  723 + update_e820();
  724 + return 1;
  725 + }
  726 +
  727 + return 0;
  728 +}
  729 +#endif
638 730  
639 731 /**
640 732 * mtrr_bp_init - initialize mtrrs on the boot CPU
arch/x86/kernel/cpu/mtrr/mtrr.h
... ... @@ -12,6 +12,7 @@
12 12 #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
13 13  
14 14 #define NUM_FIXED_RANGES 88
  15 +#define MAX_VAR_RANGES 256
15 16 #define MTRRfix64K_00000_MSR 0x250
16 17 #define MTRRfix16K_80000_MSR 0x258
17 18 #define MTRRfix16K_A0000_MSR 0x259
... ... @@ -31,6 +32,8 @@
31 32 /* In the Intel processor's MTRR interface, the MTRR type is always held in
32 33 an 8 bit field: */
33 34 typedef u8 mtrr_type;
  35 +
  36 +extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
34 37  
35 38 struct mtrr_ops {
36 39 u32 vendor;
arch/x86/kernel/setup_64.c
... ... @@ -310,6 +310,13 @@
310 310 * we are rounding upwards:
311 311 */
312 312 end_pfn = e820_end_of_ram();
  313 + /* update e820 for memory not covered by WB MTRRs */
  314 + mtrr_bp_init();
  315 + if (mtrr_trim_uncached_memory(end_pfn)) {
  316 + e820_register_active_regions(0, 0, -1UL);
  317 + end_pfn = e820_end_of_ram();
  318 + }
  319 +
313 320 num_physpages = end_pfn;
314 321  
315 322 check_efer();
include/asm-x86/mtrr.h
... ... @@ -97,6 +97,7 @@
97 97 extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
98 98 extern void mtrr_ap_init(void);
99 99 extern void mtrr_bp_init(void);
  100 +extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
100 101 # else
101 102 #define mtrr_save_fixed_ranges(arg) do {} while (0)
102 103 #define mtrr_save_state() do {} while (0)
... ... @@ -120,7 +121,10 @@
120 121 {
121 122 return -ENODEV;
122 123 }
123   -
  124 +static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
  125 +{
  126 + return 0;
  127 +}
124 128 static __inline__ void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) {;}
125 129  
126 130 #define mtrr_ap_init() do {} while (0)