Commit c4fd308ed62f292518363ea9c6c2adb3c2d95f9d
Exists in master and in 39 other branches
Merge branch 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-pat-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86, pat: Update the page flags for memtype atomically instead of using memtype_lock
  x86, pat: In rbt_memtype_check_insert(), update new->type only if valid
  x86, pat: Migrate to rbtree only backend for pat memtype management
  x86, pat: Preparatory changes in pat.c for bigger rbtree change
  rbtree: Add support for augmented rbtrees
Showing 8 changed files Side-by-side Diff
Documentation/rbtree.txt
... | ... | @@ -189,4 +189,63 @@ |
189 | 189 | struct rb_node *node; |
190 | 190 | for (node = rb_first(&mytree); node; node = rb_next(node)) |
191 | 191 | printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); |
192 | + | |
193 | +Support for Augmented rbtrees | |
194 | +----------------------------- | |
195 | + | |
196 | +Augmented rbtree is an rbtree with "some" additional data stored in each node. | |
197 | +This data can be used to augment some new functionality to rbtree. | |
198 | +Augmented rbtree is an optional feature built on top of basic rbtree | |
199 | +infrastructure. An rbtree user who wants this feature initializes the augment | 
200 | +callback function in the rb_root. | 
201 | + | |
202 | +This callback function will be called from rbtree core routines whenever | |
203 | +a node has a change in one or both of its children. It is the responsibility | |
204 | +of the callback function to recalculate the additional data that is in the | |
205 | +rb node using new children information. Note that if this new additional | |
206 | +data affects the parent node's additional data, then callback function has | |
207 | +to handle it and do the recursive updates. | |
208 | + | |
209 | + | |
210 | +Interval tree is an example of augmented rb tree. Reference - | |
211 | +"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. | |
212 | +More details about interval trees: | |
213 | + | |
214 | +Classical rbtree has a single key and it cannot be directly used to store | |
215 | +interval ranges like [lo:hi] and do a quick lookup for any overlap with a new | |
216 | +lo:hi or to find whether there is an exact match for a new lo:hi. | |
217 | + | |
218 | +However, rbtree can be augmented to store such interval ranges in a structured | |
219 | +way making it possible to do efficient lookup and exact match. | |
220 | + | |
221 | +This "extra information" stored in each node is the maximum hi | |
222 | +(max_hi) value among all the nodes that are its descendants. This | 
223 | +information can be maintained at each node just by looking at the node | 
224 | +and its immediate children. And this will be used in O(log n) lookup | |
225 | +for lowest match (lowest start address among all possible matches) | |
226 | +with something like: | |
227 | + | |
228 | +find_lowest_match(lo, hi, node) | |
229 | +{ | |
230 | + lowest_match = NULL; | |
231 | + while (node) { | |
232 | + if (max_hi(node->left) > lo) { | |
233 | + // Lowest overlap if any must be on left side | |
234 | + node = node->left; | |
235 | + } else if (overlap(lo, hi, node)) { | |
236 | + lowest_match = node; | |
237 | + break; | |
238 | + } else if (lo > node->lo) { | |
239 | + // Lowest overlap if any must be on right side | |
240 | + node = node->right; | |
241 | + } else { | |
242 | + break; | |
243 | + } | |
244 | + } | |
245 | + return lowest_match; | |
246 | +} | |
247 | + | |
248 | +Finding exact match will be to first find lowest match and then to follow | |
249 | +successor nodes looking for exact match, until the start of a node is beyond | |
250 | +the hi value we are looking for. |
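The exact-match walk described in the closing lines of this hunk can be sketched in the same pseudocode style as find_lowest_match() above; successor() below stands for an in-order rb_next()-style step and is purely illustrative:

find_exact_match(lo, hi)
{
	node = find_lowest_match(lo, hi, root);
	while (node && node->lo < hi) {
		if (node->lo == lo && node->hi == hi)
			return node;		// exact match found
		node = successor(node);		// next node in start-address order
	}
	return NULL;			// no exact match
}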
arch/x86/include/asm/cacheflush.h
... | ... | @@ -44,9 +44,6 @@ |
44 | 44 | memcpy(dst, src, len); |
45 | 45 | } |
46 | 46 | |
47 | -#define PG_WC PG_arch_1 | |
48 | -PAGEFLAG(WC, WC) | |
49 | - | |
50 | 47 | #ifdef CONFIG_X86_PAT |
51 | 48 | /* |
52 | 49 | * X86 PAT uses page flags WC and Uncached together to keep track of |
53 | 50 | |
54 | 51 | |
55 | 52 | |
56 | 53 | |
... | ... | @@ -55,16 +52,24 @@ |
55 | 52 | * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not |
56 | 53 | * been changed from its default (value of -1 used to denote this). |
57 | 54 | * Note we do not support _PAGE_CACHE_UC here. |
58 | - * | |
59 | - * Caller must hold memtype_lock for atomicity. | |
60 | 55 | */ |
56 | + | |
57 | +#define _PGMT_DEFAULT 0 | |
58 | +#define _PGMT_WC (1UL << PG_arch_1) | |
59 | +#define _PGMT_UC_MINUS (1UL << PG_uncached) | |
60 | +#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) | |
61 | +#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) | |
62 | +#define _PGMT_CLEAR_MASK (~_PGMT_MASK) | |
63 | + | |
61 | 64 | static inline unsigned long get_page_memtype(struct page *pg) |
62 | 65 | { |
63 | - if (!PageUncached(pg) && !PageWC(pg)) | |
66 | + unsigned long pg_flags = pg->flags & _PGMT_MASK; | |
67 | + | |
68 | + if (pg_flags == _PGMT_DEFAULT) | |
64 | 69 | return -1; |
65 | - else if (!PageUncached(pg) && PageWC(pg)) | |
70 | + else if (pg_flags == _PGMT_WC) | |
66 | 71 | return _PAGE_CACHE_WC; |
67 | - else if (PageUncached(pg) && !PageWC(pg)) | |
72 | + else if (pg_flags == _PGMT_UC_MINUS) | |
68 | 73 | return _PAGE_CACHE_UC_MINUS; |
69 | 74 | else |
70 | 75 | return _PAGE_CACHE_WB; |
71 | 76 | |
72 | 77 | |
73 | 78 | |
74 | 79 | |
75 | 80 | |
... | ... | @@ -72,25 +77,26 @@ |
72 | 77 | |
73 | 78 | static inline void set_page_memtype(struct page *pg, unsigned long memtype) |
74 | 79 | { |
80 | + unsigned long memtype_flags = _PGMT_DEFAULT; | |
81 | + unsigned long old_flags; | |
82 | + unsigned long new_flags; | |
83 | + | |
75 | 84 | switch (memtype) { |
76 | 85 | case _PAGE_CACHE_WC: |
77 | - ClearPageUncached(pg); | |
78 | - SetPageWC(pg); | |
86 | + memtype_flags = _PGMT_WC; | |
79 | 87 | break; |
80 | 88 | case _PAGE_CACHE_UC_MINUS: |
81 | - SetPageUncached(pg); | |
82 | - ClearPageWC(pg); | |
89 | + memtype_flags = _PGMT_UC_MINUS; | |
83 | 90 | break; |
84 | 91 | case _PAGE_CACHE_WB: |
85 | - SetPageUncached(pg); | |
86 | - SetPageWC(pg); | |
92 | + memtype_flags = _PGMT_WB; | |
87 | 93 | break; |
88 | - default: | |
89 | - case -1: | |
90 | - ClearPageUncached(pg); | |
91 | - ClearPageWC(pg); | |
92 | - break; | |
93 | 94 | } |
95 | + | |
96 | + do { | |
97 | + old_flags = pg->flags; | |
98 | + new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; | |
99 | + } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); | |
94 | 100 | } |
95 | 101 | #else |
96 | 102 | static inline unsigned long get_page_memtype(struct page *pg) { return -1; } |
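The rewritten set_page_memtype() above no longer needs memtype_lock: it retries cmpxchg() on page->flags until the two memtype bits are updated without interference from a concurrent writer. A minimal userspace sketch of the same retry pattern, using the GCC/Clang __atomic builtins instead of the kernel's cmpxchg() and with made-up bit values rather than the real PG_arch_1/PG_uncached positions:

#include <stdio.h>

#define PGMT_MASK	0x3UL		/* two illustrative memtype bits */
#define PGMT_WC		0x1UL		/* stand-in for the WC encoding */

static unsigned long page_flags;	/* stands in for page->flags */

static void set_memtype_bits(unsigned long memtype_flags)
{
	unsigned long old_flags, new_flags;

	do {
		old_flags = __atomic_load_n(&page_flags, __ATOMIC_RELAXED);
		new_flags = (old_flags & ~PGMT_MASK) | memtype_flags;
		/* retry if another writer changed page_flags in between */
	} while (!__atomic_compare_exchange_n(&page_flags, &old_flags, new_flags,
					      0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}

int main(void)
{
	set_memtype_bits(PGMT_WC);
	printf("flags = %#lx\n", page_flags);	/* prints flags = 0x1 */
	return 0;
}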
arch/x86/mm/Makefile
arch/x86/mm/pat.c
... | ... | @@ -30,6 +30,8 @@ |
30 | 30 | #include <asm/pat.h> |
31 | 31 | #include <asm/io.h> |
32 | 32 | |
33 | +#include "pat_internal.h" | |
34 | + | |
33 | 35 | #ifdef CONFIG_X86_PAT |
34 | 36 | int __read_mostly pat_enabled = 1; |
35 | 37 | |
36 | 38 | |
37 | 39 | |
... | ... | @@ -53,19 +55,15 @@ |
53 | 55 | #endif |
54 | 56 | |
55 | 57 | |
56 | -static int debug_enable; | |
58 | +int pat_debug_enable; | |
57 | 59 | |
58 | 60 | static int __init pat_debug_setup(char *str) |
59 | 61 | { |
60 | - debug_enable = 1; | |
62 | + pat_debug_enable = 1; | |
61 | 63 | return 0; |
62 | 64 | } |
63 | 65 | __setup("debugpat", pat_debug_setup); |
64 | 66 | |
65 | -#define dprintk(fmt, arg...) \ | |
66 | - do { if (debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) | |
67 | - | |
68 | - | |
69 | 67 | static u64 __read_mostly boot_pat_state; |
70 | 68 | |
71 | 69 | enum { |
72 | 70 | |
... | ... | @@ -132,86 +130,9 @@ |
132 | 130 | |
133 | 131 | #undef PAT |
134 | 132 | |
135 | -static char *cattr_name(unsigned long flags) | |
136 | -{ | |
137 | - switch (flags & _PAGE_CACHE_MASK) { | |
138 | - case _PAGE_CACHE_UC: return "uncached"; | |
139 | - case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; | |
140 | - case _PAGE_CACHE_WB: return "write-back"; | |
141 | - case _PAGE_CACHE_WC: return "write-combining"; | |
142 | - default: return "broken"; | |
143 | - } | |
144 | -} | |
133 | +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */ | |
145 | 134 | |
146 | 135 | /* |
147 | - * The global memtype list keeps track of memory type for specific | |
148 | - * physical memory areas. Conflicting memory types in different | |
149 | - * mappings can cause CPU cache corruption. To avoid this we keep track. | |
150 | - * | |
151 | - * The list is sorted based on starting address and can contain multiple | |
152 | - * entries for each address (this allows reference counting for overlapping | |
153 | - * areas). All the aliases have the same cache attributes of course. | |
154 | - * Zero attributes are represented as holes. | |
155 | - * | |
156 | - * The data structure is a list that is also organized as an rbtree | |
157 | - * sorted on the start address of memtype range. | |
158 | - * | |
159 | - * memtype_lock protects both the linear list and rbtree. | |
160 | - */ | |
161 | - | |
162 | -struct memtype { | |
163 | - u64 start; | |
164 | - u64 end; | |
165 | - unsigned long type; | |
166 | - struct list_head nd; | |
167 | - struct rb_node rb; | |
168 | -}; | |
169 | - | |
170 | -static struct rb_root memtype_rbroot = RB_ROOT; | |
171 | -static LIST_HEAD(memtype_list); | |
172 | -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ | |
173 | - | |
174 | -static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) | |
175 | -{ | |
176 | - struct rb_node *node = root->rb_node; | |
177 | - struct memtype *last_lower = NULL; | |
178 | - | |
179 | - while (node) { | |
180 | - struct memtype *data = container_of(node, struct memtype, rb); | |
181 | - | |
182 | - if (data->start < start) { | |
183 | - last_lower = data; | |
184 | - node = node->rb_right; | |
185 | - } else if (data->start > start) { | |
186 | - node = node->rb_left; | |
187 | - } else | |
188 | - return data; | |
189 | - } | |
190 | - | |
191 | - /* Will return NULL if there is no entry with its start <= start */ | |
192 | - return last_lower; | |
193 | -} | |
194 | - | |
195 | -static void memtype_rb_insert(struct rb_root *root, struct memtype *data) | |
196 | -{ | |
197 | - struct rb_node **new = &(root->rb_node); | |
198 | - struct rb_node *parent = NULL; | |
199 | - | |
200 | - while (*new) { | |
201 | - struct memtype *this = container_of(*new, struct memtype, rb); | |
202 | - | |
203 | - parent = *new; | |
204 | - if (data->start <= this->start) | |
205 | - new = &((*new)->rb_left); | |
206 | - else if (data->start > this->start) | |
207 | - new = &((*new)->rb_right); | |
208 | - } | |
209 | - | |
210 | - rb_link_node(&data->rb, parent, new); | |
211 | - rb_insert_color(&data->rb, root); | |
212 | -} | |
213 | - | |
214 | -/* | |
215 | 136 | * Does intersection of PAT memory type and MTRR memory type and returns |
216 | 137 | * the resulting memory type as PAT understands it. |
217 | 138 | * (Type in pat and mtrr will not have same value) |
... | ... | @@ -237,33 +158,6 @@ |
237 | 158 | return req_type; |
238 | 159 | } |
239 | 160 | |
240 | -static int | |
241 | -chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) | |
242 | -{ | |
243 | - if (new->type != entry->type) { | |
244 | - if (type) { | |
245 | - new->type = entry->type; | |
246 | - *type = entry->type; | |
247 | - } else | |
248 | - goto conflict; | |
249 | - } | |
250 | - | |
251 | - /* check overlaps with more than one entry in the list */ | |
252 | - list_for_each_entry_continue(entry, &memtype_list, nd) { | |
253 | - if (new->end <= entry->start) | |
254 | - break; | |
255 | - else if (new->type != entry->type) | |
256 | - goto conflict; | |
257 | - } | |
258 | - return 0; | |
259 | - | |
260 | - conflict: | |
261 | - printk(KERN_INFO "%s:%d conflicting memory types " | |
262 | - "%Lx-%Lx %s<->%s\n", current->comm, current->pid, new->start, | |
263 | - new->end, cattr_name(new->type), cattr_name(entry->type)); | |
264 | - return -EBUSY; | |
265 | -} | |
266 | - | |
267 | 161 | static int pat_pagerange_is_ram(unsigned long start, unsigned long end) |
268 | 162 | { |
269 | 163 | int ram_page = 0, not_rampage = 0; |
... | ... | @@ -296,8 +190,6 @@ |
296 | 190 | * Here we do two pass: |
297 | 191 | * - Find the memtype of all the pages in the range, look for any conflicts |
298 | 192 | * - In case of no conflicts, set the new memtype for pages in the range |
299 | - * | |
300 | - * Caller must hold memtype_lock for atomicity. | |
301 | 193 | */ |
302 | 194 | static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, |
303 | 195 | unsigned long *new_type) |
304 | 196 | |
... | ... | @@ -364,9 +256,8 @@ |
364 | 256 | int reserve_memtype(u64 start, u64 end, unsigned long req_type, |
365 | 257 | unsigned long *new_type) |
366 | 258 | { |
367 | - struct memtype *new, *entry; | |
259 | + struct memtype *new; | |
368 | 260 | unsigned long actual_type; |
369 | - struct list_head *where; | |
370 | 261 | int is_range_ram; |
371 | 262 | int err = 0; |
372 | 263 | |
373 | 264 | |
... | ... | @@ -404,9 +295,7 @@ |
404 | 295 | is_range_ram = pat_pagerange_is_ram(start, end); |
405 | 296 | if (is_range_ram == 1) { |
406 | 297 | |
407 | - spin_lock(&memtype_lock); | |
408 | 298 | err = reserve_ram_pages_type(start, end, req_type, new_type); |
409 | - spin_unlock(&memtype_lock); | |
410 | 299 | |
411 | 300 | return err; |
412 | 301 | } else if (is_range_ram < 0) { |
... | ... | @@ -423,42 +312,7 @@ |
423 | 312 | |
424 | 313 | spin_lock(&memtype_lock); |
425 | 314 | |
426 | - /* Search for existing mapping that overlaps the current range */ | |
427 | - where = NULL; | |
428 | - list_for_each_entry(entry, &memtype_list, nd) { | |
429 | - if (end <= entry->start) { | |
430 | - where = entry->nd.prev; | |
431 | - break; | |
432 | - } else if (start <= entry->start) { /* end > entry->start */ | |
433 | - err = chk_conflict(new, entry, new_type); | |
434 | - if (!err) { | |
435 | - dprintk("Overlap at 0x%Lx-0x%Lx\n", | |
436 | - entry->start, entry->end); | |
437 | - where = entry->nd.prev; | |
438 | - } | |
439 | - break; | |
440 | - } else if (start < entry->end) { /* start > entry->start */ | |
441 | - err = chk_conflict(new, entry, new_type); | |
442 | - if (!err) { | |
443 | - dprintk("Overlap at 0x%Lx-0x%Lx\n", | |
444 | - entry->start, entry->end); | |
445 | - | |
446 | - /* | |
447 | - * Move to right position in the linked | |
448 | - * list to add this new entry | |
449 | - */ | |
450 | - list_for_each_entry_continue(entry, | |
451 | - &memtype_list, nd) { | |
452 | - if (start <= entry->start) { | |
453 | - where = entry->nd.prev; | |
454 | - break; | |
455 | - } | |
456 | - } | |
457 | - } | |
458 | - break; | |
459 | - } | |
460 | - } | |
461 | - | |
315 | + err = rbt_memtype_check_insert(new, new_type); | |
462 | 316 | if (err) { |
463 | 317 | printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, " |
464 | 318 | "track %s, req %s\n", |
... | ... | @@ -469,13 +323,6 @@ |
469 | 323 | return err; |
470 | 324 | } |
471 | 325 | |
472 | - if (where) | |
473 | - list_add(&new->nd, where); | |
474 | - else | |
475 | - list_add_tail(&new->nd, &memtype_list); | |
476 | - | |
477 | - memtype_rb_insert(&memtype_rbroot, new); | |
478 | - | |
479 | 326 | spin_unlock(&memtype_lock); |
480 | 327 | |
481 | 328 | dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", |
... | ... | @@ -487,7 +334,6 @@ |
487 | 334 | |
488 | 335 | int free_memtype(u64 start, u64 end) |
489 | 336 | { |
490 | - struct memtype *entry, *saved_entry; | |
491 | 337 | int err = -EINVAL; |
492 | 338 | int is_range_ram; |
493 | 339 | |
494 | 340 | |
... | ... | @@ -501,9 +347,7 @@ |
501 | 347 | is_range_ram = pat_pagerange_is_ram(start, end); |
502 | 348 | if (is_range_ram == 1) { |
503 | 349 | |
504 | - spin_lock(&memtype_lock); | |
505 | 350 | err = free_ram_pages_type(start, end); |
506 | - spin_unlock(&memtype_lock); | |
507 | 351 | |
508 | 352 | return err; |
509 | 353 | } else if (is_range_ram < 0) { |
... | ... | @@ -511,46 +355,7 @@ |
511 | 355 | } |
512 | 356 | |
513 | 357 | spin_lock(&memtype_lock); |
514 | - | |
515 | - entry = memtype_rb_search(&memtype_rbroot, start); | |
516 | - if (unlikely(entry == NULL)) | |
517 | - goto unlock_ret; | |
518 | - | |
519 | - /* | |
520 | - * Saved entry points to an entry with start same or less than what | |
521 | - * we searched for. Now go through the list in both directions to look | |
522 | - * for the entry that matches with both start and end, with list stored | |
523 | - * in sorted start address | |
524 | - */ | |
525 | - saved_entry = entry; | |
526 | - list_for_each_entry_from(entry, &memtype_list, nd) { | |
527 | - if (entry->start == start && entry->end == end) { | |
528 | - rb_erase(&entry->rb, &memtype_rbroot); | |
529 | - list_del(&entry->nd); | |
530 | - kfree(entry); | |
531 | - err = 0; | |
532 | - break; | |
533 | - } else if (entry->start > start) { | |
534 | - break; | |
535 | - } | |
536 | - } | |
537 | - | |
538 | - if (!err) | |
539 | - goto unlock_ret; | |
540 | - | |
541 | - entry = saved_entry; | |
542 | - list_for_each_entry_reverse(entry, &memtype_list, nd) { | |
543 | - if (entry->start == start && entry->end == end) { | |
544 | - rb_erase(&entry->rb, &memtype_rbroot); | |
545 | - list_del(&entry->nd); | |
546 | - kfree(entry); | |
547 | - err = 0; | |
548 | - break; | |
549 | - } else if (entry->start < start) { | |
550 | - break; | |
551 | - } | |
552 | - } | |
553 | -unlock_ret: | |
358 | + err = rbt_memtype_erase(start, end); | |
554 | 359 | spin_unlock(&memtype_lock); |
555 | 360 | |
556 | 361 | if (err) { |
557 | 362 | |
... | ... | @@ -583,10 +388,8 @@ |
583 | 388 | |
584 | 389 | if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { |
585 | 390 | struct page *page; |
586 | - spin_lock(&memtype_lock); | |
587 | 391 | page = pfn_to_page(paddr >> PAGE_SHIFT); |
588 | 392 | rettype = get_page_memtype(page); |
589 | - spin_unlock(&memtype_lock); | |
590 | 393 | /* |
591 | 394 | * -1 from get_page_memtype() implies RAM page is in its |
592 | 395 | * default state and not reserved, and hence of type WB |
... | ... | @@ -599,7 +402,7 @@ |
599 | 402 | |
600 | 403 | spin_lock(&memtype_lock); |
601 | 404 | |
602 | - entry = memtype_rb_search(&memtype_rbroot, paddr); | |
405 | + entry = rbt_memtype_lookup(paddr); | |
603 | 406 | if (entry != NULL) |
604 | 407 | rettype = entry->type; |
605 | 408 | else |
606 | 409 | |
607 | 410 | |
608 | 411 | |
609 | 412 | |
610 | 413 | |
... | ... | @@ -936,29 +739,25 @@ |
936 | 739 | |
937 | 740 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) |
938 | 741 | |
939 | -/* get Nth element of the linked list */ | |
940 | 742 | static struct memtype *memtype_get_idx(loff_t pos) |
941 | 743 | { |
942 | - struct memtype *list_node, *print_entry; | |
943 | - int i = 1; | |
744 | + struct memtype *print_entry; | |
745 | + int ret; | |
944 | 746 | |
945 | - print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); | |
747 | + print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL); | |
946 | 748 | if (!print_entry) |
947 | 749 | return NULL; |
948 | 750 | |
949 | 751 | spin_lock(&memtype_lock); |
950 | - list_for_each_entry(list_node, &memtype_list, nd) { | |
951 | - if (pos == i) { | |
952 | - *print_entry = *list_node; | |
953 | - spin_unlock(&memtype_lock); | |
954 | - return print_entry; | |
955 | - } | |
956 | - ++i; | |
957 | - } | |
752 | + ret = rbt_memtype_copy_nth_element(print_entry, pos); | |
958 | 753 | spin_unlock(&memtype_lock); |
959 | - kfree(print_entry); | |
960 | 754 | |
961 | - return NULL; | |
755 | + if (!ret) { | |
756 | + return print_entry; | |
757 | + } else { | |
758 | + kfree(print_entry); | |
759 | + return NULL; | |
760 | + } | |
962 | 761 | } |
963 | 762 | |
964 | 763 | static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) |
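With the linked list removed, the non-RAM path of reserve_memtype() collapses to allocate, lock, rbt_memtype_check_insert(), unlock. A condensed sketch of that flow (MTRR interaction, RAM-page handling and the PAT-disabled case are omitted; this is not the literal kernel function):

/* Sketch only: the real reserve_memtype() does more, see the hunks above. */
static int reserve_memtype_sketch(u64 start, u64 end, unsigned long req_type,
				  unsigned long *new_type)
{
	struct memtype *new;
	int err;

	new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->start = start;
	new->end = end;
	new->type = req_type;

	spin_lock(&memtype_lock);
	err = rbt_memtype_check_insert(new, new_type);	/* conflict check + insert */
	spin_unlock(&memtype_lock);

	if (err)
		kfree(new);		/* never linked into the tree */
	return err;
}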
arch/x86/mm/pat_internal.h
1 | +#ifndef __PAT_INTERNAL_H_ | |
2 | +#define __PAT_INTERNAL_H_ | |
3 | + | |
4 | +extern int pat_debug_enable; | |
5 | + | |
6 | +#define dprintk(fmt, arg...) \ | |
7 | + do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0) | |
8 | + | |
9 | +struct memtype { | |
10 | + u64 start; | |
11 | + u64 end; | |
12 | + u64 subtree_max_end; | |
13 | + unsigned long type; | |
14 | + struct rb_node rb; | |
15 | +}; | |
16 | + | |
17 | +static inline char *cattr_name(unsigned long flags) | |
18 | +{ | |
19 | + switch (flags & _PAGE_CACHE_MASK) { | |
20 | + case _PAGE_CACHE_UC: return "uncached"; | |
21 | + case _PAGE_CACHE_UC_MINUS: return "uncached-minus"; | |
22 | + case _PAGE_CACHE_WB: return "write-back"; | |
23 | + case _PAGE_CACHE_WC: return "write-combining"; | |
24 | + default: return "broken"; | |
25 | + } | |
26 | +} | |
27 | + | |
28 | +#ifdef CONFIG_X86_PAT | |
29 | +extern int rbt_memtype_check_insert(struct memtype *new, | |
30 | + unsigned long *new_type); | |
31 | +extern int rbt_memtype_erase(u64 start, u64 end); | |
32 | +extern struct memtype *rbt_memtype_lookup(u64 addr); | |
33 | +extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); | |
34 | +#else | |
35 | +static inline int rbt_memtype_check_insert(struct memtype *new, | |
36 | + unsigned long *new_type) | |
37 | +{ return 0; } | |
38 | +static inline int rbt_memtype_erase(u64 start, u64 end) | |
39 | +{ return 0; } | |
40 | +static inline struct memtype *rbt_memtype_lookup(u64 addr) | |
41 | +{ return NULL; } | |
42 | +static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) | |
43 | +{ return 0; } | |
44 | +#endif | |
45 | + | |
46 | +#endif /* __PAT_INTERNAL_H_ */ |
arch/x86/mm/pat_rbtree.c
1 | +/* | |
2 | + * Handle caching attributes in page tables (PAT) | |
3 | + * | |
4 | + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> | |
5 | + * Suresh B Siddha <suresh.b.siddha@intel.com> | |
6 | + * | |
7 | + * Interval tree (augmented rbtree) used to store the PAT memory type | |
8 | + * reservations. | |
9 | + */ | |
10 | + | |
11 | +#include <linux/seq_file.h> | |
12 | +#include <linux/debugfs.h> | |
13 | +#include <linux/kernel.h> | |
14 | +#include <linux/module.h> | |
15 | +#include <linux/rbtree.h> | |
16 | +#include <linux/sched.h> | |
17 | +#include <linux/gfp.h> | |
18 | + | |
19 | +#include <asm/pgtable.h> | |
20 | +#include <asm/pat.h> | |
21 | + | |
22 | +#include "pat_internal.h" | |
23 | + | |
24 | +/* | |
25 | + * The memtype tree keeps track of memory type for specific | |
26 | + * physical memory areas. Without proper tracking, conflicting memory | |
27 | + * types in different mappings can cause CPU cache corruption. | |
28 | + * | |
29 | + * The tree is an interval tree (augmented rbtree) with tree ordered | |
30 | + * on starting address. Tree can contain multiple entries for | |
31 | + * different regions which overlap. All the aliases have the same | |
32 | + * cache attributes of course. | |
33 | + * | |
34 | + * memtype_lock protects the rbtree. | |
35 | + */ | |
36 | + | |
37 | +static void memtype_rb_augment_cb(struct rb_node *node); | |
38 | +static struct rb_root memtype_rbroot = RB_AUGMENT_ROOT(&memtype_rb_augment_cb); | |
39 | + | |
40 | +static int is_node_overlap(struct memtype *node, u64 start, u64 end) | |
41 | +{ | |
42 | + if (node->start >= end || node->end <= start) | |
43 | + return 0; | |
44 | + | |
45 | + return 1; | |
46 | +} | |
47 | + | |
48 | +static u64 get_subtree_max_end(struct rb_node *node) | |
49 | +{ | |
50 | + u64 ret = 0; | |
51 | + if (node) { | |
52 | + struct memtype *data = container_of(node, struct memtype, rb); | |
53 | + ret = data->subtree_max_end; | |
54 | + } | |
55 | + return ret; | |
56 | +} | |
57 | + | |
58 | +/* Update 'subtree_max_end' for a node, based on node and its children */ | |
59 | +static void update_node_max_end(struct rb_node *node) | |
60 | +{ | |
61 | + struct memtype *data; | |
62 | + u64 max_end, child_max_end; | |
63 | + | |
64 | + if (!node) | |
65 | + return; | |
66 | + | |
67 | + data = container_of(node, struct memtype, rb); | |
68 | + max_end = data->end; | |
69 | + | |
70 | + child_max_end = get_subtree_max_end(node->rb_right); | |
71 | + if (child_max_end > max_end) | |
72 | + max_end = child_max_end; | |
73 | + | |
74 | + child_max_end = get_subtree_max_end(node->rb_left); | |
75 | + if (child_max_end > max_end) | |
76 | + max_end = child_max_end; | |
77 | + | |
78 | + data->subtree_max_end = max_end; | |
79 | +} | |
80 | + | |
81 | +/* Update 'subtree_max_end' for a node and all its ancestors */ | |
82 | +static void update_path_max_end(struct rb_node *node) | |
83 | +{ | |
84 | + u64 old_max_end, new_max_end; | |
85 | + | |
86 | + while (node) { | |
87 | + struct memtype *data = container_of(node, struct memtype, rb); | |
88 | + | |
89 | + old_max_end = data->subtree_max_end; | |
90 | + update_node_max_end(node); | |
91 | + new_max_end = data->subtree_max_end; | |
92 | + | |
93 | + if (new_max_end == old_max_end) | |
94 | + break; | |
95 | + | |
96 | + node = rb_parent(node); | |
97 | + } | |
98 | +} | |
99 | + | |
100 | +/* Find the first (lowest start addr) overlapping range from rb tree */ | |
101 | +static struct memtype *memtype_rb_lowest_match(struct rb_root *root, | |
102 | + u64 start, u64 end) | |
103 | +{ | |
104 | + struct rb_node *node = root->rb_node; | |
105 | + struct memtype *last_lower = NULL; | |
106 | + | |
107 | + while (node) { | |
108 | + struct memtype *data = container_of(node, struct memtype, rb); | |
109 | + | |
110 | + if (get_subtree_max_end(node->rb_left) > start) { | |
111 | + /* Lowest overlap if any must be on left side */ | |
112 | + node = node->rb_left; | |
113 | + } else if (is_node_overlap(data, start, end)) { | |
114 | + last_lower = data; | |
115 | + break; | |
116 | + } else if (start >= data->start) { | |
117 | + /* Lowest overlap if any must be on right side */ | |
118 | + node = node->rb_right; | |
119 | + } else { | |
120 | + break; | |
121 | + } | |
122 | + } | |
123 | + return last_lower; /* Returns NULL if there is no overlap */ | |
124 | +} | |
125 | + | |
126 | +static struct memtype *memtype_rb_exact_match(struct rb_root *root, | |
127 | + u64 start, u64 end) | |
128 | +{ | |
129 | + struct memtype *match; | |
130 | + | |
131 | + match = memtype_rb_lowest_match(root, start, end); | |
132 | + while (match != NULL && match->start < end) { | |
133 | + struct rb_node *node; | |
134 | + | |
135 | + if (match->start == start && match->end == end) | |
136 | + return match; | |
137 | + | |
138 | + node = rb_next(&match->rb); | |
139 | + if (node) | |
140 | + match = container_of(node, struct memtype, rb); | |
141 | + else | |
142 | + match = NULL; | |
143 | + } | |
144 | + | |
145 | + return NULL; /* Returns NULL if there is no exact match */ | |
146 | +} | |
147 | + | |
148 | +static int memtype_rb_check_conflict(struct rb_root *root, | |
149 | + u64 start, u64 end, | |
150 | + unsigned long reqtype, unsigned long *newtype) | |
151 | +{ | |
152 | + struct rb_node *node; | |
153 | + struct memtype *match; | |
154 | + int found_type = reqtype; | |
155 | + | |
156 | + match = memtype_rb_lowest_match(&memtype_rbroot, start, end); | |
157 | + if (match == NULL) | |
158 | + goto success; | |
159 | + | |
160 | + if (match->type != found_type && newtype == NULL) | |
161 | + goto failure; | |
162 | + | |
163 | + dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); | |
164 | + found_type = match->type; | |
165 | + | |
166 | + node = rb_next(&match->rb); | |
167 | + while (node) { | |
168 | + match = container_of(node, struct memtype, rb); | |
169 | + | |
170 | + if (match->start >= end) /* Checked all possible matches */ | |
171 | + goto success; | |
172 | + | |
173 | + if (is_node_overlap(match, start, end) && | |
174 | + match->type != found_type) { | |
175 | + goto failure; | |
176 | + } | |
177 | + | |
178 | + node = rb_next(&match->rb); | |
179 | + } | |
180 | +success: | |
181 | + if (newtype) | |
182 | + *newtype = found_type; | |
183 | + | |
184 | + return 0; | |
185 | + | |
186 | +failure: | |
187 | + printk(KERN_INFO "%s:%d conflicting memory types " | |
188 | + "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start, | |
189 | + end, cattr_name(found_type), cattr_name(match->type)); | |
190 | + return -EBUSY; | |
191 | +} | |
192 | + | |
193 | +static void memtype_rb_augment_cb(struct rb_node *node) | |
194 | +{ | |
195 | + if (node) | |
196 | + update_path_max_end(node); | |
197 | +} | |
198 | + | |
199 | +static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) | |
200 | +{ | |
201 | + struct rb_node **node = &(root->rb_node); | |
202 | + struct rb_node *parent = NULL; | |
203 | + | |
204 | + while (*node) { | |
205 | + struct memtype *data = container_of(*node, struct memtype, rb); | |
206 | + | |
207 | + parent = *node; | |
208 | + if (newdata->start <= data->start) | |
209 | + node = &((*node)->rb_left); | |
210 | + else if (newdata->start > data->start) | |
211 | + node = &((*node)->rb_right); | |
212 | + } | |
213 | + | |
214 | + rb_link_node(&newdata->rb, parent, node); | |
215 | + rb_insert_color(&newdata->rb, root); | |
216 | +} | |
217 | + | |
218 | +int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) | |
219 | +{ | |
220 | + int err = 0; | |
221 | + | |
222 | + err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end, | |
223 | + new->type, ret_type); | |
224 | + | |
225 | + if (!err) { | |
226 | + if (ret_type) | |
227 | + new->type = *ret_type; | |
228 | + | |
229 | + memtype_rb_insert(&memtype_rbroot, new); | |
230 | + } | |
231 | + return err; | |
232 | +} | |
233 | + | |
234 | +int rbt_memtype_erase(u64 start, u64 end) | |
235 | +{ | |
236 | + struct memtype *data; | |
237 | + | |
238 | + data = memtype_rb_exact_match(&memtype_rbroot, start, end); | |
239 | + if (!data) | |
240 | + return -EINVAL; | |
241 | + | |
242 | + rb_erase(&data->rb, &memtype_rbroot); | |
243 | + return 0; | |
244 | +} | |
245 | + | |
246 | +struct memtype *rbt_memtype_lookup(u64 addr) | |
247 | +{ | |
248 | + struct memtype *data; | |
249 | + data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); | |
250 | + return data; | |
251 | +} | |
252 | + | |
253 | +#if defined(CONFIG_DEBUG_FS) | |
254 | +int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) | |
255 | +{ | |
256 | + struct rb_node *node; | |
257 | + int i = 1; | |
258 | + | |
259 | + node = rb_first(&memtype_rbroot); | |
260 | + while (node && pos != i) { | |
261 | + node = rb_next(node); | |
262 | + i++; | |
263 | + } | |
264 | + | |
265 | + if (node) { /* pos == i */ | |
266 | + struct memtype *this = container_of(node, struct memtype, rb); | |
267 | + *out = *this; | |
268 | + return 0; | |
269 | + } else { | |
270 | + return 1; | |
271 | + } | |
272 | +} | |
273 | +#endif |
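The pruning done by memtype_rb_lowest_match() relies only on each node caching the maximum end value of its subtree. A standalone userspace sketch of the same walk on a hand-built, hypothetical three-node tree (no balancing and no kernel types involved):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct itnode {
	uint64_t start, end;		/* interval [start, end) */
	uint64_t subtree_max_end;	/* max end over this node's subtree */
	struct itnode *left, *right;
};

static uint64_t subtree_max_end(struct itnode *n)
{
	return n ? n->subtree_max_end : 0;
}

static int overlaps(struct itnode *n, uint64_t start, uint64_t end)
{
	return !(n->start >= end || n->end <= start);
}

/* Same walk as memtype_rb_lowest_match(), on a plain pointer-linked tree */
static struct itnode *lowest_match(struct itnode *node, uint64_t start, uint64_t end)
{
	while (node) {
		if (subtree_max_end(node->left) > start)
			node = node->left;	/* lowest overlap, if any, is on the left */
		else if (overlaps(node, start, end))
			return node;
		else if (start >= node->start)
			node = node->right;	/* lowest overlap, if any, is on the right */
		else
			break;
	}
	return NULL;
}

int main(void)
{
	/* tree ordered on start:    [30,40)
	 *                          /       \
	 *                    [10,25)       [50,60)          */
	struct itnode a = { 10, 25, 25, NULL, NULL };
	struct itnode c = { 50, 60, 60, NULL, NULL };
	struct itnode b = { 30, 40, 60, &a, &c };	/* subtree_max_end = 60 */

	struct itnode *m = lowest_match(&b, 20, 35);
	if (m)	/* prints "lowest overlap: [10, 25)" */
		printf("lowest overlap: [%llu, %llu)\n",
		       (unsigned long long)m->start, (unsigned long long)m->end);
	return 0;
}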
include/linux/rbtree.h
... | ... | @@ -110,6 +110,7 @@ |
110 | 110 | struct rb_root |
111 | 111 | { |
112 | 112 | struct rb_node *rb_node; |
113 | + void (*augment_cb)(struct rb_node *node); | |
113 | 114 | }; |
114 | 115 | |
115 | 116 | |
... | ... | @@ -129,7 +130,9 @@ |
129 | 130 | rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; |
130 | 131 | } |
131 | 132 | |
132 | -#define RB_ROOT (struct rb_root) { NULL, } | |
133 | +#define RB_ROOT (struct rb_root) { NULL, NULL, } | |
134 | +#define RB_AUGMENT_ROOT(x) (struct rb_root) { NULL, x} | |
135 | + | |
133 | 136 | #define rb_entry(ptr, type, member) container_of(ptr, type, member) |
134 | 137 | |
135 | 138 | #define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) |
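Existing users are unaffected because RB_ROOT now zero-initializes augment_cb; an augmented user opts in per root. A hypothetical declaration (the names below are invented for illustration):

static void my_augment_cb(struct rb_node *node)
{
	/* recompute this node's cached value from the node and its children,
	 * walking up the ancestor chain if the value changed */
}

static struct rb_root plain_tree     = RB_ROOT;		/* augment_cb == NULL */
static struct rb_root augmented_tree = RB_AUGMENT_ROOT(&my_augment_cb);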
lib/rbtree.c
... | ... | @@ -44,6 +44,11 @@ |
44 | 44 | else |
45 | 45 | root->rb_node = right; |
46 | 46 | rb_set_parent(node, right); |
47 | + | |
48 | + if (root->augment_cb) { | |
49 | + root->augment_cb(node); | |
50 | + root->augment_cb(right); | |
51 | + } | |
47 | 52 | } |
48 | 53 | |
49 | 54 | static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) |
50 | 55 | |
... | ... | @@ -67,12 +72,20 @@ |
67 | 72 | else |
68 | 73 | root->rb_node = left; |
69 | 74 | rb_set_parent(node, left); |
75 | + | |
76 | + if (root->augment_cb) { | |
77 | + root->augment_cb(node); | |
78 | + root->augment_cb(left); | |
79 | + } | |
70 | 80 | } |
71 | 81 | |
72 | 82 | void rb_insert_color(struct rb_node *node, struct rb_root *root) |
73 | 83 | { |
74 | 84 | struct rb_node *parent, *gparent; |
75 | 85 | |
86 | + if (root->augment_cb) | |
87 | + root->augment_cb(node); | |
88 | + | |
76 | 89 | while ((parent = rb_parent(node)) && rb_is_red(parent)) |
77 | 90 | { |
78 | 91 | gparent = rb_parent(parent); |
79 | 92 | |
... | ... | @@ -227,12 +240,15 @@ |
227 | 240 | else |
228 | 241 | { |
229 | 242 | struct rb_node *old = node, *left; |
243 | + int old_parent_cb = 0; | |
244 | + int successor_parent_cb = 0; | |
230 | 245 | |
231 | 246 | node = node->rb_right; |
232 | 247 | while ((left = node->rb_left) != NULL) |
233 | 248 | node = left; |
234 | 249 | |
235 | 250 | if (rb_parent(old)) { |
251 | + old_parent_cb = 1; | |
236 | 252 | if (rb_parent(old)->rb_left == old) |
237 | 253 | rb_parent(old)->rb_left = node; |
238 | 254 | else |
239 | 255 | |
... | ... | @@ -247,8 +263,10 @@ |
247 | 263 | if (parent == old) { |
248 | 264 | parent = node; |
249 | 265 | } else { |
266 | + successor_parent_cb = 1; | |
250 | 267 | if (child) |
251 | 268 | rb_set_parent(child, parent); |
269 | + | |
252 | 270 | parent->rb_left = child; |
253 | 271 | |
254 | 272 | node->rb_right = old->rb_right; |
... | ... | @@ -259,6 +277,24 @@ |
259 | 277 | node->rb_left = old->rb_left; |
260 | 278 | rb_set_parent(old->rb_left, node); |
261 | 279 | |
280 | + if (root->augment_cb) { | |
281 | + /* | |
282 | + * Here, three different nodes can have new children. | |
283 | + * The parent of the successor node that was selected | |
284 | + * to replace the node to be erased. | |
285 | + * The node that is getting erased and is now replaced | |
286 | + * by its successor. | |
287 | + * The parent of the node getting erased-replaced. | |
288 | + */ | |
289 | + if (successor_parent_cb) | |
290 | + root->augment_cb(parent); | |
291 | + | |
292 | + root->augment_cb(node); | |
293 | + | |
294 | + if (old_parent_cb) | |
295 | + root->augment_cb(rb_parent(old)); | |
296 | + } | |
297 | + | |
262 | 298 | goto color; |
263 | 299 | } |
264 | 300 | |
265 | 301 | |
266 | 302 | |
... | ... | @@ -267,15 +303,19 @@ |
267 | 303 | |
268 | 304 | if (child) |
269 | 305 | rb_set_parent(child, parent); |
270 | - if (parent) | |
271 | - { | |
306 | + | |
307 | + if (parent) { | |
272 | 308 | if (parent->rb_left == node) |
273 | 309 | parent->rb_left = child; |
274 | 310 | else |
275 | 311 | parent->rb_right = child; |
276 | - } | |
277 | - else | |
312 | + | |
313 | + if (root->augment_cb) | |
314 | + root->augment_cb(parent); | |
315 | + | |
316 | + } else { | |
278 | 317 | root->rb_node = child; |
318 | + } | |
279 | 319 | |
280 | 320 | color: |
281 | 321 | if (color == RB_BLACK) |
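The hooks above only require that the callback bring a node's cached value back in sync with its current children, propagating upward when needed. Besides the PAT interval tree, a hypothetical subtree-size ("order statistics") augmentation could use the same hooks; struct counted_node and its early-exit walk below are illustrative, not part of this commit:

struct counted_node {
	struct rb_node rb;
	unsigned int subtree_size;	/* nodes in this subtree, including self */
};

static unsigned int subtree_size(struct rb_node *rb)
{
	return rb ? rb_entry(rb, struct counted_node, rb)->subtree_size : 0;
}

static void count_augment_cb(struct rb_node *rb)
{
	while (rb) {
		struct counted_node *n = rb_entry(rb, struct counted_node, rb);
		unsigned int new_size = 1 + subtree_size(rb->rb_left) +
					    subtree_size(rb->rb_right);

		if (new_size == n->subtree_size)
			break;			/* ancestors already consistent */
		n->subtree_size = new_size;
		rb = rb_parent(rb);
	}
}

static struct rb_root counted_tree = RB_AUGMENT_ROOT(&count_augment_cb);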