kernel/bpf/stackmap.c

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/elf.h>
#include <linux/pagemap.h>
#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include "percpu_freelist.h"

#define STACK_CREATE_FLAG_MASK					\
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
	 BPF_F_STACK_BUILD_ID)

struct stack_map_bucket {
	struct pcpu_freelist_node fnode;
	u32 hash;
	u32 nr;
	u64 data[];
};

struct bpf_stack_map {
	struct bpf_map map;
	void *elems;
	struct pcpu_freelist freelist;
	u32 n_buckets;
	struct stack_map_bucket *buckets[];
};

/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
	struct irq_work irq_work;
	struct mm_struct *mm;
};

static void do_up_read(struct irq_work *entry)
{
	struct stack_map_irq_work *work;

	if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
		return;

	work = container_of(entry, struct stack_map_irq_work, irq_work);
	mmap_read_unlock_non_owner(work->mm);
}

static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);

static inline bool stack_map_use_build_id(struct bpf_map *map)
{
	return (map->map_flags & BPF_F_STACK_BUILD_ID);
}

static inline int stack_map_data_size(struct bpf_map *map)
{
	return stack_map_use_build_id(map) ?
		sizeof(struct bpf_stack_build_id) : sizeof(u64);
}

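/*
 * All bucket elements are allocated up front and kept on a per-cpu
 * freelist, so __bpf_get_stackid() can take a bucket without allocating
 * memory (it may run in NMI context via perf events).
 */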
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
	int err;

	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
					 smap->map.numa_node);
	if (!smap->elems)
		return -ENOMEM;

	err = pcpu_freelist_init(&smap->freelist);
	if (err)
		goto free_elems;

	pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
			       smap->map.max_entries);
	return 0;

free_elems:
	bpf_map_area_free(smap->elems);
	return err;
}

/* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
	u32 value_size = attr->value_size;
	struct bpf_stack_map *smap;
	struct bpf_map_memory mem;
	u64 cost, n_buckets;
	int err;

	if (!bpf_capable())
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    value_size < 8 || value_size % 8)
		return ERR_PTR(-EINVAL);

	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
		if (value_size % sizeof(struct bpf_stack_build_id) ||
		    value_size / sizeof(struct bpf_stack_build_id)
		    > sysctl_perf_event_max_stack)
			return ERR_PTR(-EINVAL);
	} else if (value_size / 8 > sysctl_perf_event_max_stack)
		return ERR_PTR(-EINVAL);

	/* hash table size must be power of 2 */
	n_buckets = roundup_pow_of_two(attr->max_entries);

	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
	err = bpf_map_charge_init(&mem, cost);
	if (err)
		return ERR_PTR(err);

	smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
	if (!smap) {
		bpf_map_charge_finish(&mem);
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&smap->map, attr);
	smap->map.value_size = value_size;
	smap->n_buckets = n_buckets;

	err = get_callchain_buffers(sysctl_perf_event_max_stack);
	if (err)
		goto free_charge;

	err = prealloc_elems_and_freelist(smap);
	if (err)
		goto put_buffers;
	bpf_map_charge_move(&smap->map.memory, &mem);
	return &smap->map;

put_buffers:
	put_callchain_buffers();
free_charge:
	bpf_map_charge_finish(&mem);
	bpf_map_area_free(smap);
	return ERR_PTR(err);
}

#define BPF_BUILD_ID 3
/*
 * Parse build id from the note segment. This logic can be shared between
 * 32-bit and 64-bit systems, because Elf32_Nhdr and Elf64_Nhdr are
 * identical.
 */
static inline int stack_map_parse_build_id(void *page_addr,
					   unsigned char *build_id,
					   void *note_start,
					   Elf32_Word note_size)
{
	Elf32_Word note_offs = 0, new_offs;

	/* check for overflow */
	if (note_start < page_addr || note_start + note_size < note_start)
		return -EINVAL;

	/* only supports note that fits in the first page */
	if (note_start + note_size > page_addr + PAGE_SIZE)
		return -EINVAL;

	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);

		if (nhdr->n_type == BPF_BUILD_ID &&
		    nhdr->n_namesz == sizeof("GNU") &&
		    nhdr->n_descsz > 0 &&
		    nhdr->n_descsz <= BPF_BUILD_ID_SIZE) {
			memcpy(build_id,
			       note_start + note_offs +
			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
			       nhdr->n_descsz);
			memset(build_id + nhdr->n_descsz, 0,
			       BPF_BUILD_ID_SIZE - nhdr->n_descsz);
			return 0;
		}
		new_offs = note_offs + sizeof(Elf32_Nhdr) +
			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
		if (new_offs <= note_offs)	/* overflow */
			break;
		note_offs = new_offs;
	}
	return -EINVAL;
}

/* Parse build ID from 32-bit ELF */
static int stack_map_get_build_id_32(void *page_addr,
				     unsigned char *build_id)
{
	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
	Elf32_Phdr *phdr;
	int i;

	/* only supports phdr that fits in one page */
	if (ehdr->e_phnum >
	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
		return -EINVAL;

	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));

	for (i = 0; i < ehdr->e_phnum; ++i) {
		if (phdr[i].p_type == PT_NOTE &&
		    !stack_map_parse_build_id(page_addr, build_id,
					      page_addr + phdr[i].p_offset,
					      phdr[i].p_filesz))
			return 0;
	}
	return -EINVAL;
}

/* Parse build ID from 64-bit ELF */
static int stack_map_get_build_id_64(void *page_addr,
				     unsigned char *build_id)
{
	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
	Elf64_Phdr *phdr;
	int i;

	/* only supports phdr that fits in one page */
	if (ehdr->e_phnum >
	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
		return -EINVAL;

	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));

	for (i = 0; i < ehdr->e_phnum; ++i) {
		if (phdr[i].p_type == PT_NOTE &&
		    !stack_map_parse_build_id(page_addr, build_id,
					      page_addr + phdr[i].p_offset,
					      phdr[i].p_filesz))
			return 0;
	}
	return -EINVAL;
}

/* Parse build ID of ELF file mapped to vma */
static int stack_map_get_build_id(struct vm_area_struct *vma,
				  unsigned char *build_id)
{
	Elf32_Ehdr *ehdr;
	struct page *page;
	void *page_addr;
	int ret;

	/* only works for page backed storage */
	if (!vma->vm_file)
		return -EINVAL;

	page = find_get_page(vma->vm_file->f_mapping, 0);
	if (!page)
		return -EFAULT;	/* page not mapped */

	ret = -EINVAL;
	page_addr = kmap_atomic(page);
	ehdr = (Elf32_Ehdr *)page_addr;

	/* compare magic: 0x7f "ELF" */
	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
		goto out;

	/* only support executable file and shared object file */
	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
		goto out;

	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
		ret = stack_map_get_build_id_32(page_addr, build_id);
	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
		ret = stack_map_get_build_id_64(page_addr, build_id);
out:
	kunmap_atomic(page_addr);
	put_page(page);
	return ret;
}

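/*
 * Convert an array of instruction pointers (ips) into
 * (build_id, file offset) pairs in id_offs. Entries that cannot be
 * resolved fall back to BPF_STACK_BUILD_ID_IP with the raw ip.
 */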
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
					  u64 *ips, u32 trace_nr, bool user)
{
	int i;
	struct vm_area_struct *vma;
	bool irq_work_busy = false;
	struct stack_map_irq_work *work = NULL;

	if (irqs_disabled()) {
		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
			work = this_cpu_ptr(&up_read_work);
			if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
				/* cannot queue more up_read, fallback */
				irq_work_busy = true;
			}
		} else {
			/*
			 * PREEMPT_RT does not allow trylocking the mmap sem
			 * in interrupt-disabled context. Force the fallback
			 * code.
			 */
			irq_work_busy = true;
		}
	}

	/*
	 * We cannot do up_read() when the irq is disabled, because of
	 * the risk of deadlocking with rq_lock. To do build_id lookup
	 * when the irqs are disabled, we need to run up_read() in irq_work.
	 * We use a percpu variable to do the irq_work. If the irq_work is
	 * already used by another lookup, we fall back to report ips.
	 *
	 * The same fallback is used for kernel stacks (!user) on a stackmap
	 * with build_id.
	 */
	if (!user || !current || !current->mm || irq_work_busy ||
	    !mmap_read_trylock_non_owner(current->mm)) {
		/* cannot access current->mm, fall back to ips */
		for (i = 0; i < trace_nr; i++) {
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
			memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
		}
		return;
	}

	for (i = 0; i < trace_nr; i++) {
		vma = find_vma(current->mm, ips[i]);
		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
			/* per entry fall back to ips */
			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
			id_offs[i].ip = ips[i];
			memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE);
			continue;
		}
		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT)
			+ ips[i] - vma->vm_start;
		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
	}

	if (!work) {
		mmap_read_unlock_non_owner(current->mm);
	} else {
		work->mm = current->mm;
		irq_work_queue(&work->irq_work);
	}
}

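/*
 * Capture a callchain for a (possibly sleeping) task, as opposed to
 * get_perf_callchain(), which unwinds the current context.
 */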
static struct perf_callchain_entry *
get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
{
#ifdef CONFIG_STACKTRACE
	struct perf_callchain_entry *entry;
	int rctx;

	entry = get_callchain_entry(&rctx);
	if (!entry)
		return NULL;

	entry->nr = init_nr +
		stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
				     sysctl_perf_event_max_stack - init_nr, 0);

	/* stack_trace_save_tsk() works on unsigned long array, while
	 * perf_callchain_entry uses u64 array. For 32-bit systems, it is
	 * necessary to fix this mismatch.
	 */
	if (__BITS_PER_LONG != 64) {
		unsigned long *from = (unsigned long *) entry->ip;
		u64 *to = entry->ip;
		int i;

		/* copy data from the end to avoid using extra buffer */
		for (i = entry->nr - 1; i >= (int)init_nr; i--)
			to[i] = (u64)(from[i]);
	}

	put_callchain_entry(rctx);

	return entry;
#else /* CONFIG_STACKTRACE */
	return NULL;
#endif
}

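/*
 * Hash the trace and store it in the bucket selected by the low bits of
 * the hash. Identical traces map to the same id; on a bucket collision,
 * the old trace is either kept (-EEXIST) or replaced, depending on
 * BPF_F_REUSE_STACKID.
 */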
static long __bpf_get_stackid(struct bpf_map *map,
			      struct perf_callchain_entry *trace, u64 flags)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	u32 hash, id, trace_nr, trace_len;
	bool user = flags & BPF_F_USER_STACK;
	u64 *ips;
	bool hash_matches;

	/* get_perf_callchain() guarantees that trace->nr >= init_nr
	 * and trace->nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
	 */
	trace_nr = trace->nr - init_nr;

	if (trace_nr <= skip)
		/* skipping more than usable stack trace */
		return -EFAULT;

	trace_nr -= skip;
	trace_len = trace_nr * sizeof(u64);
	ips = trace->ip + skip + init_nr;
	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
	id = hash & (smap->n_buckets - 1);
	bucket = READ_ONCE(smap->buckets[id]);

	hash_matches = bucket && bucket->hash == hash;
	/* fast cmp */
	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
		return id;

	if (stack_map_use_build_id(map)) {
		/* for build_id+offset, pop a bucket before slow cmp */
		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		new_bucket->nr = trace_nr;
		stack_map_get_build_id_offset(
			(struct bpf_stack_build_id *)new_bucket->data,
			ips, trace_nr, user);
		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return id;
		}
		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
			return -EEXIST;
		}
	} else {
		if (hash_matches && bucket->nr == trace_nr &&
		    memcmp(bucket->data, ips, trace_len) == 0)
			return id;
		if (bucket && !(flags & BPF_F_REUSE_STACKID))
			return -EEXIST;

		new_bucket = (struct stack_map_bucket *)
			pcpu_freelist_pop(&smap->freelist);
		if (unlikely(!new_bucket))
			return -ENOMEM;
		memcpy(new_bucket->data, ips, trace_len);
	}

	new_bucket->hash = hash;
	new_bucket->nr = trace_nr;

	old_bucket = xchg(&smap->buckets[id], new_bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return id;
}

BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags)
{
	u32 max_depth = map->value_size / stack_map_data_size(map);
	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	trace = get_perf_callchain(regs, init_nr, kernel, user,
				   sysctl_perf_event_max_stack, false, false);

	if (unlikely(!trace))
		/* couldn't fetch the stack trace */
		return -EFAULT;

	return __bpf_get_stackid(map, trace, flags);
}

const struct bpf_func_proto bpf_get_stackid_proto = {
	.func		= bpf_get_stackid,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

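/* Count the leading kernel frames: in a mixed callchain, user addresses
 * only start after the PERF_CONTEXT_USER marker.
 */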
static __u64 count_kernel_ip(struct perf_callchain_entry *trace)
{
	__u64 nr_kernel = 0;

	while (nr_kernel < trace->nr) {
		if (trace->ip[nr_kernel] == PERF_CONTEXT_USER)
			break;
		nr_kernel++;
	}
	return nr_kernel;
}

BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
	   struct bpf_map *, map, u64, flags)
{
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	__u64 nr_kernel;
	int ret;

	/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return bpf_get_stackid((unsigned long)(ctx->regs),
				       (unsigned long) map, flags, 0, 0);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
		return -EINVAL;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	trace = ctx->data->callchain;
	if (unlikely(!trace))
		return -EFAULT;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		ret = __bpf_get_stackid(map, trace, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			return -EFAULT;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		ret = __bpf_get_stackid(map, trace, flags);
	}
	return ret;
}

const struct bpf_func_proto bpf_get_stackid_proto_pe = {
	.func		= bpf_get_stackid_pe,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

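/*
 * Core of bpf_get_stack() and friends: copy up to "size" bytes of the
 * trace into "buf", either as raw ips or as bpf_stack_build_id records
 * when BPF_F_USER_BUILD_ID is set. Returns the number of bytes copied
 * or a negative error; "buf" is zeroed on error.
 */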
static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
			    struct perf_callchain_entry *trace_in,
			    void *buf, u32 size, u64 flags)
{
	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
	bool user = flags & BPF_F_USER_STACK;
	struct perf_callchain_entry *trace;
	bool kernel = !user;
	int err = -EINVAL;
	u64 *ips;

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;
	if (kernel && user_build_id)
		goto clear;

	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
					    : sizeof(u64);
	if (unlikely(size % elem_size))
		goto clear;

	/* cannot get valid user stack for task without user_mode regs */
	if (task && user && !user_mode(regs))
		goto err_fault;

	num_elem = size / elem_size;
	if (sysctl_perf_event_max_stack < num_elem)
		init_nr = 0;
	else
		init_nr = sysctl_perf_event_max_stack - num_elem;

	if (trace_in)
		trace = trace_in;
	else if (kernel && task)
		trace = get_callchain_entry_for_task(task, init_nr);
	else
		trace = get_perf_callchain(regs, init_nr, kernel, user,
					   sysctl_perf_event_max_stack,
					   false, false);
	if (unlikely(!trace))
		goto err_fault;

	trace_nr = trace->nr - init_nr;
	if (trace_nr < skip)
		goto err_fault;

	trace_nr -= skip;
	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
	copy_len = trace_nr * elem_size;
	ips = trace->ip + skip + init_nr;
	if (user && user_build_id)
		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
	else
		memcpy(buf, ips, copy_len);

	if (size > copy_len)
		memset(buf + copy_len, 0, size - copy_len);

	return copy_len;

err_fault:
	err = -EFAULT;
clear:
	memset(buf, 0, size);
	return err;
}

BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
	   u64, flags)
{
	return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
}

const struct bpf_func_proto bpf_get_stack_proto = {
	.func		= bpf_get_stack,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

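/* Like bpf_get_stack(), but for a given task instead of the current
 * context; the task's saved pt_regs are used as the starting point.
 */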
BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
	   u32, size, u64, flags)
{
	struct pt_regs *regs = task_pt_regs(task);

	return __bpf_get_stack(regs, task, NULL, buf, size, flags);
}

BTF_ID_LIST_SINGLE(bpf_get_task_stack_btf_ids, struct, task_struct)

const struct bpf_func_proto bpf_get_task_stack_proto = {
	.func		= bpf_get_task_stack,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_BTF_ID,
	.arg1_btf_id	= &bpf_get_task_stack_btf_ids[0],
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

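/* perf_event variant of bpf_get_stack(): reuse the callchain perf already
 * collected (__PERF_SAMPLE_CALLCHAIN_EARLY) instead of unwinding again;
 * for user stacks, fold the leading kernel frames into the skip count.
 */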
BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size, u64, flags)
{
	struct pt_regs *regs = (struct pt_regs *)(ctx->regs);
	struct perf_event *event = ctx->event;
	struct perf_callchain_entry *trace;
	bool kernel, user;
	int err = -EINVAL;
	__u64 nr_kernel;

	if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
		return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);

	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
			       BPF_F_USER_BUILD_ID)))
		goto clear;

	user = flags & BPF_F_USER_STACK;
	kernel = !user;

	err = -EFAULT;
	trace = ctx->data->callchain;
	if (unlikely(!trace))
		goto clear;

	nr_kernel = count_kernel_ip(trace);

	if (kernel) {
		__u64 nr = trace->nr;

		trace->nr = nr_kernel;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);

		/* restore nr */
		trace->nr = nr;
	} else { /* user */
		u64 skip = flags & BPF_F_SKIP_FIELD_MASK;

		skip += nr_kernel;
		if (skip > BPF_F_SKIP_FIELD_MASK)
			goto clear;

		flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
		err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
	}
	return err;

clear:
	memset(buf, 0, size);
	return err;
}

const struct bpf_func_proto bpf_get_stack_proto_pe = {
	.func		= bpf_get_stack_pe,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-EOPNOTSUPP);
}

/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *bucket, *old_bucket;
	u32 id = *(u32 *)key, trace_len;

	if (unlikely(id >= smap->n_buckets))
		return -ENOENT;

	bucket = xchg(&smap->buckets[id], NULL);
	if (!bucket)
		return -ENOENT;

	trace_len = bucket->nr * stack_map_data_size(map);
	memcpy(value, bucket->data, trace_len);
	memset(value + trace_len, 0, map->value_size - trace_len);

	old_bucket = xchg(&smap->buckets[id], bucket);
	if (old_bucket)
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
	return 0;
}

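/* Called from syscall: find the id of the next allocated bucket */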
static int stack_map_get_next_key(struct bpf_map *map, void *key,
				  void *next_key)
{
	struct bpf_stack_map *smap = container_of(map,
						  struct bpf_stack_map, map);
	u32 id;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!key) {
		id = 0;
	} else {
		id = *(u32 *)key;
		if (id >= smap->n_buckets || !smap->buckets[id])
			id = 0;
		else
			id++;
	}

	while (id < smap->n_buckets && !smap->buckets[id])
		id++;

	if (id >= smap->n_buckets)
		return -ENOENT;

	*(u32 *)next_key = id;
	return 0;
}

static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	return -EINVAL;
}

/* Called from syscall or from eBPF program */
static int stack_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
	struct stack_map_bucket *old_bucket;
	u32 id = *(u32 *)key;

	if (unlikely(id >= smap->n_buckets))
		return -E2BIG;

	old_bucket = xchg(&smap->buckets[id], NULL);
	if (old_bucket) {
		pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
		return 0;
	} else {
		return -ENOENT;
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void stack_map_free(struct bpf_map *map)
{
	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);

	bpf_map_area_free(smap->elems);
	pcpu_freelist_destroy(&smap->freelist);
	bpf_map_area_free(smap);
	put_callchain_buffers();
}

static int stack_trace_map_btf_id;
const struct bpf_map_ops stack_trace_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = stack_map_alloc,
	.map_free = stack_map_free,
	.map_get_next_key = stack_map_get_next_key,
	.map_lookup_elem = stack_map_lookup_elem,
	.map_update_elem = stack_map_update_elem,
	.map_delete_elem = stack_map_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_btf_name = "bpf_stack_map",
	.map_btf_id = &stack_trace_map_btf_id,
};

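/* Initialize the per-cpu irq_work used by do_up_read() at boot */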
static int __init stack_map_init(void)
{
	int cpu;
	struct stack_map_irq_work *work;

	for_each_possible_cpu(cpu) {
		work = per_cpu_ptr(&up_read_work, cpu);
		init_irq_work(&work->irq_work, do_up_read);
	}
	return 0;
}
subsys_initcall(stack_map_init);