Commit 98eedc3a9dbf90cecb91093d2a7fa083942b7d13
Committed by
H. Peter Anvin
1 parent
574c44fa8f
Exists in
master
and in
6 other branches
Document the vDSO and add a reference parser
It turns out that parsing the vDSO is nontrivial if you don't already have an ELF dynamic loader around. So document it in Documentation/ABI and add a reference CC0-licenced parser. This code is dedicated to Go issue 1933: http://code.google.com/p/go/issues/detail?id=1933 Signed-off-by: Andy Lutomirski <luto@mit.edu> Link: http://lkml.kernel.org/r/a315a9514cd71bcf29436cc31e35aada21a5ff21.1310563276.git.luto@mit.edu Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Showing 3 changed files with 394 additions and 0 deletions Side-by-side Diff
Documentation/ABI/stable/vdso
1 | +On some architectures, when the kernel loads any userspace program it | |
2 | +maps an ELF DSO into that program's address space. This DSO is called | |
3 | +the vDSO and it often contains useful and highly-optimized alternatives | |
4 | +to real syscalls. | |
5 | + | |
6 | +These functions are called just like ordinary C functions according to | |
7 | +your platform's ABI. Call them from a sensible context. (For example, | |
8 | +if you set CS on x86 to something strange, the vDSO functions are | |
9 | +within their rights to crash.) In addition, if you pass a bad | |
10 | +pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT. | |
11 | + | |
12 | +To find the DSO, parse the auxiliary vector passed to the program's | |
13 | +entry point. The AT_SYSINFO_EHDR entry will point to the vDSO. | |
14 | + | |
15 | +The vDSO uses symbol versioning; whenever you request a symbol from the | |
16 | +vDSO, specify the version you are expecting. | |
17 | + | |
18 | +Programs that dynamically link to glibc will use the vDSO automatically. | |
19 | +Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c. | |
20 | + | |
21 | +Unless otherwise noted, the set of symbols with any given version and the | |
22 | +ABI of those symbols is considered stable. It may vary across architectures, | |
23 | +though. | |
24 | + | |
25 | +(As of this writing, this ABI documentation has been confirmed for x86_64. | |
26 | + The maintainers of the other vDSO-using architectures should confirm | |
27 | + that it is correct for their architecture.) |
Documentation/vDSO/parse_vdso.c
1 | +/* | |
2 | + * parse_vdso.c: Linux reference vDSO parser | |
3 | + * Written by Andrew Lutomirski, 2011. | |
4 | + * | |
5 | + * This code is meant to be linked in to various programs that run on Linux. | |
6 | + * As such, it is available with as few restrictions as possible. This file | |
7 | + * is licensed under the Creative Commons Zero License, version 1.0, | |
8 | + * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode | |
9 | + * | |
10 | + * The vDSO is a regular ELF DSO that the kernel maps into user space when | |
11 | + * it starts a program. It works equally well in statically and dynamically | |
12 | + * linked binaries. | |
13 | + * | |
14 | + * This code is tested on x86_64. In principle it should work on any 64-bit | |
15 | + * architecture that has a vDSO. | |
16 | + */ | |
17 | + | |
18 | +#include <stdbool.h> | |
19 | +#include <stdint.h> | |
20 | +#include <string.h> | |
21 | +#include <elf.h> | |
22 | + | |
23 | +/* | |
24 | + * To use this vDSO parser, first call one of the vdso_init_* functions. | |
25 | + * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR | |
26 | + * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv. | |
27 | + * Then call vdso_sym for each symbol you want. For example, to look up | |
28 | + * gettimeofday on x86_64, use: | |
29 | + * | |
30 | + * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday"); | |
31 | + * or | |
32 | + * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); | |
33 | + * | |
34 | + * vdso_sym will return 0 if the symbol doesn't exist or if the init function | |
35 | + * failed or was not called. vdso_sym is a little slow, so its return value | |
36 | + * should be cached. | |
37 | + * | |
38 | + * vdso_sym is threadsafe; the init functions are not. | |
39 | + * | |
40 | + * These are the prototypes: | |
41 | + */ | |
42 | +extern void vdso_init_from_auxv(void *auxv); | |
43 | +extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); | |
44 | +extern void *vdso_sym(const char *version, const char *name); | |
45 | + | |
46 | + | |
47 | +/* And here's the code. */ | |
48 | + | |
49 | +#ifndef __x86_64__ | |
50 | +# error Not yet ported to non-x86_64 architectures | |
51 | +#endif | |
52 | + | |
53 | +static struct vdso_info | |
54 | +{ | |
55 | + bool valid; | |
56 | + | |
57 | + /* Load information */ | |
58 | + uintptr_t load_addr; | |
59 | + uintptr_t load_offset; /* load_addr - recorded vaddr */ | |
60 | + | |
61 | + /* Symbol table */ | |
62 | + Elf64_Sym *symtab; | |
63 | + const char *symstrings; | |
64 | + Elf64_Word *bucket, *chain; | |
65 | + Elf64_Word nbucket, nchain; | |
66 | + | |
67 | + /* Version table */ | |
68 | + Elf64_Versym *versym; | |
69 | + Elf64_Verdef *verdef; | |
70 | +} vdso_info; | |
71 | + | |
72 | +/* Straight from the ELF specification. */ | |
73 | +static unsigned long elf_hash(const unsigned char *name) | |
74 | +{ | |
75 | + unsigned long h = 0, g; | |
76 | + while (*name) | |
77 | + { | |
78 | + h = (h << 4) + *name++; | |
79 | + if (g = h & 0xf0000000) | |
80 | + h ^= g >> 24; | |
81 | + h &= ~g; | |
82 | + } | |
83 | + return h; | |
84 | +} | |
85 | + | |
86 | +void vdso_init_from_sysinfo_ehdr(uintptr_t base) | |
87 | +{ | |
88 | + size_t i; | |
89 | + bool found_vaddr = false; | |
90 | + | |
91 | + vdso_info.valid = false; | |
92 | + | |
93 | + vdso_info.load_addr = base; | |
94 | + | |
95 | + Elf64_Ehdr *hdr = (Elf64_Ehdr*)base; | |
96 | + Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff); | |
97 | + Elf64_Dyn *dyn = 0; | |
98 | + | |
99 | + /* | |
100 | + * We need two things from the segment table: the load offset | |
101 | + * and the dynamic table. | |
102 | + */ | |
103 | + for (i = 0; i < hdr->e_phnum; i++) | |
104 | + { | |
105 | + if (pt[i].p_type == PT_LOAD && !found_vaddr) { | |
106 | + found_vaddr = true; | |
107 | + vdso_info.load_offset = base | |
108 | + + (uintptr_t)pt[i].p_offset | |
109 | + - (uintptr_t)pt[i].p_vaddr; | |
110 | + } else if (pt[i].p_type == PT_DYNAMIC) { | |
111 | + dyn = (Elf64_Dyn*)(base + pt[i].p_offset); | |
112 | + } | |
113 | + } | |
114 | + | |
115 | + if (!found_vaddr || !dyn) | |
116 | + return; /* Failed */ | |
117 | + | |
118 | + /* | |
119 | + * Fish out the useful bits of the dynamic table. | |
120 | + */ | |
121 | + Elf64_Word *hash = 0; | |
122 | + vdso_info.symstrings = 0; | |
123 | + vdso_info.symtab = 0; | |
124 | + vdso_info.versym = 0; | |
125 | + vdso_info.verdef = 0; | |
126 | + for (i = 0; dyn[i].d_tag != DT_NULL; i++) { | |
127 | + switch (dyn[i].d_tag) { | |
128 | + case DT_STRTAB: | |
129 | + vdso_info.symstrings = (const char *) | |
130 | + ((uintptr_t)dyn[i].d_un.d_ptr | |
131 | + + vdso_info.load_offset); | |
132 | + break; | |
133 | + case DT_SYMTAB: | |
134 | + vdso_info.symtab = (Elf64_Sym *) | |
135 | + ((uintptr_t)dyn[i].d_un.d_ptr | |
136 | + + vdso_info.load_offset); | |
137 | + break; | |
138 | + case DT_HASH: | |
139 | + hash = (Elf64_Word *) | |
140 | + ((uintptr_t)dyn[i].d_un.d_ptr | |
141 | + + vdso_info.load_offset); | |
142 | + break; | |
143 | + case DT_VERSYM: | |
144 | + vdso_info.versym = (Elf64_Versym *) | |
145 | + ((uintptr_t)dyn[i].d_un.d_ptr | |
146 | + + vdso_info.load_offset); | |
147 | + break; | |
148 | + case DT_VERDEF: | |
149 | + vdso_info.verdef = (Elf64_Verdef *) | |
150 | + ((uintptr_t)dyn[i].d_un.d_ptr | |
151 | + + vdso_info.load_offset); | |
152 | + break; | |
153 | + } | |
154 | + } | |
155 | + if (!vdso_info.symstrings || !vdso_info.symtab || !hash) | |
156 | + return; /* Failed */ | |
157 | + | |
158 | + if (!vdso_info.verdef) | |
159 | + vdso_info.versym = 0; | |
160 | + | |
161 | + /* Parse the hash table header. */ | |
162 | + vdso_info.nbucket = hash[0]; | |
163 | + vdso_info.nchain = hash[1]; | |
164 | + vdso_info.bucket = &hash[2]; | |
165 | + vdso_info.chain = &hash[vdso_info.nbucket + 2]; | |
166 | + | |
167 | + /* That's all we need. */ | |
168 | + vdso_info.valid = true; | |
169 | +} | |
170 | + | |
171 | +static bool vdso_match_version(Elf64_Versym ver, | |
172 | + const char *name, Elf64_Word hash) | |
173 | +{ | |
174 | + /* | |
175 | + * This is a helper function to check if the version indexed by | |
176 | + * ver matches name (which hashes to hash). | |
177 | + * | |
178 | + * The version definition table is a mess, and I don't know how | |
179 | + * to do this in better than linear time without allocating memory | |
180 | + * to build an index. I also don't know why the table has | |
181 | + * variable size entries in the first place. | |
182 | + * | |
183 | + * For added fun, I can't find a comprehensible specification of how | |
184 | + * to parse all the weird flags in the table. | |
185 | + * | |
186 | + * So I just parse the whole table every time. | |
187 | + */ | |
188 | + | |
189 | + /* First step: find the version definition */ | |
190 | + ver &= 0x7fff; /* Apparently bit 15 means "hidden" */ | |
191 | + Elf64_Verdef *def = vdso_info.verdef; | |
192 | + while(true) { | |
193 | + if ((def->vd_flags & VER_FLG_BASE) == 0 | |
194 | + && (def->vd_ndx & 0x7fff) == ver) | |
195 | + break; | |
196 | + | |
197 | + if (def->vd_next == 0) | |
198 | + return false; /* No definition. */ | |
199 | + | |
200 | + def = (Elf64_Verdef *)((char *)def + def->vd_next); | |
201 | + } | |
202 | + | |
203 | + /* Now figure out whether it matches. */ | |
204 | + Elf64_Verdaux *aux = (Elf64_Verdaux*)((char *)def + def->vd_aux); | |
205 | + return def->vd_hash == hash | |
206 | + && !strcmp(name, vdso_info.symstrings + aux->vda_name); | |
207 | +} | |
208 | + | |
209 | +void *vdso_sym(const char *version, const char *name) | |
210 | +{ | |
211 | + unsigned long ver_hash; | |
212 | + if (!vdso_info.valid) | |
213 | + return 0; | |
214 | + | |
215 | + ver_hash = elf_hash(version); | |
216 | + Elf64_Word chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; | |
217 | + | |
218 | + for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) { | |
219 | + Elf64_Sym *sym = &vdso_info.symtab[chain]; | |
220 | + | |
221 | + /* Check for a defined global or weak function w/ right name. */ | |
222 | + if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) | |
223 | + continue; | |
224 | + if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && | |
225 | + ELF64_ST_BIND(sym->st_info) != STB_WEAK) | |
226 | + continue; | |
227 | + if (sym->st_shndx == SHN_UNDEF) | |
228 | + continue; | |
229 | + if (strcmp(name, vdso_info.symstrings + sym->st_name)) | |
230 | + continue; | |
231 | + | |
232 | + /* Check symbol version. */ | |
233 | + if (vdso_info.versym | |
234 | + && !vdso_match_version(vdso_info.versym[chain], | |
235 | + version, ver_hash)) | |
236 | + continue; | |
237 | + | |
238 | + return (void *)(vdso_info.load_offset + sym->st_value); | |
239 | + } | |
240 | + | |
241 | + return 0; | |
242 | +} | |
243 | + | |
244 | +void vdso_init_from_auxv(void *auxv) | |
245 | +{ | |
246 | + Elf64_auxv_t *elf_auxv = auxv; | |
247 | + for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++) | |
248 | + { | |
249 | + if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) { | |
250 | + vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val); | |
251 | + return; | |
252 | + } | |
253 | + } | |
254 | + | |
255 | + vdso_info.valid = false; | |
256 | +} |
Documentation/vDSO/vdso_test.c
1 | +/* | |
2 | + * vdso_test.c: Sample code to test parse_vdso.c on x86_64 | |
3 | + * Copyright (c) 2011 Andy Lutomirski | |
4 | + * Subject to the GNU General Public License, version 2 | |
5 | + * | |
6 | + * You can amuse yourself by compiling with: | |
7 | + * gcc -std=gnu99 -nostdlib | |
8 | + * -Os -fno-asynchronous-unwind-tables -flto | |
9 | + * vdso_test.c parse_vdso.c -o vdso_test | |
10 | + * to generate a small binary with no dependencies at all. | |
11 | + */ | |
12 | + | |
13 | +#include <sys/syscall.h> | |
14 | +#include <sys/time.h> | |
15 | +#include <unistd.h> | |
16 | +#include <stdint.h> | |
17 | + | |
18 | +extern void *vdso_sym(const char *version, const char *name); | |
19 | +extern void vdso_init_from_sysinfo_ehdr(uintptr_t base); | |
20 | +extern void vdso_init_from_auxv(void *auxv); | |
21 | + | |
22 | +/* We need a libc function... */ | |
23 | +int strcmp(const char *a, const char *b) | |
24 | +{ | |
25 | + /* This implementation is buggy: it never returns -1. */ | |
26 | + while (*a || *b) { | |
27 | + if (*a != *b) | |
28 | + return 1; | |
29 | + if (*a == 0 || *b == 0) | |
30 | + return 1; | |
31 | + a++; | |
32 | + b++; | |
33 | + } | |
34 | + | |
35 | + return 0; | |
36 | +} | |
37 | + | |
38 | +/* ...and two syscalls. This is x86_64-specific. */ | |
39 | +static inline long linux_write(int fd, const void *data, size_t len) | |
40 | +{ | |
41 | + | |
42 | + long ret; | |
43 | + asm volatile ("syscall" : "=a" (ret) : "a" (__NR_write), | |
44 | + "D" (fd), "S" (data), "d" (len) : | |
45 | + "cc", "memory", "rcx", | |
46 | + "r8", "r9", "r10", "r11" ); | |
47 | + return ret; | |
48 | +} | |
49 | + | |
50 | +static inline void linux_exit(int code) | |
51 | +{ | |
52 | + asm volatile ("syscall" : : "a" (__NR_exit), "D" (code)); | |
53 | +} | |
54 | + | |
55 | +void to_base10(char *lastdig, uint64_t n) | |
56 | +{ | |
57 | + while (n) { | |
58 | + *lastdig = (n % 10) + '0'; | |
59 | + n /= 10; | |
60 | + lastdig--; | |
61 | + } | |
62 | +} | |
63 | + | |
64 | +__attribute__((externally_visible)) void c_main(void **stack) | |
65 | +{ | |
66 | + /* Parse the stack */ | |
67 | + long argc = (long)*stack; | |
68 | + stack += argc + 2; | |
69 | + | |
70 | + /* Now we're pointing at the environment. Skip it. */ | |
71 | + while(*stack) | |
72 | + stack++; | |
73 | + stack++; | |
74 | + | |
75 | + /* Now we're pointing at auxv. Initialize the vDSO parser. */ | |
76 | + vdso_init_from_auxv((void *)stack); | |
77 | + | |
78 | + /* Find gettimeofday. */ | |
79 | + typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz); | |
80 | + gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday"); | |
81 | + | |
82 | + if (!gtod) | |
83 | + linux_exit(1); | |
84 | + | |
85 | + struct timeval tv; | |
86 | + long ret = gtod(&tv, 0); | |
87 | + | |
88 | + if (ret == 0) { | |
89 | + char buf[] = "The time is                     .000000\n"; | |
90 | + to_base10(buf + 31, tv.tv_sec); | |
91 | + to_base10(buf + 38, tv.tv_usec); | |
92 | + linux_write(1, buf, sizeof(buf) - 1); | |
93 | + } else { | |
94 | + linux_exit(ret); | |
95 | + } | |
96 | + | |
97 | + linux_exit(0); | |
98 | +} | |
99 | + | |
100 | +/* | |
101 | + * This is the real entry point. It passes the initial stack into | |
102 | + * the C entry point. | |
103 | + */ | |
104 | +asm ( | |
105 | + ".text\n" | |
106 | + ".global _start\n" | |
107 | + ".type _start,@function\n" | |
108 | + "_start:\n\t" | |
109 | + "mov %rsp,%rdi\n\t" | |
110 | + "jmp c_main" | |
111 | + ); |