Commit 98eedc3a9dbf90cecb91093d2a7fa083942b7d13

Authored by Andy Lutomirski
Committed by H. Peter Anvin
1 parent 574c44fa8f

Document the vDSO and add a reference parser

It turns out that parsing the vDSO is nontrivial if you don't already
have an ELF dynamic loader around.  So document it in Documentation/ABI
and add a reference CC0-licenced parser.

This code is dedicated to Go issue 1933:
http://code.google.com/p/go/issues/detail?id=1933

Signed-off-by: Andy Lutomirski <luto@mit.edu>
Link: http://lkml.kernel.org/r/a315a9514cd71bcf29436cc31e35aada21a5ff21.1310563276.git.luto@mit.edu
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>

Showing 3 changed files with 394 additions and 0 deletions Side-by-side Diff

Documentation/ABI/stable/vdso
  1 +On some architectures, when the kernel loads any userspace program it
  2 +maps an ELF DSO into that program's address space. This DSO is called
  3 +the vDSO and it often contains useful and highly-optimized alternatives
  4 +to real syscalls.
  5 +
  6 +These functions are called just like ordinary C functions according to
  7 +your platform's ABI. Call them from a sensible context. (For example,
  8 +if you set CS on x86 to something strange, the vDSO functions are
  9 +within their rights to crash.) In addition, if you pass a bad
  10 +pointer to a vDSO function, you might get SIGSEGV instead of -EFAULT.
  11 +
  12 +To find the DSO, parse the auxiliary vector passed to the program's
  13 +entry point. The AT_SYSINFO_EHDR entry will point to the vDSO.
  14 +
  15 +The vDSO uses symbol versioning; whenever you request a symbol from the
  16 +vDSO, specify the version you are expecting.
  17 +
  18 +Programs that dynamically link to glibc will use the vDSO automatically.
  19 +Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c.
  20 +
  21 +Unless otherwise noted, the set of symbols with any given version and the
  22 +ABI of those symbols is considered stable. It may vary across architectures,
  23 +though.
  24 +
  25 +(As of this writing, this ABI documentation has been confirmed for x86_64.
  26 + The maintainers of the other vDSO-using architectures should confirm
  27 + that it is correct for their architecture.)
Documentation/vDSO/parse_vdso.c
  1 +/*
  2 + * parse_vdso.c: Linux reference vDSO parser
  3 + * Written by Andrew Lutomirski, 2011.
  4 + *
  5 + * This code is meant to be linked in to various programs that run on Linux.
  6 + * As such, it is available with as few restrictions as possible. This file
  7 + * is licensed under the Creative Commons Zero License, version 1.0,
  8 + * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
  9 + *
  10 + * The vDSO is a regular ELF DSO that the kernel maps into user space when
  11 + * it starts a program. It works equally well in statically and dynamically
  12 + * linked binaries.
  13 + *
  14 + * This code is tested on x86_64. In principle it should work on any 64-bit
  15 + * architecture that has a vDSO.
  16 + */
  17 +
  18 +#include <stdbool.h>
  19 +#include <stdint.h>
  20 +#include <string.h>
  21 +#include <elf.h>
  22 +
  23 +/*
  24 + * To use this vDSO parser, first call one of the vdso_init_* functions.
  25 + * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR
  26 + * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv.
  27 + * Then call vdso_sym for each symbol you want. For example, to look up
  28 + * gettimeofday on x86_64, use:
  29 + *
  30 + * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday");
  31 + * or
  32 + * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
  33 + *
  34 + * vdso_sym will return 0 if the symbol doesn't exist or if the init function
  35 + * failed or was not called. vdso_sym is a little slow, so its return value
  36 + * should be cached.
  37 + *
  38 + * vdso_sym is threadsafe; the init functions are not.
  39 + *
  40 + * These are the prototypes:
  41 + */
  42 +extern void vdso_init_from_auxv(void *auxv);
  43 +extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
  44 +extern void *vdso_sym(const char *version, const char *name);
  45 +
  46 +
  47 +/* And here's the code. */
  48 +
  49 +#ifndef __x86_64__
  50 +# error Not yet ported to non-x86_64 architectures
  51 +#endif
  52 +
  53 +static struct vdso_info
  54 +{
  55 + bool valid;
  56 +
  57 + /* Load information */
  58 + uintptr_t load_addr;
  59 + uintptr_t load_offset; /* load_addr - recorded vaddr */
  60 +
  61 + /* Symbol table */
  62 + Elf64_Sym *symtab;
  63 + const char *symstrings;
  64 + Elf64_Word *bucket, *chain;
  65 + Elf64_Word nbucket, nchain;
  66 +
  67 + /* Version table */
  68 + Elf64_Versym *versym;
  69 + Elf64_Verdef *verdef;
  70 +} vdso_info;
  71 +
  72 +/* Straight from the ELF specification. */
  73 +static unsigned long elf_hash(const unsigned char *name)
  74 +{
  75 + unsigned long h = 0, g;
  76 + while (*name)
  77 + {
  78 + h = (h << 4) + *name++;
  79 + if (g = h & 0xf0000000)
  80 + h ^= g >> 24;
  81 + h &= ~g;
  82 + }
  83 + return h;
  84 +}
  85 +
  86 +void vdso_init_from_sysinfo_ehdr(uintptr_t base)
  87 +{
  88 + size_t i;
  89 + bool found_vaddr = false;
  90 +
  91 + vdso_info.valid = false;
  92 +
  93 + vdso_info.load_addr = base;
  94 +
  95 + Elf64_Ehdr *hdr = (Elf64_Ehdr*)base;
  96 + Elf64_Phdr *pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
  97 + Elf64_Dyn *dyn = 0;
  98 +
  99 + /*
  100 + * We need two things from the segment table: the load offset
  101 + * and the dynamic table.
  102 + */
  103 + for (i = 0; i < hdr->e_phnum; i++)
  104 + {
  105 + if (pt[i].p_type == PT_LOAD && !found_vaddr) {
  106 + found_vaddr = true;
  107 + vdso_info.load_offset = base
  108 + + (uintptr_t)pt[i].p_offset
  109 + - (uintptr_t)pt[i].p_vaddr;
  110 + } else if (pt[i].p_type == PT_DYNAMIC) {
  111 + dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
  112 + }
  113 + }
  114 +
  115 + if (!found_vaddr || !dyn)
  116 + return; /* Failed */
  117 +
  118 + /*
  119 + * Fish out the useful bits of the dynamic table.
  120 + */
  121 + Elf64_Word *hash = 0;
  122 + vdso_info.symstrings = 0;
  123 + vdso_info.symtab = 0;
  124 + vdso_info.versym = 0;
  125 + vdso_info.verdef = 0;
  126 + for (i = 0; dyn[i].d_tag != DT_NULL; i++) {
  127 + switch (dyn[i].d_tag) {
  128 + case DT_STRTAB:
  129 + vdso_info.symstrings = (const char *)
  130 + ((uintptr_t)dyn[i].d_un.d_ptr
  131 + + vdso_info.load_offset);
  132 + break;
  133 + case DT_SYMTAB:
  134 + vdso_info.symtab = (Elf64_Sym *)
  135 + ((uintptr_t)dyn[i].d_un.d_ptr
  136 + + vdso_info.load_offset);
  137 + break;
  138 + case DT_HASH:
  139 + hash = (Elf64_Word *)
  140 + ((uintptr_t)dyn[i].d_un.d_ptr
  141 + + vdso_info.load_offset);
  142 + break;
  143 + case DT_VERSYM:
  144 + vdso_info.versym = (Elf64_Versym *)
  145 + ((uintptr_t)dyn[i].d_un.d_ptr
  146 + + vdso_info.load_offset);
  147 + break;
  148 + case DT_VERDEF:
  149 + vdso_info.verdef = (Elf64_Verdef *)
  150 + ((uintptr_t)dyn[i].d_un.d_ptr
  151 + + vdso_info.load_offset);
  152 + break;
  153 + }
  154 + }
  155 + if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
  156 + return; /* Failed */
  157 +
  158 + if (!vdso_info.verdef)
  159 + vdso_info.versym = 0;
  160 +
  161 + /* Parse the hash table header. */
  162 + vdso_info.nbucket = hash[0];
  163 + vdso_info.nchain = hash[1];
  164 + vdso_info.bucket = &hash[2];
  165 + vdso_info.chain = &hash[vdso_info.nbucket + 2];
  166 +
  167 + /* That's all we need. */
  168 + vdso_info.valid = true;
  169 +}
  170 +
  171 +static bool vdso_match_version(Elf64_Versym ver,
  172 + const char *name, Elf64_Word hash)
  173 +{
  174 + /*
  175 + * This is a helper function to check if the version indexed by
  176 + * ver matches name (which hashes to hash).
  177 + *
  178 + * The version definition table is a mess, and I don't know how
  179 + * to do this in better than linear time without allocating memory
  180 + * to build an index. I also don't know why the table has
  181 + * variable size entries in the first place.
  182 + *
  183 + * For added fun, I can't find a comprehensible specification of how
  184 + * to parse all the weird flags in the table.
  185 + *
  186 + * So I just parse the whole table every time.
  187 + */
  188 +
  189 + /* First step: find the version definition */
  190 + ver &= 0x7fff; /* Apparently bit 15 means "hidden" */
  191 + Elf64_Verdef *def = vdso_info.verdef;
  192 + while(true) {
  193 + if ((def->vd_flags & VER_FLG_BASE) == 0
  194 + && (def->vd_ndx & 0x7fff) == ver)
  195 + break;
  196 +
  197 + if (def->vd_next == 0)
  198 + return false; /* No definition. */
  199 +
  200 + def = (Elf64_Verdef *)((char *)def + def->vd_next);
  201 + }
  202 +
  203 + /* Now figure out whether it matches. */
  204 + Elf64_Verdaux *aux = (Elf64_Verdaux*)((char *)def + def->vd_aux);
  205 + return def->vd_hash == hash
  206 + && !strcmp(name, vdso_info.symstrings + aux->vda_name);
  207 +}
  208 +
  209 +void *vdso_sym(const char *version, const char *name)
  210 +{
  211 + unsigned long ver_hash;
  212 + if (!vdso_info.valid)
  213 + return 0;
  214 +
  215 + ver_hash = elf_hash(version);
  216 + Elf64_Word chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
  217 +
  218 + for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
  219 + Elf64_Sym *sym = &vdso_info.symtab[chain];
  220 +
  221 + /* Check for a defined global or weak function w/ right name. */
  222 + if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
  223 + continue;
  224 + if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
  225 + ELF64_ST_BIND(sym->st_info) != STB_WEAK)
  226 + continue;
  227 + if (sym->st_shndx == SHN_UNDEF)
  228 + continue;
  229 + if (strcmp(name, vdso_info.symstrings + sym->st_name))
  230 + continue;
  231 +
  232 + /* Check symbol version. */
  233 + if (vdso_info.versym
  234 + && !vdso_match_version(vdso_info.versym[chain],
  235 + version, ver_hash))
  236 + continue;
  237 +
  238 + return (void *)(vdso_info.load_offset + sym->st_value);
  239 + }
  240 +
  241 + return 0;
  242 +}
  243 +
  244 +void vdso_init_from_auxv(void *auxv)
  245 +{
  246 + Elf64_auxv_t *elf_auxv = auxv;
  247 + for (int i = 0; elf_auxv[i].a_type != AT_NULL; i++)
  248 + {
  249 + if (elf_auxv[i].a_type == AT_SYSINFO_EHDR) {
  250 + vdso_init_from_sysinfo_ehdr(elf_auxv[i].a_un.a_val);
  251 + return;
  252 + }
  253 + }
  254 +
  255 + vdso_info.valid = false;
  256 +}
Documentation/vDSO/vdso_test.c
  1 +/*
  2 + * vdso_test.c: Sample code to test parse_vdso.c on x86_64
  3 + * Copyright (c) 2011 Andy Lutomirski
  4 + * Subject to the GNU General Public License, version 2
  5 + *
  6 + * You can amuse yourself by compiling with:
  7 + * gcc -std=gnu99 -nostdlib
  8 + * -Os -fno-asynchronous-unwind-tables -flto
  9 + * vdso_test.c parse_vdso.c -o vdso_test
  10 + * to generate a small binary with no dependencies at all.
  11 + */
  12 +
  13 +#include <sys/syscall.h>
  14 +#include <sys/time.h>
  15 +#include <unistd.h>
  16 +#include <stdint.h>
  17 +
  18 +extern void *vdso_sym(const char *version, const char *name);
  19 +extern void vdso_init_from_sysinfo_ehdr(uintptr_t base);
  20 +extern void vdso_init_from_auxv(void *auxv);
  21 +
  22 +/* We need a libc function... */
  23 +int strcmp(const char *a, const char *b)
  24 +{
  25 + /* This implementation is buggy: it never returns -1. */
  26 + while (*a || *b) {
  27 + if (*a != *b)
  28 + return 1;
  29 + if (*a == 0 || *b == 0)
  30 + return 1;
  31 + a++;
  32 + b++;
  33 + }
  34 +
  35 + return 0;
  36 +}
  37 +
  38 +/* ...and two syscalls. This is x86_64-specific. */
  39 +static inline long linux_write(int fd, const void *data, size_t len)
  40 +{
  41 +
  42 + long ret;
  43 + asm volatile ("syscall" : "=a" (ret) : "a" (__NR_write),
  44 + "D" (fd), "S" (data), "d" (len) :
  45 + "cc", "memory", "rcx",
  46 + "r8", "r9", "r10", "r11" );
  47 + return ret;
  48 +}
  49 +
  50 +static inline void linux_exit(int code)
  51 +{
  52 + asm volatile ("syscall" : : "a" (__NR_exit), "D" (code));
  53 +}
  54 +
  55 +void to_base10(char *lastdig, uint64_t n)
  56 +{
  57 + while (n) {
  58 + *lastdig = (n % 10) + '0';
  59 + n /= 10;
  60 + lastdig--;
  61 + }
  62 +}
  63 +
  64 +__attribute__((externally_visible)) void c_main(void **stack)
  65 +{
  66 + /* Parse the stack */
  67 + long argc = (long)*stack;
  68 + stack += argc + 2;
  69 +
  70 + /* Now we're pointing at the environment. Skip it. */
  71 + while(*stack)
  72 + stack++;
  73 + stack++;
  74 +
  75 + /* Now we're pointing at auxv. Initialize the vDSO parser. */
  76 + vdso_init_from_auxv((void *)stack);
  77 +
  78 + /* Find gettimeofday. */
  79 + typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
  80 + gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
  81 +
  82 + if (!gtod)
  83 + linux_exit(1);
  84 +
  85 + struct timeval tv;
  86 + long ret = gtod(&tv, 0);
  87 +
  88 + if (ret == 0) {
  89 + char buf[] = "The time is .000000\n";
  90 + to_base10(buf + 31, tv.tv_sec);
  91 + to_base10(buf + 38, tv.tv_usec);
  92 + linux_write(1, buf, sizeof(buf) - 1);
  93 + } else {
  94 + linux_exit(ret);
  95 + }
  96 +
  97 + linux_exit(0);
  98 +}
  99 +
  100 +/*
  101 + * This is the real entry point. It passes the initial stack into
  102 + * the C entry point.
  103 + */
  104 +asm (
  105 + ".text\n"
  106 + ".global _start\n"
  107 + ".type _start,@function\n"
  108 + "_start:\n\t"
  109 + "mov %rsp,%rdi\n\t"
  110 + "jmp c_main"
  111 + );