Blame view
tools/lguest/lguest.c
57.6 KB
2e04ef769 lguest: fix comme... |
1 2 3 4 5 6 |
/*P:100 * This is the Launcher code, a simple program which lays out the "physical" * memory for the new Guest by mapping the kernel image and the virtual * devices, then opens /dev/lguest to tell the kernel about the Guest and * control it. :*/ |
8ca47e006 lguest: the docum... |
7 8 9 10 11 12 13 14 15 16 |
#define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include <stdio.h> #include <string.h> #include <unistd.h> #include <err.h> #include <stdint.h> #include <stdlib.h> #include <elf.h> #include <sys/mman.h> |
6649bb7af Accept elf files ... |
17 |
#include <sys/param.h> |
8ca47e006 lguest: the docum... |
18 19 20 |
#include <sys/types.h> #include <sys/stat.h> #include <sys/wait.h> |
659a0e663 lguest: have exam... |
21 |
#include <sys/eventfd.h> |
8ca47e006 lguest: the docum... |
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
#include <fcntl.h> #include <stdbool.h> #include <errno.h> #include <ctype.h> #include <sys/socket.h> #include <sys/ioctl.h> #include <sys/time.h> #include <time.h> #include <netinet/in.h> #include <net/if.h> #include <linux/sockios.h> #include <linux/if_tun.h> #include <sys/uio.h> #include <termios.h> #include <getopt.h> |
17cbca2ba Update example la... |
37 38 |
#include <assert.h> #include <sched.h> |
a586d4f60 virtio: simplify ... |
39 40 |
#include <limits.h> #include <stddef.h> |
a161883a2 lguest: Tell Gues... |
41 |
#include <signal.h> |
8aeb36e8f lguest: --usernam... |
42 43 |
#include <pwd.h> #include <grp.h> |
f846619e7 lguest: clean up ... |
44 45 46 47 48 49 50 |
#include <linux/virtio_config.h> #include <linux/virtio_net.h> #include <linux/virtio_blk.h> #include <linux/virtio_console.h> #include <linux/virtio_rng.h> #include <linux/virtio_ring.h> #include <asm/bootparam.h> |
07fe9977b lguest: move the ... |
51 |
#include "../../include/linux/lguest_launcher.h" |
2e04ef769 lguest: fix comme... |
52 |
/*L:110 |
9f54288de lguest: update co... |
53 |
* We can ignore the 43 include files we need for this program, but I do want |
2e04ef769 lguest: fix comme... |
54 |
* to draw attention to the use of kernel-style types. |
db24e8c2e lguest: example l... |
55 56 57 58 |
* * As Linus said, "C is a Spartan language, and so should your naming be." I * like these abbreviations, so we define them here. Note that u64 is always * unsigned long long, which works on all Linux systems: this means that we can |
2e04ef769 lguest: fix comme... |
59 60 |
* use %llu in printf for any u64. */ |
db24e8c2e lguest: example l... |
61 62 63 64 |
/*
 * Short kernel-style names for the unsigned integer types.  u64 is spelled
 * "unsigned long long" rather than uint64_t so that %llu always matches it.
 */
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
dde797899 lguest: documenta... |
65 |
/*:*/ |
8ca47e006 lguest: the docum... |
66 |
|
8ca47e006 lguest: the docum... |
67 68 69 70 |
/*
 * NOTE(review): presumably the prefix which selects bridged networking in a
 * command line argument; its use is outside this part of the file -- confirm.
 */
#define BRIDGE_PFX "bridge:"
/* Only defined here if the system headers did not already provide it. */
#ifndef SIOCBRADDIF
#define SIOCBRADDIF	0x89a2		/* add interface to bridge */
#endif
/* We can have up to 256 pages for devices. */
#define DEVICE_PAGES 256
/* This will occupy 3 pages: it must be a power of 2. */
#define VIRTQUEUE_NUM 256
8ca47e006 lguest: the docum... |
75 |
|
2e04ef769 lguest: fix comme... |
76 77 78 79 |
/*L:120 * verbose is both a global flag and a macro. The C preprocessor allows * this, and although I wouldn't recommend it, it works quite nicely here. */ |
8ca47e006 lguest: the docum... |
80 81 82 |
/* The flag: verbose() prints only while this is true. */
static bool verbose;
/* The macro: takes printf-style arguments and prints them if the flag is set. */
#define verbose(args...) \
	do { if (verbose) printf(args); } while(0)
dde797899 lguest: documenta... |
83 |
/*:*/ |
3c6b5bfa3 Introduce guest m... |
84 85 86 87 |
/* The pointer to the start of guest memory. */
static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max;
/* The /dev/lguest file descriptor. */
static int lguest_fd;

/*
 * Indicates whose vcpu is currently running.  It is declared __thread, so
 * each thread of the Launcher has its own copy.
 */
static unsigned int __thread cpu_id;
dde797899 lguest: documenta... |
93 |
/* This is our list of devices. */
struct device_list {
	/* Counter to assign interrupt numbers. */
	unsigned int next_irq;

	/* Counter to print out convenient device numbers. */
	unsigned int device_num;

	/* The descriptor page for the devices. */
	u8 *descpage;

	/* A singly linked list of devices, chained through device->next. */
	struct device *dev;
	/* And a pointer to the last device for easy append. */
	struct device *lastdev;
};

/* The list of Guest devices, based on command line arguments. */
static struct device_list devices;
dde797899 lguest: documenta... |
109 |
/* The device structure describes a single device. */
struct device {
	/* The linked-list pointer. */
	struct device *next;

	/* The device's descriptor, as mapped into the Guest. */
	struct lguest_device_desc *desc;

	/*
	 * We can't trust desc values once Guest has booted: we use these.
	 * (get_feature_bits() locates the feature bitmasks using num_vq.)
	 */
	unsigned int feature_len;
	unsigned int num_vq;

	/* The name of this device, for --verbose. */
	const char *name;

	/* Any queues attached to this device */
	struct virtqueue *vq;

	/* Is it operational */
	bool running;

	/* Device-specific data. */
	void *priv;
};
17cbca2ba Update example la... |
132 |
/* The virtqueue structure describes a queue attached to a device. */
struct virtqueue {
	/* Linked-list pointer; presumably chains the queues of one device. */
	struct virtqueue *next;

	/* Which device owns me. */
	struct device *dev;

	/* The configuration for this queue. */
	struct lguest_vqconfig config;

	/* The actual ring of buffers. */
	struct vring vring;

	/* Last available index we saw. */
	u16 last_avail_idx;

	/* How many are used since we sent last irq? */
	unsigned int pending_used;

	/* Eventfd where Guest notifications arrive. */
	int eventfd;

	/* Function for the thread which is servicing this virtqueue. */
	void (*service)(struct virtqueue *vq);
	/* NOTE(review): presumably the id of that service thread -- confirm. */
	pid_t thread;
};
ec04b13f6 lguest: Reboot su... |
156 157 |
/* Remember the arguments to the program so we can "reboot" */
static char **main_args;

/* The original tty settings to restore on exit. */
static struct termios orig_term;
2e04ef769 lguest: fix comme... |
160 161 |
/*
 * We have to be careful with barriers: our devices are all run in separate
 * threads and so we need to make sure that changes visible to the Guest happen
 * in precise order.
 *
 * wmb() orders our stores against each other.  x86 never reorders one store
 * with another, so all it has to do is stop the compiler moving them.
 *
 * mb() is used where a store must be visible before a later load is done
 * (we re-enable notifications, then re-check the avail index).  x86 CPUs may
 * let a load pass an earlier store, so a compiler barrier is not enough
 * here: we need a real fence instruction, which this builtin emits as well
 * as acting as a compiler barrier.
 */
#define wmb() __asm__ __volatile__("" : : : "memory")
#define mb() __sync_synchronize()
17cbca2ba Update example la... |
167 |
|
2e04ef769 lguest: fix comme... |
168 169 |
/*
 * Convert an iovec element to the given type.
 *
 * This is a fairly ugly trick: we need to know the size of the type and
 * alignment requirement to check the pointer is kosher. It's also nice to
 * have the name of the type in case we report failure.
 *
 * Typing those three things all the time is cumbersome and error prone, so we
 * have a macro which sets them all up and passes to the real function.
 */
#define convert(iov, type) \
	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))

/*
 * The real function behind convert().  Returns iov->iov_base when the element
 * is exactly "size" bytes long and its base address is a multiple of "align".
 * Otherwise it exits through errx(), quoting "name" in the message.
 */
static void *_convert(struct iovec *iov, size_t size, size_t align,
		      const char *name)
{
	const size_t len = iov->iov_len;
	void *const base = iov->iov_base;

	/* The length is checked first, then the alignment. */
	if (len != size)
		errx(1, "Bad iovec size %zu for %s", len, name);
	if (((unsigned long)base % align) != 0)
		errx(1, "Bad alignment %p for %s", base, name);

	return base;
}
b5111790f lguest: wrap last... |
190 191 |
/* Wrapper for the last available index. Makes it easier to change. */ #define lg_last_avail(vq) ((vq)->last_avail_idx) |
2e04ef769 lguest: fix comme... |
192 193 194 195 |
/*
 * The virtio configuration space is defined to be little-endian.  x86 is
 * little-endian too, but it's nice to be explicit so we have these helpers.
 *
 * Every one of them expands to its argument unchanged.
 */
#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)
17cbca2ba Update example la... |
202 |
|
28fd6d7f9 lguest: virtio-rn... |
203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
/* Is this iovec empty?  True when every one of the num_iov elements has zero length. */
static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
{
	const struct iovec *elem;
	const struct iovec *const end = iov + num_iov;

	for (elem = iov; elem != end; elem++) {
		if (elem->iov_len != 0)
			return false;
	}
	return true;
}

/*
 * Take len bytes from the front of this iovec.
 *
 * Each element in turn has its base advanced and its length reduced until
 * len bytes have been accounted for.  The iovec must hold at least len bytes
 * in total: this is checked with assert().
 */
static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
{
	struct iovec *elem;

	for (elem = iov; elem != iov + num_iov; elem++) {
		/* Take whatever is left to consume, capped by this element. */
		unsigned int taken = len;

		if (elem->iov_len < taken)
			taken = elem->iov_len;

		elem->iov_base = (char *)elem->iov_base + taken;
		elem->iov_len -= taken;
		len -= taken;
	}
	assert(len == 0);
}
6e5aa7efb virtio: reset fun... |
229 230 231 232 |
/*
 * The device virtqueue descriptors are followed by feature bitmasks.
 *
 * Returns the address just past the dev->num_vq queue configurations, which
 * themselves start straight after the device descriptor.
 */
static u8 *get_feature_bits(struct device *dev)
{
	/* The queue configurations begin right behind the descriptor... */
	struct lguest_vqconfig *vqconfigs;

	vqconfigs = (struct lguest_vqconfig *)(dev->desc + 1);

	/* ...and the feature bits begin right behind the last of those. */
	return (u8 *)(vqconfigs + dev->num_vq);
}
2e04ef769 lguest: fix comme... |
235 236 237 238 239 240 |
/*L:100
 * The Launcher code itself takes us out into userspace, that scary place where
 * pointers run wild and free! Unfortunately, like most userspace programs,
 * it's quite boring (which is why everyone likes to hack on the kernel!).
 * Perhaps if you make up an Lguest Drinking Game at this point, it will get
 * you through this section. Or, maybe not.
 *
 * The Launcher sets up a big chunk of memory to be the Guest's "physical"
 * memory and stores it in "guest_base". In other words, Guest physical ==
 * Launcher virtual with an offset.
 *
 * This can be tough to get your head around, but usually it just means that we
 * use these trivial conversion functions when the Guest gives us its
 * "physical" addresses:
 */

/* Guest physical address to Launcher pointer.  No range check is done here. */
static void *from_guest_phys(unsigned long addr)
{
	char *base = guest_base;

	return base + addr;
}

/* Launcher pointer to Guest physical address: the offset from guest_base. */
static unsigned long to_guest_phys(const void *addr)
{
	const char *base = guest_base;

	return (const char *)addr - base;
}
dde797899 lguest: documenta... |
259 260 261 262 |
/*L:130
 * Loading the Kernel.
 *
 * We start with a couple of simple helper routines. open_or_die() avoids
 * error-checking code cluttering the callers: it returns the descriptor from
 * open(name, flags), or exits through err() if the open failed.
 */
static int open_or_die(const char *name, int flags)
{
	const int fd = open(name, flags);

	if (fd >= 0)
		return fd;

	/* err() prints the reason and does not return. */
	err(1, "Failed to open %s", name);
}
3c6b5bfa3 Introduce guest m... |
272 273 |
/* map_zeroed_pages() takes a number of pages. */ static void *map_zeroed_pages(unsigned int num) |
8ca47e006 lguest: the docum... |
274 |
{ |
3c6b5bfa3 Introduce guest m... |
275 276 |
int fd = open_or_die("/dev/zero", O_RDONLY); void *addr; |
8ca47e006 lguest: the docum... |
277 |
|
2e04ef769 lguest: fix comme... |
278 279 |
/* * We use a private mapping (ie. if we write to the page, it will be |
5230ff0cc lguest: example l... |
280 281 |
* copied). We allocate an extra two pages PROT_NONE to act as guard * pages against read/write attempts that exceed allocated space. |
2e04ef769 lguest: fix comme... |
282 |
*/ |
5230ff0cc lguest: example l... |
283 284 |
addr = mmap(NULL, getpagesize() * (num+2), PROT_NONE, MAP_PRIVATE, fd, 0); |
3c6b5bfa3 Introduce guest m... |
285 |
if (addr == MAP_FAILED) |
af901ca18 tree-wide: fix as... |
286 |
err(1, "Mmapping %u pages of /dev/zero", num); |
a91d74a3c lguest: update co... |
287 |
|
5230ff0cc lguest: example l... |
288 289 290 |
if (mprotect(addr + getpagesize(), getpagesize() * num, PROT_READ|PROT_WRITE) == -1) err(1, "mprotect rw %u pages failed", num); |
a91d74a3c lguest: update co... |
291 292 293 294 |
/* * One neat mmap feature is that you can close the fd, and it * stays mapped. */ |
34bdaab44 lguest: Don't lea... |
295 |
close(fd); |
3c6b5bfa3 Introduce guest m... |
296 |
|
5230ff0cc lguest: example l... |
297 298 |
/* Return address after PROT_NONE page */ return addr + getpagesize(); |
3c6b5bfa3 Introduce guest m... |
299 300 301 302 303 304 305 306 307 308 309 |
} /* Get some more pages for a device. */ static void *get_pages(unsigned int num) { void *addr = from_guest_phys(guest_limit); guest_limit += num * getpagesize(); if (guest_limit > guest_max) errx(1, "Not enough memory for devices"); return addr; |
8ca47e006 lguest: the docum... |
310 |
} |
2e04ef769 lguest: fix comme... |
311 312 |
/*
 * This routine is used to load the kernel or initrd. It tries mmap, but if
 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
 * it falls back to reading the memory in.
 *
 * "len" bytes starting "offset" bytes into "fd" end up at "addr".  If the
 * fallback read does not deliver exactly "len" bytes we exit through err().
 */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	void *mapping;

	/*
	 * We map writable even though some segments are marked read-only.
	 * The kernel really wants to be writable: it patches its own
	 * instructions.
	 *
	 * MAP_PRIVATE means that the page won't be copied until a write is
	 * done to it. This allows us to share untouched memory between
	 * Guests.
	 */
	mapping = mmap(addr, len, PROT_READ|PROT_WRITE,
		       MAP_FIXED|MAP_PRIVATE, fd, offset);

	if (mapping == MAP_FAILED) {
		/* pread does a seek and a read in one shot: saves a few lines. */
		ssize_t got = pread(fd, addr, len, offset);

		if (got != len)
			err(1, "Reading offset %lu len %lu gave %zi",
			    offset, len, got);
	}
}
2e04ef769 lguest: fix comme... |
337 338 |
/*
 * This routine takes an open vmlinux image, which is in ELF, and maps it into
 * the Guest memory. ELF = Executable and Linkable Format, which is the format
 * used by all modern binaries on Linux including the kernel.
 *
 * The ELF headers give *two* addresses: a physical address, and a virtual
 * address. We use the physical address; the Guest will map itself to the
 * virtual address.
 *
 * "ehdr" is the ELF header the caller has already read from "elf_fd".  A
 * malformed header or a failure to read the program headers exits through
 * errx()/err().
 *
 * We return the starting address.
 */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	unsigned int i;

	/*
	 * Sanity checks on the main ELF header: an x86 executable with a
	 * reasonable number of correctly-sized program headers.
	 *
	 * This has to come before e_phnum is used to size the array below: a
	 * header claiming zero program headers would otherwise create a
	 * zero-length variable-length array, which is undefined behaviour.
	 */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1
	    || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/* Now we know e_phnum is sane, make room for that many headers. */
	Elf32_Phdr phdr[ehdr->e_phnum];

	/*
	 * An ELF executable contains an ELF header and a number of "program"
	 * headers which indicate which parts ("segments") of the program to
	 * load where.
	 */

	/* We read in all the program headers at once: */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	/*
	 * Try all the headers: there are usually only three. A read-only one,
	 * a read-write one, and a "note" section which we don't load.
	 */
	for (i = 0; i < ehdr->e_phnum; i++) {
		/* If this isn't a loadable segment, we ignore it */
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p ",
			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);

		/* We map this section of the file at its physical address. */
		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
		       phdr[i].p_offset, phdr[i].p_filesz);
	}

	/* The entry point is given in the ELF header. */
	return ehdr->e_entry;
}
2e04ef769 lguest: fix comme... |
392 393 394 395 |
/*L:150 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed * to jump into it and it will unpack itself. We used to have to perform some * hairy magic because the unpacking code scared me. |
dde797899 lguest: documenta... |
396 |
* |
5bbf89fc2 Loading bzImage d... |
397 398 |
* Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote * a small patch to jump over the tricky bits in the Guest, so now we just read |
2e04ef769 lguest: fix comme... |
399 400 |
* the funky header so we know where in the file to load, and away we go! */ |
47436aa4a Boot with virtual... |
401 |
static unsigned long load_bzimage(int fd) |
8ca47e006 lguest: the docum... |
402 |
{ |
43d33b21a Use "struct boot_... |
403 |
struct boot_params boot; |
5bbf89fc2 Loading bzImage d... |
404 405 406 |
int r; /* Modern bzImages get loaded at 1M. */ void *p = from_guest_phys(0x100000); |
2e04ef769 lguest: fix comme... |
407 408 |
/* * Go back to the start of the file and read the header. It should be |
395cf9691 doc: fix broken r... |
409 |
* a Linux boot header (see Documentation/x86/boot.txt) |
2e04ef769 lguest: fix comme... |
410 |
*/ |
5bbf89fc2 Loading bzImage d... |
411 |
lseek(fd, 0, SEEK_SET); |
43d33b21a Use "struct boot_... |
412 |
read(fd, &boot, sizeof(boot)); |
5bbf89fc2 Loading bzImage d... |
413 |
|
43d33b21a Use "struct boot_... |
414 415 |
/* Inside the setup_hdr, we expect the magic "HdrS" */ if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) |
5bbf89fc2 Loading bzImage d... |
416 |
errx(1, "This doesn't look like a bzImage to me"); |
43d33b21a Use "struct boot_... |
417 418 |
/* Skip over the extra sectors of the header. */ lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); |
5bbf89fc2 Loading bzImage d... |
419 420 421 422 |
/* Now read everything into memory. in nice big chunks. */ while ((r = read(fd, p, 65536)) > 0) p += r; |
43d33b21a Use "struct boot_... |
423 424 |
/* Finally, code32_start tells us where to enter the kernel. */ return boot.hdr.code32_start; |
8ca47e006 lguest: the docum... |
425 |
} |
2e04ef769 lguest: fix comme... |
426 427 |
/*L:140
 * Loading the kernel is easy when it's a "vmlinux", but most kernels
 * come wrapped up in the self-decompressing "bzImage" format. With a little
 * work, we can load those, too.
 *
 * "fd" is the open kernel image; we return whatever address the chosen loader
 * reports.  If the first header-sized read fails we exit through err().
 */
static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;
	bool is_elf;

	/* Read in the first few bytes. */
	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	/* If it's an ELF file, it starts with "\177ELF" */
	is_elf = (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0);

	/* Otherwise we assume it's a bzImage, and try to load it. */
	return is_elf ? map_elf(fd, &hdr) : load_bzimage(fd);
}
2e04ef769 lguest: fix comme... |
444 445 |
/*
 * This is a trivial little helper to align pages. Andi Kleen hated it because
 * it calls getpagesize() twice: "it's dumb code."
 *
 * Kernel guys get really het up about optimization, even when it's not
 * necessary. I leave this code as a reaction against that.
 *
 * Returns "addr" rounded up to the next multiple of the page size; an
 * address which is already a multiple comes back unchanged.
 */
static inline unsigned long page_align(unsigned long addr)
{
	/* Add upwards... */
	unsigned long bumped = addr + getpagesize() - 1;

	/* ...and truncate downwards. */
	return bumped & ~(getpagesize() - 1);
}
2e04ef769 lguest: fix comme... |
456 457 458 459 460 |
/*L:180
 * An "initial ram disk" is a disk image loaded into memory along with the
 * kernel which the kernel can use to boot from without needing any drivers.
 * Most distributions now use this as standard: the initrd contains the code to
 * load the appropriate driver modules for the current machine.
 *
 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
 * kernels. He sent me this (and tells me when I break it).
 *
 * "name" is the initrd file and "mem" the Guest physical address it should
 * end at.  We return the page-aligned size that was reserved below "mem".
 */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	struct stat info;
	unsigned long reserved;
	int initrd_fd = open_or_die(name, O_RDONLY);

	/* fstat() is needed to get the file size. */
	if (fstat(initrd_fd, &info) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/*
	 * We map the initrd at the top of memory, but mmap wants it to be
	 * page-aligned, so we round the size up for that.
	 */
	reserved = page_align(info.st_size);
	map_at(initrd_fd, from_guest_phys(mem - reserved), 0, info.st_size);

	/*
	 * Once a file is mapped, you can close the file descriptor. It's a
	 * little odd, but quite useful.
	 */
	close(initrd_fd);
	verbose("mapped initrd %s size=%lu @ %p ",
		name, reserved, (void *)(mem - reserved));

	/* We return the initrd size. */
	return reserved;
}
e1e72965e lguest: documenta... |
492 |
/*:*/ |
8ca47e006 lguest: the docum... |
493 |
|
2e04ef769 lguest: fix comme... |
494 495 496 497 |
/*
 * Simple routine to roll all the commandline arguments together with spaces
 * between them.
 *
 * "args" is a NULL-terminated array of strings.  "dst" receives the joined,
 * NUL-terminated result; the caller must make it big enough, as nothing here
 * checks its size.
 */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg != NULL; arg++) {
		size_t n = strlen(*arg);

		/* A separator goes in front of every argument except the first. */
		if (arg != args)
			*out++ = ' ';

		memcpy(out, *arg, n);
		out += n;
	}

	/* Terminate the result; this also covers an empty argument list. */
	*out = '\0';
}
2e04ef769 lguest: fix comme... |
513 514 |
/*L:185 * This is where we actually tell the kernel to initialize the Guest. We |
e1e72965e lguest: documenta... |
515 |
* saw the arguments it expects when we looked at initialize() in lguest_user.c: |
58a245664 lguest: move the ... |
516 |
* the base of Guest "physical" memory, the top physical page to allow and the |
2e04ef769 lguest: fix comme... |
517 518 |
* entry point for the Guest. */ |
56739c802 lguest: cleanup p... |
519 |
static void tell_kernel(unsigned long start) |
8ca47e006 lguest: the docum... |
520 |
{ |
511801dc3 Change example la... |
521 522 |
unsigned long args[] = { LHREQ_INITIALIZE, (unsigned long)guest_base, |
58a245664 lguest: move the ... |
523 |
guest_limit / getpagesize(), start }; |
3c6b5bfa3 Introduce guest m... |
524 525 526 |
verbose("Guest: %p - %p (%#lx) ", guest_base, guest_base + guest_limit, guest_limit); |
56739c802 lguest: cleanup p... |
527 528 |
lguest_fd = open_or_die("/dev/lguest", O_RDWR); if (write(lguest_fd, args, sizeof(args)) < 0) |
8ca47e006 lguest: the docum... |
529 |
err(1, "Writing to /dev/lguest"); |
8ca47e006 lguest: the docum... |
530 |
} |
dde797899 lguest: documenta... |
531 |
/*:*/ |
8ca47e006 lguest: the docum... |
532 |
|
a91d74a3c lguest: update co... |
533 |
/*L:200 |
dde797899 lguest: documenta... |
534 535 |
* Device Handling. * |
e1e72965e lguest: documenta... |
536 |
* When the Guest gives us a buffer, it sends an array of addresses and sizes. |
dde797899 lguest: documenta... |
537 |
* We need to make sure it's not trying to reach into the Launcher itself, so |
e1e72965e lguest: documenta... |
538 |
* we have a convenient routine which checks it and exits with an error message |
dde797899 lguest: documenta... |
539 540 |
* if something funny is going on: */ |
8ca47e006 lguest: the docum... |
541 542 543 |
static void *_check_pointer(unsigned long addr, unsigned int size, unsigned int line) { |
2e04ef769 lguest: fix comme... |
544 |
/* |
5230ff0cc lguest: example l... |
545 546 |
* Check if the requested address and size exceeds the allocated memory, * or addr + size wraps around. |
2e04ef769 lguest: fix comme... |
547 |
*/ |
5230ff0cc lguest: example l... |
548 |
if ((addr + size) > guest_limit || (addr + size) < addr) |
17cbca2ba Update example la... |
549 |
errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); |
2e04ef769 lguest: fix comme... |
550 551 552 553 |
/* * We return a pointer for the caller's convenience, now we know it's * safe to use. */ |
3c6b5bfa3 Introduce guest m... |
554 |
return from_guest_phys(addr); |
8ca47e006 lguest: the docum... |
555 |
} |
dde797899 lguest: documenta... |
556 |
/*
 * A macro which transparently hands the line number to the real function, so
 * that the error message from _check_pointer() names the line of the caller.
 */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
2e04ef769 lguest: fix comme... |
558 559 |
/* * Each buffer in the virtqueues is actually a chain of descriptors. This |
e1e72965e lguest: documenta... |
560 |
* function returns the next descriptor in the chain, or vq->vring.num if we're |
2e04ef769 lguest: fix comme... |
561 562 |
* at the end. */ |
d1f0132e7 lguest: add suppo... |
563 564 |
static unsigned next_desc(struct vring_desc *desc, unsigned int i, unsigned int max) |
17cbca2ba Update example la... |
565 566 567 568 |
{ unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ |
d1f0132e7 lguest: add suppo... |
569 570 |
if (!(desc[i].flags & VRING_DESC_F_NEXT)) return max; |
17cbca2ba Update example la... |
571 572 |
/* Check they're not leading us off end of descriptors. */ |
d1f0132e7 lguest: add suppo... |
573 |
next = desc[i].next; |
17cbca2ba Update example la... |
574 575 |
/* Make sure compiler knows to grab that: we don't want it changing! */ wmb(); |
d1f0132e7 lguest: add suppo... |
576 |
if (next >= max) |
17cbca2ba Update example la... |
577 578 579 580 |
errx(1, "Desc next is %u", next); return next; } |
a91d74a3c lguest: update co... |
581 582 583 584 |
/* * This actually sends the interrupt for this virtqueue, if we've used a * buffer. */ |
38bc2b8c5 lguest: implement... |
585 586 587 |
static void trigger_irq(struct virtqueue *vq) { unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; |
95c517c09 lguest: avoid sen... |
588 589 590 591 |
/* Don't inform them if nothing used. */ if (!vq->pending_used) return; vq->pending_used = 0; |
ca60a42c9 lguest: don't for... |
592 593 |
/* If they don't want an interrupt, don't send one... */ if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { |
990c91f0a lguest: remove su... |
594 |
return; |
ca60a42c9 lguest: don't for... |
595 |
} |
38bc2b8c5 lguest: implement... |
596 597 598 599 600 |
/* Send the Guest an interrupt tell them we used something up. */ if (write(lguest_fd, buf, sizeof(buf)) != 0) err(1, "Triggering irq %i", vq->config.irq); } |
2e04ef769 lguest: fix comme... |
601 |
/* |
a91d74a3c lguest: update co... |
602 |
* This looks in the virtqueue for the first available buffer, and converts |
17cbca2ba Update example la... |
603 604 605 606 |
* it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * |
a91d74a3c lguest: update co... |
607 |
* This function waits if necessary, and returns the descriptor number found. |
2e04ef769 lguest: fix comme... |
608 |
*/ |
659a0e663 lguest: have exam... |
609 610 611 |
static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], unsigned int *out_num, unsigned int *in_num) |
17cbca2ba Update example la... |
612 |
{ |
d1f0132e7 lguest: add suppo... |
613 614 |
unsigned int i, head, max; struct vring_desc *desc; |
659a0e663 lguest: have exam... |
615 |
u16 last_avail = lg_last_avail(vq); |
a91d74a3c lguest: update co... |
616 |
/* There's nothing available? */ |
659a0e663 lguest: have exam... |
617 618 |
while (last_avail == vq->vring.avail->idx) { u64 event; |
a91d74a3c lguest: update co... |
619 620 621 622 |
/* * Since we're about to sleep, now is a good time to tell the * Guest about what we've used up to now. */ |
38bc2b8c5 lguest: implement... |
623 |
trigger_irq(vq); |
b60da13fc lguest: suppress ... |
624 625 |
/* OK, now we need to know about added descriptors. */ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; |
2e04ef769 lguest: fix comme... |
626 627 628 629 |
/* * They could have slipped one in as we were doing that: make * sure it's written, then check again. */ |
b60da13fc lguest: suppress ... |
630 631 632 633 634 |
mb(); if (last_avail != vq->vring.avail->idx) { vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; break; } |
659a0e663 lguest: have exam... |
635 636 637 |
/* Nothing new? Wait for eventfd to tell us they refilled. */ if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) errx(1, "Event read failed?"); |
b60da13fc lguest: suppress ... |
638 639 640 |
/* We don't need to be notified again. */ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; |
659a0e663 lguest: have exam... |
641 |
} |
17cbca2ba Update example la... |
642 643 |
/* Check it isn't doing very strange things with descriptor numbers. */ |
b5111790f lguest: wrap last... |
644 |
if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) |
17cbca2ba Update example la... |
645 |
errx(1, "Guest moved used index from %u to %u", |
b5111790f lguest: wrap last... |
646 |
last_avail, vq->vring.avail->idx); |
17cbca2ba Update example la... |
647 |
|
2e04ef769 lguest: fix comme... |
648 649 650 651 |
/* * Grab the next descriptor number they're advertising, and increment * the index we've seen. */ |
b5111790f lguest: wrap last... |
652 653 |
head = vq->vring.avail->ring[last_avail % vq->vring.num]; lg_last_avail(vq)++; |
17cbca2ba Update example la... |
654 655 656 657 658 659 660 |
/* If their number is silly, that's a fatal mistake. */ if (head >= vq->vring.num) errx(1, "Guest says index %u is available", head); /* When we start there are none of either input nor output. */ *out_num = *in_num = 0; |
d1f0132e7 lguest: add suppo... |
661 662 |
max = vq->vring.num; desc = vq->vring.desc; |
17cbca2ba Update example la... |
663 |
i = head; |
d1f0132e7 lguest: add suppo... |
664 |
|
2e04ef769 lguest: fix comme... |
665 666 667 668 |
/* * If this is an indirect entry, then this buffer contains a descriptor * table which we handle as if it's any normal descriptor chain. */ |
d1f0132e7 lguest: add suppo... |
669 670 671 672 673 674 675 676 |
if (desc[i].flags & VRING_DESC_F_INDIRECT) { if (desc[i].len % sizeof(struct vring_desc)) errx(1, "Invalid size for indirect buffer table"); max = desc[i].len / sizeof(struct vring_desc); desc = check_pointer(desc[i].addr, desc[i].len); i = 0; } |
17cbca2ba Update example la... |
677 678 |
do { /* Grab the first descriptor, and check it's OK. */ |
d1f0132e7 lguest: add suppo... |
679 |
iov[*out_num + *in_num].iov_len = desc[i].len; |
17cbca2ba Update example la... |
680 |
iov[*out_num + *in_num].iov_base |
d1f0132e7 lguest: add suppo... |
681 |
= check_pointer(desc[i].addr, desc[i].len); |
17cbca2ba Update example la... |
682 |
/* If this is an input descriptor, increment that count. */ |
d1f0132e7 lguest: add suppo... |
683 |
if (desc[i].flags & VRING_DESC_F_WRITE) |
17cbca2ba Update example la... |
684 685 |
(*in_num)++; else { |
2e04ef769 lguest: fix comme... |
686 687 688 689 |
/* * If it's an output descriptor, they're all supposed * to come before any input descriptors. */ |
17cbca2ba Update example la... |
690 691 692 693 694 695 |
if (*in_num) errx(1, "Descriptor has out after in"); (*out_num)++; } /* If we've got too many, that implies a descriptor loop. */ |
d1f0132e7 lguest: add suppo... |
696 |
if (*out_num + *in_num > max) |
17cbca2ba Update example la... |
697 |
errx(1, "Looped descriptor"); |
d1f0132e7 lguest: add suppo... |
698 |
} while ((i = next_desc(desc, i, max)) != max); |
dde797899 lguest: documenta... |
699 |
|
17cbca2ba Update example la... |
700 |
return head; |
8ca47e006 lguest: the docum... |
701 |
} |
2e04ef769 lguest: fix comme... |
702 |
/* |
a91d74a3c lguest: update co... |
703 704 705 |
* After we've used one of their buffers, we tell the Guest about it. Sometime * later we'll want to send them an interrupt using trigger_irq(); note that * wait_for_vq_desc() does that for us if it has to wait. |
2e04ef769 lguest: fix comme... |
706 |
*/ |
17cbca2ba Update example la... |
707 |
static void add_used(struct virtqueue *vq, unsigned int head, int len) |
8ca47e006 lguest: the docum... |
708 |
{ |
17cbca2ba Update example la... |
709 |
struct vring_used_elem *used; |
2e04ef769 lguest: fix comme... |
710 711 712 713 |
/* * The virtqueue contains a ring of used buffers. Get a pointer to the * next entry in that used ring. */ |
17cbca2ba Update example la... |
714 715 716 717 718 719 |
used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; used->id = head; used->len = len; /* Make sure buffer is written before we update index. */ wmb(); vq->vring.used->idx++; |
95c517c09 lguest: avoid sen... |
720 |
vq->pending_used++; |
8ca47e006 lguest: the docum... |
721 |
} |
17cbca2ba Update example la... |
722 |
/*
 * Convenience wrapper: record the buffer as used, then immediately give
 * trigger_irq() the chance to interrupt the Guest about it.
 */
static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
{
	add_used(vq, head, len);
	trigger_irq(vq);
}
e1e72965e lguest: documenta... |
728 729 730 |
/*
 * The Console
 *
 * State kept alongside the console device for the "three ^C" exit hack.
 */
struct console_abort {
	/* Number of ^C characters seen in the current run. */
	int count;
	/* Time at which the first ^C of the current run arrived. */
	struct timeval start;
};
dde797899 lguest: documenta... |
739 |
/* This is the routine which handles console input (ie. stdin). */ |
659a0e663 lguest: have exam... |
740 |
static void console_input(struct virtqueue *vq) |
8ca47e006 lguest: the docum... |
741 |
{ |
8ca47e006 lguest: the docum... |
742 |
int len; |
17cbca2ba Update example la... |
743 |
unsigned int head, in_num, out_num; |
659a0e663 lguest: have exam... |
744 745 |
struct console_abort *abort = vq->dev->priv; struct iovec iov[vq->vring.num]; |
56ae43dfe Example launcher ... |
746 |
|
a91d74a3c lguest: update co... |
747 |
/* Make sure there's a descriptor available. */ |
659a0e663 lguest: have exam... |
748 |
head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
56ae43dfe Example launcher ... |
749 |
if (out_num) |
17cbca2ba Update example la... |
750 |
errx(1, "Output buffers in console in queue?"); |
8ca47e006 lguest: the docum... |
751 |
|
a91d74a3c lguest: update co... |
752 |
/* Read into it. This is where we usually wait. */ |
659a0e663 lguest: have exam... |
753 |
len = readv(STDIN_FILENO, iov, in_num); |
8ca47e006 lguest: the docum... |
754 |
if (len <= 0) { |
659a0e663 lguest: have exam... |
755 |
/* Ran out of input? */ |
8ca47e006 lguest: the docum... |
756 |
warnx("Failed to get console input, ignoring console."); |
2e04ef769 lguest: fix comme... |
757 758 759 760 |
/* * For simplicity, dying threads kill the whole Launcher. So * just nap here. */ |
659a0e663 lguest: have exam... |
761 762 |
for (;;) pause(); |
8ca47e006 lguest: the docum... |
763 |
} |
a91d74a3c lguest: update co... |
764 |
/* Tell the Guest we used a buffer. */ |
659a0e663 lguest: have exam... |
765 |
add_used_and_trigger(vq, head, len); |
8ca47e006 lguest: the docum... |
766 |
|
2e04ef769 lguest: fix comme... |
767 768 |
/* * Three ^C within one second? Exit. |
dde797899 lguest: documenta... |
769 |
* |
659a0e663 lguest: have exam... |
770 771 772 |
* This is such a hack, but works surprisingly well. Each ^C has to * be in a buffer by itself, so they can't be too fast. But we check * that we get three within about a second, so they can't be too |
2e04ef769 lguest: fix comme... |
773 774 |
* slow. */ |
659a0e663 lguest: have exam... |
775 |
if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { |
8ca47e006 lguest: the docum... |
776 |
abort->count = 0; |
659a0e663 lguest: have exam... |
777 778 |
return; } |
8ca47e006 lguest: the docum... |
779 |
|
659a0e663 lguest: have exam... |
780 781 782 783 784 785 786 787 788 789 790 |
abort->count++; if (abort->count == 1) gettimeofday(&abort->start, NULL); else if (abort->count == 3) { struct timeval now; gettimeofday(&now, NULL); /* Kill all Launcher processes with SIGINT, like normal ^C */ if (now.tv_sec <= abort->start.tv_sec+1) kill(0, SIGINT); abort->count = 0; } |
8ca47e006 lguest: the docum... |
791 |
} |
659a0e663 lguest: have exam... |
792 793 |
/* This is the routine which handles console output (ie. stdout). */ static void console_output(struct virtqueue *vq) |
8ca47e006 lguest: the docum... |
794 |
{ |
17cbca2ba Update example la... |
795 |
unsigned int head, out, in; |
17cbca2ba Update example la... |
796 |
struct iovec iov[vq->vring.num]; |
a91d74a3c lguest: update co... |
797 |
/* We usually wait in here, for the Guest to give us something. */ |
659a0e663 lguest: have exam... |
798 799 800 |
head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in console output queue?"); |
a91d74a3c lguest: update co... |
801 802 |
/* writev can return a partial write, so we loop here. */ |
659a0e663 lguest: have exam... |
803 804 |
while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); |
e0377e252 lguest: Do not ex... |
805 806 807 808 |
if (len <= 0) { warn("Write to stdout gave %i (%d)", len, errno); break; } |
659a0e663 lguest: have exam... |
809 |
iov_consume(iov, out, len); |
17cbca2ba Update example la... |
810 |
} |
a91d74a3c lguest: update co... |
811 812 813 814 815 |
/* * We're finished with that buffer: if we're going to sleep, * wait_for_vq_desc() will prod the Guest with an interrupt. */ |
38bc2b8c5 lguest: implement... |
816 |
add_used(vq, head, 0); |
a161883a2 lguest: Tell Gues... |
817 |
} |
e1e72965e lguest: documenta... |
818 819 820 821 |
/* * The Network * * Handling output for network is also simple: we get all the output buffers |
659a0e663 lguest: have exam... |
822 |
* and write them to /dev/net/tun. |
a6bd8e130 lguest: comment d... |
823 |
*/ |
659a0e663 lguest: have exam... |
824 825 826 827 828 |
struct net_info { int tunfd; }; static void net_output(struct virtqueue *vq) |
8ca47e006 lguest: the docum... |
829 |
{ |
659a0e663 lguest: have exam... |
830 831 |
struct net_info *net_info = vq->dev->priv; unsigned int head, out, in; |
17cbca2ba Update example la... |
832 |
struct iovec iov[vq->vring.num]; |
a161883a2 lguest: Tell Gues... |
833 |
|
a91d74a3c lguest: update co... |
834 |
/* We usually wait in here for the Guest to give us a packet. */ |
659a0e663 lguest: have exam... |
835 836 837 |
head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in net output queue?"); |
a91d74a3c lguest: update co... |
838 839 840 841 |
/* * Send the whole thing through to /dev/net/tun. It expects the exact * same format: what a coincidence! */ |
659a0e663 lguest: have exam... |
842 |
if (writev(net_info->tunfd, iov, out) < 0) |
e0377e252 lguest: Do not ex... |
843 |
warnx("Write to tun failed (%d)?", errno); |
a91d74a3c lguest: update co... |
844 845 846 847 848 |
/* * Done with that one; wait_for_vq_desc() will send the interrupt if * all packets are processed. */ |
38bc2b8c5 lguest: implement... |
849 |
add_used(vq, head, 0); |
8ca47e006 lguest: the docum... |
850 |
} |
a91d74a3c lguest: update co... |
851 852 853 854 855 856 |
/*
 * Handling network input is a bit trickier, because I've tried to optimize it.
 *
 * First we have a helper routine which tells us whether reading from this
 * file descriptor (ie. the /dev/net/tun device) will block: it polls with a
 * zero timeout and reports "will block" unless select() says exactly one
 * descriptor is ready.
 */
static bool will_block(int fd)
{
	struct timeval no_wait;
	fd_set readable;
	int ready;

	/* A zero timeout makes select() return immediately. */
	no_wait.tv_sec = 0;
	no_wait.tv_usec = 0;

	FD_ZERO(&readable);
	FD_SET(fd, &readable);

	ready = select(fd + 1, &readable, NULL, NULL, &no_wait);
	if (ready == 1)
		return false;
	return true;
}
a91d74a3c lguest: update co... |
865 866 867 868 869 |
/* * This handles packets coming in from the tun device to our Guest. Like all * service routines, it gets called again as soon as it returns, so you don't * see a while(1) loop here. */ |
659a0e663 lguest: have exam... |
870 |
static void net_input(struct virtqueue *vq) |
8ca47e006 lguest: the docum... |
871 |
{ |
8ca47e006 lguest: the docum... |
872 |
int len; |
659a0e663 lguest: have exam... |
873 874 875 |
unsigned int head, out, in; struct iovec iov[vq->vring.num]; struct net_info *net_info = vq->dev->priv; |
a91d74a3c lguest: update co... |
876 877 878 879 |
/* * Get a descriptor to write an incoming packet into. This will also * send an interrupt if they're out of descriptors. */ |
659a0e663 lguest: have exam... |
880 881 882 |
head = wait_for_vq_desc(vq, iov, &out, &in); if (out) errx(1, "Output buffers in net input queue?"); |
4a8962e21 lguest: try to ba... |
883 |
|
a91d74a3c lguest: update co... |
884 885 886 887 |
/* * If it looks like we'll block reading from the tun device, send them * an interrupt. */ |
4a8962e21 lguest: try to ba... |
888 889 |
if (vq->pending_used && will_block(net_info->tunfd)) trigger_irq(vq); |
a91d74a3c lguest: update co... |
890 891 892 893 |
/* * Read in the packet. This is where we normally wait (when there's no * incoming network traffic). */ |
659a0e663 lguest: have exam... |
894 |
len = readv(net_info->tunfd, iov, in); |
8ca47e006 lguest: the docum... |
895 |
if (len <= 0) |
e0377e252 lguest: Do not ex... |
896 |
warn("Failed to read from tun (%d).", errno); |
a91d74a3c lguest: update co... |
897 898 899 900 901 |
/* * Mark that packet buffer as used, but don't interrupt here. We want * to wait until we've done as much work as we can. */ |
4a8962e21 lguest: try to ba... |
902 |
add_used(vq, head, len); |
659a0e663 lguest: have exam... |
903 |
} |
a91d74a3c lguest: update co... |
904 |
/*:*/ |
dde797899 lguest: documenta... |
905 |
|
a91d74a3c lguest: update co... |
906 |
/* This is the helper to create threads: run the service routine in a loop. */ |
659a0e663 lguest: have exam... |
907 908 909 |
static int do_thread(void *_vq) { struct virtqueue *vq = _vq; |
17cbca2ba Update example la... |
910 |
|
659a0e663 lguest: have exam... |
911 912 913 914 |
for (;;) vq->service(vq); return 0; } |
17cbca2ba Update example la... |
915 |
|
2e04ef769 lguest: fix comme... |
916 917 918 919 |
/*
 * Signal handler used when a child dies: send SIGTERM to our entire process
 * group (pid 0 means "everyone in my group").  As a side effect the shell
 * restores the console for us!  The signal number itself is not used.
 */
static void kill_launcher(int signal)
{
	const pid_t whole_group = 0;

	kill(whole_group, SIGTERM);
}
659a0e663 lguest: have exam... |
924 |
static void reset_device(struct device *dev) |
56ae43dfe Example launcher ... |
925 |
{ |
659a0e663 lguest: have exam... |
926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 |
struct virtqueue *vq; verbose("Resetting device %s ", dev->name); /* Clear any features they've acked. */ memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); /* We're going to be explicitly killing threads, so ignore them. */ signal(SIGCHLD, SIG_IGN); /* Zero out the virtqueues, get rid of their threads */ for (vq = dev->vq; vq; vq = vq->next) { if (vq->thread != (pid_t)-1) { kill(vq->thread, SIGTERM); waitpid(vq->thread, NULL, 0); vq->thread = (pid_t)-1; } memset(vq->vring.desc, 0, vring_size(vq->config.num, LGUEST_VRING_ALIGN)); lg_last_avail(vq) = 0; } dev->running = false; /* Now we care if threads die. */ signal(SIGCHLD, (void *)kill_launcher); |
56ae43dfe Example launcher ... |
952 |
} |
a91d74a3c lguest: update co... |
953 954 955 |
/*L:216 * This actually creates the thread which services the virtqueue for a device. */ |
659a0e663 lguest: have exam... |
956 |
static void create_thread(struct virtqueue *vq) |
5dae785a8 lguest: net block... |
957 |
{ |
2e04ef769 lguest: fix comme... |
958 |
/* |
a91d74a3c lguest: update co... |
959 960 |
* Create stack for thread. Since the stack grows upwards, we point * the stack pointer to the end of this region. |
2e04ef769 lguest: fix comme... |
961 |
*/ |
659a0e663 lguest: have exam... |
962 963 964 965 966 967 968 969 970 |
char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, vq->config.pfn*getpagesize(), 0 }; /* Create a zero-initialized eventfd. */ vq->eventfd = eventfd(0, 0); if (vq->eventfd < 0) err(1, "Creating eventfd"); args[2] = vq->eventfd; |
a91d74a3c lguest: update co... |
971 972 973 974 |
/* * Attach an eventfd to this virtqueue: it will go off when the Guest * does an LHCALL_NOTIFY for this vq. */ |
659a0e663 lguest: have exam... |
975 976 |
if (write(lguest_fd, &args, sizeof(args)) != 0) err(1, "Attaching eventfd"); |
a91d74a3c lguest: update co... |
977 978 979 980 |
/* * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so * we get a signal if it dies. */ |
659a0e663 lguest: have exam... |
981 982 983 |
vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); if (vq->thread == (pid_t)-1) err(1, "Creating clone"); |
a91d74a3c lguest: update co... |
984 985 |
/* We close our local copy now the child has it. */ |
659a0e663 lguest: have exam... |
986 |
close(vq->eventfd); |
5dae785a8 lguest: net block... |
987 |
} |
659a0e663 lguest: have exam... |
988 |
static void start_device(struct device *dev) |
6e5aa7efb virtio: reset fun... |
989 |
{ |
659a0e663 lguest: have exam... |
990 |
unsigned int i; |
6e5aa7efb virtio: reset fun... |
991 |
struct virtqueue *vq; |
659a0e663 lguest: have exam... |
992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 |
verbose("Device %s OK: offered", dev->name); for (i = 0; i < dev->feature_len; i++) verbose(" %02x", get_feature_bits(dev)[i]); verbose(", accepted"); for (i = 0; i < dev->feature_len; i++) verbose(" %02x", get_feature_bits(dev) [dev->feature_len+i]); for (vq = dev->vq; vq; vq = vq->next) { if (vq->service) create_thread(vq); } dev->running = true; } static void cleanup_devices(void) { struct device *dev; for (dev = devices.dev; dev; dev = dev->next) reset_device(dev); |
6e5aa7efb virtio: reset fun... |
1013 |
|
659a0e663 lguest: have exam... |
1014 1015 1016 1017 |
/* If we saved off the original terminal settings, restore them now. */ if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); } |
6e5aa7efb virtio: reset fun... |
1018 |
|
659a0e663 lguest: have exam... |
1019 1020 1021 1022 1023 1024 1025 |
/*
 * When the Guest tells us they updated the status field, we handle it.
 *
 * A zero status resets the device; a status with VIRTIO_CONFIG_S_FAILED set
 * is reported and resets the device if it was running; anything else starts
 * the device, which must not already be running.
 */
static void update_device_status(struct device *dev)
{
	/*
	 * The descriptor is written by the Guest, so read the status once and
	 * make every decision from that same value.
	 */
	unsigned int status = dev->desc->status;

	/* A zero status is a reset, otherwise it's a set of flags. */
	if (status == 0)
		reset_device(dev);
	else if (status & VIRTIO_CONFIG_S_FAILED) {
		warnx("Device %s configuration FAILED", dev->name);
		if (dev->running)
			reset_device(dev);
	} else {
		/*
		 * This is a Guest protocol error, not a failed system call, so
		 * use errx(): err() would tack on an unrelated errno string.
		 */
		if (dev->running)
			errx(1, "Device %s features finalized twice",
			     dev->name);
		start_device(dev);
	}
}
a91d74a3c lguest: update co... |
1035 1036 1037 1038 |
/*L:215 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In * particular, it's used to notify us of device status changes during boot. */ |
56739c802 lguest: cleanup p... |
1039 |
static void handle_output(unsigned long addr) |
8ca47e006 lguest: the docum... |
1040 1041 |
{ struct device *i; |
17cbca2ba Update example la... |
1042 |
|
659a0e663 lguest: have exam... |
1043 |
/* Check each device. */ |
17cbca2ba Update example la... |
1044 |
for (i = devices.dev; i; i = i->next) { |
659a0e663 lguest: have exam... |
1045 |
struct virtqueue *vq; |
a91d74a3c lguest: update co... |
1046 1047 1048 1049 |
/* * Notifications to device descriptors mean they updated the * device status. */ |
6e5aa7efb virtio: reset fun... |
1050 |
if (from_guest_phys(addr) == i->desc) { |
a007a751d lguest: make Laun... |
1051 |
update_device_status(i); |
6e5aa7efb virtio: reset fun... |
1052 1053 |
return; } |
3c3ed482d lguest: Simplify ... |
1054 |
/* Devices should not be used before features are finalized. */ |
17cbca2ba Update example la... |
1055 |
for (vq = i->vq; vq; vq = vq->next) { |
659a0e663 lguest: have exam... |
1056 |
if (addr != vq->config.pfn*getpagesize()) |
6e5aa7efb virtio: reset fun... |
1057 |
continue; |
3c3ed482d lguest: Simplify ... |
1058 |
errx(1, "Notification on %s before setup!", i->name); |
8ca47e006 lguest: the docum... |
1059 1060 |
} } |
dde797899 lguest: documenta... |
1061 |
|
2e04ef769 lguest: fix comme... |
1062 1063 1064 1065 1066 |
/* * Early console write is done using notify on a nul-terminated string * in Guest memory. It's also great for hacking debugging messages * into a Guest. */ |
17cbca2ba Update example la... |
1067 1068 1069 1070 1071 |
if (addr >= guest_limit) errx(1, "Bad NOTIFY %#lx", addr); write(STDOUT_FILENO, from_guest_phys(addr), strnlen(from_guest_phys(addr), guest_limit - addr)); |
8ca47e006 lguest: the docum... |
1072 |
} |
dde797899 lguest: documenta... |
1073 1074 1075 1076 1077 |
/*L:190 * Device Setup * * All devices need a descriptor so the Guest knows it exists, and a "struct * device" so the Launcher can keep track of it. We have common helper |
a6bd8e130 lguest: comment d... |
1078 1079 |
* routines to allocate and manage them. */ |
8ca47e006 lguest: the docum... |
1080 |
|
2e04ef769 lguest: fix comme... |
1081 1082 |
/* * The layout of the device page is a "struct lguest_device_desc" followed by a |
a586d4f60 virtio: simplify ... |
1083 1084 |
* number of virtqueue descriptors, then two sets of feature bits, then an * array of configuration bytes. This routine returns the configuration |
2e04ef769 lguest: fix comme... |
1085 1086 |
* pointer. */ |
a586d4f60 virtio: simplify ... |
1087 1088 1089 |
static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) |
713b15b37 lguest: be parano... |
1090 1091 |
+ dev->num_vq * sizeof(struct lguest_vqconfig) + dev->feature_len * 2; |
17cbca2ba Update example la... |
1092 |
} |
2e04ef769 lguest: fix comme... |
1093 1094 |
/* * This routine allocates a new "struct lguest_device_desc" from descriptor |
a586d4f60 virtio: simplify ... |
1095 |
* table page just above the Guest's normal memory. It returns a pointer to |
2e04ef769 lguest: fix comme... |
1096 1097 |
* that descriptor. */ |
a586d4f60 virtio: simplify ... |
1098 |
static struct lguest_device_desc *new_dev_desc(u16 type) |
17cbca2ba Update example la... |
1099 |
{ |
a586d4f60 virtio: simplify ... |
1100 1101 |
struct lguest_device_desc d = { .type = type }; void *p; |
17cbca2ba Update example la... |
1102 |
|
a586d4f60 virtio: simplify ... |
1103 1104 1105 1106 1107 1108 |
/* Figure out where the next device config is, based on the last one. */ if (devices.lastdev) p = device_config(devices.lastdev) + devices.lastdev->desc->config_len; else p = devices.descpage; |
17cbca2ba Update example la... |
1109 |
|
a586d4f60 virtio: simplify ... |
1110 1111 1112 |
/* We only have one page for all the descriptors. */ if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) errx(1, "Too many devices"); |
17cbca2ba Update example la... |
1113 |
|
a586d4f60 virtio: simplify ... |
1114 1115 |
/* p might not be aligned, so we memcpy in. */ return memcpy(p, &d, sizeof(d)); |
17cbca2ba Update example la... |
1116 |
} |
2e04ef769 lguest: fix comme... |
1117 1118 1119 1120 |
/* * Each device descriptor is followed by the description of its virtqueues. We * specify how many descriptors the virtqueue is to have. */ |
17cbca2ba Update example la... |
1121 |
static void add_virtqueue(struct device *dev, unsigned int num_descs, |
659a0e663 lguest: have exam... |
1122 |
void (*service)(struct virtqueue *)) |
17cbca2ba Update example la... |
1123 1124 1125 1126 |
{ unsigned int pages; struct virtqueue **i, *vq = malloc(sizeof(*vq)); void *p; |
a6bd8e130 lguest: comment d... |
1127 |
/* First we need some memory for this virtqueue. */ |
2966af73e virtio: use LGUES... |
1128 |
pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1) |
42b36cc0c virtio: Force use... |
1129 |
/ getpagesize(); |
17cbca2ba Update example la... |
1130 |
p = get_pages(pages); |
d1c856e0f lguest: Fix unini... |
1131 1132 1133 1134 |
/* Initialize the virtqueue */ vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; |
a91d74a3c lguest: update co... |
1135 1136 1137 1138 1139 |
/* * This is the routine the service thread will run, and its Process ID * once it's running. */ |
659a0e663 lguest: have exam... |
1140 1141 |
vq->service = service; vq->thread = (pid_t)-1; |
d1c856e0f lguest: Fix unini... |
1142 |
|
17cbca2ba Update example la... |
1143 1144 1145 1146 1147 1148 |
/* Initialize the configuration. */ vq->config.num = num_descs; vq->config.irq = devices.next_irq++; vq->config.pfn = to_guest_phys(p) / getpagesize(); /* Initialize the vring. */ |
2966af73e virtio: use LGUES... |
1149 |
vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); |
17cbca2ba Update example la... |
1150 |
|
2e04ef769 lguest: fix comme... |
1151 1152 |
/* * Append virtqueue to this device's descriptor. We use |
a586d4f60 virtio: simplify ... |
1153 1154 |
* device_config() to get the end of the device's current virtqueues; * we check that we haven't added any config or feature information |
2e04ef769 lguest: fix comme... |
1155 1156 |
* yet, otherwise we'd be overwriting them. */ |
a586d4f60 virtio: simplify ... |
1157 1158 |
assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); |
713b15b37 lguest: be parano... |
1159 |
dev->num_vq++; |
a586d4f60 virtio: simplify ... |
1160 1161 1162 1163 |
dev->desc->num_vq++; verbose("Virtqueue page %#lx ", to_guest_phys(p)); |
17cbca2ba Update example la... |
1164 |
|
2e04ef769 lguest: fix comme... |
1165 1166 1167 1168 |
/* * Add to tail of list, so dev->vq is first vq, dev->vq->next is * second. */ |
17cbca2ba Update example la... |
1169 1170 |
for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; |
8ca47e006 lguest: the docum... |
1171 |
} |
2e04ef769 lguest: fix comme... |
1172 1173 1174 1175 |
/* * The first half of the feature bitmask is for us to advertise features. The * second half is for the Guest to accept features. */ |
a586d4f60 virtio: simplify ... |
1176 1177 |
static void add_feature(struct device *dev, unsigned bit) { |
6e5aa7efb virtio: reset fun... |
1178 |
u8 *features = get_feature_bits(dev); |
a586d4f60 virtio: simplify ... |
1179 1180 1181 1182 |
/* We can't extend the feature bits once we've added config bytes */ if (dev->desc->feature_len <= bit / CHAR_BIT) { assert(dev->desc->config_len == 0); |
713b15b37 lguest: be parano... |
1183 |
dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; |
a586d4f60 virtio: simplify ... |
1184 |
} |
a586d4f60 virtio: simplify ... |
1185 1186 |
features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); } |
2e04ef769 lguest: fix comme... |
1187 1188 |
/* * This routine sets the configuration fields for an existing device's |
a586d4f60 virtio: simplify ... |
1189 |
* descriptor. It only works for the last device, but that's OK because that's |
2e04ef769 lguest: fix comme... |
1190 1191 |
* how we use it. */ |
a586d4f60 virtio: simplify ... |
1192 1193 1194 1195 1196 1197 1198 1199 1200 |
static void set_config(struct device *dev, unsigned len, const void *conf) { /* Check we haven't overflowed our single page. */ if (device_config(dev) + len > devices.descpage + getpagesize()) errx(1, "Too many devices"); /* Copy in the config information, and store the length. */ memcpy(device_config(dev), conf, len); dev->desc->config_len = len; |
8ef562d11 lguest: fix descr... |
1201 1202 1203 |
/* Size must fit in config_len field (8 bits)! */ assert(dev->desc->config_len == len); |
a586d4f60 virtio: simplify ... |
1204 |
} |
2e04ef769 lguest: fix comme... |
1205 1206 |
/* * This routine does all the creation and setup of a new device, including |
a91d74a3c lguest: update co... |
1207 1208 |
* calling new_dev_desc() to allocate the descriptor and device memory. We * don't actually start the service threads until later. |
a6bd8e130 lguest: comment d... |
1209 |
* |
2e04ef769 lguest: fix comme... |
1210 1211 |
* See what I mean about userspace being boring? */ |
659a0e663 lguest: have exam... |
1212 |
static struct device *new_device(const char *name, u16 type) |
8ca47e006 lguest: the docum... |
1213 1214 |
{ struct device *dev = malloc(sizeof(*dev)); |
dde797899 lguest: documenta... |
1215 |
/* Now we populate the fields one at a time. */ |
17cbca2ba Update example la... |
1216 |
dev->desc = new_dev_desc(type); |
17cbca2ba Update example la... |
1217 |
dev->name = name; |
d1c856e0f lguest: Fix unini... |
1218 |
dev->vq = NULL; |
713b15b37 lguest: be parano... |
1219 1220 |
dev->feature_len = 0; dev->num_vq = 0; |
659a0e663 lguest: have exam... |
1221 |
dev->running = false; |
a586d4f60 virtio: simplify ... |
1222 |
|
2e04ef769 lguest: fix comme... |
1223 1224 |
/* * Append to device list. Prepending to a single-linked list is |
a586d4f60 virtio: simplify ... |
1225 1226 |
* easier, but the user expects the devices to be arranged on the bus * in command-line order. The first network device on the command line |
2e04ef769 lguest: fix comme... |
1227 1228 |
* is eth0, the first block device /dev/vda, etc. */ |
a586d4f60 virtio: simplify ... |
1229 1230 1231 1232 1233 |
if (devices.lastdev) devices.lastdev->next = dev; else devices.dev = dev; devices.lastdev = dev; |
8ca47e006 lguest: the docum... |
1234 1235 |
return dev; } |
2e04ef769 lguest: fix comme... |
1236 1237 1238 1239 |
/* * Our first setup routine is the console. It's a fairly simple device, but * UNIX tty handling makes it uglier than it could be. */ |
17cbca2ba Update example la... |
1240 |
static void setup_console(void) |
8ca47e006 lguest: the docum... |
1241 1242 |
{ struct device *dev; |
dde797899 lguest: documenta... |
1243 |
/* If we can save the initial standard input settings... */ |
8ca47e006 lguest: the docum... |
1244 1245 |
if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { struct termios term = orig_term; |
2e04ef769 lguest: fix comme... |
1246 1247 1248 1249 |
/* * Then we turn off echo, line buffering and ^C etc: We want a * raw input stream to the Guest. */ |
8ca47e006 lguest: the docum... |
1250 1251 |
term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); |
8ca47e006 lguest: the docum... |
1252 |
} |
659a0e663 lguest: have exam... |
1253 |
dev = new_device("console", VIRTIO_ID_CONSOLE); |
dde797899 lguest: documenta... |
1254 |
/* We store the console state in dev->priv, and initialize it. */ |
8ca47e006 lguest: the docum... |
1255 1256 |
dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; |
8ca47e006 lguest: the docum... |
1257 |
|
2e04ef769 lguest: fix comme... |
1258 1259 |
/* * The console needs two virtqueues: the input then the output. When |
56ae43dfe Example launcher ... |
1260 1261 |
* they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to |
2e04ef769 lguest: fix comme... |
1262 1263 |
* stdout. */ |
659a0e663 lguest: have exam... |
1264 1265 |
add_virtqueue(dev, VIRTQUEUE_NUM, console_input); add_virtqueue(dev, VIRTQUEUE_NUM, console_output); |
17cbca2ba Update example la... |
1266 |
|
659a0e663 lguest: have exam... |
1267 1268 |
verbose("device %u: console ", ++devices.device_num); |
8ca47e006 lguest: the docum... |
1269 |
} |
17cbca2ba Update example la... |
1270 |
/*:*/ |
8ca47e006 lguest: the docum... |
1271 |
|
2e04ef769 lguest: fix comme... |
1272 1273 |
/*M:010 * Inter-guest networking is an interesting area. Simplest is to have a |
17cbca2ba Update example la... |
1274 1275 |
* --sharenet=<name> option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. |
dde797899 lguest: documenta... |
1276 |
* |
9f54288de lguest: update co... |
1277 |
* More sophisticated is to use one of the tools developed for projects like UML |
17cbca2ba Update example la... |
1278 |
* to do networking. |
dde797899 lguest: documenta... |
1279 |
* |
17cbca2ba Update example la... |
1280 1281 1282 1283 1284 1285 1286 |
* Faster is to do virtio bonding in kernel. Doing this 1:1 would be * completely generic ("here's my vring, attach to your vring") and would work * for any traffic. Of course, namespace and permissions issues need to be * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * |
9f54288de lguest: update co... |
1287 |
* Finally, we could use a virtio network switch in the kernel, ie. vhost. |
2e04ef769 lguest: fix comme... |
1288 |
:*/ |
8ca47e006 lguest: the docum... |
1289 1290 1291 |
/*
 * Parse a dotted-quad IPv4 address ("192.168.19.1") into a 32-bit value with
 * the first octet in the most significant byte.  Exits the Launcher with an
 * error if the string is not four decimal numbers in the range 0-255.
 */
static u32 str2ip(const char *ipaddr)
{
	unsigned int b[4];
	unsigned int i;

	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
		errx(1, "Failed to parse IP address '%s'", ipaddr);

	/*
	 * %u happily accepts "300" (and "-1", which wraps to a huge value);
	 * either would spill into the neighbouring octets below.
	 */
	for (i = 0; i < 4; i++) {
		if (b[i] > 255)
			errx(1, "Failed to parse IP address '%s'", ipaddr);
	}

	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
}

/*
 * Parse a colon-separated MAC address ("02:00:c0:a8:13:01") into the six
 * bytes of @mac.  Exits the Launcher with an error if six hex fields cannot
 * be read.
 */
static void str2mac(const char *macaddr, unsigned char mac[6])
{
	unsigned int m[6];
	unsigned int i;

	/* Each field is at most two hex digits, so every value fits a byte. */
	if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
		   &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
		errx(1, "Failed to parse mac address '%s'", macaddr);

	for (i = 0; i < 6; i++)
		mac[i] = m[i];
}
2e04ef769 lguest: fix comme... |
1312 1313 |
/* * This code is "adapted" from libbridge: it attaches the Host end of the |
dde797899 lguest: documenta... |
1314 1315 1316 |
* network device to the bridge device specified by the command line. * * This is yet another James Morris contribution (I'm an IP-level guy, so I |
2e04ef769 lguest: fix comme... |
1317 1318 |
* dislike bridging), and I just try not to break it. */ |
8ca47e006 lguest: the docum... |
1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 |
static void add_to_bridge(int fd, const char *if_name, const char *br_name) { int ifidx; struct ifreq ifr; if (!*br_name) errx(1, "must specify bridge name"); ifidx = if_nametoindex(if_name); if (!ifidx) errx(1, "interface %s does not exist!", if_name); strncpy(ifr.ifr_name, br_name, IFNAMSIZ); |
dec6a2be0 lguest: Support a... |
1332 |
ifr.ifr_name[IFNAMSIZ-1] = '\0'; |
8ca47e006 lguest: the docum... |
1333 1334 1335 1336 |
ifr.ifr_ifindex = ifidx; if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) err(1, "can't add %s to bridge %s", if_name, br_name); } |
2e04ef769 lguest: fix comme... |
1337 1338 |
/* * This sets up the Host end of the network device with an IP address, brings |
dde797899 lguest: documenta... |
1339 |
* it up so packets will flow, the copies the MAC address into the hwaddr |
2e04ef769 lguest: fix comme... |
1340 1341 |
* pointer. */ |
dec6a2be0 lguest: Support a... |
1342 |
static void configure_device(int fd, const char *tapif, u32 ipaddr) |
8ca47e006 lguest: the docum... |
1343 1344 |
{ struct ifreq ifr; |
f846619e7 lguest: clean up ... |
1345 |
struct sockaddr_in sin; |
8ca47e006 lguest: the docum... |
1346 1347 |
memset(&ifr, 0, sizeof(ifr)); |
dec6a2be0 lguest: Support a... |
1348 1349 1350 |
strcpy(ifr.ifr_name, tapif); /* Don't read these incantations. Just cut & paste them like I did! */ |
f846619e7 lguest: clean up ... |
1351 1352 1353 |
sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(ipaddr); memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); |
8ca47e006 lguest: the docum... |
1354 |
if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) |
dec6a2be0 lguest: Support a... |
1355 |
err(1, "Setting %s interface address", tapif); |
8ca47e006 lguest: the docum... |
1356 1357 |
ifr.ifr_flags = IFF_UP; if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) |
dec6a2be0 lguest: Support a... |
1358 1359 |
err(1, "Bringing interface %s up", tapif); } |
dec6a2be0 lguest: Support a... |
1360 |
static int get_tun_device(char tapif[IFNAMSIZ]) |
8ca47e006 lguest: the docum... |
1361 |
{ |
8ca47e006 lguest: the docum... |
1362 |
struct ifreq ifr; |
dec6a2be0 lguest: Support a... |
1363 1364 1365 1366 |
int netfd; /* Start with this zeroed. Messy but sure. */ memset(&ifr, 0, sizeof(ifr)); |
8ca47e006 lguest: the docum... |
1367 |
|
2e04ef769 lguest: fix comme... |
1368 1369 |
/* * We open the /dev/net/tun device and tell it we want a tap device. A |
dde797899 lguest: documenta... |
1370 1371 |
* tap device is like a tun device, only somehow different. To tell * the truth, I completely blundered my way through this code, but it |
2e04ef769 lguest: fix comme... |
1372 1373 |
* works now! */ |
8ca47e006 lguest: the docum... |
1374 |
netfd = open_or_die("/dev/net/tun", O_RDWR); |
398f187d7 lguest: Use GSO/I... |
1375 |
ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; |
8ca47e006 lguest: the docum... |
1376 1377 1378 |
strcpy(ifr.ifr_name, "tap%d"); if (ioctl(netfd, TUNSETIFF, &ifr) != 0) err(1, "configuring /dev/net/tun"); |
dec6a2be0 lguest: Support a... |
1379 |
|
398f187d7 lguest: Use GSO/I... |
1380 1381 1382 |
if (ioctl(netfd, TUNSETOFFLOAD, TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) err(1, "Could not set features for tun device"); |
2e04ef769 lguest: fix comme... |
1383 1384 1385 1386 |
/* * We don't need checksums calculated for packets coming in this * device: trust us! */ |
8ca47e006 lguest: the docum... |
1387 |
ioctl(netfd, TUNSETNOCSUM, 1); |
dec6a2be0 lguest: Support a... |
1388 1389 1390 |
memcpy(tapif, ifr.ifr_name, IFNAMSIZ); return netfd; } |
2e04ef769 lguest: fix comme... |
1391 1392 |
/*L:195 * Our network is a Host<->Guest network. This can either use bridging or |
dec6a2be0 lguest: Support a... |
1393 1394 |
* routing, but the principle is the same: it uses the "tun" device to inject * packets into the Host as if they came in from a normal network card. We |
2e04ef769 lguest: fix comme... |
1395 1396 |
* just shunt packets between the Guest and the tun device. */ |
dec6a2be0 lguest: Support a... |
1397 1398 1399 |
static void setup_tun_net(char *arg) { struct device *dev; |
659a0e663 lguest: have exam... |
1400 1401 |
struct net_info *net_info = malloc(sizeof(*net_info)); int ipfd; |
dec6a2be0 lguest: Support a... |
1402 1403 1404 1405 |
u32 ip = INADDR_ANY; bool bridging = false; char tapif[IFNAMSIZ], *p; struct virtio_net_config conf; |
659a0e663 lguest: have exam... |
1406 |
net_info->tunfd = get_tun_device(tapif); |
dec6a2be0 lguest: Support a... |
1407 |
|
17cbca2ba Update example la... |
1408 |
/* First we create a new network device. */ |
659a0e663 lguest: have exam... |
1409 1410 |
dev = new_device("net", VIRTIO_ID_NET); dev->priv = net_info; |
dde797899 lguest: documenta... |
1411 |
|
2e04ef769 lguest: fix comme... |
1412 |
/* Network devices need a recv and a send queue, just like console. */ |
659a0e663 lguest: have exam... |
1413 1414 |
add_virtqueue(dev, VIRTQUEUE_NUM, net_input); add_virtqueue(dev, VIRTQUEUE_NUM, net_output); |
8ca47e006 lguest: the docum... |
1415 |
|
2e04ef769 lguest: fix comme... |
1416 1417 1418 1419 |
/* * We need a socket to perform the magic network ioctls to bring up the * tap interface, connect to the bridge etc. Any socket will do! */ |
8ca47e006 lguest: the docum... |
1420 1421 1422 |
ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); if (ipfd < 0) err(1, "opening IP socket"); |
dde797899 lguest: documenta... |
1423 |
/* If the command line was --tunnet=bridge:<name> do bridging. */ |
8ca47e006 lguest: the docum... |
1424 |
if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { |
dec6a2be0 lguest: Support a... |
1425 1426 1427 1428 1429 1430 1431 1432 |
arg += strlen(BRIDGE_PFX); bridging = true; } /* A mac address may follow the bridge name or IP address */ p = strchr(arg, ':'); if (p) { str2mac(p+1, conf.mac); |
40c42076e lguest: don't set... |
1433 |
add_feature(dev, VIRTIO_NET_F_MAC); |
dec6a2be0 lguest: Support a... |
1434 |
*p = '\0'; |
dec6a2be0 lguest: Support a... |
1435 1436 1437 1438 1439 1440 |
} /* arg is now either an IP address or a bridge name */ if (bridging) add_to_bridge(ipfd, tapif, arg); else |
8ca47e006 lguest: the docum... |
1441 |
ip = str2ip(arg); |
dec6a2be0 lguest: Support a... |
1442 1443 |
/* Set up the tun device. */ configure_device(ipfd, tapif, ip); |
8ca47e006 lguest: the docum... |
1444 |
|
398f187d7 lguest: Use GSO/I... |
1445 1446 1447 |
/* Expect Guest to handle everything except UFO */ add_feature(dev, VIRTIO_NET_F_CSUM); add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); |
398f187d7 lguest: Use GSO/I... |
1448 1449 1450 1451 1452 1453 |
add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); add_feature(dev, VIRTIO_NET_F_GUEST_ECN); add_feature(dev, VIRTIO_NET_F_HOST_TSO4); add_feature(dev, VIRTIO_NET_F_HOST_TSO6); add_feature(dev, VIRTIO_NET_F_HOST_ECN); |
d1f0132e7 lguest: add suppo... |
1454 1455 |
/* We handle indirect ring entries */ add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); |
a586d4f60 virtio: simplify ... |
1456 |
set_config(dev, sizeof(conf), &conf); |
8ca47e006 lguest: the docum... |
1457 |
|
a586d4f60 virtio: simplify ... |
1458 |
/* We don't need the socket any more; setup is done. */ |
8ca47e006 lguest: the docum... |
1459 |
close(ipfd); |
dec6a2be0 lguest: Support a... |
1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 |
devices.device_num++; if (bridging) verbose("device %u: tun %s attached to bridge: %s ", devices.device_num, tapif, arg); else verbose("device %u: tun %s: %s ", devices.device_num, tapif, arg); |
8ca47e006 lguest: the docum... |
1470 |
} |
a91d74a3c lguest: update co... |
1471 |
/*:*/ |
17cbca2ba Update example la... |
1472 |
|
e1e72965e lguest: documenta... |
1473 |
/* This hangs off device->priv. */ |
1842f23c0 lguest and virtio... |
1474 |
struct vblk_info { |
17cbca2ba Update example la... |
1475 1476 1477 1478 1479 |
/* The size of the file. */ off64_t len; /* The file descriptor for the file. */ int fd; |
17cbca2ba Update example la... |
1480 |
}; |
e1e72965e lguest: documenta... |
1481 1482 1483 |
/*L:210 * The Disk * |
a91d74a3c lguest: update co... |
1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 |
* The disk only has one virtqueue, so it only has one thread. It is really * simple: the Guest asks for a block number and we read or write that position * in the file. * * Before we serviced each virtqueue in a separate thread, that was unacceptably * slow: the Guest waits until the read is finished before running anything * else, even if it could have been doing useful work. * * We could have used async I/O, except it's reputed to suck so hard that * characters actually go missing from your code when you try to use it. |
e1e72965e lguest: documenta... |
1494 |
*/ |
659a0e663 lguest: have exam... |
1495 |
static void blk_request(struct virtqueue *vq) |
17cbca2ba Update example la... |
1496 |
{ |
659a0e663 lguest: have exam... |
1497 |
struct vblk_info *vblk = vq->dev->priv; |
17cbca2ba Update example la... |
1498 1499 |
unsigned int head, out_num, in_num, wlen; int ret; |
cb38fa23c virtio: de-struct... |
1500 |
u8 *in; |
17cbca2ba Update example la... |
1501 |
struct virtio_blk_outhdr *out; |
659a0e663 lguest: have exam... |
1502 |
struct iovec iov[vq->vring.num]; |
17cbca2ba Update example la... |
1503 |
off64_t off; |
a91d74a3c lguest: update co... |
1504 1505 1506 1507 |
/* * Get the next request, where we normally wait. It triggers the * interrupt to acknowledge previously serviced requests (if any). */ |
659a0e663 lguest: have exam... |
1508 |
head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
17cbca2ba Update example la... |
1509 |
|
2e04ef769 lguest: fix comme... |
1510 1511 |
/* * Every block request should contain at least one output buffer |
e1e72965e lguest: documenta... |
1512 |
* (detailing the location on disk and the type of request) and one |
2e04ef769 lguest: fix comme... |
1513 1514 |
* input buffer (to hold the result). */ |
17cbca2ba Update example la... |
1515 1516 1517 1518 1519 |
if (out_num == 0 || in_num == 0) errx(1, "Bad virtblk cmd %u out=%u in=%u", head, out_num, in_num); out = convert(&iov[0], struct virtio_blk_outhdr); |
cb38fa23c virtio: de-struct... |
1520 |
in = convert(&iov[out_num+in_num-1], u8); |
a91d74a3c lguest: update co... |
1521 1522 1523 1524 |
/* * For historical reasons, block operations are expressed in 512 byte * "sectors". */ |
17cbca2ba Update example la... |
1525 |
off = out->sector * 512; |
2e04ef769 lguest: fix comme... |
1526 |
/* |
2e04ef769 lguest: fix comme... |
1527 1528 1529 |
* In general the virtio block driver is allowed to try SCSI commands. * It'd be nice if we supported eject, for example, but we don't. */ |
17cbca2ba Update example la... |
1530 1531 1532 |
if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported "); |
cb38fa23c virtio: de-struct... |
1533 |
*in = VIRTIO_BLK_S_UNSUPP; |
1200e646a lguest: Fix lgues... |
1534 |
wlen = sizeof(*in); |
17cbca2ba Update example la... |
1535 |
} else if (out->type & VIRTIO_BLK_T_OUT) { |
2e04ef769 lguest: fix comme... |
1536 1537 1538 1539 1540 1541 |
/* * Write * * Move to the right location in the block file. This can fail * if they try to write past end. */ |
17cbca2ba Update example la... |
1542 1543 1544 1545 1546 1547 |
if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = writev(vblk->fd, iov+1, out_num-1); verbose("WRITE to sector %llu: %i ", out->sector, ret); |
2e04ef769 lguest: fix comme... |
1548 1549 |
/* * Grr... Now we know how long the descriptor they sent was, we |
17cbca2ba Update example la... |
1550 |
* make sure they didn't try to write over the end of the block |
2e04ef769 lguest: fix comme... |
1551 1552 |
* file (possibly extending it). */ |
17cbca2ba Update example la... |
1553 1554 1555 1556 1557 1558 |
if (ret > 0 && off + ret > vblk->len) { /* Trim it back to the correct length */ ftruncate64(vblk->fd, vblk->len); /* Die, bad Guest, die. */ errx(1, "Write past end %llu+%u", off, ret); } |
7bc9fddab lguest: replace V... |
1559 1560 1561 1562 1563 1564 1565 1566 |
wlen = sizeof(*in); *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else if (out->type & VIRTIO_BLK_T_FLUSH) { /* Flush */ ret = fdatasync(vblk->fd); verbose("FLUSH fdatasync: %i ", ret); |
1200e646a lguest: Fix lgues... |
1567 |
wlen = sizeof(*in); |
cb38fa23c virtio: de-struct... |
1568 |
*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); |
17cbca2ba Update example la... |
1569 |
} else { |
2e04ef769 lguest: fix comme... |
1570 1571 1572 1573 1574 1575 |
/* * Read * * Move to the right location in the block file. This can fail * if they try to read past end. */ |
17cbca2ba Update example la... |
1576 1577 1578 1579 1580 1581 1582 |
if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = readv(vblk->fd, iov+1, in_num-1); verbose("READ from sector %llu: %i ", out->sector, ret); if (ret >= 0) { |
1200e646a lguest: Fix lgues... |
1583 |
wlen = sizeof(*in) + ret; |
cb38fa23c virtio: de-struct... |
1584 |
*in = VIRTIO_BLK_S_OK; |
17cbca2ba Update example la... |
1585 |
} else { |
1200e646a lguest: Fix lgues... |
1586 |
wlen = sizeof(*in); |
cb38fa23c virtio: de-struct... |
1587 |
*in = VIRTIO_BLK_S_IOERR; |
17cbca2ba Update example la... |
1588 1589 |
} } |
a91d74a3c lguest: update co... |
1590 |
/* Finished that request. */ |
38bc2b8c5 lguest: implement... |
1591 |
add_used(vq, head, wlen); |
17cbca2ba Update example la... |
1592 |
} |
e1e72965e lguest: documenta... |
1593 |
/*L:198 This actually sets up a virtual block device. */ |
17cbca2ba Update example la... |
1594 1595 |
static void setup_block_file(const char *filename) { |
17cbca2ba Update example la... |
1596 1597 |
struct device *dev; struct vblk_info *vblk; |
a586d4f60 virtio: simplify ... |
1598 |
struct virtio_blk_config conf; |
17cbca2ba Update example la... |
1599 |
|
2e04ef769 lguest: fix comme... |
1600 |
/* Creat the device. */ |
659a0e663 lguest: have exam... |
1601 |
dev = new_device("block", VIRTIO_ID_BLOCK); |
17cbca2ba Update example la... |
1602 |
|
e1e72965e lguest: documenta... |
1603 |
/* The device has one virtqueue, where the Guest places requests. */ |
659a0e663 lguest: have exam... |
1604 |
add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); |
17cbca2ba Update example la... |
1605 1606 1607 1608 1609 1610 1611 |
/* Allocate the room for our own bookkeeping */ vblk = dev->priv = malloc(sizeof(*vblk)); /* First we open the file and store the length. */ vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); vblk->len = lseek64(vblk->fd, 0, SEEK_END); |
7bc9fddab lguest: replace V... |
1612 1613 |
/* We support FLUSH. */ add_feature(dev, VIRTIO_BLK_F_FLUSH); |
a586d4f60 virtio: simplify ... |
1614 |
|
17cbca2ba Update example la... |
1615 |
/* Tell Guest how many sectors this device has. */ |
a586d4f60 virtio: simplify ... |
1616 |
conf.capacity = cpu_to_le64(vblk->len / 512); |
17cbca2ba Update example la... |
1617 |
|
2e04ef769 lguest: fix comme... |
1618 1619 1620 1621 |
/* * Tell Guest not to put in too many descriptors at once: two are used * for the in and out elements. */ |
a586d4f60 virtio: simplify ... |
1622 1623 |
add_feature(dev, VIRTIO_BLK_F_SEG_MAX); conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); |
8ef562d11 lguest: fix descr... |
1624 1625 |
/* Don't try to put whole struct: we have 8 bit limit. */ set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); |
17cbca2ba Update example la... |
1626 |
|
17cbca2ba Update example la... |
1627 1628 |
verbose("device %u: virtblock %llu sectors ", |
659a0e663 lguest: have exam... |
1629 |
++devices.device_num, le64_to_cpu(conf.capacity)); |
17cbca2ba Update example la... |
1630 |
} |
28fd6d7f9 lguest: virtio-rn... |
1631 |
|
2e04ef769 lguest: fix comme... |
1632 1633 |
/*L:211
 * Our random number generator device reads from /dev/random into the Guest's
 * input buffers.  The usual case is that the Guest doesn't want random numbers
 * and so has no buffers although /dev/random is still readable, whereas
 * console is the reverse.
 *
 * The same logic applies, however.
 */
struct rng_info {
	/* Descriptor the random bytes are read from. */
	int rfd;
};
659a0e663 lguest: have exam... |
1643 |
static void rng_input(struct virtqueue *vq) |
28fd6d7f9 lguest: virtio-rn... |
1644 1645 1646 |
{ int len; unsigned int head, in_num, out_num, totlen = 0; |
659a0e663 lguest: have exam... |
1647 1648 |
struct rng_info *rng_info = vq->dev->priv; struct iovec iov[vq->vring.num]; |
28fd6d7f9 lguest: virtio-rn... |
1649 1650 |
/* First we need a buffer from the Guests's virtqueue. */ |
659a0e663 lguest: have exam... |
1651 |
head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
28fd6d7f9 lguest: virtio-rn... |
1652 1653 |
if (out_num) errx(1, "Output buffers in rng?"); |
2e04ef769 lguest: fix comme... |
1654 |
/* |
a91d74a3c lguest: update co... |
1655 1656 |
* Just like the console write, we loop to cover the whole iovec. * In this case, short reads actually happen quite a bit. |
2e04ef769 lguest: fix comme... |
1657 |
*/ |
28fd6d7f9 lguest: virtio-rn... |
1658 |
while (!iov_empty(iov, in_num)) { |
659a0e663 lguest: have exam... |
1659 |
len = readv(rng_info->rfd, iov, in_num); |
28fd6d7f9 lguest: virtio-rn... |
1660 1661 1662 1663 1664 1665 1666 |
if (len <= 0) err(1, "Read from /dev/random gave %i", len); iov_consume(iov, in_num, len); totlen += len; } /* Tell the Guest about the new input. */ |
38bc2b8c5 lguest: implement... |
1667 |
add_used(vq, head, totlen); |
28fd6d7f9 lguest: virtio-rn... |
1668 |
} |
2e04ef769 lguest: fix comme... |
1669 1670 1671 |
/*L:199 * This creates a "hardware" random number device for the Guest. */ |
28fd6d7f9 lguest: virtio-rn... |
1672 1673 1674 |
static void setup_rng(void) { struct device *dev; |
659a0e663 lguest: have exam... |
1675 |
struct rng_info *rng_info = malloc(sizeof(*rng_info)); |
28fd6d7f9 lguest: virtio-rn... |
1676 |
|
2e04ef769 lguest: fix comme... |
1677 |
/* Our device's privat info simply contains the /dev/random fd. */ |
659a0e663 lguest: have exam... |
1678 |
rng_info->rfd = open_or_die("/dev/random", O_RDONLY); |
28fd6d7f9 lguest: virtio-rn... |
1679 |
|
2e04ef769 lguest: fix comme... |
1680 |
/* Create the new device. */ |
659a0e663 lguest: have exam... |
1681 1682 |
dev = new_device("rng", VIRTIO_ID_RNG); dev->priv = rng_info; |
28fd6d7f9 lguest: virtio-rn... |
1683 1684 |
/* The device has one virtqueue, where the Guest places inbufs. */ |
659a0e663 lguest: have exam... |
1685 |
add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); |
28fd6d7f9 lguest: virtio-rn... |
1686 1687 1688 1689 |
verbose("device %u: rng ", devices.device_num++); } |
a6bd8e130 lguest: comment d... |
1690 |
/* That's the end of device setup. */ |
ec04b13f6 lguest: Reboot su... |
1691 |
|
a6bd8e130 lguest: comment d... |
1692 |
/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ |
ec04b13f6 lguest: Reboot su... |
1693 1694 1695 |
static void __attribute__((noreturn)) restart_guest(void) { unsigned int i; |
2e04ef769 lguest: fix comme... |
1696 1697 1698 1699 |
/* * Since we don't track all open fds, we simply close everything beyond * stderr. */ |
ec04b13f6 lguest: Reboot su... |
1700 1701 |
for (i = 3; i < FD_SETSIZE; i++) close(i); |
8c79873da lguest: turn Wake... |
1702 |
|
659a0e663 lguest: have exam... |
1703 1704 |
/* Reset all the devices (kills all threads). */ cleanup_devices(); |
ec04b13f6 lguest: Reboot su... |
1705 1706 1707 |
execv(main_args[0], main_args); err(1, "Could not exec %s", main_args[0]); } |
8ca47e006 lguest: the docum... |
1708 |
|
2e04ef769 lguest: fix comme... |
1709 1710 1711 1712 |
/*L:220 * Finally we reach the core of the Launcher which runs the Guest, serves * its input and output, and finally, lays it to rest. */ |
56739c802 lguest: cleanup p... |
1713 |
static void __attribute__((noreturn)) run_guest(void) |
8ca47e006 lguest: the docum... |
1714 1715 |
{ for (;;) { |
17cbca2ba Update example la... |
1716 |
unsigned long notify_addr; |
8ca47e006 lguest: the docum... |
1717 1718 1719 |
int readval; /* We read from the /dev/lguest device to run the Guest. */ |
e3283fa0c lguest: adapt lau... |
1720 1721 |
readval = pread(lguest_fd, ¬ify_addr, sizeof(notify_addr), cpu_id); |
8ca47e006 lguest: the docum... |
1722 |
|
17cbca2ba Update example la... |
1723 1724 1725 1726 |
/* One unsigned long means the Guest did HCALL_NOTIFY */ if (readval == sizeof(notify_addr)) { verbose("Notify on address %#lx ", notify_addr); |
56739c802 lguest: cleanup p... |
1727 |
handle_output(notify_addr); |
dde797899 lguest: documenta... |
1728 |
/* ENOENT means the Guest died. Reading tells us why. */ |
8ca47e006 lguest: the docum... |
1729 1730 |
} else if (errno == ENOENT) { char reason[1024] = { 0 }; |
e3283fa0c lguest: adapt lau... |
1731 |
pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); |
8ca47e006 lguest: the docum... |
1732 |
errx(1, "%s", reason); |
ec04b13f6 lguest: Reboot su... |
1733 1734 1735 |
/* ERESTART means that we need to reboot the guest */ } else if (errno == ERESTART) { restart_guest(); |
659a0e663 lguest: have exam... |
1736 1737 |
/* Anything else means a bug or incompatible change. */ } else |
8ca47e006 lguest: the docum... |
1738 |
err(1, "Running guest failed"); |
8ca47e006 lguest: the docum... |
1739 1740 |
} } |
a6bd8e130 lguest: comment d... |
1741 |
/*L:240 |
e1e72965e lguest: documenta... |
1742 1743 1744 |
* This is the end of the Launcher. The good news: we are over halfway * through! The bad news: the most fiendish part of the code still lies ahead * of us. |
dde797899 lguest: documenta... |
1745 |
* |
e1e72965e lguest: documenta... |
1746 1747 |
* Are you ready? Take a deep breath and join me in the core of the Host, in * "make Host". |
2e04ef769 lguest: fix comme... |
1748 |
:*/ |
8ca47e006 lguest: the docum... |
1749 1750 1751 |
/* Long options understood by the Launcher; keep usage() below in step. */
static struct option opts[] = {
	{ "verbose", no_argument, NULL, 'v' },
	{ "tunnet", required_argument, NULL, 't' },
	{ "block", required_argument, NULL, 'b' },
	{ "rng", no_argument, NULL, 'r' },
	{ "initrd", required_argument, NULL, 'i' },
	{ "username", required_argument, NULL, 'u' },
	{ "chroot", required_argument, NULL, 'c' },
	{ NULL, 0, NULL, 0 },
};

/* Print the command-line synopsis and exit with status 1. */
static void usage(void)
{
	errx(1, "Usage: lguest [--verbose] "
	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
	     "|--block=<filename>|--initrd=<filename>|--rng\n"
	     "|--username=<username>|--chroot=<directory>]...\n"
	     "<mem-in-mb> vmlinux [args...]");
}
3c6b5bfa3 Introduce guest m... |
1769 |
/*L:105 The main routine is where the real work begins: */ |
8ca47e006 lguest: the docum... |
1770 1771 |
int main(int argc, char *argv[]) { |
2e04ef769 lguest: fix comme... |
1772 |
/* Memory, code startpoint and size of the (optional) initrd. */ |
58a245664 lguest: move the ... |
1773 |
unsigned long mem = 0, start, initrd_size = 0; |
56739c802 lguest: cleanup p... |
1774 1775 |
/* Two temporaries. */ int i, c; |
3c6b5bfa3 Introduce guest m... |
1776 |
/* The boot information for the Guest. */ |
43d33b21a Use "struct boot_... |
1777 |
struct boot_params *boot; |
dde797899 lguest: documenta... |
1778 |
/* If they specify an initrd file to load. */ |
8ca47e006 lguest: the docum... |
1779 |
const char *initrd_name = NULL; |
8aeb36e8f lguest: --usernam... |
1780 1781 1782 1783 1784 |
/* Password structure for initgroups/setres[gu]id */ struct passwd *user_details = NULL; /* Directory to chroot to */ char *chroot_path = NULL; |
ec04b13f6 lguest: Reboot su... |
1785 1786 |
/* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; |
ec04b13f6 lguest: Reboot su... |
1787 |
|
2e04ef769 lguest: fix comme... |
1788 1789 |
/* * First we initialize the device list. We keep a pointer to the last |
659a0e663 lguest: have exam... |
1790 |
* device, and the next interrupt number to use for devices (1: |
2e04ef769 lguest: fix comme... |
1791 1792 |
* remember that 0 is used by the timer). */ |
a586d4f60 virtio: simplify ... |
1793 |
devices.lastdev = NULL; |
17cbca2ba Update example la... |
1794 |
devices.next_irq = 1; |
8ca47e006 lguest: the docum... |
1795 |
|
a91d74a3c lguest: update co... |
1796 |
/* We're CPU 0. In fact, that's the only CPU possible right now. */ |
e3283fa0c lguest: adapt lau... |
1797 |
cpu_id = 0; |
a91d74a3c lguest: update co... |
1798 |
|
2e04ef769 lguest: fix comme... |
1799 1800 |
/* * We need to know how much memory so we can set up the device |
dde797899 lguest: documenta... |
1801 1802 |
* descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount |
2e04ef769 lguest: fix comme... |
1803 1804 |
* of memory now. */ |
6570c4599 link lguest examp... |
1805 1806 |
for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { |
3c6b5bfa3 Introduce guest m... |
1807 |
mem = atoi(argv[i]) * 1024 * 1024; |
2e04ef769 lguest: fix comme... |
1808 1809 |
/* * We start by mapping anonymous pages over all of |
3c6b5bfa3 Introduce guest m... |
1810 1811 |
* guest-physical memory range. This fills it with 0, * and ensures that the Guest won't be killed when it |
2e04ef769 lguest: fix comme... |
1812 1813 |
* tries to access it. */ |
3c6b5bfa3 Introduce guest m... |
1814 1815 1816 1817 |
guest_base = map_zeroed_pages(mem / getpagesize() + DEVICE_PAGES); guest_limit = mem; guest_max = mem + DEVICE_PAGES*getpagesize(); |
17cbca2ba Update example la... |
1818 |
devices.descpage = get_pages(1); |
6570c4599 link lguest examp... |
1819 1820 1821 |
break; } } |
dde797899 lguest: documenta... |
1822 1823 |
/* The options are fairly straight-forward */ |
8ca47e006 lguest: the docum... |
1824 1825 1826 1827 1828 |
while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { switch (c) { case 'v': verbose = true; break; |
8ca47e006 lguest: the docum... |
1829 |
case 't': |
17cbca2ba Update example la... |
1830 |
setup_tun_net(optarg); |
8ca47e006 lguest: the docum... |
1831 1832 |
break; case 'b': |
17cbca2ba Update example la... |
1833 |
setup_block_file(optarg); |
8ca47e006 lguest: the docum... |
1834 |
break; |
28fd6d7f9 lguest: virtio-rn... |
1835 1836 1837 |
case 'r': setup_rng(); break; |
8ca47e006 lguest: the docum... |
1838 1839 1840 |
case 'i': initrd_name = optarg; break; |
8aeb36e8f lguest: --usernam... |
1841 1842 1843 1844 1845 1846 1847 1848 |
case 'u': user_details = getpwnam(optarg); if (!user_details) err(1, "getpwnam failed, incorrect username?"); break; case 'c': chroot_path = optarg; break; |
8ca47e006 lguest: the docum... |
1849 1850 1851 1852 1853 |
default: warnx("Unknown argument %s", argv[optind]); usage(); } } |
2e04ef769 lguest: fix comme... |
1854 1855 1856 1857 |
/* * After the other arguments we expect memory and kernel image name, * followed by command line arguments for the kernel. */ |
8ca47e006 lguest: the docum... |
1858 1859 |
if (optind + 2 > argc) usage(); |
3c6b5bfa3 Introduce guest m... |
1860 1861 |
verbose("Guest base is at %p ", guest_base); |
dde797899 lguest: documenta... |
1862 |
/* We always have a console device */ |
17cbca2ba Update example la... |
1863 |
setup_console(); |
8ca47e006 lguest: the docum... |
1864 |
|
8ca47e006 lguest: the docum... |
1865 |
/* Now we load the kernel */ |
47436aa4a Boot with virtual... |
1866 |
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
8ca47e006 lguest: the docum... |
1867 |
|
3c6b5bfa3 Introduce guest m... |
1868 1869 |
/* Boot information is stashed at physical address 0 */ boot = from_guest_phys(0); |
dde797899 lguest: documenta... |
1870 |
/* Map the initrd image if requested (at top of physical memory) */ |
8ca47e006 lguest: the docum... |
1871 1872 |
if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); |
2e04ef769 lguest: fix comme... |
1873 1874 1875 1876 |
/* * These are the location in the Linux boot header where the * start and size of the initrd are expected to be found. */ |
43d33b21a Use "struct boot_... |
1877 1878 |
boot->hdr.ramdisk_image = mem - initrd_size; boot->hdr.ramdisk_size = initrd_size; |
dde797899 lguest: documenta... |
1879 |
/* The bootloader type 0xFF means "unknown"; that's OK. */ |
43d33b21a Use "struct boot_... |
1880 |
boot->hdr.type_of_loader = 0xFF; |
8ca47e006 lguest: the docum... |
1881 |
} |
2e04ef769 lguest: fix comme... |
1882 1883 1884 1885 |
/* * The Linux boot header contains an "E820" memory map: ours is a * simple, single region. */ |
43d33b21a Use "struct boot_... |
1886 1887 |
boot->e820_entries = 1; boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); |
2e04ef769 lguest: fix comme... |
1888 1889 1890 1891 |
/* * The boot header contains a command line pointer: we put the command * line after the boot header. */ |
43d33b21a Use "struct boot_... |
1892 |
boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); |
e1e72965e lguest: documenta... |
1893 |
/* We use a simple helper to copy the arguments separated by spaces. */ |
43d33b21a Use "struct boot_... |
1894 |
concat((char *)(boot + 1), argv+optind+2); |
dde797899 lguest: documenta... |
1895 |
|
e22a53982 lguest: allow boo... |
1896 1897 |
/* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */ boot->hdr.kernel_alignment = 0x1000000; |
814a0e5cd Revert lguest mag... |
1898 |
/* Boot protocol version: 2.07 supports the fields for lguest. */ |
43d33b21a Use "struct boot_... |
1899 |
boot->hdr.version = 0x207; |
814a0e5cd Revert lguest mag... |
1900 1901 |
/* The hardware_subarch value of "1" tells the Guest it's an lguest. */ |
43d33b21a Use "struct boot_... |
1902 |
boot->hdr.hardware_subarch = 1; |
814a0e5cd Revert lguest mag... |
1903 |
|
43d33b21a Use "struct boot_... |
1904 1905 |
/* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; |
8ca47e006 lguest: the docum... |
1906 |
|
9f54288de lguest: update co... |
1907 |
/* We tell the kernel to initialize the Guest. */ |
56739c802 lguest: cleanup p... |
1908 |
tell_kernel(start); |
dde797899 lguest: documenta... |
1909 |
|
a91d74a3c lguest: update co... |
1910 |
/* Ensure that we terminate if a device-servicing child dies. */ |
659a0e663 lguest: have exam... |
1911 1912 1913 1914 |
signal(SIGCHLD, kill_launcher); /* If we exit via err(), this kills all the threads, restores tty. */ atexit(cleanup_devices); |
8ca47e006 lguest: the docum... |
1915 |
|
8aeb36e8f lguest: --usernam... |
1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 |
/* If requested, chroot to a directory */ if (chroot_path) { if (chroot(chroot_path) != 0) err(1, "chroot(\"%s\") failed", chroot_path); if (chdir("/") != 0) err(1, "chdir(\"/\") failed"); verbose("chroot done "); } /* If requested, drop privileges */ if (user_details) { uid_t u; gid_t g; u = user_details->pw_uid; g = user_details->pw_gid; if (initgroups(user_details->pw_name, g) != 0) err(1, "initgroups failed"); if (setresgid(g, g, g) != 0) err(1, "setresgid failed"); if (setresuid(u, u, u) != 0) err(1, "setresuid failed"); verbose("Dropping privileges completed "); } |
dde797899 lguest: documenta... |
1948 |
/* Finally, run the Guest. This doesn't return. */ |
56739c802 lguest: cleanup p... |
1949 |
run_guest(); |
8ca47e006 lguest: the docum... |
1950 |
} |
f56a384e9 lguest: documenta... |
1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 |
/*:*/ /*M:999 * Mastery is done: you now know everything I do. * * But surely you have seen code, features and bugs in your wanderings which * you now yearn to attack? That is the real game, and I look forward to you * patching and forking lguest into the Your-Name-Here-visor. * * Farewell, and good coding! * Rusty Russell. */ |