Blame view

tools/lguest/lguest.c 57.6 KB
2e04ef769   Rusty Russell   lguest: fix comme...
1
2
3
4
5
6
  /*P:100
   * This is the Launcher code, a simple program which lays out the "physical"
   * memory for the new Guest by mapping the kernel image and the virtual
   * devices, then opens /dev/lguest to tell the kernel about the Guest and
   * control it.
  :*/
8ca47e006   Rusty Russell   lguest: the docum...
7
8
9
10
11
12
13
14
15
16
  #define _LARGEFILE64_SOURCE
  #define _GNU_SOURCE
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <err.h>
  #include <stdint.h>
  #include <stdlib.h>
  #include <elf.h>
  #include <sys/mman.h>
6649bb7af   Ronald G. Minnich   Accept elf files ...
17
  #include <sys/param.h>
8ca47e006   Rusty Russell   lguest: the docum...
18
19
20
  #include <sys/types.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
659a0e663   Rusty Russell   lguest: have exam...
21
  #include <sys/eventfd.h>
8ca47e006   Rusty Russell   lguest: the docum...
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
  #include <fcntl.h>
  #include <stdbool.h>
  #include <errno.h>
  #include <ctype.h>
  #include <sys/socket.h>
  #include <sys/ioctl.h>
  #include <sys/time.h>
  #include <time.h>
  #include <netinet/in.h>
  #include <net/if.h>
  #include <linux/sockios.h>
  #include <linux/if_tun.h>
  #include <sys/uio.h>
  #include <termios.h>
  #include <getopt.h>
17cbca2ba   Rusty Russell   Update example la...
37
38
  #include <assert.h>
  #include <sched.h>
a586d4f60   Rusty Russell   virtio: simplify ...
39
40
  #include <limits.h>
  #include <stddef.h>
a161883a2   Rusty Russell   lguest: Tell Gues...
41
  #include <signal.h>
8aeb36e8f   Philip Sanderson   lguest: --usernam...
42
43
  #include <pwd.h>
  #include <grp.h>
f846619e7   Rusty Russell   lguest: clean up ...
44
45
46
47
48
49
50
  #include <linux/virtio_config.h>
  #include <linux/virtio_net.h>
  #include <linux/virtio_blk.h>
  #include <linux/virtio_console.h>
  #include <linux/virtio_rng.h>
  #include <linux/virtio_ring.h>
  #include <asm/bootparam.h>
07fe9977b   Davidlohr Bueso   lguest: move the ...
51
  #include "../../include/linux/lguest_launcher.h"
2e04ef769   Rusty Russell   lguest: fix comme...
52
  /*L:110
9f54288de   Rusty Russell   lguest: update co...
53
   * We can ignore the 43 include files we need for this program, but I do want
2e04ef769   Rusty Russell   lguest: fix comme...
54
   * to draw attention to the use of kernel-style types.
db24e8c2e   Rusty Russell   lguest: example l...
55
56
57
58
   *
   * As Linus said, "C is a Spartan language, and so should your naming be."  I
   * like these abbreviations, so we define them here.  Note that u64 is always
   * unsigned long long, which works on all Linux systems: this means that we can
2e04ef769   Rusty Russell   lguest: fix comme...
59
60
   * use %llu in printf for any u64.
   */
db24e8c2e   Rusty Russell   lguest: example l...
61
62
63
64
  /* u64 is spelled out as unsigned long long (not uint64_t) so %llu always fits. */
  typedef unsigned long long u64;
  /* The narrower types are plain aliases of the <stdint.h> fixed-width types. */
  typedef uint32_t u32;
  typedef uint16_t u16;
  typedef uint8_t u8;
dde797899   Rusty Russell   lguest: documenta...
65
  /*:*/
8ca47e006   Rusty Russell   lguest: the docum...
66

8ca47e006   Rusty Russell   lguest: the docum...
67
68
69
70
  /* NOTE(review): presumably the prefix that marks a bridge name in a network option -- confirm at the use site. */
  #define BRIDGE_PFX "bridge:"
  /* Fallback for when none of the included headers supplies SIOCBRADDIF. */
  #ifndef SIOCBRADDIF
  #define SIOCBRADDIF	0x89a2		/* add interface to bridge      */
  #endif
3c6b5bfa3   Rusty Russell   Introduce guest m...
71
72
  /* We can have up to 256 pages for devices. */
  #define DEVICE_PAGES 256
0f0c4fab8   Rusty Russell   lguest: Enlarge v...
73
74
  /* This will occupy 3 pages: it must be a power of 2. */
  #define VIRTQUEUE_NUM 256
8ca47e006   Rusty Russell   lguest: the docum...
75

2e04ef769   Rusty Russell   lguest: fix comme...
76
77
78
79
  /*L:120
   * verbose is both a global flag and a macro.  The C preprocessor allows
   * this, and although I wouldn't recommend it, it works quite nicely here.
   */
8ca47e006   Rusty Russell   lguest: the docum...
80
81
82
  /* The flag: verbose(...) messages are printed only while this is true. */
  static bool verbose;
  /*
   * The function-like macro of the same name.  Only "verbose(" expands to
   * this; a bare "verbose" still names the variable above.  The arguments are
   * handed to printf() unchanged.
   */
  #define verbose(args...) \
  	do { if (verbose) printf(args); } while(0)
dde797899   Rusty Russell   lguest: documenta...
83
  /*:*/
3c6b5bfa3   Rusty Russell   Introduce guest m...
84
85
86
87
  /* The pointer to the start of guest memory (the offset used by from_guest_phys()). */
  static void *guest_base;
  /*
   * The maximum guest physical address allowed, and maximum possible.
   * get_pages() advances guest_limit and refuses to let it pass guest_max.
   */
  static unsigned long guest_limit, guest_max;
56739c802   Rusty Russell   lguest: cleanup p...
88
89
  /* The /dev/lguest file descriptor; tell_kernel() opens it and stores it here. */
  static int lguest_fd;
8ca47e006   Rusty Russell   lguest: the docum...
90

e3283fa0c   Glauber de Oliveira Costa   lguest: adapt lau...
91
92
  /*
   * a per-cpu variable indicating whose vcpu is currently running
   * (__thread storage: each thread has its own copy).
   */
  static unsigned int __thread cpu_id;
dde797899   Rusty Russell   lguest: documenta...
93
  /* This is our list of devices. */
1842f23c0   Rusty Russell   lguest and virtio...
94
  struct device_list {
17cbca2ba   Rusty Russell   Update example la...
95
96
97
98
99
  	/* Counter to assign interrupt numbers. */
  	unsigned int next_irq;
  
  	/* Counter to print out convenient device numbers. */
  	unsigned int device_num;
dde797899   Rusty Russell   lguest: documenta...
100
  	/* The descriptor page for the devices. */
17cbca2ba   Rusty Russell   Update example la...
101
  	u8 *descpage;
dde797899   Rusty Russell   lguest: documenta...
102
  	/* A single linked list of devices. */
8ca47e006   Rusty Russell   lguest: the docum...
103
  	struct device *dev;
2e04ef769   Rusty Russell   lguest: fix comme...
104
  	/* And a pointer to the last device for easy append. */
a586d4f60   Rusty Russell   virtio: simplify ...
105
  	struct device *lastdev;
8ca47e006   Rusty Russell   lguest: the docum...
106
  };
17cbca2ba   Rusty Russell   Update example la...
107
108
  /* The list of Guest devices, based on command line arguments. */
  static struct device_list devices;
dde797899   Rusty Russell   lguest: documenta...
109
  /* The device structure describes a single device. */
1842f23c0   Rusty Russell   lguest and virtio...
110
  struct device {
  	/* The linked-list pointer. */
  	struct device *next;

  	/* The device's descriptor, as mapped into the Guest. */
  	struct lguest_device_desc *desc;

  	/*
  	 * We can't trust desc values once Guest has booted: we use these
  	 * Launcher-private copies instead (see get_feature_bits()).
  	 */
  	unsigned int feature_len;
  	unsigned int num_vq;

  	/* The name of this device, for --verbose. */
  	const char *name;

  	/* Any queues attached to this device (chained through virtqueue->next). */
  	struct virtqueue *vq;

  	/* Is it operational */
  	bool running;

  	/* Device-specific data. */
  	void *priv;
  };
17cbca2ba   Rusty Russell   Update example la...
132
  /* The virtqueue structure describes a queue attached to a device. */
1842f23c0   Rusty Russell   lguest and virtio...
133
  struct virtqueue {
17cbca2ba   Rusty Russell   Update example la...
134
135
136
137
138
139
140
141
142
143
144
145
146
  	struct virtqueue *next;
  
  	/* Which device owns me. */
  	struct device *dev;
  
  	/* The configuration for this queue. */
  	struct lguest_vqconfig config;
  
  	/* The actual ring of buffers. */
  	struct vring vring;
  
  	/* Last available index we saw. */
  	u16 last_avail_idx;
95c517c09   Rusty Russell   lguest: avoid sen...
147
148
  	/* How many are used since we sent last irq? */
  	unsigned int pending_used;
659a0e663   Rusty Russell   lguest: have exam...
149
150
  	/* Eventfd where Guest notifications arrive. */
  	int eventfd;
208876115   Rusty Russell   lguest: notify on...
151

659a0e663   Rusty Russell   lguest: have exam...
152
153
154
  	/* Function for the thread which is servicing this virtqueue. */
  	void (*service)(struct virtqueue *vq);
  	pid_t thread;
17cbca2ba   Rusty Russell   Update example la...
155
  };
ec04b13f6   Balaji Rao   lguest: Reboot su...
156
157
  /* Remember the arguments to the program so we can "reboot" */
  static char **main_args;
659a0e663   Rusty Russell   lguest: have exam...
158
159
  /* The original tty settings to restore on exit. */
  static struct termios orig_term;
2e04ef769   Rusty Russell   lguest: fix comme...
160
161
  /*
   * We have to be careful with barriers: our devices are all run in separate
f7027c638   Rusty Russell   lguest: get more ...
162
   * threads and so we need to make sure that changes visible to the Guest happen
2e04ef769   Rusty Russell   lguest: fix comme...
163
164
   * in precise order.
   */
f7027c638   Rusty Russell   lguest: get more ...
165
  #define wmb() __asm__ __volatile__("" : : : "memory")
b60da13fc   Rusty Russell   lguest: suppress ...
166
  /* Same expansion as wmb(): a compiler barrier, not a CPU memory fence. */
  #define mb() __asm__ __volatile__("" : : : "memory")
17cbca2ba   Rusty Russell   Update example la...
167

2e04ef769   Rusty Russell   lguest: fix comme...
168
169
  /*
   * Convert an iovec element to the given type.
17cbca2ba   Rusty Russell   Update example la...
170
171
172
173
174
175
   *
   * This is a fairly ugly trick: we need to know the size of the type and
   * alignment requirement to check the pointer is kosher.  It's also nice to
   * have the name of the type in case we report failure.
   *
   * Typing those three things all the time is cumbersome and error prone, so we
2e04ef769   Rusty Russell   lguest: fix comme...
176
177
   * have a macro which sets them all up and passes to the real function.
   */
17cbca2ba   Rusty Russell   Update example la...
178
179
180
181
182
183
184
185
186
187
188
189
  #define convert(iov, type) \
  	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))

  /*
   * The worker behind convert(): hand back iov->iov_base once we know the
   * element is exactly "size" bytes long and aligned to "align".  Anything else
   * is fatal; "name" is only used to make the error message readable.
   */
  static void *_convert(struct iovec *iov, size_t size, size_t align,
  		      const char *name)
  {
  	void *base = iov->iov_base;
  	size_t got = iov->iov_len;

  	/* The length has to match the type exactly, not merely be big enough. */
  	if (got != size)
  		errx(1, "Bad iovec size %zu for %s", got, name);

  	/* A misaligned pointer cannot safely be used as the type. */
  	if ((unsigned long)base % align)
  		errx(1, "Bad alignment %p for %s", base, name);

  	return base;
  }
b5111790f   Rusty Russell   lguest: wrap last...
190
191
  /*
   * Wrapper for the last available index.  Makes it easier to change.
   * It expands to the struct member itself, so it can be assigned to as well
   * as read.
   */
  #define lg_last_avail(vq)	((vq)->last_avail_idx)
2e04ef769   Rusty Russell   lguest: fix comme...
192
193
194
195
  /*
   * The virtio configuration space is defined to be little-endian.  x86 is
   * little-endian too, but it's nice to be explicit so we have these helpers.
   */
17cbca2ba   Rusty Russell   Update example la...
196
197
198
199
200
  /* All of these are identity macros: the value passes through with no byte swap. */
  #define cpu_to_le16(v16) (v16)
  #define cpu_to_le32(v32) (v32)
  #define cpu_to_le64(v64) (v64)
  #define le16_to_cpu(v16) (v16)
  #define le32_to_cpu(v32) (v32)
a586d4f60   Rusty Russell   virtio: simplify ...
201
  /* Identity as well: no byte swap is performed. */
  #define le64_to_cpu(v64) (v64)
17cbca2ba   Rusty Russell   Update example la...
202

28fd6d7f9   Rusty Russell   lguest: virtio-rn...
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
  /*
   * True when none of the first num_iov elements has any bytes left (which
   * includes the case of num_iov == 0).
   */
  static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
  {
  	const struct iovec *cur = iov;
  	const struct iovec *end = iov + num_iov;

  	while (cur != end) {
  		if (cur->iov_len != 0)
  			return false;
  		cur++;
  	}
  	return true;
  }
  
  /*
   * Strip len bytes off the front of the iovec: each element in turn has its
   * base advanced and its length reduced until len bytes are accounted for.
   * Asking for more bytes than the iovec holds trips the assert.
   */
  static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
  {
  	unsigned int remaining = len;
  	struct iovec *seg;

  	for (seg = iov; seg < iov + num_iov; seg++) {
  		unsigned int take = remaining;

  		/* Never take more than this element actually holds. */
  		if (seg->iov_len < remaining)
  			take = seg->iov_len;

  		seg->iov_base = (char *)seg->iov_base + take;
  		seg->iov_len -= take;
  		remaining -= take;
  	}
  	assert(remaining == 0);
  }
6e5aa7efb   Rusty Russell   virtio: reset fun...
229
230
231
232
  /* The device virtqueue descriptors are followed by feature bitmasks. */
  static u8 *get_feature_bits(struct device *dev)
  {
  	return (u8 *)(dev->desc + 1)
713b15b37   Rusty Russell   lguest: be parano...
233
  		+ dev->num_vq * sizeof(struct lguest_vqconfig);
6e5aa7efb   Rusty Russell   virtio: reset fun...
234
  }
2e04ef769   Rusty Russell   lguest: fix comme...
235
236
237
238
239
240
  /*L:100
   * The Launcher code itself takes us out into userspace, that scary place where
   * pointers run wild and free!  Unfortunately, like most userspace programs,
   * it's quite boring (which is why everyone likes to hack on the kernel!).
   * Perhaps if you make up an Lguest Drinking Game at this point, it will get
   * you through this section.  Or, maybe not.
3c6b5bfa3   Rusty Russell   Introduce guest m...
241
242
243
244
245
246
   *
   * The Launcher sets up a big chunk of memory to be the Guest's "physical"
   * memory and stores it in "guest_base".  In other words, Guest physical ==
   * Launcher virtual with an offset.
   *
   * This can be tough to get your head around, but usually it just means that we
a33f32244   Francis Galiegue   Documentation/: i...
247
   * use these trivial conversion functions when the Guest gives us its
2e04ef769   Rusty Russell   lguest: fix comme...
248
249
   * "physical" addresses:
   */
3c6b5bfa3   Rusty Russell   Introduce guest m...
250
251
252
253
254
255
256
257
258
  static void *from_guest_phys(unsigned long addr)
  {
  	/* Guest physical address -> Launcher pointer: offset from guest_base. */
  	char *base = guest_base;

  	return base + addr;
  }
  
  static unsigned long to_guest_phys(const void *addr)
  {
  	/* Launcher pointer -> Guest physical address: distance from guest_base. */
  	const char *base = guest_base;
  	const char *here = addr;

  	return (here - base);
  }
dde797899   Rusty Russell   lguest: documenta...
259
260
261
262
  /*L:130
   * Loading the Kernel.
   *
   * We start with a couple of simple helper routines.  open_or_die() avoids
2e04ef769   Rusty Russell   lguest: fix comme...
263
264
   * error-checking code cluttering the callers:
   */
8ca47e006   Rusty Russell   lguest: the docum...
265
266
267
268
269
270
271
  static int open_or_die(const char *name, int flags)
  {
  	const int fd = open(name, flags);

  	if (fd >= 0)
  		return fd;

  	/* err() reports the message together with errno and exits. */
  	err(1, "Failed to open %s", name);
  	return fd;	/* not reached */
  }
3c6b5bfa3   Rusty Russell   Introduce guest m...
272
273
  /*
   * map_zeroed_pages() takes a number of pages and returns the address of that
   * many zeroed, read/write pages.  One inaccessible page is left on either
   * side of them.  Any failure is fatal.
   */
  static void *map_zeroed_pages(unsigned int num)
  {
  	int fd = open_or_die("/dev/zero", O_RDONLY);
  	size_t pagesize = getpagesize();
  	char *addr;

  	/*
  	 * We need num + 2 pages in total; do the sum in size_t and refuse a
  	 * count whose byte size cannot be represented rather than letting it
  	 * wrap into a mapping that is too small.
  	 */
  	if (num > SIZE_MAX / pagesize - 2)
  		errx(1, "Mmapping %u pages of /dev/zero: too many pages", num);

  	/*
  	 * We use a private mapping (ie. if we write to the page, it will be
  	 * copied). We allocate an extra two pages PROT_NONE to act as guard
  	 * pages against read/write attempts that exceed allocated space.
  	 */
  	addr = mmap(NULL, pagesize * ((size_t)num + 2),
  		    PROT_NONE, MAP_PRIVATE, fd, 0);
  	if (addr == MAP_FAILED)
  		err(1, "Mmapping %u pages of /dev/zero", num);

  	/* Open up everything except the first and last page. */
  	if (mprotect(addr + pagesize, pagesize * num,
  		     PROT_READ|PROT_WRITE) == -1)
  		err(1, "mprotect rw %u pages failed", num);

  	/*
  	 * One neat mmap feature is that you can close the fd, and it
  	 * stays mapped.
  	 */
  	close(fd);

  	/* Return address after PROT_NONE page */
  	return addr + pagesize;
  }
  
  /*
   * Get some more pages for a device: hands out the memory at the current
   * guest_limit and moves guest_limit up by num pages.  Fatal if that would
   * go beyond guest_max.
   */
  static void *get_pages(unsigned int num)
  {
  	void *addr = from_guest_phys(guest_limit);
  	unsigned long pagesize = getpagesize();

  	/*
  	 * Compare the page count against the room that is left, instead of
  	 * adding first: a huge request must not wrap guest_limit around and
  	 * sneak under guest_max.
  	 */
  	if (guest_limit > guest_max
  	    || num > (guest_max - guest_limit) / pagesize)
  		errx(1, "Not enough memory for devices");

  	guest_limit += num * pagesize;
  	return addr;
  }
2e04ef769   Rusty Russell   lguest: fix comme...
311
312
  /*
   * This routine is used to load the kernel or initrd.  It tries mmap, but if
6649bb7af   Ronald G. Minnich   Accept elf files ...
313
   * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
2e04ef769   Rusty Russell   lguest: fix comme...
314
315
   * it falls back to reading the memory in.
   */
6649bb7af   Ronald G. Minnich   Accept elf files ...
316
317
318
  static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
  {
  	ssize_t r;

  	/*
  	 * We map writable even though some segments are marked read-only.
  	 * The kernel really wants to be writable: it patches its own
  	 * instructions.
  	 *
  	 * MAP_PRIVATE means that the page won't be copied until a write is
  	 * done to it.  This allows us to share untouched memory between
  	 * Guests.
  	 */
  	if (mmap(addr, len, PROT_READ|PROT_WRITE,
  		 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
  		return;

  	/* pread does a seek and a read in one shot: saves a few lines. */
  	r = pread(fd, addr, len, offset);

  	/* A real error: errno is meaningful, so let err() print it. */
  	if (r < 0)
  		err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);

  	/*
  	 * A short read is not an errno failure: report it without appending
  	 * whatever stale errno the failed mmap left behind.
  	 */
  	if ((unsigned long)r != len)
  		errx(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
  }
2e04ef769   Rusty Russell   lguest: fix comme...
337
338
  /*
   * This routine takes an open vmlinux image, which is in ELF, and maps it into
dde797899   Rusty Russell   lguest: documenta...
339
340
341
342
   * the Guest memory.  ELF = Executable and Linkable Format, which is the format used
   * by all modern binaries on Linux including the kernel.
   *
   * The ELF headers give *two* addresses: a physical address, and a virtual
47436aa4a   Rusty Russell   Boot with virtual...
343
344
   * address.  We use the physical address; the Guest will map itself to the
   * virtual address.
dde797899   Rusty Russell   lguest: documenta...
345
   *
2e04ef769   Rusty Russell   lguest: fix comme...
346
347
   * We return the starting address.
   */
47436aa4a   Rusty Russell   Boot with virtual...
348
  static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
  {
  	unsigned int i;

  	/*
  	 * Sanity checks on the main ELF header: an x86 executable with a
  	 * reasonable number of correctly-sized program headers.  This has to
  	 * come before anything is sized from e_phnum.
  	 */
  	if (ehdr->e_type != ET_EXEC
  	    || ehdr->e_machine != EM_386
  	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
  	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
  		errx(1, "Malformed elf header");

  	/*
  	 * An ELF executable contains an ELF header and a number of "program"
  	 * headers which indicate which parts ("segments") of the program to
  	 * load where.
  	 *
  	 * e_phnum is now known to be between 1 and 65536/sizeof(Elf32_Phdr),
  	 * so the on-stack array is neither empty nor larger than 64k.
  	 */
  	Elf32_Phdr phdr[ehdr->e_phnum];

  	/* We read in all the program headers at once: */
  	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
  		err(1, "Seeking to program headers");
  	if (read(elf_fd, phdr, sizeof(phdr)) != (ssize_t)sizeof(phdr))
  		err(1, "Reading program headers");

  	/*
  	 * Try all the headers: there are usually only three.  A read-only one,
  	 * a read-write one, and a "note" section which we don't load.
  	 */
  	for (i = 0; i < ehdr->e_phnum; i++) {
  		/* If this isn't a loadable segment, we ignore it */
  		if (phdr[i].p_type != PT_LOAD)
  			continue;

  		verbose("Section %u: size %u addr %p\n",
  			i, phdr[i].p_memsz,
  			(void *)(unsigned long)phdr[i].p_paddr);

  		/* We map this section of the file at its physical address. */
  		map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
  		       phdr[i].p_offset, phdr[i].p_filesz);
  	}

  	/* The entry point is given in the ELF header. */
  	return ehdr->e_entry;
  }
2e04ef769   Rusty Russell   lguest: fix comme...
392
393
394
395
  /*L:150
   * A bzImage, unlike an ELF file, is not meant to be loaded.  You're supposed
   * to jump into it and it will unpack itself.  We used to have to perform some
   * hairy magic because the unpacking code scared me.
dde797899   Rusty Russell   lguest: documenta...
396
   *
5bbf89fc2   Rusty Russell   Loading bzImage d...
397
398
   * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
   * a small patch to jump over the tricky bits in the Guest, so now we just read
2e04ef769   Rusty Russell   lguest: fix comme...
399
400
   * the funky header so we know where in the file to load, and away we go!
   */
47436aa4a   Rusty Russell   Boot with virtual...
401
  static unsigned long load_bzimage(int fd)
8ca47e006   Rusty Russell   lguest: the docum...
402
  {
43d33b21a   Rusty Russell   Use "struct boot_...
403
  	struct boot_params boot;
5bbf89fc2   Rusty Russell   Loading bzImage d...
404
405
406
  	int r;
  	/* Modern bzImages get loaded at 1M. */
  	void *p = from_guest_phys(0x100000);
2e04ef769   Rusty Russell   lguest: fix comme...
407
408
  	/*
  	 * Go back to the start of the file and read the header.  It should be
395cf9691   Paul Bolle   doc: fix broken r...
409
  	 * a Linux boot header (see Documentation/x86/boot.txt)
2e04ef769   Rusty Russell   lguest: fix comme...
410
  	 */
5bbf89fc2   Rusty Russell   Loading bzImage d...
411
  	lseek(fd, 0, SEEK_SET);
43d33b21a   Rusty Russell   Use "struct boot_...
412
  	read(fd, &boot, sizeof(boot));
5bbf89fc2   Rusty Russell   Loading bzImage d...
413

43d33b21a   Rusty Russell   Use "struct boot_...
414
415
  	/* Inside the setup_hdr, we expect the magic "HdrS" */
  	if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
5bbf89fc2   Rusty Russell   Loading bzImage d...
416
  		errx(1, "This doesn't look like a bzImage to me");
43d33b21a   Rusty Russell   Use "struct boot_...
417
418
  	/* Skip over the extra sectors of the header. */
  	lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
5bbf89fc2   Rusty Russell   Loading bzImage d...
419
420
421
422
  
  	/* Now read everything into memory. in nice big chunks. */
  	while ((r = read(fd, p, 65536)) > 0)
  		p += r;
43d33b21a   Rusty Russell   Use "struct boot_...
423
424
  	/* Finally, code32_start tells us where to enter the kernel. */
  	return boot.hdr.code32_start;
8ca47e006   Rusty Russell   lguest: the docum...
425
  }
2e04ef769   Rusty Russell   lguest: fix comme...
426
427
  /*L:140
   * Loading the kernel is easy when it's a "vmlinux", but most kernels
e1e72965e   Rusty Russell   lguest: documenta...
428
   * come wrapped up in the self-decompressing "bzImage" format.  With a little
2e04ef769   Rusty Russell   lguest: fix comme...
429
430
   * work, we can load those, too.
   */
47436aa4a   Rusty Russell   Boot with virtual...
431
  static unsigned long load_kernel(int fd)
  {
  	Elf32_Ehdr hdr;

  	/* Read in the first few bytes. */
  	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
  		err(1, "Reading kernel");

  	/* If it's an ELF file, it starts with "\177ELF" */
  	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
  		return map_elf(fd, &hdr);

  	/* Otherwise we assume it's a bzImage, and try to load it. */
  	return load_bzimage(fd);
  }
2e04ef769   Rusty Russell   lguest: fix comme...
444
445
  /*
   * This is a trivial little helper to align pages.  Andi Kleen hated it because
dde797899   Rusty Russell   lguest: documenta...
446
447
448
   * it calls getpagesize() twice: "it's dumb code."
   *
   * Kernel guys get really het up about optimization, even when it's not
2e04ef769   Rusty Russell   lguest: fix comme...
449
450
   * necessary.  I leave this code as a reaction against that.
   */
8ca47e006   Rusty Russell   lguest: the docum...
451
452
  static inline unsigned long page_align(unsigned long addr)
  {
  	/*
  	 * Round addr up to the next multiple of the page size: add upwards
  	 * and truncate downwards.  An already-aligned addr comes back as is.
  	 */
  	return ((addr + getpagesize()-1) & ~(getpagesize()-1));
  }
2e04ef769   Rusty Russell   lguest: fix comme...
456
457
458
459
460
  /*L:180
   * An "initial ram disk" is a disk image loaded into memory along with the
   * kernel which the kernel can use to boot from without needing any drivers.
   * Most distributions now use this as standard: the initrd contains the code to
   * load the appropriate driver modules for the current machine.
dde797899   Rusty Russell   lguest: documenta...
461
462
   *
   * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
2e04ef769   Rusty Russell   lguest: fix comme...
463
464
   * kernels.  He sent me this (and tells me when I break it).
   */
8ca47e006   Rusty Russell   lguest: the docum...
465
466
467
468
469
  static unsigned long load_initrd(const char *name, unsigned long mem)
  {
  	int ifd;
  	struct stat st;
  	unsigned long len;

  	ifd = open_or_die(name, O_RDONLY);
  	/* fstat() is needed to get the file size. */
  	if (fstat(ifd, &st) < 0)
  		err(1, "fstat() on initrd '%s'", name);

  	/*
  	 * We map the initrd at the top of memory, but mmap wants it to be
  	 * page-aligned, so we round the size up for that.
  	 */
  	len = page_align(st.st_size);

  	/* An initrd bigger than memory would make "mem - len" wrap around. */
  	if (len > mem)
  		errx(1, "initrd '%s' (%lu bytes) does not fit in %lu bytes of memory",
  		     name, len, mem);

  	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
  	/*
  	 * Once a file is mapped, you can close the file descriptor.  It's a
  	 * little odd, but quite useful.
  	 */
  	close(ifd);
  	verbose("mapped initrd %s size=%lu @ %p\n", name, len,
  		(void *)(mem - len));

  	/* We return the initrd size (rounded up to whole pages). */
  	return len;
  }
e1e72965e   Rusty Russell   lguest: documenta...
492
  /*:*/
8ca47e006   Rusty Russell   lguest: the docum...
493

2e04ef769   Rusty Russell   lguest: fix comme...
494
495
496
497
  /*
   * Simple routine to roll all the commandline arguments together with spaces
   * between them.
   */
8ca47e006   Rusty Russell   lguest: the docum...
498
499
500
501
502
  static void concat(char *dst, char *args[])
  {
  	unsigned int i, len = 0;

  	/*
  	 * args is a NULL-terminated array.  The caller must supply a dst big
  	 * enough for every argument, the separating spaces and the final NUL:
  	 * there is no way to check that from in here.
  	 */
  	for (i = 0; args[i]; i++) {
  		size_t arglen = strlen(args[i]);

  		/* A single space goes between arguments, not before the first. */
  		if (i)
  			dst[len++] = ' ';
  		memcpy(dst + len, args[i], arglen);
  		len += arglen;
  	}
  	/* Terminate; this also covers the case of no arguments at all. */
  	dst[len] = '\0';
  }
2e04ef769   Rusty Russell   lguest: fix comme...
513
514
  /*L:185
   * This is where we actually tell the kernel to initialize the Guest.  We
e1e72965e   Rusty Russell   lguest: documenta...
515
   * saw the arguments it expects when we looked at initialize() in lguest_user.c:
58a245664   Matias Zabaljauregui   lguest: move the ...
516
   * the base of Guest "physical" memory, the top physical page to allow and the
2e04ef769   Rusty Russell   lguest: fix comme...
517
518
   * entry point for the Guest.
   */
56739c802   Rusty Russell   lguest: cleanup p...
519
  static void tell_kernel(unsigned long start)
8ca47e006   Rusty Russell   lguest: the docum...
520
  {
511801dc3   Jes Sorensen   Change example la...
521
522
  	unsigned long args[] = { LHREQ_INITIALIZE,
  				 (unsigned long)guest_base,
58a245664   Matias Zabaljauregui   lguest: move the ...
523
  				 guest_limit / getpagesize(), start };
3c6b5bfa3   Rusty Russell   Introduce guest m...
524
525
526
  	verbose("Guest: %p - %p (%#lx)
  ",
  		guest_base, guest_base + guest_limit, guest_limit);
56739c802   Rusty Russell   lguest: cleanup p...
527
528
  	lguest_fd = open_or_die("/dev/lguest", O_RDWR);
  	if (write(lguest_fd, args, sizeof(args)) < 0)
8ca47e006   Rusty Russell   lguest: the docum...
529
  		err(1, "Writing to /dev/lguest");
8ca47e006   Rusty Russell   lguest: the docum...
530
  }
dde797899   Rusty Russell   lguest: documenta...
531
  /*:*/
8ca47e006   Rusty Russell   lguest: the docum...
532

a91d74a3c   Rusty Russell   lguest: update co...
533
  /*L:200
dde797899   Rusty Russell   lguest: documenta...
534
535
   * Device Handling.
   *
e1e72965e   Rusty Russell   lguest: documenta...
536
   * When the Guest gives us a buffer, it sends an array of addresses and sizes.
dde797899   Rusty Russell   lguest: documenta...
537
   * We need to make sure it's not trying to reach into the Launcher itself, so
e1e72965e   Rusty Russell   lguest: documenta...
538
   * we have a convenient routine which checks it and exits with an error message
dde797899   Rusty Russell   lguest: documenta...
539
540
   * if something funny is going on:
   */
8ca47e006   Rusty Russell   lguest: the docum...
541
542
543
  static void *_check_pointer(unsigned long addr, unsigned int size,
  			    unsigned int line)
  {
2e04ef769   Rusty Russell   lguest: fix comme...
544
  	/*
5230ff0cc   Philip Sanderson   lguest: example l...
545
546
  	 * Check if the requested address and size exceeds the allocated memory,
  	 * or addr + size wraps around.
2e04ef769   Rusty Russell   lguest: fix comme...
547
  	 */
5230ff0cc   Philip Sanderson   lguest: example l...
548
  	if ((addr + size) > guest_limit || (addr + size) < addr)
17cbca2ba   Rusty Russell   Update example la...
549
  		errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
2e04ef769   Rusty Russell   lguest: fix comme...
550
551
552
553
  	/*
  	 * We return a pointer for the caller's convenience, now we know it's
  	 * safe to use.
  	 */
3c6b5bfa3   Rusty Russell   Introduce guest m...
554
  	return from_guest_phys(addr);
8ca47e006   Rusty Russell   lguest: the docum...
555
  }
dde797899   Rusty Russell   lguest: documenta...
556
  /* A macro which transparently hands the line number to the real function. */
8ca47e006   Rusty Russell   lguest: the docum...
557
  #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
2e04ef769   Rusty Russell   lguest: fix comme...
558
559
  /*
   * Each buffer in the virtqueues is actually a chain of descriptors.  This
e1e72965e   Rusty Russell   lguest: documenta...
560
   * function returns the next descriptor in the chain, or vq->vring.num if we're
2e04ef769   Rusty Russell   lguest: fix comme...
561
562
   * at the end.
   */
d1f0132e7   Mark McLoughlin   lguest: add suppo...
563
564
  static unsigned next_desc(struct vring_desc *desc,
  			  unsigned int i, unsigned int max)
17cbca2ba   Rusty Russell   Update example la...
565
566
567
568
  {
  	unsigned int next;
  
  	/* If this descriptor says it doesn't chain, we're done. */
d1f0132e7   Mark McLoughlin   lguest: add suppo...
569
570
  	if (!(desc[i].flags & VRING_DESC_F_NEXT))
  		return max;
17cbca2ba   Rusty Russell   Update example la...
571
572
  
  	/* Check they're not leading us off end of descriptors. */
d1f0132e7   Mark McLoughlin   lguest: add suppo...
573
  	next = desc[i].next;
17cbca2ba   Rusty Russell   Update example la...
574
575
  	/* Make sure compiler knows to grab that: we don't want it changing! */
  	wmb();
d1f0132e7   Mark McLoughlin   lguest: add suppo...
576
  	if (next >= max)
17cbca2ba   Rusty Russell   Update example la...
577
578
579
580
  		errx(1, "Desc next is %u", next);
  
  	return next;
  }
a91d74a3c   Rusty Russell   lguest: update co...
581
582
583
584
  /*
   * This actually sends the interrupt for this virtqueue, if we've used a
   * buffer.
   */
38bc2b8c5   Rusty Russell   lguest: implement...
585
586
587
  static void trigger_irq(struct virtqueue *vq)
  {
  	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
95c517c09   Rusty Russell   lguest: avoid sen...
588
589
590
591
  	/* Don't inform them if nothing used. */
  	if (!vq->pending_used)
  		return;
  	vq->pending_used = 0;
ca60a42c9   Rusty Russell   lguest: don't for...
592
593
  	/* If they don't want an interrupt, don't send one... */
  	if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
990c91f0a   Rusty Russell   lguest: remove su...
594
  		return;
ca60a42c9   Rusty Russell   lguest: don't for...
595
  	}
38bc2b8c5   Rusty Russell   lguest: implement...
596
597
598
599
600
  
  	/* Send the Guest an interrupt tell them we used something up. */
  	if (write(lguest_fd, buf, sizeof(buf)) != 0)
  		err(1, "Triggering irq %i", vq->config.irq);
  }
2e04ef769   Rusty Russell   lguest: fix comme...
601
  /*
a91d74a3c   Rusty Russell   lguest: update co...
602
   * This looks in the virtqueue for the first available buffer, and converts
17cbca2ba   Rusty Russell   Update example la...
603
604
605
606
   * it to an iovec for convenient access.  Since descriptors consist of some
   * number of output then some number of input descriptors, it's actually two
   * iovecs, but we pack them into one and note how many of each there were.
   *
a91d74a3c   Rusty Russell   lguest: update co...
607
   * This function waits if necessary, and returns the descriptor number found.
2e04ef769   Rusty Russell   lguest: fix comme...
608
   */
659a0e663   Rusty Russell   lguest: have exam...
609
610
611
  static unsigned wait_for_vq_desc(struct virtqueue *vq,
  				 struct iovec iov[],
  				 unsigned int *out_num, unsigned int *in_num)
17cbca2ba   Rusty Russell   Update example la...
612
  {
d1f0132e7   Mark McLoughlin   lguest: add suppo...
613
614
  	unsigned int i, head, max;
  	struct vring_desc *desc;
659a0e663   Rusty Russell   lguest: have exam...
615
  	u16 last_avail = lg_last_avail(vq);
a91d74a3c   Rusty Russell   lguest: update co...
616
  	/* There's nothing available? */
659a0e663   Rusty Russell   lguest: have exam...
617
618
  	while (last_avail == vq->vring.avail->idx) {
  		u64 event;
a91d74a3c   Rusty Russell   lguest: update co...
619
620
621
622
  		/*
  		 * Since we're about to sleep, now is a good time to tell the
  		 * Guest about what we've used up to now.
  		 */
38bc2b8c5   Rusty Russell   lguest: implement...
623
  		trigger_irq(vq);
b60da13fc   Rusty Russell   lguest: suppress ...
624
625
  		/* OK, now we need to know about added descriptors. */
  		vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
2e04ef769   Rusty Russell   lguest: fix comme...
626
627
628
629
  		/*
  		 * They could have slipped one in as we were doing that: make
  		 * sure it's written, then check again.
  		 */
b60da13fc   Rusty Russell   lguest: suppress ...
630
631
632
633
634
  		mb();
  		if (last_avail != vq->vring.avail->idx) {
  			vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
  			break;
  		}
659a0e663   Rusty Russell   lguest: have exam...
635
636
637
  		/* Nothing new?  Wait for eventfd to tell us they refilled. */
  		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
  			errx(1, "Event read failed?");
b60da13fc   Rusty Russell   lguest: suppress ...
638
639
640
  
  		/* We don't need to be notified again. */
  		vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
659a0e663   Rusty Russell   lguest: have exam...
641
  	}
17cbca2ba   Rusty Russell   Update example la...
642
643
  
  	/* Check it isn't doing very strange things with descriptor numbers. */
b5111790f   Rusty Russell   lguest: wrap last...
644
  	if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
17cbca2ba   Rusty Russell   Update example la...
645
  		errx(1, "Guest moved used index from %u to %u",
b5111790f   Rusty Russell   lguest: wrap last...
646
  		     last_avail, vq->vring.avail->idx);
17cbca2ba   Rusty Russell   Update example la...
647

2e04ef769   Rusty Russell   lguest: fix comme...
648
649
650
651
  	/*
  	 * Grab the next descriptor number they're advertising, and increment
  	 * the index we've seen.
  	 */
b5111790f   Rusty Russell   lguest: wrap last...
652
653
  	head = vq->vring.avail->ring[last_avail % vq->vring.num];
  	lg_last_avail(vq)++;
17cbca2ba   Rusty Russell   Update example la...
654
655
656
657
658
659
660
  
  	/* If their number is silly, that's a fatal mistake. */
  	if (head >= vq->vring.num)
  		errx(1, "Guest says index %u is available", head);
  
  	/* When we start there are none of either input nor output. */
  	*out_num = *in_num = 0;
d1f0132e7   Mark McLoughlin   lguest: add suppo...
661
662
  	max = vq->vring.num;
  	desc = vq->vring.desc;
17cbca2ba   Rusty Russell   Update example la...
663
  	i = head;
d1f0132e7   Mark McLoughlin   lguest: add suppo...
664

2e04ef769   Rusty Russell   lguest: fix comme...
665
666
667
668
  	/*
  	 * If this is an indirect entry, then this buffer contains a descriptor
  	 * table which we handle as if it's any normal descriptor chain.
  	 */
d1f0132e7   Mark McLoughlin   lguest: add suppo...
669
670
671
672
673
674
675
676
  	if (desc[i].flags & VRING_DESC_F_INDIRECT) {
  		if (desc[i].len % sizeof(struct vring_desc))
  			errx(1, "Invalid size for indirect buffer table");
  
  		max = desc[i].len / sizeof(struct vring_desc);
  		desc = check_pointer(desc[i].addr, desc[i].len);
  		i = 0;
  	}
17cbca2ba   Rusty Russell   Update example la...
677
678
  	do {
  		/* Grab the first descriptor, and check it's OK. */
d1f0132e7   Mark McLoughlin   lguest: add suppo...
679
  		iov[*out_num + *in_num].iov_len = desc[i].len;
17cbca2ba   Rusty Russell   Update example la...
680
  		iov[*out_num + *in_num].iov_base
d1f0132e7   Mark McLoughlin   lguest: add suppo...
681
  			= check_pointer(desc[i].addr, desc[i].len);
17cbca2ba   Rusty Russell   Update example la...
682
  		/* If this is an input descriptor, increment that count. */
d1f0132e7   Mark McLoughlin   lguest: add suppo...
683
  		if (desc[i].flags & VRING_DESC_F_WRITE)
17cbca2ba   Rusty Russell   Update example la...
684
685
  			(*in_num)++;
  		else {
2e04ef769   Rusty Russell   lguest: fix comme...
686
687
688
689
  			/*
  			 * If it's an output descriptor, they're all supposed
  			 * to come before any input descriptors.
  			 */
17cbca2ba   Rusty Russell   Update example la...
690
691
692
693
694
695
  			if (*in_num)
  				errx(1, "Descriptor has out after in");
  			(*out_num)++;
  		}
  
  		/* If we've got too many, that implies a descriptor loop. */
d1f0132e7   Mark McLoughlin   lguest: add suppo...
696
  		if (*out_num + *in_num > max)
17cbca2ba   Rusty Russell   Update example la...
697
  			errx(1, "Looped descriptor");
d1f0132e7   Mark McLoughlin   lguest: add suppo...
698
  	} while ((i = next_desc(desc, i, max)) != max);
dde797899   Rusty Russell   lguest: documenta...
699

17cbca2ba   Rusty Russell   Update example la...
700
  	return head;
8ca47e006   Rusty Russell   lguest: the docum...
701
  }
2e04ef769   Rusty Russell   lguest: fix comme...
702
  /*
a91d74a3c   Rusty Russell   lguest: update co...
703
704
705
   * After we've used one of their buffers, we tell the Guest about it.  Sometime
   * later we'll want to send them an interrupt using trigger_irq(); note that
   * wait_for_vq_desc() does that for us if it has to wait.
2e04ef769   Rusty Russell   lguest: fix comme...
706
   */
17cbca2ba   Rusty Russell   Update example la...
707
  static void add_used(struct virtqueue *vq, unsigned int head, int len)
8ca47e006   Rusty Russell   lguest: the docum...
708
  {
17cbca2ba   Rusty Russell   Update example la...
709
  	struct vring_used_elem *used;
2e04ef769   Rusty Russell   lguest: fix comme...
710
711
712
713
  	/*
  	 * The virtqueue contains a ring of used buffers.  Get a pointer to the
  	 * next entry in that used ring.
  	 */
17cbca2ba   Rusty Russell   Update example la...
714
715
716
717
718
719
  	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
  	used->id = head;
  	used->len = len;
  	/* Make sure buffer is written before we update index. */
  	wmb();
  	vq->vring.used->idx++;
95c517c09   Rusty Russell   lguest: avoid sen...
720
  	vq->pending_used++;
8ca47e006   Rusty Russell   lguest: the docum...
721
  }
17cbca2ba   Rusty Russell   Update example la...
722
  /* And here's the combo meal deal.  Supersize me! */
56739c802   Rusty Russell   lguest: cleanup p...
723
  static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
8ca47e006   Rusty Russell   lguest: the docum...
724
  {
17cbca2ba   Rusty Russell   Update example la...
725
  	add_used(vq, head, len);
56739c802   Rusty Russell   lguest: cleanup p...
726
  	trigger_irq(vq);
8ca47e006   Rusty Russell   lguest: the docum...
727
  }
e1e72965e   Rusty Russell   lguest: documenta...
728
729
730
  /*
   * The Console
   *
2e04ef769   Rusty Russell   lguest: fix comme...
731
732
   * We associate some data with the console for our exit hack.
   */
1842f23c0   Rusty Russell   lguest and virtio...
733
  struct console_abort {
dde797899   Rusty Russell   lguest: documenta...
734
  	/* How many times have they hit ^C? */
8ca47e006   Rusty Russell   lguest: the docum...
735
  	int count;
dde797899   Rusty Russell   lguest: documenta...
736
  	/* When did they start? */
8ca47e006   Rusty Russell   lguest: the docum...
737
738
  	struct timeval start;
  };
dde797899   Rusty Russell   lguest: documenta...
739
  /* This is the routine which handles console input (ie. stdin). */
659a0e663   Rusty Russell   lguest: have exam...
740
  static void console_input(struct virtqueue *vq)
8ca47e006   Rusty Russell   lguest: the docum...
741
  {
8ca47e006   Rusty Russell   lguest: the docum...
742
  	int len;
17cbca2ba   Rusty Russell   Update example la...
743
  	unsigned int head, in_num, out_num;
659a0e663   Rusty Russell   lguest: have exam...
744
745
  	struct console_abort *abort = vq->dev->priv;
  	struct iovec iov[vq->vring.num];
56ae43dfe   Rusty Russell   Example launcher ...
746

a91d74a3c   Rusty Russell   lguest: update co...
747
  	/* Make sure there's a descriptor available. */
659a0e663   Rusty Russell   lguest: have exam...
748
  	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
56ae43dfe   Rusty Russell   Example launcher ...
749
  	if (out_num)
17cbca2ba   Rusty Russell   Update example la...
750
  		errx(1, "Output buffers in console in queue?");
8ca47e006   Rusty Russell   lguest: the docum...
751

a91d74a3c   Rusty Russell   lguest: update co...
752
  	/* Read into it.  This is where we usually wait. */
659a0e663   Rusty Russell   lguest: have exam...
753
  	len = readv(STDIN_FILENO, iov, in_num);
8ca47e006   Rusty Russell   lguest: the docum...
754
  	if (len <= 0) {
659a0e663   Rusty Russell   lguest: have exam...
755
  		/* Ran out of input? */
8ca47e006   Rusty Russell   lguest: the docum...
756
  		warnx("Failed to get console input, ignoring console.");
2e04ef769   Rusty Russell   lguest: fix comme...
757
758
759
760
  		/*
  		 * For simplicity, dying threads kill the whole Launcher.  So
  		 * just nap here.
  		 */
659a0e663   Rusty Russell   lguest: have exam...
761
762
  		for (;;)
  			pause();
8ca47e006   Rusty Russell   lguest: the docum...
763
  	}
a91d74a3c   Rusty Russell   lguest: update co...
764
  	/* Tell the Guest we used a buffer. */
659a0e663   Rusty Russell   lguest: have exam...
765
  	add_used_and_trigger(vq, head, len);
8ca47e006   Rusty Russell   lguest: the docum...
766

2e04ef769   Rusty Russell   lguest: fix comme...
767
768
  	/*
  	 * Three ^C within one second?  Exit.
dde797899   Rusty Russell   lguest: documenta...
769
  	 *
659a0e663   Rusty Russell   lguest: have exam...
770
771
772
  	 * This is such a hack, but works surprisingly well.  Each ^C has to
  	 * be in a buffer by itself, so they can't be too fast.  But we check
  	 * that we get three within about a second, so they can't be too
2e04ef769   Rusty Russell   lguest: fix comme...
773
774
  	 * slow.
  	 */
659a0e663   Rusty Russell   lguest: have exam...
775
  	if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
8ca47e006   Rusty Russell   lguest: the docum...
776
  		abort->count = 0;
659a0e663   Rusty Russell   lguest: have exam...
777
778
  		return;
  	}
8ca47e006   Rusty Russell   lguest: the docum...
779

659a0e663   Rusty Russell   lguest: have exam...
780
781
782
783
784
785
786
787
788
789
790
  	abort->count++;
  	if (abort->count == 1)
  		gettimeofday(&abort->start, NULL);
  	else if (abort->count == 3) {
  		struct timeval now;
  		gettimeofday(&now, NULL);
  		/* Kill all Launcher processes with SIGINT, like normal ^C */
  		if (now.tv_sec <= abort->start.tv_sec+1)
  			kill(0, SIGINT);
  		abort->count = 0;
  	}
8ca47e006   Rusty Russell   lguest: the docum...
791
  }
659a0e663   Rusty Russell   lguest: have exam...
792
793
  /* This is the routine which handles console output (ie. stdout). */
  static void console_output(struct virtqueue *vq)
8ca47e006   Rusty Russell   lguest: the docum...
794
  {
17cbca2ba   Rusty Russell   Update example la...
795
  	unsigned int head, out, in;
17cbca2ba   Rusty Russell   Update example la...
796
  	struct iovec iov[vq->vring.num];
a91d74a3c   Rusty Russell   lguest: update co...
797
  	/* We usually wait in here, for the Guest to give us something. */
659a0e663   Rusty Russell   lguest: have exam...
798
799
800
  	head = wait_for_vq_desc(vq, iov, &out, &in);
  	if (in)
  		errx(1, "Input buffers in console output queue?");
a91d74a3c   Rusty Russell   lguest: update co...
801
802
  
  	/* writev can return a partial write, so we loop here. */
659a0e663   Rusty Russell   lguest: have exam...
803
804
  	while (!iov_empty(iov, out)) {
  		int len = writev(STDOUT_FILENO, iov, out);
e0377e252   Sakari Ailus   lguest: Do not ex...
805
806
807
808
  		if (len <= 0) {
  			warn("Write to stdout gave %i (%d)", len, errno);
  			break;
  		}
659a0e663   Rusty Russell   lguest: have exam...
809
  		iov_consume(iov, out, len);
17cbca2ba   Rusty Russell   Update example la...
810
  	}
a91d74a3c   Rusty Russell   lguest: update co...
811
812
813
814
815
  
  	/*
  	 * We're finished with that buffer: if we're going to sleep,
  	 * wait_for_vq_desc() will prod the Guest with an interrupt.
  	 */
38bc2b8c5   Rusty Russell   lguest: implement...
816
  	add_used(vq, head, 0);
a161883a2   Rusty Russell   lguest: Tell Gues...
817
  }
e1e72965e   Rusty Russell   lguest: documenta...
818
819
820
821
  /*
   * The Network
   *
   * Handling output for network is also simple: we get all the output buffers
659a0e663   Rusty Russell   lguest: have exam...
822
   * and write them to /dev/net/tun.
a6bd8e130   Rusty Russell   lguest: comment d...
823
   */
659a0e663   Rusty Russell   lguest: have exam...
824
825
826
827
828
  struct net_info {
  	int tunfd;
  };
  
  static void net_output(struct virtqueue *vq)
8ca47e006   Rusty Russell   lguest: the docum...
829
  {
659a0e663   Rusty Russell   lguest: have exam...
830
831
  	struct net_info *net_info = vq->dev->priv;
  	unsigned int head, out, in;
17cbca2ba   Rusty Russell   Update example la...
832
  	struct iovec iov[vq->vring.num];
a161883a2   Rusty Russell   lguest: Tell Gues...
833

a91d74a3c   Rusty Russell   lguest: update co...
834
  	/* We usually wait in here for the Guest to give us a packet. */
659a0e663   Rusty Russell   lguest: have exam...
835
836
837
  	head = wait_for_vq_desc(vq, iov, &out, &in);
  	if (in)
  		errx(1, "Input buffers in net output queue?");
a91d74a3c   Rusty Russell   lguest: update co...
838
839
840
841
  	/*
  	 * Send the whole thing through to /dev/net/tun.  It expects the exact
  	 * same format: what a coincidence!
  	 */
659a0e663   Rusty Russell   lguest: have exam...
842
  	if (writev(net_info->tunfd, iov, out) < 0)
e0377e252   Sakari Ailus   lguest: Do not ex...
843
  		warnx("Write to tun failed (%d)?", errno);
a91d74a3c   Rusty Russell   lguest: update co...
844
845
846
847
848
  
  	/*
  	 * Done with that one; wait_for_vq_desc() will send the interrupt if
  	 * all packets are processed.
  	 */
38bc2b8c5   Rusty Russell   lguest: implement...
849
  	add_used(vq, head, 0);
8ca47e006   Rusty Russell   lguest: the docum...
850
  }
a91d74a3c   Rusty Russell   lguest: update co...
851
852
853
854
855
856
  /*
   * Handling network input is a bit trickier, because I've tried to optimize it.
   *
   * First we have a helper routine which tells us if reading from this file
   * descriptor
   * (ie. the /dev/net/tun device) will block:
   */
4a8962e21   Rusty Russell   lguest: try to ba...
857
858
859
860
861
862
863
864
  static bool will_block(int fd)
  {
  	/* A zero timeout makes select() poll and return straight away. */
  	struct timeval no_wait = { .tv_sec = 0, .tv_usec = 0 };
  	fd_set readable;
  	int ready;

  	FD_ZERO(&readable);
  	FD_SET(fd, &readable);
  	ready = select(fd + 1, &readable, NULL, NULL, &no_wait);

  	/* Anything but "exactly our one fd is readable" counts as blocking. */
  	return ready != 1;
  }
a91d74a3c   Rusty Russell   lguest: update co...
865
866
867
868
869
  /*
   * This handles packets coming in from the tun device to our Guest.  Like all
   * service routines, it gets called again as soon as it returns, so you don't
   * see a while(1) loop here.
   */
659a0e663   Rusty Russell   lguest: have exam...
870
  static void net_input(struct virtqueue *vq)
8ca47e006   Rusty Russell   lguest: the docum...
871
  {
8ca47e006   Rusty Russell   lguest: the docum...
872
  	int len;
659a0e663   Rusty Russell   lguest: have exam...
873
874
875
  	unsigned int head, out, in;
  	struct iovec iov[vq->vring.num];
  	struct net_info *net_info = vq->dev->priv;
a91d74a3c   Rusty Russell   lguest: update co...
876
877
878
879
  	/*
  	 * Get a descriptor to write an incoming packet into.  This will also
  	 * send an interrupt if they're out of descriptors.
  	 */
659a0e663   Rusty Russell   lguest: have exam...
880
881
882
  	head = wait_for_vq_desc(vq, iov, &out, &in);
  	if (out)
  		errx(1, "Output buffers in net input queue?");
4a8962e21   Rusty Russell   lguest: try to ba...
883

a91d74a3c   Rusty Russell   lguest: update co...
884
885
886
887
  	/*
  	 * If it looks like we'll block reading from the tun device, send them
  	 * an interrupt.
  	 */
4a8962e21   Rusty Russell   lguest: try to ba...
888
889
  	if (vq->pending_used && will_block(net_info->tunfd))
  		trigger_irq(vq);
a91d74a3c   Rusty Russell   lguest: update co...
890
891
892
893
  	/*
  	 * Read in the packet.  This is where we normally wait (when there's no
  	 * incoming network traffic).
  	 */
659a0e663   Rusty Russell   lguest: have exam...
894
  	len = readv(net_info->tunfd, iov, in);
8ca47e006   Rusty Russell   lguest: the docum...
895
  	if (len <= 0)
e0377e252   Sakari Ailus   lguest: Do not ex...
896
  		warn("Failed to read from tun (%d).", errno);
a91d74a3c   Rusty Russell   lguest: update co...
897
898
899
900
901
  
  	/*
  	 * Mark that packet buffer as used, but don't interrupt here.  We want
  	 * to wait until we've done as much work as we can.
  	 */
4a8962e21   Rusty Russell   lguest: try to ba...
902
  	add_used(vq, head, len);
659a0e663   Rusty Russell   lguest: have exam...
903
  }
a91d74a3c   Rusty Russell   lguest: update co...
904
  /*:*/
dde797899   Rusty Russell   lguest: documenta...
905

a91d74a3c   Rusty Russell   lguest: update co...
906
  /* This is the helper to create threads: run the service routine in a loop. */
659a0e663   Rusty Russell   lguest: have exam...
907
908
909
  static int do_thread(void *_vq)
  {
  	struct virtqueue *vq = _vq;
17cbca2ba   Rusty Russell   Update example la...
910

659a0e663   Rusty Russell   lguest: have exam...
911
912
913
914
  	for (;;)
  		vq->service(vq);
  	return 0;
  }
17cbca2ba   Rusty Russell   Update example la...
915

2e04ef769   Rusty Russell   lguest: fix comme...
916
917
918
919
  /*
   * When a child dies, we kill our entire process group with SIGTERM.  This
   * also has the side effect that the shell restores the console for us!
   */
659a0e663   Rusty Russell   lguest: have exam...
920
921
922
  static void kill_launcher(int signum)
  {
  	/* Which signal brought us here makes no difference. */
  	(void)signum;

  	/* Process ID 0 addresses every process in our process group. */
  	kill(0, SIGTERM);
  }
659a0e663   Rusty Russell   lguest: have exam...
924
  static void reset_device(struct device *dev)
56ae43dfe   Rusty Russell   Example launcher ...
925
  {
659a0e663   Rusty Russell   lguest: have exam...
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
  	struct virtqueue *vq;
  
  	verbose("Resetting device %s
  ", dev->name);
  
  	/* Clear any features they've acked. */
  	memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
  
  	/* We're going to be explicitly killing threads, so ignore them. */
  	signal(SIGCHLD, SIG_IGN);
  
  	/* Zero out the virtqueues, get rid of their threads */
  	for (vq = dev->vq; vq; vq = vq->next) {
  		if (vq->thread != (pid_t)-1) {
  			kill(vq->thread, SIGTERM);
  			waitpid(vq->thread, NULL, 0);
  			vq->thread = (pid_t)-1;
  		}
  		memset(vq->vring.desc, 0,
  		       vring_size(vq->config.num, LGUEST_VRING_ALIGN));
  		lg_last_avail(vq) = 0;
  	}
  	dev->running = false;
  
  	/* Now we care if threads die. */
  	signal(SIGCHLD, (void *)kill_launcher);
56ae43dfe   Rusty Russell   Example launcher ...
952
  }
a91d74a3c   Rusty Russell   lguest: update co...
953
954
955
  /*L:216
   * This actually creates the thread which services the virtqueue for a device.
   */
659a0e663   Rusty Russell   lguest: have exam...
956
  static void create_thread(struct virtqueue *vq)
5dae785a8   Rusty Russell   lguest: net block...
957
  {
2e04ef769   Rusty Russell   lguest: fix comme...
958
  	/*
a91d74a3c   Rusty Russell   lguest: update co...
959
960
  	 * Create stack for thread.  Since the stack grows upwards, we point
  	 * the stack pointer to the end of this region.
2e04ef769   Rusty Russell   lguest: fix comme...
961
  	 */
659a0e663   Rusty Russell   lguest: have exam...
962
963
964
965
966
967
968
969
970
  	char *stack = malloc(32768);
  	unsigned long args[] = { LHREQ_EVENTFD,
  				 vq->config.pfn*getpagesize(), 0 };
  
  	/* Create a zero-initialized eventfd. */
  	vq->eventfd = eventfd(0, 0);
  	if (vq->eventfd < 0)
  		err(1, "Creating eventfd");
  	args[2] = vq->eventfd;
a91d74a3c   Rusty Russell   lguest: update co...
971
972
973
974
  	/*
  	 * Attach an eventfd to this virtqueue: it will go off when the Guest
  	 * does an LHCALL_NOTIFY for this vq.
  	 */
659a0e663   Rusty Russell   lguest: have exam...
975
976
  	if (write(lguest_fd, &args, sizeof(args)) != 0)
  		err(1, "Attaching eventfd");
a91d74a3c   Rusty Russell   lguest: update co...
977
978
979
980
  	/*
  	 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
  	 * we get a signal if it dies.
  	 */
659a0e663   Rusty Russell   lguest: have exam...
981
982
983
  	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
  	if (vq->thread == (pid_t)-1)
  		err(1, "Creating clone");
a91d74a3c   Rusty Russell   lguest: update co...
984
985
  
  	/* We close our local copy now the child has it. */
659a0e663   Rusty Russell   lguest: have exam...
986
  	close(vq->eventfd);
5dae785a8   Rusty Russell   lguest: net block...
987
  }
659a0e663   Rusty Russell   lguest: have exam...
988
  static void start_device(struct device *dev)
6e5aa7efb   Rusty Russell   virtio: reset fun...
989
  {
659a0e663   Rusty Russell   lguest: have exam...
990
  	unsigned int i;
6e5aa7efb   Rusty Russell   virtio: reset fun...
991
  	struct virtqueue *vq;
659a0e663   Rusty Russell   lguest: have exam...
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
  	verbose("Device %s OK: offered", dev->name);
  	for (i = 0; i < dev->feature_len; i++)
  		verbose(" %02x", get_feature_bits(dev)[i]);
  	verbose(", accepted");
  	for (i = 0; i < dev->feature_len; i++)
  		verbose(" %02x", get_feature_bits(dev)
  			[dev->feature_len+i]);
  
  	for (vq = dev->vq; vq; vq = vq->next) {
  		if (vq->service)
  			create_thread(vq);
  	}
  	dev->running = true;
  }
  
  static void cleanup_devices(void)
  {
  	struct device *dev;
  
  	for (dev = devices.dev; dev; dev = dev->next)
  		reset_device(dev);
6e5aa7efb   Rusty Russell   virtio: reset fun...
1013

659a0e663   Rusty Russell   lguest: have exam...
1014
1015
1016
1017
  	/* If we saved off the original terminal settings, restore them now. */
  	if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
  		tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
  }
6e5aa7efb   Rusty Russell   virtio: reset fun...
1018

659a0e663   Rusty Russell   lguest: have exam...
1019
1020
1021
1022
1023
1024
1025
  /*
   * When the Guest tells us they updated the status field, we handle it.
   *
   * dev: the device whose descriptor status byte the Guest changed.
   *
   * A zero status resets the device.  VIRTIO_CONFIG_S_FAILED is reported with
   * a warning and resets the device if it was running.  Any other status
   * starts the device; seeing that while it is already running is fatal.
   */
  static void update_device_status(struct device *dev)
  {
  	/* A zero status is a reset, otherwise it's a set of flags. */
  	if (dev->desc->status == 0) {
  		reset_device(dev);
  		return;
  	}

  	if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
  		warnx("Device %s configuration FAILED", dev->name);
  		if (dev->running)
  			reset_device(dev);
  		return;
  	}

  	/*
  	 * This is a Guest mistake, not a failed system call, so use errx():
  	 * err() would tack an unrelated strerror(errno) onto the message.
  	 */
  	if (dev->running)
  		errx(1, "Device %s features finalized twice", dev->name);
  	start_device(dev);
  }
a91d74a3c   Rusty Russell   lguest: update co...
1035
1036
1037
1038
  /*L:215
   * This is the generic routine we call when the Guest uses LHCALL_NOTIFY.  In
   * particular, it's used to notify us of device status changes during boot.
   */
56739c802   Rusty Russell   lguest: cleanup p...
1039
  static void handle_output(unsigned long addr)
8ca47e006   Rusty Russell   lguest: the docum...
1040
1041
  {
  	struct device *i;
17cbca2ba   Rusty Russell   Update example la...
1042

659a0e663   Rusty Russell   lguest: have exam...
1043
  	/* Check each device. */
17cbca2ba   Rusty Russell   Update example la...
1044
  	for (i = devices.dev; i; i = i->next) {
659a0e663   Rusty Russell   lguest: have exam...
1045
  		struct virtqueue *vq;
a91d74a3c   Rusty Russell   lguest: update co...
1046
1047
1048
1049
  		/*
  		 * Notifications to device descriptors mean they updated the
  		 * device status.
  		 */
6e5aa7efb   Rusty Russell   virtio: reset fun...
1050
  		if (from_guest_phys(addr) == i->desc) {
a007a751d   Rusty Russell   lguest: make Laun...
1051
  			update_device_status(i);
6e5aa7efb   Rusty Russell   virtio: reset fun...
1052
1053
  			return;
  		}
3c3ed482d   Rusty Russell   lguest: Simplify ...
1054
  		/* Devices should not be used before features are finalized. */
17cbca2ba   Rusty Russell   Update example la...
1055
  		for (vq = i->vq; vq; vq = vq->next) {
659a0e663   Rusty Russell   lguest: have exam...
1056
  			if (addr != vq->config.pfn*getpagesize())
6e5aa7efb   Rusty Russell   virtio: reset fun...
1057
  				continue;
3c3ed482d   Rusty Russell   lguest: Simplify ...
1058
  			errx(1, "Notification on %s before setup!", i->name);
8ca47e006   Rusty Russell   lguest: the docum...
1059
1060
  		}
  	}
dde797899   Rusty Russell   lguest: documenta...
1061

2e04ef769   Rusty Russell   lguest: fix comme...
1062
1063
1064
1065
1066
  	/*
  	 * Early console write is done using notify on a nul-terminated string
  	 * in Guest memory.  It's also great for hacking debugging messages
  	 * into a Guest.
  	 */
17cbca2ba   Rusty Russell   Update example la...
1067
1068
1069
1070
1071
  	if (addr >= guest_limit)
  		errx(1, "Bad NOTIFY %#lx", addr);
  
  	write(STDOUT_FILENO, from_guest_phys(addr),
  	      strnlen(from_guest_phys(addr), guest_limit - addr));
8ca47e006   Rusty Russell   lguest: the docum...
1072
  }
dde797899   Rusty Russell   lguest: documenta...
1073
1074
1075
1076
1077
  /*L:190
   * Device Setup
   *
   * All devices need a descriptor so the Guest knows it exists, and a "struct
   * device" so the Launcher can keep track of it.  We have common helper
a6bd8e130   Rusty Russell   lguest: comment d...
1078
1079
   * routines to allocate and manage them.
   */
8ca47e006   Rusty Russell   lguest: the docum...
1080

2e04ef769   Rusty Russell   lguest: fix comme...
1081
1082
  /*
   * The layout of the device page is a "struct lguest_device_desc" followed by a
a586d4f60   Rusty Russell   virtio: simplify ...
1083
1084
   * number of virtqueue descriptors, then two sets of feature bits, then an
   * array of configuration bytes.  This routine returns the configuration
2e04ef769   Rusty Russell   lguest: fix comme...
1085
1086
   * pointer.
   */
a586d4f60   Rusty Russell   virtio: simplify ...
1087
1088
1089
  static u8 *device_config(const struct device *dev)
  {
  	return (void *)(dev->desc + 1)
713b15b37   Rusty Russell   lguest: be parano...
1090
1091
  		+ dev->num_vq * sizeof(struct lguest_vqconfig)
  		+ dev->feature_len * 2;
17cbca2ba   Rusty Russell   Update example la...
1092
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1093
1094
  /*
   * This routine allocates a new "struct lguest_device_desc" from descriptor
a586d4f60   Rusty Russell   virtio: simplify ...
1095
   * table page just above the Guest's normal memory.  It returns a pointer to
2e04ef769   Rusty Russell   lguest: fix comme...
1096
1097
   * that descriptor.
   */
a586d4f60   Rusty Russell   virtio: simplify ...
1098
  static struct lguest_device_desc *new_dev_desc(u16 type)
  {
  	/* Template descriptor: only the type field is set explicitly. */
  	struct lguest_device_desc blank = { .type = type };
  	char *page_end = (char *)devices.descpage + getpagesize();
  	char *slot;

  	/*
  	 * The new descriptor goes straight after the previous device's
  	 * config bytes, or at the start of the page for the first device.
  	 */
  	slot = devices.lastdev
  		? (char *)device_config(devices.lastdev)
  			+ devices.lastdev->desc->config_len
  		: (char *)devices.descpage;

  	/* We only have one page for all the descriptors. */
  	if (slot + sizeof(blank) > page_end)
  		errx(1, "Too many devices");

  	/* slot might not be aligned, so we memcpy in. */
  	return memcpy(slot, &blank, sizeof(blank));
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1117
1118
1119
1120
  /*
   * Each device descriptor is followed by the description of its virtqueues.  We
   * specify how many descriptors the virtqueue is to have.
   */
17cbca2ba   Rusty Russell   Update example la...
1121
  static void add_virtqueue(struct device *dev, unsigned int num_descs,
659a0e663   Rusty Russell   lguest: have exam...
1122
  			  void (*service)(struct virtqueue *))
17cbca2ba   Rusty Russell   Update example la...
1123
1124
1125
1126
  {
  	unsigned int pages;
  	struct virtqueue **i, *vq = malloc(sizeof(*vq));
  	void *p;
a6bd8e130   Rusty Russell   lguest: comment d...
1127
  	/* First we need some memory for this virtqueue. */
2966af73e   Rusty Russell   virtio: use LGUES...
1128
  	pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
42b36cc0c   Rusty Russell   virtio: Force use...
1129
  		/ getpagesize();
17cbca2ba   Rusty Russell   Update example la...
1130
  	p = get_pages(pages);
d1c856e0f   Rusty Russell   lguest: Fix unini...
1131
1132
1133
1134
  	/* Initialize the virtqueue */
  	vq->next = NULL;
  	vq->last_avail_idx = 0;
  	vq->dev = dev;
a91d74a3c   Rusty Russell   lguest: update co...
1135
1136
1137
1138
1139
  
  	/*
  	 * This is the routine the service thread will run, and its Process ID
  	 * once it's running.
  	 */
659a0e663   Rusty Russell   lguest: have exam...
1140
1141
  	vq->service = service;
  	vq->thread = (pid_t)-1;
d1c856e0f   Rusty Russell   lguest: Fix unini...
1142

17cbca2ba   Rusty Russell   Update example la...
1143
1144
1145
1146
1147
1148
  	/* Initialize the configuration. */
  	vq->config.num = num_descs;
  	vq->config.irq = devices.next_irq++;
  	vq->config.pfn = to_guest_phys(p) / getpagesize();
  
  	/* Initialize the vring. */
2966af73e   Rusty Russell   virtio: use LGUES...
1149
  	vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
17cbca2ba   Rusty Russell   Update example la...
1150

2e04ef769   Rusty Russell   lguest: fix comme...
1151
1152
  	/*
  	 * Append virtqueue to this device's descriptor.  We use
a586d4f60   Rusty Russell   virtio: simplify ...
1153
1154
  	 * device_config() to get the end of the device's current virtqueues;
  	 * we check that we haven't added any config or feature information
2e04ef769   Rusty Russell   lguest: fix comme...
1155
1156
  	 * yet, otherwise we'd be overwriting them.
  	 */
a586d4f60   Rusty Russell   virtio: simplify ...
1157
1158
  	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
  	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
713b15b37   Rusty Russell   lguest: be parano...
1159
  	dev->num_vq++;
a586d4f60   Rusty Russell   virtio: simplify ...
1160
1161
1162
1163
  	dev->desc->num_vq++;
  
  	verbose("Virtqueue page %#lx
  ", to_guest_phys(p));
17cbca2ba   Rusty Russell   Update example la...
1164

2e04ef769   Rusty Russell   lguest: fix comme...
1165
1166
1167
1168
  	/*
  	 * Add to tail of list, so dev->vq is first vq, dev->vq->next is
  	 * second.
  	 */
17cbca2ba   Rusty Russell   Update example la...
1169
1170
  	for (i = &dev->vq; *i; i = &(*i)->next);
  	*i = vq;
8ca47e006   Rusty Russell   lguest: the docum...
1171
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1172
1173
1174
1175
  /*
   * The first half of the feature bitmask is for us to advertise features.  The
   * second half is for the Guest to accept features.
   */
a586d4f60   Rusty Russell   virtio: simplify ...
1176
1177
  static void add_feature(struct device *dev, unsigned bit)
  {
6e5aa7efb   Rusty Russell   virtio: reset fun...
1178
  	u8 *features = get_feature_bits(dev);
a586d4f60   Rusty Russell   virtio: simplify ...
1179
1180
1181
1182
  
  	/* We can't extend the feature bits once we've added config bytes */
  	if (dev->desc->feature_len <= bit / CHAR_BIT) {
  		assert(dev->desc->config_len == 0);
713b15b37   Rusty Russell   lguest: be parano...
1183
  		dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
a586d4f60   Rusty Russell   virtio: simplify ...
1184
  	}
a586d4f60   Rusty Russell   virtio: simplify ...
1185
1186
  	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1187
1188
  /*
   * This routine sets the configuration fields for an existing device's
a586d4f60   Rusty Russell   virtio: simplify ...
1189
   * descriptor.  It only works for the last device, but that's OK because that's
2e04ef769   Rusty Russell   lguest: fix comme...
1190
1191
   * how we use it.
   */
a586d4f60   Rusty Russell   virtio: simplify ...
1192
1193
1194
1195
1196
1197
1198
1199
1200
  static void set_config(struct device *dev, unsigned len, const void *conf)
  {
  	/* Check we haven't overflowed our single page. */
  	if (device_config(dev) + len > devices.descpage + getpagesize())
  		errx(1, "Too many devices");
  
  	/* Copy in the config information, and store the length. */
  	memcpy(device_config(dev), conf, len);
  	dev->desc->config_len = len;
8ef562d11   Rusty Russell   lguest: fix descr...
1201
1202
1203
  
  	/* Size must fit in config_len field (8 bits)! */
  	assert(dev->desc->config_len == len);
a586d4f60   Rusty Russell   virtio: simplify ...
1204
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1205
1206
  /*
   * This routine does all the creation and setup of a new device, including
a91d74a3c   Rusty Russell   lguest: update co...
1207
1208
   * calling new_dev_desc() to allocate the descriptor and device memory.  We
   * don't actually start the service threads until later.
a6bd8e130   Rusty Russell   lguest: comment d...
1209
   *
2e04ef769   Rusty Russell   lguest: fix comme...
1210
1211
   * See what I mean about userspace being boring?
   */
659a0e663   Rusty Russell   lguest: have exam...
1212
  static struct device *new_device(const char *name, u16 type)
8ca47e006   Rusty Russell   lguest: the docum...
1213
1214
  {
  	struct device *dev = malloc(sizeof(*dev));
dde797899   Rusty Russell   lguest: documenta...
1215
  	/* Now we populate the fields one at a time. */
17cbca2ba   Rusty Russell   Update example la...
1216
  	dev->desc = new_dev_desc(type);
17cbca2ba   Rusty Russell   Update example la...
1217
  	dev->name = name;
d1c856e0f   Rusty Russell   lguest: Fix unini...
1218
  	dev->vq = NULL;
713b15b37   Rusty Russell   lguest: be parano...
1219
1220
  	dev->feature_len = 0;
  	dev->num_vq = 0;
659a0e663   Rusty Russell   lguest: have exam...
1221
  	dev->running = false;
a586d4f60   Rusty Russell   virtio: simplify ...
1222

2e04ef769   Rusty Russell   lguest: fix comme...
1223
1224
  	/*
  	 * Append to device list.  Prepending to a single-linked list is
a586d4f60   Rusty Russell   virtio: simplify ...
1225
1226
  	 * easier, but the user expects the devices to be arranged on the bus
  	 * in command-line order.  The first network device on the command line
2e04ef769   Rusty Russell   lguest: fix comme...
1227
1228
  	 * is eth0, the first block device /dev/vda, etc.
  	 */
a586d4f60   Rusty Russell   virtio: simplify ...
1229
1230
1231
1232
1233
  	if (devices.lastdev)
  		devices.lastdev->next = dev;
  	else
  		devices.dev = dev;
  	devices.lastdev = dev;
8ca47e006   Rusty Russell   lguest: the docum...
1234
1235
  	return dev;
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1236
1237
1238
1239
  /*
   * Our first setup routine is the console.  It's a fairly simple device, but
   * UNIX tty handling makes it uglier than it could be.
   */
17cbca2ba   Rusty Russell   Update example la...
1240
  static void setup_console(void)
8ca47e006   Rusty Russell   lguest: the docum...
1241
1242
  {
  	struct device *dev;
dde797899   Rusty Russell   lguest: documenta...
1243
  	/* If we can save the initial standard input settings... */
8ca47e006   Rusty Russell   lguest: the docum...
1244
1245
  	if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
  		struct termios term = orig_term;
2e04ef769   Rusty Russell   lguest: fix comme...
1246
1247
1248
1249
  		/*
  		 * Then we turn off echo, line buffering and ^C etc: We want a
  		 * raw input stream to the Guest.
  		 */
8ca47e006   Rusty Russell   lguest: the docum...
1250
1251
  		term.c_lflag &= ~(ISIG|ICANON|ECHO);
  		tcsetattr(STDIN_FILENO, TCSANOW, &term);
8ca47e006   Rusty Russell   lguest: the docum...
1252
  	}
659a0e663   Rusty Russell   lguest: have exam...
1253
  	dev = new_device("console", VIRTIO_ID_CONSOLE);
dde797899   Rusty Russell   lguest: documenta...
1254
  	/* We store the console state in dev->priv, and initialize it. */
8ca47e006   Rusty Russell   lguest: the docum...
1255
1256
  	dev->priv = malloc(sizeof(struct console_abort));
  	((struct console_abort *)dev->priv)->count = 0;
8ca47e006   Rusty Russell   lguest: the docum...
1257

2e04ef769   Rusty Russell   lguest: fix comme...
1258
1259
  	/*
  	 * The console needs two virtqueues: the input then the output.  When
56ae43dfe   Rusty Russell   Example launcher ...
1260
1261
  	 * they put something the input queue, we make sure we're listening to
  	 * stdin.  When they put something in the output queue, we write it to
2e04ef769   Rusty Russell   lguest: fix comme...
1262
1263
  	 * stdout.
  	 */
659a0e663   Rusty Russell   lguest: have exam...
1264
1265
  	add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
  	add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
17cbca2ba   Rusty Russell   Update example la...
1266

659a0e663   Rusty Russell   lguest: have exam...
1267
1268
  	verbose("device %u: console
  ", ++devices.device_num);
8ca47e006   Rusty Russell   lguest: the docum...
1269
  }
17cbca2ba   Rusty Russell   Update example la...
1270
  /*:*/
8ca47e006   Rusty Russell   lguest: the docum...
1271

2e04ef769   Rusty Russell   lguest: fix comme...
1272
1273
  /*M:010
   * Inter-guest networking is an interesting area.  Simplest is to have a
17cbca2ba   Rusty Russell   Update example la...
1274
1275
   * --sharenet=<name> option which opens or creates a named pipe.  This can be
   * used to send packets to another guest in a 1:1 manner.
dde797899   Rusty Russell   lguest: documenta...
1276
   *
9f54288de   Rusty Russell   lguest: update co...
1277
   * More sophisticated is to use one of the tools developed for projects like UML
17cbca2ba   Rusty Russell   Update example la...
1278
   * to do networking.
dde797899   Rusty Russell   lguest: documenta...
1279
   *
17cbca2ba   Rusty Russell   Update example la...
1280
1281
1282
1283
1284
1285
1286
   * Faster is to do virtio bonding in kernel.  Doing this 1:1 would be
   * completely generic ("here's my vring, attach to your vring") and would work
   * for any traffic.  Of course, namespace and permissions issues need to be
   * dealt with.  A more sophisticated "multi-channel" virtio_net.c could hide
   * multiple inter-guest channels behind one interface, although it would
   * require some manner of hotplugging new virtio channels.
   *
9f54288de   Rusty Russell   lguest: update co...
1287
   * Finally, we could use a virtio network switch in the kernel, ie. vhost.
2e04ef769   Rusty Russell   lguest: fix comme...
1288
  :*/
8ca47e006   Rusty Russell   lguest: the docum...
1289
1290
1291
  
  /*
   * Parse a dotted-quad IPv4 address ("a.b.c.d") into a host-order 32-bit
   * value with "a" in the most significant byte.  Exits the Launcher with an
   * error if the string does not contain four numbers or if any of them does
   * not fit in one byte.
   */
  static u32 str2ip(const char *ipaddr)
  {
  	unsigned int b[4];
  	unsigned int i;

  	if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
  		errx(1, "Failed to parse IP address '%s'", ipaddr);

  	/*
  	 * %u happily accepts values above 255 (and wraps negative ones), which
  	 * would spill into the neighbouring bytes when shifted together below.
  	 */
  	for (i = 0; i < 4; i++) {
  		if (b[i] > 255)
  			errx(1, "Failed to parse IP address '%s'", ipaddr);
  	}

  	return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
  }
  
  /*
   * Parse a colon-separated hexadecimal MAC address ("aa:bb:cc:dd:ee:ff") into
   * the six bytes of @mac.  Exits the Launcher with an error if fewer than six
   * fields can be read.
   */
  static void str2mac(const char *macaddr, unsigned char mac[6])
  {
  	unsigned int octet[6];
  	int fields, i;

  	/* sscanf needs unsigned int targets for %x, hence the temporary. */
  	fields = sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
  			&octet[0], &octet[1], &octet[2],
  			&octet[3], &octet[4], &octet[5]);
  	if (fields != 6)
  		errx(1, "Failed to parse mac address '%s'", macaddr);

  	for (i = 0; i < 6; i++)
  		mac[i] = octet[i];
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1312
1313
  /*
   * This code is "adapted" from libbridge: it attaches the Host end of the
dde797899   Rusty Russell   lguest: documenta...
1314
1315
1316
   * network device to the bridge device specified by the command line.
   *
   * This is yet another James Morris contribution (I'm an IP-level guy, so I
2e04ef769   Rusty Russell   lguest: fix comme...
1317
1318
   * dislike bridging), and I just try not to break it.
   */
8ca47e006   Rusty Russell   lguest: the docum...
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
  static void add_to_bridge(int fd, const char *if_name, const char *br_name)
  {
  	int ifidx;
  	struct ifreq ifr;
  
  	if (!*br_name)
  		errx(1, "must specify bridge name");
  
  	ifidx = if_nametoindex(if_name);
  	if (!ifidx)
  		errx(1, "interface %s does not exist!", if_name);
  
  	strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
dec6a2be0   Mark McLoughlin   lguest: Support a...
1332
  	ifr.ifr_name[IFNAMSIZ-1] = '\0';
8ca47e006   Rusty Russell   lguest: the docum...
1333
1334
1335
1336
  	ifr.ifr_ifindex = ifidx;
  	if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
  		err(1, "can't add %s to bridge %s", if_name, br_name);
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1337
1338
  /*
   * This sets up the Host end of the network device with an IP address, brings
dde797899   Rusty Russell   lguest: documenta...
1339
   * it up so packets will flow, the copies the MAC address into the hwaddr
2e04ef769   Rusty Russell   lguest: fix comme...
1340
1341
   * pointer.
   */
dec6a2be0   Mark McLoughlin   lguest: Support a...
1342
  static void configure_device(int fd, const char *tapif, u32 ipaddr)
8ca47e006   Rusty Russell   lguest: the docum...
1343
1344
  {
  	struct ifreq ifr;
f846619e7   Rusty Russell   lguest: clean up ...
1345
  	struct sockaddr_in sin;
8ca47e006   Rusty Russell   lguest: the docum...
1346
1347
  
  	memset(&ifr, 0, sizeof(ifr));
dec6a2be0   Mark McLoughlin   lguest: Support a...
1348
1349
1350
  	strcpy(ifr.ifr_name, tapif);
  
  	/* Don't read these incantations.  Just cut & paste them like I did! */
f846619e7   Rusty Russell   lguest: clean up ...
1351
1352
1353
  	sin.sin_family = AF_INET;
  	sin.sin_addr.s_addr = htonl(ipaddr);
  	memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
8ca47e006   Rusty Russell   lguest: the docum...
1354
  	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
dec6a2be0   Mark McLoughlin   lguest: Support a...
1355
  		err(1, "Setting %s interface address", tapif);
8ca47e006   Rusty Russell   lguest: the docum...
1356
1357
  	ifr.ifr_flags = IFF_UP;
  	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
dec6a2be0   Mark McLoughlin   lguest: Support a...
1358
1359
  		err(1, "Bringing interface %s up", tapif);
  }
dec6a2be0   Mark McLoughlin   lguest: Support a...
1360
  static int get_tun_device(char tapif[IFNAMSIZ])
8ca47e006   Rusty Russell   lguest: the docum...
1361
  {
8ca47e006   Rusty Russell   lguest: the docum...
1362
  	struct ifreq ifr;
dec6a2be0   Mark McLoughlin   lguest: Support a...
1363
1364
1365
1366
  	int netfd;
  
  	/* Start with this zeroed.  Messy but sure. */
  	memset(&ifr, 0, sizeof(ifr));
8ca47e006   Rusty Russell   lguest: the docum...
1367

2e04ef769   Rusty Russell   lguest: fix comme...
1368
1369
  	/*
  	 * We open the /dev/net/tun device and tell it we want a tap device.  A
dde797899   Rusty Russell   lguest: documenta...
1370
1371
  	 * tap device is like a tun device, only somehow different.  To tell
  	 * the truth, I completely blundered my way through this code, but it
2e04ef769   Rusty Russell   lguest: fix comme...
1372
1373
  	 * works now!
  	 */
8ca47e006   Rusty Russell   lguest: the docum...
1374
  	netfd = open_or_die("/dev/net/tun", O_RDWR);
398f187d7   Rusty Russell   lguest: Use GSO/I...
1375
  	ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
8ca47e006   Rusty Russell   lguest: the docum...
1376
1377
1378
  	strcpy(ifr.ifr_name, "tap%d");
  	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
  		err(1, "configuring /dev/net/tun");
dec6a2be0   Mark McLoughlin   lguest: Support a...
1379

398f187d7   Rusty Russell   lguest: Use GSO/I...
1380
1381
1382
  	if (ioctl(netfd, TUNSETOFFLOAD,
  		  TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
  		err(1, "Could not set features for tun device");
2e04ef769   Rusty Russell   lguest: fix comme...
1383
1384
1385
1386
  	/*
  	 * We don't need checksums calculated for packets coming in this
  	 * device: trust us!
  	 */
8ca47e006   Rusty Russell   lguest: the docum...
1387
  	ioctl(netfd, TUNSETNOCSUM, 1);
dec6a2be0   Mark McLoughlin   lguest: Support a...
1388
1389
1390
  	memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
  	return netfd;
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1391
1392
  /*L:195
   * Our network is a Host<->Guest network.  This can either use bridging or
dec6a2be0   Mark McLoughlin   lguest: Support a...
1393
1394
   * routing, but the principle is the same: it uses the "tun" device to inject
   * packets into the Host as if they came in from a normal network card.  We
2e04ef769   Rusty Russell   lguest: fix comme...
1395
1396
   * just shunt packets between the Guest and the tun device.
   */
dec6a2be0   Mark McLoughlin   lguest: Support a...
1397
1398
1399
  static void setup_tun_net(char *arg)
  {
  	struct device *dev;
659a0e663   Rusty Russell   lguest: have exam...
1400
1401
  	struct net_info *net_info = malloc(sizeof(*net_info));
  	int ipfd;
dec6a2be0   Mark McLoughlin   lguest: Support a...
1402
1403
1404
1405
  	u32 ip = INADDR_ANY;
  	bool bridging = false;
  	char tapif[IFNAMSIZ], *p;
  	struct virtio_net_config conf;
659a0e663   Rusty Russell   lguest: have exam...
1406
  	net_info->tunfd = get_tun_device(tapif);
dec6a2be0   Mark McLoughlin   lguest: Support a...
1407

17cbca2ba   Rusty Russell   Update example la...
1408
  	/* First we create a new network device. */
659a0e663   Rusty Russell   lguest: have exam...
1409
1410
  	dev = new_device("net", VIRTIO_ID_NET);
  	dev->priv = net_info;
dde797899   Rusty Russell   lguest: documenta...
1411

2e04ef769   Rusty Russell   lguest: fix comme...
1412
  	/* Network devices need a recv and a send queue, just like console. */
659a0e663   Rusty Russell   lguest: have exam...
1413
1414
  	add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
  	add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
8ca47e006   Rusty Russell   lguest: the docum...
1415

2e04ef769   Rusty Russell   lguest: fix comme...
1416
1417
1418
1419
  	/*
  	 * We need a socket to perform the magic network ioctls to bring up the
  	 * tap interface, connect to the bridge etc.  Any socket will do!
  	 */
8ca47e006   Rusty Russell   lguest: the docum...
1420
1421
1422
  	ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
  	if (ipfd < 0)
  		err(1, "opening IP socket");
dde797899   Rusty Russell   lguest: documenta...
1423
  	/* If the command line was --tunnet=bridge:<name> do bridging. */
8ca47e006   Rusty Russell   lguest: the docum...
1424
  	if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
dec6a2be0   Mark McLoughlin   lguest: Support a...
1425
1426
1427
1428
1429
1430
1431
1432
  		arg += strlen(BRIDGE_PFX);
  		bridging = true;
  	}
  
  	/* A mac address may follow the bridge name or IP address */
  	p = strchr(arg, ':');
  	if (p) {
  		str2mac(p+1, conf.mac);
40c42076e   Rusty Russell   lguest: don't set...
1433
  		add_feature(dev, VIRTIO_NET_F_MAC);
dec6a2be0   Mark McLoughlin   lguest: Support a...
1434
  		*p = '\0';
dec6a2be0   Mark McLoughlin   lguest: Support a...
1435
1436
1437
1438
1439
1440
  	}
  
  	/* arg is now either an IP address or a bridge name */
  	if (bridging)
  		add_to_bridge(ipfd, tapif, arg);
  	else
8ca47e006   Rusty Russell   lguest: the docum...
1441
  		ip = str2ip(arg);
dec6a2be0   Mark McLoughlin   lguest: Support a...
1442
1443
  	/* Set up the tun device. */
  	configure_device(ipfd, tapif, ip);
8ca47e006   Rusty Russell   lguest: the docum...
1444

398f187d7   Rusty Russell   lguest: Use GSO/I...
1445
1446
1447
  	/* Expect Guest to handle everything except UFO */
  	add_feature(dev, VIRTIO_NET_F_CSUM);
  	add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
398f187d7   Rusty Russell   lguest: Use GSO/I...
1448
1449
1450
1451
1452
1453
  	add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
  	add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
  	add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
  	add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
  	add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
  	add_feature(dev, VIRTIO_NET_F_HOST_ECN);
d1f0132e7   Mark McLoughlin   lguest: add suppo...
1454
1455
  	/* We handle indirect ring entries */
  	add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
a586d4f60   Rusty Russell   virtio: simplify ...
1456
  	set_config(dev, sizeof(conf), &conf);
8ca47e006   Rusty Russell   lguest: the docum...
1457

a586d4f60   Rusty Russell   virtio: simplify ...
1458
  	/* We don't need the socket any more; setup is done. */
8ca47e006   Rusty Russell   lguest: the docum...
1459
  	close(ipfd);
dec6a2be0   Mark McLoughlin   lguest: Support a...
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
  	devices.device_num++;
  
  	if (bridging)
  		verbose("device %u: tun %s attached to bridge: %s
  ",
  			devices.device_num, tapif, arg);
  	else
  		verbose("device %u: tun %s: %s
  ",
  			devices.device_num, tapif, arg);
8ca47e006   Rusty Russell   lguest: the docum...
1470
  }
a91d74a3c   Rusty Russell   lguest: update co...
1471
  /*:*/
17cbca2ba   Rusty Russell   Update example la...
1472

e1e72965e   Rusty Russell   lguest: documenta...
1473
  /* This hangs off device->priv. */
1842f23c0   Rusty Russell   lguest and virtio...
1474
  struct vblk_info {
17cbca2ba   Rusty Russell   Update example la...
1475
1476
1477
1478
1479
  	/* The size of the file. */
  	off64_t len;
  
  	/* The file descriptor for the file. */
  	int fd;
17cbca2ba   Rusty Russell   Update example la...
1480
  };
e1e72965e   Rusty Russell   lguest: documenta...
1481
1482
1483
  /*L:210
   * The Disk
   *
a91d74a3c   Rusty Russell   lguest: update co...
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
   * The disk only has one virtqueue, so it only has one thread.  It is really
   * simple: the Guest asks for a block number and we read or write that position
   * in the file.
   *
   * Before we serviced each virtqueue in a separate thread, that was unacceptably
   * slow: the Guest waits until the read is finished before running anything
   * else, even if it could have been doing useful work.
   *
   * We could have used async I/O, except it's reputed to suck so hard that
   * characters actually go missing from your code when you try to use it.
e1e72965e   Rusty Russell   lguest: documenta...
1494
   */
659a0e663   Rusty Russell   lguest: have exam...
1495
  static void blk_request(struct virtqueue *vq)
17cbca2ba   Rusty Russell   Update example la...
1496
  {
659a0e663   Rusty Russell   lguest: have exam...
1497
  	struct vblk_info *vblk = vq->dev->priv;
17cbca2ba   Rusty Russell   Update example la...
1498
1499
  	unsigned int head, out_num, in_num, wlen;
  	int ret;
cb38fa23c   Rusty Russell   virtio: de-struct...
1500
  	u8 *in;
17cbca2ba   Rusty Russell   Update example la...
1501
  	struct virtio_blk_outhdr *out;
659a0e663   Rusty Russell   lguest: have exam...
1502
  	struct iovec iov[vq->vring.num];
17cbca2ba   Rusty Russell   Update example la...
1503
  	off64_t off;
a91d74a3c   Rusty Russell   lguest: update co...
1504
1505
1506
1507
  	/*
  	 * Get the next request, where we normally wait.  It triggers the
  	 * interrupt to acknowledge previously serviced requests (if any).
  	 */
659a0e663   Rusty Russell   lguest: have exam...
1508
  	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
17cbca2ba   Rusty Russell   Update example la...
1509

2e04ef769   Rusty Russell   lguest: fix comme...
1510
1511
  	/*
  	 * Every block request should contain at least one output buffer
e1e72965e   Rusty Russell   lguest: documenta...
1512
  	 * (detailing the location on disk and the type of request) and one
2e04ef769   Rusty Russell   lguest: fix comme...
1513
1514
  	 * input buffer (to hold the result).
  	 */
17cbca2ba   Rusty Russell   Update example la...
1515
1516
1517
1518
1519
  	if (out_num == 0 || in_num == 0)
  		errx(1, "Bad virtblk cmd %u out=%u in=%u",
  		     head, out_num, in_num);
  
  	out = convert(&iov[0], struct virtio_blk_outhdr);
cb38fa23c   Rusty Russell   virtio: de-struct...
1520
  	in = convert(&iov[out_num+in_num-1], u8);
a91d74a3c   Rusty Russell   lguest: update co...
1521
1522
1523
1524
  	/*
  	 * For historical reasons, block operations are expressed in 512 byte
  	 * "sectors".
  	 */
17cbca2ba   Rusty Russell   Update example la...
1525
  	off = out->sector * 512;
2e04ef769   Rusty Russell   lguest: fix comme...
1526
  	/*
2e04ef769   Rusty Russell   lguest: fix comme...
1527
1528
1529
  	 * In general the virtio block driver is allowed to try SCSI commands.
  	 * It'd be nice if we supported eject, for example, but we don't.
  	 */
17cbca2ba   Rusty Russell   Update example la...
1530
1531
1532
  	if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
  		fprintf(stderr, "Scsi commands unsupported
  ");
cb38fa23c   Rusty Russell   virtio: de-struct...
1533
  		*in = VIRTIO_BLK_S_UNSUPP;
1200e646a   Anthony Liguori   lguest: Fix lgues...
1534
  		wlen = sizeof(*in);
17cbca2ba   Rusty Russell   Update example la...
1535
  	} else if (out->type & VIRTIO_BLK_T_OUT) {
2e04ef769   Rusty Russell   lguest: fix comme...
1536
1537
1538
1539
1540
1541
  		/*
  		 * Write
  		 *
  		 * Move to the right location in the block file.  This can fail
  		 * if they try to write past end.
  		 */
17cbca2ba   Rusty Russell   Update example la...
1542
1543
1544
1545
1546
1547
  		if (lseek64(vblk->fd, off, SEEK_SET) != off)
  			err(1, "Bad seek to sector %llu", out->sector);
  
  		ret = writev(vblk->fd, iov+1, out_num-1);
  		verbose("WRITE to sector %llu: %i
  ", out->sector, ret);
2e04ef769   Rusty Russell   lguest: fix comme...
1548
1549
  		/*
  		 * Grr... Now we know how long the descriptor they sent was, we
17cbca2ba   Rusty Russell   Update example la...
1550
  		 * make sure they didn't try to write over the end of the block
2e04ef769   Rusty Russell   lguest: fix comme...
1551
1552
  		 * file (possibly extending it).
  		 */
17cbca2ba   Rusty Russell   Update example la...
1553
1554
1555
1556
1557
1558
  		if (ret > 0 && off + ret > vblk->len) {
  			/* Trim it back to the correct length */
  			ftruncate64(vblk->fd, vblk->len);
  			/* Die, bad Guest, die. */
  			errx(1, "Write past end %llu+%u", off, ret);
  		}
7bc9fddab   Tejun Heo   lguest: replace V...
1559
1560
1561
1562
1563
1564
1565
1566
  
  		wlen = sizeof(*in);
  		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
  	} else if (out->type & VIRTIO_BLK_T_FLUSH) {
  		/* Flush */
  		ret = fdatasync(vblk->fd);
  		verbose("FLUSH fdatasync: %i
  ", ret);
1200e646a   Anthony Liguori   lguest: Fix lgues...
1567
  		wlen = sizeof(*in);
cb38fa23c   Rusty Russell   virtio: de-struct...
1568
  		*in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
17cbca2ba   Rusty Russell   Update example la...
1569
  	} else {
2e04ef769   Rusty Russell   lguest: fix comme...
1570
1571
1572
1573
1574
1575
  		/*
  		 * Read
  		 *
  		 * Move to the right location in the block file.  This can fail
  		 * if they try to read past end.
  		 */
17cbca2ba   Rusty Russell   Update example la...
1576
1577
1578
1579
1580
1581
1582
  		if (lseek64(vblk->fd, off, SEEK_SET) != off)
  			err(1, "Bad seek to sector %llu", out->sector);
  
  		ret = readv(vblk->fd, iov+1, in_num-1);
  		verbose("READ from sector %llu: %i
  ", out->sector, ret);
  		if (ret >= 0) {
1200e646a   Anthony Liguori   lguest: Fix lgues...
1583
  			wlen = sizeof(*in) + ret;
cb38fa23c   Rusty Russell   virtio: de-struct...
1584
  			*in = VIRTIO_BLK_S_OK;
17cbca2ba   Rusty Russell   Update example la...
1585
  		} else {
1200e646a   Anthony Liguori   lguest: Fix lgues...
1586
  			wlen = sizeof(*in);
cb38fa23c   Rusty Russell   virtio: de-struct...
1587
  			*in = VIRTIO_BLK_S_IOERR;
17cbca2ba   Rusty Russell   Update example la...
1588
1589
  		}
  	}
a91d74a3c   Rusty Russell   lguest: update co...
1590
  	/* Finished that request. */
38bc2b8c5   Rusty Russell   lguest: implement...
1591
  	add_used(vq, head, wlen);
17cbca2ba   Rusty Russell   Update example la...
1592
  }
e1e72965e   Rusty Russell   lguest: documenta...
1593
  /*L:198 This actually sets up a virtual block device. */
17cbca2ba   Rusty Russell   Update example la...
1594
1595
  static void setup_block_file(const char *filename)
  {
17cbca2ba   Rusty Russell   Update example la...
1596
1597
  	struct device *dev;
  	struct vblk_info *vblk;
a586d4f60   Rusty Russell   virtio: simplify ...
1598
  	struct virtio_blk_config conf;
17cbca2ba   Rusty Russell   Update example la...
1599

2e04ef769   Rusty Russell   lguest: fix comme...
1600
  	/* Creat the device. */
659a0e663   Rusty Russell   lguest: have exam...
1601
  	dev = new_device("block", VIRTIO_ID_BLOCK);
17cbca2ba   Rusty Russell   Update example la...
1602

e1e72965e   Rusty Russell   lguest: documenta...
1603
  	/* The device has one virtqueue, where the Guest places requests. */
659a0e663   Rusty Russell   lguest: have exam...
1604
  	add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
17cbca2ba   Rusty Russell   Update example la...
1605
1606
1607
1608
1609
1610
1611
  
  	/* Allocate the room for our own bookkeeping */
  	vblk = dev->priv = malloc(sizeof(*vblk));
  
  	/* First we open the file and store the length. */
  	vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
  	vblk->len = lseek64(vblk->fd, 0, SEEK_END);
7bc9fddab   Tejun Heo   lguest: replace V...
1612
1613
  	/* We support FLUSH. */
  	add_feature(dev, VIRTIO_BLK_F_FLUSH);
a586d4f60   Rusty Russell   virtio: simplify ...
1614

17cbca2ba   Rusty Russell   Update example la...
1615
  	/* Tell Guest how many sectors this device has. */
a586d4f60   Rusty Russell   virtio: simplify ...
1616
  	conf.capacity = cpu_to_le64(vblk->len / 512);
17cbca2ba   Rusty Russell   Update example la...
1617

2e04ef769   Rusty Russell   lguest: fix comme...
1618
1619
1620
1621
  	/*
  	 * Tell Guest not to put in too many descriptors at once: two are used
  	 * for the in and out elements.
  	 */
a586d4f60   Rusty Russell   virtio: simplify ...
1622
1623
  	add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
  	conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
8ef562d11   Rusty Russell   lguest: fix descr...
1624
1625
  	/* Don't try to put whole struct: we have 8 bit limit. */
  	set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
17cbca2ba   Rusty Russell   Update example la...
1626

17cbca2ba   Rusty Russell   Update example la...
1627
1628
  	verbose("device %u: virtblock %llu sectors
  ",
659a0e663   Rusty Russell   lguest: have exam...
1629
  		++devices.device_num, le64_to_cpu(conf.capacity));
17cbca2ba   Rusty Russell   Update example la...
1630
  }
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1631

2e04ef769   Rusty Russell   lguest: fix comme...
1632
1633
  /*L:211
   * Our random number generator device reads from /dev/random into the Guest's
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1634
1635
1636
1637
   * input buffers.  The usual case is that the Guest doesn't want random numbers
   * and so has no buffers although /dev/random is still readable, whereas
   * console is the reverse.
   *
2e04ef769   Rusty Russell   lguest: fix comme...
1638
1639
1640
1641
1642
   * The same logic applies, however.
   */
  struct rng_info {
  	/*
  	 * Descriptor for /dev/random: setup_rng() opens it and rng_input()
  	 * readv()s from it into the Guest's buffers.
  	 */
  	int rfd;
  };
659a0e663   Rusty Russell   lguest: have exam...
1643
  static void rng_input(struct virtqueue *vq)
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1644
1645
1646
  {
  	int len;
  	unsigned int head, in_num, out_num, totlen = 0;
659a0e663   Rusty Russell   lguest: have exam...
1647
1648
  	struct rng_info *rng_info = vq->dev->priv;
  	struct iovec iov[vq->vring.num];
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1649
1650
  
  	/* First we need a buffer from the Guests's virtqueue. */
659a0e663   Rusty Russell   lguest: have exam...
1651
  	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1652
1653
  	if (out_num)
  		errx(1, "Output buffers in rng?");
2e04ef769   Rusty Russell   lguest: fix comme...
1654
  	/*
a91d74a3c   Rusty Russell   lguest: update co...
1655
1656
  	 * Just like the console write, we loop to cover the whole iovec.
  	 * In this case, short reads actually happen quite a bit.
2e04ef769   Rusty Russell   lguest: fix comme...
1657
  	 */
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1658
  	while (!iov_empty(iov, in_num)) {
659a0e663   Rusty Russell   lguest: have exam...
1659
  		len = readv(rng_info->rfd, iov, in_num);
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1660
1661
1662
1663
1664
1665
1666
  		if (len <= 0)
  			err(1, "Read from /dev/random gave %i", len);
  		iov_consume(iov, in_num, len);
  		totlen += len;
  	}
  
  	/* Tell the Guest about the new input. */
38bc2b8c5   Rusty Russell   lguest: implement...
1667
  	add_used(vq, head, totlen);
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1668
  }
2e04ef769   Rusty Russell   lguest: fix comme...
1669
1670
1671
  /*L:199
   * This creates a "hardware" random number device for the Guest.
   */
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1672
1673
1674
  static void setup_rng(void)
  {
  	struct device *dev;
659a0e663   Rusty Russell   lguest: have exam...
1675
  	struct rng_info *rng_info = malloc(sizeof(*rng_info));
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1676

2e04ef769   Rusty Russell   lguest: fix comme...
1677
  	/* Our device's privat info simply contains the /dev/random fd. */
659a0e663   Rusty Russell   lguest: have exam...
1678
  	rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1679

2e04ef769   Rusty Russell   lguest: fix comme...
1680
  	/* Create the new device. */
659a0e663   Rusty Russell   lguest: have exam...
1681
1682
  	dev = new_device("rng", VIRTIO_ID_RNG);
  	dev->priv = rng_info;
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1683
1684
  
  	/* The device has one virtqueue, where the Guest places inbufs. */
659a0e663   Rusty Russell   lguest: have exam...
1685
  	add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1686
1687
1688
1689
  
  	verbose("device %u: rng
  ", devices.device_num++);
  }
a6bd8e130   Rusty Russell   lguest: comment d...
1690
  /* That's the end of device setup. */
ec04b13f6   Balaji Rao   lguest: Reboot su...
1691

a6bd8e130   Rusty Russell   lguest: comment d...
1692
  /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
ec04b13f6   Balaji Rao   lguest: Reboot su...
1693
1694
1695
  static void __attribute__((noreturn)) restart_guest(void)
  {
  	unsigned int fd = 3;
  
  	/*
  	 * Open descriptors are not tracked individually, so close every one
  	 * above stderr, up to FD_SETSIZE.
  	 */
  	while (fd < FD_SETSIZE) {
  		close(fd);
  		fd++;
  	}
  
  	/* Reset all the devices (kills all threads). */
  	cleanup_devices();
  
  	/* Re-run ourselves with the saved arguments; execv() only returns on failure. */
  	execv(main_args[0], main_args);
  	err(1, "Could not exec %s", main_args[0]);
  }
8ca47e006   Rusty Russell   lguest: the docum...
1708

2e04ef769   Rusty Russell   lguest: fix comme...
1709
1710
1711
1712
  /*L:220
   * Finally we reach the core of the Launcher which runs the Guest, serves
   * its input and output, and finally, lays it to rest.
   */
56739c802   Rusty Russell   lguest: cleanup p...
1713
  static void __attribute__((noreturn)) run_guest(void)
8ca47e006   Rusty Russell   lguest: the docum...
1714
1715
  {
  	for (;;) {
17cbca2ba   Rusty Russell   Update example la...
1716
  		unsigned long notify_addr;
8ca47e006   Rusty Russell   lguest: the docum...
1717
1718
1719
  		int readval;
  
  		/* We read from the /dev/lguest device to run the Guest. */
e3283fa0c   Glauber de Oliveira Costa   lguest: adapt lau...
1720
1721
  		readval = pread(lguest_fd, &notify_addr,
  				sizeof(notify_addr), cpu_id);
8ca47e006   Rusty Russell   lguest: the docum...
1722

17cbca2ba   Rusty Russell   Update example la...
1723
1724
1725
1726
  		/* One unsigned long means the Guest did HCALL_NOTIFY */
  		if (readval == sizeof(notify_addr)) {
  			verbose("Notify on address %#lx
  ", notify_addr);
56739c802   Rusty Russell   lguest: cleanup p...
1727
  			handle_output(notify_addr);
dde797899   Rusty Russell   lguest: documenta...
1728
  		/* ENOENT means the Guest died.  Reading tells us why. */
8ca47e006   Rusty Russell   lguest: the docum...
1729
1730
  		} else if (errno == ENOENT) {
  			char reason[1024] = { 0 };
e3283fa0c   Glauber de Oliveira Costa   lguest: adapt lau...
1731
  			pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
8ca47e006   Rusty Russell   lguest: the docum...
1732
  			errx(1, "%s", reason);
ec04b13f6   Balaji Rao   lguest: Reboot su...
1733
1734
1735
  		/* ERESTART means that we need to reboot the guest */
  		} else if (errno == ERESTART) {
  			restart_guest();
659a0e663   Rusty Russell   lguest: have exam...
1736
1737
  		/* Anything else means a bug or incompatible change. */
  		} else
8ca47e006   Rusty Russell   lguest: the docum...
1738
  			err(1, "Running guest failed");
8ca47e006   Rusty Russell   lguest: the docum...
1739
1740
  	}
  }
a6bd8e130   Rusty Russell   lguest: comment d...
1741
  /*L:240
e1e72965e   Rusty Russell   lguest: documenta...
1742
1743
1744
   * This is the end of the Launcher.  The good news: we are over halfway
   * through!  The bad news: the most fiendish part of the code still lies ahead
   * of us.
dde797899   Rusty Russell   lguest: documenta...
1745
   *
e1e72965e   Rusty Russell   lguest: documenta...
1746
1747
   * Are you ready?  Take a deep breath and join me in the core of the Host, in
   * "make Host".
2e04ef769   Rusty Russell   lguest: fix comme...
1748
  :*/
8ca47e006   Rusty Russell   lguest: the docum...
1749
1750
1751
  
  /* Long options accepted by the Launcher; the last field is the code getopt_long() returns. */
  static struct option opts[] = {
  	{ "verbose",  no_argument,       NULL, 'v' },
  	{ "tunnet",   required_argument, NULL, 't' },
  	{ "block",    required_argument, NULL, 'b' },
  	{ "rng",      no_argument,       NULL, 'r' },
  	{ "initrd",   required_argument, NULL, 'i' },
  	{ "username", required_argument, NULL, 'u' },
  	{ "chroot",   required_argument, NULL, 'c' },
  	/* All-zero entry terminates the table. */
  	{ NULL, 0, NULL, 0 },
  };
  /*
   * Print the command-line synopsis and exit with status 1 (errx() does not
   * return).  The option list mirrors the opts[] table, including --rng,
   * --username and --chroot.
   */
  static void usage(void)
  {
  	errx(1, "Usage: lguest [--verbose] "
  	     "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
  	     "|--block=<filename>|--initrd=<filename>|--rng\n"
  	     "|--username=<username>|--chroot=<path>]...\n"
  	     "<mem-in-mb> vmlinux [args...]");
  }
3c6b5bfa3   Rusty Russell   Introduce guest m...
1769
  /*L:105 The main routine is where the real work begins: */
8ca47e006   Rusty Russell   lguest: the docum...
1770
1771
  int main(int argc, char *argv[])
  {
2e04ef769   Rusty Russell   lguest: fix comme...
1772
  	/* Memory, code startpoint and size of the (optional) initrd. */
58a245664   Matias Zabaljauregui   lguest: move the ...
1773
  	unsigned long mem = 0, start, initrd_size = 0;
56739c802   Rusty Russell   lguest: cleanup p...
1774
1775
  	/* Two temporaries. */
  	int i, c;
3c6b5bfa3   Rusty Russell   Introduce guest m...
1776
  	/* The boot information for the Guest. */
43d33b21a   Rusty Russell   Use "struct boot_...
1777
  	struct boot_params *boot;
dde797899   Rusty Russell   lguest: documenta...
1778
  	/* If they specify an initrd file to load. */
8ca47e006   Rusty Russell   lguest: the docum...
1779
  	const char *initrd_name = NULL;
8aeb36e8f   Philip Sanderson   lguest: --usernam...
1780
1781
1782
1783
1784
  	/* Password structure for initgroups/setres[gu]id */
  	struct passwd *user_details = NULL;
  
  	/* Directory to chroot to */
  	char *chroot_path = NULL;
ec04b13f6   Balaji Rao   lguest: Reboot su...
1785
1786
  	/* Save the args: we "reboot" by execing ourselves again. */
  	main_args = argv;
ec04b13f6   Balaji Rao   lguest: Reboot su...
1787

2e04ef769   Rusty Russell   lguest: fix comme...
1788
1789
  	/*
  	 * First we initialize the device list.  We keep a pointer to the last
659a0e663   Rusty Russell   lguest: have exam...
1790
  	 * device, and the next interrupt number to use for devices (1:
2e04ef769   Rusty Russell   lguest: fix comme...
1791
1792
  	 * remember that 0 is used by the timer).
  	 */
a586d4f60   Rusty Russell   virtio: simplify ...
1793
  	devices.lastdev = NULL;
17cbca2ba   Rusty Russell   Update example la...
1794
  	devices.next_irq = 1;
8ca47e006   Rusty Russell   lguest: the docum...
1795

a91d74a3c   Rusty Russell   lguest: update co...
1796
  	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
e3283fa0c   Glauber de Oliveira Costa   lguest: adapt lau...
1797
  	cpu_id = 0;
a91d74a3c   Rusty Russell   lguest: update co...
1798

2e04ef769   Rusty Russell   lguest: fix comme...
1799
1800
  	/*
  	 * We need to know how much memory so we can set up the device
dde797899   Rusty Russell   lguest: documenta...
1801
1802
  	 * descriptor and memory pages for the devices as we parse the command
  	 * line.  So we quickly look through the arguments to find the amount
2e04ef769   Rusty Russell   lguest: fix comme...
1803
1804
  	 * of memory now.
  	 */
6570c4599   Rusty Russell   link lguest examp...
1805
1806
  	for (i = 1; i < argc; i++) {
  		if (argv[i][0] != '-') {
3c6b5bfa3   Rusty Russell   Introduce guest m...
1807
  			mem = atoi(argv[i]) * 1024 * 1024;
2e04ef769   Rusty Russell   lguest: fix comme...
1808
1809
  			/*
  			 * We start by mapping anonymous pages over all of
3c6b5bfa3   Rusty Russell   Introduce guest m...
1810
1811
  			 * guest-physical memory range.  This fills it with 0,
  			 * and ensures that the Guest won't be killed when it
2e04ef769   Rusty Russell   lguest: fix comme...
1812
1813
  			 * tries to access it.
  			 */
3c6b5bfa3   Rusty Russell   Introduce guest m...
1814
1815
1816
1817
  			guest_base = map_zeroed_pages(mem / getpagesize()
  						      + DEVICE_PAGES);
  			guest_limit = mem;
  			guest_max = mem + DEVICE_PAGES*getpagesize();
17cbca2ba   Rusty Russell   Update example la...
1818
  			devices.descpage = get_pages(1);
6570c4599   Rusty Russell   link lguest examp...
1819
1820
1821
  			break;
  		}
  	}
dde797899   Rusty Russell   lguest: documenta...
1822
1823
  
  	/* The options are fairly straight-forward */
8ca47e006   Rusty Russell   lguest: the docum...
1824
1825
1826
1827
1828
  	while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
  		switch (c) {
  		case 'v':
  			verbose = true;
  			break;
8ca47e006   Rusty Russell   lguest: the docum...
1829
  		case 't':
17cbca2ba   Rusty Russell   Update example la...
1830
  			setup_tun_net(optarg);
8ca47e006   Rusty Russell   lguest: the docum...
1831
1832
  			break;
  		case 'b':
17cbca2ba   Rusty Russell   Update example la...
1833
  			setup_block_file(optarg);
8ca47e006   Rusty Russell   lguest: the docum...
1834
  			break;
28fd6d7f9   Rusty Russell   lguest: virtio-rn...
1835
1836
1837
  		case 'r':
  			setup_rng();
  			break;
8ca47e006   Rusty Russell   lguest: the docum...
1838
1839
1840
  		case 'i':
  			initrd_name = optarg;
  			break;
8aeb36e8f   Philip Sanderson   lguest: --usernam...
1841
1842
1843
1844
1845
1846
1847
1848
  		case 'u':
  			user_details = getpwnam(optarg);
  			if (!user_details)
  				err(1, "getpwnam failed, incorrect username?");
  			break;
  		case 'c':
  			chroot_path = optarg;
  			break;
8ca47e006   Rusty Russell   lguest: the docum...
1849
1850
1851
1852
1853
  		default:
  			warnx("Unknown argument %s", argv[optind]);
  			usage();
  		}
  	}
2e04ef769   Rusty Russell   lguest: fix comme...
1854
1855
1856
1857
  	/*
  	 * After the other arguments we expect memory and kernel image name,
  	 * followed by command line arguments for the kernel.
  	 */
8ca47e006   Rusty Russell   lguest: the docum...
1858
1859
  	if (optind + 2 > argc)
  		usage();
3c6b5bfa3   Rusty Russell   Introduce guest m...
1860
1861
  	verbose("Guest base is at %p
  ", guest_base);
dde797899   Rusty Russell   lguest: documenta...
1862
  	/* We always have a console device */
17cbca2ba   Rusty Russell   Update example la...
1863
  	setup_console();
8ca47e006   Rusty Russell   lguest: the docum...
1864

8ca47e006   Rusty Russell   lguest: the docum...
1865
  	/* Now we load the kernel */
47436aa4a   Rusty Russell   Boot with virtual...
1866
  	start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
8ca47e006   Rusty Russell   lguest: the docum...
1867

3c6b5bfa3   Rusty Russell   Introduce guest m...
1868
1869
  	/* Boot information is stashed at physical address 0 */
  	boot = from_guest_phys(0);
dde797899   Rusty Russell   lguest: documenta...
1870
  	/* Map the initrd image if requested (at top of physical memory) */
8ca47e006   Rusty Russell   lguest: the docum...
1871
1872
  	if (initrd_name) {
  		initrd_size = load_initrd(initrd_name, mem);
2e04ef769   Rusty Russell   lguest: fix comme...
1873
1874
1875
1876
  		/*
  		 * These are the location in the Linux boot header where the
  		 * start and size of the initrd are expected to be found.
  		 */
43d33b21a   Rusty Russell   Use "struct boot_...
1877
1878
  		boot->hdr.ramdisk_image = mem - initrd_size;
  		boot->hdr.ramdisk_size = initrd_size;
dde797899   Rusty Russell   lguest: documenta...
1879
  		/* The bootloader type 0xFF means "unknown"; that's OK. */
43d33b21a   Rusty Russell   Use "struct boot_...
1880
  		boot->hdr.type_of_loader = 0xFF;
8ca47e006   Rusty Russell   lguest: the docum...
1881
  	}
2e04ef769   Rusty Russell   lguest: fix comme...
1882
1883
1884
1885
  	/*
  	 * The Linux boot header contains an "E820" memory map: ours is a
  	 * simple, single region.
  	 */
43d33b21a   Rusty Russell   Use "struct boot_...
1886
1887
  	boot->e820_entries = 1;
  	boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
2e04ef769   Rusty Russell   lguest: fix comme...
1888
1889
1890
1891
  	/*
  	 * The boot header contains a command line pointer: we put the command
  	 * line after the boot header.
  	 */
43d33b21a   Rusty Russell   Use "struct boot_...
1892
  	boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
e1e72965e   Rusty Russell   lguest: documenta...
1893
  	/* We use a simple helper to copy the arguments separated by spaces. */
43d33b21a   Rusty Russell   Use "struct boot_...
1894
  	concat((char *)(boot + 1), argv+optind+2);
dde797899   Rusty Russell   lguest: documenta...
1895

e22a53982   Rusty Russell   lguest: allow boo...
1896
1897
  	/* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
  	boot->hdr.kernel_alignment = 0x1000000;
814a0e5cd   Rusty Russell   Revert lguest mag...
1898
  	/* Boot protocol version: 2.07 supports the fields for lguest. */
43d33b21a   Rusty Russell   Use "struct boot_...
1899
  	boot->hdr.version = 0x207;
814a0e5cd   Rusty Russell   Revert lguest mag...
1900
1901
  
  	/* The hardware_subarch value of "1" tells the Guest it's an lguest. */
43d33b21a   Rusty Russell   Use "struct boot_...
1902
  	boot->hdr.hardware_subarch = 1;
814a0e5cd   Rusty Russell   Revert lguest mag...
1903

43d33b21a   Rusty Russell   Use "struct boot_...
1904
1905
  	/* Tell the entry path not to try to reload segment registers. */
  	boot->hdr.loadflags |= KEEP_SEGMENTS;
8ca47e006   Rusty Russell   lguest: the docum...
1906

9f54288de   Rusty Russell   lguest: update co...
1907
  	/* We tell the kernel to initialize the Guest. */
56739c802   Rusty Russell   lguest: cleanup p...
1908
  	tell_kernel(start);
dde797899   Rusty Russell   lguest: documenta...
1909

a91d74a3c   Rusty Russell   lguest: update co...
1910
  	/* Ensure that we terminate if a device-servicing child dies. */
659a0e663   Rusty Russell   lguest: have exam...
1911
1912
1913
1914
  	signal(SIGCHLD, kill_launcher);
  
  	/* If we exit via err(), this kills all the threads, restores tty. */
  	atexit(cleanup_devices);
8ca47e006   Rusty Russell   lguest: the docum...
1915

8aeb36e8f   Philip Sanderson   lguest: --usernam...
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
  	/* If requested, chroot to a directory */
  	if (chroot_path) {
  		if (chroot(chroot_path) != 0)
  			err(1, "chroot(\"%s\") failed", chroot_path);
  
  		if (chdir("/") != 0)
  			err(1, "chdir(\"/\") failed");
  
  		verbose("chroot done
  ");
  	}
  
  	/* If requested, drop privileges */
  	if (user_details) {
  		uid_t u;
  		gid_t g;
  
  		u = user_details->pw_uid;
  		g = user_details->pw_gid;
  
  		if (initgroups(user_details->pw_name, g) != 0)
  			err(1, "initgroups failed");
  
  		if (setresgid(g, g, g) != 0)
  			err(1, "setresgid failed");
  
  		if (setresuid(u, u, u) != 0)
  			err(1, "setresuid failed");
  
  		verbose("Dropping privileges completed
  ");
  	}
dde797899   Rusty Russell   lguest: documenta...
1948
  	/* Finally, run the Guest.  This doesn't return. */
56739c802   Rusty Russell   lguest: cleanup p...
1949
  	run_guest();
8ca47e006   Rusty Russell   lguest: the docum...
1950
  }
f56a384e9   Rusty Russell   lguest: documenta...
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
  /*:*/
  
  /*M:999
   * Mastery is done: you now know everything I do.
   *
   * But surely you have seen code, features and bugs in your wanderings which
   * you now yearn to attack?  That is the real game, and I look forward to you
   * patching and forking lguest into the Your-Name-Here-visor.
   *
   * Farewell, and good coding!
   * Rusty Russell.
   */