Commit a24e809902339458416900869abdcc51a44bfd48

Authored by Andi Kleen
Committed by Avi Kivity
1 parent 376d41ff26

KVM: Fix unused but set warnings

No real bugs in this one.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

Showing 2 changed files with 1 addition and 2 deletions. In the listings below the added line is marked with "+" and removed lines with "-".
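
The warning being silenced is gcc's "variable set but not used" diagnostic (-Wunused-but-set-variable, introduced around gcc 4.6). The commit applies the two usual remedies: a (void) cast where the value is only consumed by debug code that can compile away (the sptep case in paging_tmpl.h), and removal of a local that is assigned but never read (the kvm local in assigned-dev.c). Below is a minimal standalone sketch of both patterns; the names (dbg_printf, compute_value, pattern_*) are illustrative and not taken from the KVM sources.

#include <stdio.h>

/* Debug print that compiles to nothing unless DEBUG is defined, similar
 * in spirit to pgprintk() in the KVM MMU code. */
#ifdef DEBUG
#define dbg_printf(...) printf(__VA_ARGS__)
#else
#define dbg_printf(...) do { } while (0)
#endif

static int compute_value(void)
{
        return 42;
}

/* Pattern 1: without DEBUG, 'val' is only consumed by the empty debug
 * macro, so gcc reports "variable 'val' set but not used".  A (void)
 * cast marks the value as intentionally kept; this mirrors the
 * (void)sptep fix in paging_tmpl.h. */
static void pattern_void_cast(void)
{
        int val;

        val = compute_value();
        (void)val;
        dbg_printf("val = %d\n", val);
}

/* Pattern 2: the local is assigned but never read at all, so the dead
 * variable is simply deleted; this mirrors the 'kvm' removal in
 * assigned-dev.c. */
static void pattern_remove_local(void)
{
        /* before: int cached = compute_value();  -- nothing read it */
        compute_value();
}

int main(void)
{
        pattern_void_cast();
        pattern_remove_local();
        return 0;
}

Either variant should compile cleanly with gcc -Wall -Wextra whether or not DEBUG is defined.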

arch/x86/kvm/paging_tmpl.h
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affilates.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

/*
 * We need the mmu code to access both 32-bit and 64-bit guest ptes,
 * so the code in this file is compiled twice, once per pte size.
 */

#if PTTYPE == 64
#define pt_element_t u64
#define guest_walker guest_walker64
#define FNAME(name) paging##64_##name
#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4
#define CMPXCHG cmpxchg
#else
#define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
#define FNAME(name) paging##32_##name
#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg
#else
#error Invalid PTTYPE value
#endif

#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)

/*
 * The guest_walker structure emulates the behavior of the hardware page
 * table walker.
 */
struct guest_walker {
        int level;
        gfn_t table_gfn[PT_MAX_FULL_LEVELS];
        pt_element_t ptes[PT_MAX_FULL_LEVELS];
        gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
        unsigned pt_access;
        unsigned pte_access;
        gfn_t gfn;
        u32 error_code;
};

static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
{
        return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
}

static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
                                gfn_t table_gfn, unsigned index,
                                pt_element_t orig_pte, pt_element_t new_pte)
{
        pt_element_t ret;
        pt_element_t *table;
        struct page *page;

        page = gfn_to_page(kvm, table_gfn);

        table = kmap_atomic(page, KM_USER0);
        ret = CMPXCHG(&table[index], orig_pte, new_pte);
        kunmap_atomic(table, KM_USER0);

        kvm_release_page_dirty(page);

        return (ret != orig_pte);
}

static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
{
        unsigned access;

        access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
#if PTTYPE == 64
        if (is_nx(vcpu))
                access &= ~(gpte >> PT64_NX_SHIFT);
#endif
        return access;
}

/*
 * Fetch a guest pte for a guest virtual address
 */
static int FNAME(walk_addr)(struct guest_walker *walker,
                            struct kvm_vcpu *vcpu, gva_t addr,
                            int write_fault, int user_fault, int fetch_fault)
{
        pt_element_t pte;
        gfn_t table_gfn;
        unsigned index, pt_access, pte_access;
        gpa_t pte_gpa;
        int rsvd_fault = 0;

        trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
                                     fetch_fault);
walk:
        walker->level = vcpu->arch.mmu.root_level;
        pte = vcpu->arch.cr3;
#if PTTYPE == 64
        if (!is_long_mode(vcpu)) {
                pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
                trace_kvm_mmu_paging_element(pte, walker->level);
                if (!is_present_gpte(pte))
                        goto not_present;
                --walker->level;
        }
#endif
        ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
               (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0);

        pt_access = ACC_ALL;

        for (;;) {
                index = PT_INDEX(addr, walker->level);

                table_gfn = gpte_to_gfn(pte);
                pte_gpa = gfn_to_gpa(table_gfn);
                pte_gpa += index * sizeof(pt_element_t);
                walker->table_gfn[walker->level - 1] = table_gfn;
                walker->pte_gpa[walker->level - 1] = pte_gpa;

                if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
                        goto not_present;

                trace_kvm_mmu_paging_element(pte, walker->level);

                if (!is_present_gpte(pte))
                        goto not_present;

                rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
                if (rsvd_fault)
                        goto access_error;

                if (write_fault && !is_writable_pte(pte))
                        if (user_fault || is_write_protection(vcpu))
                                goto access_error;

                if (user_fault && !(pte & PT_USER_MASK))
                        goto access_error;

#if PTTYPE == 64
                if (fetch_fault && (pte & PT64_NX_MASK))
                        goto access_error;
#endif

                if (!(pte & PT_ACCESSED_MASK)) {
                        trace_kvm_mmu_set_accessed_bit(table_gfn, index,
                                                       sizeof(pte));
                        if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
                            index, pte, pte|PT_ACCESSED_MASK))
                                goto walk;
                        mark_page_dirty(vcpu->kvm, table_gfn);
                        pte |= PT_ACCESSED_MASK;
                }

                pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);

                walker->ptes[walker->level - 1] = pte;

                if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
                    ((walker->level == PT_DIRECTORY_LEVEL) &&
                                is_large_pte(pte) &&
                                (PTTYPE == 64 || is_pse(vcpu))) ||
                    ((walker->level == PT_PDPE_LEVEL) &&
                                is_large_pte(pte) &&
                                is_long_mode(vcpu))) {
                        int lvl = walker->level;

                        walker->gfn = gpte_to_gfn_lvl(pte, lvl);
                        walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl))
                                        >> PAGE_SHIFT;

                        if (PTTYPE == 32 &&
                            walker->level == PT_DIRECTORY_LEVEL &&
                            is_cpuid_PSE36())
                                walker->gfn += pse36_gfn_delta(pte);

                        break;
                }

                pt_access = pte_access;
                --walker->level;
        }

        if (write_fault && !is_dirty_gpte(pte)) {
                bool ret;

                trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
                ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
                            pte|PT_DIRTY_MASK);
                if (ret)
                        goto walk;
                mark_page_dirty(vcpu->kvm, table_gfn);
                pte |= PT_DIRTY_MASK;
                walker->ptes[walker->level - 1] = pte;
        }

        walker->pt_access = pt_access;
        walker->pte_access = pte_access;
        pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
                 __func__, (u64)pte, pte_access, pt_access);
        return 1;

not_present:
        walker->error_code = 0;
        goto err;

access_error:
        walker->error_code = PFERR_PRESENT_MASK;

err:
        if (write_fault)
                walker->error_code |= PFERR_WRITE_MASK;
        if (user_fault)
                walker->error_code |= PFERR_USER_MASK;
        if (fetch_fault)
                walker->error_code |= PFERR_FETCH_MASK;
        if (rsvd_fault)
                walker->error_code |= PFERR_RSVD_MASK;
        trace_kvm_mmu_walker_error(walker->error_code);
        return 0;
}

static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
                              u64 *spte, const void *pte)
{
        pt_element_t gpte;
        unsigned pte_access;
        pfn_t pfn;
        u64 new_spte;

        gpte = *(const pt_element_t *)pte;
        if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
                if (!is_present_gpte(gpte)) {
                        if (page->unsync)
                                new_spte = shadow_trap_nonpresent_pte;
                        else
                                new_spte = shadow_notrap_nonpresent_pte;
                        __set_spte(spte, new_spte);
                }
                return;
        }
        pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
        pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
        if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
                return;
        pfn = vcpu->arch.update_pte.pfn;
        if (is_error_pfn(pfn))
                return;
        if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
                return;
        kvm_get_pfn(pfn);
        /*
         * we call mmu_set_spte() with reset_host_protection = true beacuse that
         * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
         */
        mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
                     gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
                     gpte_to_gfn(gpte), pfn, true, true);
}

/*
 * Fetch a shadow pte for a specific level in the paging hierarchy.
 */
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
                         struct guest_walker *gw,
                         int user_fault, int write_fault, int hlevel,
                         int *ptwrite, pfn_t pfn)
{
        unsigned access = gw->pt_access;
        struct kvm_mmu_page *shadow_page;
        u64 spte, *sptep = NULL;
        int direct;
        gfn_t table_gfn;
        int r;
        int level;
        pt_element_t curr_pte;
        struct kvm_shadow_walk_iterator iterator;

        if (!is_present_gpte(gw->ptes[gw->level - 1]))
                return NULL;

        for_each_shadow_entry(vcpu, addr, iterator) {
                level = iterator.level;
                sptep = iterator.sptep;
                if (iterator.level == hlevel) {
                        mmu_set_spte(vcpu, sptep, access,
                                     gw->pte_access & access,
                                     user_fault, write_fault,
                                     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
                                     ptwrite, level,
                                     gw->gfn, pfn, false, true);
                        break;
                }

                if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
                        continue;

                if (is_large_pte(*sptep)) {
                        rmap_remove(vcpu->kvm, sptep);
                        __set_spte(sptep, shadow_trap_nonpresent_pte);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                }

                if (level <= gw->level) {
                        int delta = level - gw->level + 1;
                        direct = 1;
                        if (!is_dirty_gpte(gw->ptes[level - delta]))
                                access &= ~ACC_WRITE_MASK;
                        /*
                         * It is a large guest pages backed by small host pages,
                         * So we set @direct(@shadow_page->role.direct)=1, and
                         * set @table_gfn(@shadow_page->gfn)=the base page frame
                         * for linear translations.
                         */
                        table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
                        access &= gw->pte_access;
                } else {
                        direct = 0;
                        table_gfn = gw->table_gfn[level - 2];
                }
                shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
                                               direct, access, sptep);
                if (!direct) {
                        r = kvm_read_guest_atomic(vcpu->kvm,
                                                  gw->pte_gpa[level - 2],
                                                  &curr_pte, sizeof(curr_pte));
                        if (r || curr_pte != gw->ptes[level - 2]) {
                                kvm_mmu_put_page(shadow_page, sptep);
                                kvm_release_pfn_clean(pfn);
                                sptep = NULL;
                                break;
                        }
                }

                spte = __pa(shadow_page->spt)
                        | PT_PRESENT_MASK | PT_ACCESSED_MASK
                        | PT_WRITABLE_MASK | PT_USER_MASK;
                *sptep = spte;
        }

        return sptep;
}

/*
 * Page fault handler.  There are several causes for a page fault:
 *   - there is no shadow pte for the guest pte
 *   - write access through a shadow pte marked read only so that we can set
 *     the dirty bit
 *   - write access to a shadow pte marked read only so we can update the page
 *     dirty bitmap, when userspace requests it
 *   - mmio access; in this case we will never install a present shadow pte
 *   - normal guest page fault due to the guest pte marked not present, not
 *     writable, or not executable
 *
 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
 *          a negative value on error.
 */
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
                             u32 error_code)
{
        int write_fault = error_code & PFERR_WRITE_MASK;
        int user_fault = error_code & PFERR_USER_MASK;
        int fetch_fault = error_code & PFERR_FETCH_MASK;
        struct guest_walker walker;
        u64 *sptep;
        int write_pt = 0;
        int r;
        pfn_t pfn;
        int level = PT_PAGE_TABLE_LEVEL;
        unsigned long mmu_seq;

        pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
        kvm_mmu_audit(vcpu, "pre page fault");

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;

        /*
         * Look up the guest pte for the faulting address.
         */
        r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
                             fetch_fault);

        /*
         * The page is not mapped by the guest.  Let the guest handle it.
         */
        if (!r) {
                pgprintk("%s: guest page fault\n", __func__);
                inject_page_fault(vcpu, addr, walker.error_code);
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
                return 0;
        }

        if (walker.level >= PT_DIRECTORY_LEVEL) {
                level = min(walker.level, mapping_level(vcpu, walker.gfn));
                walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
        }

        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
        pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);

        /* mmio */
        if (is_error_pfn(pfn))
                return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);

        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
                             level, &write_pt, pfn);
+       (void)sptep;
        pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
                 sptep, *sptep, write_pt);

        if (!write_pt)
                vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

        ++vcpu->stat.pf_fixed;
        kvm_mmu_audit(vcpu, "post page fault (fixed)");
        spin_unlock(&vcpu->kvm->mmu_lock);

        return write_pt;

out_unlock:
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}

static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
        gpa_t pte_gpa = -1;
        int level;
        u64 *sptep;
        int need_flush = 0;

        spin_lock(&vcpu->kvm->mmu_lock);

        for_each_shadow_entry(vcpu, gva, iterator) {
                level = iterator.level;
                sptep = iterator.sptep;

                sp = page_header(__pa(sptep));
                if (is_last_spte(*sptep, level)) {
                        int offset, shift;

                        if (!sp->unsync)
                                break;

                        shift = PAGE_SHIFT -
                                  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
                        offset = sp->role.quadrant << shift;

                        pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
                        pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);

                        if (is_shadow_present_pte(*sptep)) {
                                rmap_remove(vcpu->kvm, sptep);
                                if (is_large_pte(*sptep))
                                        --vcpu->kvm->stat.lpages;
                                need_flush = 1;
                        }
                        __set_spte(sptep, shadow_trap_nonpresent_pte);
                        break;
                }

                if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
                        break;
        }

        if (need_flush)
                kvm_flush_remote_tlbs(vcpu->kvm);

        atomic_inc(&vcpu->kvm->arch.invlpg_counter);

        spin_unlock(&vcpu->kvm->mmu_lock);

        if (pte_gpa == -1)
                return;

        if (mmu_topup_memory_caches(vcpu))
                return;
        kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
}

static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
                               u32 *error)
{
        struct guest_walker walker;
        gpa_t gpa = UNMAPPED_GVA;
        int r;

        r = FNAME(walk_addr)(&walker, vcpu, vaddr,
                             !!(access & PFERR_WRITE_MASK),
                             !!(access & PFERR_USER_MASK),
                             !!(access & PFERR_FETCH_MASK));

        if (r) {
                gpa = gfn_to_gpa(walker.gfn);
                gpa |= vaddr & ~PAGE_MASK;
        } else if (error)
                *error = walker.error_code;

        return gpa;
}

static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
                                 struct kvm_mmu_page *sp)
{
        int i, j, offset, r;
        pt_element_t pt[256 / sizeof(pt_element_t)];
        gpa_t pte_gpa;

        if (sp->role.direct
            || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
                nonpaging_prefetch_page(vcpu, sp);
                return;
        }

        pte_gpa = gfn_to_gpa(sp->gfn);
        if (PTTYPE == 32) {
                offset = sp->role.quadrant << PT64_LEVEL_BITS;
                pte_gpa += offset * sizeof(pt_element_t);
        }

        for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
                r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
                pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
                for (j = 0; j < ARRAY_SIZE(pt); ++j)
                        if (r || is_present_gpte(pt[j]))
                                sp->spt[i+j] = shadow_trap_nonpresent_pte;
                        else
                                sp->spt[i+j] = shadow_notrap_nonpresent_pte;
        }
}

/*
 * Using the cached information from sp->gfns is safe because:
 * - The spte has a reference to the struct page, so the pfn for a given gfn
 *   can't change unless all sptes pointing to it are nuked first.
 * - Alias changes zap the entire shadow cache.
 */
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
        int i, offset, nr_present;
        bool reset_host_protection;
        gpa_t first_pte_gpa;

        offset = nr_present = 0;

        /* direct kvm_mmu_page can not be unsync. */
        BUG_ON(sp->role.direct);

        if (PTTYPE == 32)
                offset = sp->role.quadrant << PT64_LEVEL_BITS;

        first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);

        for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
                unsigned pte_access;
                pt_element_t gpte;
                gpa_t pte_gpa;
                gfn_t gfn;

                if (!is_shadow_present_pte(sp->spt[i]))
                        continue;

                pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);

                if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
                                          sizeof(pt_element_t)))
                        return -EINVAL;

                gfn = gpte_to_gfn(gpte);
                if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] ||
                      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
                        u64 nonpresent;

                        rmap_remove(vcpu->kvm, &sp->spt[i]);
                        if (is_present_gpte(gpte))
                                nonpresent = shadow_trap_nonpresent_pte;
                        else
                                nonpresent = shadow_notrap_nonpresent_pte;
                        __set_spte(&sp->spt[i], nonpresent);
                        continue;
                }

                nr_present++;
                pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
                if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
                        pte_access &= ~ACC_WRITE_MASK;
                        reset_host_protection = 0;
                } else {
                        reset_host_protection = 1;
                }
                set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
                         is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
                         spte_to_pfn(sp->spt[i]), true, false,
                         reset_host_protection);
        }

        return !nr_present;
}

#undef pt_element_t
#undef guest_walker
#undef FNAME
#undef PT_BASE_ADDR_MASK
#undef PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS
#undef PT_MAX_FULL_LEVELS
#undef gpte_to_gfn
#undef gpte_to_gfn_lvl
#undef CMPXCHG

virt/kvm/assigned-dev.c
/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include "irq.h"

static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
                                                             int assigned_dev_id)
{
        struct list_head *ptr;
        struct kvm_assigned_dev_kernel *match;

        list_for_each(ptr, head) {
                match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
                if (match->assigned_dev_id == assigned_dev_id)
                        return match;
        }
        return NULL;
}

static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
                                    *assigned_dev, int irq)
{
        int i, index;
        struct msix_entry *host_msix_entries;

        host_msix_entries = assigned_dev->host_msix_entries;

        index = -1;
        for (i = 0; i < assigned_dev->entries_nr; i++)
                if (irq == host_msix_entries[i].vector) {
                        index = i;
                        break;
                }
        if (index < 0) {
                printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
                return 0;
        }

        return index;
}

static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
{
        struct kvm_assigned_dev_kernel *assigned_dev;
-       struct kvm *kvm;
        int i;

        assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
                                    interrupt_work);
-       kvm = assigned_dev->kvm;

        spin_lock_irq(&assigned_dev->assigned_dev_lock);
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                struct kvm_guest_msix_entry *guest_entries =
                        assigned_dev->guest_msix_entries;
                for (i = 0; i < assigned_dev->entries_nr; i++) {
                        if (!(guest_entries[i].flags &
                                        KVM_ASSIGNED_MSIX_PENDING))
                                continue;
                        guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
                        kvm_set_irq(assigned_dev->kvm,
                                    assigned_dev->irq_source_id,
                                    guest_entries[i].vector, 1);
                }
        } else
                kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
                            assigned_dev->guest_irq, 1);

        spin_unlock_irq(&assigned_dev->assigned_dev_lock);
}

static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
        unsigned long flags;
        struct kvm_assigned_dev_kernel *assigned_dev =
                (struct kvm_assigned_dev_kernel *) dev_id;

        spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                int index = find_index_from_host_irq(assigned_dev, irq);
                if (index < 0)
                        goto out;
                assigned_dev->guest_msix_entries[index].flags |=
                        KVM_ASSIGNED_MSIX_PENDING;
        }

        schedule_work(&assigned_dev->interrupt_work);

        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
                disable_irq_nosync(irq);
                assigned_dev->host_irq_disabled = true;
        }

out:
        spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
        return IRQ_HANDLED;
}

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
        struct kvm_assigned_dev_kernel *dev;
        unsigned long flags;

        if (kian->gsi == -1)
                return;

        dev = container_of(kian, struct kvm_assigned_dev_kernel,
                           ack_notifier);

        kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

        /* The guest irq may be shared so this ack may be
         * from another device.
         */
        spin_lock_irqsave(&dev->assigned_dev_lock, flags);
        if (dev->host_irq_disabled) {
                enable_irq(dev->host_irq);
                dev->host_irq_disabled = false;
        }
        spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
}

static void deassign_guest_irq(struct kvm *kvm,
                               struct kvm_assigned_dev_kernel *assigned_dev)
{
        kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
        assigned_dev->ack_notifier.gsi = -1;

        if (assigned_dev->irq_source_id != -1)
                kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
        assigned_dev->irq_source_id = -1;
        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}

/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
static void deassign_host_irq(struct kvm *kvm,
                              struct kvm_assigned_dev_kernel *assigned_dev)
{
        /*
         * In kvm_free_device_irq, cancel_work_sync return true if:
         * 1. work is scheduled, and then cancelled.
         * 2. work callback is executed.
         *
         * The first one ensured that the irq is disabled and no more events
         * would happen. But for the second one, the irq may be enabled (e.g.
         * for MSI). So we disable irq here to prevent further events.
         *
         * Notice this maybe result in nested disable if the interrupt type is
         * INTx, but it's OK for we are going to free it.
         *
         * If this function is a part of VM destroy, please ensure that till
         * now, the kvm state is still legal for probably we also have to wait
         * interrupt_work done.
         */
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                int i;
                for (i = 0; i < assigned_dev->entries_nr; i++)
                        disable_irq_nosync(assigned_dev->
                                           host_msix_entries[i].vector);

                cancel_work_sync(&assigned_dev->interrupt_work);

                for (i = 0; i < assigned_dev->entries_nr; i++)
                        free_irq(assigned_dev->host_msix_entries[i].vector,
                                 (void *)assigned_dev);

                assigned_dev->entries_nr = 0;
                kfree(assigned_dev->host_msix_entries);
                kfree(assigned_dev->guest_msix_entries);
                pci_disable_msix(assigned_dev->dev);
        } else {
                /* Deal with MSI and INTx */
                disable_irq_nosync(assigned_dev->host_irq);
                cancel_work_sync(&assigned_dev->interrupt_work);

                free_irq(assigned_dev->host_irq, (void *)assigned_dev);

                if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
                        pci_disable_msi(assigned_dev->dev);
        }

        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}

static int kvm_deassign_irq(struct kvm *kvm,
                            struct kvm_assigned_dev_kernel *assigned_dev,
                            unsigned long irq_requested_type)
{
        unsigned long guest_irq_type, host_irq_type;

        if (!irqchip_in_kernel(kvm))
                return -EINVAL;
        /* no irq assignment to deassign */
        if (!assigned_dev->irq_requested_type)
                return -ENXIO;

        host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
        guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

        if (host_irq_type)
                deassign_host_irq(kvm, assigned_dev);
        if (guest_irq_type)
                deassign_guest_irq(kvm, assigned_dev);

        return 0;
}

static void kvm_free_assigned_irq(struct kvm *kvm,
                                  struct kvm_assigned_dev_kernel *assigned_dev)
{
        kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}

static void kvm_free_assigned_device(struct kvm *kvm,
                                     struct kvm_assigned_dev_kernel
                                     *assigned_dev)
{
        kvm_free_assigned_irq(kvm, assigned_dev);

        pci_reset_function(assigned_dev->dev);

        pci_release_regions(assigned_dev->dev);
        pci_disable_device(assigned_dev->dev);
        pci_dev_put(assigned_dev->dev);

        list_del(&assigned_dev->list);
        kfree(assigned_dev);
}

void kvm_free_all_assigned_devices(struct kvm *kvm)
{
        struct list_head *ptr, *ptr2;
        struct kvm_assigned_dev_kernel *assigned_dev;

        list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
                assigned_dev = list_entry(ptr,
                                          struct kvm_assigned_dev_kernel,
                                          list);

                kvm_free_assigned_device(kvm, assigned_dev);
        }
}

static int assigned_device_enable_host_intx(struct kvm *kvm,
                                            struct kvm_assigned_dev_kernel *dev)
{
        dev->host_irq = dev->dev->irq;
        /* Even though this is PCI, we don't want to use shared
         * interrupts. Sharing host devices with guest-assigned devices
         * on the same interrupt line is not a happy situation: there
         * are going to be long delays in accepting, acking, etc.
         */
        if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
                        0, "kvm_assigned_intx_device", (void *)dev))
                return -EIO;
        return 0;
}

#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
                                           struct kvm_assigned_dev_kernel *dev)
{
        int r;

        if (!dev->dev->msi_enabled) {
                r = pci_enable_msi(dev->dev);
                if (r)
                        return r;
        }

        dev->host_irq = dev->dev->irq;
        if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
                        "kvm_assigned_msi_device", (void *)dev)) {
                pci_disable_msi(dev->dev);
                return -EIO;
        }

        return 0;
}
#endif

#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
                                            struct kvm_assigned_dev_kernel *dev)
{
        int i, r = -EINVAL;

        /* host_msix_entries and guest_msix_entries should have been
         * initialized */
        if (dev->entries_nr == 0)
                return r;

        r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
        if (r)
                return r;

        for (i = 0; i < dev->entries_nr; i++) {
                r = request_irq(dev->host_msix_entries[i].vector,
                                kvm_assigned_dev_intr, 0,
                                "kvm_assigned_msix_device",
                                (void *)dev);
                if (r)
                        goto err;
        }

        return 0;
err:
        for (i -= 1; i >= 0; i--)
                free_irq(dev->host_msix_entries[i].vector, (void *)dev);
        pci_disable_msix(dev->dev);
        return r;
}

#endif

static int assigned_device_enable_guest_intx(struct kvm *kvm,
                                struct kvm_assigned_dev_kernel *dev,
                                struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = irq->guest_irq;
        return 0;
}

#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
                        struct kvm_assigned_dev_kernel *dev,
                        struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = -1;
        dev->host_irq_disabled = false;
        return 0;
}
#endif

#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
                        struct kvm_assigned_dev_kernel *dev,
                        struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = -1;
        dev->host_irq_disabled = false;
        return 0;
}
#endif

static int assign_host_irq(struct kvm *kvm,
                           struct kvm_assigned_dev_kernel *dev,
                           __u32 host_irq_type)
{
        int r = -EEXIST;

        if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
                return r;

        switch (host_irq_type) {
        case KVM_DEV_IRQ_HOST_INTX:
                r = assigned_device_enable_host_intx(kvm, dev);
                break;
#ifdef __KVM_HAVE_MSI
        case KVM_DEV_IRQ_HOST_MSI:
                r = assigned_device_enable_host_msi(kvm, dev);
                break;
#endif
#ifdef __KVM_HAVE_MSIX
        case KVM_DEV_IRQ_HOST_MSIX:
                r = assigned_device_enable_host_msix(kvm, dev);
                break;
#endif
        default:
                r = -EINVAL;
        }

        if (!r)
                dev->irq_requested_type |= host_irq_type;

        return r;
}

static int assign_guest_irq(struct kvm *kvm,
                            struct kvm_assigned_dev_kernel *dev,
                            struct kvm_assigned_irq *irq,
                            unsigned long guest_irq_type)
{
        int id;
        int r = -EEXIST;

        if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
                return r;

        id = kvm_request_irq_source_id(kvm);
411 if (id < 0) 409 if (id < 0)
412 return id; 410 return id;
413 411
414 dev->irq_source_id = id; 412 dev->irq_source_id = id;
415 413
416 switch (guest_irq_type) { 414 switch (guest_irq_type) {
417 case KVM_DEV_IRQ_GUEST_INTX: 415 case KVM_DEV_IRQ_GUEST_INTX:
418 r = assigned_device_enable_guest_intx(kvm, dev, irq); 416 r = assigned_device_enable_guest_intx(kvm, dev, irq);
419 break; 417 break;
420 #ifdef __KVM_HAVE_MSI 418 #ifdef __KVM_HAVE_MSI
421 case KVM_DEV_IRQ_GUEST_MSI: 419 case KVM_DEV_IRQ_GUEST_MSI:
422 r = assigned_device_enable_guest_msi(kvm, dev, irq); 420 r = assigned_device_enable_guest_msi(kvm, dev, irq);
423 break; 421 break;
424 #endif 422 #endif
425 #ifdef __KVM_HAVE_MSIX 423 #ifdef __KVM_HAVE_MSIX
426 case KVM_DEV_IRQ_GUEST_MSIX: 424 case KVM_DEV_IRQ_GUEST_MSIX:
427 r = assigned_device_enable_guest_msix(kvm, dev, irq); 425 r = assigned_device_enable_guest_msix(kvm, dev, irq);
428 break; 426 break;
429 #endif 427 #endif
430 default: 428 default:
431 r = -EINVAL; 429 r = -EINVAL;
432 } 430 }
433 431
434 if (!r) { 432 if (!r) {
435 dev->irq_requested_type |= guest_irq_type; 433 dev->irq_requested_type |= guest_irq_type;
436 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); 434 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
437 } else 435 } else
438 kvm_free_irq_source_id(kvm, dev->irq_source_id); 436 kvm_free_irq_source_id(kvm, dev->irq_source_id);
439 437
440 return r; 438 return r;
441 } 439 }
442 440
443 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ 441 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
444 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 442 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
445 struct kvm_assigned_irq *assigned_irq) 443 struct kvm_assigned_irq *assigned_irq)
446 { 444 {
447 int r = -EINVAL; 445 int r = -EINVAL;
448 struct kvm_assigned_dev_kernel *match; 446 struct kvm_assigned_dev_kernel *match;
449 unsigned long host_irq_type, guest_irq_type; 447 unsigned long host_irq_type, guest_irq_type;
450 448
451 if (!irqchip_in_kernel(kvm)) 449 if (!irqchip_in_kernel(kvm))
452 return r; 450 return r;
453 451
454 mutex_lock(&kvm->lock); 452 mutex_lock(&kvm->lock);
455 r = -ENODEV; 453 r = -ENODEV;
456 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 454 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
457 assigned_irq->assigned_dev_id); 455 assigned_irq->assigned_dev_id);
458 if (!match) 456 if (!match)
459 goto out; 457 goto out;
460 458
461 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); 459 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
462 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); 460 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
463 461
464 r = -EINVAL; 462 r = -EINVAL;
465 /* can only assign one type at a time */ 463 /* can only assign one type at a time */
466 if (hweight_long(host_irq_type) > 1) 464 if (hweight_long(host_irq_type) > 1)
467 goto out; 465 goto out;
468 if (hweight_long(guest_irq_type) > 1) 466 if (hweight_long(guest_irq_type) > 1)
469 goto out; 467 goto out;
470 if (host_irq_type == 0 && guest_irq_type == 0) 468 if (host_irq_type == 0 && guest_irq_type == 0)
471 goto out; 469 goto out;
472 470
473 r = 0; 471 r = 0;
474 if (host_irq_type) 472 if (host_irq_type)
475 r = assign_host_irq(kvm, match, host_irq_type); 473 r = assign_host_irq(kvm, match, host_irq_type);
476 if (r) 474 if (r)
477 goto out; 475 goto out;
478 476
479 if (guest_irq_type) 477 if (guest_irq_type)
480 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); 478 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
481 out: 479 out:
482 mutex_unlock(&kvm->lock); 480 mutex_unlock(&kvm->lock);
483 return r; 481 return r;
484 } 482 }
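
For reference, the flags word handled above carries at most one KVM_DEV_IRQ_HOST_* bit and at most one KVM_DEV_IRQ_GUEST_* bit (the hweight_long checks), and host and guest halves are usually requested in one call. A minimal userspace sketch, assuming an already-created VM fd and a device id previously registered through KVM_ASSIGN_PCI_DEVICE (both are placeholders, not part of this patch):

/*
 * Hedged sketch: request INTx delivery for an assigned device.
 * vm_fd, dev_id and guest_gsi are assumptions supplied by the caller.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int assign_intx(int vm_fd, __u32 dev_id, __u32 guest_gsi)
{
        struct kvm_assigned_irq irq;

        memset(&irq, 0, sizeof(irq));
        irq.assigned_dev_id = dev_id;           /* must match the assigned device */
        irq.guest_irq = guest_gsi;              /* GSI injected into the guest */
        /* one host type bit plus one guest type bit, as checked above */
        irq.flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX;

        return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}
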
485 483
486 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, 484 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
487 struct kvm_assigned_irq 485 struct kvm_assigned_irq
488 *assigned_irq) 486 *assigned_irq)
489 { 487 {
490 int r = -ENODEV; 488 int r = -ENODEV;
491 struct kvm_assigned_dev_kernel *match; 489 struct kvm_assigned_dev_kernel *match;
492 490
493 mutex_lock(&kvm->lock); 491 mutex_lock(&kvm->lock);
494 492
495 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 493 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
496 assigned_irq->assigned_dev_id); 494 assigned_irq->assigned_dev_id);
497 if (!match) 495 if (!match)
498 goto out; 496 goto out;
499 497
500 r = kvm_deassign_irq(kvm, match, assigned_irq->flags); 498 r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
501 out: 499 out:
502 mutex_unlock(&kvm->lock); 500 mutex_unlock(&kvm->lock);
503 return r; 501 return r;
504 } 502 }
505 503
506 static int kvm_vm_ioctl_assign_device(struct kvm *kvm, 504 static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
507 struct kvm_assigned_pci_dev *assigned_dev) 505 struct kvm_assigned_pci_dev *assigned_dev)
508 { 506 {
509 int r = 0, idx; 507 int r = 0, idx;
510 struct kvm_assigned_dev_kernel *match; 508 struct kvm_assigned_dev_kernel *match;
511 struct pci_dev *dev; 509 struct pci_dev *dev;
512 510
513 mutex_lock(&kvm->lock); 511 mutex_lock(&kvm->lock);
514 idx = srcu_read_lock(&kvm->srcu); 512 idx = srcu_read_lock(&kvm->srcu);
515 513
516 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 514 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
517 assigned_dev->assigned_dev_id); 515 assigned_dev->assigned_dev_id);
518 if (match) { 516 if (match) {
519 /* device already assigned */ 517 /* device already assigned */
520 r = -EEXIST; 518 r = -EEXIST;
521 goto out; 519 goto out;
522 } 520 }
523 521
524 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 522 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
525 if (match == NULL) { 523 if (match == NULL) {
526 printk(KERN_INFO "%s: Couldn't allocate memory\n", 524 printk(KERN_INFO "%s: Couldn't allocate memory\n",
527 __func__); 525 __func__);
528 r = -ENOMEM; 526 r = -ENOMEM;
529 goto out; 527 goto out;
530 } 528 }
531 dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, 529 dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
532 assigned_dev->busnr, 530 assigned_dev->busnr,
533 assigned_dev->devfn); 531 assigned_dev->devfn);
534 if (!dev) { 532 if (!dev) {
535 printk(KERN_INFO "%s: host device not found\n", __func__); 533 printk(KERN_INFO "%s: host device not found\n", __func__);
536 r = -EINVAL; 534 r = -EINVAL;
537 goto out_free; 535 goto out_free;
538 } 536 }
539 if (pci_enable_device(dev)) { 537 if (pci_enable_device(dev)) {
540 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 538 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
541 r = -EBUSY; 539 r = -EBUSY;
542 goto out_put; 540 goto out_put;
543 } 541 }
544 r = pci_request_regions(dev, "kvm_assigned_device"); 542 r = pci_request_regions(dev, "kvm_assigned_device");
545 if (r) { 543 if (r) {
546 printk(KERN_INFO "%s: Could not get access to device regions\n", 544 printk(KERN_INFO "%s: Could not get access to device regions\n",
547 __func__); 545 __func__);
548 goto out_disable; 546 goto out_disable;
549 } 547 }
550 548
551 pci_reset_function(dev); 549 pci_reset_function(dev);
552 550
553 match->assigned_dev_id = assigned_dev->assigned_dev_id; 551 match->assigned_dev_id = assigned_dev->assigned_dev_id;
554 match->host_segnr = assigned_dev->segnr; 552 match->host_segnr = assigned_dev->segnr;
555 match->host_busnr = assigned_dev->busnr; 553 match->host_busnr = assigned_dev->busnr;
556 match->host_devfn = assigned_dev->devfn; 554 match->host_devfn = assigned_dev->devfn;
557 match->flags = assigned_dev->flags; 555 match->flags = assigned_dev->flags;
558 match->dev = dev; 556 match->dev = dev;
559 spin_lock_init(&match->assigned_dev_lock); 557 spin_lock_init(&match->assigned_dev_lock);
560 match->irq_source_id = -1; 558 match->irq_source_id = -1;
561 match->kvm = kvm; 559 match->kvm = kvm;
562 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 560 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
563 INIT_WORK(&match->interrupt_work, 561 INIT_WORK(&match->interrupt_work,
564 kvm_assigned_dev_interrupt_work_handler); 562 kvm_assigned_dev_interrupt_work_handler);
565 563
566 list_add(&match->list, &kvm->arch.assigned_dev_head); 564 list_add(&match->list, &kvm->arch.assigned_dev_head);
567 565
568 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 566 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
569 if (!kvm->arch.iommu_domain) { 567 if (!kvm->arch.iommu_domain) {
570 r = kvm_iommu_map_guest(kvm); 568 r = kvm_iommu_map_guest(kvm);
571 if (r) 569 if (r)
572 goto out_list_del; 570 goto out_list_del;
573 } 571 }
574 r = kvm_assign_device(kvm, match); 572 r = kvm_assign_device(kvm, match);
575 if (r) 573 if (r)
576 goto out_list_del; 574 goto out_list_del;
577 } 575 }
578 576
579 out: 577 out:
580 srcu_read_unlock(&kvm->srcu, idx); 578 srcu_read_unlock(&kvm->srcu, idx);
581 mutex_unlock(&kvm->lock); 579 mutex_unlock(&kvm->lock);
582 return r; 580 return r;
583 out_list_del: 581 out_list_del:
584 list_del(&match->list); 582 list_del(&match->list);
585 pci_release_regions(dev); 583 pci_release_regions(dev);
586 out_disable: 584 out_disable:
587 pci_disable_device(dev); 585 pci_disable_device(dev);
588 out_put: 586 out_put:
589 pci_dev_put(dev); 587 pci_dev_put(dev);
590 out_free: 588 out_free:
591 kfree(match); 589 kfree(match);
592 srcu_read_unlock(&kvm->srcu, idx); 590 srcu_read_unlock(&kvm->srcu, idx);
593 mutex_unlock(&kvm->lock); 591 mutex_unlock(&kvm->lock);
594 return r; 592 return r;
595 } 593 }
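
The segnr/busnr/devfn triple above feeds pci_get_domain_bus_and_slot(), while assigned_dev_id is only used as an opaque key for later lookups; KVM_DEV_ASSIGN_ENABLE_IOMMU selects the kvm_iommu_map_guest()/kvm_assign_device() path. A hypothetical caller, with the BDF values and the id encoding shown purely as illustration:

/*
 * Hedged sketch of KVM_ASSIGN_PCI_DEVICE.  The id encoding below is a
 * common userspace convention; any unique, stable value would do.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int assign_pci_dev(int vm_fd, __u16 seg, __u8 bus, __u8 devfn)
{
        struct kvm_assigned_pci_dev dev;

        memset(&dev, 0, sizeof(dev));
        dev.segnr = seg;                        /* PCI domain */
        dev.busnr = bus;
        dev.devfn = devfn;                      /* slot << 3 | function */
        dev.assigned_dev_id = (bus << 8) | devfn;
        dev.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU; /* map guest memory in the IOMMU */

        return ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
}
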
596 594
597 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 595 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
598 struct kvm_assigned_pci_dev *assigned_dev) 596 struct kvm_assigned_pci_dev *assigned_dev)
599 { 597 {
600 int r = 0; 598 int r = 0;
601 struct kvm_assigned_dev_kernel *match; 599 struct kvm_assigned_dev_kernel *match;
602 600
603 mutex_lock(&kvm->lock); 601 mutex_lock(&kvm->lock);
604 602
605 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 603 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
606 assigned_dev->assigned_dev_id); 604 assigned_dev->assigned_dev_id);
607 if (!match) { 605 if (!match) {
608 printk(KERN_INFO "%s: device hasn't been assigned before, " 606 printk(KERN_INFO "%s: device hasn't been assigned before, "
609 "so cannot be deassigned\n", __func__); 607 "so cannot be deassigned\n", __func__);
610 r = -EINVAL; 608 r = -EINVAL;
611 goto out; 609 goto out;
612 } 610 }
613 611
614 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 612 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
615 kvm_deassign_device(kvm, match); 613 kvm_deassign_device(kvm, match);
616 614
617 kvm_free_assigned_device(kvm, match); 615 kvm_free_assigned_device(kvm, match);
618 616
619 out: 617 out:
620 mutex_unlock(&kvm->lock); 618 mutex_unlock(&kvm->lock);
621 return r; 619 return r;
622 } 620 }
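
Tear-down mirrors assignment: the IRQ types are released first, then the device itself. A hedged sketch, assuming the same dev_id and the INTx flags used when the IRQ was assigned:

/*
 * Hedged sketch of the deassign path.  The flags passed to
 * KVM_DEASSIGN_DEV_IRQ name the IRQ types that were previously assigned.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static void deassign_dev(int vm_fd, __u32 dev_id)
{
        struct kvm_assigned_irq irq;
        struct kvm_assigned_pci_dev dev;

        memset(&irq, 0, sizeof(irq));
        irq.assigned_dev_id = dev_id;
        irq.flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX;
        ioctl(vm_fd, KVM_DEASSIGN_DEV_IRQ, &irq);

        memset(&dev, 0, sizeof(dev));
        dev.assigned_dev_id = dev_id;
        ioctl(vm_fd, KVM_DEASSIGN_PCI_DEVICE, &dev);
}
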
623 621
624 622
625 #ifdef __KVM_HAVE_MSIX 623 #ifdef __KVM_HAVE_MSIX
626 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 624 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
627 struct kvm_assigned_msix_nr *entry_nr) 625 struct kvm_assigned_msix_nr *entry_nr)
628 { 626 {
629 int r = 0; 627 int r = 0;
630 struct kvm_assigned_dev_kernel *adev; 628 struct kvm_assigned_dev_kernel *adev;
631 629
632 mutex_lock(&kvm->lock); 630 mutex_lock(&kvm->lock);
633 631
634 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 632 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
635 entry_nr->assigned_dev_id); 633 entry_nr->assigned_dev_id);
636 if (!adev) { 634 if (!adev) {
637 r = -EINVAL; 635 r = -EINVAL;
638 goto msix_nr_out; 636 goto msix_nr_out;
639 } 637 }
640 638
641 if (adev->entries_nr == 0) { 639 if (adev->entries_nr == 0) {
642 adev->entries_nr = entry_nr->entry_nr; 640 adev->entries_nr = entry_nr->entry_nr;
643 if (adev->entries_nr == 0 || 641 if (adev->entries_nr == 0 ||
644 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { 642 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
645 r = -EINVAL; 643 r = -EINVAL;
646 goto msix_nr_out; 644 goto msix_nr_out;
647 } 645 }
648 646
649 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * 647 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
650 entry_nr->entry_nr, 648 entry_nr->entry_nr,
651 GFP_KERNEL); 649 GFP_KERNEL);
652 if (!adev->host_msix_entries) { 650 if (!adev->host_msix_entries) {
653 r = -ENOMEM; 651 r = -ENOMEM;
654 goto msix_nr_out; 652 goto msix_nr_out;
655 } 653 }
656 adev->guest_msix_entries = kzalloc( 654 adev->guest_msix_entries = kzalloc(
657 sizeof(struct kvm_guest_msix_entry) * 655 sizeof(struct kvm_guest_msix_entry) *
658 entry_nr->entry_nr, GFP_KERNEL); 656 entry_nr->entry_nr, GFP_KERNEL);
659 if (!adev->guest_msix_entries) { 657 if (!adev->guest_msix_entries) {
660 kfree(adev->host_msix_entries); 658 kfree(adev->host_msix_entries);
661 r = -ENOMEM; 659 r = -ENOMEM;
662 goto msix_nr_out; 660 goto msix_nr_out;
663 } 661 }
664 	} else /* Not allowed to set MSI-X number twice */ 662 	} else /* Not allowed to set MSI-X number twice */
665 r = -EINVAL; 663 r = -EINVAL;
666 msix_nr_out: 664 msix_nr_out:
667 mutex_unlock(&kvm->lock); 665 mutex_unlock(&kvm->lock);
668 return r; 666 return r;
669 } 667 }
670 668
671 static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, 669 static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
672 struct kvm_assigned_msix_entry *entry) 670 struct kvm_assigned_msix_entry *entry)
673 { 671 {
674 int r = 0, i; 672 int r = 0, i;
675 struct kvm_assigned_dev_kernel *adev; 673 struct kvm_assigned_dev_kernel *adev;
676 674
677 mutex_lock(&kvm->lock); 675 mutex_lock(&kvm->lock);
678 676
679 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 677 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
680 entry->assigned_dev_id); 678 entry->assigned_dev_id);
681 679
682 if (!adev) { 680 if (!adev) {
683 r = -EINVAL; 681 r = -EINVAL;
684 goto msix_entry_out; 682 goto msix_entry_out;
685 } 683 }
686 684
687 for (i = 0; i < adev->entries_nr; i++) 685 for (i = 0; i < adev->entries_nr; i++)
688 if (adev->guest_msix_entries[i].vector == 0 || 686 if (adev->guest_msix_entries[i].vector == 0 ||
689 adev->guest_msix_entries[i].entry == entry->entry) { 687 adev->guest_msix_entries[i].entry == entry->entry) {
690 adev->guest_msix_entries[i].entry = entry->entry; 688 adev->guest_msix_entries[i].entry = entry->entry;
691 adev->guest_msix_entries[i].vector = entry->gsi; 689 adev->guest_msix_entries[i].vector = entry->gsi;
692 adev->host_msix_entries[i].entry = entry->entry; 690 adev->host_msix_entries[i].entry = entry->entry;
693 break; 691 break;
694 } 692 }
695 if (i == adev->entries_nr) { 693 if (i == adev->entries_nr) {
696 r = -ENOSPC; 694 r = -ENOSPC;
697 goto msix_entry_out; 695 goto msix_entry_out;
698 } 696 }
699 697
700 msix_entry_out: 698 msix_entry_out:
701 mutex_unlock(&kvm->lock); 699 mutex_unlock(&kvm->lock);
702 700
703 return r; 701 return r;
704 } 702 }
705 #endif 703 #endif
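
The two handlers above imply a three-step MSI-X bring-up from userspace: set the entry count (allowed only once), program each table-index-to-GSI mapping, then enable delivery via KVM_ASSIGN_DEV_IRQ with the MSI-X type bits. A sketch under the assumption that vm_fd, dev_id and the gsis[] array are provided by the caller:

/*
 * Hedged sketch of MSI-X setup for an assigned device.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int assign_msix(int vm_fd, __u32 dev_id, const __u32 *gsis, __u16 nr)
{
        struct kvm_assigned_msix_nr nr_msg;
        struct kvm_assigned_msix_entry ent;
        struct kvm_assigned_irq irq;
        __u16 i;

        memset(&nr_msg, 0, sizeof(nr_msg));
        nr_msg.assigned_dev_id = dev_id;
        nr_msg.entry_nr = nr;                   /* may be set only once */
        if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr_msg) < 0)
                return -1;

        for (i = 0; i < nr; i++) {
                memset(&ent, 0, sizeof(ent));
                ent.assigned_dev_id = dev_id;
                ent.entry = i;                  /* MSI-X table index */
                ent.gsi = gsis[i];              /* guest GSI routed to that entry */
                if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_ENTRY, &ent) < 0)
                        return -1;
        }

        memset(&irq, 0, sizeof(irq));
        irq.assigned_dev_id = dev_id;
        irq.flags = KVM_DEV_IRQ_HOST_MSIX | KVM_DEV_IRQ_GUEST_MSIX;
        return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}
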
706 704
707 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, 705 long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
708 unsigned long arg) 706 unsigned long arg)
709 { 707 {
710 void __user *argp = (void __user *)arg; 708 void __user *argp = (void __user *)arg;
711 int r = -ENOTTY; 709 int r = -ENOTTY;
712 710
713 switch (ioctl) { 711 switch (ioctl) {
714 case KVM_ASSIGN_PCI_DEVICE: { 712 case KVM_ASSIGN_PCI_DEVICE: {
715 struct kvm_assigned_pci_dev assigned_dev; 713 struct kvm_assigned_pci_dev assigned_dev;
716 714
717 r = -EFAULT; 715 r = -EFAULT;
718 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 716 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
719 goto out; 717 goto out;
720 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 718 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
721 if (r) 719 if (r)
722 goto out; 720 goto out;
723 break; 721 break;
724 } 722 }
725 case KVM_ASSIGN_IRQ: { 723 case KVM_ASSIGN_IRQ: {
726 r = -EOPNOTSUPP; 724 r = -EOPNOTSUPP;
727 break; 725 break;
728 } 726 }
729 #ifdef KVM_CAP_ASSIGN_DEV_IRQ 727 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
730 case KVM_ASSIGN_DEV_IRQ: { 728 case KVM_ASSIGN_DEV_IRQ: {
731 struct kvm_assigned_irq assigned_irq; 729 struct kvm_assigned_irq assigned_irq;
732 730
733 r = -EFAULT; 731 r = -EFAULT;
734 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 732 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
735 goto out; 733 goto out;
736 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 734 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
737 if (r) 735 if (r)
738 goto out; 736 goto out;
739 break; 737 break;
740 } 738 }
741 case KVM_DEASSIGN_DEV_IRQ: { 739 case KVM_DEASSIGN_DEV_IRQ: {
742 struct kvm_assigned_irq assigned_irq; 740 struct kvm_assigned_irq assigned_irq;
743 741
744 r = -EFAULT; 742 r = -EFAULT;
745 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 743 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
746 goto out; 744 goto out;
747 r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); 745 r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
748 if (r) 746 if (r)
749 goto out; 747 goto out;
750 break; 748 break;
751 } 749 }
752 #endif 750 #endif
753 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 751 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
754 case KVM_DEASSIGN_PCI_DEVICE: { 752 case KVM_DEASSIGN_PCI_DEVICE: {
755 struct kvm_assigned_pci_dev assigned_dev; 753 struct kvm_assigned_pci_dev assigned_dev;
756 754
757 r = -EFAULT; 755 r = -EFAULT;
758 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 756 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
759 goto out; 757 goto out;
760 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); 758 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
761 if (r) 759 if (r)
762 goto out; 760 goto out;
763 break; 761 break;
764 } 762 }
765 #endif 763 #endif
766 #ifdef KVM_CAP_IRQ_ROUTING 764 #ifdef KVM_CAP_IRQ_ROUTING
767 case KVM_SET_GSI_ROUTING: { 765 case KVM_SET_GSI_ROUTING: {
768 struct kvm_irq_routing routing; 766 struct kvm_irq_routing routing;
769 struct kvm_irq_routing __user *urouting; 767 struct kvm_irq_routing __user *urouting;
770 struct kvm_irq_routing_entry *entries; 768 struct kvm_irq_routing_entry *entries;
771 769
772 r = -EFAULT; 770 r = -EFAULT;
773 if (copy_from_user(&routing, argp, sizeof(routing))) 771 if (copy_from_user(&routing, argp, sizeof(routing)))
774 goto out; 772 goto out;
775 r = -EINVAL; 773 r = -EINVAL;
776 if (routing.nr >= KVM_MAX_IRQ_ROUTES) 774 if (routing.nr >= KVM_MAX_IRQ_ROUTES)
777 goto out; 775 goto out;
778 if (routing.flags) 776 if (routing.flags)
779 goto out; 777 goto out;
780 r = -ENOMEM; 778 r = -ENOMEM;
781 entries = vmalloc(routing.nr * sizeof(*entries)); 779 entries = vmalloc(routing.nr * sizeof(*entries));
782 if (!entries) 780 if (!entries)
783 goto out; 781 goto out;
784 r = -EFAULT; 782 r = -EFAULT;
785 urouting = argp; 783 urouting = argp;
786 if (copy_from_user(entries, urouting->entries, 784 if (copy_from_user(entries, urouting->entries,
787 routing.nr * sizeof(*entries))) 785 routing.nr * sizeof(*entries)))
788 goto out_free_irq_routing; 786 goto out_free_irq_routing;
789 r = kvm_set_irq_routing(kvm, entries, routing.nr, 787 r = kvm_set_irq_routing(kvm, entries, routing.nr,
790 routing.flags); 788 routing.flags);
791 out_free_irq_routing: 789 out_free_irq_routing:
792 vfree(entries); 790 vfree(entries);
793 break; 791 break;
794 } 792 }
795 #endif /* KVM_CAP_IRQ_ROUTING */ 793 #endif /* KVM_CAP_IRQ_ROUTING */
796 #ifdef __KVM_HAVE_MSIX 794 #ifdef __KVM_HAVE_MSIX
797 case KVM_ASSIGN_SET_MSIX_NR: { 795 case KVM_ASSIGN_SET_MSIX_NR: {
798 struct kvm_assigned_msix_nr entry_nr; 796 struct kvm_assigned_msix_nr entry_nr;
799 r = -EFAULT; 797 r = -EFAULT;
800 if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) 798 if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
801 goto out; 799 goto out;
802 r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); 800 r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
803 if (r) 801 if (r)
804 goto out; 802 goto out;
805 break; 803 break;
806 } 804 }
807 case KVM_ASSIGN_SET_MSIX_ENTRY: { 805 case KVM_ASSIGN_SET_MSIX_ENTRY: {
808 struct kvm_assigned_msix_entry entry; 806 struct kvm_assigned_msix_entry entry;
809 r = -EFAULT; 807 r = -EFAULT;
810 if (copy_from_user(&entry, argp, sizeof entry)) 808 if (copy_from_user(&entry, argp, sizeof entry))
811 goto out; 809 goto out;
812 r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); 810 r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
813 if (r) 811 if (r)
814 goto out; 812 goto out;
815 break; 813 break;
816 } 814 }
817 #endif 815 #endif
818 } 816 }
819 out: 817 out:
820 return r; 818 return r;
821 } 819 }
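
For the KVM_SET_GSI_ROUTING case above, routing.flags must be zero and routing.nr is bounded by KVM_MAX_IRQ_ROUTES; the ioctl installs a complete routing table, so real callers resubmit every entry rather than appending one. A hedged single-entry sketch, with the GSI and MSI address/data values as placeholders:

/*
 * Hedged sketch: route one MSI message to a guest GSI.  In practice the
 * whole table is rebuilt and submitted, not just this entry.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <string.h>

static int route_msi(int vm_fd, __u32 gsi, __u32 addr_lo, __u32 addr_hi, __u32 data)
{
        struct kvm_irq_routing *r;
        int ret;

        r = calloc(1, sizeof(*r) + sizeof(r->entries[0]));
        if (!r)
                return -1;
        r->nr = 1;
        r->flags = 0;                           /* must be zero, as checked above */
        r->entries[0].gsi = gsi;
        r->entries[0].type = KVM_IRQ_ROUTING_MSI;
        r->entries[0].u.msi.address_lo = addr_lo;
        r->entries[0].u.msi.address_hi = addr_hi;
        r->entries[0].u.msi.data = data;

        ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, r);
        free(r);
        return ret;
}
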
822 820
823 821