mmu.c 59.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
2 3 4 5
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */
6 7 8 9

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
10
#include <linux/hugetlb.h>
11
#include <linux/sched/signal.h>
12
#include <trace/events/kvm.h>
13
#include <asm/pgalloc.h>
14
#include <asm/cacheflush.h>
15 16
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
17
#include <asm/kvm_pgtable.h>
18
#include <asm/kvm_ras.h>
19
#include <asm/kvm_asm.h>
20
#include <asm/kvm_emulate.h>
21
#include <asm/virt.h>
22 23

#include "trace.h"
24

25
static struct kvm_pgtable *hyp_pgtable;
26 27
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

28 29 30
static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;
31

32
static unsigned long __ro_after_init io_map_base;
33

34 35
/*
 * Return the end of the current chunk: the next @size-aligned boundary
 * after @addr, or @end, whichever comes first.  Subtracting 1 from both
 * sides keeps the comparison correct when @end wraps to 0 (i.e. the top
 * of the IPA space).
 */
static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
					   phys_addr_t size)
{
	phys_addr_t boundary = ALIGN_DOWN(addr + size, size);

	return (boundary - 1 < end - 1) ? boundary : end;
}
41

42 43 44 45 46 47 48
/* Chunk a stage-2 range by the smallest supported block mapping size. */
static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
	phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL);

	return __stage2_range_addr_end(addr, end, size);
}

49 50 51 52 53 54 55
/*
 * Release kvm_mmu_lock periodically if the memory region is large. Otherwise,
 * we may see kernel panics with CONFIG_DETECT_HUNG_TASK,
 * CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP. Additionally, holding the lock too
 * long will also starve other vCPUs. We have to also make sure that the page
 * tables are not freed while we released the lock.
 */
56
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
57 58 59 60
			      phys_addr_t end,
			      int (*fn)(struct kvm_pgtable *, u64, u64),
			      bool resched)
{
61
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
62 63 64 65
	int ret;
	u64 next;

	do {
66
		struct kvm_pgtable *pgt = mmu->pgt;
67 68 69
		if (!pgt)
			return -EINVAL;

70
		next = stage2_range_addr_end(addr, end);
71 72 73 74 75
		ret = fn(pgt, addr, next - addr);
		if (ret)
			break;

		if (resched && next != end)
76
			cond_resched_rwlock_write(&kvm->mmu_lock);
77 78 79 80 81
	} while (addr = next, addr != end);

	return ret;
}

82 83
/* Convenience wrapper: apply @fn over the range, allowing rescheduling. */
#define stage2_apply_range_resched(mmu, addr, end, fn)			\
	stage2_apply_range(mmu, addr, end, fn, true)
84

85 86 87 88 89 90 91 92 93 94
/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
	int n = 0;

	/* Level-1 blocks only exist when the minimum block level allows. */
	if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
		n += DIV_ROUND_UP(range, PUD_SIZE);
	n += DIV_ROUND_UP(range, PMD_SIZE);
	return n;
}

/*
 * Decide whether the eager-split loop should drop the mmu_lock: either a
 * reschedule is pending, or the split cache no longer holds enough pages
 * to split the next chunk.
 */
static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
	struct kvm_mmu_memory_cache *cache;
	u64 chunk_size, min;

	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
		return true;

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	min = kvm_mmu_split_nr_page_tables(chunk_size);
	cache = &kvm->arch.mmu.split_page_cache;
	return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
}

/*
 * Eagerly split huge mappings in [addr, end) into PAGE_SIZE PTEs,
 * chunk_size bytes at a time.  Called with the mmu_lock held for write;
 * the lock is transiently dropped to top up the page cache or reschedule.
 */
static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
				    phys_addr_t end)
{
	struct kvm_mmu_memory_cache *cache;
	struct kvm_pgtable *pgt;
	int ret, cache_capacity;
	u64 next, chunk_size;

	lockdep_assert_held_write(&kvm->mmu_lock);

	chunk_size = kvm->arch.mmu.split_page_chunk_size;
	cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size);

	/* A chunk size of 0 means eager splitting is disabled. */
	if (chunk_size == 0)
		return 0;

	cache = &kvm->arch.mmu.split_page_cache;

	do {
		if (need_split_memcache_topup_or_resched(kvm)) {
			write_unlock(&kvm->mmu_lock);
			cond_resched();
			/* Eager page splitting is best-effort. */
			ret = __kvm_mmu_topup_memory_cache(cache,
							   cache_capacity,
							   cache_capacity);
			write_lock(&kvm->mmu_lock);
			if (ret)
				break;
		}

		/* Re-read pgt: it may have gone away while unlocked. */
		pgt = kvm->arch.mmu.pgt;
		if (!pgt)
			return -EINVAL;

		next = __stage2_range_addr_end(addr, end, chunk_size);
		ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache);
		if (ret)
			break;
	} while (addr = next, addr != end);

	return ret;
}

158 159 160
/* Dirty logging is in effect iff the slot has a bitmap and is writable. */
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
164
 * kvm_arch_flush_remote_tlbs() - flush all VM TLB entries for v7/8
165 166 167 168
 * @kvm:	pointer to kvm structure.
 *
 * Interface to HYP function to flush all VM TLB entries
 */
169
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
170
{
171
	kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
172
	return 0;
173
}
174

175 176 177 178 179 180 181 182
/* Invalidate stage-2 TLB entries covering @nr_pages guest pages from @gfn. */
int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
				      gfn_t gfn, u64 nr_pages)
{
	kvm_tlb_flush_vmid_range(&kvm->arch.mmu,
				gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
	return 0;
}

183 184
/* A PFN is "device" memory if it is not covered by the kernel linear map. */
static bool kvm_is_device_pfn(unsigned long pfn)
{
	return !pfn_is_map_memory(pfn);
}

188 189 190
/* Pull a pre-zeroed page from the per-vCPU memory cache and account it. */
static void *stage2_memcache_zalloc_page(void *arg)
{
	struct kvm_mmu_memory_cache *mc = arg;
	void *virt;

	/* Allocated with __GFP_ZERO, so no need to zero */
	virt = kvm_mmu_memory_cache_alloc(mc);
	if (virt)
		kvm_account_pgtable_pages(virt, 1);
	return virt;
}

/* Allocate zeroed, physically contiguous pages, charged to the caller. */
static void *kvm_host_zalloc_pages_exact(size_t size)
{
	return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
}

205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
/* As kvm_host_zalloc_pages_exact(), additionally accounted as pgtable pages. */
static void *kvm_s2_zalloc_pages_exact(size_t size)
{
	void *virt = kvm_host_zalloc_pages_exact(size);

	if (virt)
		kvm_account_pgtable_pages(virt, (size >> PAGE_SHIFT));
	return virt;
}

/* Undo the page-table accounting, then free the pages. */
static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
	kvm_account_pgtable_pages(virt, -(size >> PAGE_SHIFT));
	free_pages_exact(virt, size);
}

220 221
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

222
static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
223 224 225
{
	struct page *page = container_of(head, struct page, rcu_head);
	void *pgtable = page_to_virt(page);
226
	s8 level = page_private(page);
227

228
	kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level);
229 230
}

231
/*
 * Defer freeing an unlinked table until after an RCU grace period, as
 * concurrent (lockless) walkers may still hold references into it.
 */
static void stage2_free_unlinked_table(void *addr, s8 level)
{
	struct page *page = virt_to_page(addr);

	set_page_private(page, (unsigned long)level);
	call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

239 240 241 242 243 244 245 246 247 248
/* Take a reference on the page backing @addr. */
static void kvm_host_get_page(void *addr)
{
	get_page(virt_to_page(addr));
}

/* Drop a reference on the page backing @addr. */
static void kvm_host_put_page(void *addr)
{
	put_page(virt_to_page(addr));
}

249 250 251 252 253 254 255 256 257
/*
 * Drop a reference on a stage-2 page-table page, un-accounting it when
 * this is the final reference.
 */
static void kvm_s2_put_page(void *addr)
{
	struct page *p = virt_to_page(addr);
	/* Dropping last refcount, the page will be freed */
	if (page_count(p) == 1)
		kvm_account_pgtable_pages(addr, -1);
	put_page(p);
}

258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
/* Current refcount of the page backing @addr. */
static int kvm_host_page_count(void *addr)
{
	return page_count(virt_to_page(addr));
}

/* Linear-map virtual address to physical address. */
static phys_addr_t kvm_host_pa(void *addr)
{
	return __pa(addr);
}

/* Physical address to linear-map virtual address. */
static void *kvm_host_va(phys_addr_t phys)
{
	return __va(phys);
}

273 274 275 276 277 278 279 280 281 282
/* Clean the data cache to PoC for a page about to be mapped into a guest. */
static void clean_dcache_guest_page(void *va, size_t size)
{
	__clean_dcache_guest_page(va, size);
}

/* Invalidate the instruction cache for a page mapped executable to a guest. */
static void invalidate_icache_guest_page(void *va, size_t size)
{
	__invalidate_icache_guest_page(va, size);
}

283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299
/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
307
/**
308
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
309
 * @mmu:   The KVM stage-2 MMU pointer
310 311
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
312
 * @may_block: Whether or not we are permitted to block
313 314 315 316 317 318
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 */
319 320
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
				 bool may_block)
321
{
322
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
323
	phys_addr_t end = start + size;
324

325
	lockdep_assert_held_write(&kvm->mmu_lock);
326
	WARN_ON(size & ~PAGE_MASK);
327
	WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap,
328
				   may_block));
329 330
}

331
/* Unmap a stage-2 range, allowing the walk to block/resched between chunks. */
void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
{
	__unmap_stage2_range(mmu, start, size, true);
}

336 337 338 339 340
/* Clean+invalidate the data cache for a stage-2 mapped range. */
void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
	stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_flush);
}

341 342 343 344 345 346
static void stage2_flush_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t end = addr + PAGE_SIZE * memslot->npages;

347
	kvm_stage2_flush_range(&kvm->arch.mmu, addr, end);
348 349 350 351 352 353 354 355 356
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM.
 */
357
static void stage2_flush_vm(struct kvm *kvm)
358 359 360
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
361
	int idx, bkt;
362 363

	idx = srcu_read_lock(&kvm->srcu);
364
	write_lock(&kvm->mmu_lock);
365 366

	slots = kvm_memslots(kvm);
367
	kvm_for_each_memslot(memslot, bkt, slots)
368 369
		stage2_flush_memslot(kvm, memslot);

370 371
	kvm_nested_s2_flush(kvm);

372
	write_unlock(&kvm->mmu_lock);
373 374 375
	srcu_read_unlock(&kvm->srcu, idx);
}

376
/**
377
 * free_hyp_pgds - free Hyp-mode page tables
378
 */
379
void __init free_hyp_pgds(void)
380
{
381
	mutex_lock(&kvm_hyp_pgd_mutex);
382 383 384
	if (hyp_pgtable) {
		kvm_pgtable_hyp_destroy(hyp_pgtable);
		kfree(hyp_pgtable);
385
		hyp_pgtable = NULL;
386
	}
387 388 389
	mutex_unlock(&kvm_hyp_pgd_mutex);
}

390 391
static bool kvm_host_owns_hyp_mappings(void)
{
392 393 394
	if (is_kernel_in_hyp_mode())
		return false;

395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410
	if (static_branch_likely(&kvm_protected_mode_initialized))
		return false;

	/*
	 * This can happen at boot time when __create_hyp_mappings() is called
	 * after the hyp protection has been enabled, but the static key has
	 * not been flipped yet.
	 */
	if (!hyp_pgtable && is_protected_kvm_enabled())
		return false;

	WARN_ON(!hyp_pgtable);

	return true;
}

411 412
int __create_hyp_mappings(unsigned long start, unsigned long size,
			  unsigned long phys, enum kvm_pgtable_prot prot)
413
{
414
	int err;
415

416 417
	if (WARN_ON(!kvm_host_owns_hyp_mappings()))
		return -EINVAL;
418

419
	mutex_lock(&kvm_hyp_pgd_mutex);
420
	err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
421
	mutex_unlock(&kvm_hyp_pgd_mutex);
422

423 424 425
	return err;
}

426 427 428 429 430 431 432 433 434 435 436
/*
 * Translate a kernel virtual address to a physical address, handling both
 * linear-map and vmalloc addresses.
 */
static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
	if (!is_vmalloc_addr(kaddr)) {
		BUG_ON(!virt_addr_valid(kaddr));
		return __pa(kaddr);
	} else {
		return page_to_phys(vmalloc_to_page(kaddr)) +
		       offset_in_page(kaddr);
	}
}

437 438 439 440 441 442 443 444 445 446 447
/*
 * Refcounted record of a PFN shared with the hypervisor, kept in an rbtree
 * keyed by PFN so each page is shared/unshared with EL2 exactly once.
 */
struct hyp_shared_pfn {
	u64 pfn;		/* the shared page frame number (tree key) */
	int count;		/* number of outstanding shares */
	struct rb_node node;
};

static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

/*
 * Look up @pfn in the shared-PFN tree.  On a miss, returns NULL and leaves
 * *node/*parent pointing at the insertion slot for rb_link_node().
 * Must be called with hyp_shared_pfns_lock held.
 */
static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
					      struct rb_node **parent)
{
	struct hyp_shared_pfn *this;

	*node = &hyp_shared_pfns.rb_node;
	*parent = NULL;
	while (**node) {
		this = container_of(**node, struct hyp_shared_pfn, node);
		*parent = **node;
		if (this->pfn < pfn)
			*node = &((**node)->rb_left);
		else if (this->pfn > pfn)
			*node = &((**node)->rb_right);
		else
			return this;
	}

	return NULL;
}

/*
 * Share @pfn with the hypervisor, or bump the refcount if it is already
 * shared.  The hypercall is only issued on the first share.
 */
static int share_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (this) {
		this->count++;
		goto unlock;
	}

	this = kzalloc(sizeof(*this), GFP_KERNEL);
	if (!this) {
		ret = -ENOMEM;
		goto unlock;
	}

	this->pfn = pfn;
	this->count = 1;
	rb_link_node(&this->node, parent, node);
	rb_insert_color(&this->node, &hyp_shared_pfns);
	ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

497
/*
 * Drop one share of @pfn; the unshare hypercall is only issued once the
 * last reference goes away.
 */
static int unshare_pfn_hyp(u64 pfn)
{
	struct rb_node **node, *parent;
	struct hyp_shared_pfn *this;
	int ret = 0;

	mutex_lock(&hyp_shared_pfns_lock);
	this = find_shared_pfn(pfn, &node, &parent);
	if (WARN_ON(!this)) {
		ret = -ENOENT;
		goto unlock;
	}

	this->count--;
	if (this->count)
		goto unlock;

	rb_erase(&this->node, &hyp_shared_pfns);
	kfree(this);
	ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
unlock:
	mutex_unlock(&hyp_shared_pfns_lock);

	return ret;
}

523 524
int kvm_share_hyp(void *from, void *to)
{
525 526
	phys_addr_t start, end, cur;
	u64 pfn;
527 528
	int ret;

529 530 531 532 533 534 535 536 537 538 539 540 541 542
	if (is_kernel_in_hyp_mode())
		return 0;

	/*
	 * The share hcall maps things in the 'fixed-offset' region of the hyp
	 * VA space, so we can only share physically contiguous data-structures
	 * for now.
	 */
	if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
		return -EINVAL;

	if (kvm_host_owns_hyp_mappings())
		return create_hyp_mappings(from, to, PAGE_HYP);

543 544 545 546 547
	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		ret = share_pfn_hyp(pfn);
548 549 550 551 552 553 554
		if (ret)
			return ret;
	}

	return 0;
}

555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570
/*
 * Undo kvm_share_hyp() for [from, to).  A no-op for VHE and for host-owned
 * hyp mappings (those persist for the lifetime of the kernel).
 */
void kvm_unshare_hyp(void *from, void *to)
{
	phys_addr_t start, end, cur;
	u64 pfn;

	if (is_kernel_in_hyp_mode() || kvm_host_owns_hyp_mappings() || !from)
		return;

	start = ALIGN_DOWN(__pa(from), PAGE_SIZE);
	end = PAGE_ALIGN(__pa(to));
	for (cur = start; cur < end; cur += PAGE_SIZE) {
		pfn = __phys_to_pfn(cur);
		WARN_ON(unshare_pfn_hyp(pfn));
	}
}

571
/**
572
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
573 574
 * @from:	The virtual kernel start address of the range
 * @to:		The virtual kernel end address of the range (exclusive)
575
 * @prot:	The protection to be applied to this range
576
 *
577 578 579
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
580
 */
581
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
582
{
583 584
	phys_addr_t phys_addr;
	unsigned long virt_addr;
585 586
	unsigned long start = kern_hyp_va((unsigned long)from);
	unsigned long end = kern_hyp_va((unsigned long)to);
587

588 589 590
	if (is_kernel_in_hyp_mode())
		return 0;

591 592
	if (!kvm_host_owns_hyp_mappings())
		return -EPERM;
593

594 595
	start = start & PAGE_MASK;
	end = PAGE_ALIGN(end);
596

597 598
	for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) {
		int err;
599

600
		phys_addr = kvm_kaddr_to_phys(from + virt_addr - start);
601
		err = __create_hyp_mappings(virt_addr, PAGE_SIZE, phys_addr,
602
					    prot);
603 604 605 606 607
		if (err)
			return err;
	}

	return 0;
608 609
}

610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628
/*
 * Commit @base as the new io_map_base, i.e. reserve [base, old io_map_base)
 * as a private hyp VA range.  Caller holds kvm_hyp_pgd_mutex.
 */
static int __hyp_alloc_private_va_range(unsigned long base)
{
	lockdep_assert_held(&kvm_hyp_pgd_mutex);

	if (!PAGE_ALIGNED(base))
		return -EINVAL;

	/*
	 * Verify that BIT(VA_BITS - 1) hasn't been flipped by
	 * allocating the new area, as it would indicate we've
	 * overflowed the idmap/IO address range.
	 */
	if ((base ^ io_map_base) & BIT(VA_BITS - 1))
		return -ENOMEM;

	io_map_base = base;

	return 0;
}
629 630 631 632 633 634 635 636 637 638 639 640

/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:	The size of the VA range to reserve.
 * @haddr:	The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated below io_map_base
 * and aligned based on the order of @size.
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
	unsigned long base;
	int ret = 0;

	mutex_lock(&kvm_hyp_pgd_mutex);

	/*
	 * This assumes that we have enough space below the idmap
	 * page to allocate our VAs. If not, the check in
	 * __hyp_alloc_private_va_range() will kick. A potential
	 * alternative would be to detect that overflow and switch
	 * to an allocation above the idmap.
	 *
	 * The allocated size is always a multiple of PAGE_SIZE.
	 */
	size = PAGE_ALIGN(size);
	base = io_map_base - size;
	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (!ret)
		*haddr = base;

	return ret;
}

/*
 * Allocate a private hyp VA range and map @phys_addr into it with @prot.
 * With pKVM the mapping is delegated to EL2 via hypercall.  On success,
 * *haddr holds the hyp VA of @phys_addr (page offset preserved).
 */
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
					unsigned long *haddr,
					enum kvm_pgtable_prot prot)
{
	unsigned long addr;
	int ret = 0;

	if (!kvm_host_owns_hyp_mappings()) {
		addr = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
					 phys_addr, size, prot);
		if (IS_ERR_VALUE(addr))
			return addr;
		*haddr = addr;

		return 0;
	}

	size = PAGE_ALIGN(size + offset_in_page(phys_addr));
	ret = hyp_alloc_private_va_range(size, &addr);
	if (ret)
		return ret;

	ret = __create_hyp_mappings(addr, size, phys_addr, prot);
	if (ret)
		return ret;

	*haddr = addr + offset_in_page(phys_addr);
	return ret;
}

698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739
/*
 * create_hyp_stack - map a per-CPU hyp stack page with an unbacked guard
 * page below it.  @phys_addr is the stack page; *haddr receives the hyp VA
 * of the stack top.
 */
int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
	unsigned long base;
	size_t size;
	int ret;

	mutex_lock(&kvm_hyp_pgd_mutex);
	/*
	 * Efficient stack verification using the PAGE_SHIFT bit implies
	 * an alignment of our allocation on the order of the size.
	 */
	size = PAGE_SIZE * 2;
	base = ALIGN_DOWN(io_map_base - size, size);

	ret = __hyp_alloc_private_va_range(base);

	mutex_unlock(&kvm_hyp_pgd_mutex);

	if (ret) {
		kvm_err("Cannot allocate hyp stack guard page\n");
		return ret;
	}

	/*
	 * Since the stack grows downwards, map the stack to the page
	 * at the higher address and leave the lower guard page
	 * unbacked.
	 *
	 * Any valid stack address now has the PAGE_SHIFT bit as 1
	 * and addresses corresponding to the guard page have the
	 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
	 */
	ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr,
				    PAGE_HYP);
	if (ret)
		kvm_err("Cannot map hyp stack\n");

	*haddr = base + size;

	return ret;
}

740 741 742 743 744 745 746 747 748 749 750 751 752 753
/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @kaddr:	Kernel VA for this mapping
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
			   void __iomem **kaddr,
			   void __iomem **haddr)
{
	unsigned long addr;
	int ret;

	if (is_protected_kvm_enabled())
		return -EPERM;

	*kaddr = ioremap(phys_addr, size);
	if (!*kaddr)
		return -ENOMEM;

	/* With VHE the kernel mapping is directly usable at EL2. */
	if (is_kernel_in_hyp_mode()) {
		*haddr = *kaddr;
		return 0;
	}

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_DEVICE);
	if (ret) {
		iounmap(*kaddr);
		*kaddr = NULL;
		*haddr = NULL;
		return ret;
	}

	*haddr = (void __iomem *)addr;
	return 0;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:	The physical start address which gets mapped
 * @size:	Size of the region being mapped
 * @haddr:	HYP VA for this mapping
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
			     void **haddr)
{
	unsigned long addr;
	int ret;

	BUG_ON(is_kernel_in_hyp_mode());

	ret = __create_hyp_private_mapping(phys_addr, size,
					   &addr, PAGE_HYP_EXEC);
	if (ret) {
		*haddr = NULL;
		return ret;
	}

	*haddr = (void *)addr;
	return 0;
}

804 805 806 807 808 809 810 811
/* Minimal mm_ops for read-only walks of the userspace (stage-1) tables. */
static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
	/* We shouldn't need any other callback to walk the PT */
	.phys_to_virt		= kvm_host_va,
};

/*
 * Walk the userspace page tables for @addr and return the size (in bytes)
 * of the mapping at that address, or a negative error: -EAGAIN if the PTE
 * is not valid (replay the fault), -EFAULT on a malformed walk.
 */
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
	struct kvm_pgtable pgt = {
		.pgd		= (kvm_pteref_t)kvm->mm->pgd,
		.ia_bits	= vabits_actual,
		.start_level	= (KVM_PGTABLE_LAST_LEVEL -
				   ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
		.mm_ops		= &kvm_user_mm_ops,
	};
	unsigned long flags;
	kvm_pte_t pte = 0;	/* Keep GCC quiet... */
	s8 level = S8_MAX;
	int ret;

	/*
	 * Disable IRQs so that we hazard against a concurrent
	 * teardown of the userspace page tables (which relies on
	 * IPI-ing threads).
	 */
	local_irq_save(flags);
	ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level);
	local_irq_restore(flags);

	if (ret)
		return ret;

	/*
	 * Not seeing an error, but not updating level? Something went
	 * deeply wrong...
	 */
	if (WARN_ON(level > KVM_PGTABLE_LAST_LEVEL))
		return -EFAULT;
	if (WARN_ON(level < KVM_PGTABLE_FIRST_LEVEL))
		return -EFAULT;

	/* Oops, the userspace PTs are gone... Replay the fault */
	if (!kvm_pte_valid(pte))
		return -EAGAIN;

	return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}

851 852
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
	.zalloc_page		= stage2_memcache_zalloc_page,
853 854
	.zalloc_pages_exact	= kvm_s2_zalloc_pages_exact,
	.free_pages_exact	= kvm_s2_free_pages_exact,
855
	.free_unlinked_table	= stage2_free_unlinked_table,
856
	.get_page		= kvm_host_get_page,
857
	.put_page		= kvm_s2_put_page,
858 859 860
	.page_count		= kvm_host_page_count,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
861 862
	.dcache_clean_inval_poc	= clean_dcache_guest_page,
	.icache_inval_pou	= invalidate_icache_guest_page,
863 864
};

865
static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
866
{
867 868 869 870 871 872 873 874
	u32 kvm_ipa_limit = get_kvm_ipa_limit();
	u64 mmfr0, mmfr1;
	u32 phys_shift;

	if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
		return -EINVAL;

	phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
875 876 877
	if (is_protected_kvm_enabled()) {
		phys_shift = kvm_ipa_limit;
	} else if (phys_shift) {
878 879 880 881 882 883 884 885 886 887 888 889 890 891
		if (phys_shift > kvm_ipa_limit ||
		    phys_shift < ARM64_MIN_PARANGE_BITS)
			return -EINVAL;
	} else {
		phys_shift = KVM_PHYS_SHIFT;
		if (phys_shift > kvm_ipa_limit) {
			pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
				     current->comm);
			return -EINVAL;
		}
	}

	mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
	mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
892
	mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
893

894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926
	return 0;
}

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:	The pointer to the KVM structure
 * @mmu:	The pointer to the s2 MMU structure
 * @type:	The machine type of the virtual machine
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called in two cases:
 *
 * - when the VM is created, which can't race against anything
 *
 * - when secondary kvm_s2_mmu structures are initialised for NV
 *   guests, and the caller must hold kvm->lock as this is called on a
 *   per-vcpu basis.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
	int cpu, err;
	struct kvm_pgtable *pgt;

	/*
	 * If we already have our page tables in place, and that the
	 * MMU context is the canonical one, we have a bug somewhere,
	 * as this is only supposed to ever happen once per VM.
	 *
	 * Otherwise, we're building nested page tables, and that's
	 * probably because userspace called KVM_ARM_VCPU_INIT more
	 * than once on the same vcpu. Since that's actually legal,
	 * don't kick a fuss and leave gracefully.
	 */
927
	if (mmu->pgt != NULL) {
928 929 930
		if (kvm_is_nested_s2_mmu(kvm, mmu))
			return 0;

931 932 933 934
		kvm_err("kvm_arch already initialized?\n");
		return -EINVAL;
	}

935 936 937 938
	err = kvm_init_ipa_range(mmu, type);
	if (err)
		return err;

939
	pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
940
	if (!pgt)
941 942
		return -ENOMEM;

943 944
	mmu->arch = &kvm->arch;
	err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops);
945 946
	if (err)
		goto out_free_pgtable;
947

948 949
	mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
	if (!mmu->last_vcpu_ran) {
950 951
		err = -ENOMEM;
		goto out_destroy_pgtable;
952 953 954 955 956
	}

	for_each_possible_cpu(cpu)
		*per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

957 958 959 960
	 /* The eager page splitting is disabled by default */
	mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
	mmu->split_page_cache.gfp_zero = __GFP_ZERO;

961 962
	mmu->pgt = pgt;
	mmu->pgd_phys = __pa(pgt->pgd);
963 964 965 966

	if (kvm_is_nested_s2_mmu(kvm, mmu))
		kvm_init_nested_s2_mmu(mmu);

967
	return 0;
968 969 970 971 972 973

out_destroy_pgtable:
	kvm_pgtable_stage2_destroy(pgt);
out_free_pgtable:
	kfree(pgt);
	return err;
974 975
}

976 977 978
/* Tear down the canonical stage-2 MMU and its eager-split page cache. */
void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
	kvm_free_stage2_pgd(&kvm->arch.mmu);
	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002
static void stage2_unmap_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *memslot)
{
	hva_t hva = memslot->userspace_addr;
	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = PAGE_SIZE * memslot->npages;
	hva_t reg_end = hva + size;

	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
	 * between them, so iterate over all of them to find out if we should
	 * unmap any of them.
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
1003
		struct vm_area_struct *vma;
1004 1005
		hva_t vm_start, vm_end;

1006 1007
		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
1008 1009 1010 1011 1012 1013 1014 1015 1016 1017
			break;

		/*
		 * Take the intersection of this VMA with the memory region
		 */
		vm_start = max(hva, vma->vm_start);
		vm_end = min(reg_end, vma->vm_end);

		if (!(vma->vm_flags & VM_PFNMAP)) {
			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
1018
			kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start);
1019 1020 1021 1022 1023 1024 1025 1026 1027
		}
		hva = vm_end;
	} while (hva < reg_end);
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
1028
 * Go through the memregions and unmap any regular RAM
1029 1030 1031 1032 1033 1034
 * backing memory already mapped to the VM.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
1035
	int idx, bkt;
1036 1037

	idx = srcu_read_lock(&kvm->srcu);
1038
	mmap_read_lock(current->mm);
1039
	write_lock(&kvm->mmu_lock);
1040 1041

	slots = kvm_memslots(kvm);
1042
	kvm_for_each_memslot(memslot, bkt, slots)
1043 1044
		stage2_unmap_memslot(kvm, memslot);

1045 1046
	kvm_nested_s2_unmap(kvm);

1047
	write_unlock(&kvm->mmu_lock);
1048
	mmap_read_unlock(current->mm);
1049 1050 1051
	srcu_read_unlock(&kvm->srcu, idx);
}

1052
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
1053
{
1054
	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
1055
	struct kvm_pgtable *pgt = NULL;
1056

1057
	write_lock(&kvm->mmu_lock);
1058 1059 1060 1061 1062
	pgt = mmu->pgt;
	if (pgt) {
		mmu->pgd_phys = 0;
		mmu->pgt = NULL;
		free_percpu(mmu->last_vcpu_ran);
1063
	}
1064
	write_unlock(&kvm->mmu_lock);
1065

1066 1067 1068
	if (pgt) {
		kvm_pgtable_stage2_destroy(pgt);
		kfree(pgt);
1069
	}
1070 1071
}

1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097
/* Free one page previously handed to the hyp memcache. */
static void hyp_mc_free_fn(void *addr, void *unused)
{
	free_page((unsigned long)addr);
}

/* Allocate one accounted page for the hyp memcache. */
static void *hyp_mc_alloc_fn(void *unused)
{
	return (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
}

/* Return all pages in a hyp memcache to the host.  No-op unless pKVM. */
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
	if (is_protected_kvm_enabled())
		__free_hyp_memcache(mc, hyp_mc_free_fn,
				    kvm_host_va, NULL);
}

/* Ensure a hyp memcache holds at least @min_pages pages.  No-op unless pKVM. */
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
	if (!is_protected_kvm_enabled())
		return 0;

	return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
				    kvm_host_pa, NULL);
}

1098 1099 1100 1101 1102 1103 1104
/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:	The KVM pointer
 * @guest_ipa:	The IPA at which to insert the mapping
 * @pa:		The physical address of the device
 * @size:	The size of the mapping
 * @writable:   Whether or not to create a writable mapping
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
			  phys_addr_t pa, unsigned long size, bool writable)
{
	phys_addr_t addr;
	int ret = 0;
	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
	struct kvm_pgtable *pgt = mmu->pgt;
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
				     KVM_PGTABLE_PROT_R |
				     (writable ? KVM_PGTABLE_PROT_W : 0);

	if (is_protected_kvm_enabled())
		return -EPERM;

	size += offset_in_page(guest_ipa);
	guest_ipa &= PAGE_MASK;

	for (addr = guest_ipa; addr < guest_ipa + size; addr += PAGE_SIZE) {
		/* Top up outside the lock: the allocation may sleep. */
		ret = kvm_mmu_topup_memory_cache(&cache,
						 kvm_mmu_cache_min_pages(mmu));
		if (ret)
			break;

		write_lock(&kvm->mmu_lock);
		ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot,
					     &cache, 0);
		write_unlock(&kvm->mmu_lock);
		if (ret)
			break;

		pa += PAGE_SIZE;
	}

	kvm_mmu_free_memory_cache(&cache);
	return ret;
}

1145
/**
1146
 * kvm_stage2_wp_range() - write protect stage2 memory region range
1147
 * @mmu:        The KVM stage-2 MMU pointer
1148 1149 1150
 * @addr:	Start address of range
 * @end:	End address of range
 */
1151
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
1152
{
1153
	stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect);
1154 1155 1156 1157 1158 1159 1160 1161 1162
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
1163
 * all present PUD, PMD and PTEs are write protected in the memory region.
1164 1165 1166 1167 1168
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
1169
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
1170
{
1171 1172
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
1173 1174 1175 1176 1177 1178 1179
	phys_addr_t start, end;

	if (WARN_ON_ONCE(!memslot))
		return;

	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
1180

1181
	write_lock(&kvm->mmu_lock);
1182 1183
	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
	kvm_nested_s2_wp(kvm);
1184
	write_unlock(&kvm->mmu_lock);
1185
	kvm_flush_remote_tlbs_memslot(kvm, memslot);
1186
}
1187

1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215
/**
 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
 *				   pages for memory slot
 * @kvm:	The KVM pointer
 * @slot:	The memory slot to split
 *
 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	phys_addr_t start, end;

	lockdep_assert_held(&kvm->slots_lock);

	slots = kvm_memslots(kvm);
	memslot = id_to_memslot(slots, slot);

	/* IPA range covered by this memslot */
	start = memslot->base_gfn << PAGE_SHIFT;
	end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

	write_lock(&kvm->mmu_lock);
	kvm_mmu_split_huge_pages(kvm, start, end);
	write_unlock(&kvm->mmu_lock);
}

1216
/*
1217 1218 1219 1220 1221 1222
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
 * @kvm:	The KVM pointer
 * @slot:	The memory slot associated with mask
 * @gfn_offset:	The gfn offset in memory slot
 * @mask:	The mask of pages at offset 'gfn_offset' in this memory
 *		slot to enable dirty logging on
1223
 *
1224 1225
 * Writes protect selected pages to enable dirty logging, and then
 * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock.
1226 1227 1228 1229 1230
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		gfn_t gfn_offset, unsigned long mask)
{
1231 1232 1233 1234 1235 1236
	phys_addr_t base_gfn = slot->base_gfn + gfn_offset;
	phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
	phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;

	lockdep_assert_held_write(&kvm->mmu_lock);

1237
	kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248

	/*
	 * Eager-splitting is done when manual-protect is set.  We
	 * also check for initially-all-set because we can avoid
	 * eager-splitting if initially-all-set is false.
	 * Initially-all-set equal false implies that huge-pages were
	 * already split when enabling dirty logging: no need to do it
	 * again.
	 */
	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
		kvm_mmu_split_huge_pages(kvm, start, end);
1249 1250

	kvm_nested_s2_wp(kvm);
1251 1252
}

1253
static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
1254
{
1255
	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
1256 1257
}

1258 1259 1260
static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
					       unsigned long hva,
					       unsigned long map_size)
1261
{
1262
	gpa_t gpa_start;
1263 1264 1265
	hva_t uaddr_start, uaddr_end;
	size_t size;

1266 1267 1268 1269
	/* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
	if (map_size == PAGE_SIZE)
		return true;

1270 1271 1272 1273 1274 1275 1276 1277 1278
	size = memslot->npages * PAGE_SIZE;

	gpa_start = memslot->base_gfn << PAGE_SHIFT;

	uaddr_start = memslot->userspace_addr;
	uaddr_end = uaddr_start + size;

	/*
	 * Pages belonging to memslots that don't have the same alignment
1279 1280
	 * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
	 * PMD/PUD entries, because we'll end up mapping the wrong pages.
1281 1282 1283 1284 1285
	 *
	 * Consider a layout like the following:
	 *
	 *    memslot->userspace_addr:
	 *    +-----+--------------------+--------------------+---+
1286
	 *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
1287 1288
	 *    +-----+--------------------+--------------------+---+
	 *
1289
	 *    memslot->base_gfn << PAGE_SHIFT:
1290
	 *      +---+--------------------+--------------------+-----+
1291
	 *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
1292 1293
	 *      +---+--------------------+--------------------+-----+
	 *
1294
	 * If we create those stage-2 blocks, we'll end up with this incorrect
1295 1296 1297 1298 1299
	 * mapping:
	 *   d -> f
	 *   e -> g
	 *   f -> h
	 */
1300
	if ((gpa_start & (map_size - 1)) != (uaddr_start & (map_size - 1)))
1301 1302 1303 1304
		return false;

	/*
	 * Next, let's make sure we're not trying to map anything not covered
1305 1306
	 * by the memslot. This means we have to prohibit block size mappings
	 * for the beginning and end of a non-block aligned and non-block sized
1307 1308 1309 1310 1311 1312 1313 1314
	 * memory slot (illustrated by the head and tail parts of the
	 * userspace view above containing pages 'abcde' and 'xyz',
	 * respectively).
	 *
	 * Note that it doesn't matter if we do the check using the
	 * userspace_addr or the base_gfn, as both are equally aligned (per
	 * the check above) and equally sized.
	 */
1315 1316
	return (hva & ~(map_size - 1)) >= uaddr_start &&
	       (hva & ~(map_size - 1)) + map_size <= uaddr_end;
1317 1318
}

1319 1320 1321 1322 1323 1324 1325 1326
/*
 * Check if the given hva is backed by a transparent huge page (THP) and
 * whether it can be mapped using block mapping in stage2. If so, adjust
 * the stage2 PFN and IPA accordingly. Only PMD_SIZE THPs are currently
 * supported. This will need to be updated to support other THP sizes.
 *
 * Returns the size of the mapping.
 */
1327
static long
1328
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
1329 1330 1331 1332 1333 1334 1335 1336 1337 1338
			    unsigned long hva, kvm_pfn_t *pfnp,
			    phys_addr_t *ipap)
{
	kvm_pfn_t pfn = *pfnp;

	/*
	 * Make sure the adjustment is done only for THP pages. Also make
	 * sure that the HVA and IPA are sufficiently aligned and that the
	 * block map is contained within the memslot.
	 */
1339 1340 1341 1342 1343 1344 1345 1346 1347
	if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) {
		int sz = get_user_mapping_size(kvm, hva);

		if (sz < 0)
			return sz;

		if (sz < PMD_SIZE)
			return PAGE_SIZE;

1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358
		*ipap &= PMD_MASK;
		pfn &= ~(PTRS_PER_PMD - 1);
		*pfnp = pfn;

		return PMD_SIZE;
	}

	/* Use page mapping if we cannot use block mapping. */
	return PAGE_SIZE;
}

1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387
static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
	unsigned long pa;

	if (is_vm_hugetlb_page(vma) && !(vma->vm_flags & VM_PFNMAP))
		return huge_page_shift(hstate_vma(vma));

	if (!(vma->vm_flags & VM_PFNMAP))
		return PAGE_SHIFT;

	VM_BUG_ON(is_vm_hugetlb_page(vma));

	pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
	if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PUD_SIZE) <= vma->vm_end)
		return PUD_SHIFT;
#endif

	if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
	    ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
	    ALIGN(hva, PMD_SIZE) <= vma->vm_end)
		return PMD_SHIFT;

	return PAGE_SHIFT;
}

1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398
/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to santise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 */
1399 1400
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
			      unsigned long size)
1401 1402
{
	unsigned long i, nr_pages = size >> PAGE_SHIFT;
1403
	struct page *page = pfn_to_page(pfn);
1404 1405

	if (!kvm_has_mte(kvm))
1406
		return;
1407 1408

	for (i = 0; i < nr_pages; i++, page++) {
1409
		if (try_page_mte_tagging(page)) {
1410
			mte_clear_page_tags(page_address(page));
1411
			set_page_mte_tagged(page);
1412 1413 1414 1415
		}
	}
}

1416 1417 1418
static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_MTE_ALLOWED;
1419 1420
}

1421
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
1422
			  struct kvm_s2_trans *nested,
1423
			  struct kvm_memory_slot *memslot, unsigned long hva,
1424
			  bool fault_is_perm)
1425
{
1426
	int ret = 0;
1427
	bool write_fault, writable, force_pte = false;
1428
	bool exec_fault, mte_allowed;
1429
	bool device = false, vfio_allow_any_uc = false;
1430
	unsigned long mmu_seq;
1431
	phys_addr_t ipa = fault_ipa;
1432
	struct kvm *kvm = vcpu->kvm;
1433
	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
1434
	struct vm_area_struct *vma;
1435
	short vma_shift;
1436
	gfn_t gfn;
1437
	kvm_pfn_t pfn;
1438
	bool logging_active = memslot_is_logging(memslot);
1439
	long vma_pagesize, fault_granule;
1440 1441
	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
	struct kvm_pgtable *pgt;
1442

1443 1444
	if (fault_is_perm)
		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
1445
	write_fault = kvm_is_write_fault(vcpu);
1446
	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
1447 1448
	VM_BUG_ON(write_fault && exec_fault);

1449
	if (fault_is_perm && !write_fault && !exec_fault) {
1450 1451 1452 1453
		kvm_err("Unexpected L2 read permission error\n");
		return -EFAULT;
	}

1454 1455 1456 1457 1458 1459
	/*
	 * Permission faults just need to update the existing leaf entry,
	 * and so normally don't require allocations from the memcache. The
	 * only exception to this is when dirty logging is enabled at runtime
	 * and a write fault needs to collapse a block entry into a table.
	 */
1460
	if (!fault_is_perm || (logging_active && write_fault)) {
1461
		ret = kvm_mmu_topup_memory_cache(memcache,
1462
						 kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu));
1463 1464 1465 1466
		if (ret)
			return ret;
	}

1467 1468 1469 1470
	/*
	 * Let's check if we will get back a huge page backed by hugetlbfs, or
	 * get block mapping for device MMIO region.
	 */
1471
	mmap_read_lock(current->mm);
1472
	vma = vma_lookup(current->mm, hva);
1473 1474
	if (unlikely(!vma)) {
		kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
1475
		mmap_read_unlock(current->mm);
1476 1477 1478
		return -EFAULT;
	}

1479 1480 1481 1482 1483
	/*
	 * logging_active is guaranteed to never be true for VM_PFNMAP
	 * memslots.
	 */
	if (logging_active) {
1484
		force_pte = true;
1485
		vma_shift = PAGE_SHIFT;
1486 1487
	} else {
		vma_shift = get_vma_page_shift(vma, hva);
1488 1489
	}

1490
	switch (vma_shift) {
1491
#ifndef __PAGETABLE_PMD_FOLDED
1492 1493 1494 1495
	case PUD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
			break;
		fallthrough;
1496
#endif
1497 1498 1499 1500 1501 1502 1503 1504
	case CONT_PMD_SHIFT:
		vma_shift = PMD_SHIFT;
		fallthrough;
	case PMD_SHIFT:
		if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
			break;
		fallthrough;
	case CONT_PTE_SHIFT:
1505
		vma_shift = PAGE_SHIFT;
1506 1507 1508 1509 1510 1511
		force_pte = true;
		fallthrough;
	case PAGE_SHIFT:
		break;
	default:
		WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
1512 1513
	}

1514
	vma_pagesize = 1UL << vma_shift;
1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542

	if (nested) {
		unsigned long max_map_size;

		max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;

		ipa = kvm_s2_trans_output(nested);

		/*
		 * If we're about to create a shadow stage 2 entry, then we
		 * can only create a block mapping if the guest stage 2 page
		 * table uses at least as big a mapping.
		 */
		max_map_size = min(kvm_s2_trans_size(nested), max_map_size);

		/*
		 * Be careful that if the mapping size falls between
		 * two host sizes, take the smallest of the two.
		 */
		if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
			max_map_size = PMD_SIZE;
		else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
			max_map_size = PAGE_SIZE;

		force_pte = (max_map_size == PAGE_SIZE);
		vma_pagesize = min(vma_pagesize, (long)max_map_size);
	}

1543 1544 1545 1546 1547 1548
	/*
	 * Both the canonical IPA and fault IPA must be hugepage-aligned to
	 * ensure we find the right PFN and lay down the mapping in the right
	 * place.
	 */
	if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
1549
		fault_ipa &= ~(vma_pagesize - 1);
1550 1551
		ipa &= ~(vma_pagesize - 1);
	}
1552

1553
	gfn = ipa >> PAGE_SHIFT;
1554 1555
	mte_allowed = kvm_vma_mte_allowed(vma);

1556 1557
	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;

1558 1559
	/* Don't use the VMA after the unlock -- it may have vanished */
	vma = NULL;
1560 1561

	/*
1562 1563 1564
	 * Read mmu_invalidate_seq so that KVM can detect if the results of
	 * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to
	 * acquiring kvm->mmu_lock.
1565
	 *
1566 1567
	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
	 * with the smp_wmb() in kvm_mmu_invalidate_end().
1568
	 */
1569 1570
	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	mmap_read_unlock(current->mm);
1571

1572
	pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
1573
				   write_fault, &writable, NULL);
1574
	if (pfn == KVM_PFN_ERR_HWPOISON) {
1575
		kvm_send_hwpoison_signal(hva, vma_shift);
1576 1577
		return 0;
	}
1578
	if (is_error_noslot_pfn(pfn))
1579 1580
		return -EFAULT;

1581
	if (kvm_is_device_pfn(pfn)) {
1582 1583 1584 1585 1586 1587 1588 1589 1590 1591
		/*
		 * If the page was identified as device early by looking at
		 * the VMA flags, vma_pagesize is already representing the
		 * largest quantity we can map.  If instead it was mapped
		 * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE
		 * and must not be upgraded.
		 *
		 * In both cases, we don't let transparent_hugepage_adjust()
		 * change things at the last minute.
		 */
1592 1593
		device = true;
	} else if (logging_active && !write_fault) {
1594 1595 1596 1597
		/*
		 * Only actually map the page as writable if this was a write
		 * fault.
		 */
1598
		writable = false;
1599
	}
1600

1601
	if (exec_fault && device)
1602 1603
		return -ENOEXEC;

1604 1605 1606 1607
	/*
	 * Potentially reduce shadow S2 permissions to match the guest's own
	 * S2. For exec faults, we'd only reach this point if the guest
	 * actually allowed it (see kvm_s2_handle_perm_fault).
1608 1609 1610 1611 1612 1613
	 *
	 * Also encode the level of the original translation in the SW bits
	 * of the leaf entry as a proxy for the span of that translation.
	 * This will be retrieved on TLB invalidation from the guest and
	 * used to limit the invalidation scope if a TTL hint or a range
	 * isn't provided.
1614 1615 1616 1617 1618
	 */
	if (nested) {
		writable &= kvm_s2_trans_writable(nested);
		if (!kvm_s2_trans_readable(nested))
			prot &= ~KVM_PGTABLE_PROT_R;
1619 1620

		prot |= kvm_encode_nested_level(nested);
1621 1622
	}

1623
	read_lock(&kvm->mmu_lock);
1624
	pgt = vcpu->arch.hw_mmu->pgt;
1625 1626
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		ret = -EAGAIN;
1627
		goto out_unlock;
1628
	}
1629

1630 1631 1632 1633
	/*
	 * If we are not forced to use page mapping, check if we are
	 * backed by a THP and thus use block mapping if possible.
	 */
1634
	if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
1635
		if (fault_is_perm && fault_granule > PAGE_SIZE)
1636 1637 1638 1639 1640
			vma_pagesize = fault_granule;
		else
			vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
								   hva, &pfn,
								   &fault_ipa);
1641 1642 1643 1644 1645

		if (vma_pagesize < 0) {
			ret = vma_pagesize;
			goto out_unlock;
		}
1646
	}
1647

1648
	if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
1649
		/* Check the VMM hasn't introduced a new disallowed VMA */
1650
		if (mte_allowed) {
1651 1652
			sanitise_mte_tags(kvm, pfn, vma_pagesize);
		} else {
1653 1654
			ret = -EFAULT;
			goto out_unlock;
1655
		}
1656
	}
1657

1658
	if (writable)
1659
		prot |= KVM_PGTABLE_PROT_W;
1660

1661
	if (exec_fault)
1662
		prot |= KVM_PGTABLE_PROT_X;
1663

1664 1665 1666 1667 1668
	if (device) {
		if (vfio_allow_any_uc)
			prot |= KVM_PGTABLE_PROT_NORMAL_NC;
		else
			prot |= KVM_PGTABLE_PROT_DEVICE;
1669 1670
	} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
		   (!nested || kvm_s2_trans_executable(nested))) {
1671
		prot |= KVM_PGTABLE_PROT_X;
1672
	}
1673

1674 1675 1676 1677 1678
	/*
	 * Under the premise of getting a FSC_PERM fault, we just need to relax
	 * permissions only if vma_pagesize equals fault_granule. Otherwise,
	 * kvm_pgtable_stage2_map() should be called to change block size.
	 */
1679 1680 1681 1682 1683 1684
	if (fault_is_perm && vma_pagesize == fault_granule) {
		/*
		 * Drop the SW bits in favour of those stored in the
		 * PTE, which will be preserved.
		 */
		prot &= ~KVM_NV_GUEST_MAP_SZ;
1685
		ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
1686
	} else {
1687 1688
		ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
					     __pfn_to_phys(pfn), prot,
1689 1690 1691
					     memcache,
					     KVM_PGTABLE_WALK_HANDLE_FAULT |
					     KVM_PGTABLE_WALK_SHARED);
1692 1693
	}

1694 1695
out_unlock:
	read_unlock(&kvm->mmu_lock);
1696

1697 1698 1699
	/* Mark the page dirty only if the fault is handled successfully */
	if (writable && !ret) {
		kvm_set_pfn_dirty(pfn);
1700
		mark_page_dirty_in_slot(kvm, memslot, gfn);
1701 1702
	}

1703
	kvm_release_pfn_clean(pfn);
1704
	return ret != -EAGAIN ? ret : 0;
1705 1706
}

1707
/* Resolve the access fault by making the page young again. */
1708 1709
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
1710
	kvm_pte_t pte;
1711
	struct kvm_s2_mmu *mmu;
1712 1713 1714

	trace_kvm_access_fault(fault_ipa);

1715
	read_lock(&vcpu->kvm->mmu_lock);
1716
	mmu = vcpu->arch.hw_mmu;
1717
	pte = kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa);
1718
	read_unlock(&vcpu->kvm->mmu_lock);
1719

1720 1721
	if (kvm_pte_valid(pte))
		kvm_set_pfn_accessed(kvm_pte_to_pfn(pte));
1722 1723
}

1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734
/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:	the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 */
1735
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
1736
{
1737
	struct kvm_s2_trans nested_trans, *nested = NULL;
1738
	unsigned long esr;
1739 1740
	phys_addr_t fault_ipa; /* The address we faulted on */
	phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
1741
	struct kvm_memory_slot *memslot;
1742 1743
	unsigned long hva;
	bool is_iabt, write_fault, writable;
1744 1745 1746
	gfn_t gfn;
	int ret, idx;

1747
	esr = kvm_vcpu_get_esr(vcpu);
1748

1749
	ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
1750
	is_iabt = kvm_vcpu_trap_is_iabt(vcpu);
1751

1752
	if (esr_fsc_is_translation_fault(esr)) {
1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770
		/* Beyond sanitised PARange (which is the IPA limit) */
		if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
			kvm_inject_size_fault(vcpu);
			return 1;
		}

		/* Falls between the IPA range and the PARange? */
		if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
			fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);

			if (is_iabt)
				kvm_inject_pabt(vcpu, fault_ipa);
			else
				kvm_inject_dabt(vcpu, fault_ipa);
			return 1;
		}
	}

1771
	/* Synchronous External Abort? */
1772
	if (kvm_vcpu_abt_issea(vcpu)) {
1773 1774 1775 1776
		/*
		 * For RAS the host kernel may handle this abort.
		 * There is no need to pass the error into the guest.
		 */
1777
		if (kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_esr(vcpu)))
1778
			kvm_inject_vabt(vcpu);
1779 1780

		return 1;
1781 1782
	}

Gavin Shan's avatar
Gavin Shan committed
1783
	trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
1784
			      kvm_vcpu_get_hfar(vcpu), fault_ipa);
1785 1786

	/* Check the stage-2 fault is trans. fault or write fault */
1787 1788 1789
	if (!esr_fsc_is_translation_fault(esr) &&
	    !esr_fsc_is_permission_fault(esr) &&
	    !esr_fsc_is_access_flag_fault(esr)) {
1790 1791 1792
		kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
			kvm_vcpu_trap_get_class(vcpu),
			(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
Gavin Shan's avatar
Gavin Shan committed
1793
			(unsigned long)kvm_vcpu_get_esr(vcpu));
1794 1795 1796 1797 1798
		return -EFAULT;
	}

	idx = srcu_read_lock(&vcpu->kvm->srcu);

1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834
	/*
	 * We may have faulted on a shadow stage 2 page table if we are
	 * running a nested guest.  In this case, we have to resolve the L2
	 * IPA to the L1 IPA first, before knowing what kind of memory should
	 * back the L1 IPA.
	 *
	 * If the shadow stage 2 page table walk faults, then we simply inject
	 * this to the guest and carry on.
	 *
	 * If there are no shadow S2 PTs because S2 is disabled, there is
	 * nothing to walk and we treat it as a 1:1 before going through the
	 * canonical translation.
	 */
	if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
	    vcpu->arch.hw_mmu->nested_stage2_enabled) {
		u32 esr;

		ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
		if (ret) {
			esr = kvm_s2_trans_esr(&nested_trans);
			kvm_inject_s2_fault(vcpu, esr);
			goto out_unlock;
		}

		ipa = kvm_s2_trans_output(&nested_trans);
		nested = &nested_trans;
	}

	gfn = ipa >> PAGE_SHIFT;
1835 1836
	memslot = gfn_to_memslot(vcpu->kvm, gfn);
	hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
1837
	write_fault = kvm_is_write_fault(vcpu);
1838
	if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
1839 1840 1841 1842 1843 1844
		/*
		 * The guest has put either its instructions or its page-tables
		 * somewhere it shouldn't have. Userspace won't be able to do
		 * anything about this (there's no syndrome for a start), so
		 * re-inject the abort back into the guest.
		 */
1845
		if (is_iabt) {
1846 1847
			ret = -ENOEXEC;
			goto out;
1848 1849
		}

1850
		if (kvm_vcpu_abt_iss1tw(vcpu)) {
1851 1852 1853 1854 1855
			kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
			ret = 1;
			goto out_unlock;
		}

1856 1857 1858 1859 1860 1861 1862 1863 1864 1865
		/*
		 * Check for a cache maintenance operation. Since we
		 * ended-up here, we know it is outside of any memory
		 * slot. But we can't find out if that is for a device,
		 * or if the guest is just being stupid. The only thing
		 * we know for sure is that this range cannot be cached.
		 *
		 * So let's assume that the guest is just being
		 * cautious, and skip the instruction.
		 */
1866
		if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
1867
			kvm_incr_pc(vcpu);
1868 1869 1870 1871
			ret = 1;
			goto out_unlock;
		}

1872 1873 1874 1875 1876 1877
		/*
		 * The IPA is reported as [MAX:12], so we need to
		 * complement it with the bottom 12 bits from the
		 * faulting VA. This is always 12 bits, irrespective
		 * of the page size.
		 */
1878 1879
		ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
		ret = io_mem_abort(vcpu, ipa);
1880 1881 1882
		goto out_unlock;
	}

1883
	/* Userspace should not be able to register out-of-bounds IPAs */
1884
	VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
1885

1886
	if (esr_fsc_is_access_flag_fault(esr)) {
1887 1888 1889 1890 1891
		handle_access_fault(vcpu, fault_ipa);
		ret = 1;
		goto out_unlock;
	}

1892
	ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
1893
			     esr_fsc_is_permission_fault(esr));
1894 1895
	if (ret == 0)
		ret = 1;
1896 1897 1898 1899 1900
out:
	if (ret == -ENOEXEC) {
		kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
		ret = 1;
	}
1901 1902 1903
out_unlock:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
1904 1905
}

1906
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
1907
{
1908
	if (!kvm->arch.mmu.pgt)
1909
		return false;
1910

1911 1912 1913
	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
			     (range->end - range->start) << PAGE_SHIFT,
			     range->may_block);
1914

1915
	kvm_nested_s2_unmap(kvm);
1916
	return false;
1917 1918
}

1919
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1920
{
1921
	u64 size = (range->end - range->start) << PAGE_SHIFT;
1922

1923
	if (!kvm->arch.mmu.pgt)
1924
		return false;
1925

1926 1927 1928
	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
						   range->start << PAGE_SHIFT,
						   size, true);
1929 1930 1931 1932
	/*
	 * TODO: Handle nested_mmu structures here using the reverse mapping in
	 * a later version of patch series.
	 */
1933 1934
}

1935
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
1936
{
1937 1938
	u64 size = (range->end - range->start) << PAGE_SHIFT;

1939
	if (!kvm->arch.mmu.pgt)
1940
		return false;
1941

1942 1943 1944
	return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt,
						   range->start << PAGE_SHIFT,
						   size, false);
1945 1946
}

1947 1948
phys_addr_t kvm_mmu_get_httbr(void)
{
1949
	return __pa(hyp_pgtable->pgd);
1950 1951
}

1952 1953 1954 1955 1956
phys_addr_t kvm_get_idmap_vector(void)
{
	return hyp_idmap_vector;
}

1957
static int kvm_map_idmap_text(void)
1958
{
1959 1960 1961
	unsigned long size = hyp_idmap_end - hyp_idmap_start;
	int err = __create_hyp_mappings(hyp_idmap_start, size, hyp_idmap_start,
					PAGE_HYP_EXEC);
1962 1963 1964 1965 1966 1967 1968
	if (err)
		kvm_err("Failed to idmap %lx-%lx\n",
			hyp_idmap_start, hyp_idmap_end);

	return err;
}

1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
static void *kvm_hyp_zalloc_page(void *arg)
{
	return (void *)get_zeroed_page(GFP_KERNEL);
}

/* Memory-management callbacks used when building the hyp stage-1 tables. */
static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
	.zalloc_page		= kvm_hyp_zalloc_page,
	.get_page		= kvm_host_get_page,
	.put_page		= kvm_host_put_page,
	.phys_to_virt		= kvm_host_va,
	.virt_to_phys		= kvm_host_pa,
};

1982
int __init kvm_mmu_init(u32 *hyp_va_bits)
1983
{
1984
	int err;
1985 1986
	u32 idmap_bits;
	u32 kernel_bits;
1987

1988
	hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
1989
	hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
1990
	hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
1991
	hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
1992
	hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);
1993

1994 1995 1996 1997 1998
	/*
	 * We rely on the linker script to ensure at build time that the HYP
	 * init code does not cross a page boundary.
	 */
	BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
1999

2000
	/*
2001 2002 2003
	 * The ID map is always configured for 48 bits of translation, which
	 * may be fewer than the number of VA bits used by the regular kernel
	 * stage 1, when VA_BITS=52.
2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
	 *
	 * At EL2, there is only one TTBR register, and we can't switch between
	 * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
	 * line: we need to use the extended range with *both* our translation
	 * tables.
	 *
	 * So use the maximum of the idmap VA bits and the regular kernel stage
	 * 1 VA bits to assure that the hypervisor can both ID map its code page
	 * and map any kernel memory.
	 */
2014
	idmap_bits = IDMAP_VA_BITS;
2015 2016 2017
	kernel_bits = vabits_actual;
	*hyp_va_bits = max(idmap_bits, kernel_bits);

2018
	kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
2019 2020 2021 2022
	kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
	kvm_debug("HYP VA range: %lx:%lx\n",
		  kern_hyp_va(PAGE_OFFSET),
		  kern_hyp_va((unsigned long)high_memory - 1));
2023

2024
	if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
2025
	    hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
2026
	    hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
2027 2028 2029 2030 2031 2032 2033 2034 2035
		/*
		 * The idmap page is intersecting with the VA space,
		 * it is not safe to continue further.
		 */
		kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
		err = -EINVAL;
		goto out;
	}

2036 2037 2038
	hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
	if (!hyp_pgtable) {
		kvm_err("Hyp mode page-table not allocated\n");
2039 2040 2041 2042
		err = -ENOMEM;
		goto out;
	}

2043
	err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
2044 2045
	if (err)
		goto out_free_pgtable;
2046

2047 2048 2049
	err = kvm_map_idmap_text();
	if (err)
		goto out_destroy_pgtable;
2050

2051
	io_map_base = hyp_idmap_start;
2052
	return 0;
2053 2054 2055 2056 2057 2058

out_destroy_pgtable:
	kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
	kfree(hyp_pgtable);
	hyp_pgtable = NULL;
2059 2060
out:
	return err;
2061
}
2062 2063

void kvm_arch_commit_memory_region(struct kvm *kvm,
2064
				   struct kvm_memory_slot *old,
2065
				   const struct kvm_memory_slot *new,
2066 2067
				   enum kvm_mr_change change)
{
2068 2069
	bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

2070 2071
	/*
	 * At this point memslot has been committed and there is an
2072
	 * allocated dirty_bitmap[], dirty pages will be tracked while the
2073 2074
	 * memory slot is write protected.
	 */
2075 2076 2077 2078 2079
	if (log_dirty_pages) {

		if (change == KVM_MR_DELETE)
			return;

2080
		/*
2081 2082
		 * Huge and normal pages are write-protected and split
		 * on either of these two cases:
2083 2084
		 *
		 * 1. with initial-all-set: gradually with CLEAR ioctls,
2085
		 */
2086 2087 2088 2089 2090 2091 2092 2093
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;
		/*
		 * or
		 * 2. without initial-all-set: all in one shot when
		 *    enabling dirty logging.
		 */
		kvm_mmu_wp_memory_region(kvm, new->id);
2094 2095 2096 2097 2098 2099 2100 2101 2102 2103
		kvm_mmu_split_memory_region(kvm, new->id);
	} else {
		/*
		 * Free any leftovers from the eager page splitting cache. Do
		 * this when deleting, moving, disabling dirty logging, or
		 * creating the memslot (a nop). Doing it for deletes makes
		 * sure we don't leak memory, and there's no need to keep the
		 * cache around for any of the other cases.
		 */
		kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
2104
	}
2105 2106 2107
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
2108 2109
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
2110 2111
				   enum kvm_mr_change change)
{
2112
	hva_t hva, reg_end;
2113 2114
	int ret = 0;

2115 2116
	if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
			change != KVM_MR_FLAGS_ONLY)
2117 2118
		return 0;

2119 2120 2121 2122
	/*
	 * Prevent userspace from creating a memory region outside of the IPA
	 * space addressable by the KVM guest IPA space.
	 */
2123
	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
2124 2125
		return -EFAULT;

2126 2127 2128
	hva = new->userspace_addr;
	reg_end = hva + (new->npages << PAGE_SHIFT);

2129
	mmap_read_lock(current->mm);
2130 2131
	/*
	 * A memory region could potentially cover multiple VMAs, and any holes
2132
	 * between them, so iterate over all of them.
2133 2134 2135 2136 2137 2138 2139 2140 2141
	 *
	 *     +--------------------------------------------+
	 * +---------------+----------------+   +----------------+
	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
	 * +---------------+----------------+   +----------------+
	 *     |               memory region                |
	 *     +--------------------------------------------+
	 */
	do {
2142
		struct vm_area_struct *vma;
2143

2144 2145
		vma = find_vma_intersection(current->mm, hva, reg_end);
		if (!vma)
2146 2147
			break;

2148
		if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
2149 2150 2151
			ret = -EINVAL;
			break;
		}
2152

2153
		if (vma->vm_flags & VM_PFNMAP) {
2154
			/* IO region dirty page logging not allowed */
2155
			if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
2156
				ret = -EINVAL;
2157
				break;
2158
			}
2159
		}
2160
		hva = min(reg_end, vma->vm_end);
2161 2162
	} while (hva < reg_end);

2163
	mmap_read_unlock(current->mm);
2164
	return ret;
2165 2166
}

/* No arch-specific per-memslot state to free on arm64. */
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

/* No arch-specific action needed when the memslot generation changes. */
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
2178 2179 2180
	gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
	phys_addr_t size = slot->npages << PAGE_SHIFT;

2181
	write_lock(&kvm->mmu_lock);
2182
	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size);
2183
	kvm_nested_s2_unmap(kvm);
2184
	write_unlock(&kvm->mmu_lock);
2185
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
2217
	unsigned long hcr = *vcpu_hcr(vcpu);
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231

	/*
	 * If this is the first time we do a S/W operation
	 * (i.e. HCR_TVM not set) flush the whole memory, and set the
	 * VM trapping.
	 *
	 * Otherwise, rely on the VM trapping to wait for the MMU +
	 * Caches to be turned off. At that point, we'll be able to
	 * clean the caches again.
	 */
	if (!(hcr & HCR_TVM)) {
		trace_kvm_set_way_flush(*vcpu_pc(vcpu),
					vcpu_has_cache_enabled(vcpu));
		stage2_flush_vm(vcpu->kvm);
2232
		*vcpu_hcr(vcpu) = hcr | HCR_TVM;
2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249
	}
}

/*
 * Called on a trapped VM-register write that may toggle the MMU/caches.
 * Flushes the stage-2 address space on any on<->off transition and stops
 * trapping VM ops once the caches are back on.
 */
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
	bool now_enabled = vcpu_has_cache_enabled(vcpu);

	/*
	 * If switching the MMU+caches on, need to invalidate the caches.
	 * If switching it off, need to clean the caches.
	 * Clean + invalidate does the trick always.
	 */
	if (now_enabled != was_enabled)
		stage2_flush_vm(vcpu->kvm);

	/* Caches are now on, stop trapping VM ops (until a S/W op) */
	if (now_enabled)
		*vcpu_hcr(vcpu) &= ~HCR_TVM;

	trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}