// SPDX-License-Identifier: GPL-2.0
/*
 *  IBM System z Huge TLB Page Support for Kernel.
 *
 *    Copyright IBM Corp. 2007,2020
 *    Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
 */

#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <asm/pgalloc.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/security.h>

/*
 * If the bit selected by single-bit bitmask "a" is set within "x", move
 * it to the position indicated by single-bit bitmask "b".
 */
#define move_set_bit(x, a, b)	(((x) & (a)) >> ilog2(a) << ilog2(b))
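/*
 * Example: move_set_bit(x, 0x0400, 0x0002) isolates bit 10 of x
 * (ilog2(0x0400) == 10) and shifts it to bit position 1
 * (ilog2(0x0002) == 1), yielding 0x0002 if the bit was set, else 0.
 */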

static inline unsigned long __pte_to_rste(pte_t pte)
{
	unsigned long rste;

	/*
	 * Convert encoding		  pte bits	pmd / pud bits
	 *				lIR.uswrdy.p	dy..R...I...wr
	 * empty			010.000000.0 -> 00..0...1...00
	 * prot-none, clean, old	111.000000.1 -> 00..1...1...00
	 * prot-none, clean, young	111.000001.1 -> 01..1...1...00
	 * prot-none, dirty, old	111.000010.1 -> 10..1...1...00
	 * prot-none, dirty, young	111.000011.1 -> 11..1...1...00
	 * read-only, clean, old	111.000100.1 -> 00..1...1...01
	 * read-only, clean, young	101.000101.1 -> 01..1...0...01
	 * read-only, dirty, old	111.000110.1 -> 10..1...1...01
	 * read-only, dirty, young	101.000111.1 -> 11..1...0...01
	 * read-write, clean, old	111.001100.1 -> 00..1...1...11
	 * read-write, clean, young	101.001101.1 -> 01..1...0...11
	 * read-write, dirty, old	110.001110.1 -> 10..0...1...11
	 * read-write, dirty, young	100.001111.1 -> 11..0...0...11
	 * HW-bits: R read-only, I invalid
	 * SW-bits: p present, y young, d dirty, r read, w write, s special,
	 *	    u unused, l large
	 */
	if (pte_present(pte)) {
		rste = pte_val(pte) & PAGE_MASK;
		rste |= move_set_bit(pte_val(pte), _PAGE_READ,
				     _SEGMENT_ENTRY_READ);
		rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
				     _SEGMENT_ENTRY_WRITE);
		rste |= move_set_bit(pte_val(pte), _PAGE_INVALID,
				     _SEGMENT_ENTRY_INVALID);
		rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT,
				     _SEGMENT_ENTRY_PROTECT);
		rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY,
				     _SEGMENT_ENTRY_DIRTY);
		rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG,
				     _SEGMENT_ENTRY_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
		rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY,
				     _SEGMENT_ENTRY_SOFT_DIRTY);
#endif
		rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
				     _SEGMENT_ENTRY_NOEXEC);
	} else
		rste = _SEGMENT_ENTRY_EMPTY;
	return rste;
}

static inline pte_t __rste_to_pte(unsigned long rste)
{
	unsigned long pteval;
	int present;

	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
		present = pud_present(__pud(rste));
	else
		present = pmd_present(__pmd(rste));

	/*
	 * Convert encoding		pmd / pud bits	    pte bits
	 *				dy..R...I...wr	  lIR.uswrdy.p
	 * empty			00..0...1...00 -> 010.000000.0
	 * prot-none, clean, old	00..1...1...00 -> 111.000000.1
	 * prot-none, clean, young	01..1...1...00 -> 111.000001.1
	 * prot-none, dirty, old	10..1...1...00 -> 111.000010.1
	 * prot-none, dirty, young	11..1...1...00 -> 111.000011.1
	 * read-only, clean, old	00..1...1...01 -> 111.000100.1
	 * read-only, clean, young	01..1...0...01 -> 101.000101.1
	 * read-only, dirty, old	10..1...1...01 -> 111.000110.1
	 * read-only, dirty, young	11..1...0...01 -> 101.000111.1
	 * read-write, clean, old	00..1...1...11 -> 111.001100.1
	 * read-write, clean, young	01..1...0...11 -> 101.001101.1
	 * read-write, dirty, old	10..0...1...11 -> 110.001110.1
	 * read-write, dirty, young	11..0...0...11 -> 100.001111.1
	 * HW-bits: R read-only, I invalid
	 * SW-bits: p present, y young, d dirty, r read, w write, s special,
	 *	    u unused, l large
	 */
	if (present) {
		pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
		pteval |= _PAGE_LARGE | _PAGE_PRESENT;
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY);
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
	} else
		pteval = _PAGE_INVALID;
	return __pte(pteval);
}

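/*
 * If the mm makes use of storage keys, initialize the keys of the
 * whole huge frame (1 MB segment or 2 GB region) before the mapping
 * becomes visible. PG_arch_1 marks frames whose keys have already
 * been initialized.
 */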
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
{
	struct page *page;
	unsigned long size, paddr;

	if (!mm_uses_skeys(mm) ||
	    rste & _SEGMENT_ENTRY_INVALID)
		return;

	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
		page = pud_page(__pud(rste));
		size = PUD_SIZE;
		paddr = rste & PUD_MASK;
	} else {
		page = pmd_page(__pmd(rste));
		size = PMD_SIZE;
		paddr = rste & PMD_MASK;
	}

	if (!test_and_set_bit(PG_arch_1, &page->flags))
		__storage_key_init_range(paddr, paddr + size - 1);
}

void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	unsigned long rste;

	rste = __pte_to_rste(pte);
	if (!MACHINE_HAS_NX)
		rste &= ~_SEGMENT_ENTRY_NOEXEC;

	/* Set correct table type for 2G hugepages */
	if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
		if (likely(pte_present(pte)))
			rste |= _REGION3_ENTRY_LARGE;
		rste |= _REGION_ENTRY_TYPE_R3;
	} else if (likely(pte_present(pte)))
		rste |= _SEGMENT_ENTRY_LARGE;

	clear_huge_pte_skeys(mm, rste);
	set_pte(ptep, __pte(rste));
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte, unsigned long sz)
{
	__set_huge_pte_at(mm, addr, ptep, pte);
}

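/*
 * For hugetlb mappings the "pte" pointer actually points at a segment
 * (pmd) or region-3 (pud) table entry, so convert its raw value back
 * to the pte encoding expected by common code.
 */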
pte_t huge_ptep_get(pte_t *ptep)
{
	return __rste_to_pte(pte_val(*ptep));
}

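/*
 * Clear the entry with the level-aware exchange helpers, which also
 * take care of the TLB flush for the segment or region-3 entry.
 */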
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
			      unsigned long addr, pte_t *ptep)
{
	pte_t pte = huge_ptep_get(ptep);
	pmd_t *pmdp = (pmd_t *) ptep;
	pud_t *pudp = (pud_t *) ptep;

	if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
		pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY));
	else
		pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
	return pte;
}

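/*
 * Allocate the page table levels needed for a huge mapping at addr.
 * The pud entry itself acts as the "pte" for a 2 GB page, the pmd
 * entry for a 1 MB page; no pte table is allocated.
 */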
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp = NULL;

	pgdp = pgd_offset(mm, addr);
	p4dp = p4d_alloc(mm, pgdp, addr);
	if (p4dp) {
		pudp = pud_alloc(mm, p4dp, addr);
		if (pudp) {
			if (sz == PUD_SIZE)
				return (pte_t *) pudp;
			else if (sz == PMD_SIZE)
				pmdp = pmd_alloc(mm, pudp, addr);
		}
	}
	return (pte_t *) pmdp;
}

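/*
 * Walk the page table down to the entry backing a huge mapping at
 * addr, stopping at the pud level if a 2 GB leaf entry is found.
 */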
pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz)
{
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp = NULL;

	pgdp = pgd_offset(mm, addr);
	if (pgd_present(*pgdp)) {
		p4dp = p4d_offset(pgdp, addr);
		if (p4d_present(*p4dp)) {
			pudp = pud_offset(p4dp, addr);
			if (pud_present(*pudp)) {
				if (pud_leaf(*pudp))
					return (pte_t *) pudp;
				pmdp = pmd_offset(pudp, addr);
			}
		}
	}
	return (pte_t *) pmdp;
}

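/* 1 MB segment pages require EDAT1, 2 GB region pages require EDAT2. */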
bool __init arch_hugetlb_valid_size(unsigned long size)
{
	if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
		return true;
	else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE)
		return true;
	else
		return false;
}

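/*
 * The get_unmapped_area helpers mirror the generic mmap search:
 * bottom-up between mmap_base and TASK_SIZE, or top-down below
 * mmap_base, with the result aligned to the huge page size.
 */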
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
		unsigned long addr, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info = {};

	info.length = len;
	info.low_limit = current->mm->mmap_base;
	info.high_limit = TASK_SIZE;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	return vm_unmapped_area(&info);
}

static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
		unsigned long addr0, unsigned long len,
		unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct vm_unmapped_area_info info = {};
	unsigned long addr;

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = current->mm->mmap_base;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (addr & ~PAGE_MASK) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > TASK_SIZE - mmap_min_addr)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		goto check_asce_limit;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
		    (!vma || addr + len <= vm_start_gap(vma)))
			goto check_asce_limit;
	}

	if (!test_bit(MMF_TOPDOWN, &mm->flags))
		addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		addr = hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
	if (offset_in_page(addr))
		return addr;

check_asce_limit:
	return check_asce_limit(mm, addr, len);
}