// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/page_io.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, 
 *  Asynchronous swapping added 30.12.95. Stephen Tweedie
 *  Removed race in async swapping. 14.4.1996. Bruno Haible
 *  Add swap of shared pages through the page cache. 20.2.1998. Stephen Tweedie
 *  Always use brw_page, life becomes simpler. 12 May 1998 Eric Biederman
 */

#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
#include <linux/zswap.h>
#include "swap.h"

static void __end_swap_bio_write(struct bio *bio)
{
	struct folio *folio = bio_first_folio_all(bio);

	if (bio->bi_status) {
		/*
		 * We failed to write the page out to swap-space.
		 * Re-dirty the page in order to avoid it being reclaimed.
		 * Also print a dire warning that things will go BAD (tm)
		 * very quickly.
		 *
		 * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
		 */
		folio_mark_dirty(folio);
		pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
		folio_clear_reclaim(folio);
	}
	folio_end_writeback(folio);
}

static void end_swap_bio_write(struct bio *bio)
{
	__end_swap_bio_write(bio);
	bio_put(bio);
}

static void __end_swap_bio_read(struct bio *bio)
{
	struct folio *folio = bio_first_folio_all(bio);

	if (bio->bi_status) {
		pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
				     MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
				     (unsigned long long)bio->bi_iter.bi_sector);
	} else {
		folio_mark_uptodate(folio);
	}
	folio_unlock(folio);
}

static void end_swap_bio_read(struct bio *bio)
{
	__end_swap_bio_read(bio);
	bio_put(bio);
}

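/*
 * Walk the swap file with bmap() and add each page-aligned, physically
 * contiguous run of blocks as a swap extent.  Returns the number of extents
 * added, -EINVAL if the file has holes, or another negative error from
 * add_swap_extent().
 */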
int generic_swapfile_activate(struct swap_info_struct *sis,
				struct file *swap_file,
				sector_t *span)
{
	struct address_space *mapping = swap_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned blocks_per_page;
	unsigned long page_no;
	unsigned blkbits;
	sector_t probe_block;
	sector_t last_block;
	sector_t lowest_block = -1;
	sector_t highest_block = 0;
	int nr_extents = 0;
	int ret;

	blkbits = inode->i_blkbits;
	blocks_per_page = PAGE_SIZE >> blkbits;

	/*
	 * Map all the blocks into the extent tree.  This code doesn't try
	 * to be very smart.
	 */
	probe_block = 0;
	page_no = 0;
	last_block = i_size_read(inode) >> blkbits;
	while ((probe_block + blocks_per_page) <= last_block &&
			page_no < sis->max) {
		unsigned block_in_page;
		sector_t first_block;

		cond_resched();

		first_block = probe_block;
		ret = bmap(inode, &first_block);
		if (ret || !first_block)
			goto bad_bmap;

		/*
		 * It must be PAGE_SIZE aligned on-disk
		 */
		if (first_block & (blocks_per_page - 1)) {
			probe_block++;
			goto reprobe;
		}

		for (block_in_page = 1; block_in_page < blocks_per_page;
					block_in_page++) {
			sector_t block;

			block = probe_block + block_in_page;
			ret = bmap(inode, &block);
			if (ret || !block)
				goto bad_bmap;

			if (block != first_block + block_in_page) {
				/* Discontiguity */
				probe_block++;
				goto reprobe;
			}
		}

		first_block >>= (PAGE_SHIFT - blkbits);
		if (page_no) {	/* exclude the header page */
			if (first_block < lowest_block)
				lowest_block = first_block;
			if (first_block > highest_block)
				highest_block = first_block;
		}

		/*
		 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
		 */
		ret = add_swap_extent(sis, page_no, 1, first_block);
		if (ret < 0)
			goto out;
		nr_extents += ret;
		page_no++;
		probe_block += blocks_per_page;
reprobe:
		continue;
	}
	ret = nr_extents;
	*span = 1 + highest_block - lowest_block;
	if (page_no == 0)
		page_no = 1;	/* force Empty message */
	sis->max = page_no;
	sis->pages = page_no - 1;
	sis->highest_bit = page_no - 1;
out:
	return ret;
bad_bmap:
	pr_err("swapon: swapfile has holes\n");
	ret = -EINVAL;
	goto out;
}

/*
 * We may have stale swap cache pages in memory: notice
 * them here and get rid of the unnecessary final write.
 */
int swap_writepage(struct page *page, struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);
	int ret;

	if (folio_free_swap(folio)) {
		folio_unlock(folio);
		return 0;
	}
	/*
	 * Arch code may have to preserve more data than just the page
	 * contents, e.g. memory tags.
	 */
	ret = arch_prepare_to_swap(&folio->page);
	if (ret) {
		folio_mark_dirty(folio);
		folio_unlock(folio);
		return ret;
	}
	if (zswap_store(folio)) {
		folio_start_writeback(folio);
		folio_unlock(folio);
		folio_end_writeback(folio);
		return 0;
	}
	__swap_writepage(folio, wbc);
	return 0;
}

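/* Account a swap-out in vmstat; PMD-sized folios also count as THP_SWPOUT. */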
static inline void count_swpout_vm_event(struct folio *folio)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (unlikely(folio_test_pmd_mappable(folio))) {
		count_memcg_folio_events(folio, THP_SWPOUT, 1);
		count_vm_event(THP_SWPOUT);
	}
#endif
	count_vm_events(PSWPOUT, folio_nr_pages(folio));
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
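/*
 * Charge the swap-out bio to the block cgroup corresponding to the memory
 * cgroup that owns the folio.
 */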
static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio)
{
	struct cgroup_subsys_state *css;
	struct mem_cgroup *memcg;

	memcg = folio_memcg(folio);
	if (!memcg)
		return;

	rcu_read_lock();
	css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
	bio_associate_blkg_from_css(bio, css);
	rcu_read_unlock();
}
#else
#define bio_associate_blkg_from_page(bio, folio)		do { } while (0)
#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */

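/*
 * One in-flight batch of swap I/O to a swapfile (SWP_FS_OPS), submitted
 * through ->swap_rw(): up to SWAP_CLUSTER_MAX folios per kiocb.
 */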
struct swap_iocb {
	struct kiocb		iocb;
	struct bio_vec		bvec[SWAP_CLUSTER_MAX];
	int			pages;
	int			len;
};
static mempool_t *sio_pool;

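/*
 * Lazily create the shared sio mempool; if two callers race, the loser
 * frees its pool and both use the winner's.
 */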
int sio_pool_init(void)
{
	if (!sio_pool) {
		mempool_t *pool = mempool_create_kmalloc_pool(
			SWAP_CLUSTER_MAX, sizeof(struct swap_iocb));
		if (cmpxchg(&sio_pool, NULL, pool))
			mempool_destroy(pool);
	}
	if (!sio_pool)
		return -ENOMEM;
	return 0;
}

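/*
 * Completion for a batched swap-out kiocb: on a short or failed write,
 * redirty every page in the batch so it is not lost, then end writeback.
 */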
static void sio_write_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	struct page *page = sio->bvec[0].bv_page;
	int p;

	if (ret != sio->len) {
		/*
		 * In the case of swap-over-nfs, this can be a
		 * temporary failure if the system has limited
		 * memory for allocating transmit buffers.
		 * Mark the pages dirty and avoid
		 * folio_rotate_reclaimable, and rate-limit the
		 * messages; do not flag PageError like the
		 * normal direct-to-bio case, as the failure
		 * could be temporary.
		 */
		pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
				   ret, page_file_offset(page));
		for (p = 0; p < sio->pages; p++) {
			page = sio->bvec[p].bv_page;
			set_page_dirty(page);
			ClearPageReclaim(page);
		}
	}

	for (p = 0; p < sio->pages; p++)
		end_page_writeback(sio->bvec[p].bv_page);

	mempool_free(sio, sio_pool);
}

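/*
 * Swap-out through the filesystem (SWP_FS_OPS): folios are appended to a
 * plugged swap_iocb and submitted to ->swap_rw() by swap_write_unplug().
 */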
static void swap_writepage_fs(struct folio *folio, struct writeback_control *wbc)
{
	struct swap_iocb *sio = NULL;
	struct swap_info_struct *sis = swp_swap_info(folio->swap);
	struct file *swap_file = sis->swap_file;
	loff_t pos = folio_file_pos(folio);

	count_swpout_vm_event(folio);
	folio_start_writeback(folio);
	folio_unlock(folio);
	if (wbc->swap_plug)
		sio = *wbc->swap_plug;
	if (sio) {
		if (sio->iocb.ki_filp != swap_file ||
		    sio->iocb.ki_pos + sio->len != pos) {
			swap_write_unplug(sio);
			sio = NULL;
		}
	}
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_NOIO);
		init_sync_kiocb(&sio->iocb, swap_file);
		sio->iocb.ki_complete = sio_write_complete;
		sio->iocb.ki_pos = pos;
		sio->pages = 0;
		sio->len = 0;
	}
	bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
	sio->len += folio_size(folio);
	sio->pages += 1;
	if (sio->pages == ARRAY_SIZE(sio->bvec) || !wbc->swap_plug) {
		swap_write_unplug(sio);
		sio = NULL;
	}
	if (wbc->swap_plug)
		*wbc->swap_plug = sio;
}

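/*
 * Write a folio to the swap block device with an on-stack bio and wait for
 * the write to complete.
 */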
static void swap_writepage_bdev_sync(struct folio *folio,
		struct writeback_control *wbc, struct swap_info_struct *sis)
{
	struct bio_vec bv;
	struct bio bio;

	bio_init(&bio, sis->bdev, &bv, 1,
		 REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc));
	bio.bi_iter.bi_sector = swap_folio_sector(folio);
	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);

	bio_associate_blkg_from_page(&bio, folio);
	count_swpout_vm_event(folio);

	folio_start_writeback(folio);
	folio_unlock(folio);

	submit_bio_wait(&bio);
	__end_swap_bio_write(&bio);
}

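/*
 * Write a folio to the swap block device asynchronously; writeback is
 * finished in end_swap_bio_write() when the bio completes.
 */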
static void swap_writepage_bdev_async(struct folio *folio,
		struct writeback_control *wbc, struct swap_info_struct *sis)
{
	struct bio *bio;

	bio = bio_alloc(sis->bdev, 1,
			REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
			GFP_NOIO);
	bio->bi_iter.bi_sector = swap_folio_sector(folio);
	bio->bi_end_io = end_swap_bio_write;
	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);

	bio_associate_blkg_from_page(bio, folio);
	count_swpout_vm_event(folio);
	folio_start_writeback(folio);
	folio_unlock(folio);
	submit_bio(bio);
}

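/*
 * Dispatch a prepared swap-out to the swapfile path, the synchronous
 * block-device path, or the asynchronous block-device path.
 */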
void __swap_writepage(struct folio *folio, struct writeback_control *wbc)
{
	struct swap_info_struct *sis = swp_swap_info(folio->swap);

	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
	/*
	 * ->flags can be updated non-atomically (scan_swap_map_slots),
	 * but that will never affect SWP_FS_OPS, so the data_race
	 * is safe.
	 */
	if (data_race(sis->flags & SWP_FS_OPS))
		swap_writepage_fs(folio, wbc);
	else if (sis->flags & SWP_SYNCHRONOUS_IO)
		swap_writepage_bdev_sync(folio, wbc, sis);
	else
		swap_writepage_bdev_async(folio, wbc, sis);
}

void swap_write_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_write_complete(&sio->iocb, ret);
}

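/*
 * Completion for a batched swap-in kiocb: folios are marked uptodate only
 * if the whole read succeeded, and are unlocked either way.
 */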
static void sio_read_complete(struct kiocb *iocb, long ret)
{
	struct swap_iocb *sio = container_of(iocb, struct swap_iocb, iocb);
	int p;

	if (ret == sio->len) {
		for (p = 0; p < sio->pages; p++) {
			struct folio *folio = page_folio(sio->bvec[p].bv_page);

			folio_mark_uptodate(folio);
			folio_unlock(folio);
		}
		count_vm_events(PSWPIN, sio->pages);
	} else {
		for (p = 0; p < sio->pages; p++) {
			struct folio *folio = page_folio(sio->bvec[p].bv_page);

			folio_unlock(folio);
		}
		pr_alert_ratelimited("Read-error on swap-device\n");
	}
	mempool_free(sio, sio_pool);
}

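/*
 * Swap-in through the filesystem (SWP_FS_OPS): folios are appended to a
 * plugged swap_iocb and submitted to ->swap_rw() by swap_read_unplug().
 */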
static void swap_readpage_fs(struct folio *folio, struct swap_iocb **plug)
{
	struct swap_info_struct *sis = swp_swap_info(folio->swap);
	struct swap_iocb *sio = NULL;
	loff_t pos = folio_file_pos(folio);

	if (plug)
		sio = *plug;
	if (sio) {
		if (sio->iocb.ki_filp != sis->swap_file ||
		    sio->iocb.ki_pos + sio->len != pos) {
			swap_read_unplug(sio);
			sio = NULL;
		}
	}
	if (!sio) {
		sio = mempool_alloc(sio_pool, GFP_KERNEL);
		init_sync_kiocb(&sio->iocb, sis->swap_file);
		sio->iocb.ki_pos = pos;
		sio->iocb.ki_complete = sio_read_complete;
		sio->pages = 0;
		sio->len = 0;
	}
	bvec_set_folio(&sio->bvec[sio->pages], folio, folio_size(folio), 0);
	sio->len += folio_size(folio);
	sio->pages += 1;
	if (sio->pages == ARRAY_SIZE(sio->bvec) || !plug) {
		swap_read_unplug(sio);
		sio = NULL;
	}
	if (plug)
		*plug = sio;
}

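/*
 * Read a folio synchronously from the swap block device with an on-stack
 * bio, waiting for the read to complete.
 */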
static void swap_readpage_bdev_sync(struct folio *folio,
		struct swap_info_struct *sis)
{
	struct bio_vec bv;
	struct bio bio;

	bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = swap_folio_sector(folio);
	bio_add_folio_nofail(&bio, folio, folio_size(folio), 0);
	/*
	 * Keep this task valid during swap readpage because the oom killer may
	 * attempt to access it in the page fault retry time check.
	 */
	get_task_struct(current);
	count_vm_event(PSWPIN);
	submit_bio_wait(&bio);
	__end_swap_bio_read(&bio);
	put_task_struct(current);
}

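/*
 * Read a folio from the swap block device asynchronously; the folio is
 * unlocked in end_swap_bio_read() when the bio completes.
 */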
static void swap_readpage_bdev_async(struct folio *folio,
		struct swap_info_struct *sis)
{
	struct bio *bio;

	bio = bio_alloc(sis->bdev, 1, REQ_OP_READ, GFP_KERNEL);
	bio->bi_iter.bi_sector = swap_folio_sector(folio);
	bio->bi_end_io = end_swap_bio_read;
	bio_add_folio_nofail(bio, folio, folio_size(folio), 0);
	count_vm_event(PSWPIN);
	submit_bio(bio);
}

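/*
 * Read a page back in from swap: zswap is tried first, then the swapfile
 * (SWP_FS_OPS) or block-device path.
 */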
void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug)
{
	struct folio *folio = page_folio(page);
	struct swap_info_struct *sis = page_swap_info(page);
	bool workingset = folio_test_workingset(folio);
	unsigned long pflags;
	bool in_thrashing;

	VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

	/*
	 * Count submission time as memory stall and delay. When the device
	 * is congested, or the submitting cgroup IO-throttled, submission
	 * can be a significant part of overall IO time.
	 */
	if (workingset) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
	}
	delayacct_swapin_start();

	if (zswap_load(folio)) {
		folio_mark_uptodate(folio);
		folio_unlock(folio);
	} else if (data_race(sis->flags & SWP_FS_OPS)) {
		swap_readpage_fs(folio, plug);
	} else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) {
		swap_readpage_bdev_sync(folio, sis);
	} else {
		swap_readpage_bdev_async(folio, sis);
	}

	if (workingset) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
	delayacct_swapin_end();
}

void __swap_read_unplug(struct swap_iocb *sio)
{
	struct iov_iter from;
	struct address_space *mapping = sio->iocb.ki_filp->f_mapping;
	int ret;

	iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len);
	ret = mapping->a_ops->swap_rw(&sio->iocb, &from);
	if (ret != -EIOCBQUEUED)
		sio_read_complete(&sio->iocb, ret);
}