// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
	ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
 *   before calling, such as the NFS fs marking pages that are cached locally
 *   on disk, so we need to give the fs a chance to clean up in the event of
 *   an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
					     struct page *page)
{
	if (page_has_private(page)) {
		if (!trylock_page(page))
			BUG();
		page->mapping = mapping;
		do_invalidatepage(page, 0, PAGE_SIZE);
		page->mapping = NULL;
		unlock_page(page);
	}
	put_page(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
					      struct list_head *pages)
{
	struct page *victim;

	while (!list_empty(pages)) {
		victim = lru_to_page(pages);
		list_del(&victim->lru);
		read_cache_pages_invalidate_page(mapping, victim);
	}
}

/**
 * read_cache_pages - populate an address space with some pages & start reads against them
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 *
 * Returns: %0 on success, error return by @filler otherwise
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = lru_to_page(pages);
		list_del(&page->lru);
		if (add_to_page_cache_lru(page, mapping, page->index,
				readahead_gfp_mask(mapping))) {
			read_cache_pages_invalidate_page(mapping, page);
			continue;
		}
		put_page(page);

		ret = filler(data, page);
		if (unlikely(ret)) {
			read_cache_pages_invalidate_pages(mapping, pages);
			break;
		}
		task_io_account_read(PAGE_SIZE);
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);
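
/*
 * Illustrative sketch: a filesystem that has collected a list of pages with
 * ->index already set could drive them through read_cache_pages() with its
 * own filler callback, along these lines:
 *
 *	static int example_filler(void *data, struct page *page)
 *	{
 *		struct file *file = data;
 *
 *		return example_fs_readpage(file, page);
 *	}
 *
 *	error = read_cache_pages(mapping, &pages, example_filler, file);
 *
 * example_filler() and example_fs_readpage() are hypothetical names used
 * only to show the calling convention.
 */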

static void read_pages(struct readahead_control *rac, struct list_head *pages,
		bool skip_page)
{
	const struct address_space_operations *aops = rac->mapping->a_ops;
	struct page *page;
	struct blk_plug plug;

	if (!readahead_count(rac))
		goto out;

	blk_start_plug(&plug);

	if (aops->readahead) {
		aops->readahead(rac);
		/* Clean up the remaining pages */
		while ((page = readahead_page(rac))) {
			unlock_page(page);
			put_page(page);
		}
	} else if (aops->readpages) {
		aops->readpages(rac->file, rac->mapping, pages,
				readahead_count(rac));
		/* Clean up the remaining pages */
		put_pages_list(pages);
		rac->_index += rac->_nr_pages;
		rac->_nr_pages = 0;
	} else {
		while ((page = readahead_page(rac))) {
			aops->readpage(rac->file, page);
			put_page(page);
		}
	}

	blk_finish_plug(&plug);

	BUG_ON(!list_empty(pages));
	BUG_ON(readahead_count(rac));

out:
	if (skip_page)
		rac->_index++;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct address_space *mapping = ractl->mapping;
	unsigned long index = readahead_index(ractl);
	LIST_HEAD(page_pool);
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	filemap_invalidate_lock_shared(mapping);
	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(ractl, &page_pool, true);
			i = ractl->_index + ractl->_nr_pages - index - 1;
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(ractl, &page_pool, true);
			i = ractl->_index + ractl->_nr_pages - index - 1;
			continue;
		}
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		ractl->_nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(ractl, &page_pool, false);
	filemap_invalidate_unlock_shared(mapping);
	memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
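
/*
 * Illustrative sketch: a filesystem whose on-disk layout requires reading
 * past the apparent EOF (say, to pull in a whole compressed cluster or some
 * trailing metadata) could bypass the i_size clamp in do_page_cache_ra()
 * with something like:
 *
 *	page_cache_ra_unbounded(ractl, nr_cluster_pages, 0);
 *
 * where ractl already points at the start of the region and
 * nr_cluster_pages is a hypothetical count taken from the filesystem's own
 * extent information.
 */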

/*
 * do_page_cache_ra() actually reads a chunk of disk.  It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */

	if (isize == 0)
		return;

	end_index = (isize - 1) >> PAGE_SHIFT;
	if (index > end_index)
		return;
	/* Don't read past the page containing the last byte of the file */
	if (nr_to_read > end_index - index)
		nr_to_read = end_index - index + 1;

	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read)
{
	struct address_space *mapping = ractl->mapping;
	struct file_ra_state *ra = ractl->ra;
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages, index;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
			!mapping->a_ops->readahead))
		return;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	index = readahead_index(ractl);
	max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
	nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
	while (nr_to_read) {
		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		ractl->_index = index;
		do_page_cache_ra(ractl, this_chunk, 0);

		index += this_chunk;
		nr_to_read -= this_chunk;
	}
}

/*
 * Set the initial window size: round the request up to the next power of
 * two, then scale it up - x4 for small requests, x2 for medium ones - and
 * clamp the result to the maximum readahead size.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}
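
/*
 * Worked example (illustrative): with a 128k (32 page) maximum, a 1 page
 * request starts with a 4 page (16k) window, a 4 page request with an
 * 8 page (32k) window, and any request larger than 8 pages jumps straight
 * to the 32 page (128k) maximum.
 */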

/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long cur = ra->size;

	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}
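
/*
 * Worked example (illustrative): with a 128k (32 page) maximum, a window of
 * 1 page (below max / 16) quadruples to 4 pages, and from there doubles on
 * each successive sequential hit: 4 -> 8 -> 16 -> 32, after which it stays
 * pinned at the maximum.
 */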

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial
 * size based on the I/O request size and max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */
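
/*
 * Illustrative trace (assuming a 32 page maximum and an application doing
 * sequential 16 page reads from the start of the file):
 *
 * 1. The first read misses at index 0, so ondemand_readahead() takes the
 *    initial_readahead path: start = 0, size = 32, async_size = 16, and
 *    pages 0-31 are submitted with page 16 marked PG_readahead.
 *
 * 2. When the application reaches the marked page 16, the async path sees
 *    index == start + size - async_size and pushes the window forward:
 *    start = 32, size = 32, async_size = 32, so pages 32-63 are submitted
 *    with page 32 marked, keeping the pipeline full.
 */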

/*
 * Count contiguously cached pages from @index-1 to @index-@max,
 * this count is a conservative estimation of
 * 	- length of the sequential read sequence, or
 * 	- thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
				   pgoff_t index, unsigned long max)
{
	pgoff_t head;

	rcu_read_lock();
	head = page_cache_prev_miss(mapping, index - 1, max);
	rcu_read_unlock();

	return index - 1 - head;
}

/*
 * page cache context based read-ahead
 */
static int try_context_readahead(struct address_space *mapping,
				 struct file_ra_state *ra,
				 pgoff_t index,
				 unsigned long req_size,
				 unsigned long max)
{
	pgoff_t size;

	size = count_history_pages(mapping, index, max);

	/*
	 * not enough history pages:
	 * it could be a random read
	 */
	if (size <= req_size)
		return 0;

	/*
	 * starts from beginning of file:
	 * it is a strong indication of long-run stream (or whole-file-read)
	 */
	if (size >= index)
		size *= 2;

	ra->start = index;
	ra->size = min(size + req_size, max);
	ra->async_size = 1;

	return 1;
}
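
/*
 * Worked example (illustrative): with max = 32, an 8 page read at index 100
 * that finds pages 80-99 already cached sees 20 history pages.  That is
 * more than the request, so the read is treated as part of a sequential
 * stream: ra->start = 100, ra->size = min(20 + 8, 32) = 28, ra->async_size
 * = 1.  Had 8 or fewer history pages been cached, the read would have been
 * treated as random and the readahead state left untouched.
 */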

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
		bool hit_readahead_marker, unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
	struct file_ra_state *ra = ractl->ra;
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	unsigned long index = readahead_index(ractl);
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals the
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(ractl->mapping, index + 1,
				max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(ractl->mapping, ra, index, req_size,
			max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	do_page_cache_ra(ractl, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulting next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ractl->_index = ra->start;
	do_page_cache_ra(ractl, ra->size, ra->async_size);
}

void page_cache_sync_ra(struct readahead_control *ractl,
		unsigned long req_count)
{
	bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

	/*
	 * Even if read-ahead is disabled, issue this request as read-ahead
	 * as we'll need it to satisfy the requested range. The forced
	 * read-ahead will do the right thing and limit the read to just the
	 * requested range, which we'll set to 1 page for this case.
	 */
	if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
		if (!ractl->file)
			return;
		req_count = 1;
		do_forced_ra = true;
	}

	/* be dumb */
	if (do_forced_ra) {
		force_page_cache_ra(ractl, req_count);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(ractl, false, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
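
/*
 * Illustrative sketch: the usual entry point is the page_cache_sync_readahead()
 * wrapper in <linux/pagemap.h>, which builds the readahead_control on the
 * stack, roughly:
 *
 *	DEFINE_READAHEAD(ractl, file, ra, mapping, index);
 *	page_cache_sync_ra(&ractl, req_count);
 */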

void page_cache_async_ra(struct readahead_control *ractl,
		struct page *page, unsigned long req_count)
{
	/* no read-ahead */
	if (!ractl->ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (inode_read_congested(ractl->mapping->host))
		return;

	if (blk_cgroup_congested())
		return;

	/* do read-ahead */
	ondemand_readahead(ractl, true, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
	ssize_t ret;
	struct fd f;

	ret = -EBADF;
	f = fdget(fd);
	if (!f.file || !(f.file->f_mode & FMODE_READ))
		goto out;

	/*
	 * The readahead() syscall is intended to run only on files
	 * that can execute readahead. If readahead is not possible
	 * on this file, then we must return -EINVAL.
	 */
	ret = -EINVAL;
	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
	    !S_ISREG(file_inode(f.file)->i_mode))
		goto out;

	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
	fdput(f);
	return ret;
}

SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
	return ksys_readahead(fd, offset, count);
}
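
/*
 * Illustrative userspace usage: ask the kernel to prefetch the first
 * megabyte of an already-open file before reading it sequentially:
 *
 *	if (readahead(fd, 0, 1024 * 1024) != 0)
 *		perror("readahead");
 */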

/**
 * readahead_expand - Expand a readahead request
 * @ractl: The request to be expanded
 * @new_start: The revised start
 * @new_len: The revised size of the request
 *
 * Attempt to expand a readahead request outwards from the current size to the
 * specified size by inserting locked pages before and after the current window
 * to increase the size to the new window.  This may involve the insertion of
 * THPs, in which case the window may get expanded even beyond what was
 * requested.
 *
 * The algorithm will stop if it encounters a conflicting page already in the
 * pagecache and leave a smaller expansion than requested.
 *
 * The caller must check for this by examining the revised @ractl object for a
 * different expansion than was requested.
 */
void readahead_expand(struct readahead_control *ractl,
		      loff_t new_start, size_t new_len)
{
	struct address_space *mapping = ractl->mapping;
	struct file_ra_state *ra = ractl->ra;
	pgoff_t new_index, new_nr_pages;
	gfp_t gfp_mask = readahead_gfp_mask(mapping);

	new_index = new_start / PAGE_SIZE;

	/* Expand the leading edge downwards */
	while (ractl->_index > new_index) {
		unsigned long index = ractl->_index - 1;
		struct page *page = xa_load(&mapping->i_pages, index);

		if (page && !xa_is_value(page))
			return; /* Page apparently present */

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return;
		if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
			put_page(page);
			return;
		}

		ractl->_nr_pages++;
		ractl->_index = page->index;
	}

	new_len += new_start - readahead_pos(ractl);
	new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);

	/* Expand the trailing edge upwards */
	while (ractl->_nr_pages < new_nr_pages) {
		unsigned long index = ractl->_index + ractl->_nr_pages;
		struct page *page = xa_load(&mapping->i_pages, index);

		if (page && !xa_is_value(page))
			return; /* Page apparently present */

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return;
		if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
			put_page(page);
			return;
		}
		ractl->_nr_pages++;
		if (ra) {
			ra->size++;
			ra->async_size++;
		}
	}
}
EXPORT_SYMBOL(readahead_expand);
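
/*
 * Illustrative sketch: a caller that caches data in fixed-size granules
 * (a netfs-style library backed by a local cache, for instance) might round
 * the request out to granule boundaries before issuing I/O:
 *
 *	loff_t start = round_down(readahead_pos(ractl), granule_size);
 *	size_t len = round_up(readahead_pos(ractl) + readahead_length(ractl),
 *			      granule_size) - start;
 *
 *	readahead_expand(ractl, start, len);
 *
 * granule_size is a hypothetical power-of-two size standing in for whatever
 * granularity the cache actually uses.
 */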