/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994, 1995  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem does this differently, for example)
 */
#include <linux/stat.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/shm.h>
#include <linux/errno.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/malloc.h>
#include <linux/fs.h>
#include <linux/locks.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/swapctl.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/uaccess.h>

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 */

unsigned long page_cache_size = 0;
struct page * page_hash_table[PAGE_HASH_SIZE];

/*
 * Simple routines for both non-shared and shared mappings.
 */

#define release_page(page) __free_page((page))
Linus Torvalds's avatar
Linus Torvalds committed
50

Linus Torvalds's avatar
Linus Torvalds committed
51 52 53 54 55 56 57 58 59 60 61 62
/*
 * Invalidate the pages of an inode, removing all pages that aren't
 * locked down (those are sure to be up-to-date anyway, so we shouldn't
 * invalidate them).
 */
void invalidate_inode_pages(struct inode * inode)
{
	struct page ** p;
	struct page * page;

	p = &inode->i_pages;
	while ((page = *p) != NULL) {
Linus Torvalds's avatar
Linus Torvalds committed
63
		if (PageLocked(page)) {
Linus Torvalds's avatar
Linus Torvalds committed
64 65 66 67 68 69 70 71 72 73
			p = &page->next;
			continue;
		}
		inode->i_nrpages--;
		if ((*p = page->next) != NULL)
			(*p)->prev = page->prev;
		page->next = NULL;
		page->prev = NULL;
		remove_page_from_hash_queue(page);
		page->inode = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
74
		__free_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
75 76 77 78 79 80 81 82 83
		continue;
	}
}

/*
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 */
void truncate_inode_pages(struct inode * inode, unsigned long start)
Linus Torvalds's avatar
Linus Torvalds committed
84
{
Linus Torvalds's avatar
Linus Torvalds committed
85
	struct page ** p;
Linus Torvalds's avatar
Linus Torvalds committed
86 87
	struct page * page;

Linus Torvalds's avatar
Linus Torvalds committed
88 89
repeat:
	p = &inode->i_pages;
Linus Torvalds's avatar
Linus Torvalds committed
90 91 92 93 94
	while ((page = *p) != NULL) {
		unsigned long offset = page->offset;

		/* page wholly truncated - free it */
		if (offset >= start) {
Linus Torvalds's avatar
Linus Torvalds committed
95
			if (PageLocked(page)) {
Linus Torvalds's avatar
Linus Torvalds committed
96 97 98
				wait_on_page(page);
				goto repeat;
			}
Linus Torvalds's avatar
Linus Torvalds committed
99 100 101 102 103 104 105
			inode->i_nrpages--;
			if ((*p = page->next) != NULL)
				(*p)->prev = page->prev;
			page->next = NULL;
			page->prev = NULL;
			remove_page_from_hash_queue(page);
			page->inode = NULL;
Linus Torvalds's avatar
Linus Torvalds committed
106
			__free_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
107 108 109 110 111
			continue;
		}
		p = &page->next;
		offset = start - offset;
		/* partial truncate, clear end of page */
Linus Torvalds's avatar
Linus Torvalds committed
112
		if (offset < PAGE_SIZE) {
Linus Torvalds's avatar
Linus Torvalds committed
113 114 115
			unsigned long address = page_address(page);
			memset((void *) (offset + address), 0, PAGE_SIZE - offset);
			flush_page_to_ram(address);
Linus Torvalds's avatar
Linus Torvalds committed
116
		}
Linus Torvalds's avatar
Linus Torvalds committed
117 118 119
	}
}

Linus Torvalds's avatar
Linus Torvalds committed
120
int shrink_mmap(int priority, int gfp_mask)
Linus Torvalds's avatar
Linus Torvalds committed
121
{
Linus Torvalds's avatar
Linus Torvalds committed
122
	static unsigned long clock = 0;
Linus Torvalds's avatar
Linus Torvalds committed
123
	struct page * page;
Linus Torvalds's avatar
Linus Torvalds committed
124
	unsigned long limit = num_physpages;
Linus Torvalds's avatar
Linus Torvalds committed
125
	struct buffer_head *tmp, *bh;
Linus Torvalds's avatar
Linus Torvalds committed
126 127 128 129
	int count_max, count_min;

	count_max = (limit<<1) >> (priority>>1);
	count_min = (limit<<1) >> (priority);
Linus Torvalds's avatar
Linus Torvalds committed
130 131

	page = mem_map + clock;
Linus Torvalds's avatar
Linus Torvalds committed
132
	do {
Linus Torvalds's avatar
Linus Torvalds committed
133 134 135 136
		count_max--;
		if (page->inode || page->buffers)
			count_min--;

Linus Torvalds's avatar
Linus Torvalds committed
137
		if (PageLocked(page))
Linus Torvalds's avatar
Linus Torvalds committed
138
			goto next;
Linus Torvalds's avatar
Linus Torvalds committed
139
		if ((gfp_mask & __GFP_DMA) && !PageDMA(page))
Linus Torvalds's avatar
Linus Torvalds committed
140
			goto next;
Linus Torvalds's avatar
Linus Torvalds committed
141 142
		/* First of all, regenerate the page's referenced bit
                   from any buffers in the page */
Linus Torvalds's avatar
Linus Torvalds committed
143
		bh = page->buffers;
Linus Torvalds's avatar
Linus Torvalds committed
144 145 146 147 148
		if (bh) {
			tmp = bh;
			do {
				if (buffer_touched(tmp)) {
					clear_bit(BH_Touched, &tmp->b_state);
Linus Torvalds's avatar
Linus Torvalds committed
149
					set_bit(PG_referenced, &page->flags);
Linus Torvalds's avatar
Linus Torvalds committed
150 151 152
				}
				tmp = tmp->b_this_page;
			} while (tmp != bh);
Linus Torvalds's avatar
Linus Torvalds committed
153 154

			/* Refuse to swap out all buffer pages */
Linus Torvalds's avatar
Linus Torvalds committed
155
			if ((buffermem >> PAGE_SHIFT) * 100 < (buffer_mem.min_percent * num_physpages))
Linus Torvalds's avatar
Linus Torvalds committed
156
				goto next;
Linus Torvalds's avatar
Linus Torvalds committed
157 158 159 160 161 162 163
		}

		/* We can't throw away shared pages, but we do mark
		   them as referenced.  This relies on the fact that
		   no page is currently in both the page cache and the
		   buffer cache; we'd have to modify the following
		   test to allow for that case. */
Linus Torvalds's avatar
Linus Torvalds committed
164

Linus Torvalds's avatar
Linus Torvalds committed
165
		switch (atomic_read(&page->count)) {
Linus Torvalds's avatar
Linus Torvalds committed
166
			case 1:
Linus Torvalds's avatar
Linus Torvalds committed
167
				/* is it a swap-cache or page-cache page? */
Linus Torvalds's avatar
Linus Torvalds committed
168
				if (page->inode) {
Linus Torvalds's avatar
Linus Torvalds committed
169 170 171 172 173
					if (test_and_clear_bit(PG_referenced, &page->flags)) {
						touch_page(page);
						break;
					}
					age_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
174
					if (page->age || page_cache_size * 100 < (page_cache.min_percent * num_physpages))
Linus Torvalds's avatar
Linus Torvalds committed
175
						break;
Linus Torvalds's avatar
Linus Torvalds committed
176 177 178 179
					if (PageSwapCache(page)) {
						delete_from_swap_cache(page);
						return 1;
					}
Linus Torvalds's avatar
Linus Torvalds committed
180 181
					remove_page_from_hash_queue(page);
					remove_page_from_inode_queue(page);
Linus Torvalds's avatar
Linus Torvalds committed
182
					__free_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
183 184
					return 1;
				}
Linus Torvalds's avatar
Linus Torvalds committed
185 186 187 188
				/* It's not a cache page, so we don't do aging.
				 * If it has been referenced recently, don't free it */
				if (test_and_clear_bit(PG_referenced, &page->flags))
					break;
Linus Torvalds's avatar
Linus Torvalds committed
189 190

				/* is it a buffer cache page? */
Linus Torvalds's avatar
Linus Torvalds committed
191
				if ((gfp_mask & __GFP_IO) && bh && try_to_free_buffer(bh, &bh, 6))
Linus Torvalds's avatar
Linus Torvalds committed
192 193 194 195 196 197 198 199 200
					return 1;
				break;

			default:
				/* more than one users: we can't throw it away */
				set_bit(PG_referenced, &page->flags);
				/* fall through */
			case 0:
				/* nothing */
Linus Torvalds's avatar
Linus Torvalds committed
201
		}
Linus Torvalds's avatar
Linus Torvalds committed
202
next:
Linus Torvalds's avatar
Linus Torvalds committed
203 204 205 206 207 208
		page++;
		clock++;
		if (clock >= limit) {
			clock = 0;
			page = mem_map;
		}
Linus Torvalds's avatar
Linus Torvalds committed
209
	} while (count_max > 0 && count_min > 0);
Linus Torvalds's avatar
Linus Torvalds committed
210 211 212 213
	return 0;
}

/*
Linus Torvalds's avatar
Linus Torvalds committed
214
 * This is called from try_to_swap_out() when we try to get rid of some
Linus Torvalds's avatar
Linus Torvalds committed
215 216 217 218
 * pages..  If we're unmapping the last occurrence of this page, we also
 * free it from the page hash-queues etc, as we don't want to keep it
 * in-core unnecessarily.
 */
Linus Torvalds's avatar
Linus Torvalds committed
219
unsigned long page_unuse(struct page * page)
Linus Torvalds's avatar
Linus Torvalds committed
220
{
Linus Torvalds's avatar
Linus Torvalds committed
221
	int count = atomic_read(&page->count);
Linus Torvalds's avatar
Linus Torvalds committed
222 223 224

	if (count != 2)
		return count;
Linus Torvalds's avatar
Linus Torvalds committed
225
	if (!page->inode)
Linus Torvalds's avatar
Linus Torvalds committed
226
		return count;
Linus Torvalds's avatar
Linus Torvalds committed
227
	if (PageSwapCache(page))
Linus Torvalds's avatar
Linus Torvalds committed
228
		panic ("Doing a normal page_unuse of a swap cache page");
Linus Torvalds's avatar
Linus Torvalds committed
229 230 231
	remove_page_from_hash_queue(page);
	remove_page_from_inode_queue(page);
	__free_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
232 233 234 235
	return 1;
}

/*
Linus Torvalds's avatar
Linus Torvalds committed
236 237
 * Update a page cache copy, when we're doing a "write()" system call
 * See also "update_vm_cache()".
Linus Torvalds's avatar
Linus Torvalds committed
238
 */
Linus Torvalds's avatar
Linus Torvalds committed
239
void update_vm_cache(struct inode * inode, unsigned long pos, const char * buf, int count)
Linus Torvalds's avatar
Linus Torvalds committed
240
{
Linus Torvalds's avatar
Linus Torvalds committed
241
	unsigned long offset, len;
Linus Torvalds's avatar
Linus Torvalds committed
242

Linus Torvalds's avatar
Linus Torvalds committed
243 244 245 246 247 248 249 250 251 252 253
	offset = (pos & ~PAGE_MASK);
	pos = pos & PAGE_MASK;
	len = PAGE_SIZE - offset;
	do {
		struct page * page;

		if (len > count)
			len = count;
		page = find_page(inode, pos);
		if (page) {
			wait_on_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
254 255
			memcpy((void *) (offset + page_address(page)), buf, len);
			release_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
256 257 258 259 260 261 262
		}
		count -= len;
		buf += len;
		len = PAGE_SIZE;
		offset = 0;
		pos += PAGE_SIZE;
	} while (count);
Linus Torvalds's avatar
Linus Torvalds committed
263 264
}

Linus Torvalds's avatar
Linus Torvalds committed
265
static inline void add_to_page_cache(struct page * page,
Linus Torvalds's avatar
Linus Torvalds committed
266 267
	struct inode * inode, unsigned long offset,
	struct page **hash)
Linus Torvalds's avatar
Linus Torvalds committed
268
{
Linus Torvalds's avatar
Linus Torvalds committed
269
	atomic_inc(&page->count);
Linus Torvalds's avatar
Linus Torvalds committed
270 271 272
	page->flags &= ~((1 << PG_uptodate) | (1 << PG_error));
	page->offset = offset;
	add_page_to_inode_queue(inode, page);
Linus Torvalds's avatar
Linus Torvalds committed
273
	__add_page_to_hash_queue(page, hash);
Linus Torvalds's avatar
Linus Torvalds committed
274 275
}

Linus Torvalds's avatar
Linus Torvalds committed
276
/*
Linus Torvalds's avatar
Linus Torvalds committed
277 278 279
 * Try to read ahead in the file. "page_cache" is a potentially free page
 * that we could use for the cache (if it is 0 we can try to create one,
 * this is all overlapped with the IO on the previous page finishing anyway)
Linus Torvalds's avatar
Linus Torvalds committed
280
 */
Linus Torvalds's avatar
Linus Torvalds committed
281
static unsigned long try_to_read_ahead(struct file * file,
Linus Torvalds's avatar
Linus Torvalds committed
282
				unsigned long offset, unsigned long page_cache)
Linus Torvalds's avatar
Linus Torvalds committed
283
{
Linus Torvalds's avatar
Linus Torvalds committed
284
	struct inode *inode = file->f_dentry->d_inode;
Linus Torvalds's avatar
Linus Torvalds committed
285
	struct page * page;
Linus Torvalds's avatar
Linus Torvalds committed
286
	struct page ** hash;
Linus Torvalds's avatar
Linus Torvalds committed
287

Linus Torvalds's avatar
Linus Torvalds committed
288
	offset &= PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
289 290
	switch (page_cache) {
	case 0:
Linus Torvalds's avatar
Linus Torvalds committed
291 292
		page_cache = __get_free_page(GFP_KERNEL);
		if (!page_cache)
Linus Torvalds's avatar
Linus Torvalds committed
293 294 295 296 297 298 299 300 301 302 303 304
			break;
	default:
		if (offset >= inode->i_size)
			break;
		hash = page_hash(inode, offset);
		page = __find_page(inode, offset, *hash);
		if (!page) {
			/*
			 * Ok, add the new page to the hash-queues...
			 */
			page = mem_map + MAP_NR(page_cache);
			add_to_page_cache(page, inode, offset, hash);
Linus Torvalds's avatar
Linus Torvalds committed
305
			inode->i_op->readpage(file, page);
Linus Torvalds's avatar
Linus Torvalds committed
306 307
			page_cache = 0;
		}
Linus Torvalds's avatar
Linus Torvalds committed
308
		release_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
309 310 311 312
	}
	return page_cache;
}

Linus Torvalds's avatar
Linus Torvalds committed
313 314
/* 
 * Wait for IO to complete on a locked page.
Linus Torvalds's avatar
Linus Torvalds committed
315 316 317 318
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
Linus Torvalds's avatar
Linus Torvalds committed
319 320 321
 */
void __wait_on_page(struct page *page)
{
Linus Torvalds's avatar
Linus Torvalds committed
322 323
	struct task_struct *tsk = current;
	struct wait_queue wait;
Linus Torvalds's avatar
Linus Torvalds committed
324

Linus Torvalds's avatar
Linus Torvalds committed
325
	wait.task = tsk;
Linus Torvalds's avatar
Linus Torvalds committed
326 327
	add_wait_queue(&page->wait, &wait);
repeat:
Linus Torvalds's avatar
Linus Torvalds committed
328
	tsk->state = TASK_UNINTERRUPTIBLE;
Linus Torvalds's avatar
Linus Torvalds committed
329
	run_task_queue(&tq_disk);
Linus Torvalds's avatar
Linus Torvalds committed
330
	if (PageLocked(page)) {
Linus Torvalds's avatar
Linus Torvalds committed
331 332 333
		schedule();
		goto repeat;
	}
Linus Torvalds's avatar
Linus Torvalds committed
334
	tsk->state = TASK_RUNNING;
Linus Torvalds's avatar
Linus Torvalds committed
335 336 337
	remove_wait_queue(&page->wait, &wait);
}

Linus Torvalds's avatar
Linus Torvalds committed
338 339 340 341
#if 0
#define PROFILE_READAHEAD
#define DEBUG_READAHEAD
#endif
Linus Torvalds's avatar
Linus Torvalds committed
342

Linus Torvalds's avatar
Linus Torvalds committed
343
/*
Linus Torvalds's avatar
Linus Torvalds committed
344 345 346
 * Read-ahead profiling information
 * --------------------------------
 * Every PROFILE_MAXREADCOUNT, the following information is written 
Linus Torvalds's avatar
Linus Torvalds committed
347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389
 * to the syslog:
 *   Percentage of asynchronous read-ahead.
 *   Average of read-ahead fields context value.
 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written 
 * to the syslog.
 */

#ifdef PROFILE_READAHEAD

#define PROFILE_MAXREADCOUNT 1000

static unsigned long total_reada;
static unsigned long total_async;
static unsigned long total_ramax;
static unsigned long total_ralen;
static unsigned long total_rawin;

static void profile_readahead(int async, struct file *filp)
{
	unsigned long flags;

	++total_reada;
	if (async)
		++total_async;

	total_ramax	+= filp->f_ramax;
	total_ralen	+= filp->f_ralen;
	total_rawin	+= filp->f_rawin;

	if (total_reada > PROFILE_MAXREADCOUNT) {
		save_flags(flags);
		cli();
		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
			restore_flags(flags);
			return;
		}

		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
			total_ramax/total_reada,
			total_ralen/total_reada,
			total_rawin/total_reada,
			(total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
Linus Torvalds's avatar
Linus Torvalds committed
390 391
		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%ld\n",
			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
Linus Torvalds's avatar
Linus Torvalds committed
392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408
#endif

		total_reada	= 0;
		total_async	= 0;
		total_ramax	= 0;
		total_ralen	= 0;
		total_rawin	= 0;

		restore_flags(flags);
	}
}
#endif  /* defined PROFILE_READAHEAD */

/*
 * Read-ahead context:
 * -------------------
 * The read ahead context fields of the "struct file" are the following:
Linus Torvalds's avatar
Linus Torvalds committed
409
 * - f_raend : position of the first byte after the last page we tried to
Linus Torvalds's avatar
Linus Torvalds committed
410 411 412 413 414 415 416 417
 *             read ahead.
 * - f_ramax : current read-ahead maximum size.
 * - f_ralen : length of the current IO read block we tried to read-ahead.
 * - f_rawin : length of the current read-ahead window.
 *             if last read-ahead was synchronous then
 *                  f_rawin = f_ralen
 *             otherwise (was asynchronous)
 *                  f_rawin = previous value of f_ralen + f_ralen
Linus Torvalds's avatar
Linus Torvalds committed
418
 *
Linus Torvalds's avatar
Linus Torvalds committed
419 420 421 422 423 424 425
 * Read-ahead limits:
 * ------------------
 * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
 * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
 *
 * Synchronous read-ahead benefits:
 * --------------------------------
Linus Torvalds's avatar
Linus Torvalds committed
426
 * Using reasonable IO xfer length from peripheral devices increase system 
Linus Torvalds's avatar
Linus Torvalds committed
427
 * performances.
Linus Torvalds's avatar
Linus Torvalds committed
428
 * Reasonable means, in this context, not too large but not too small.
Linus Torvalds's avatar
Linus Torvalds committed
429 430
 * The actual maximum value is:
 *	MAX_READAHEAD + PAGE_SIZE = 76k if CONFIG_READA_SMALL is undefined
 *      and 32K if defined (4K page size assumed).
Linus Torvalds's avatar
Linus Torvalds committed
432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451
 *
 * Asynchronous read-ahead benefits:
 * ---------------------------------
 * Overlapping next read request and user process execution increase system 
 * performance.
 *
 * Read-ahead risks:
 * -----------------
 * We have to guess which further data are needed by the user process.
 * If these data are often not really needed, it's bad for system 
 * performances.
 * However, we know that files are often accessed sequentially by 
 * application programs and it seems that it is possible to have some good 
 * strategy in that guessing.
 * We only try to read-ahead files that seems to be read sequentially.
 *
 * Asynchronous read-ahead risks:
 * ------------------------------
 * In order to maximize overlapping, we must start some asynchronous read 
 * request from the device, as soon as possible.
Linus Torvalds's avatar
Linus Torvalds committed
452
 * We must be very careful about:
Linus Torvalds's avatar
Linus Torvalds committed
453
 * - The number of effective pending IO read requests.
Linus Torvalds's avatar
Linus Torvalds committed
454
 *   ONE seems to be the only reasonable value.
Linus Torvalds's avatar
Linus Torvalds committed
455
 * - The total memory pool usage for the file access stream.
Linus Torvalds's avatar
Linus Torvalds committed
456
 *   This maximum memory usage is implicitly 2 IO read chunks:
Linus Torvalds's avatar
Linus Torvalds committed
457
 *   2*(MAX_READAHEAD + PAGE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
Linus Torvalds's avatar
Linus Torvalds committed
458
 *   64k if defined (4K page size assumed).
Linus Torvalds's avatar
Linus Torvalds committed
459
 */
Linus Torvalds's avatar
Linus Torvalds committed
460

Linus Torvalds's avatar
Linus Torvalds committed
461 462 463 464 465 466 467
static inline int get_max_readahead(struct inode * inode)
{
	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
		return MAX_READAHEAD;
	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}

Linus Torvalds's avatar
Linus Torvalds committed
468 469 470
static inline unsigned long generic_file_readahead(int reada_ok,
	struct file * filp, struct inode * inode,
	unsigned long ppos, struct page * page, unsigned long page_cache)
Linus Torvalds's avatar
Linus Torvalds committed
471 472
{
	unsigned long max_ahead, ahead;
Linus Torvalds's avatar
Linus Torvalds committed
473
	unsigned long raend;
Linus Torvalds's avatar
Linus Torvalds committed
474
	int max_readahead = get_max_readahead(inode);
Linus Torvalds's avatar
Linus Torvalds committed
475

Linus Torvalds's avatar
Linus Torvalds committed
476
	raend = filp->f_raend & PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
477
	max_ahead = 0;
Linus Torvalds's avatar
Linus Torvalds committed
478

Linus Torvalds's avatar
Linus Torvalds committed
479
/*
Linus Torvalds's avatar
Linus Torvalds committed
480 481 482 483 484 485
 * The current page is locked.
 * If the current position is inside the previous read IO request, do not
 * try to reread previously read ahead pages.
 * Otherwise decide or not to read ahead some pages synchronously.
 * If we are not going to read ahead, set the read ahead context for this 
 * page only.
Linus Torvalds's avatar
Linus Torvalds committed
486 487
 */
	if (PageLocked(page)) {
Linus Torvalds's avatar
Linus Torvalds committed
488 489 490
		if (!filp->f_ralen || ppos >= raend || ppos + filp->f_ralen < raend) {
			raend = ppos;
			if (raend < inode->i_size)
Linus Torvalds's avatar
Linus Torvalds committed
491 492 493
				max_ahead = filp->f_ramax;
			filp->f_rawin = 0;
			filp->f_ralen = PAGE_SIZE;
Linus Torvalds's avatar
Linus Torvalds committed
494 495 496 497
			if (!max_ahead) {
				filp->f_raend  = ppos + filp->f_ralen;
				filp->f_rawin += filp->f_ralen;
			}
Linus Torvalds's avatar
Linus Torvalds committed
498
		}
Linus Torvalds's avatar
Linus Torvalds committed
499 500
	}
/*
Linus Torvalds's avatar
Linus Torvalds committed
501 502 503 504 505 506
 * The current page is not locked.
 * If we were reading ahead and,
 * if the current max read ahead size is not zero and,
 * if the current position is inside the last read-ahead IO request,
 *   it is the moment to try to read ahead asynchronously.
 * We will later force unplug device in order to force asynchronous read IO.
Linus Torvalds's avatar
Linus Torvalds committed
507
 */
Linus Torvalds's avatar
Linus Torvalds committed
508 509
	else if (reada_ok && filp->f_ramax && raend >= PAGE_SIZE &&
	         ppos <= raend && ppos + filp->f_ralen >= raend) {
Linus Torvalds's avatar
Linus Torvalds committed
510
/*
Linus Torvalds's avatar
Linus Torvalds committed
511 512
 * Add ONE page to max_ahead in order to try to have about the same IO max size
 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_SIZE.
Linus Torvalds's avatar
Linus Torvalds committed
513 514
 * Compute the position of the last page we have tried to read in order to 
 * begin to read ahead just at the next page.
Linus Torvalds's avatar
Linus Torvalds committed
515
 */
Linus Torvalds's avatar
Linus Torvalds committed
516 517
		raend -= PAGE_SIZE;
		if (raend < inode->i_size)
Linus Torvalds's avatar
Linus Torvalds committed
518
			max_ahead = filp->f_ramax + PAGE_SIZE;
Linus Torvalds's avatar
Linus Torvalds committed
519 520 521 522

		if (max_ahead) {
			filp->f_rawin = filp->f_ralen;
			filp->f_ralen = 0;
Linus Torvalds's avatar
Linus Torvalds committed
523
			reada_ok      = 2;
Linus Torvalds's avatar
Linus Torvalds committed
524 525 526
		}
	}
/*
Linus Torvalds's avatar
Linus Torvalds committed
527 528 529
 * Try to read ahead pages.
 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
 * scheduler, will work enough for us to avoid too bad actuals IO requests.
Linus Torvalds's avatar
Linus Torvalds committed
530 531 532 533
 */
	ahead = 0;
	while (ahead < max_ahead) {
		ahead += PAGE_SIZE;
Linus Torvalds's avatar
Linus Torvalds committed
534
		page_cache = try_to_read_ahead(filp, raend + ahead,
Linus Torvalds's avatar
Linus Torvalds committed
535
						page_cache);
Linus Torvalds's avatar
Linus Torvalds committed
536 537
	}
/*
Linus Torvalds's avatar
Linus Torvalds committed
538 539 540 541
 * If we tried to read ahead some pages,
 * If we tried to read ahead asynchronously,
 *   Try to force unplug of the device in order to start an asynchronous
 *   read IO request.
Linus Torvalds's avatar
Linus Torvalds committed
542
 * Update the read-ahead context.
Linus Torvalds's avatar
Linus Torvalds committed
543
 * Store the length of the current read-ahead window.
Linus Torvalds's avatar
Linus Torvalds committed
544 545
 * Double the current max read ahead size.
 *   That heuristic avoid to do some large IO for files that are not really
Linus Torvalds's avatar
Linus Torvalds committed
546
 *   accessed sequentially.
Linus Torvalds's avatar
Linus Torvalds committed
547
 */
Linus Torvalds's avatar
Linus Torvalds committed
548
	if (ahead) {
Linus Torvalds's avatar
Linus Torvalds committed
549 550 551 552
		if (reada_ok == 2) {
			run_task_queue(&tq_disk);
		}

Linus Torvalds's avatar
Linus Torvalds committed
553 554
		filp->f_ralen += ahead;
		filp->f_rawin += filp->f_ralen;
Linus Torvalds's avatar
Linus Torvalds committed
555
		filp->f_raend = raend + ahead + PAGE_SIZE;
Linus Torvalds's avatar
Linus Torvalds committed
556

Linus Torvalds's avatar
Linus Torvalds committed
557
		filp->f_ramax += filp->f_ramax;
Linus Torvalds's avatar
Linus Torvalds committed
558

Linus Torvalds's avatar
Linus Torvalds committed
559 560
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
Linus Torvalds's avatar
Linus Torvalds committed
561

Linus Torvalds's avatar
Linus Torvalds committed
562
#ifdef PROFILE_READAHEAD
Linus Torvalds's avatar
Linus Torvalds committed
563
		profile_readahead((reada_ok == 2), filp);
Linus Torvalds's avatar
Linus Torvalds committed
564
#endif
Linus Torvalds's avatar
Linus Torvalds committed
565
	}
Linus Torvalds's avatar
Linus Torvalds committed
566

Linus Torvalds's avatar
Linus Torvalds committed
567 568 569
	return page_cache;
}

Linus Torvalds's avatar
Linus Torvalds committed
570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586
/*
 * "descriptor" for what we're up to with a read.
 * This allows us to use the same read code yet
 * have multiple different users of the data that
 * we read from a file.
 *
 * The simplest case just copies the data to user
 * mode.
 */
typedef struct {
	size_t written;
	size_t count;
	char * buf;
	int error;
} read_descriptor_t;

typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);
Linus Torvalds's avatar
Linus Torvalds committed
587

Linus Torvalds's avatar
Linus Torvalds committed
588 589 590 591 592 593 594 595
/*
 * This is a generic file read routine, and uses the
 * inode->i_op->readpage() function for the actual low-level
 * stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
Linus Torvalds's avatar
Linus Torvalds committed
596
static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
	struct dentry *dentry = filp->f_dentry;
	struct inode *inode = dentry->d_inode;
	size_t pos, pgpos, page_cache;
	int reada_ok;
	int max_readahead = get_max_readahead(inode);

	page_cache = 0;

	pos = *ppos;
	pgpos = pos & PAGE_MASK;
/*
 * If the current position is outside the previous read-ahead window,
 * we reset the current read-ahead context and set read ahead max to zero
 * (will be set to just needed value later),
 * otherwise, we assume that the file accesses are sequential enough to
 * continue read-ahead.
 */
	if (pgpos > filp->f_raend || pgpos + filp->f_rawin < filp->f_raend) {
		reada_ok = 0;
		filp->f_raend = 0;
		filp->f_ralen = 0;
		filp->f_ramax = 0;
		filp->f_rawin = 0;
	} else {
		reada_ok = 1;
	}
/*
 * Adjust the current value of read-ahead max.
 * If the read operation stay in the first half page, force no readahead.
 * Otherwise try to increase read ahead max just enough to do the read request.
 * Then, at least MIN_READAHEAD if read ahead is ok,
 * and at most MAX_READAHEAD in all cases.
 */
	if (pos + desc->count <= (PAGE_SIZE >> 1)) {
		filp->f_ramax = 0;
	} else {
		unsigned long needed;

		needed = ((pos + desc->count) & PAGE_MASK) - pgpos;

		if (filp->f_ramax < needed)
			filp->f_ramax = needed;

		if (reada_ok && filp->f_ramax < MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;
		if (filp->f_ramax > max_readahead)
			filp->f_ramax = max_readahead;
	}

	for (;;) {
		struct page *page, **hash;

		if (pos >= inode->i_size)
			break;

		/*
		 * Try to find the data in the page cache..
		 */
		hash = page_hash(inode, pos & PAGE_MASK);
		page = __find_page(inode, pos & PAGE_MASK, *hash);
		if (!page)
			goto no_cached_page;

found_page:
/*
 * Try to read ahead only if the current page is filled or being filled.
 * Otherwise, if we were reading ahead, decrease max read ahead size to
 * the minimum value.
 * In this context, that seems to may happen only on some read error or if
 * the page has been rewritten.
 */
		if (PageUptodate(page) || PageLocked(page))
			page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_MASK, page, page_cache);
		else if (reada_ok && filp->f_ramax > MIN_READAHEAD)
				filp->f_ramax = MIN_READAHEAD;

		wait_on_page(page);

		if (!PageUptodate(page))
			goto page_read_error;

success:
		/*
		 * Ok, we have the page, it's up-to-date and ok,
		 * so now we can finally copy it to user space...
		 */
	{
		unsigned long offset, nr;

		offset = pos & ~PAGE_MASK;
		nr = PAGE_SIZE - offset;
		if (nr > inode->i_size - pos)
			nr = inode->i_size - pos;

		/*
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		nr = actor(desc, (const char *) (page_address(page) + offset), nr);
		pos += nr;
		release_page(page);
		if (nr && desc->count)
			continue;
		break;
	}

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		if (!page_cache) {
			page_cache = __get_free_page(GFP_KERNEL);
			/*
			 * That could have slept, so go around to the
			 * very beginning..
			 */
			if (page_cache)
				continue;
			desc->error = -ENOMEM;
			break;
		}

		/*
		 * Ok, add the new page to the hash-queues...
		 */
		page = mem_map + MAP_NR(page_cache);
		page_cache = 0;
		add_to_page_cache(page, inode, pos & PAGE_MASK, hash);

		/*
		 * Error handling is tricky. If we get a read error,
		 * the cached page stays in the cache (but uptodate=0),
		 * and the next process that accesses it will try to
		 * re-read it. This is needed for NFS etc, where the
		 * identity of the reader can decide if we can read the
		 * page or not..
		 */
/*
 * We have to read the page.
 * If we were reading ahead, we had previously tried to read this page,
 * That means that the page has probably been removed from the cache before
 * the application process needs it, or has been rewritten.
 * Decrease max readahead size to the minimum value in that situation.
 */
		if (reada_ok && filp->f_ramax > MIN_READAHEAD)
			filp->f_ramax = MIN_READAHEAD;

		{
			int error = inode->i_op->readpage(filp, page);
			if (!error)
				goto found_page;
			desc->error = error;
			release_page(page);
			break;
		}

page_read_error:
		/*
		 * We found the page, but it wasn't up-to-date.
		 * Try to re-read it _once_. We do this synchronously,
		 * because this happens only if there were errors.
		 */
		{
			int error = inode->i_op->readpage(filp, page);
			if (!error) {
				wait_on_page(page);
				if (PageUptodate(page) && !PageError(page))
					goto success;
				error = -EIO; /* Some unspecified error occurred.. */
			}
			desc->error = error;
			release_page(page);
			break;
		}
	}

	*ppos = pos;
	filp->f_reada = 1;
	if (page_cache)
		free_page(page_cache);
	UPDATE_ATIME(inode);
}

/*
 * Default read actor: copy page-cache data straight to user space.
 * Returns the number of bytes actually consumed from 'area'.
 */
static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	unsigned long not_copied;
	unsigned long remaining = desc->count;

	/* Never consume more than the descriptor still wants. */
	if (size > remaining)
		size = remaining;

	/* Non-zero return from __copy_to_user means a partial copy. */
	not_copied = __copy_to_user(desc->buf, area, size);
	if (not_copied) {
		size -= not_copied;
		desc->error = -EFAULT;
	}

	desc->count = remaining - size;
	desc->written += size;
	desc->buf += size;
	return size;
}

/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
/*
 * This is the "read()" routine for all filesystems
 * that can use the page cache directly.
 */
ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	read_descriptor_t desc;

	/* The whole destination range must be writable user memory. */
	if (!access_ok(VERIFY_WRITE, buf, count))
		return -EFAULT;
	if (!count)
		return 0;

	desc.written = 0;
	desc.count = count;
	desc.buf = buf;
	desc.error = 0;
	do_generic_file_read(filp, ppos, &desc, file_read_actor);

	/* Report bytes transferred; surface an error only if nothing was. */
	if (desc.written)
		return desc.written;
	return desc.error;
}

/*
 * sendfile() actor: write page-cache data to the output file stored
 * (cast) in desc->buf.  Runs with the output inode semaphore held so
 * concurrent writers are excluded for the duration of the write.
 *
 * Fix: save and restore the caller's address-space segment around the
 * KERNEL_DS override instead of unconditionally restoring USER_DS.
 * This matches do_write_page() in this file and stays correct even if
 * the caller was already running with KERNEL_DS.
 */
static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
{
	ssize_t written;
	unsigned long count = desc->count;
	struct file *file = (struct file *) desc->buf;
	struct inode *inode = file->f_dentry->d_inode;
	mm_segment_t old_fs;

	if (size > count)
		size = count;
	down(&inode->i_sem);
	old_fs = get_fs();
	/* 'area' is a kernel pointer; lift the user-space access checks. */
	set_fs(KERNEL_DS);
	written = file->f_op->write(file, area, size, &file->f_pos);
	set_fs(old_fs);
	up(&inode->i_sem);
	if (written < 0) {
		desc->error = written;
		written = 0;
	}
	desc->count = count - written;
	desc->written += written;
	return written;
}

/*
 * sendfile(2): copy 'count' bytes from in_fd (which must support the
 * page cache, i.e. have a readpage op) to out_fd's write method.
 * Classic goto-based cleanup: each failure path drops exactly the
 * references taken so far.
 */
asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, size_t count)
{
	ssize_t retval;
	struct file * in_file, * out_file;
	struct inode * in_inode, * out_inode;

	lock_kernel();

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in_file = fget(in_fd);
	if (!in_file)
		goto out;
	if (!(in_file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -EINVAL;
	in_inode = in_file->f_dentry->d_inode;
	if (!in_inode)
		goto fput_in;
	if (!in_inode->i_op || !in_inode->i_op->readpage)
		goto fput_in;
	retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
	if (retval)
		goto fput_in;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out_file = fget(out_fd);
	if (!out_file)
		goto fput_in;
	if (!(out_file->f_mode & FMODE_WRITE))
		goto fput_out;
	retval = -EINVAL;
	if (!out_file->f_op || !out_file->f_op->write)
		goto fput_out;
	out_inode = out_file->f_dentry->d_inode;
	if (!out_inode)
		goto fput_out;
	retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
	if (retval)
		goto fput_out;

	retval = 0;
	if (count) {
		read_descriptor_t desc;

		/* Smuggle the output file to file_send_actor via 'buf'. */
		desc.written = 0;
		desc.count = count;
		desc.buf = (char *) out_file;
		desc.error = 0;
		do_generic_file_read(in_file, &in_file->f_pos, &desc, file_send_actor);

		retval = desc.written;
		if (!retval)
			retval = desc.error;
	}


fput_out:
	fput(out_file);
fput_in:
	fput(in_file);
out:
	unlock_kernel();
	return retval;
}

Linus Torvalds's avatar
Linus Torvalds committed
925
/*
Linus Torvalds's avatar
Linus Torvalds committed
926 927 928 929 930 931 932
 * Semantics for shared and private memory areas are different past the end
 * of the file. A shared mapping past the last page of the file is an error
 * and results in a SIGBUS, while a private mapping just maps in a zero page.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
Linus Torvalds's avatar
Linus Torvalds committed
933 934 935
 *
 * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
 * ahead of the wait if we're sure to need it.
Linus Torvalds's avatar
Linus Torvalds committed
936
 */
Linus Torvalds's avatar
Linus Torvalds committed
937
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
Linus Torvalds's avatar
Linus Torvalds committed
938
{
Linus Torvalds's avatar
Linus Torvalds committed
939 940
	struct file * file = area->vm_file;
	struct dentry * dentry = file->f_dentry;
Linus Torvalds's avatar
Linus Torvalds committed
941
	struct inode * inode = dentry->d_inode;
Linus Torvalds's avatar
Linus Torvalds committed
942
	unsigned long offset;
Linus Torvalds's avatar
Linus Torvalds committed
943
	struct page * page, **hash;
Linus Torvalds's avatar
Linus Torvalds committed
944 945 946 947 948 949
	unsigned long old_page, new_page;

	new_page = 0;
	offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
	if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
		goto no_page;
Linus Torvalds's avatar
Linus Torvalds committed
950

Linus Torvalds's avatar
Linus Torvalds committed
951 952 953
	/*
	 * Do we have something in the page cache already?
	 */
Linus Torvalds's avatar
Linus Torvalds committed
954 955
	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
Linus Torvalds's avatar
Linus Torvalds committed
956 957 958 959 960 961
	if (!page)
		goto no_cached_page;

found_page:
	/*
	 * Ok, found a page in the page cache, now we need to check
Linus Torvalds's avatar
Linus Torvalds committed
962 963
	 * that it's up-to-date.  First check whether we'll need an
	 * extra page -- better to overlap the allocation with the I/O.
Linus Torvalds's avatar
Linus Torvalds committed
964
	 */
Linus Torvalds's avatar
Linus Torvalds committed
965 966 967 968 969 970
	if (no_share && !new_page) {
		new_page = __get_free_page(GFP_KERNEL);
		if (!new_page)
			goto failure;
	}

Linus Torvalds's avatar
Linus Torvalds committed
971 972
	if (PageLocked(page))
		goto page_locked_wait;
Linus Torvalds's avatar
Linus Torvalds committed
973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994
	if (!PageUptodate(page))
		goto page_read_error;

success:
	/*
	 * Found the page, need to check sharing and possibly
	 * copy it over to another page..
	 */
	old_page = page_address(page);
	if (!no_share) {
		/*
		 * Ok, we can share the cached page directly.. Get rid
		 * of any potential extra pages.
		 */
		if (new_page)
			free_page(new_page);

		flush_page_to_ram(old_page);
		return old_page;
	}

	/*
Linus Torvalds's avatar
Linus Torvalds committed
995
	 * No sharing ... copy to the new page.
Linus Torvalds's avatar
Linus Torvalds committed
996
	 */
Linus Torvalds's avatar
Linus Torvalds committed
997
	copy_page(new_page, old_page);
Linus Torvalds's avatar
Linus Torvalds committed
998 999 1000 1001 1002
	flush_page_to_ram(new_page);
	release_page(page);
	return new_page;

no_cached_page:
Linus Torvalds's avatar
Linus Torvalds committed
1003
	new_page = __get_free_page(GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
1004 1005 1006 1007 1008 1009 1010 1011 1012
	if (!new_page)
		goto no_page;

	/*
	 * During getting the above page we might have slept,
	 * so we need to re-check the situation with the page
	 * cache.. The page we just got may be useful if we
	 * can't share, so don't get rid of it here.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
1013
	page = find_page(inode, offset);
Linus Torvalds's avatar
Linus Torvalds committed
1014
	if (page)
Linus Torvalds's avatar
Linus Torvalds committed
1015
		goto found_page;
Linus Torvalds's avatar
Linus Torvalds committed
1016 1017 1018 1019

	/*
	 * Now, create a new page-cache page from the page we got
	 */
Linus Torvalds's avatar
Linus Torvalds committed
1020 1021
	page = mem_map + MAP_NR(new_page);
	new_page = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1022
	add_to_page_cache(page, inode, offset, hash);
Linus Torvalds's avatar
Linus Torvalds committed
1023

Linus Torvalds's avatar
Linus Torvalds committed
1024
	if (inode->i_op->readpage(file, page) != 0)
Linus Torvalds's avatar
Linus Torvalds committed
1025 1026 1027 1028 1029
		goto failure;

	/*
	 * Do a very limited read-ahead if appropriate
	 */
Linus Torvalds's avatar
Linus Torvalds committed
1030
	if (PageLocked(page))
Linus Torvalds's avatar
Linus Torvalds committed
1031
		new_page = try_to_read_ahead(file, offset + PAGE_SIZE, 0);
Linus Torvalds's avatar
Linus Torvalds committed
1032
	goto found_page;
Linus Torvalds's avatar
Linus Torvalds committed
1033

Linus Torvalds's avatar
Linus Torvalds committed
1034 1035 1036 1037 1038
page_locked_wait:
	__wait_on_page(page);
	if (PageUptodate(page))
		goto success;
	
Linus Torvalds's avatar
Linus Torvalds committed
1039 1040 1041
page_read_error:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
Linus Torvalds's avatar
Linus Torvalds committed
1042 1043 1044
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
Linus Torvalds's avatar
Linus Torvalds committed
1045
	 */
Linus Torvalds's avatar
Linus Torvalds committed
1046
	if (inode->i_op->readpage(file, page) != 0)
Linus Torvalds's avatar
Linus Torvalds committed
1047
		goto failure;
Linus Torvalds's avatar
Linus Torvalds committed
1048
	wait_on_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
1049 1050 1051 1052
	if (PageError(page))
		goto failure;
	if (PageUptodate(page))
		goto success;
Linus Torvalds's avatar
Linus Torvalds committed
1053

Linus Torvalds's avatar
Linus Torvalds committed
1054
	/*
Linus Torvalds's avatar
Linus Torvalds committed
1055
	 * Things didn't work out. Return zero to tell the
Linus Torvalds's avatar
Linus Torvalds committed
1056 1057 1058 1059
	 * mm layer so, possibly freeing the page cache page first.
	 */
failure:
	release_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
1060 1061
	if (new_page)
		free_page(new_page);
Linus Torvalds's avatar
Linus Torvalds committed
1062 1063
no_page:
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
1064
}
Linus Torvalds's avatar
Linus Torvalds committed
1065

Linus Torvalds's avatar
Linus Torvalds committed
1066
/*
Linus Torvalds's avatar
Linus Torvalds committed
1067 1068
 * Tries to write a shared mapped page to its backing store. May return -EIO
 * if the disk is full.
Linus Torvalds's avatar
Linus Torvalds committed
1069
 */
Linus Torvalds's avatar
Linus Torvalds committed
1070 1071 1072
static inline int do_write_page(struct inode * inode, struct file * file,
	const char * page, unsigned long offset)
{
Linus Torvalds's avatar
Linus Torvalds committed
1073
	int retval;
Linus Torvalds's avatar
Linus Torvalds committed
1074
	unsigned long size;
1075
	loff_t loff = offset;
Linus Torvalds's avatar
Linus Torvalds committed
1076
	mm_segment_t old_fs;
Linus Torvalds's avatar
Linus Torvalds committed
1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090

	size = offset + PAGE_SIZE;
	/* refuse to extend file size.. */
	if (S_ISREG(inode->i_mode)) {
		if (size > inode->i_size)
			size = inode->i_size;
		/* Ho humm.. We should have tested for this earlier */
		if (size < offset)
			return -EIO;
	}
	size -= offset;
	old_fs = get_fs();
	set_fs(KERNEL_DS);
	retval = -EIO;
1091
	if (size == file->f_op->write(file, (const char *) page, size, &loff))
Linus Torvalds's avatar
Linus Torvalds committed
1092 1093 1094 1095 1096
		retval = 0;
	set_fs(old_fs);
	return retval;
}

Linus Torvalds's avatar
Linus Torvalds committed
1097
static int filemap_write_page(struct vm_area_struct * vma,
Linus Torvalds's avatar
Linus Torvalds committed
1098 1099 1100
	unsigned long offset,
	unsigned long page)
{
Linus Torvalds's avatar
Linus Torvalds committed
1101
	int result;
Linus Torvalds's avatar
Linus Torvalds committed
1102
	struct file * file;
Linus Torvalds's avatar
Linus Torvalds committed
1103
	struct dentry * dentry;
Linus Torvalds's avatar
Linus Torvalds committed
1104
	struct inode * inode;
Linus Torvalds's avatar
Linus Torvalds committed
1105 1106
	struct buffer_head * bh;

Linus Torvalds's avatar
Linus Torvalds committed
1107
	bh = mem_map[MAP_NR(page)].buffers;
Linus Torvalds's avatar
Linus Torvalds committed
1108 1109 1110 1111
	if (bh) {
		/* whee.. just mark the buffer heads dirty */
		struct buffer_head * tmp = bh;
		do {
Linus Torvalds's avatar
Linus Torvalds committed
1112 1113 1114 1115
			/*
			 * WSH: There's a race here: mark_buffer_dirty()
			 * could block, and the buffers aren't pinned down.
			 */
Linus Torvalds's avatar
Linus Torvalds committed
1116 1117 1118
			mark_buffer_dirty(tmp, 0);
			tmp = tmp->b_this_page;
		} while (tmp != bh);
Linus Torvalds's avatar
Linus Torvalds committed
1119
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
1120
	}
Linus Torvalds's avatar
Linus Torvalds committed
1121

Linus Torvalds's avatar
Linus Torvalds committed
1122 1123
	file = vma->vm_file;
	dentry = file->f_dentry;
Linus Torvalds's avatar
Linus Torvalds committed
1124
	inode = dentry->d_inode;
Linus Torvalds's avatar
Linus Torvalds committed
1125
	if (!file->f_op->write)
Linus Torvalds's avatar
Linus Torvalds committed
1126
		return -EIO;
Linus Torvalds's avatar
Linus Torvalds committed
1127

Linus Torvalds's avatar
Linus Torvalds committed
1128
	/*
Linus Torvalds's avatar
Linus Torvalds committed
1129
	 * If a task terminates while we're swapping the page, the vma and
Linus Torvalds's avatar
Linus Torvalds committed
1130
	 * and file could be released ... increment the count to be safe.
Linus Torvalds's avatar
Linus Torvalds committed
1131
	 */
Linus Torvalds's avatar
Linus Torvalds committed
1132
	file->f_count++;
Linus Torvalds's avatar
Linus Torvalds committed
1133
	down(&inode->i_sem);
Linus Torvalds's avatar
Linus Torvalds committed
1134
	result = do_write_page(inode, file, (const char *) page, offset);
Linus Torvalds's avatar
Linus Torvalds committed
1135
	up(&inode->i_sem);
Linus Torvalds's avatar
Linus Torvalds committed
1136
	fput(file);
Linus Torvalds's avatar
Linus Torvalds committed
1137
	return result;
Linus Torvalds's avatar
Linus Torvalds committed
1138 1139
}

Linus Torvalds's avatar
Linus Torvalds committed
1140

Linus Torvalds's avatar
Linus Torvalds committed
1141 1142 1143 1144 1145 1146 1147 1148
/*
 * Swapping to a shared file: while we're busy writing out the page
 * (and the page still exists in memory), we save the page information
 * in the page table, so that "filemap_swapin()" can re-use the page
 * immediately if it is called while we're busy swapping it out..
 *
 * Once we've written it all out, we mark the page entry "empty", which
 * will result in a normal page-in (instead of a swap-in) from the now
Linus Torvalds's avatar
Linus Torvalds committed
1149
 * up-to-date disk file.
Linus Torvalds's avatar
Linus Torvalds committed
1150
 */
Linus Torvalds's avatar
Linus Torvalds committed
1151
int filemap_swapout(struct vm_area_struct * vma,
	unsigned long offset,
	pte_t *page_table)
{
	int error;
	unsigned long page = pte_page(*page_table);
	unsigned long entry = SWP_ENTRY(SHM_SWP_TYPE, MAP_NR(page));

	/*
	 * Park a fake swap entry in the pte while the page is being
	 * written out, so filemap_swapin() can recover the page if it
	 * is touched in the meantime.
	 */
	flush_cache_page(vma, (offset + vma->vm_start - vma->vm_offset));
	set_pte(page_table, __pte(entry));
	flush_tlb_page(vma, (offset + vma->vm_start - vma->vm_offset));
	error = filemap_write_page(vma, offset, page);
	/* Only clear if nobody re-faulted the page while we wrote it. */
	if (pte_val(*page_table) == entry)
		pte_clear(page_table);
	return error;
}

/*
 * filemap_swapin() is called only if we have something in the page
 * tables that is non-zero (but not present), which we know to be the
 * page index of a page that is busy being swapped out (see above).
 * So we just use it directly..
 */
static pte_t filemap_swapin(struct vm_area_struct * vma,
	unsigned long offset,
	unsigned long entry)
{
	unsigned long page = SWP_OFFSET(entry);

Linus Torvalds's avatar
Linus Torvalds committed
1180
	atomic_inc(&mem_map[page].count);
Linus Torvalds's avatar
Linus Torvalds committed
1181
	page = (page << PAGE_SHIFT) + PAGE_OFFSET;
Linus Torvalds's avatar
Linus Torvalds committed
1182
	return mk_pte(page,vma->vm_page_prot);
Linus Torvalds's avatar
Linus Torvalds committed
1183 1184
}

Linus Torvalds's avatar
Linus Torvalds committed
1185 1186

/*
 * Sync (and for MS_INVALIDATE, tear down) a single pte of a shared
 * file mapping.  Dirty present pages are written back via
 * filemap_write_page(); returns 0 or a write error.
 */
static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
	unsigned long address, unsigned int flags)
{
	pte_t pte = *ptep;
	unsigned long page;
	int error;

	if (!(flags & MS_INVALIDATE)) {
		/* Plain sync: only dirty, present pages need work. */
		if (!pte_present(pte))
			return 0;
		if (!pte_dirty(pte))
			return 0;
		flush_page_to_ram(pte_page(pte));
		flush_cache_page(vma, address);
		set_pte(ptep, pte_mkclean(pte));
		flush_tlb_page(vma, address);
		page = pte_page(pte);
		/* Hold an extra reference across the writeback. */
		atomic_inc(&mem_map[MAP_NR(page)].count);
	} else {
		/* Invalidate: remove the mapping entirely. */
		if (pte_none(pte))
			return 0;
		flush_cache_page(vma, address);
		pte_clear(ptep);
		flush_tlb_page(vma, address);
		if (!pte_present(pte)) {
			swap_free(pte_val(pte));
			return 0;
		}
		page = pte_page(pte);
		if (!pte_dirty(pte) || flags == MS_INVALIDATE) {
			free_page(page);
			return 0;
		}
	}
	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
	free_page(page);
	return error;
}

Linus Torvalds's avatar
Linus Torvalds committed
1225
/*
 * Walk the ptes of one pmd over [address, address+size) and sync each
 * one.  Errors from individual ptes are OR-ed together.
 */
static inline int filemap_sync_pte_range(pmd_t * pmd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
{
	pte_t * pte;
	unsigned long end;
	int error;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		printk("filemap_sync_pte_range: bad pmd (%08lx)\n", pmd_val(*pmd));
		pmd_clear(pmd);
		return 0;
	}
	pte = pte_offset(pmd, address);
	offset += address & PMD_MASK;
	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte(pte, vma, address + offset, flags);
		address += PAGE_SIZE;
		pte++;
	} while (address < end);
	return error;
}

Linus Torvalds's avatar
Linus Torvalds committed
1255
/*
 * Walk the pmds of one pgd over [address, address+size), delegating to
 * filemap_sync_pte_range() for each.  Errors are OR-ed together.
 */
static inline int filemap_sync_pmd_range(pgd_t * pgd,
	unsigned long address, unsigned long size,
	struct vm_area_struct *vma, unsigned int flags)
{
	pmd_t * pmd;
	unsigned long offset, end;
	int error;

	if (pgd_none(*pgd))
		return 0;
	if (pgd_bad(*pgd)) {
		printk("filemap_sync_pmd_range: bad pgd (%08lx)\n", pgd_val(*pgd));
		pgd_clear(pgd);
		return 0;
	}
	pmd = pmd_offset(pgd, address);
	offset = address & PGDIR_MASK;
	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	error = 0;
	do {
		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return error;
}

Linus Torvalds's avatar
Linus Torvalds committed
1285
static int filemap_sync(struct vm_area_struct * vma, unsigned long address,
Linus Torvalds's avatar
Linus Torvalds committed
1286 1287
	size_t size, unsigned int flags)
{
Linus Torvalds's avatar
Linus Torvalds committed
1288
	pgd_t * dir;
Linus Torvalds's avatar
Linus Torvalds committed
1289
	unsigned long end = address + size;
Linus Torvalds's avatar
Linus Torvalds committed
1290
	int error = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1291

Linus Torvalds's avatar
Linus Torvalds committed
1292
	dir = pgd_offset(vma->vm_mm, address);
Linus Torvalds's avatar
Linus Torvalds committed
1293
	flush_cache_range(vma->vm_mm, end - size, end);
Linus Torvalds's avatar
Linus Torvalds committed
1294
	while (address < end) {
Linus Torvalds's avatar
Linus Torvalds committed
1295
		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
Linus Torvalds's avatar
Linus Torvalds committed
1296 1297
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
Linus Torvalds's avatar
Linus Torvalds committed
1298
	}
Linus Torvalds's avatar
Linus Torvalds committed
1299
	flush_tlb_range(vma->vm_mm, end - size, end);
Linus Torvalds's avatar
Linus Torvalds committed
1300
	return error;
Linus Torvalds's avatar
Linus Torvalds committed
1301 1302 1303
}

/*
Linus Torvalds's avatar
Linus Torvalds committed
1304
 * This handles (potentially partial) area unmaps..
Linus Torvalds's avatar
Linus Torvalds committed
1305
 */
Linus Torvalds's avatar
Linus Torvalds committed
1306
static void filemap_unmap(struct vm_area_struct *vma, unsigned long start, size_t len)
Linus Torvalds's avatar
Linus Torvalds committed
1307
{
Linus Torvalds's avatar
Linus Torvalds committed
1308
	filemap_sync(vma, start, len, MS_ASYNC);
Linus Torvalds's avatar
Linus Torvalds committed
1309 1310 1311 1312 1313 1314 1315 1316
}

/*
 * Shared mappings need to be able to do the right thing at
 * close/unmap/sync. They will also use the private file as
 * backing-store for swapping..
 */
static struct vm_operations_struct file_shared_mmap = {
Linus Torvalds's avatar
Linus Torvalds committed
1317 1318 1319 1320
	NULL,			/* no special open */
	NULL,			/* no special close */
	filemap_unmap,		/* unmap - we need to sync the pages */
	NULL,			/* no special protect */
Linus Torvalds's avatar
Linus Torvalds committed
1321
	filemap_sync,		/* sync */
Linus Torvalds's avatar
Linus Torvalds committed
1322
	NULL,			/* advise */
Linus Torvalds's avatar
Linus Torvalds committed
1323
	filemap_nopage,		/* nopage */
Linus Torvalds's avatar
Linus Torvalds committed
1324
	NULL,			/* wppage */
Linus Torvalds's avatar
Linus Torvalds committed
1325
	filemap_swapout,	/* swapout */
Linus Torvalds's avatar
Linus Torvalds committed
1326
	filemap_swapin,		/* swapin */
Linus Torvalds's avatar
Linus Torvalds committed
1327 1328 1329
};

/*
Linus Torvalds's avatar
Linus Torvalds committed
1330
 * Private mappings just need to be able to load in the map.
Linus Torvalds's avatar
Linus Torvalds committed
1331
 *
Linus Torvalds's avatar
Linus Torvalds committed
1332
 * (This is actually used for shared mappings as well, if we
Linus Torvalds's avatar
Linus Torvalds committed
1333 1334 1335 1336 1337 1338 1339 1340 1341
 * know they can't ever get write permissions..)
 */
static struct vm_operations_struct file_private_mmap = {
	NULL,			/* open */
	NULL,			/* close */
	NULL,			/* unmap */
	NULL,			/* protect */
	NULL,			/* sync */
	NULL,			/* advise */
Linus Torvalds's avatar
Linus Torvalds committed
1342
	filemap_nopage,		/* nopage */
Linus Torvalds's avatar
Linus Torvalds committed
1343 1344 1345 1346 1347 1348
	NULL,			/* wppage */
	NULL,			/* swapout */
	NULL,			/* swapin */
};

/* This is used for a general mmap of a disk file */
Linus Torvalds's avatar
Linus Torvalds committed
1349 1350

int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
Linus Torvalds's avatar
Linus Torvalds committed
1351 1352
{
	struct vm_operations_struct * ops;
Linus Torvalds's avatar
Linus Torvalds committed
1353
	struct inode *inode = file->f_dentry->d_inode;
Linus Torvalds's avatar
Linus Torvalds committed
1354

Linus Torvalds's avatar
Linus Torvalds committed
1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365
	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
		ops = &file_shared_mmap;
		/* share_page() can only guarantee proper page sharing if
		 * the offsets are all page aligned. */
		if (vma->vm_offset & (PAGE_SIZE - 1))
			return -EINVAL;
	} else {
		ops = &file_private_mmap;
		if (vma->vm_offset & (inode->i_sb->s_blocksize - 1))
			return -EINVAL;
	}
Linus Torvalds's avatar
Linus Torvalds committed
1366 1367
	if (!inode->i_sb || !S_ISREG(inode->i_mode))
		return -EACCES;
Linus Torvalds's avatar
Linus Torvalds committed
1368
	if (!inode->i_op || !inode->i_op->readpage)
Linus Torvalds's avatar
Linus Torvalds committed
1369
		return -ENOEXEC;
Linus Torvalds's avatar
Linus Torvalds committed
1370
	UPDATE_ATIME(inode);
Linus Torvalds's avatar
Linus Torvalds committed
1371 1372
	vma->vm_file = file;
	file->f_count++;
Linus Torvalds's avatar
Linus Torvalds committed
1373 1374 1375
	vma->vm_ops = ops;
	return 0;
}
Linus Torvalds's avatar
Linus Torvalds committed
1376 1377 1378 1379 1380 1381 1382 1383 1384


/*
 * The msync() system call.
 */

static int msync_interval(struct vm_area_struct * vma,
	unsigned long start, unsigned long end, int flags)
{
Linus Torvalds's avatar
Linus Torvalds committed
1385
	if (vma->vm_file && vma->vm_ops && vma->vm_ops->sync) {
Linus Torvalds's avatar
Linus Torvalds committed
1386 1387
		int error;
		error = vma->vm_ops->sync(vma, start, end-start, flags);
Linus Torvalds's avatar
Linus Torvalds committed
1388
		if (!error && (flags & MS_SYNC)) {
Linus Torvalds's avatar
Linus Torvalds committed
1389 1390 1391
			struct file * file = vma->vm_file;
			if (file) {
				struct dentry * dentry = file->f_dentry;
Linus Torvalds's avatar
Linus Torvalds committed
1392 1393
				struct inode * inode = dentry->d_inode;
				down(&inode->i_sem);
Linus Torvalds's avatar
Linus Torvalds committed
1394
				error = file_fsync(file, dentry);
Linus Torvalds's avatar
Linus Torvalds committed
1395 1396 1397 1398
				up(&inode->i_sem);
			}
		}
		return error;
Linus Torvalds's avatar
Linus Torvalds committed
1399 1400 1401 1402 1403 1404 1405 1406
	}
	return 0;
}

/*
 * msync(2): flush changes made through mappings of [start, start+len)
 * back to the underlying files.  Unmapped holes inside the interval
 * are skipped, but cause -EFAULT to be returned once all the mapped
 * parts have been synced.
 */
asmlinkage int sys_msync(unsigned long start, size_t len, int flags)
{
	struct vm_area_struct * vma;
	unsigned long end;
	int hole_error, error;

	down(&current->mm->mmap_sem);
	lock_kernel();

	/* Argument validation. */
	error = -EINVAL;
	if (start & ~PAGE_MASK)
		goto out;
	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
		goto out;
	len = (len + ~PAGE_MASK) & PAGE_MASK;	/* round up to whole pages */
	end = start + len;
	if (end < start)			/* address space wrap */
		goto out;
	error = 0;
	if (end == start)
		goto out;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -EFAULT at the end.
	 */
	hole_error = 0;
	vma = find_vma(current->mm, start);
	for (;;) {
		/* Still start < end. */
		error = -EFAULT;
		if (!vma)
			goto out;
		/* Here start < vma->vm_end. */
		if (start < vma->vm_start) {
			hole_error = -EFAULT;	/* remember the hole */
			start = vma->vm_start;
		}
		/* Here vma->vm_start <= start < vma->vm_end. */
		if (end <= vma->vm_end) {
			/* The interval ends inside this vma - last piece. */
			if (start < end) {
				error = msync_interval(vma, start, end, flags);
				if (error)
					goto out;
			}
			error = hole_error;
			goto out;
		}
		/* Here vma->vm_start <= start < vma->vm_end < end. */
		error = msync_interval(vma, start, vma->vm_end, flags);
		if (error)
			goto out;
		start = vma->vm_end;
		vma = vma->vm_next;
	}
out:
	unlock_kernel();
	up(&current->mm->mmap_sem);
	return error;
}
Linus Torvalds's avatar
Linus Torvalds committed
1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476

/*
 * Write to a file through the page cache. This is mainly for the
 * benefit of NFS and possibly other network-based file systems.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *							okir@monad.swb.de
 */
Linus Torvalds's avatar
Linus Torvalds committed
1477 1478 1479
ssize_t
generic_file_write(struct file *file, const char *buf,
		   size_t count, loff_t *ppos)
Linus Torvalds's avatar
Linus Torvalds committed
1480
{
Linus Torvalds's avatar
Linus Torvalds committed
1481 1482
	struct dentry	*dentry = file->f_dentry; 
	struct inode	*inode = dentry->d_inode; 
Linus Torvalds's avatar
Linus Torvalds committed
1483 1484
	struct page	*page, **hash;
	unsigned long	page_cache = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1485 1486
	unsigned long	pgpos, offset;
	unsigned long	bytes, written;
Linus Torvalds's avatar
Linus Torvalds committed
1487
	unsigned long	pos;
Linus Torvalds's avatar
Linus Torvalds committed
1488
	long		status, sync, didread;
Linus Torvalds's avatar
Linus Torvalds committed
1489 1490 1491 1492 1493

	if (!inode->i_op || !inode->i_op->updatepage)
		return -EIO;

	sync    = file->f_flags & O_SYNC;
Linus Torvalds's avatar
Linus Torvalds committed
1494
	pos     = *ppos;
Linus Torvalds's avatar
Linus Torvalds committed
1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506
	written = 0;
	status  = 0;

	if (file->f_flags & O_APPEND)
		pos = inode->i_size;

	while (count) {
		/*
		 * Try to find the page in the cache. If it isn't there,
		 * allocate a free page.
		 */
		offset = (pos & ~PAGE_MASK);
Linus Torvalds's avatar
Linus Torvalds committed
1507
		pgpos = pos & PAGE_MASK;
Linus Torvalds's avatar
Linus Torvalds committed
1508 1509 1510 1511

		if ((bytes = PAGE_SIZE - offset) > count)
			bytes = count;

Linus Torvalds's avatar
Linus Torvalds committed
1512 1513
		hash = page_hash(inode, pgpos);
		if (!(page = __find_page(inode, pgpos, *hash))) {
Linus Torvalds's avatar
Linus Torvalds committed
1514 1515
			if (!page_cache) {
				page_cache = __get_free_page(GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
1516 1517 1518 1519
				if (page_cache)
					continue;
				status = -ENOMEM;
				break;
Linus Torvalds's avatar
Linus Torvalds committed
1520 1521
			}
			page = mem_map + MAP_NR(page_cache);
Linus Torvalds's avatar
Linus Torvalds committed
1522
			add_to_page_cache(page, inode, pgpos, hash);
Linus Torvalds's avatar
Linus Torvalds committed
1523 1524 1525
			page_cache = 0;
		}

Linus Torvalds's avatar
Linus Torvalds committed
1526
		/*
Linus Torvalds's avatar
Linus Torvalds committed
1527 1528
		 * Note: setting of the PG_locked bit is handled
		 * below the i_op->xxx interface.
Linus Torvalds's avatar
Linus Torvalds committed
1529 1530 1531 1532
		 */
		didread = 0;
page_wait:
		wait_on_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
1533 1534
		if (PageUptodate(page))
			goto do_update_page;
Linus Torvalds's avatar
Linus Torvalds committed
1535 1536

		/*
Linus Torvalds's avatar
Linus Torvalds committed
1537
		 * The page is not up-to-date ... if we're writing less
Linus Torvalds's avatar
Linus Torvalds committed
1538
		 * than a full page of data, we may have to read it first.
Linus Torvalds's avatar
Linus Torvalds committed
1539 1540
		 * But if the page is past the current end of file, we must
		 * clear it before updating.
Linus Torvalds's avatar
Linus Torvalds committed
1541
		 */
Linus Torvalds's avatar
Linus Torvalds committed
1542 1543 1544 1545 1546
		if (bytes < PAGE_SIZE) {
			if (pgpos < inode->i_size) {
				status = -EIO;
				if (didread >= 2)
					goto done_with_page;
Linus Torvalds's avatar
Linus Torvalds committed
1547
				status = inode->i_op->readpage(file, page);
Linus Torvalds's avatar
Linus Torvalds committed
1548
				if (status < 0)
Linus Torvalds's avatar
Linus Torvalds committed
1549
					goto done_with_page;
Linus Torvalds's avatar
Linus Torvalds committed
1550
				didread++;
Linus Torvalds's avatar
Linus Torvalds committed
1551
				goto page_wait;
Linus Torvalds's avatar
Linus Torvalds committed
1552 1553 1554 1555
			} else {
				/* Must clear for partial writes */
				memset((void *) page_address(page), 0,
					 PAGE_SIZE);
Linus Torvalds's avatar
Linus Torvalds committed
1556 1557
			}
		}
Linus Torvalds's avatar
Linus Torvalds committed
1558 1559 1560 1561 1562 1563
		/*
		 * N.B. We should defer setting PG_uptodate at least until
		 * the data is copied. A failure in i_op->updatepage() could
		 * leave the page with garbage data.
		 */
		set_bit(PG_uptodate, &page->flags);
Linus Torvalds's avatar
Linus Torvalds committed
1564

Linus Torvalds's avatar
Linus Torvalds committed
1565
do_update_page:
Linus Torvalds's avatar
Linus Torvalds committed
1566
		/* All right, the page is there.  Now update it. */
Linus Torvalds's avatar
Linus Torvalds committed
1567
		status = inode->i_op->updatepage(file, page, buf,
Linus Torvalds's avatar
Linus Torvalds committed
1568
							offset, bytes, sync);
Linus Torvalds's avatar
Linus Torvalds committed
1569 1570
done_with_page:
		__free_page(page);
Linus Torvalds's avatar
Linus Torvalds committed
1571 1572 1573 1574 1575 1576 1577 1578
		if (status < 0)
			break;

		written += status;
		count -= status;
		pos += status;
		buf += status;
	}
Linus Torvalds's avatar
Linus Torvalds committed
1579
	*ppos = pos;
Linus Torvalds's avatar
Linus Torvalds committed
1580 1581 1582 1583 1584
	if (pos > inode->i_size)
		inode->i_size = pos;

	if (page_cache)
		free_page(page_cache);
Linus Torvalds's avatar
Linus Torvalds committed
1585
	return written ? written : status;
Linus Torvalds's avatar
Linus Torvalds committed
1586
}
Linus Torvalds's avatar
Linus Torvalds committed
1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603

/*
 * Support routines for directory cacheing using the page cache.
 */

/*
 * Finds the page at the specified offset, installing a new page
 * if requested.  The count is incremented and the page is locked.
 *
 * Note: we don't have to worry about races here, as the caller
 * is holding the inode semaphore.
 */
unsigned long get_cached_page(struct inode * inode, unsigned long offset,
				int new)
{
	struct page * page;
	struct page ** hash;
Linus Torvalds's avatar
Linus Torvalds committed
1604
	unsigned long page_cache = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617

	hash = page_hash(inode, offset);
	page = __find_page(inode, offset, *hash);
	if (!page) {
		if (!new)
			goto out;
		page_cache = get_free_page(GFP_KERNEL);
		if (!page_cache)
			goto out;
		page = mem_map + MAP_NR(page_cache);
		add_to_page_cache(page, inode, offset, hash);
	}
	if (atomic_read(&page->count) != 2)
Linus Torvalds's avatar
Linus Torvalds committed
1618
		printk(KERN_ERR "get_cached_page: page count=%d\n",
Linus Torvalds's avatar
Linus Torvalds committed
1619 1620
			atomic_read(&page->count));
	if (test_bit(PG_locked, &page->flags))
Linus Torvalds's avatar
Linus Torvalds committed
1621
		printk(KERN_ERR "get_cached_page: page already locked!\n");
Linus Torvalds's avatar
Linus Torvalds committed
1622
	set_bit(PG_locked, &page->flags);
Linus Torvalds's avatar
Linus Torvalds committed
1623
	page_cache = page_address(page);
Linus Torvalds's avatar
Linus Torvalds committed
1624 1625

out:
Linus Torvalds's avatar
Linus Torvalds committed
1626
	return page_cache;
Linus Torvalds's avatar
Linus Torvalds committed
1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644
}

/*
 * Unlock and free a page.
 */
void put_cached_page(unsigned long addr)
{
	struct page * page = mem_map + MAP_NR(addr);

	if (!test_bit(PG_locked, &page->flags))
		printk("put_cached_page: page not locked!\n");
	if (atomic_read(&page->count) != 2)
		printk("put_cached_page: page count=%d\n", 
			atomic_read(&page->count));
	clear_bit(PG_locked, &page->flags);
	wake_up(&page->wait);
	__free_page(page);
}