/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file should contain most things doing the swapping from/to disk.
 * Started 18.12.91
 *
 * Swap aging added 23.2.95, Stephen Tweedie.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/head.h>
#include <linux/kernel.h>
Linus Torvalds's avatar
Linus Torvalds committed
18
#include <linux/kernel_stat.h>
19 20 21
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
Linus Torvalds's avatar
Linus Torvalds committed
22
#include <linux/swap.h>
Linus Torvalds's avatar
Linus Torvalds committed
23
#include <linux/fs.h>
Linus Torvalds's avatar
Linus Torvalds committed
24
#include <linux/swapctl.h>
25

Linus Torvalds's avatar
Linus Torvalds committed
26
#include <asm/dma.h>
27
#include <asm/system.h> /* for cli()/sti() */
Linus Torvalds's avatar
Linus Torvalds committed
28
#include <asm/segment.h> /* for memcpy_to/fromfs */
29
#include <asm/bitops.h>
Linus Torvalds's avatar
Linus Torvalds committed
30
#include <asm/pgtable.h>
31

32 33 34 35 36
/* Number of entries in swap_info[]. */
#define MAX_SWAPFILES 8

/*
 * Values for swap_info_struct.flags.  SWP_WRITEOK (binary 11) contains
 * the SWP_USED bit, which is why the code below always tests
 * (flags & SWP_WRITEOK) == SWP_WRITEOK rather than a single bit.
 */
#define SWP_USED	1
#define SWP_WRITEOK	3

Linus Torvalds's avatar
Linus Torvalds committed
37
/*
 * NOTE(review): not referenced in the visible part of this file;
 * presumably a free-page low-water mark counted in pages -- confirm
 * against its users.
 */
int min_free_pages = 20;
Linus Torvalds's avatar
Linus Torvalds committed
38

Linus Torvalds's avatar
Linus Torvalds committed
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
/*
 * Constants for the page aging mechanism: the maximum age (actually,
 * the maximum "youthfulness"); the quanta by which pages rejuvenate
 * and age; and the initial age for new pages.
 */

/*
 * Positional initializer: the values must stay in the field order of
 * swap_control_t, which is declared outside this file (TODO confirm:
 * presumably <linux/swapctl.h>).  The trailing comments name each group
 * of values.
 */
swap_control_t swap_control = {
	20, 3, 1, 3,		/* Page aging */
	10, 2, 2, 0,		/* Buffer aging */
	32, 4,			/* Aging cluster */
	8192, 4096,		/* Pageout and bufferout weights */
	-200,			/* Buffer grace */
	1, 1,			/* Buffs/pages to free */
	RCL_ROUND_ROBIN		/* Balancing policy */
};

55 56
/* Swap types >= nr_swapfiles are rejected as nonexistent by the code below. */
static int nr_swapfiles = 0;

/* Tasks sleep here while the swap_lockmap bit they need is already set. */
static struct wait_queue * lock_queue = NULL;

/*
 * Indices into swap_info[] describing the priority-ordered list of swap
 * areas.  Both start out as -1, meaning "no entry".
 */
static struct {
	int head;	/* head of priority-ordered swapfile list */
	int next;	/* swapfile to be used next */
} swap_list = {-1, -1};
61 62

static struct swap_info_struct {
Linus Torvalds's avatar
Linus Torvalds committed
63
	unsigned int flags;
Linus Torvalds's avatar
Linus Torvalds committed
64
	kdev_t swap_device;
Linus Torvalds's avatar
Linus Torvalds committed
65
	struct inode * swap_file;
66
	unsigned char * swap_map;
67
	unsigned char * swap_lockmap;
68 69
	int lowest_bit;
	int highest_bit;
Linus Torvalds's avatar
Linus Torvalds committed
70 71
	int prio;			/* swap priority */
	int pages;
Linus Torvalds's avatar
Linus Torvalds committed
72
	unsigned long max;
Linus Torvalds's avatar
Linus Torvalds committed
73
	int next;			/* next entry on swap list */
74
} swap_info[MAX_SWAPFILES];
75

Linus Torvalds's avatar
Linus Torvalds committed
76
/*
 * Defined outside this file.  try_to_free_page() calls it as
 * shm_swap(i, limit) and treats a non-zero return as "a page was freed".
 */
extern int shm_swap (int, unsigned long);
77

Linus Torvalds's avatar
Linus Torvalds committed
78 79 80 81 82 83 84
/*
 * To save us from swapping out pages which have just been swapped in and
 * have not been modified since then, we keep in swap_cache[page>>PAGE_SHIFT]
 * the swap entry which was last used to fill the page, or zero if the
 * page does not currently correspond to a page in swap. PAGE_DIRTY makes
 * this info useless.
 */
Linus Torvalds's avatar
Linus Torvalds committed
85 86 87
unsigned long *swap_cache;

#ifdef SWAP_CACHE_INFO
/*
 * Swap-cache statistics.  Only the "add" pair is updated in the part of
 * the file visible here (see add_to_swap_cache()); the delete/find pairs
 * are maintained by helpers defined elsewhere.
 */
unsigned long swap_cache_add_total = 0;
unsigned long swap_cache_add_success = 0;
unsigned long swap_cache_del_total = 0;
unsigned long swap_cache_del_success = 0;
unsigned long swap_cache_find_total = 0;
unsigned long swap_cache_find_success = 0;

/*
 * Print the swap-cache counters, each pair as total/success.
 * The counters are unsigned long, so they are printed with %lu.
 */
extern inline void show_swap_cache_info(void)
{
	printk("Swap cache: add %lu/%lu, delete %lu/%lu, find %lu/%lu\n",
		swap_cache_add_total, swap_cache_add_success,
		swap_cache_del_total, swap_cache_del_success,
		swap_cache_find_total, swap_cache_find_success);
}
#endif

Linus Torvalds's avatar
Linus Torvalds committed
104
static int add_to_swap_cache(unsigned long addr, unsigned long entry)
Linus Torvalds's avatar
Linus Torvalds committed
105 106
{
	struct swap_info_struct * p = &swap_info[SWP_TYPE(entry)];
Linus Torvalds's avatar
Linus Torvalds committed
107

Linus Torvalds's avatar
Linus Torvalds committed
108
#ifdef SWAP_CACHE_INFO
Linus Torvalds's avatar
Linus Torvalds committed
109
	swap_cache_add_total++;
Linus Torvalds's avatar
Linus Torvalds committed
110
#endif
Linus Torvalds's avatar
Linus Torvalds committed
111
	if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
Linus Torvalds's avatar
Linus Torvalds committed
112
		entry = xchg(swap_cache + MAP_NR(addr), entry);
Linus Torvalds's avatar
Linus Torvalds committed
113 114 115 116
		if (entry)  {
			printk("swap_cache: replacing non-NULL entry\n");
		}
#ifdef SWAP_CACHE_INFO
Linus Torvalds's avatar
Linus Torvalds committed
117
		swap_cache_add_success++;
Linus Torvalds's avatar
Linus Torvalds committed
118 119 120 121 122 123
#endif
		return 1;
	}
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
124 125
static unsigned long init_swap_cache(unsigned long mem_start,
	unsigned long mem_end)
Linus Torvalds's avatar
Linus Torvalds committed
126
{
Linus Torvalds's avatar
Linus Torvalds committed
127 128 129 130
	unsigned long swap_cache_size;

	mem_start = (mem_start + 15) & ~15;
	swap_cache = (unsigned long *) mem_start;
Linus Torvalds's avatar
Linus Torvalds committed
131
	swap_cache_size = MAP_NR(mem_end);
Linus Torvalds's avatar
Linus Torvalds committed
132 133
	memset(swap_cache, 0, swap_cache_size * sizeof (unsigned long));
	return (unsigned long) (swap_cache + swap_cache_size);
Linus Torvalds's avatar
Linus Torvalds committed
134 135
}

Linus Torvalds's avatar
Linus Torvalds committed
136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
/* General swap control */

/* Parse the kernel command line "swap=" option at load time: */
/*
 * ints[0] holds the number of values supplied and ints[1..] the values
 * themselves.  Each non-zero value overrides the corresponding tunable
 * below, in order; zero values leave the tunable alone and at most
 * eight values are consumed.  `str' is not used.
 */
void swap_setup(char *str, int *ints)
{
	int * tunables[8] = {
		&MAX_PAGE_AGE,
		&PAGE_ADVANCE,
		&PAGE_DECLINE,
		&PAGE_INITIAL_AGE,
		&AGE_CLUSTER_FRACT,
		&AGE_CLUSTER_MIN,
		&PAGEOUT_WEIGHT,
		&BUFFEROUT_WEIGHT
	};
	int supplied = ints[0];
	int idx;

	if (supplied > 8)
		supplied = 8;
	for (idx = 1; idx <= supplied; idx++) {
		if (!ints[idx])
			continue;
		*(tunables[idx-1]) = ints[idx];
	}
}

/* Parse the kernel command line "buff=" option at load time: */
/*
 * ints[0] holds the number of values supplied and ints[1..] the values
 * themselves.  Each non-zero value overrides the corresponding tunable
 * below, in order; zero values leave the tunable alone and at most six
 * values are consumed.  `str' is not used.
 */
void buff_setup(char *str, int *ints)
{
	int * tunables[6] = {
		&MAX_BUFF_AGE,
		&BUFF_ADVANCE,
		&BUFF_DECLINE,
		&BUFF_INITIAL_AGE,
		&BUFFEROUT_WEIGHT,
		&BUFFERMEM_GRACE
	};
	int supplied = ints[0];
	int idx;

	if (supplied > 6)
		supplied = 6;
	for (idx = 1; idx <= supplied; idx++) {
		if (!ints[idx])
			continue;
		*(tunables[idx-1]) = ints[idx];
	}
}

/* Page aging */

178
void rw_swap_page(int rw, unsigned long entry, char * buf)
179
{
180
	unsigned long type, offset;
181
	struct swap_info_struct * p;
182

183 184
	type = SWP_TYPE(entry);
	if (type >= nr_swapfiles) {
185
		printk("Internal error: bad swap-device\n");
186 187
		return;
	}
188 189
	p = &swap_info[type];
	offset = SWP_OFFSET(entry);
Linus Torvalds's avatar
Linus Torvalds committed
190
	if (offset >= p->max) {
191 192 193
		printk("rw_swap_page: weirdness\n");
		return;
	}
Linus Torvalds's avatar
Linus Torvalds committed
194 195 196 197
	if (p->swap_map && !p->swap_map[offset]) {
		printk("Hmm.. Trying to use unallocated swap (%08lx)\n", entry);
		return;
	}
198 199 200 201
	if (!(p->flags & SWP_USED)) {
		printk("Trying to swap to unused swap-device\n");
		return;
	}
202
	while (set_bit(offset,p->swap_lockmap))
203
		sleep_on(&lock_queue);
Linus Torvalds's avatar
Linus Torvalds committed
204 205 206 207
	if (rw == READ)
		kstat.pswpin++;
	else
		kstat.pswpout++;
208
	if (p->swap_device) {
209
		ll_rw_page(rw,p->swap_device,offset,buf);
210
	} else if (p->swap_file) {
Linus Torvalds's avatar
Linus Torvalds committed
211
		struct inode *swapf = p->swap_file;
Linus Torvalds's avatar
Linus Torvalds committed
212
		unsigned int zones[PAGE_SIZE/512];
Linus Torvalds's avatar
Linus Torvalds committed
213 214 215 216 217 218 219 220 221 222 223 224
		int i;
		if (swapf->i_op->bmap == NULL
			&& swapf->i_op->smap != NULL){
			/*
				With MsDOS, we use msdos_smap which return
				a sector number (not a cluster or block number).
				It is a patch to enable the UMSDOS project.
				Other people are working on better solution.

				It sounds like ll_rw_swap_file defined
				it operation size (sector size) based on
				PAGE_SIZE and the number of block to read.
Linus Torvalds's avatar
Linus Torvalds committed
225 226
				So using bmap or smap should work even if
				smap will require more blocks.
Linus Torvalds's avatar
Linus Torvalds committed
227 228 229 230 231 232 233 234 235
			*/
			int j;
			unsigned int block = offset << 3;

			for (i=0, j=0; j< PAGE_SIZE ; i++, j += 512){
				if (!(zones[i] = swapf->i_op->smap(swapf,block++))) {
					printk("rw_swap_page: bad swap file\n");
					return;
				}
236
			}
Linus Torvalds's avatar
Linus Torvalds committed
237 238 239
		}else{
			int j;
			unsigned int block = offset
Linus Torvalds's avatar
Linus Torvalds committed
240
				<< (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
Linus Torvalds's avatar
Linus Torvalds committed
241 242 243 244 245 246 247 248

			for (i=0, j=0; j< PAGE_SIZE ; i++, j +=swapf->i_sb->s_blocksize)
				if (!(zones[i] = bmap(swapf,block++))) {
					printk("rw_swap_page: bad swap file\n");
					return;
				}
		}
		ll_rw_swap_file(rw,swapf->i_dev, zones, i,buf);
249 250
	} else
		printk("re_swap_page: no swap file or device\n");
Linus Torvalds's avatar
Linus Torvalds committed
251
	if (offset && !clear_bit(offset,p->swap_lockmap))
252 253
		printk("rw_swap_page: lock already cleared\n");
	wake_up(&lock_queue);
254
}
Linus Torvalds's avatar
Linus Torvalds committed
255

Linus Torvalds's avatar
Linus Torvalds committed
256
unsigned long get_swap_page(void)
Linus Torvalds's avatar
Linus Torvalds committed
257
{
258
	struct swap_info_struct * p;
Linus Torvalds's avatar
Linus Torvalds committed
259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
	unsigned long offset, entry;
	int type, wrapped = 0;

	type = swap_list.next;
	if (type < 0)
	  return 0;

	while (1) {
		p = &swap_info[type];
		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
			for (offset = p->lowest_bit; offset <= p->highest_bit ; offset++) {
				if (p->swap_map[offset])
				  continue;
				if (test_bit(offset, p->swap_lockmap))
				  continue;
				p->swap_map[offset] = 1;
				nr_swap_pages--;
				if (offset == p->highest_bit)
				  p->highest_bit--;
				p->lowest_bit = offset;
				entry = SWP_ENTRY(type,offset);

				type = swap_info[type].next;
				if (type < 0 || p->prio != swap_info[type].prio) {
				    swap_list.next = swap_list.head;
				} else {
				    swap_list.next = type;
				}
				return entry;
			}
		}
		type = p->next;
		if (!wrapped) {
			if (type < 0 || p->prio != swap_info[type].prio) {
				type = swap_list.head;
				wrapped = 1;
			}
		} else if (type < 0) {
			return 0;	/* out of swap space */
298
		}
299
	}
Linus Torvalds's avatar
Linus Torvalds committed
300 301
}

Linus Torvalds's avatar
Linus Torvalds committed
302
void swap_duplicate(unsigned long entry)
Linus Torvalds's avatar
Linus Torvalds committed
303
{
304
	struct swap_info_struct * p;
305
	unsigned long offset, type;
306

307
	if (!entry)
Linus Torvalds's avatar
Linus Torvalds committed
308
		return;
309 310
	offset = SWP_OFFSET(entry);
	type = SWP_TYPE(entry);
Linus Torvalds's avatar
Linus Torvalds committed
311
	if (type & SHM_SWP_TYPE)
Linus Torvalds's avatar
Linus Torvalds committed
312
		return;
313 314
	if (type >= nr_swapfiles) {
		printk("Trying to duplicate nonexistent swap-page\n");
Linus Torvalds's avatar
Linus Torvalds committed
315
		return;
316
	}
317
	p = type + swap_info;
Linus Torvalds's avatar
Linus Torvalds committed
318
	if (offset >= p->max) {
Linus Torvalds's avatar
Linus Torvalds committed
319
		printk("swap_duplicate: weirdness\n");
Linus Torvalds's avatar
Linus Torvalds committed
320
		return;
321
	}
322
	if (!p->swap_map[offset]) {
323
		printk("swap_duplicate: trying to duplicate unused page\n");
Linus Torvalds's avatar
Linus Torvalds committed
324
		return;
325
	}
326
	p->swap_map[offset]++;
Linus Torvalds's avatar
Linus Torvalds committed
327
	return;
328 329
}

330
void swap_free(unsigned long entry)
331 332
{
	struct swap_info_struct * p;
333
	unsigned long offset, type;
334

335
	if (!entry)
336
		return;
337
	type = SWP_TYPE(entry);
Linus Torvalds's avatar
Linus Torvalds committed
338
	if (type & SHM_SWP_TYPE)
339
		return;
340
	if (type >= nr_swapfiles) {
341 342 343
		printk("Trying to free nonexistent swap-page\n");
		return;
	}
344 345
	p = & swap_info[type];
	offset = SWP_OFFSET(entry);
Linus Torvalds's avatar
Linus Torvalds committed
346
	if (offset >= p->max) {
347
		printk("swap_free: weirdness\n");
348 349 350 351 352 353
		return;
	}
	if (!(p->flags & SWP_USED)) {
		printk("Trying to free swap from unused swap-device\n");
		return;
	}
354 355 356 357 358
	if (offset < p->lowest_bit)
		p->lowest_bit = offset;
	if (offset > p->highest_bit)
		p->highest_bit = offset;
	if (!p->swap_map[offset])
359
		printk("swap_free: swap-space map bad (entry %08lx)\n",entry);
360
	else
361 362
		if (!--p->swap_map[offset])
			nr_swap_pages++;
Linus Torvalds's avatar
Linus Torvalds committed
363 364 365
	if (p->prio > swap_info[swap_list.next].prio) {
	    swap_list.next = swap_list.head;
	}
Linus Torvalds's avatar
Linus Torvalds committed
366 367
}

Linus Torvalds's avatar
Linus Torvalds committed
368 369 370 371 372 373 374
/*
 * The tests may look silly, but it essentially makes sure that
 * no other process did a swap-in on us just as we were waiting.
 *
 * Also, don't bother to add to the swap cache if this page-in
 * was due to a write access.
 */
Linus Torvalds's avatar
Linus Torvalds committed
375 376
/*
 * Bring the page for swap entry `entry' back in and map it at
 * *page_table.
 *
 * The pte is compared against `entry' both after allocating the page
 * and after reading it in; if it no longer matches, the freshly
 * allocated page is dropped and nothing else is done.
 *
 * On a read fault the page is mapped with the vma's protection and the
 * swap slot is kept via the swap cache.  Otherwise (write access, or
 * the cache refused the entry) the page is mapped writable and dirty
 * and the swap slot is freed.
 */
void swap_in(struct task_struct * tsk, struct vm_area_struct * vma,
	pte_t * page_table, unsigned long entry, int write_access)
{
	unsigned long page = __get_free_page(GFP_KERNEL);

	if (pte_val(*page_table) != entry)
		goto discard;

	if (!page) {
		/* No memory: mark the mapping bad and give up the swap slot. */
		set_pte(page_table, BAD_PAGE);
		swap_free(entry);
		oom(tsk);
		return;
	}

	read_swap_page(entry, (char *) page);
	if (pte_val(*page_table) != entry)
		goto discard;

	vma->vm_mm->rss++;
	tsk->maj_flt++;

	if (!write_access && add_to_swap_cache(page, entry)) {
		set_pte(page_table, mk_pte(page, vma->vm_page_prot));
		return;
	}

	set_pte(page_table, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
	swap_free(entry);
	return;

discard:
	free_page(page);
}

Linus Torvalds's avatar
Linus Torvalds committed
406
/*
Linus Torvalds's avatar
Linus Torvalds committed
407
 * The swap-out functions return 1 if they successfully
Linus Torvalds's avatar
Linus Torvalds committed
408 409 410 411 412 413 414 415 416
 * threw something out, and we got a free page. It returns
 * zero if it couldn't do anything, and any other value
 * indicates it decreased rss, but the page was shared.
 *
 * NOTE! If it sleeps, it *must* return 1 to make sure we
 * don't continue with the swap-out. Otherwise we may be
 * using a process that no longer actually exists (it might
 * have died while we slept).
 */
Linus Torvalds's avatar
Linus Torvalds committed
417 418
/*
 * Try to evict the single page mapped by *page_table at `address'.
 *
 * Returns 0 if nothing was done, 1 if the page was written out or
 * released through its swap-cache entry, and otherwise the page's
 * mem_map count sampled just before it was released (clean page that
 * was not in the swap cache).
 */
static inline int try_to_swap_out(struct task_struct * tsk, struct vm_area_struct* vma,
	unsigned long address, pte_t * page_table, unsigned long limit)
{
	pte_t pte = *page_table;
	unsigned long page;
	unsigned long entry;
	unsigned long users;

	if (!pte_present(pte))
		return 0;

	page = pte_page(pte);
	if (page >= high_memory || page >= limit)
		return 0;
	if (mem_map[MAP_NR(page)].reserved)
		return 0;

	/*
	 * Deal with page aging.  Pages age from being unused; they
	 * rejuvenate on being accessed.  Only swap old pages (age==0
	 * is oldest).
	 */
	if ((pte_dirty(pte) && delete_from_swap_cache(page))
	    || pte_young(pte)) {
		set_pte(page_table, pte_mkold(pte));
		touch_page(page);
		return 0;
	}
	age_page(page);
	if (age_of(page))
		return 0;

	if (pte_dirty(pte)) {
		if (vma->vm_ops && vma->vm_ops->swapout) {
			/* Remember the pid first: tsk may be gone after the call. */
			pid_t pid = tsk->pid;

			vma->vm_mm->rss--;
			if (vma->vm_ops->swapout(vma, address - vma->vm_start + vma->vm_offset, page_table))
				kill_proc(pid, SIGBUS, 1);
		} else {
			if (mem_map[MAP_NR(page)].count != 1)
				return 0;
			entry = get_swap_page();
			if (!entry)
				return 0;
			vma->vm_mm->rss--;
			set_pte(page_table, __pte(entry));
			invalidate_page(vma->vm_mm, address);
			tsk->nswap++;
			write_swap_page(entry, (char *) page);
		}
		free_page(page);
		return 1;	/* we slept: the process may not exist any more */
	}

	/* Clean page with a swap-cache entry: point the pte back at it. */
	entry = find_in_swap_cache(page);
	if (entry) {
		if (mem_map[MAP_NR(page)].count != 1) {
			set_pte(page_table, pte_mkdirty(pte));
			printk("Aiee.. duplicated cached swap-cache entry\n");
			return 0;
		}
		vma->vm_mm->rss--;
		set_pte(page_table, __pte(entry));
		invalidate_page(vma->vm_mm, address);
		free_page(page);
		return 1;
	}

	/* Clean page without a swap-cache entry: just drop the mapping. */
	vma->vm_mm->rss--;
	pte_clear(page_table);
	invalidate_page(vma->vm_mm, address);
	users = mem_map[MAP_NR(page)].count;
	free_page(page);
	return users;
}

487 488 489 490 491 492 493 494 495 496 497 498 499
/*
 * A new implementation of swap_out().  We do not swap complete processes,
 * but only a small number of blocks, before we continue with the next
 * process.  The number of blocks actually swapped is determined on the
 * number of page faults, that this process actually had in the last time,
 * so we won't swap heavily used processes all the time ...
 *
 * Note: the priority argument is a hint on how much CPU to waste with
 *       the swap block search, not a hint of how many blocks to swap
 *       with each process.
 *
 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de
 */
Linus Torvalds's avatar
Linus Torvalds committed
500

Linus Torvalds's avatar
Linus Torvalds committed
501 502
/*
 * Walk the page-table entries under *dir covering [address, end),
 * clipped to the end of this pmd, and try to swap out each page.
 * tsk->swap_address is advanced past every page before it is tried.
 *
 * Returns the first non-zero result of try_to_swap_out(), or 0 if no
 * page produced one.  A bad pmd is logged and cleared.
 */
static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct * vma,
	pmd_t *dir, unsigned long address, unsigned long end, unsigned long limit)
{
	pte_t *ptep;
	unsigned long pmd_end;
	int result;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("swap_out_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}

	ptep = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	/* At least one page is always tried, as in a do/while loop. */
	for (;;) {
		tsk->swap_address = address + PAGE_SIZE;
		result = try_to_swap_out(tsk, vma, address, ptep, limit);
		if (result)
			return result;
		address += PAGE_SIZE;
		ptep++;
		if (address >= end)
			break;
	}
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
533 534
/*
 * Walk the pmd entries under *dir covering [address, end), clipped to
 * the end of this pgd entry, handing each one to swap_out_pmd().
 *
 * Returns the first non-zero result of swap_out_pmd(), or 0 if there
 * was none.  A bad pgd entry is logged and cleared.
 */
static inline int swap_out_pgd(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *dir, unsigned long address, unsigned long end, unsigned long limit)
{
	pmd_t *pmdp;
	unsigned long pgd_end;
	int result;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("swap_out_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}

	pmdp = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (end > pgd_end)
		end = pgd_end;

	/* At least one pmd is always tried, as in a do/while loop. */
	for (;;) {
		result = swap_out_pmd(tsk, vma, pmdp, address, end, limit);
		if (result)
			return result;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmdp++;
		if (address >= end)
			break;
	}
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
563 564
/*
 * Scan one vm area from `start' to its end, one page-directory entry at
 * a time, via swap_out_pgd().
 *
 * Returns the first non-zero result of swap_out_pgd(), or 0.
 */
static int swap_out_vma(struct task_struct * tsk, struct vm_area_struct * vma,
	pgd_t *pgdir, unsigned long start, unsigned long limit)
{
	unsigned long end;
	int result;

	/* Don't swap out areas like shared memory which have their
	    own separate swapping mechanism or areas which are locked down */
	if (vma->vm_flags & (VM_SHM | VM_LOCKED))
		return 0;

	end = vma->vm_end;
	for (; start < end; start = (start + PGDIR_SIZE) & PGDIR_MASK, pgdir++) {
		result = swap_out_pgd(tsk, vma, pgdir, start, end, limit);
		if (result)
			return result;
	}
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
584
static int swap_out_process(struct task_struct * p, unsigned long limit)
585
{
Linus Torvalds's avatar
Linus Torvalds committed
586
	unsigned long address;
Linus Torvalds's avatar
Linus Torvalds committed
587
	struct vm_area_struct* vma;
588 589 590 591

	/*
	 * Go through process' page directory.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
592 593
	address = p->swap_address;
	p->swap_address = 0;
Linus Torvalds's avatar
Linus Torvalds committed
594 595 596 597

	/*
	 * Find the proper vm-area
	 */
Linus Torvalds's avatar
Linus Torvalds committed
598 599 600
	vma = find_vma(p, address);
	if (!vma)
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
601 602 603
	if (address < vma->vm_start)
		address = vma->vm_start;

Linus Torvalds's avatar
Linus Torvalds committed
604
	for (;;) {
Linus Torvalds's avatar
Linus Torvalds committed
605
		int result = swap_out_vma(p, vma, pgd_offset(p->mm, address), address, limit);
Linus Torvalds's avatar
Linus Torvalds committed
606 607
		if (result)
			return result;
Linus Torvalds's avatar
Linus Torvalds committed
608 609
		vma = vma->vm_next;
		if (!vma)
Linus Torvalds's avatar
Linus Torvalds committed
610
			break;
Linus Torvalds's avatar
Linus Torvalds committed
611
		address = vma->vm_start;
612
	}
Linus Torvalds's avatar
Linus Torvalds committed
613
	p->swap_address = 0;
Linus Torvalds's avatar
Linus Torvalds committed
614
	return 0;
615 616
}

Linus Torvalds's avatar
Linus Torvalds committed
617
static int swap_out(unsigned int priority, unsigned long limit)
Linus Torvalds's avatar
Linus Torvalds committed
618
{
Linus Torvalds's avatar
Linus Torvalds committed
619
	static int swap_task;
Linus Torvalds's avatar
Linus Torvalds committed
620
	int loop, counter;
Linus Torvalds's avatar
Linus Torvalds committed
621 622
	struct task_struct *p;

Linus Torvalds's avatar
Linus Torvalds committed
623
	counter = ((PAGEOUT_WEIGHT * nr_tasks) >> 10) >> priority;
Linus Torvalds's avatar
Linus Torvalds committed
624
	for(; counter >= 0; counter--) {
Linus Torvalds's avatar
Linus Torvalds committed
625 626 627 628 629 630 631 632 633 634 635 636 637 638 639
		/*
		 * Check that swap_task is suitable for swapping.  If not, look for
		 * the next suitable process.
		 */
		loop = 0;
		while(1) {
			if (swap_task >= NR_TASKS) {
				swap_task = 1;
				if (loop)
					/* all processes are unswappable or already swapped out */
					return 0;
				loop = 1;
			}

			p = task[swap_task];
Linus Torvalds's avatar
Linus Torvalds committed
640
			if (p && p->swappable && p->mm->rss)
Linus Torvalds's avatar
Linus Torvalds committed
641 642 643 644 645 646 647 648
				break;

			swap_task++;
		}

		/*
		 * Determine the number of pages to swap from this process.
		 */
Linus Torvalds's avatar
Linus Torvalds committed
649
		if (!p->swap_cnt) {
Linus Torvalds's avatar
Linus Torvalds committed
650 651 652
 			/* Normalise the number of pages swapped by
			   multiplying by (RSS / 1MB) */
			p->swap_cnt = AGE_CLUSTER_SIZE(p->mm->rss);
Linus Torvalds's avatar
Linus Torvalds committed
653
		}
Linus Torvalds's avatar
Linus Torvalds committed
654
		if (!--p->swap_cnt)
Linus Torvalds's avatar
Linus Torvalds committed
655
			swap_task++;
Linus Torvalds's avatar
Linus Torvalds committed
656
		switch (swap_out_process(p, limit)) {
Linus Torvalds's avatar
Linus Torvalds committed
657
			case 0:
Linus Torvalds's avatar
Linus Torvalds committed
658
				if (p->swap_cnt)
Linus Torvalds's avatar
Linus Torvalds committed
659 660 661 662 663 664
					swap_task++;
				break;
			case 1:
				return 1;
			default:
				break;
Linus Torvalds's avatar
Linus Torvalds committed
665
		}
666
	}
Linus Torvalds's avatar
Linus Torvalds committed
667
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
668 669
}

Linus Torvalds's avatar
Linus Torvalds committed
670
/*
Linus Torvalds's avatar
Linus Torvalds committed
671 672 673
 * We are much more aggressive about trying to swap out than we used
 * to be.  This works out OK, because we now do proper aging on page
 * contents. 
Linus Torvalds's avatar
Linus Torvalds committed
674
 */
Linus Torvalds's avatar
Linus Torvalds committed
675
static int try_to_free_page(int priority, unsigned long limit)
676
{
Linus Torvalds's avatar
Linus Torvalds committed
677
	static int state = 0;
678 679
	int i=6;

Linus Torvalds's avatar
Linus Torvalds committed
680 681 682
	switch (state) {
		do {
		case 0:
Linus Torvalds's avatar
Linus Torvalds committed
683
			if (priority != GFP_NOBUFFER && shrink_buffers(i, limit))
Linus Torvalds's avatar
Linus Torvalds committed
684
				return 1;
Linus Torvalds's avatar
Linus Torvalds committed
685
			state = 1;
Linus Torvalds's avatar
Linus Torvalds committed
686
		case 1:
Linus Torvalds's avatar
Linus Torvalds committed
687
			if (shm_swap(i, limit))
Linus Torvalds's avatar
Linus Torvalds committed
688
				return 1;
Linus Torvalds's avatar
Linus Torvalds committed
689 690
			state = 2;
		default:
Linus Torvalds's avatar
Linus Torvalds committed
691
			if (swap_out(i, limit))
Linus Torvalds's avatar
Linus Torvalds committed
692
				return 1;
Linus Torvalds's avatar
Linus Torvalds committed
693
			state = 0;
Linus Torvalds's avatar
Linus Torvalds committed
694
		} while(i--);
695 696 697 698
	}
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
699 700 701
/*
 * Link `entry' into the doubly linked list immediately after `head'.
 */
static inline void add_mem_queue(struct mem_list * head, struct mem_list * entry)
{
	struct mem_list *first;

	entry->prev = head;
	first = head->next;
	entry->next = first;
	first->prev = entry;
	head->next = entry;
}

static inline void remove_mem_queue(struct mem_list * head, struct mem_list * entry)
707
{
Linus Torvalds's avatar
Linus Torvalds committed
708 709
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
710 711
}

712 713 714 715 716 717 718 719 720 721
/*
 * Free_page() adds the page to the free lists. This is optimized for
 * fast normal cases (no error jumps taken normally).
 *
 * The way to optimize jumps for gcc-2.2.2 is to:
 *  - select the "normal" case and put it inside the if () { XXX }
 *  - no else-statements if you can avoid them
 *
 * With the above two rules, you get a straight-line execution path
 * for the normal case, giving better asm-code.
 *
 * free_page() may sleep since the page being freed may be a buffer
 * page or present in the swap cache. It will not sleep, however,
 * for a freshly allocated page (get_free_page()).
 */
Linus Torvalds's avatar
Linus Torvalds committed
727 728 729 730 731 732

/*
 * Buddy system. Hairy. You really aren't expected to understand this
 */
static inline void free_pages_ok(unsigned long addr, unsigned long order)
{
Linus Torvalds's avatar
Linus Torvalds committed
733
	unsigned long index = MAP_NR(addr) >> (1 + order);
Linus Torvalds's avatar
Linus Torvalds committed
734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749
	unsigned long mask = PAGE_MASK << order;

	addr &= mask;
	nr_free_pages += 1 << order;
	while (order < NR_MEM_LISTS-1) {
		if (!change_bit(index, free_area_map[order]))
			break;
		remove_mem_queue(free_area_list+order, (struct mem_list *) (addr ^ (1+~mask)));
		order++;
		index >>= 1;
		mask <<= 1;
		addr &= mask;
	}
	add_mem_queue(free_area_list+order, (struct mem_list *) addr);
}

Linus Torvalds's avatar
Linus Torvalds committed
750 751 752 753 754 755 756 757
static inline void check_free_buffers(unsigned long addr)
{
	struct buffer_head * bh;

	bh = buffer_pages[MAP_NR(addr)];
	if (bh) {
		struct buffer_head *tmp = bh;
		do {
Linus Torvalds's avatar
Linus Torvalds committed
758 759
			if (tmp->b_list == BUF_SHARED
			    && tmp->b_dev != B_FREE)
Linus Torvalds's avatar
Linus Torvalds committed
760 761 762 763 764 765
				refile_buffer(tmp);
			tmp = tmp->b_this_page;
		} while (tmp != bh);
	}
}

Linus Torvalds's avatar
Linus Torvalds committed
766
void free_pages(unsigned long addr, unsigned long order)
767
{
Linus Torvalds's avatar
Linus Torvalds committed
768
	if (MAP_NR(addr) < MAP_NR(high_memory)) {
Linus Torvalds's avatar
Linus Torvalds committed
769
		unsigned long flag;
Linus Torvalds's avatar
Linus Torvalds committed
770
		mem_map_t * map = mem_map + MAP_NR(addr);
Linus Torvalds's avatar
Linus Torvalds committed
771 772 773 774 775 776 777 778
		if (map->reserved)
			return;
		if (map->count) {
			save_flags(flag);
			cli();
			if (!--map->count) {
				free_pages_ok(addr, order);
				delete_from_swap_cache(addr);
779
			}
Linus Torvalds's avatar
Linus Torvalds committed
780 781 782
			restore_flags(flag);
			if (map->count == 1)
				check_free_buffers(addr);
783 784
			return;
		}
Linus Torvalds's avatar
Linus Torvalds committed
785
		printk("Trying to free free memory (%08lx): memory probably corrupted\n",addr);
Linus Torvalds's avatar
Linus Torvalds committed
786
		printk("PC = %p\n", __builtin_return_address(0));
787 788 789 790
		return;
	}
}

791
/*
 * Some ugly macros to speed up __get_free_pages()..
 *
 * RMQUEUE(order, limit): walk the free lists from 'order' upwards and
 * take the first block whose address is below 'limit'.  On success the
 * block is unlinked, its bit in the free-area map is flipped with
 * mark_used(), nr_free_pages drops by 1 << order, the saved flags are
 * restored, EXPAND() gives back the unused part of a larger block, and
 * the macro RETURNS the block address from the enclosing function.
 * If no list holds a suitable block, control simply falls out of the
 * macro.
 *
 * The macro relies on a local variable named 'flags' in the enclosing
 * function; __get_free_pages() invokes it after cli().
 */
#define RMQUEUE(order, limit) \
do { struct mem_list * queue = free_area_list+order; \
     unsigned long new_order = order; \
	do { struct mem_list *prev = queue, *ret; \
		while (queue != (ret = prev->next)) { \
			if ((unsigned long) ret < (limit)) { \
				(prev->next = ret->next)->prev = prev; \
				mark_used((unsigned long) ret, new_order); \
				nr_free_pages -= 1 << order; \
				restore_flags(flags); \
				EXPAND(ret, order, new_order); \
				return (unsigned long) ret; \
			} \
			prev = ret; \
		} \
		new_order++; queue++; \
	} while (new_order < NR_MEM_LISTS); \
} while (0)

static inline int mark_used(unsigned long addr, unsigned long order)
Linus Torvalds's avatar
Linus Torvalds committed
814
{
Linus Torvalds's avatar
Linus Torvalds committed
815
	return change_bit(MAP_NR(addr) >> (1+order), free_area_map[order]);
Linus Torvalds's avatar
Linus Torvalds committed
816
}
817

Linus Torvalds's avatar
Linus Torvalds committed
818 819 820 821 822 823 824
/*
 * EXPAND(addr, low, high): a block of order 'high' was taken but only
 * order 'low' is wanted.  The block is halved repeatedly: each time the
 * lower half is put on free_area_list[high] (after decrementing 'high')
 * with interrupts disabled, and 'addr' moves on to the upper half.
 * The block finally left at 'addr' gets count = 1 and
 * age = PAGE_INITIAL_AGE in mem_map.
 *
 * 'addr' and 'high' are modified, so both must be lvalues.  Like
 * RMQUEUE, this uses the enclosing function's local 'flags'.
 */
#define EXPAND(addr,low,high) \
do { unsigned long size = PAGE_SIZE << high; \
	while (high > low) { \
		high--; size >>= 1; cli(); \
		add_mem_queue(free_area_list+high, addr); \
		mark_used((unsigned long) addr, high); \
		restore_flags(flags); \
		addr = (struct mem_list *) (size + (unsigned long) addr); \
	} mem_map[MAP_NR((unsigned long) addr)].count = 1; \
	mem_map[MAP_NR((unsigned long) addr)].age = PAGE_INITIAL_AGE; \
} while (0)

Linus Torvalds's avatar
Linus Torvalds committed
830
unsigned long __get_free_pages(int priority, unsigned long order, unsigned long limit)
Linus Torvalds's avatar
Linus Torvalds committed
831 832
{
	unsigned long flags;
Linus Torvalds's avatar
Linus Torvalds committed
833
	int reserved_pages;
Linus Torvalds's avatar
Linus Torvalds committed
834

Linus Torvalds's avatar
Linus Torvalds committed
835 836
	if (order >= NR_MEM_LISTS)
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
837
	if (intr_count && priority != GFP_ATOMIC) {
Linus Torvalds's avatar
Linus Torvalds committed
838 839
		static int count = 0;
		if (++count < 5) {
Linus Torvalds's avatar
Linus Torvalds committed
840 841
			printk("gfp called nonatomically from interrupt %p\n",
				__builtin_return_address(0));
Linus Torvalds's avatar
Linus Torvalds committed
842 843
			priority = GFP_ATOMIC;
		}
Linus Torvalds's avatar
Linus Torvalds committed
844
	}
Linus Torvalds's avatar
Linus Torvalds committed
845 846 847
	reserved_pages = 5;
	if (priority != GFP_NFS)
		reserved_pages = min_free_pages;
Linus Torvalds's avatar
Linus Torvalds committed
848
	save_flags(flags);
Linus Torvalds's avatar
Linus Torvalds committed
849
repeat:
Linus Torvalds's avatar
Linus Torvalds committed
850
	cli();
Linus Torvalds's avatar
Linus Torvalds committed
851
	if ((priority==GFP_ATOMIC) || nr_free_pages > reserved_pages) {
Linus Torvalds's avatar
Linus Torvalds committed
852
		RMQUEUE(order, limit);
Linus Torvalds's avatar
Linus Torvalds committed
853
		restore_flags(flags);
854
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
855 856
	}
	restore_flags(flags);
Linus Torvalds's avatar
Linus Torvalds committed
857
	if (priority != GFP_BUFFER && try_to_free_page(priority, limit))
Linus Torvalds's avatar
Linus Torvalds committed
858
		goto repeat;
859 860 861
	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880
/*
 * Show free area list (used inside shift_scroll-lock stuff)
 *
 * Prints nr_free_pages in kB, then for every order the number of blocks
 * on that free list and the block size in kB, and finally the total
 * obtained by adding the lists up.  The lists are walked with
 * interrupts disabled.  When SWAP_CACHE_INFO is defined the swap cache
 * statistics are printed as well.
 */
void show_free_areas(void)
{
	unsigned long order, flags;
	unsigned long total = 0;

	printk("Free pages:      %6dkB\n ( ",nr_free_pages<<(PAGE_SHIFT-10));
	save_flags(flags);
	cli();
	for (order=0 ; order < NR_MEM_LISTS; order++) {
		struct mem_list * tmp;
		unsigned long nr = 0;
		/* each list is circular: stop when we are back at its head */
		for (tmp = free_area_list[order].next ; tmp != free_area_list + order ; tmp = tmp->next) {
			nr ++;
		}
		total += nr * ((PAGE_SIZE>>10) << order);
		printk("%lu*%lukB ", nr, (PAGE_SIZE>>10) << order);
	}
	restore_flags(flags);
	printk("= %lukB)\n", total);
#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif	
}

891 892 893
/*
 * Trying to stop swapping from a file is fraught with races, so
 * we repeat quite a bit here when we have to pause. swapoff()
Linus Torvalds's avatar
Linus Torvalds committed
894 895 896 897 898
 * isn't exactly timing-critical, so who cares (but this is /really/
 * inefficient, ugh).
 *
 * We return 1 after having slept, which makes the process start over
 * from the beginning for this process..
899
 */
Linus Torvalds's avatar
Linus Torvalds committed
900 901
static inline int unuse_pte(struct vm_area_struct * vma, unsigned long address,
	pte_t *dir, unsigned int type, unsigned long page)
902
{
Linus Torvalds's avatar
Linus Torvalds committed
903 904 905 906 907 908 909 910 911 912 913 914 915
	pte_t pte = *dir;

	if (pte_none(pte))
		return 0;
	if (pte_present(pte)) {
		unsigned long page = pte_page(pte);
		if (page >= high_memory)
			return 0;
		if (!in_swap_cache(page))
			return 0;
		if (SWP_TYPE(in_swap_cache(page)) != type)
			return 0;
		delete_from_swap_cache(page);
Linus Torvalds's avatar
Linus Torvalds committed
916
		set_pte(dir, pte_mkdirty(pte));
Linus Torvalds's avatar
Linus Torvalds committed
917 918 919 920 921 922 923 924 925
		return 0;
	}
	if (SWP_TYPE(pte_val(pte)) != type)
		return 0;
	read_swap_page(pte_val(pte), (char *) page);
	if (pte_val(*dir) != pte_val(pte)) {
		free_page(page);
		return 1;
	}
Linus Torvalds's avatar
Linus Torvalds committed
926
	set_pte(dir, pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))));
Linus Torvalds's avatar
Linus Torvalds committed
927
	++vma->vm_mm->rss;
Linus Torvalds's avatar
Linus Torvalds committed
928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009
	swap_free(pte_val(pte));
	return 1;
}

/*
 * Run unuse_pte() over the page-table entries below one pmd entry,
 * covering 'size' bytes from 'address' but never past the end of the
 * pmd.  An empty pmd is skipped; a bad one is reported and cleared.
 * Returns 1 as soon as unuse_pte() does, 0 otherwise.
 */
static inline int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
	unsigned long address, unsigned long size, unsigned long offset,
	unsigned int type, unsigned long page)
{
	pte_t *ptep;
	unsigned long pos, limit;

	if (pmd_none(*dir))
		return 0;
	if (pmd_bad(*dir)) {
		printk("unuse_pmd: bad pmd (%08lx)\n", pmd_val(*dir));
		pmd_clear(dir);
		return 0;
	}
	ptep = pte_offset(dir, address);
	offset += address & PMD_MASK;
	pos = address & ~PMD_MASK;
	limit = pos + size;
	if (limit > PMD_SIZE)
		limit = PMD_SIZE;
	/* the first entry is always visited, as in a do-while */
	for (;;) {
		if (unuse_pte(vma, offset+pos-vma->vm_start, ptep, type, page))
			return 1;
		pos += PAGE_SIZE;
		ptep++;
		if (pos >= limit)
			break;
	}
	return 0;
}

/*
 * Run unuse_pmd() over the pmd entries below one pgd entry, covering
 * 'size' bytes from 'address' but never past the end of the pgd.
 * An empty pgd is skipped; a bad one is reported and cleared.
 * Returns 1 as soon as unuse_pmd() does, 0 otherwise.
 */
static inline int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
	unsigned long address, unsigned long size,
	unsigned int type, unsigned long page)
{
	pmd_t *pmdp;
	unsigned long base, pos, limit;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		printk("unuse_pgd: bad pgd (%08lx)\n", pgd_val(*dir));
		pgd_clear(dir);
		return 0;
	}
	pmdp = pmd_offset(dir, address);
	base = address & PGDIR_MASK;
	pos = address & ~PGDIR_MASK;
	limit = pos + size;
	if (limit > PGDIR_SIZE)
		limit = PGDIR_SIZE;
	/* the first entry is always visited, as in a do-while */
	for (;;) {
		if (unuse_pmd(vma, pmdp, pos, limit - pos, base, type, page))
			return 1;
		pos = (pos + PMD_SIZE) & PMD_MASK;
		pmdp++;
		if (pos >= limit)
			break;
	}
	return 0;
}

/*
 * Run unuse_pgd() over every pgd entry spanned by [start, end),
 * beginning at 'pgdir'.  Returns 1 as soon as unuse_pgd() does,
 * 0 once the whole range has been covered.
 */
static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
	unsigned long start, unsigned long end,
	unsigned int type, unsigned long page)
{
	for (; start < end; pgdir++) {
		if (unuse_pgd(vma, pgdir, start, end - start, type, page))
			return 1;
		/* advance to the start of the next pgd entry */
		start = (start + PGDIR_SIZE) & PGDIR_MASK;
	}
	return 0;
}

static int unuse_process(struct task_struct * p, unsigned int type, unsigned long page)
{
	struct vm_area_struct* vma;

	/*
	 * Go through process' page directory.
	 */
Linus Torvalds's avatar
Linus Torvalds committed
1010 1011
	if (!p->mm || pgd_inuse(p->mm->pgd))
		return 0;
Linus Torvalds's avatar
Linus Torvalds committed
1012 1013
	vma = p->mm->mmap;
	while (vma) {
Linus Torvalds's avatar
Linus Torvalds committed
1014
		pgd_t * pgd = pgd_offset(p->mm, vma->vm_start);
Linus Torvalds's avatar
Linus Torvalds committed
1015 1016 1017 1018 1019 1020
		if (unuse_vma(vma, pgd, vma->vm_start, vma->vm_end, type, page))
			return 1;
		vma = vma->vm_next;
	}
	return 0;
}
1021 1022

/*
Linus Torvalds's avatar
Linus Torvalds committed
1023 1024 1025
 * To avoid races, we repeat for each process after having
 * swapped something in. That gets rid of a few pesky races,
 * and "swapoff" isn't exactly timing critical.
1026
 */
Linus Torvalds's avatar
Linus Torvalds committed
1027 1028 1029 1030
static int try_to_unuse(unsigned int type)
{
	int nr;
	unsigned long page = get_free_page(GFP_KERNEL);
Linus Torvalds's avatar
Linus Torvalds committed
1031

Linus Torvalds's avatar
Linus Torvalds committed
1032 1033 1034 1035 1036 1037 1038 1039 1040
	if (!page)
		return -ENOMEM;
	nr = 0;
	while (nr < NR_TASKS) {
		if (task[nr]) {
			if (unuse_process(task[nr], type, page)) {
				page = get_free_page(GFP_KERNEL);
				if (!page)
					return -ENOMEM;
1041
				continue;
Linus Torvalds's avatar
Linus Torvalds committed
1042
			}
1043
		}
Linus Torvalds's avatar
Linus Torvalds committed
1044
		nr++;
1045
	}
Linus Torvalds's avatar
Linus Torvalds committed
1046
	free_page(page);
1047 1048 1049
	return 0;
}

1050
asmlinkage int sys_swapoff(const char * specialfile)
1051 1052 1053
{
	struct swap_info_struct * p;
	struct inode * inode;
Linus Torvalds's avatar
Linus Torvalds committed
1054
	struct file filp;
Linus Torvalds's avatar
Linus Torvalds committed
1055
	int i, type, prev;
1056 1057 1058 1059 1060 1061

	if (!suser())
		return -EPERM;
	i = namei(specialfile,&inode);
	if (i)
		return i;
Linus Torvalds's avatar
Linus Torvalds committed
1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073
	prev = -1;
	for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
		p = swap_info + type;
		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
			if (p->swap_file) {
				if (p->swap_file == inode)
				  break;
			} else {
				if (S_ISBLK(inode->i_mode)
				    && (p->swap_device == inode->i_rdev))
				  break;
			}
1074
		}
Linus Torvalds's avatar
Linus Torvalds committed
1075
		prev = type;
1076
	}
Linus Torvalds's avatar
Linus Torvalds committed
1077
	if (type < 0){
Linus Torvalds's avatar
Linus Torvalds committed
1078
		iput(inode);
1079
		return -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
1080
	}
Linus Torvalds's avatar
Linus Torvalds committed
1081 1082 1083 1084 1085 1086 1087 1088 1089
	if (prev < 0) {
		swap_list.head = p->next;
	} else {
		swap_info[prev].next = p->next;
	}
	if (type == swap_list.next) {
		/* just pick something that's safe... */
		swap_list.next = swap_list.head;
	}
1090
	p->flags = SWP_USED;
1091
	i = try_to_unuse(type);
1092
	if (i) {
Linus Torvalds's avatar
Linus Torvalds committed
1093
		iput(inode);
1094 1095 1096
		p->flags = SWP_WRITEOK;
		return i;
	}
Linus Torvalds's avatar
Linus Torvalds committed
1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110

	if(p->swap_device){
		memset(&filp, 0, sizeof(filp));		
		filp.f_inode = inode;
		filp.f_mode = 3; /* read write */
		/* open it again to get fops */
		if( !blkdev_open(inode, &filp) &&
		   filp.f_op && filp.f_op->release){
			filp.f_op->release(inode,&filp);
			filp.f_op->release(inode,&filp);
		}
	}
	iput(inode);

1111
	nr_swap_pages -= p->pages;
1112 1113 1114
	iput(p->swap_file);
	p->swap_file = NULL;
	p->swap_device = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1115
	vfree(p->swap_map);
1116 1117 1118 1119
	p->swap_map = NULL;
	free_page((long) p->swap_lockmap);
	p->swap_lockmap = NULL;
	p->flags = 0;
1120
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
1121 1122
}

1123 1124 1125 1126 1127
/*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
 * The swapon system call
 */
Linus Torvalds's avatar
Linus Torvalds committed
1128
asmlinkage int sys_swapon(const char * specialfile, int swap_flags)
Linus Torvalds's avatar
Linus Torvalds committed
1129
{
1130
	struct swap_info_struct * p;
1131
	struct inode * swap_inode;
1132
	unsigned int type;
Linus Torvalds's avatar
Linus Torvalds committed
1133
	int i, j, prev;
Linus Torvalds's avatar
Linus Torvalds committed
1134
	int error;
Linus Torvalds's avatar
Linus Torvalds committed
1135
	struct file filp;
Linus Torvalds's avatar
Linus Torvalds committed
1136
	static int least_priority = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1137

Linus Torvalds's avatar
Linus Torvalds committed
1138
	memset(&filp, 0, sizeof(filp));
1139 1140
	if (!suser())
		return -EPERM;
1141
	p = swap_info;
1142
	for (type = 0 ; type < nr_swapfiles ; type++,p++)
1143 1144
		if (!(p->flags & SWP_USED))
			break;
1145
	if (type >= MAX_SWAPFILES)
1146
		return -EPERM;
1147 1148
	if (type >= nr_swapfiles)
		nr_swapfiles = type+1;
1149 1150 1151 1152 1153 1154 1155
	p->flags = SWP_USED;
	p->swap_file = NULL;
	p->swap_device = 0;
	p->swap_map = NULL;
	p->swap_lockmap = NULL;
	p->lowest_bit = 0;
	p->highest_bit = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1156
	p->max = 1;
Linus Torvalds's avatar
Linus Torvalds committed
1157 1158 1159 1160 1161 1162 1163
	p->next = -1;
	if (swap_flags & SWAP_FLAG_PREFER) {
		p->prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
	} else {
		p->prio = --least_priority;
	}
Linus Torvalds's avatar
Linus Torvalds committed
1164 1165
	error = namei(specialfile,&swap_inode);
	if (error)
Linus Torvalds's avatar
Linus Torvalds committed
1166
		goto bad_swap_2;
Linus Torvalds's avatar
Linus Torvalds committed
1167
	p->swap_file = swap_inode;
Linus Torvalds's avatar
Linus Torvalds committed
1168 1169
	error = -EBUSY;
	if (swap_inode->i_count != 1)
Linus Torvalds's avatar
Linus Torvalds committed
1170
		goto bad_swap_2;
Linus Torvalds's avatar
Linus Torvalds committed
1171
	error = -EINVAL;
Linus Torvalds's avatar
Linus Torvalds committed
1172

1173
	if (S_ISBLK(swap_inode->i_mode)) {
1174
		p->swap_device = swap_inode->i_rdev;
Linus Torvalds's avatar
Linus Torvalds committed
1175 1176 1177 1178

		filp.f_inode = swap_inode;
		filp.f_mode = 3; /* read write */
		error = blkdev_open(swap_inode, &filp);
Linus Torvalds's avatar
Linus Torvalds committed
1179
		p->swap_file = NULL;
1180
		iput(swap_inode);
Linus Torvalds's avatar
Linus Torvalds committed
1181 1182
		if(error)
			goto bad_swap_2;
Linus Torvalds's avatar
Linus Torvalds committed
1183 1184 1185 1186
		error = -ENODEV;
		if (!p->swap_device)
			goto bad_swap;
		error = -EBUSY;
1187
		for (i = 0 ; i < nr_swapfiles ; i++) {
1188
			if (i == type)
1189
				continue;
Linus Torvalds's avatar
Linus Torvalds committed
1190 1191
			if (p->swap_device == swap_info[i].swap_device)
				goto bad_swap;
1192
		}
Linus Torvalds's avatar
Linus Torvalds committed
1193
	} else if (!S_ISREG(swap_inode->i_mode))
Linus Torvalds's avatar
Linus Torvalds committed
1194
		goto bad_swap;
1195
	p->swap_lockmap = (unsigned char *) get_free_page(GFP_USER);
Linus Torvalds's avatar
Linus Torvalds committed
1196
	if (!p->swap_lockmap) {
1197
		printk("Unable to start swapping: out of memory :-)\n");
Linus Torvalds's avatar
Linus Torvalds committed
1198 1199 1200 1201
		error = -ENOMEM;
		goto bad_swap;
	}
	read_swap_page(SWP_ENTRY(type,0), (char *) p->swap_lockmap);
Linus Torvalds's avatar
Linus Torvalds committed
1202
	if (memcmp("SWAP-SPACE",p->swap_lockmap+PAGE_SIZE-10,10)) {
1203
		printk("Unable to find swap-space signature\n");
Linus Torvalds's avatar
Linus Torvalds committed
1204 1205
		error = -EINVAL;
		goto bad_swap;
Linus Torvalds's avatar
Linus Torvalds committed
1206
	}
Linus Torvalds's avatar
Linus Torvalds committed
1207
	memset(p->swap_lockmap+PAGE_SIZE-10,0,10);
Linus Torvalds's avatar
Linus Torvalds committed
1208
	j = 0;
1209 1210
	p->lowest_bit = 0;
	p->highest_bit = 0;
Linus Torvalds's avatar
Linus Torvalds committed
1211 1212
	for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
		if (test_bit(i,p->swap_lockmap)) {
1213 1214 1215
			if (!p->lowest_bit)
				p->lowest_bit = i;
			p->highest_bit = i;
Linus Torvalds's avatar
Linus Torvalds committed
1216
			p->max = i+1;
Linus Torvalds's avatar
Linus Torvalds committed
1217
			j++;
1218
		}
Linus Torvalds's avatar
Linus Torvalds committed
1219
	}
Linus Torvalds's avatar
Linus Torvalds committed
1220
	if (!j) {
1221
		printk("Empty swap-file\n");
Linus Torvalds's avatar
Linus Torvalds committed
1222 1223
		error = -EINVAL;
		goto bad_swap;
Linus Torvalds's avatar
Linus Torvalds committed
1224
	}
1225
	p->swap_map = (unsigned char *) vmalloc(p->max);
Linus Torvalds's avatar
Linus Torvalds committed
1226 1227 1228 1229 1230 1231 1232
	if (!p->swap_map) {
		error = -ENOMEM;
		goto bad_swap;
	}
	for (i = 1 ; i < p->max ; i++) {
		if (test_bit(i,p->swap_lockmap))
			p->swap_map[i] = 0;
1233
		else
Linus Torvalds's avatar
Linus Torvalds committed
1234 1235 1236 1237
			p->swap_map[i] = 0x80;
	}
	p->swap_map[0] = 0x80;
	memset(p->swap_lockmap,0,PAGE_SIZE);
1238
	p->flags = SWP_WRITEOK;
1239 1240
	p->pages = j;
	nr_swap_pages += j;
Linus Torvalds's avatar
Linus Torvalds committed
1241
	printk("Adding Swap: %dk swap-space\n",j<<(PAGE_SHIFT-10));
Linus Torvalds's avatar
Linus Torvalds committed
1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256

	/* insert swap space into swap_list: */
	prev = -1;
	for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
		if (p->prio >= swap_info[i].prio) {
			break;
		}
		prev = i;
	}
	p->next = i;
	if (prev < 0) {
		swap_list.head = swap_list.next = p - swap_info;
	} else {
		swap_info[prev].next = p - swap_info;
	}
1257
	return 0;
Linus Torvalds's avatar
Linus Torvalds committed
1258
bad_swap:
Linus Torvalds's avatar
Linus Torvalds committed
1259 1260 1261
	if(filp.f_op && filp.f_op->release)
		filp.f_op->release(filp.f_inode,&filp);
bad_swap_2:
Linus Torvalds's avatar
Linus Torvalds committed
1262 1263 1264 1265 1266 1267 1268 1269 1270
	free_page((long) p->swap_lockmap);
	vfree(p->swap_map);
	iput(p->swap_file);
	p->swap_device = 0;
	p->swap_file = NULL;
	p->swap_map = NULL;
	p->swap_lockmap = NULL;
	p->flags = 0;
	return error;
Linus Torvalds's avatar
Linus Torvalds committed
1271
}
1272 1273 1274 1275 1276 1277 1278

void si_swapinfo(struct sysinfo *val)
{
	unsigned int i, j;

	val->freeswap = val->totalswap = 0;
	for (i = 0; i < nr_swapfiles; i++) {
Linus Torvalds's avatar
Linus Torvalds committed
1279
		if ((swap_info[i].flags & SWP_WRITEOK) != SWP_WRITEOK)
1280
			continue;
Linus Torvalds's avatar
Linus Torvalds committed
1281
		for (j = 0; j < swap_info[i].max; ++j)
1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294
			switch (swap_info[i].swap_map[j]) {
				case 128:
					continue;
				case 0:
					++val->freeswap;
				default:
					++val->totalswap;
			}
	}
	val->freeswap <<= PAGE_SHIFT;
	val->totalswap <<= PAGE_SHIFT;
	return;
}
Linus Torvalds's avatar
Linus Torvalds committed
1295

Linus Torvalds's avatar
Linus Torvalds committed
1296 1297
/* Round x up to the next multiple of sizeof(long). */
#define LONG_ALIGN(x) (((x) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))

Linus Torvalds's avatar
Linus Torvalds committed
1298 1299
/*
 * set up the free-area data structures:
Linus Torvalds's avatar
Linus Torvalds committed
1300
 *   - mark all pages reserved
Linus Torvalds's avatar
Linus Torvalds committed
1301 1302 1303 1304 1305
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
unsigned long free_area_init(unsigned long start_mem, unsigned long end_mem)
{
Linus Torvalds's avatar
Linus Torvalds committed
1306
	mem_map_t * p;
Linus Torvalds's avatar
Linus Torvalds committed
1307 1308 1309
	unsigned long mask = PAGE_MASK;
	int i;

Linus Torvalds's avatar
Linus Torvalds committed
1310 1311 1312 1313
	/*
	 * select nr of pages we try to keep free for important stuff
	 * with a minimum of 16 pages. This is totally arbitrary
	 */
Linus Torvalds's avatar
Linus Torvalds committed
1314
	i = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT+6);
Linus Torvalds's avatar
Linus Torvalds committed
1315 1316 1317
	if (i < 16)
		i = 16;
	min_free_pages = i;
Linus Torvalds's avatar
Linus Torvalds committed
1318
	start_mem = init_swap_cache(start_mem, end_mem);
Linus Torvalds's avatar
Linus Torvalds committed
1319
	mem_map = (mem_map_t *) start_mem;
Linus Torvalds's avatar
Linus Torvalds committed
1320
	p = mem_map + MAP_NR(end_mem);
Linus Torvalds's avatar
Linus Torvalds committed
1321
	start_mem = LONG_ALIGN((unsigned long) p);
Linus Torvalds's avatar
Linus Torvalds committed
1322 1323 1324 1325 1326 1327
	while (p > mem_map) {
		--p;
		p->count = 0;
		p->dirty = 0;
		p->reserved = 1;
	}
Linus Torvalds's avatar
Linus Torvalds committed
1328

Linus Torvalds's avatar
Linus Torvalds committed
1329
	for (i = 0 ; i < NR_MEM_LISTS ; i++) {
Linus Torvalds's avatar
Linus Torvalds committed
1330 1331
		unsigned long bitmap_size;
		free_area_list[i].prev = free_area_list[i].next = &free_area_list[i];
Linus Torvalds's avatar
Linus Torvalds committed
1332
		mask += mask;
Linus Torvalds's avatar
Linus Torvalds committed
1333
		end_mem = (end_mem + ~mask) & mask;
Linus Torvalds's avatar
Linus Torvalds committed
1334
		bitmap_size = (end_mem - PAGE_OFFSET) >> (PAGE_SHIFT + i);
Linus Torvalds's avatar
Linus Torvalds committed
1335
		bitmap_size = (bitmap_size + 7) >> 3;
Linus Torvalds's avatar
Linus Torvalds committed
1336
		bitmap_size = LONG_ALIGN(bitmap_size);
Linus Torvalds's avatar
Linus Torvalds committed
1337 1338 1339 1340 1341 1342
		free_area_map[i] = (unsigned char *) start_mem;
		memset((void *) start_mem, 0, bitmap_size);
		start_mem += bitmap_size;
	}
	return start_mem;
}