diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 57597335536d36b25316bc448c6335053d94c4f8..81c04ff87b4a6d975e703646a4c9d0035bf25c1b 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -963,13 +963,6 @@ Contains, as a percentage of total system memory, the number of pages at which a process which is generating disk writes will itself start writing out dirty data. -dirty_sync_ratio ----------------- - -Contains, as a percentage of total system memory, the number of pages at which -a process which is generating disk writes will itself start writing out dirty -data and waiting upon completion of that writeout. - dirty_writeback_centisecs ------------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6ff0af89ae7749a92efcd218e4df4bf1320dd4ef..ed6ccff766f4b6942781a83f6bda00336e7e97f8 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -21,13 +21,12 @@ Currently, these files are in /proc/sys/vm: - dirty_async_ratio - dirty_background_ratio - dirty_expire_centisecs -- dirty_sync_ratio - dirty_writeback_centisecs ============================================================== dirty_async_ratio, dirty_background_ratio, dirty_expire_centisecs, -dirty_sync_ratio dirty_writeback_centisecs: +dirty_writeback_centisecs: See Documentation/filesystems/proc.txt diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 2458576ec8ae4bbc3eda60ef03155ce868a1ab70..5071c14da05987d2e173bcaecca2e2c90eed7721 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -286,7 +286,6 @@ void __init paging_init(void) for (nid = 0; nid < numnodes; nid++) { unsigned long start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT; unsigned long end_pfn = plat_node_bdata[nid].node_low_pfn; - unsigned long lmax_mapnr; if (dma_local_pfn >= end_pfn - start_pfn) zones_size[ZONE_DMA] = end_pfn - start_pfn; @@ -295,11 +294,6 @@ void __init paging_init(void) zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; } free_area_init_node(nid, NODE_DATA(nid), NULL, zones_size, start_pfn, NULL); - lmax_mapnr = PLAT_NODE_DATA_STARTNR(nid) + PLAT_NODE_DATA_SIZE(nid); - if (lmax_mapnr > max_mapnr) { - max_mapnr = lmax_mapnr; - DBGDCONT("Grow max_mapnr to %ld\n", max_mapnr); - } } /* Initialize the kernel's ZERO_PGE. 
*/ diff --git a/arch/i386/config.in b/arch/i386/config.in index 702ab169752ef9d2a41d86028c34a7fdf20e6ddd..861b2ad69154379da939c049db53d2c5e6036342 100644 --- a/arch/i386/config.in +++ b/arch/i386/config.in @@ -154,7 +154,7 @@ if [ "$CONFIG_MWINCHIP3D" = "y" ]; then define_bool CONFIG_X86_OOSTORE y fi -bool 'IA-32 Huge TLB Page Support (if available on processor)' CONFIG_HUGETLB_PAGE +bool 'Huge TLB Page Support' CONFIG_HUGETLB_PAGE bool 'Symmetric multi-processing support' CONFIG_SMP bool 'Preemptible Kernel' CONFIG_PREEMPT diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 00b09cc403a20aadbd822a562ec1038bc5c7ac1e..991024118fe6ce8b84f563b2be9c85f860795a5d 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret"); static void __init init_amd(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = max_mapnr >> (20-PAGE_SHIFT); + int mbytes = num_physpages >> (20-PAGE_SHIFT); int r; /* diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index 363a5b14acdcad7a2cf74415994635edc48d2e52..c59ccb1059f7c0dfab30d0ddfbae2efa459d887c 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -96,4 +96,7 @@ extern struct mtrr_ops * mtrr_if; extern unsigned int num_var_ranges; +void finalize_mtrr_state(void); +void mtrr_state_warn(void); + extern char * mtrr_if_name[]; diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 0be81aed67474ec6acf8a969b4d470042244d21f..8b447ebcb591291d4c8faac9c1bfb7879550ffdf 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -58,7 +58,11 @@ EXPORT_SYMBOL(boot_cpu_data); EXPORT_SYMBOL(EISA_bus); #endif EXPORT_SYMBOL(MCA_bus); -#ifdef CONFIG_MULTIQUAD +#ifdef CONFIG_DISCONTIGMEM +EXPORT_SYMBOL(node_data); +EXPORT_SYMBOL(pfn_to_nid); +#endif +#ifdef CONFIG_X86_NUMAQ EXPORT_SYMBOL(xquad_portio); #endif EXPORT_SYMBOL(__verify_write); diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c index ffd27c7d2d8157e3b5ea358fcfed4dd40a672c39..07cf91d92dd968d0be390566a93c6ed80b42841d 100644 --- a/arch/i386/kernel/numaq.c +++ b/arch/i386/kernel/numaq.c @@ -82,27 +82,19 @@ static void __init smp_dump_qct(void) */ int physnode_map[MAX_ELEMENTS] = { [0 ... 
(MAX_ELEMENTS - 1)] = -1}; -#define MB_TO_ELEMENT(x) (x >> ELEMENT_REPRESENTS) -#define PA_TO_MB(pa) (pa >> 20) /* assumption: a physical address is in bytes */ +#define PFN_TO_ELEMENT(pfn) (pfn / PAGES_PER_ELEMENT) +#define PA_TO_ELEMENT(pa) (PFN_TO_ELEMENT(pa >> PAGE_SHIFT)) -int pa_to_nid(u64 pa) +int pfn_to_nid(unsigned long pfn) { - int nid; - - nid = physnode_map[MB_TO_ELEMENT(PA_TO_MB(pa))]; + int nid = physnode_map[PFN_TO_ELEMENT(pfn)]; - /* the physical address passed in is not in the map for the system */ if (nid == -1) - BUG(); + BUG(); /* address is not present */ return nid; } -int pfn_to_nid(unsigned long pfn) -{ - return pa_to_nid(((u64)pfn) << PAGE_SHIFT); -} - /* * for each node mark the regions * TOPOFMEM = hi_shrd_mem_start + hi_shrd_mem_size @@ -132,7 +124,7 @@ static void __init initialize_physnode_map(void) topofmem = eq->hi_shrd_mem_start + eq->hi_shrd_mem_size; while (cur < topofmem) { physnode_map[cur >> 8] = nid; - cur += (ELEMENT_REPRESENTS - 1); + cur ++; } } } diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index de811d22ac098ccd3e292508a5973de14e8e7a9e..eab75722398a7a96a5741db0d44e730869f43974 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -275,20 +275,9 @@ void __init set_highmem_pages_init(int bad_ppro) void __init set_max_mapnr_init(void) { #ifdef CONFIG_HIGHMEM - unsigned long lmax_mapnr; - int nid; - - highmem_start_page = mem_map + NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_start_mapnr; + highmem_start_page = NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_mem_map; num_physpages = highend_pfn; - - for (nid = 0; nid < numnodes; nid++) { - lmax_mapnr = node_startnr(nid) + node_size(nid); - if (lmax_mapnr > max_mapnr) { - max_mapnr = lmax_mapnr; - } - } - #else - max_mapnr = num_physpages = max_low_pfn; + num_physpages = max_low_pfn; #endif } diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index c50cec1dbafb6af670061df747f76d843c938b0c..928622ee5b22e935bee8aa0cd7934bfaba09e799 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -319,7 +319,7 @@ set_new_inode(unsigned long len, int prot, int flag, int key) } if (i == MAX_ID) return NULL; - inode = kmalloc(sizeof (struct inode), GFP_KERNEL); + inode = kmalloc(sizeof (struct inode), GFP_ATOMIC); if (inode == NULL) return NULL; @@ -502,7 +502,7 @@ set_hugetlb_mem_size(int count) if (lcount > 0) { /* Increase the mem size. */ while (lcount--) { - page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER); + page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER); if (page == NULL) break; map = page; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 5d73c07fd72615eaf6c123a7e0a2bf750174e059..c672b966bcca8766d0d88d1747eb26ddf8090a9a 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -440,8 +440,10 @@ void __init mem_init(void) int tmp; int bad_ppro; +#ifndef CONFIG_DISCONTIGMEM if (!mem_map) BUG(); +#endif bad_ppro = ppro_with_ram_bug(); @@ -471,7 +473,7 @@ void __init mem_init(void) printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - max_mapnr << (PAGE_SHIFT-10), + num_physpages << (PAGE_SHIFT-10), codesize >> 10, reservedpages << (PAGE_SHIFT-10), datasize >> 10, @@ -504,7 +506,7 @@ void __init mem_init(void) /*Will make this kernel command line. 
*/ INIT_LIST_HEAD(&htlbpage_freelist); for (i=0; i<htlbzone_pages; i++) { - page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER); + page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER); if (page == NULL) break; map = page; diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 18a7664b115b7b3534dd985366617cad61fd9504..1f59100e77bf1271a18731471c68f542bd563ddb 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -22,26 +22,29 @@ void show_mem(void) { - int pfn, total = 0, reserved = 0; + int total = 0, reserved = 0; int shared = 0, cached = 0; int highmem = 0; struct page *page; + pg_data_t *pgdat; + unsigned long i; printk("Mem-info:\n"); show_free_areas(); printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); - pfn = max_mapnr; - while (pfn-- > 0) { - page = pfn_to_page(pfn); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; + for_each_pgdat(pgdat) { + for (i = 0; i < pgdat->node_size; ++i) { + page = pgdat->node_mem_map + i; + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); diff --git a/arch/mips64/sgi-ip27/ip27-memory.c b/arch/mips64/sgi-ip27/ip27-memory.c index e5f79e031816ade1efb35d11bb8f9f0c1f52af91..f46fa89f145f457e454354c2ab33035c6949ba21 100644 --- a/arch/mips64/sgi-ip27/ip27-memory.c +++ b/arch/mips64/sgi-ip27/ip27-memory.c @@ -254,10 +254,6 @@ void __init paging_init(void) zones_size[ZONE_DMA] = end_pfn + 1 - start_pfn; free_area_init_node(node, NODE_DATA(node), 0, zones_size, start_pfn, 0); - if ((PLAT_NODE_DATA_STARTNR(node) + - PLAT_NODE_DATA_SIZE(node)) > pagenr) - pagenr = PLAT_NODE_DATA_STARTNR(node) + - PLAT_NODE_DATA_SIZE(node); } } @@ -271,7 +267,6 @@ void __init mem_init(void) unsigned long codesize, datasize, initsize; int slot, numslots; struct page *pg, *pslot; - pfn_t pgnr; num_physpages = numpages; /* memory already sized by szmem */ max_mapnr = pagenr; /* already found during paging_init */ @@ -293,7 +288,6 @@ void __init mem_init(void) * We need to manually do the other slots. */ pg = NODE_DATA(nid)->node_mem_map + slot_getsize(nid, 0); - pgnr = PLAT_NODE_DATA_STARTNR(nid) + slot_getsize(nid, 0); numslots = node_getlastslot(nid); for (slot = 1; slot <= numslots; slot++) { pslot = NODE_DATA(nid)->node_mem_map + @@ -304,7 +298,7 @@ void __init mem_init(void) * free up the pages that hold the memmap entries. */ while (pg < pslot) { - pg++; pgnr++; + pg++; } /* @@ -312,8 +306,8 @@ void __init mem_init(void) */ pslot += slot_getsize(nid, slot); while (pg < pslot) { - if (!page_is_ram(pgnr)) - continue; + /* if (!page_is_ram(pgnr)) continue; */ + /* commented out until page_is_ram works */ ClearPageReserved(pg); atomic_set(&pg->count, 1); __free_page(pg); diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index 6410e974796ae83caf248066ffe43cf5c002bdde..d577bad64d683f5230b4a41bdf0c5c4299b2822f 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1733,7 +1733,7 @@ void __init mem_init(void) * Set up the zero page, mark it reserved, so that page count * is not manipulated when freeing the page from user ptes. 
*/ - mem_map_zero = _alloc_pages(GFP_KERNEL, 0); + mem_map_zero = alloc_pages(GFP_KERNEL, 0); if (mem_map_zero == NULL) { prom_printf("paging_init: Cannot alloc zero page.\n"); prom_halt(); diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 776233f78ac3431acf872692fe82df8804bf7c6c..8b5ae9a64e03d0371f3c97af0084895135252db2 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -36,7 +36,7 @@ static kmem_cache_t *request_cachep; /* * plug management */ -static struct list_head blk_plug_list; +static LIST_HEAD(blk_plug_list); static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; /* blk_dev_struct is: @@ -1875,27 +1875,16 @@ void end_that_request_last(struct request *req) blk_put_request(req); } -#define MB(kb) ((kb) << 10) - int __init blk_dev_init(void) { - struct blk_dev_struct *dev; - int total_ram; + int total_ram = nr_free_pages() << (PAGE_SHIFT - 10); request_cachep = kmem_cache_create("blkdev_requests", - sizeof(struct request), - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); - + sizeof(struct request), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); if (!request_cachep) panic("Can't create request pool slab cache\n"); - for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) - dev->queue = NULL; - - memset(ro_bits,0,sizeof(ro_bits)); - - total_ram = nr_free_pages() << (PAGE_SHIFT - 10); - /* * Free request slots per queue. * (Half for reads, half for writes) @@ -1911,17 +1900,12 @@ int __init blk_dev_init(void) */ if ((batch_requests = queue_nr_requests / 4) > 32) batch_requests = 32; - printk("block: %d slots per queue, batch=%d\n", queue_nr_requests, batch_requests); + printk("block: %d slots per queue, batch=%d\n", + queue_nr_requests, batch_requests); blk_max_low_pfn = max_low_pfn; blk_max_pfn = max_pfn; - INIT_LIST_HEAD(&blk_plug_list); - -#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD) - hd_init(); -#endif - return 0; }; diff --git a/drivers/char/raw.c b/drivers/char/raw.c index a2f05f72791d308d6c31debb0eadd6afc71948e0..2b08e77a18bbde5687a9b4f425c0bede2fe3c498 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -241,7 +241,7 @@ raw_read(struct file *filp, char *buf, size_t size, loff_t *offp) static ssize_t raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp) { - struct iovec local_iov = { .iov_base = buf, .iov_len = size}; + struct iovec local_iov = { .iov_base = (char *)buf, .iov_len = size}; return rw_raw_dev(WRITE, filp, &local_iov, 1, offp); } diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c index c0c042c1ddf10dadba00b4d35088adb99849f997..24c598563bd40ec72b7f5f53a4ceabdabee33044 100644 --- a/drivers/ide/legacy/hd.c +++ b/drivers/ide/legacy/hd.c @@ -846,7 +846,7 @@ static void __init hd_geninit(void) } } -int __init hd_init(void) +static int __init hd_init(void) { if (register_blkdev(MAJOR_NR,"hd",&hd_fops)) { printk("hd: unable to get major %d for hard disk\n",MAJOR_NR); diff --git a/drivers/pcmcia/sa1100.h b/drivers/pcmcia/sa1100.h index 713f5b49cf34b8ef2c8160b4703d8cb6dcdf3aca..53716e9dcf63577cf072c288fb6b241046dada2b 100644 --- a/drivers/pcmcia/sa1100.h +++ b/drivers/pcmcia/sa1100.h @@ -160,7 +160,7 @@ struct sa1100_pcmcia_socket { */ socket_state_t cs_state; pccard_io_map io_map[MAX_IO_WIN]; - pccard_mem_map mem_map[MAX_WIN]; + pccard_mem_map pc_mem_map[MAX_WIN]; void (*handler)(void *, unsigned int); void *handler_info; diff --git a/drivers/pcmcia/sa1100_generic.c b/drivers/pcmcia/sa1100_generic.c index 
ef238c0f90b730ced0fefc116b90f8913f13bcfd..12dc9270e402757e05189c7748cf9b19229e1ff4 100644 --- a/drivers/pcmcia/sa1100_generic.c +++ b/drivers/pcmcia/sa1100_generic.c @@ -686,7 +686,7 @@ sa1100_pcmcia_get_mem_map(unsigned int sock, struct pccard_mem_map *map) DEBUG(2, "%s() for sock %u\n", __FUNCTION__, sock); if (map->map < MAX_WIN) { - *map = skt->mem_map[map->map]; + *map = skt->pc_mem_map[map->map]; ret = 0; } @@ -754,7 +754,7 @@ sa1100_pcmcia_set_mem_map(unsigned int sock, struct pccard_mem_map *map) map->sys_stop += start; map->sys_start = start; - skt->mem_map[map->map] = *map; + skt->pc_mem_map[map->map] = *map; return 0; } /* sa1100_pcmcia_set_mem_map() */ diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 101a2b10fe2e8c646f6281794e7a55276bd15bbf..448bb2f167b108f364ceebd6a803fb9e1e21d236 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -2537,6 +2537,7 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask) { struct scsi_host_sg_pool *sgp; struct scatterlist *sgl; + int pf_flags; BUG_ON(!SCpnt->use_sg); @@ -2551,9 +2552,10 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask) sgp = scsi_sg_pools + SCpnt->sglist_len; + pf_flags = current->flags; current->flags |= PF_NOWARN; sgl = mempool_alloc(sgp->pool, gfp_mask); - current->flags &= ~PF_NOWARN; + current->flags = pf_flags; if (sgl) { memset(sgl, 0, sgp->size); return sgl; diff --git a/fs/bio.c b/fs/bio.c index 95f402e38670406b2fd6f6a53344ef94cb03b045..d2fa052eacc2c4ddca6af9bb1e997b38862de4ef 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -135,6 +135,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs) { struct bio *bio; struct bio_vec *bvl = NULL; + int pf_flags = current->flags; current->flags |= PF_NOWARN; bio = mempool_alloc(bio_pool, gfp_mask); @@ -151,7 +152,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs) mempool_free(bio, bio_pool); bio = NULL; out: - current->flags &= ~PF_NOWARN; + current->flags = pf_flags; return bio; } diff --git a/fs/buffer.c b/fs/buffer.c index 4fe7c935e4d69d4f7fe19c034b0b573da43fa8a7..cb06b5454e3653890f282fd980811385dd72464a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -937,9 +937,11 @@ create_buffers(struct page * page, unsigned long size, int retry) head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { + int pf_flags = current->flags; + current->flags |= PF_NOWARN; bh = alloc_buffer_head(); - current->flags &= ~PF_NOWARN; + current->flags = pf_flags; if (!bh) goto no_grow; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 78a1b6ace4944b7753d815fd3872af649d40c6b2..99627183120ece972b72ec11f44b13ea84b2725a 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -627,13 +627,13 @@ ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov, } static int -ext2_writepages(struct address_space *mapping, int *nr_to_write) +ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; int err; ret = write_mapping_buffers(mapping); - err = mpage_writepages(mapping, nr_to_write, ext2_get_block); + err = mpage_writepages(mapping, wbc, ext2_get_block); if (!ret) ret = err; return ret; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 99157f9cb6e7cfdcf93d1c93028e770fda145e12..2b672bb2aed46c8f52aeee29acba66316698a712 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1475,13 +1475,13 @@ struct address_space_operations ext3_aops = { /* For writeback mode, we can use mpage_writepages() */ static int -ext3_writepages(struct address_space *mapping, int *nr_to_write) +ext3_writepages(struct 
address_space *mapping, struct writeback_control *wbc) { int ret; int err; ret = write_mapping_buffers(mapping); - err = mpage_writepages(mapping, nr_to_write, ext3_get_block); + err = mpage_writepages(mapping, wbc, ext3_get_block); if (!ret) ret = err; return ret; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 25e0dfad847c679bf1d860628f1a374693b3d965..e306a31f46b5bc0e5b43723c3c534ac73be66f37 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -111,8 +111,7 @@ static void write_inode(struct inode *inode, int sync) /* * Write a single inode's dirty pages and inode data out to disk. * If `sync' is set, wait on the writeout. - * If `nr_to_write' is not NULL, subtract the number of written pages - * from *nr_to_write. + * Subtract the number of written pages from nr_to_write. * * Normally it is not legal for a single process to lock more than one * page at a time, due to ab/ba deadlock problems. But writepages() @@ -127,7 +126,9 @@ static void write_inode(struct inode *inode, int sync) * * Called under inode_lock. */ -static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) +static void +__sync_single_inode(struct inode *inode, int wait, + struct writeback_control *wbc) { unsigned dirty; unsigned long orig_dirtied_when; @@ -144,7 +145,7 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) mapping->dirtied_when = 0; /* assume it's whole-file writeback */ spin_unlock(&inode_lock); - do_writepages(mapping, nr_to_write); + do_writepages(mapping, wbc); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) @@ -181,7 +182,8 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write) * Write out an inode's dirty pages. Called under inode_lock. */ static void -__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) +__writeback_single_inode(struct inode *inode, int sync, + struct writeback_control *wbc) { if (current_is_pdflush() && (inode->i_state & I_LOCK)) return; @@ -193,7 +195,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) iput(inode); spin_lock(&inode_lock); } - __sync_single_inode(inode, sync, nr_to_write); + __sync_single_inode(inode, sync, wbc); } /* @@ -226,8 +228,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write) * thrlttled threads: we don't want them all piling up on __wait_on_inode. 
*/ static void -sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, - int sync_mode, int *nr_to_write, unsigned long *older_than_this) +sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { struct list_head *tmp; struct list_head *head; @@ -241,7 +242,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, struct backing_dev_info *bdi; int really_sync; - if (single_bdi && mapping->backing_dev_info != single_bdi) { + if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) { if (sb != blockdev_superblock) break; /* inappropriate superblock */ list_move(&inode->i_list, &sb->s_dirty); @@ -252,23 +253,20 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, if (time_after(mapping->dirtied_when, start)) break; - if (older_than_this && - time_after(mapping->dirtied_when, *older_than_this)) + if (wbc->older_than_this && time_after(mapping->dirtied_when, + *wbc->older_than_this)) goto out; bdi = mapping->backing_dev_info; if (current_is_pdflush() && !writeback_acquire(bdi)) break; - really_sync = (sync_mode == WB_SYNC_ALL); - if ((sync_mode == WB_SYNC_LAST) && (head->prev == head)) - really_sync = 1; - + really_sync = (wbc->sync_mode == WB_SYNC_ALL); BUG_ON(inode->i_state & I_FREEING); __iget(inode); list_move(&inode->i_list, &sb->s_dirty); - __writeback_single_inode(inode, really_sync, nr_to_write); - if (sync_mode == WB_SYNC_HOLD) { + __writeback_single_inode(inode, really_sync, wbc); + if (wbc->sync_mode == WB_SYNC_HOLD) { mapping->dirtied_when = jiffies; list_move(&inode->i_list, &sb->s_dirty); } @@ -277,7 +275,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, spin_unlock(&inode_lock); iput(inode); spin_lock(&inode_lock); - if (nr_to_write && *nr_to_write <= 0) + if (wbc->nr_to_write <= 0) break; } out: @@ -288,16 +286,26 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb, } /* + * Start writeback of dirty pagecache data against all unlocked inodes. + * + * Note: + * We don't need to grab a reference to superblock here. If it has non-empty + * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed + * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are + * empty. Since __sync_single_inode() regains inode_lock before it finally moves + * inode from superblock lists we are OK. + * + * If `older_than_this' is non-zero then only flush inodes which have a + * flushtime older than *older_than_this. + * * If `bdi' is non-zero then we will scan the first inode against each * superblock until we find the matching ones. One group will be the dirty * inodes against a filesystem. Then when we hit the dummy blockdev superblock, * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not * super-efficient but we're about to do a ton of I/O... 
*/ -static void -__writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write, - enum writeback_sync_modes sync_mode, - unsigned long *older_than_this) +void +writeback_inodes(struct writeback_control *wbc) { struct super_block *sb; @@ -307,54 +315,16 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write, for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) { if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) { spin_unlock(&sb_lock); - sync_sb_inodes(bdi, sb, sync_mode, nr_to_write, - older_than_this); + sync_sb_inodes(sb, wbc); spin_lock(&sb_lock); } - if (nr_to_write && *nr_to_write <= 0) + if (wbc->nr_to_write <= 0) break; } spin_unlock(&sb_lock); spin_unlock(&inode_lock); } -/* - * Start writeback of dirty pagecache data against all unlocked inodes. - * - * Note: - * We don't need to grab a reference to superblock here. If it has non-empty - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed - * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are - * empty. Since __sync_single_inode() regains inode_lock before it finally moves - * inode from superblock lists we are OK. - * - * If `older_than_this' is non-zero then only flush inodes which have a - * flushtime older than *older_than_this. - * - * This is a "memory cleansing" operation, not a "data integrity" operation. - */ -void writeback_unlocked_inodes(int *nr_to_write, - enum writeback_sync_modes sync_mode, - unsigned long *older_than_this) -{ - __writeback_unlocked_inodes(NULL, nr_to_write, - sync_mode, older_than_this); -} -/* - * Perform writeback of dirty data against a particular queue. - * - * This is for writer throttling. We don't want processes to write back - * other process's data, espsecially when the other data belongs to a - * different spindle. - */ -void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write, - enum writeback_sync_modes sync_mode, - unsigned long *older_than_this) -{ - __writeback_unlocked_inodes(bdi, nr_to_write, - sync_mode, older_than_this); -} - /* * writeback and wait upon the filesystem's dirty inodes. The caller will * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is @@ -366,14 +336,17 @@ void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write, void sync_inodes_sb(struct super_block *sb, int wait) { struct page_state ps; - int nr_to_write; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .older_than_this = NULL, + .nr_to_write = 0, + }; get_page_state(&ps); - nr_to_write = ps.nr_dirty + ps.nr_dirty / 4; - + wbc.nr_to_write = ps.nr_dirty + ps.nr_dirty / 4; spin_lock(&inode_lock); - sync_sb_inodes(NULL, sb, wait ? 
WB_SYNC_ALL : WB_SYNC_HOLD, - &nr_to_write, NULL); + sync_sb_inodes(sb, &wbc); spin_unlock(&inode_lock); } @@ -466,8 +439,12 @@ void sync_inodes(int wait) void write_inode_now(struct inode *inode, int sync) { + struct writeback_control wbc = { + .nr_to_write = LONG_MAX, + }; + spin_lock(&inode_lock); - __writeback_single_inode(inode, sync, NULL); + __writeback_single_inode(inode, sync, &wbc); spin_unlock(&inode_lock); if (sync) wait_on_inode(inode); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 65d1dff1f80dc4c200265e8cfd84a64be64a47a0..91ab1b3f723fa649bfb6b73e8aff8b8a618a2ff3 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -282,9 +282,10 @@ static int jfs_writepage(struct page *page) return block_write_full_page(page, jfs_get_block); } -static int jfs_writepages(struct address_space *mapping, int *nr_to_write) +static int jfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return mpage_writepages(mapping, nr_to_write, jfs_get_block); + return mpage_writepages(mapping, wbc, jfs_get_block); } static int jfs_readpage(struct file *file, struct page *page) diff --git a/fs/mpage.c b/fs/mpage.c index 363085535ddf5cf9a62d50656de4d5067ccb77df..a200d8f68fb8126362fd34af9d0e6fb711aac057 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -484,7 +484,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, * address space and writepage() all of them. * * @mapping: address space structure to write - * @nr_to_write: subtract the number of written pages from *@nr_to_write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. * If this is NULL then use a_ops->writepage. Otherwise, go * direct-to-BIO. @@ -520,7 +520,7 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, */ int mpage_writepages(struct address_space *mapping, - int *nr_to_write, get_block_t get_block) + struct writeback_control *wbc, get_block_t get_block) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -583,7 +583,7 @@ mpage_writepages(struct address_space *mapping, __set_page_dirty_nobuffers(page); ret = 0; } - if (ret || (nr_to_write && --(*nr_to_write) <= 0)) + if (ret || (--(wbc->nr_to_write) <= 0)) done = 1; } else { unlock_page(page); diff --git a/fs/proc/array.c b/fs/proc/array.c index feb2cbab4699df701a06f1de8487a9b986d02b95..c1587b0cc89bf5c04399b28bc1bab6008285bf1b 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -394,131 +394,40 @@ int proc_pid_stat(struct task_struct *task, char * buffer) return res; } -static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total) +int proc_pid_statm(task_t *task, char *buffer) { - unsigned long end, pmd_end; - pte_t *pte; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - preempt_disable(); - pte = pte_offset_map(pmd, address); - end = address + size; - pmd_end = (address + PMD_SIZE) & PMD_MASK; - if (end > pmd_end) - end = pmd_end; - do { - pte_t page = *pte; - struct page *ptpage; - unsigned long pfn; + int size, resident, shared, text, lib, data, dirty; + struct mm_struct *mm = get_task_mm(task); + struct vm_area_struct * vma; - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - ++*total; - if (!pte_present(page)) - continue; - pfn = pte_pfn(page); - if (!pfn_valid(pfn)) - continue; - ptpage = pfn_to_page(pfn); - if (PageReserved(ptpage)) - continue; - ++*pages; - if 
(pte_dirty(page)) - ++*dirty; - if (page_count(pte_page(page)) > 1) - ++*shared; - } while (address < end); - pte_unmap(pte - 1); - preempt_enable(); -} + size = resident = shared = text = lib = data = dirty = 0; -static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size, - int * pages, int * shared, int * dirty, int * total) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*pgd)) - return; - if (pgd_bad(*pgd)) { - pgd_ERROR(*pgd); - pgd_clear(pgd); - return; - } - pmd = pmd_offset(pgd, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - statm_pte_range(pmd, address, end - address, pages, shared, dirty, total); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); -} - -static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end, - int * pages, int * shared, int * dirty, int * total) -{ - while (address < end) { - statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - pgd++; - } -} + if (!mm) + goto out; -int proc_pid_statm(struct task_struct *task, char * buffer) -{ - int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0; - struct mm_struct *mm = get_task_mm(task); + down_read(&mm->mmap_sem); + resident = mm->rss; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - if (mm) { - struct vm_area_struct * vma; - down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - pgd_t *pgd = pgd_offset(mm, vma->vm_start); - int pages = 0, shared = 0, dirty = 0, total = 0; - if (is_vm_hugetlb_page(vma)) { - int num_pages = ((vma->vm_end - vma->vm_start)/PAGE_SIZE); - - resident += num_pages; - if (!(vma->vm_flags & VM_DONTCOPY)) - share += num_pages; - if (vma->vm_flags & VM_WRITE) - dt += num_pages; - drs += num_pages; - vma = vma->vm_next; - continue; - } - statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total); - resident += pages; - share += shared; - dt += dirty; - size += total; - if (vma->vm_flags & VM_EXECUTABLE) - trs += pages; /* text */ - else if (vma->vm_flags & VM_GROWSDOWN) - drs += pages; /* stack */ - else if (vma->vm_end > 0x60000000) - lrs += pages; /* library */ - else - drs += pages; - vma = vma->vm_next; + size += pages; + if (is_vm_hugetlb_page(vma)) { + if (!(vma->vm_flags & VM_DONTCOPY)) + shared += pages; + continue; } - up_read(&mm->mmap_sem); - mmput(mm); + if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) + shared += pages; + if (vma->vm_flags & VM_EXECUTABLE) + text += pages; + else + data += pages; } + up_read(&mm->mmap_sem); + mmput(mm); +out: return sprintf(buffer,"%d %d %d %d %d %d %d\n", - size, resident, share, trs, lrs, drs, dt); + size, resident, shared, text, lib, data, dirty); } /* diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index 572569df5dd40b9a9248e7173ac9d996e0304970..4059862d4b3dc96a66ee9eee1fb2fa3602862c2d 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -36,18 +36,14 @@ extern plat_pg_data_t *plat_node_data[]; #ifdef CONFIG_ALPHA_WILDFIRE # define ALPHA_PA_TO_NID(pa) ((pa) >> 36) /* 16 nodes max due 43bit kseg */ -#define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */ -#define MAX_NUMNODES WILDFIRE_MAX_QBB +# define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */ #else # define ALPHA_PA_TO_NID(pa) (0) -#define NODE_MAX_MEM_SIZE (~0UL) -#define MAX_NUMNODES 1 +# 
define NODE_MAX_MEM_SIZE (~0UL) #endif #define PHYSADDR_TO_NID(pa) ALPHA_PA_TO_NID(pa) #define PLAT_NODE_DATA(n) (plat_node_data[(n)]) -#define PLAT_NODE_DATA_STARTNR(n) \ - (PLAT_NODE_DATA(n)->gendata.node_start_mapnr) #define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size) #if 1 diff --git a/include/asm-alpha/numnodes.h b/include/asm-alpha/numnodes.h new file mode 100644 index 0000000000000000000000000000000000000000..4ff6b3ecfbed9200999a74292985906b4fb38cc0 --- /dev/null +++ b/include/asm-alpha/numnodes.h @@ -0,0 +1,12 @@ +#ifndef _ASM_MAX_NUMNODES_H +#define _ASM_MAX_NUMNODES_H + +/* + * Currently the Wildfire is the only discontigmem/NUMA capable Alpha core. + */ +#if defined(CONFIG_ALPHA_WILDFIRE) || defined(CONFIG_ALPHA_GENERIC) +# include <asm/core_wildfire.h> +# define MAX_NUMNODES WILDFIRE_MAX_QBB +#endif + +#endif /* _ASM_MAX_NUMNODES_H */ diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index d2994f116f03f8479dc171c65d99700438a50e49..00a5d7ffbed9efaf3a67da118670241e4d50e33c 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -6,12 +6,13 @@ #ifndef _ASM_MMZONE_H_ #define _ASM_MMZONE_H_ +#include <asm/smp.h> + #ifdef CONFIG_DISCONTIGMEM #ifdef CONFIG_X86_NUMAQ #include <asm/numaq.h> #else -#define pa_to_nid(pa) (0) #define pfn_to_nid(pfn) (0) #ifdef CONFIG_NUMA #define _cpu_to_node(cpu) 0 @@ -44,7 +45,6 @@ extern struct pglist_data *node_data[]; #define alloc_bootmem_low_pages_node(ignore, x) \ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define node_startnr(nid) (node_data[nid]->node_start_mapnr) #define node_size(nid) (node_data[nid]->node_size) #define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) @@ -55,7 +55,7 @@ extern struct pglist_data *node_data[]; /* * Given a kernel address, find the home node of the underlying memory. */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) /* * Return a pointer to the node data for node n. @@ -64,6 +64,8 @@ extern struct pglist_data *node_data[]; #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ + NODE_DATA(nid)->node_size) #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) @@ -74,5 +76,13 @@ extern struct pglist_data *node_data[]; #define pfn_to_page(pfn) (node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn))) #define page_to_pfn(page) ((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +/* + * pfn_valid should be made as fast as possible, and the current definition + * is valid for machines that are NUMA, but still contiguous, which is what + * is currently supported. 
A more generalised, but slower definition would + * be something like this - mbligh: + * ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) ) + */ +#define pfn_valid(pfn) (pfn < num_physpages) #endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_MMZONE_H_ */ diff --git a/include/asm-i386/numaq.h b/include/asm-i386/numaq.h index ed10442f1dccb75ba1ccc58631bc19aaa3391620..b32b28c12c7398f8252dc95b4a2ba1a8d2f51c0d 100644 --- a/include/asm-i386/numaq.h +++ b/include/asm-i386/numaq.h @@ -32,17 +32,18 @@ /* * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb * 1024Mb/Gb = 65536 Mb - * 65536 Mb / 256Mb = 256 + * 64Gb / 4096bytes/page = 16777216 pages */ +#define MAX_NR_PAGES 16777216 #define MAX_ELEMENTS 256 -#define ELEMENT_REPRESENTS 8 /* 256 Mb */ +#define PAGES_PER_ELEMENT (16777216/256) +#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) +#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT) #define MAX_NUMNODES 8 #ifdef CONFIG_NUMA #define _cpu_to_node(cpu) (cpu_to_logical_apicid(cpu) >> 4) #endif /* CONFIG_NUMA */ -extern int pa_to_nid(u64); extern int pfn_to_nid(unsigned long); extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() diff --git a/include/asm-i386/max_numnodes.h b/include/asm-i386/numnodes.h similarity index 100% rename from include/asm-i386/max_numnodes.h rename to include/asm-i386/numnodes.h diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 5a09fd4b72f1167ce4e57f78f79f176af9006606..f9fe284b9057d496c66fb47f4db839cee095141c 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -145,10 +145,10 @@ static __inline__ int get_order(unsigned long size) #ifndef CONFIG_DISCONTIGMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #endif /* !CONFIG_DISCONTIGMEM */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ diff --git a/include/asm-mips64/mmzone.h b/include/asm-mips64/mmzone.h index 5e643b114269c26b35cf328e2cc19879d022afe6..d60ad12acd75a85d5681096fc31270fa31f5e00a 100644 --- a/include/asm-mips64/mmzone.h +++ b/include/asm-mips64/mmzone.h @@ -24,7 +24,6 @@ extern plat_pg_data_t *plat_node_data[]; #define PHYSADDR_TO_NID(pa) NASID_TO_COMPACT_NODEID(NASID_GET(pa)) #define PLAT_NODE_DATA(n) (plat_node_data[n]) -#define PLAT_NODE_DATA_STARTNR(n) (PLAT_NODE_DATA(n)->gendata.node_start_mapnr) #define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size) #define PLAT_NODE_DATA_LOCALNR(p, n) \ (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) diff --git a/include/asm-mips64/pgtable.h b/include/asm-mips64/pgtable.h index ded7d0a0a98683cb4bef0d0ee6d7e67fbaf27e9b..b32768e57d16a3882392cdf379d416712f7931ee 100644 --- a/include/asm-mips64/pgtable.h +++ b/include/asm-mips64/pgtable.h @@ -373,10 +373,10 @@ extern inline void pgd_clear(pgd_t *pgdp) #ifndef CONFIG_DISCONTIGMEM #define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT))) #else -#define mips64_pte_pagenr(x) \ - (PLAT_NODE_DATA_STARTNR(PHYSADDR_TO_NID(pte_val(x))) + \ - PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x)))) -#define pte_page(x) (mem_map+mips64_pte_pagenr(x)) + +#define pte_page(x) ( NODE_MEM_MAP(PHYSADDR_TO_NID(pte_val(x))) + + PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))) ) 
+ #endif /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 804ea47301f52e42fa157eb522311e0e1c01fe55..56f2bab87d7fade6afc4180200c68da01a3a461d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -279,6 +279,7 @@ struct iattr { */ struct page; struct address_space; +struct writeback_control; struct address_space_operations { int (*writepage)(struct page *); @@ -286,10 +287,10 @@ struct address_space_operations { int (*sync_page)(struct page *); /* Write back some dirty pages from this mapping. */ - int (*writepages)(struct address_space *, int *nr_to_write); + int (*writepages)(struct address_space *, struct writeback_control *); /* Perform a writeback as a memory-freeing operation. */ - int (*vm_writeback)(struct page *, int *nr_to_write); + int (*vm_writeback)(struct page *, struct writeback_control *); /* Set a page dirty */ int (*set_page_dirty)(struct page *page); @@ -1259,7 +1260,8 @@ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); extern int generic_file_open(struct inode * inode, struct file * filp); -extern int generic_vm_writeback(struct page *page, int *nr_to_write); +extern int generic_vm_writeback(struct page *page, + struct writeback_control *wbc); extern struct file_operations generic_ro_fops; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 10021357c0934bc35aa554679717c11daf5c7294..437572e2240b3109bb9b319502c6b208c6dd4eb4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,18 +39,25 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ -extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order)); extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, struct zonelist *zonelist)); extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); +/* + * We get the zone list from the current node and the gfp_mask. + * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. + * + * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets + * optimized to &contig_page_data at compile-time. + */ static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) { - /* - * Gets optimized away by the compiler. 
- */ - if (order >= MAX_ORDER) + pg_data_t *pgdat = NODE_DATA(numa_node_id()); + unsigned int idx = (gfp_mask & GFP_ZONEMASK); + + if (unlikely(order >= MAX_ORDER)) return NULL; - return _alloc_pages(gfp_mask, order); + + return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + idx); } #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7483c39e28dd4c69ac59f3eba0c1874f77069985..c63e4947387f6aac208bbfa31f43e5f684ddc8bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,7 +15,10 @@ #include <linux/rbtree.h> #include <linux/fs.h> +#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; +#endif + extern unsigned long num_physpages; extern void * high_memory; extern int page_cluster; @@ -345,8 +348,10 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_MINOR 1 #define VM_FAULT_MAJOR 2 -/* The array of struct pages */ +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ extern struct page *mem_map; +#endif extern void show_free_areas(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8ebf441bdb475fa4415619b91f4d1192bd0a6291..580c39c4dcc1cad7e81764bb87d4ab15181398b7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -10,11 +10,14 @@ #include <linux/wait.h> #include <linux/cache.h> #include <asm/atomic.h> +#ifdef CONFIG_DISCONTIGMEM +#include <asm/numnodes.h> +#endif +#ifndef MAX_NUMNODES +#define MAX_NUMNODES 1 +#endif -/* - * Free memory management - zoned buddy allocator. - */ - +/* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER #define MAX_ORDER 11 #else @@ -112,7 +115,6 @@ struct zone { struct page *zone_mem_map; /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; - unsigned long zone_start_mapnr; /* * rarely used fields: @@ -138,7 +140,7 @@ struct zone { * footprint of this construct is very small. */ struct zonelist { - struct zone *zones[MAX_NR_ZONES+1]; // NULL delimited + struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited }; #define GFP_ZONEMASK 0x0f @@ -163,7 +165,6 @@ typedef struct pglist_data { unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; unsigned long node_start_pfn; - unsigned long node_start_mapnr; unsigned long node_size; int node_id; struct pglist_data *pgdat_next; @@ -187,10 +188,12 @@ memclass(struct zone *pgzone, struct zone *classzone) * prototypes for the discontig memory code. */ struct page; -void free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, - unsigned long *zones_size, unsigned long paddr, unsigned long *zholes_size, - struct page *pmap); +extern void calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size, + unsigned long *zholes_size); +extern void free_area_init_core(pg_data_t *pgdat, unsigned long *zones_size, + unsigned long *zholes_size); void get_zone_counts(unsigned long *active, unsigned long *inactive); +extern void build_all_zonelists(void); extern pg_data_t contig_page_data; diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 52253d90f55dc1a3bae04b3d954d08d38eac7261..86aa7b6762747c1baaaa8de76f01a633be57e4c4 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -10,14 +10,16 @@ * nested includes. Get it right in the .c file). 
*/ +struct writeback_control; + int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, - int *nr_to_write, get_block_t get_block); + struct writeback_control *wbc, get_block_t get_block); static inline int -generic_writepages(struct address_space *mapping, int *nr_to_write) +generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, nr_to_write, NULL); + return mpage_writepages(mapping, wbc, NULL); } diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3127165e7c1339be5dd7ea6ad9dcdd47882d7f7f..9fd7d5c056052598c040bc7c2765944ead827908 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -128,7 +128,6 @@ enum KERN_TAINTED=53, /* int: various kernel tainted flags */ KERN_CADPID=54, /* int: PID of the process to notify on CAD */ KERN_PIDMAX=55, /* int: PID # limit */ - KERN_HUGETLB_PAGE_NUM=56, /* int: Number of available Huge Pages */ }; @@ -147,12 +146,12 @@ enum VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */ VM_DIRTY_ASYNC=12, /* dirty_async_ratio */ - VM_DIRTY_SYNC=13, /* dirty_sync_ratio */ - VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */ - VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */ - VM_NR_PDFLUSH_THREADS=16, /* nr_pdflush_threads */ - VM_OVERCOMMIT_RATIO=17, /* percent of RAM to allow overcommit in */ - VM_PAGEBUF=18 /* struct: Control pagebuf parameters */ + VM_DIRTY_WB_CS=13, /* dirty_writeback_centisecs */ + VM_DIRTY_EXPIRE_CS=14, /* dirty_expire_centisecs */ + VM_NR_PDFLUSH_THREADS=15, /* nr_pdflush_threads */ + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ }; diff --git a/include/linux/uio.h b/include/linux/uio.h index ec098c8e67931389b8bc994384d95bb0658499dd..85b2f0ec9d3f0ba65fbed81865a2832f1bedcd56 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -35,7 +35,11 @@ struct iovec #endif /* - * Total number of bytes covered by an iovec + * Total number of bytes covered by an iovec. + * + * NOTE that it is not safe to use this function until all the iovec's + * segment lengths have been validated. Because the individual lengths can + * overflow a size_t when added together. 
*/ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5de884cd6a7cf1a1a9ed7db6176013db245b1a29..c35b96eb6a90fc7320695db4c9fd8690e7f54986 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -27,22 +27,29 @@ static inline int current_is_pdflush(void) * fs/fs-writeback.c */ enum writeback_sync_modes { - WB_SYNC_NONE = 0, /* Don't wait on anything */ - WB_SYNC_LAST = 1, /* Wait on the last-written mapping */ - WB_SYNC_ALL = 2, /* Wait on every mapping */ - WB_SYNC_HOLD = 3, /* Hold the inode on sb_dirty for sys_sync() */ + WB_SYNC_NONE, /* Don't wait on anything */ + WB_SYNC_ALL, /* Wait on every mapping */ + WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */ }; -void writeback_unlocked_inodes(int *nr_to_write, - enum writeback_sync_modes sync_mode, - unsigned long *older_than_this); +/* + * A control structure which tells the writeback code what to do + */ +struct writeback_control { + struct backing_dev_info *bdi; /* If !NULL, only write back this + queue */ + enum writeback_sync_modes sync_mode; + unsigned long *older_than_this; /* If !NULL, only write back inodes + older than this */ + long nr_to_write; /* Write this many pages, and decrement + this for each page written */ +}; + +void writeback_inodes(struct writeback_control *wbc); void wake_up_inode(struct inode *inode); void __wait_on_inode(struct inode * inode); void sync_inodes_sb(struct super_block *, int wait); void sync_inodes(int wait); -void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write, - enum writeback_sync_modes sync_mode, - unsigned long *older_than_this); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) @@ -57,7 +64,6 @@ static inline void wait_on_inode(struct inode *inode) /* These 5 are exported to sysctl. 
*/ extern int dirty_background_ratio; extern int dirty_async_ratio; -extern int dirty_sync_ratio; extern int dirty_writeback_centisecs; extern int dirty_expire_centisecs; @@ -65,7 +71,7 @@ extern int dirty_expire_centisecs; void balance_dirty_pages(struct address_space *mapping); void balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); -int do_writepages(struct address_space *mapping, int *nr_to_write); +int do_writepages(struct address_space *mapping, struct writeback_control *wbc); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl diff --git a/init/main.c b/init/main.c index 9c38da7a9bd04604811d26409a5f44bbd6034ce4..b47b623aa6a014f7f82cf64c912c6059fb219e05 100644 --- a/init/main.c +++ b/init/main.c @@ -393,6 +393,7 @@ asmlinkage void __init start_kernel(void) printk(linux_banner); setup_arch(&command_line); setup_per_cpu_areas(); + build_all_zonelists(); printk("Kernel command line: %s\n", saved_command_line); parse_options(command_line); trap_init(); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 4dc1840a8b70ebaa986583683dc9c61b1d6aa0fe..4931e909724fb00e6d076d709207d8a585a8d89c 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -91,7 +91,6 @@ EXPORT_SYMBOL(do_brk); EXPORT_SYMBOL(exit_mm); /* internal kernel memory management */ -EXPORT_SYMBOL(_alloc_pages); EXPORT_SYMBOL(__alloc_pages); EXPORT_SYMBOL(alloc_pages_node); EXPORT_SYMBOL(__get_free_pages); @@ -116,9 +115,12 @@ EXPORT_SYMBOL(vmalloc_32); EXPORT_SYMBOL(vmap); EXPORT_SYMBOL(vunmap); EXPORT_SYMBOL(vmalloc_to_page); -EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(remap_page_range); +#ifndef CONFIG_DISCONTIGMEM +EXPORT_SYMBOL(contig_page_data); +EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(max_mapnr); +#endif EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmtruncate); EXPORT_SYMBOL(find_vma); diff --git a/kernel/printk.c b/kernel/printk.c index ca1cd3fea625570be71607a7338a0ea712a46149..a3d23302ae5bd04390f3d5688ca8dc6eaa96fd69 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -525,11 +525,11 @@ void release_console_sem(void) { unsigned long flags; unsigned long _con_start, _log_end; - unsigned long must_wake_klogd = 0; + unsigned long wake_klogd = 0; for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); - must_wake_klogd |= log_start - log_end; + wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; @@ -541,7 +541,7 @@ void release_console_sem(void) console_may_schedule = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (must_wake_klogd && !oops_in_progress) + if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) wake_up_interruptible(&log_wait); } diff --git a/kernel/suspend.c b/kernel/suspend.c index 2d7eeaabe1271b2e7b7c90453bbcb16184c023f0..419490900ff606bf218075c7fbfd683b3bf51641 100644 --- a/kernel/suspend.c +++ b/kernel/suspend.c @@ -471,10 +471,12 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p) int nr_copy_pages = 0; int pfn; struct page *page; - + +#ifndef CONFIG_DISCONTIGMEM if (max_mapnr != num_physpages) panic("mapnr is not expected"); - for (pfn = 0; pfn < max_mapnr; pfn++) { +#endif + for (pfn = 0; pfn < num_physpages; pfn++) { page = pfn_to_page(pfn); if (PageHighMem(page)) panic("Swsusp not supported on highmem boxes. 
Send 1GB of RAM to <pavel@ucw.cz> and try again ;-)."); @@ -514,19 +516,20 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p) static void free_suspend_pagedir(unsigned long this_pagedir) { - struct page *page = mem_map; - int i; + struct page *page; + int pfn; unsigned long this_pagedir_end = this_pagedir + (PAGE_SIZE << pagedir_order); - for(i=0; i < num_physpages; i++, page++) { + for(pfn = 0; pfn < num_physpages; pfn++) { + page = pfn_to_page(pfn); if (!TestClearPageNosave(page)) continue; - if (ADDRESS(i) >= this_pagedir && ADDRESS(i) < this_pagedir_end) + if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end) continue; /* old pagedir gets freed in one */ - free_page(ADDRESS(i)); + free_page(ADDRESS(pfn)); } free_pages(this_pagedir, pagedir_order); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6f92068e3f2960fb354ce45c832eb82b6005ded5..1a63d254ab80c15a32a1036801ae92a8fbee28a1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -99,8 +99,8 @@ extern int acct_parm[]; #endif #ifdef CONFIG_HUGETLB_PAGE -extern int htlbpage_max; -extern int set_hugetlb_mem_size(int); +extern int htlbpage_max; +extern int set_hugetlb_mem_size(int); #endif static int parse_table(int *, int, void *, size_t *, void *, size_t, @@ -263,10 +263,6 @@ static ctl_table kern_table[] = { #endif {KERN_PIDMAX, "pid_max", &pid_max, sizeof (int), 0600, NULL, &proc_dointvec}, -#ifdef CONFIG_HUGETLB_PAGE - {KERN_HUGETLB_PAGE_NUM, "numhugepages", &htlbpage_max, sizeof(int), 0644, NULL, - &proc_dointvec}, -#endif {0} }; @@ -292,9 +288,6 @@ static ctl_table vm_table[] = { {VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio, sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero, &one_hundred }, - {VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio, - sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, &zero, &one_hundred }, {VM_DIRTY_WB_CS, "dirty_writeback_centisecs", &dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, @@ -317,6 +310,10 @@ static ctl_table vm_table[] = { { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads", &nr_pdflush_threads, sizeof nr_pdflush_threads, 0444 /* read-only*/, NULL, &proc_dointvec}, +#ifdef CONFIG_HUGETLB_PAGE + {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif {0} }; diff --git a/mm/filemap.c b/mm/filemap.c index 483699da95d06fb9c4e2585721dab124bbb518fb..3aa685fdcf25accb4fd64cb50672e5a2dbc6307f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -487,9 +487,13 @@ EXPORT_SYMBOL(fail_writepage); int filemap_fdatawrite(struct address_space *mapping) { int ret; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = mapping->nrpages * 2, + }; current->flags |= PF_SYNC; - ret = do_writepages(mapping, NULL); + ret = do_writepages(mapping, &wbc); current->flags &= ~PF_SYNC; return ret; } @@ -1130,10 +1134,26 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg; - size_t count = iov_length(iov, nr_segs); + size_t count; - if ((ssize_t) count < 0) - return -EINVAL; + count = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
+ */ + count += iv->iov_len; + if (unlikely((ssize_t)(count|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + break; + } /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { @@ -1162,11 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, goto out; } - for (seg = 0; seg < nr_segs; seg++) { - if (!access_ok(VERIFY_WRITE,iov[seg].iov_base,iov[seg].iov_len)) - return -EFAULT; - } - retval = 0; if (count) { for (seg = 0; seg < nr_segs; seg++) { @@ -1626,6 +1641,63 @@ filemap_copy_from_user(struct page *page, unsigned long offset, return left; } +static inline int +__filemap_copy_from_user_iovec(char *vaddr, + const struct iovec *iov, size_t base, unsigned bytes) +{ + int left = 0; + + while (bytes) { + char *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + base = 0; + if ((left = __copy_from_user(vaddr, buf, copy))) + break; + bytes -= copy; + vaddr += copy; + iov++; + } + return left; +} + +static inline int +filemap_copy_from_user_iovec(struct page *page, unsigned long offset, + const struct iovec *iov, size_t base, unsigned bytes) +{ + char *kaddr; + int left; + + kaddr = kmap_atomic(page, KM_USER0); + left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes); + kunmap_atomic(kaddr, KM_USER0); + if (left != 0) { + kaddr = kmap(page); + left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes); + kunmap(page); + } + return left; +} + +static inline void +filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, unsigned bytes) +{ + const struct iovec *iov = *iovp; + size_t base = *basep; + + while (bytes) { + int copy = min(bytes, iov->iov_len - base); + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + *iovp = iov; + *basep = base; +} + + /* * Write to a file through the page cache. * @@ -1641,8 +1713,8 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, { struct address_space * mapping = file->f_dentry->d_inode->i_mapping; struct address_space_operations *a_ops = mapping->a_ops; - const size_t ocount = iov_length(iov, nr_segs); - size_t count = ocount; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ struct inode *inode = mapping->host; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; long status = 0; @@ -1654,19 +1726,30 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, unsigned bytes; time_t time_now; struct pagevec lru_pvec; - struct iovec *cur_iov; - unsigned iov_bytes; /* Cumulative count to the end of the - current iovec */ + const struct iovec *cur_iov = iov; /* current iovec */ + unsigned iov_base = 0; /* offset in the current iovec */ unsigned long seg; char *buf; - if (unlikely((ssize_t)count < 0)) - return -EINVAL; - + ocount = 0; for (seg = 0; seg < nr_segs; seg++) { - if (!access_ok(VERIFY_READ,iov[seg].iov_base,iov[seg].iov_len)) + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
+ */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) return -EFAULT; + nr_segs = seg; + break; } + count = ocount; pos = *ppos; if (unlikely(pos < 0)) @@ -1788,9 +1871,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, goto out_status; } - cur_iov = (struct iovec *)iov; - iov_bytes = cur_iov->iov_len; - buf = cur_iov->iov_base; + buf = iov->iov_base; do { unsigned long index; unsigned long offset; @@ -1801,8 +1882,6 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; - if (bytes + written > iov_bytes) - bytes = iov_bytes - written; /* * Bring in the user page that we will copy from _first_. @@ -1830,7 +1909,12 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, vmtruncate(inode, inode->i_size); break; } - page_fault = filemap_copy_from_user(page, offset, buf, bytes); + if (likely(nr_segs == 1)) + page_fault = filemap_copy_from_user(page, offset, + buf, bytes); + else + page_fault = filemap_copy_from_user_iovec(page, offset, + cur_iov, iov_base, bytes); flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); if (unlikely(page_fault)) { @@ -1844,11 +1928,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, count -= status; pos += status; buf += status; - if (written == iov_bytes && count) { - cur_iov++; - iov_bytes += cur_iov->iov_len; - buf = cur_iov->iov_base; - } + if (unlikely(nr_segs > 1)) + filemap_set_next_iovec(&cur_iov, + &iov_base, status); } } if (!PageReferenced(page)) diff --git a/mm/memory.c b/mm/memory.c index c886e849231bf2f0e847b0c665525e5bb92e4d44..e58e9dee7bfcbc22e8db610657ad5c9f6e2370ce 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -40,7 +40,6 @@ #include <linux/mm.h> #include <linux/mman.h> #include <linux/swap.h> -#include <linux/smp_lock.h> #include <linux/iobuf.h> #include <linux/highmem.h> #include <linux/pagemap.h> @@ -53,7 +52,12 @@ #include <linux/swapops.h> +#ifndef CONFIG_DISCONTIGMEM +/* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; +struct page *mem_map; +#endif + unsigned long num_physpages; void * high_memory; struct page *highmem_start_page; @@ -72,8 +76,6 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned copy_user_highpage(to, from, address); } -struct page *mem_map; - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
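The iovec handling added to mm/filemap.c above folds segment validation into a single pass: the total length is accumulated while walking the segments, any negative length or wrapped total returns -EINVAL, and an access_ok() failure on a later segment trims nr_segs so the transfer is shortened rather than rejected outright. The sign test works because OR-ing the running total with the current segment length leaves the sign bit set if either value is negative when viewed as a ssize_t. A minimal userspace sketch of that overflow check follows; the function name and the fixed iovec array are illustrative only, not part of the patch.

#include <stdio.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/uio.h>

/*
 * Walk an iovec array and return the total length, or -EINVAL if any
 * segment length has its sign bit set or the running total wraps
 * negative, mirroring the check added to __generic_file_aio_read()
 * and generic_file_write_nolock().
 */
static ssize_t checked_iov_length(const struct iovec *iov, unsigned long nr_segs)
{
	size_t count = 0;
	unsigned long seg;

	for (seg = 0; seg < nr_segs; seg++) {
		count += iov[seg].iov_len;
		/* sign bit set in either operand => reject the whole request */
		if ((ssize_t)(count | iov[seg].iov_len) < 0)
			return -EINVAL;
	}
	return count;
}

int main(void)
{
	char a[8], b[8];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};

	printf("total: %zd\n", checked_iov_length(iov, 2));
	return 0;
}
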
diff --git a/mm/mempool.c b/mm/mempool.c index b92e72b211d3f55bfeb9d5111f6c4f5369ff284b..a201059c12645d00cc65ca014dcbc2e77b0fec84 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -187,11 +187,12 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask) int curr_nr; DECLARE_WAITQUEUE(wait, current); int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + int pf_flags = current->flags; repeat_alloc: current->flags |= PF_NOWARN; element = pool->alloc(gfp_nowait, pool->pool_data); - current->flags &= ~PF_NOWARN; + current->flags = pf_flags; if (likely(element != NULL)) return element; diff --git a/mm/mmap.c b/mm/mmap.c index 7b621c7166bf64bc61fe147f330d7b9586f9e2d6..0038ed6bf5e2238068c7549c7023b22717730329 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -11,7 +11,6 @@ #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/file.h> #include <linux/fs.h> @@ -444,6 +443,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, */ vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (flags & MAP_LOCKED) { + if (!capable(CAP_IPC_LOCK)) + return -EPERM; + vm_flags |= VM_LOCKED; + } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked = mm->locked_vm << PAGE_SHIFT; @@ -1073,7 +1077,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. - * Jeremy Fitzhardine <jeremy@sw.oz.au> + * Jeremy Fitzhardinge <jeremy@goop.org> */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) { diff --git a/mm/mremap.c b/mm/mremap.c index 0d22f3d6c20f0e6e1b88005e62fc56a2d5e8b4a2..6b1d44bd114c37252391c2dce62295e4540fdeb8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,7 +9,6 @@ #include <linux/mm.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/swap.h> diff --git a/mm/numa.c b/mm/numa.c index c293d9ae2df02f3890c2b2bd4abd60f6a7c1cd9c..a36769c95390dd81a9f4207d821d765243fb30bb 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -22,11 +22,21 @@ pg_data_t contig_page_data = { .bdata = &contig_bootmem_data }; * Should be invoked with paramters (0, 0, unsigned long *[], start_paddr). */ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, - unsigned long *zones_size, unsigned long zone_start_pfn, + unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { - free_area_init_core(0, &contig_page_data, &mem_map, zones_size, - zone_start_pfn, zholes_size, pmap); + unsigned long size; + + contig_page_data.node_id = 0; + contig_page_data.node_start_pfn = node_start_pfn; + calculate_totalpages (&contig_page_data, zones_size, zholes_size); + if (pmap == (struct page *)0) { + size = (pgdat->node_size + 1) * sizeof(struct page); + pmap = (struct page *) alloc_bootmem_node(pgdat, size); + } + contig_page_data.node_mem_map = pmap; + free_area_init_core(&contig_page_data, zones_size, zholes_size); + mem_map = contig_page_data.node_mem_map; } #endif /* !CONFIG_DISCONTIGMEM */ @@ -48,22 +58,26 @@ struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int orde * Nodes can be initialized parallely, in no particular order. 
*/ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, - unsigned long *zones_size, unsigned long zone_start_pfn, + unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { - int i, size = 0; - struct page *discard; - - if (mem_map == NULL) - mem_map = (struct page *)PAGE_OFFSET; + int i; + unsigned long size; - free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_pfn, - zholes_size, pmap); pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_totalpages (pgdat, zones_size, zholes_size); + if (pmap == (struct page *)0) { + size = (pgdat->node_size + 1) * sizeof(struct page); + pmap = (struct page *) alloc_bootmem_node(pgdat, size); + } + pgdat->node_mem_map = pmap; + free_area_init_core(pgdat, zones_size, zholes_size); /* * Get space for the valid bitmap. */ + size = 0; for (i = 0; i < MAX_NR_ZONES; i++) size += zones_size[i]; size = LONG_ALIGN((size + 7) >> 3); @@ -71,48 +85,4 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, memset(pgdat->valid_addr_bitmap, 0, size); } -static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask, - unsigned int order) -{ - return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK)); -} - -/* - * This can be refined. Currently, tries to do round robin, instead - * should do concentratic circle search, starting from current node. - */ -struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - struct page *ret = 0; - pg_data_t *start, *temp; -#ifndef CONFIG_NUMA - unsigned long flags; - static pg_data_t *next = 0; -#endif - - if (order >= MAX_ORDER) - return NULL; -#ifdef CONFIG_NUMA - temp = NODE_DATA(numa_node_id()); -#else - if (!next) - next = pgdat_list; - temp = next; - next = next->pgdat_next; -#endif - start = temp; - while (temp) { - if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) - return(ret); - temp = temp->pgdat_next; - } - temp = pgdat_list; - while (temp != start) { - if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) - return(ret); - temp = temp->pgdat_next; - } - return(0); -} - #endif /* CONFIG_DISCONTIGMEM */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fb201b6ca0b27f4e2466f6d03458b9117a7ee53b..a8afd3699509b1f34db00eb15bd738ff5796b7ba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -51,7 +51,7 @@ static long total_pages; * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably * large amounts of I/O are submitted. */ -static inline int sync_writeback_pages(void) +static inline long sync_writeback_pages(void) { return ratelimit_pages + ratelimit_pages / 2; } @@ -72,11 +72,6 @@ int dirty_background_ratio = 10; */ int dirty_async_ratio = 40; -/* - * The generator of dirty data performs sync writeout at this level - */ -int dirty_sync_ratio = 50; - /* * The interval between `kupdate'-style writebacks, in centiseconds * (hundredths of a second) @@ -105,15 +100,11 @@ static void background_writeout(unsigned long _min_pages); * - Does nothing at all. * * balance_dirty_pages() can sleep. - * - * FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty - * inode on the superblock list. It should wait when nr_to_write is - * exhausted. Doesn't seem to matter. 
*/ void balance_dirty_pages(struct address_space *mapping) { struct page_state ps; - long background_thresh, async_thresh, sync_thresh; + long background_thresh, async_thresh; unsigned long dirty_and_writeback; struct backing_dev_info *bdi; @@ -122,18 +113,17 @@ void balance_dirty_pages(struct address_space *mapping) background_thresh = (dirty_background_ratio * total_pages) / 100; async_thresh = (dirty_async_ratio * total_pages) / 100; - sync_thresh = (dirty_sync_ratio * total_pages) / 100; bdi = mapping->backing_dev_info; - if (dirty_and_writeback > sync_thresh) { - int nr_to_write = sync_writeback_pages(); + if (dirty_and_writeback > async_thresh) { + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = sync_writeback_pages(), + }; - writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL); - get_page_state(&ps); - } else if (dirty_and_writeback > async_thresh) { - int nr_to_write = sync_writeback_pages(); - - writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL); + writeback_inodes(&wbc); get_page_state(&ps); } @@ -177,7 +167,12 @@ static void background_writeout(unsigned long _min_pages) { long min_pages = _min_pages; long background_thresh; - int nr_to_write; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 0, + }; CHECK_EMERGENCY_SYNC @@ -185,14 +180,13 @@ static void background_writeout(unsigned long _min_pages) do { struct page_state ps; - get_page_state(&ps); if (ps.nr_dirty < background_thresh && min_pages <= 0) break; - nr_to_write = MAX_WRITEBACK_PAGES; - writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL); - min_pages -= MAX_WRITEBACK_PAGES - nr_to_write; - } while (nr_to_write <= 0); + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + writeback_inodes(&wbc); + min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + } while (wbc.nr_to_write <= 0); blk_run_queues(); } @@ -230,7 +224,12 @@ static void wb_kupdate(unsigned long arg) unsigned long start_jif; unsigned long next_jif; struct page_state ps; - int nr_to_write; + struct writeback_control wbc = { + .bdi = NULL, + .sync_mode = WB_SYNC_NONE, + .older_than_this = &oldest_jif, + .nr_to_write = 0, + }; sync_supers(); get_page_state(&ps); @@ -238,8 +237,8 @@ static void wb_kupdate(unsigned long arg) oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; start_jif = jiffies; next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; - nr_to_write = ps.nr_dirty; - writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif); + wbc.nr_to_write = ps.nr_dirty; + writeback_inodes(&wbc); blk_run_queues(); yield(); @@ -312,8 +311,6 @@ static int __init page_writeback_init(void) dirty_background_ratio /= 100; dirty_async_ratio *= correction; dirty_async_ratio /= 100; - dirty_sync_ratio *= correction; - dirty_sync_ratio /= 100; } init_timer(&wb_timer); @@ -351,7 +348,7 @@ module_init(page_writeback_init); * So. The proper fix is to leave the page locked-and-dirty and to pass * it all the way down. 
*/ -int generic_vm_writeback(struct page *page, int *nr_to_write) +int generic_vm_writeback(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; @@ -363,7 +360,7 @@ int generic_vm_writeback(struct page *page, int *nr_to_write) unlock_page(page); if (inode) { - do_writepages(inode->i_mapping, nr_to_write); + do_writepages(inode->i_mapping, wbc); /* * This iput() will internally call ext2_discard_prealloc(), @@ -392,11 +389,11 @@ int generic_vm_writeback(struct page *page, int *nr_to_write) } EXPORT_SYMBOL(generic_vm_writeback); -int do_writepages(struct address_space *mapping, int *nr_to_write) +int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { if (mapping->a_ops->writepages) - return mapping->a_ops->writepages(mapping, nr_to_write); - return generic_writepages(mapping, nr_to_write); + return mapping->a_ops->writepages(mapping, wbc); + return generic_writepages(mapping, wbc); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a806031113fc0f8806b8577b3838e6401e476b60..435a12dd157495bd2442a161695f0af98d322d54 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -256,14 +256,6 @@ int is_head_of_free_region(struct page *page) } #endif /* CONFIG_SOFTWARE_SUSPEND */ -#ifndef CONFIG_DISCONTIGMEM -struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - return __alloc_pages(gfp_mask, order, - contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); -} -#endif - static /* inline */ struct page * balance_classzone(struct zone* classzone, unsigned int gfp_mask, unsigned int order, int * freed) @@ -680,13 +672,41 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. */ -static inline void build_zonelists(pg_data_t *pgdat) +static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) { - int i, j, k; + switch (k) { + struct zone *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->size) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->size) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->size) + zonelist->zones[j++] = zone; + } + return j; +} + +static void __init build_zonelists(pg_data_t *pgdat) +{ + int i, j, k, node, local_node; + + local_node = pgdat->node_id; + printk("Building zonelist for node : %d\n", local_node); for (i = 0; i <= GFP_ZONEMASK; i++) { struct zonelist *zonelist; - struct zone *zone; zonelist = pgdat->node_zonelists + i; memset(zonelist, 0, sizeof(*zonelist)); @@ -698,33 +718,49 @@ static inline void build_zonelists(pg_data_t *pgdat) if (i & __GFP_DMA) k = ZONE_DMA; - switch (k) { - default: - BUG(); - /* - * fallthrough: - */ - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->size) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->size) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) - zonelist->zones[j++] = zone; - } + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. 
+ * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j++] = NULL; } } +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); +} + +void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size, + unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_size = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + /* * Helper functions to size the waitqueue hash table. * Essentially these want to choose hash table sizes sufficiently @@ -775,46 +811,18 @@ static inline unsigned long wait_table_bits(unsigned long size) * - mark all memory queues empty * - clear the memory bitmaps */ -void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, - unsigned long *zones_size, unsigned long zone_start_pfn, - unsigned long *zholes_size, struct page *lmem_map) +void __init free_area_init_core(pg_data_t *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) { unsigned long i, j; - unsigned long map_size; - unsigned long totalpages, offset, realtotalpages; + unsigned long local_offset; const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + int nid = pgdat->node_id; + struct page *lmem_map = pgdat->node_mem_map; + unsigned long zone_start_pfn = pgdat->node_start_pfn; - totalpages = 0; - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - - printk("On node %d totalpages: %lu\n", nid, realtotalpages); - - /* - * Some architectures (with lots of mem and discontinous memory - * maps) have to search for a good mem_map area: - * For discontigmem, the conceptual mem map array starts from - * PAGE_OFFSET, we need to align the actual array onto a mem map - * boundary, so that MAP_NR works. 
- */ - map_size = (totalpages + 1)*sizeof(struct page); - if (lmem_map == (struct page *)0) { - lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); - lmem_map = (struct page *)(PAGE_OFFSET + - MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); - } - *gmap = pgdat->node_mem_map = lmem_map; - pgdat->node_size = totalpages; - pgdat->node_start_pfn = zone_start_pfn; - pgdat->node_start_mapnr = (lmem_map - mem_map); pgdat->nr_zones = 0; - - offset = lmem_map - mem_map; + local_offset = 0; /* offset within lmem_map */ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long mask; @@ -866,8 +874,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, zone->pages_low = mask*2; zone->pages_high = mask*3; - zone->zone_mem_map = mem_map + offset; - zone->zone_start_mapnr = offset; + zone->zone_mem_map = lmem_map + local_offset; zone->zone_start_pfn = zone_start_pfn; if ((zone_start_pfn) & (zone_required_alignment-1)) @@ -879,7 +886,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, * done. Non-atomic initialization, single-pass. */ for (i = 0; i < size; i++) { - struct page *page = mem_map + offset + i; + struct page *page = lmem_map + local_offset + i; set_page_zone(page, nid * MAX_NR_ZONES + j); set_page_count(page, 0); SetPageReserved(page); @@ -893,7 +900,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, zone_start_pfn++; } - offset += size; + local_offset += size; for (i = 0; ; i++) { unsigned long bitmap_size; @@ -932,13 +939,15 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); } } - build_zonelists(pgdat); } +#ifndef CONFIG_DISCONTIGMEM void __init free_area_init(unsigned long *zones_size) { - free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); + free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, NULL); + mem_map = contig_page_data.node_mem_map; } +#endif static int __init setup_mem_frac(char *str) { diff --git a/mm/page_io.c b/mm/page_io.c index ced005c65001977197f5ac338079bf462decee3c..47de394d5576a75cc4cb4fe163cfd4c99be5f5b6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -131,12 +131,12 @@ int swap_readpage(struct file *file, struct page *page) * Swap pages are !PageLocked and PageWriteback while under writeout so that * memory allocators will throttle against them. 
*/ -static int swap_vm_writeback(struct page *page, int *nr_to_write) +static int swap_vm_writeback(struct page *page, struct writeback_control *wbc) { struct address_space *mapping = page->mapping; unlock_page(page); - return generic_writepages(mapping, nr_to_write); + return generic_writepages(mapping, wbc); } struct address_space_operations swap_aops = { diff --git a/mm/shmem.c b/mm/shmem.c index 53a5defb4436ab809aad631a1f982e39aaf4e235..496659e341f4475d825dae668ae61191105e11fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,7 +28,6 @@ #include <linux/pagemap.h> #include <linux/string.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/backing-dev.h> #include <linux/shmem_fs.h> diff --git a/mm/swap.c b/mm/swap.c index 4e88784e20457e0d155cbc34bbdcc0d4f44adc1c..4528369df0840768a4748ba8964ac0abdf926b6c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -124,9 +124,9 @@ void release_pages(struct page **pages, int nr) if (page_count(page) == 0) { if (!pagevec_add(&pages_to_free, page)) { spin_unlock_irq(&zone->lru_lock); - pagevec_free(&pages_to_free); + __pagevec_free(&pages_to_free); pagevec_init(&pages_to_free); - spin_lock_irq(&zone->lru_lock); + zone = NULL; /* No lock is held */ } } } @@ -165,8 +165,8 @@ void __pagevec_release_nonlru(struct pagevec *pvec) } /* - * Move all the inactive pages to the head of the inactive list - * and release them. Reinitialises the caller's pagevec. + * Move all the inactive pages to the head of the inactive list and release + * them. Reinitialises the caller's pagevec. */ void pagevec_deactivate_inactive(struct pagevec *pvec) { @@ -180,8 +180,6 @@ void pagevec_deactivate_inactive(struct pagevec *pvec) struct zone *pagezone = page_zone(page); if (pagezone != zone) { - if (PageActive(page) || !PageLRU(page)) - continue; if (zone) spin_unlock_irq(&zone->lru_lock); zone = pagezone; diff --git a/mm/swap_state.c b/mm/swap_state.c index d07f8db1f7c72b5613bdcd28546d035956a6dfe7..d936aadcbf921209a5ede3ecdb91949d332bca69 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -12,7 +12,6 @@ #include <linux/swap.h> #include <linux/init.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> /* block_sync_page() */ @@ -119,7 +118,7 @@ void __delete_from_swap_cache(struct page *page) int add_to_swap(struct page * page) { swp_entry_t entry; - int flags; + int pf_flags; if (!PageLocked(page)) BUG(); @@ -142,7 +141,7 @@ int add_to_swap(struct page * page) * just not all of them. */ - flags = current->flags; + pf_flags = current->flags; current->flags &= ~PF_MEMALLOC; current->flags |= PF_NOWARN; ClearPageUptodate(page); /* why? 
*/ @@ -154,20 +153,20 @@ int add_to_swap(struct page * page) */ switch (add_to_swap_cache(page, entry)) { case 0: /* Success */ - current->flags = flags; + current->flags = pf_flags; SetPageUptodate(page); set_page_dirty(page); swap_free(entry); return 1; case -ENOMEM: /* radix-tree allocation */ - current->flags = flags; + current->flags = pf_flags; swap_free(entry); return 0; default: /* ENOENT: raced */ break; } /* Raced with "speculative" read_swap_cache_async */ - current->flags = flags; + current->flags = pf_flags; swap_free(entry); } } diff --git a/mm/swapfile.c b/mm/swapfile.c index 330c94cef7877b5ecd6534d403eeab80024de681..000ed1583dc5bd7a5b81f058ede5de6c05cdc76b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -7,7 +7,6 @@ #include <linux/mm.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/vmalloc.h> diff --git a/mm/vmscan.c b/mm/vmscan.c index f220b40fc9c1b4e92378c08b45e5bc1a1cb7f38a..a8b2c1911c9ae35baecb865449a45a1e6620857b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -15,7 +15,6 @@ #include <linux/slab.h> #include <linux/kernel_stat.h> #include <linux/swap.h> -#include <linux/smp_lock.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> @@ -145,6 +144,7 @@ shrink_list(struct list_head *page_list, int nr_pages, if (!add_to_swap(page)) goto activate_locked; pte_chain_lock(page); + mapping = page->mapping; } /* @@ -174,15 +174,18 @@ shrink_list(struct list_head *page_list, int nr_pages, */ if (PageDirty(page) && is_page_cache_freeable(page) && mapping && may_enter_fs) { - int (*writeback)(struct page *, int *); + int (*writeback)(struct page *, + struct writeback_control *); const int cluster_size = SWAP_CLUSTER_MAX; - int nr_to_write = cluster_size; + struct writeback_control wbc = { + .nr_to_write = cluster_size, + }; writeback = mapping->a_ops->vm_writeback; if (writeback == NULL) writeback = generic_vm_writeback; - (*writeback)(page, &nr_to_write); - *max_scan -= (cluster_size - nr_to_write); + (*writeback)(page, &wbc); + *max_scan -= (cluster_size - wbc.nr_to_write); goto keep; }
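
The zonelist rework in mm/page_alloc.c above gives each node a fallback order that begins with its own zones and then wraps around through the remaining nodes, so remote allocation pressure is spread instead of always falling on the lowest-numbered nodes first. A small userspace sketch of that ordering is below; NUMNODES and the printing are illustrative only and stand in for the numnodes/NODE_DATA() walk done by build_zonelists().

#include <stdio.h>

#define NUMNODES 4	/* example node count, not from the patch */

int main(void)
{
	int local_node, node;

	for (local_node = 0; local_node < NUMNODES; local_node++) {
		printf("node %d fallback order:", local_node);
		/* local node first */
		printf(" %d", local_node);
		/* then the higher-numbered nodes... */
		for (node = local_node + 1; node < NUMNODES; node++)
			printf(" %d", node);
		/* ...then wrap around to the lower-numbered ones */
		for (node = 0; node < local_node; node++)
			printf(" %d", node);
		printf("\n");
	}
	return 0;
}

For four nodes this prints, for example, "node 1 fallback order: 1 2 3 0", matching the comment in the patch that node N is followed by node N+1 and so on, modulo the number of nodes.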