Commit 616d8602 authored by Linus Torvalds

Linux 2.2.2-pre2

this one contains various small documentation updates and updates to xconfig,
but the important parts (and the smallest part of the actual patch) are:

 - shared file lockup fix by Stephen Tweedie
 - my fix for the TCP bug that Ingo found
 - Ingo's io-apic setup fixes, which should finally get rid of the
   spurious apic interrupts with some motherboards and the ExtINT setup.
 - inode leak thing
 - SMP scheduler potential race condition fix
 - sound driver updates
 - partition and disk fixes (2kB blocksize media and some IDE disk
   geometry and irq detection issues).

None of the fixes are critical to most people, but all of them _can_ be
critical to people who have seen problems in these areas. As such, if
you're happy with 2.2.1 there is no pressing reason to test this patch
out, but I hope to get the pre-patches tested well enough that the final
2.2.2 can be left around for a while (CD-ROM manufacturers etc would
certainly prefer not to see lots of releases).

                Linus
parent da0f0135
@@ -202,7 +202,7 @@ DO_ACTION( enable, 1, |= 0xff000000, ) /* destination = 0xff */
 DO_ACTION( mask, 0, |= 0x00010000, io_apic_sync()) /* mask = 1 */
 DO_ACTION( unmask, 0, &= 0xfffeffff, ) /* mask = 0 */
 
-static void __init clear_IO_APIC_pin(unsigned int pin)
+static void clear_IO_APIC_pin(unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
@@ -215,6 +215,13 @@ static void __init clear_IO_APIC_pin(unsigned int pin)
 	io_apic_write(0x11 + 2 * pin, *(((int *)&entry) + 1));
 }
 
+static void clear_IO_APIC (void)
+{
+	int pin;
+
+	for (pin = 0; pin < nr_ioapic_registers; pin++)
+		clear_IO_APIC_pin(pin);
+}
+
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -625,7 +632,7 @@ void __init setup_IO_APIC_irqs(void)
 /*
  * Set up a certain pin as ExtINT delivered interrupt
  */
-void __init setup_ExtINT_pin(unsigned int pin)
+void __init setup_ExtINT_pin(unsigned int pin, int irq)
 {
 	struct IO_APIC_route_entry entry;
@@ -635,17 +642,16 @@ void __init setup_ExtINT_pin(unsigned int pin)
 	memset(&entry,0,sizeof(entry));
 
 	entry.delivery_mode = dest_ExtINT;
-	entry.dest_mode = 1;			/* logical delivery */
+	entry.dest_mode = 0;			/* physical delivery */
 	entry.mask = 0;				/* unmask IRQ now */
 	/*
-	 * Careful with this one. We do not use 'true' logical
-	 * delivery, as we set local APICs to LDR == 0. But
-	 * 0xff logical destination is special (broadcast).
-	 * Any other combination will cause problems.
+	 * We use physical delivery to get the timer IRQ
+	 * to the boot CPU. 'boot_cpu_id' is the physical
+	 * APIC ID of the boot CPU.
 	 */
-	entry.dest.logical.logical_dest = 0xff;
-	entry.vector = 0;			/* it's ignored */
+	entry.dest.physical.physical_dest = boot_cpu_id;
+	entry.vector = assign_irq_vector(irq);
 
 	entry.polarity = 0;
 	entry.trigger = 0;
@@ -760,7 +766,7 @@ void __init print_IO_APIC(void)
 
 static void __init init_sym_mode(void)
 {
-	int i, pin;
+	int i;
 
 	for (i = 0; i < PIN_MAP_SIZE; i++) {
 		irq_2_pin[i].pin = -1;
@@ -790,8 +796,7 @@ static void __init init_sym_mode(void)
 	/*
 	 * Do not trust the IO-APIC being empty at bootup
 	 */
-	for (pin = 0; pin < nr_ioapic_registers; pin++)
-		clear_IO_APIC_pin(pin);
+	clear_IO_APIC();
 }
 
 /*
@@ -799,6 +804,15 @@ static void __init init_sym_mode(void)
  */
 void init_pic_mode(void)
 {
+	/*
+	 * Clear the IO-APIC before rebooting:
+	 */
+	clear_IO_APIC();
+
+	/*
+	 * Put it back into PIC mode (has an effect only on
+	 * certain boards)
+	 */
 	printk("disabling symmetric IO mode... ");
 	outb_p(0x70, 0x22);
 	outb_p(0x00, 0x23);
@@ -1184,7 +1198,7 @@ static inline void check_timer(void)
 
 	if (pin2 != -1) {
 		printk(".. (found pin %d) ...", pin2);
-		setup_ExtINT_pin(pin2);
+		setup_ExtINT_pin(pin2, 0);
 		make_8259A_irq(0);
 	}
...
@@ -165,7 +165,7 @@ do_kdsk_ioctl(int cmd, struct kbentry *user_kbe, int perm, struct kbd_struct *kb
 				val = K_HOLE;
 		} else
 			val = (i ? K_HOLE : K_NOSUCHMAP);
-		return __put_user(val, &user_kbe->kb_value);
+		return put_user(val, &user_kbe->kb_value);
 	case KDSKBENT:
 		if (!perm)
 			return -EPERM;
@@ -244,7 +244,7 @@ do_kbkeycode_ioctl(int cmd, struct kbkeycode *user_kbkc, int perm)
 	case KDGETKEYCODE:
 		kc = getkeycode(tmp.scancode);
 		if (kc >= 0)
-			kc = __put_user(kc, &user_kbkc->keycode);
+			kc = put_user(kc, &user_kbkc->keycode);
 		break;
 	case KDSETKEYCODE:
 		if (!perm)
@@ -282,8 +282,8 @@ do_kdgkb_ioctl(int cmd, struct kbsentry *user_kdgkb, int perm)
 		p = func_table[i];
 		if(p)
 			for ( ; *p && sz; p++, sz--)
-				__put_user(*p, q++);
-		__put_user('\0', q);
+				put_user(*p, q++);
+		put_user('\0', q);
 		return ((p && *p) ? -EOVERFLOW : 0);
 	case KDSKBSENT:
 		if (!perm)
@@ -603,12 +603,10 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 	{
 		struct kbdiacrs *a = (struct kbdiacrs *)arg;
 
-		i = verify_area(VERIFY_WRITE, (void *) a, sizeof(struct kbdiacrs));
-		if (i)
-			return i;
-		__put_user(accent_table_size, &a->kb_cnt);
-		__copy_to_user(a->kbdiacr, accent_table,
-			       accent_table_size*sizeof(struct kbdiacr));
+		if (put_user(accent_table_size, &a->kb_cnt))
+			return -EFAULT;
+		if (copy_to_user(a->kbdiacr, accent_table, accent_table_size*sizeof(struct kbdiacr)))
+			return -EFAULT;
 		return 0;
 	}
@@ -619,14 +617,13 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		if (!perm)
 			return -EPERM;
-		i = verify_area(VERIFY_READ, (void *) a, sizeof(struct kbdiacrs));
-		if (i)
-			return i;
-		__get_user(ct,&a->kb_cnt);
+		if (get_user(ct,&a->kb_cnt))
+			return -EFAULT;
 		if (ct >= MAX_DIACR)
 			return -EINVAL;
 		accent_table_size = ct;
-		__copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr));
+		if (copy_from_user(accent_table, a->kbdiacr, ct*sizeof(struct kbdiacr)))
+			return -EFAULT;
 		return 0;
 	}
@@ -717,12 +714,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		i = verify_area(VERIFY_WRITE,(void *)vtstat, sizeof(struct vt_stat));
 		if (i)
 			return i;
-		__put_user(fg_console + 1, &vtstat->v_active);
+		put_user(fg_console + 1, &vtstat->v_active);
 		state = 1;	/* /dev/tty0 is always open */
 		for (i = 0, mask = 2; i < MAX_NR_CONSOLES && mask; ++i, mask <<= 1)
 			if (VT_IS_IN_USE(i))
 				state |= mask;
-		return __put_user(state, &vtstat->v_state);
+		return put_user(state, &vtstat->v_state);
 	}
 
 	/*
@@ -856,8 +853,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		i = verify_area(VERIFY_READ, (void *)vtsizes, sizeof(struct vt_sizes));
 		if (i)
 			return i;
-		__get_user(ll, &vtsizes->v_rows);
-		__get_user(cc, &vtsizes->v_cols);
+		get_user(ll, &vtsizes->v_rows);
+		get_user(cc, &vtsizes->v_cols);
 		return vc_resize_all(ll, cc);
 	}
@@ -870,12 +867,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		i = verify_area(VERIFY_READ, (void *)vtconsize, sizeof(struct vt_consize));
 		if (i)
 			return i;
-		__get_user(ll, &vtconsize->v_rows);
-		__get_user(cc, &vtconsize->v_cols);
-		__get_user(vlin, &vtconsize->v_vlin);
-		__get_user(clin, &vtconsize->v_clin);
-		__get_user(vcol, &vtconsize->v_vcol);
-		__get_user(ccol, &vtconsize->v_ccol);
+		get_user(ll, &vtconsize->v_rows);
+		get_user(cc, &vtconsize->v_cols);
+		get_user(vlin, &vtconsize->v_vlin);
+		get_user(clin, &vtconsize->v_clin);
+		get_user(vcol, &vtconsize->v_vcol);
+		get_user(ccol, &vtconsize->v_ccol);
 		vlin = vlin ? vlin : video_scan_lines;
 		if ( clin )
 		{
...
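Every vt.c hunk above applies the same conversion: put_user(), get_user(), copy_to_user() and copy_from_user() do their own access checking and return nonzero on a bad user pointer, so the separate verify_area() call that used to guard the unchecked double-underscore variants can be dropped and the return value checked directly. A minimal sketch of the before/after pattern; struct foo and the handler names here are hypothetical, not code from the patch:

/* Hypothetical illustration of the conversion, not from the patch. */
struct foo { int cnt; };

/* Old style: validate the whole area, then use the unchecked copy. */
static int get_foo_old(struct foo *ufoo)
{
	int err = verify_area(VERIFY_WRITE, (void *) ufoo, sizeof(struct foo));
	if (err)
		return err;
	__put_user(42, &ufoo->cnt);
	return 0;
}

/* New style: put_user() checks and copies in one step. */
static int get_foo_new(struct foo *ufoo)
{
	if (put_user(42, &ufoo->cnt))
		return -EFAULT;
	return 0;
}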
@@ -232,13 +232,15 @@ void clear_inode(struct inode *inode)
 
 /*
  * Dispose-list gets a local list, so it doesn't need to
- * worry about list corruption.
+ * worry about list corruption. It releases the inode lock
+ * while clearing the inodes.
  */
 static void dispose_list(struct list_head * head)
 {
 	struct list_head *next;
 	int count = 0;
 
+	spin_unlock(&inode_lock);
 	next = head->next;
 	for (;;) {
 		struct list_head * tmp = next;
@@ -256,7 +258,6 @@ static void dispose_list(struct list_head * head)
 	spin_lock(&inode_lock);
 	list_splice(head, &inode_unused);
 	inodes_stat.nr_free_inodes += count;
-	spin_unlock(&inode_lock);
 }
 
 /*
@@ -305,52 +306,52 @@ int invalidate_inodes(struct super_block * sb)
 	spin_lock(&inode_lock);
 	busy = invalidate_list(&inode_in_use, sb, &throw_away);
 	busy |= invalidate_list(&sb->s_dirty, sb, &throw_away);
-	spin_unlock(&inode_lock);
 	dispose_list(&throw_away);
+	spin_unlock(&inode_lock);
 
 	return busy;
 }
 
 /*
  * This is called with the inode lock held. It searches
- * the in-use for the specified number of freeable inodes.
- * Freeable inodes are moved to a temporary list and then
- * placed on the unused list by dispose_list.
+ * the in-use for freeable inodes, which are moved to a
+ * temporary list and then placed on the unused list by
+ * dispose_list.
  *
- * We don't expect to have to call this very often.
- *
- * Note that we do not expect to have to search very hard:
- * the freeable inodes will be at the old end of the list.
- *
- * N.B. The spinlock is released to call dispose_list.
+ * N.B. The spinlock is released during the call to
+ * dispose_list.
  */
 #define CAN_UNUSE(inode) \
-	(((inode)->i_count == 0) && \
-	 (!(inode)->i_state))
+	(((inode)->i_count | (inode)->i_state) == 0)
+#define INODE(entry)	(list_entry(entry, struct inode, i_list))
 
-static int free_inodes(int goal)
+static int free_inodes(void)
 {
-	struct list_head *tmp, *head = &inode_in_use;
-	LIST_HEAD(freeable);
-	int found = 0, depth = goal << 1;
+	struct list_head list, *entry, *freeable = &list;
+	int found = 0;
 
-	while ((tmp = head->prev) != head && depth--) {
-		struct inode * inode = list_entry(tmp, struct inode, i_list);
-		list_del(tmp);
-		if (CAN_UNUSE(inode)) {
-			list_del(&inode->i_hash);
-			INIT_LIST_HEAD(&inode->i_hash);
-			list_add(tmp, &freeable);
-			if (++found < goal)
-				continue;
-			break;
-		}
-		list_add(tmp, head);
+	INIT_LIST_HEAD(freeable);
+	entry = inode_in_use.next;
+	while (entry != &inode_in_use) {
+		struct list_head *tmp = entry;
+
+		entry = entry->next;
+		if (!CAN_UNUSE(INODE(tmp)))
+			continue;
+		list_del(tmp);
+		list_del(&INODE(tmp)->i_hash);
+		INIT_LIST_HEAD(&INODE(tmp)->i_hash);
+		list_add(tmp, freeable);
+		found = 1;
 	}
+
 	if (found) {
-		spin_unlock(&inode_lock);
-		dispose_list(&freeable);
-		spin_lock(&inode_lock);
+		dispose_list(freeable);
+		found = 1;	/* silly compiler */
 	}
+
 	return found;
 }
@@ -374,7 +375,7 @@ static void shrink_dentry_inodes(int goal)
 static void try_to_free_inodes(int goal)
 {
 	shrink_dentry_inodes(goal);
-	if (!free_inodes(goal))
+	if (!free_inodes())
 		shrink_dentry_inodes(goal);
 }
@@ -385,7 +386,7 @@ static void try_to_free_inodes(int goal)
 void free_inode_memory(int goal)
 {
 	spin_lock(&inode_lock);
-	free_inodes(goal);
+	free_inodes();
 	spin_unlock(&inode_lock);
 }
@@ -450,7 +451,7 @@ static struct inode * grow_inodes(void)
 	inodes_stat.preshrink = 1;
 
 	spin_lock(&inode_lock);
-	free_inodes(inodes_stat.nr_inodes >> 2);
+	free_inodes();
 	{
 		struct list_head *tmp = inode_unused.next;
 
 		if (tmp != &inode_unused) {
...
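One detail worth noting in the new CAN_UNUSE(): for unsigned values, (a | b) == 0 holds exactly when both a == 0 and b == 0, so OR-ing i_count and i_state tests both fields with a single compare and branch. A tiny standalone illustration of the identity (hypothetical names, not from the patch):

#include <assert.h>

/* For unsigned words, (a | b) == 0  <=>  (a == 0 && b == 0). */
static int both_zero(unsigned long a, unsigned long b)
{
	return (a | b) == 0;
}

int main(void)
{
	assert(both_zero(0, 0));
	assert(!both_zero(0, 3));
	assert(!both_zero(7, 0));
	return 0;
}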
@@ -174,6 +174,8 @@ struct mm_struct {
 	unsigned long rss, total_vm, locked_vm;
 	unsigned long def_flags;
 	unsigned long cpu_vm_mask;
+	unsigned long swap_cnt;	/* number of pages to swap on next pass */
+	unsigned long swap_address;
 	/*
 	 * This is an architecture-specific pointer: the portable
 	 * part of Linux does not know about any segments.
@@ -191,7 +193,7 @@ struct mm_struct {
 		0, 0, 0, \
 		0, 0, 0, 0, \
 		0, 0, 0, \
-		0, 0, NULL }
+		0, 0, 0, 0, NULL }
 
 struct signal_struct {
 	atomic_t count;
@@ -276,8 +278,6 @@ struct task_struct {
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
 	int swappable:1;
-	unsigned long swap_address;
-	unsigned long swap_cnt;		/* number of pages to swap on next pass */
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
@@ -361,7 +361,7 @@ struct task_struct {
 /* utime */	{0,0,0,0},0, \
 /* per CPU times */ {0, }, {0, }, \
 /* flt */	0,0,0,0,0,0, \
-/* swp */	0,0,0, \
+/* swp */	0, \
 /* process credentials */ \
 /* uid etc */	0,0,0,0,0,0,0,0, \
 /* suppl grps*/ 0, {0,}, \
...
@@ -64,6 +64,7 @@ extern int console_loglevel;
 static int init(void *);
 extern int bdflush(void *);
 extern int kswapd(void *);
+extern int kpiod(void *);
 extern void kswapd_setup(void);
 
 extern void init_IRQ(void);
@@ -1271,6 +1272,7 @@ static void __init do_basic_setup(void)
 	kernel_thread(bdflush, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	/* Start the background pageout daemon. */
 	kswapd_setup();
+	kernel_thread(kpiod, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
 
 #if CONFIG_AP1000
...
@@ -107,6 +107,7 @@ EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(update_vm_cache);
 EXPORT_SYMBOL(vmtruncate);
 EXPORT_SYMBOL(find_vma);
+EXPORT_SYMBOL(get_unmapped_area);
 
 /* filesystem internal functions */
 EXPORT_SYMBOL(in_group_p);
...
@@ -680,8 +680,18 @@ asmlinkage void schedule(void)
 
 	sched_data->prevstate = prev->state;
 
+/* this is the scheduler proper: */
 	{
 		struct task_struct * p = init_task.next_run;
+		int c = -1000;
+
+		/* Default process to select.. */
+		next = idle_task;
+		if (prev->state == TASK_RUNNING) {
+			c = goodness(prev, prev, this_cpu);
+			next = prev;
+		}
+
 		/*
 		 * This is subtle.
 		 * Note how we can enable interrupts here, even
@@ -693,36 +703,27 @@
 		 * the scheduler lock
 		 */
 		spin_unlock_irq(&runqueue_lock);
-#ifdef __SMP__
-		prev->has_cpu = 0;
-#endif
 
 	/*
 	 * Note! there may appear new tasks on the run-queue during this, as
 	 * interrupts are enabled. However, they will be put on front of the
 	 * list, so our list starting at "p" is essentially fixed.
	 */
-/* this is the scheduler proper: */
-		{
-			int c = -1000;
-			next = idle_task;
-			while (p != &init_task) {
-				if (can_schedule(p)) {
-					int weight = goodness(p, prev, this_cpu);
-					if (weight > c)
-						c = weight, next = p;
-				}
-				p = p->next_run;
-			}
+		while (p != &init_task) {
+			if (can_schedule(p)) {
+				int weight = goodness(p, prev, this_cpu);
+				if (weight > c)
+					c = weight, next = p;
+			}
+			p = p->next_run;
+		}
 
 		/* Do we need to re-calculate counters? */
 		if (!c) {
 			struct task_struct *p;
 			read_lock(&tasklist_lock);
 			for_each_task(p)
 				p->counter = (p->counter >> 1) + p->priority;
 			read_unlock(&tasklist_lock);
 		}
-		}
 	}
 }
@@ -751,10 +752,8 @@ asmlinkage void schedule(void)
 	 * thus we have to lock the previous process from getting
 	 * rescheduled during switch_to().
 	 */
-	prev->has_cpu = 1;
-	next->has_cpu = 1;
 	next->processor = this_cpu;
+	next->has_cpu = 1;
 	spin_unlock(&scheduler_lock);
 #endif /* __SMP__ */
 
 	if (prev != next) {
...
@@ -19,6 +19,7 @@
 #include <linux/blkdev.h>
 #include <linux/file.h>
 #include <linux/swapctl.h>
+#include <linux/slab.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -39,6 +40,26 @@ struct page * page_hash_table[PAGE_HASH_SIZE];
 
 #define release_page(page) __free_page((page))
 
+/*
+ * Define a request structure for outstanding page write requests
+ * to the background page io daemon
+ */
+
+struct pio_request
+{
+	struct pio_request *	next;
+	struct file *		file;
+	unsigned long		offset;
+	unsigned long		page;
+};
+static struct pio_request *pio_first = NULL, **pio_last = &pio_first;
+static kmem_cache_t *pio_request_cache;
+static struct wait_queue *pio_wait = NULL;
+
+static inline void
+make_pio_request(struct file *, unsigned long, unsigned long);
+
 /*
  * Invalidate the pages of an inode, removing all pages that aren't
  * locked down (those are sure to be up-to-date anyway, so we shouldn't
@@ -1079,8 +1100,9 @@ static inline int do_write_page(struct inode * inode, struct file * file,
 }
 
 static int filemap_write_page(struct vm_area_struct * vma,
 			      unsigned long offset,
-			      unsigned long page)
+			      unsigned long page,
+			      int wait)
 {
 	int result;
 	struct file * file;
@@ -1098,6 +1120,17 @@ static int filemap_write_page(struct vm_area_struct * vma,
 	 * and file could be released ... increment the count to be safe.
 	 */
 	file->f_count++;
+
+	/*
+	 * If this is a swapping operation rather than msync(), then
+	 * leave the actual IO, and the restoration of the file count,
+	 * to the kpiod thread.  Just queue the request for now.
+	 */
+	if (!wait) {
+		make_pio_request(file, offset, page);
+		return 0;
+	}
+
 	down(&inode->i_sem);
 	result = do_write_page(inode, file, (const char *) page, offset);
 	up(&inode->i_sem);
@@ -1113,7 +1146,7 @@ static int filemap_write_page(struct vm_area_struct * vma,
  */
 int filemap_swapout(struct vm_area_struct * vma, struct page * page)
 {
-	return filemap_write_page(vma, page->offset, page_address(page));
+	return filemap_write_page(vma, page->offset, page_address(page), 0);
 }
 
 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
@@ -1150,7 +1183,7 @@ static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
 			return 0;
 		}
 	}
-	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page);
+	error = filemap_write_page(vma, address - vma->vm_start + vma->vm_offset, page, 1);
 	free_page(page);
 	return error;
 }
@@ -1569,3 +1602,120 @@ void put_cached_page(unsigned long addr)
 	wake_up(&page->wait);
 	__free_page(page);
 }
+
+/* Add request for page IO to the queue */
+
+static inline void put_pio_request(struct pio_request *p)
+{
+	*pio_last = p;
+	p->next = NULL;
+	pio_last = &p->next;
+}
+
+/* Take the first page IO request off the queue */
+
+static inline struct pio_request * get_pio_request(void)
+{
+	struct pio_request * p = pio_first;
+	pio_first = p->next;
+	if (!pio_first)
+		pio_last = &pio_first;
+	return p;
+}
+
+/* Make a new page IO request and queue it to the kpiod thread */
+
+static inline void make_pio_request(struct file *file,
+				    unsigned long offset,
+				    unsigned long page)
+{
+	struct pio_request *p;
+
+	atomic_inc(&mem_map[MAP_NR(page)].count);
+
+	/*
+	 * We need to allocate without causing any recursive IO in the
+	 * current thread's context.  We might currently be swapping out
+	 * as a result of an allocation made while holding a critical
+	 * filesystem lock.  To avoid deadlock, we *MUST* not reenter
+	 * the filesystem in this thread.
+	 *
+	 * We can wait for kswapd to free memory, or we can try to free
+	 * pages without actually performing further IO, without fear of
+	 * deadlock.  --sct
+	 */
+
+	while ((p = kmem_cache_alloc(pio_request_cache, GFP_BUFFER)) == NULL) {
+		if (try_to_free_pages(__GFP_WAIT))
+			continue;
+		current->state = TASK_INTERRUPTIBLE;
+		schedule_timeout(HZ/10);
+	}
+
+	p->file   = file;
+	p->offset = offset;
+	p->page   = page;
+
+	put_pio_request(p);
+	wake_up(&pio_wait);
+}
+
+/*
+ * This is the only thread which is allowed to write out filemap pages
+ * while swapping.
+ *
+ * To avoid deadlock, it is important that we never reenter this thread.
+ * Although recursive memory allocations within this thread may result
+ * in more page swapping, that swapping will always be done by queuing
+ * another IO request to the same thread: we will never actually start
+ * that IO request until we have finished with the current one, and so
+ * we will not deadlock.
+ */
+int kpiod(void * unused)
+{
+	struct wait_queue wait = {current};
+	struct inode * inode;
+	struct dentry * dentry;
+	struct pio_request * p;
+
+	current->session = 1;
+	current->pgrp = 1;
+	strcpy(current->comm, "kpiod");
+	sigfillset(&current->blocked);
+	init_waitqueue(&pio_wait);
+
+	lock_kernel();
+
+	pio_request_cache = kmem_cache_create("pio_request",
+					      sizeof(struct pio_request),
					      0, SLAB_HWCACHE_ALIGN,
+					      NULL, NULL);
+	if (!pio_request_cache)
+		panic ("Could not create pio_request slab cache");
+
+	while (1) {
+		current->state = TASK_INTERRUPTIBLE;
+		add_wait_queue(&pio_wait, &wait);
+		while (!pio_first)
+			schedule();
+		remove_wait_queue(&pio_wait, &wait);
+		current->state = TASK_RUNNING;
+
+		while (pio_first) {
+			p = get_pio_request();
+			dentry = p->file->f_dentry;
+			inode = dentry->d_inode;
+
+			down(&inode->i_sem);
+			do_write_page(inode, p->file,
+				      (const char *) p->page, p->offset);
+			up(&inode->i_sem);
+			fput(p->file);
+			free_page(p->page);
+			kmem_cache_free(pio_request_cache, p);
+		}
+	}
+}
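The pio_first/pio_last pair above is the classic singly linked FIFO with a tail pointer-to-pointer: pio_last always points at the next field of the last request (or back at pio_first when the queue is empty), so put_pio_request() needs no empty-queue special case. A standalone userspace sketch of the same idiom follows; the names req, put_req and get_req are illustrative, not from the patch:

#include <assert.h>
#include <stddef.h>

struct req { struct req *next; int data; };

static struct req *first = NULL, **last = &first;

/* Enqueue at the tail; works unchanged when the queue is empty. */
static void put_req(struct req *p)
{
	*last = p;		/* link p in after the current tail */
	p->next = NULL;
	last = &p->next;	/* tail now points at p's link field */
}

/* Dequeue from the head; caller must ensure the queue is non-empty. */
static struct req *get_req(void)
{
	struct req *p = first;

	first = p->next;
	if (!first)
		last = &first;	/* queue drained: point tail back at head */
	return p;
}

int main(void)
{
	struct req a = { NULL, 1 }, b = { NULL, 2 };

	put_req(&a);
	put_req(&b);
	assert(get_req() == &a);	/* FIFO order */
	assert(get_req() == &b);
	return 0;
}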
@@ -202,7 +202,7 @@ static inline int swap_out_pmd(struct task_struct * tsk, struct vm_area_struct *
 	do {
 		int result;
-		tsk->swap_address = address + PAGE_SIZE;
+		tsk->mm->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(tsk, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
@@ -274,7 +274,7 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
 	/*
 	 * Go through process' page directory.
 	 */
-	address = p->swap_address;
+	address = p->mm->swap_address;
 
 	/*
 	 * Find the proper vm-area
@@ -296,8 +296,8 @@ static int swap_out_process(struct task_struct * p, int gfp_mask)
 	}
 
 	/* We didn't find anything for the process */
-	p->swap_cnt = 0;
-	p->swap_address = 0;
+	p->mm->swap_cnt = 0;
+	p->mm->swap_address = 0;
 	return 0;
 }
@@ -345,9 +345,9 @@ static int swap_out(unsigned int priority, int gfp_mask)
 			continue;
 		/* Refresh swap_cnt? */
 		if (assign)
-			p->swap_cnt = p->mm->rss;
-		if (p->swap_cnt > max_cnt) {
-			max_cnt = p->swap_cnt;
+			p->mm->swap_cnt = p->mm->rss;
+		if (p->mm->swap_cnt > max_cnt) {
+			max_cnt = p->mm->swap_cnt;
 			pbest = p;
 		}
 	}
...
@@ -184,6 +184,8 @@ get__netinfo(struct proto *pro, char *buffer, int format, char **start, off_t of
 		for (req = sp->tp_pinfo.af_tcp.syn_wait_queue; req;
 		     i++, req = req->dl_next) {
+			if (req->sk)
+				continue;
 			pos += 128;
 			if (pos < offset)
 				continue;
...
@@ -1563,12 +1563,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	}
 #endif /* CONFIG_FILTER */
 
-	/*
-	 * socket locking is here for SMP purposes as backlog rcv
-	 * is currently called with bh processing disabled.
-	 */
-	lock_sock(sk);
-
 	/*
 	 * This doesn't check if the socket has enough room for the packet.
 	 * Either process the packet _without_ queueing it and then free it,
@@ -1579,7 +1573,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
 			goto reset;
-		release_sock(sk);
 		return 0;
 	}
@@ -1590,14 +1583,21 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		nsk = tcp_v4_hnd_req(sk, skb);
 		if (!nsk)
 			goto discard;
-		lock_sock(nsk);
-		release_sock(sk);
+
+		/*
+		 * Queue it on the new socket if the new socket is active,
+		 * otherwise we just shortcircuit this and continue with
+		 * the new socket..
+		 */
+		if (atomic_read(&nsk->sock_readers)) {
+			__skb_queue_tail(&nsk->back_log, skb);
+			return 0;
+		}
+
 		sk = nsk;
 	}
 
 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
 		goto reset;
-	release_sock(sk);
 	return 0;
 
 reset:
@@ -1609,7 +1609,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	 * might be destroyed here. This current version compiles correctly,
 	 * but you have been warned.
 	 */
-	release_sock(sk);
 	return 0;
 }
...