Commit 9243548a authored by Andrew Morton's avatar Andrew Morton Committed by Ben Collins

[PATCH] /proc/kcore fixes

From: Tony Luck <tony.luck@intel.com>

/proc/kcore has been broken on some architectures for a long time.  Problems
surround the fact that some architectures allocate memory for vmalloc() and
thus modules at addresses below PAGE_OFFSET, which results in negative file
offsets in the virtual core file image provided by /proc/kcore.  There are
also pending problems for discontig memory systems as /proc/kcore just
pretends that there are no holes between "PAGE_OFFSET" and "high_memory", so
an unwary user (ok, super-user) can read non-existent memory which may do bad
things.  There may also be kernel objects that would be nice to view in
/proc/kcore, but do not show up there.

A pending change on ia64 to allow booting on machines that don't have
physical memory in any convenient pre-determined place will make things even
worse, because the kernel itself won't show up in the current implementation
of /proc/kcore!

The patch attached provides enough hooks that each architecture should be
able to make /proc/kcore useful.  The patch is INCOMPLETE in that *use* of
those hooks is ONLY PROVIDED FOR IA64.

Here's how it works.  The default code in fs/proc/kcore.c doesn't set up any
"elf_phdr" sections ...  it is left to each architecture to make appropriate
calls to "kclist_add()" to specify a base address and size for each piece of
kernel virtual address space that needs to be made accessible through
/proc/kcore.  To get the old functionality, you'll need two calls that look
something like:

 kclist_add(&kcore_mem, __va(0),
             max_low_pfn * PAGE_SIZE);
 kclist_add(&kcore_vmem, (void *)VMALLOC_START,
             VMALLOC_END-VMALLOC_START);

The first makes all of memory visible (__i386__, __mc68000__ and __x86_64__
should use __va(PAGE_SIZE) to duplicate the original lack of access to page
0).  The second provides a single map for all "vmalloc" space (the code still
searches the vmlist to see what actually exists before accessing it).

Other blocks of kernel virtual space can be added as needed, and removed
again (with kclist_del()).  E.g.  discontiguous memory machines can add an
entry for each block of memory.  Architectures that allocate memory for
modules someplace outside of vmalloc-land can add/remove entries on module
insert and remove.

The second piece of abstraction is the kc_vaddr_to_offset() and
kc_offset_to_vaddr() macros.  These provide mappings from kernel virtual
addresses to offsets in the virtual file that /proc/kcore instantiates.  I
hope they are sufficient to avoid negative offset problems that plagued the
old /proc/kcore.  Default versions are provided for the old behaviour
(mapping simply adds/subtracts PAGE_OFFSET).  For ia64 I just need to use a
different offset as all kernel virtual allocations are in the high 37.5% of
the 64-bit virtual address space.  x86_64 was the other architecture with
this problem.  I don't know enough (anything) about the kernel memory map on
x86_64, so I hope these provide a big enough hook.  I'm hoping that you have
some low stuff, and some high stuff with a big hole in the middle ...  in
which case the macros might look something like:

#define kc_vaddr_to_offset(v) ((v) < 0x1000000000000000 ? (v) : \
                              ((v) - 0xF000000000000000))

But if you have interesting stuff scattered across *every* part of the
unsigned address range, then you won't be able to squeeze it all into a
signed file offset.

There are a couple of bug fixes too:
1) get_kcore_size() didn't account for the elf_prstatus, elf_prpsinfo
   and task_struct that are placed in the PT_NOTE section that is
   part of the header.  We were saved on most configurations by the
   round-up to PAGE_SIZE ... but it's possible that some architectures
   or machines corrupted memory beyond the space allocated for the
   header.

2) The size of the PT_NOTES section was incorrectly set to the size
   of the last note, rather than the sum of the sizes of all the notes.
parent 32b5fa26
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
struct mmu_gather mmu_gathers[NR_CPUS]; struct mmu_gather mmu_gathers[NR_CPUS];
/* References to section boundaries: */ /* References to section boundaries: */
extern char _stext, _etext, _edata, __init_begin, __init_end; extern char _stext, _etext, _edata, __init_begin, __init_end, _end;
extern void ia64_tlb_init (void); extern void ia64_tlb_init (void);
...@@ -583,6 +583,7 @@ mem_init (void) ...@@ -583,6 +583,7 @@ mem_init (void)
long reserved_pages, codesize, datasize, initsize; long reserved_pages, codesize, datasize, initsize;
unsigned long num_pgt_pages; unsigned long num_pgt_pages;
pg_data_t *pgdat; pg_data_t *pgdat;
static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
#ifdef CONFIG_PCI #ifdef CONFIG_PCI
/* /*
...@@ -601,6 +602,10 @@ mem_init (void) ...@@ -601,6 +602,10 @@ mem_init (void)
high_memory = __va(max_low_pfn * PAGE_SIZE); high_memory = __va(max_low_pfn * PAGE_SIZE);
kclist_add(&kcore_mem, __va(0), max_low_pfn * PAGE_SIZE);
kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
kclist_add(&kcore_kernel, &_stext, &_end - &_stext);
for_each_pgdat(pgdat) for_each_pgdat(pgdat)
totalram_pages += free_all_bootmem_node(pgdat); totalram_pages += free_all_bootmem_node(pgdat);
......
...@@ -99,7 +99,12 @@ static ssize_t read_kcore(struct file *file, char *buf, size_t count, loff_t *pp ...@@ -99,7 +99,12 @@ static ssize_t read_kcore(struct file *file, char *buf, size_t count, loff_t *pp
} }
#else /* CONFIG_KCORE_AOUT */ #else /* CONFIG_KCORE_AOUT */
#define KCORE_BASE PAGE_OFFSET #ifndef kc_vaddr_to_offset
#define kc_vaddr_to_offset(v) ((v) - PAGE_OFFSET)
#endif
#ifndef kc_offset_to_vaddr
#define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET)
#endif
#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) #define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
...@@ -112,29 +117,60 @@ struct memelfnote ...@@ -112,29 +117,60 @@ struct memelfnote
void *data; void *data;
}; };
static struct kcore_list *kclist;
static rwlock_t kclist_lock = RW_LOCK_UNLOCKED;
/*
 * Register a block of kernel virtual address space with /proc/kcore.
 * The caller owns the storage for 'new'; it is linked onto the head of
 * the global kclist under the writer side of kclist_lock, so the entry
 * becomes visible to readers atomically.
 */
void
kclist_add(struct kcore_list *new, void *addr, size_t size)
{
	write_lock(&kclist_lock);

	/* Fill in the descriptor, then push it onto the front of the list. */
	new->addr = (unsigned long)addr;
	new->size = size;
	new->next = kclist;
	kclist    = new;

	write_unlock(&kclist_lock);
}
struct kcore_list *
kclist_del(void *addr)
{
struct kcore_list *m, **p = &kclist;
write_lock(&kclist_lock);
for (m = *p; m; p = &m->next) {
if (m->addr == (unsigned long)addr) {
*p = m->next;
write_unlock(&kclist_lock);
return m;
}
}
write_unlock(&kclist_lock);
return 0;
}
extern char saved_command_line[]; extern char saved_command_line[];
static size_t get_kcore_size(int *num_vma, size_t *elf_buflen) static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
{ {
size_t try, size; size_t try, size;
struct vm_struct *m; struct kcore_list *m;
*num_vma = 0; *nphdr = 1; /* PT_NOTE */
size = ((size_t)high_memory - KCORE_BASE + PAGE_SIZE); size = 0;
if (!vmlist) {
*elf_buflen = PAGE_SIZE;
return (size);
}
for (m=vmlist; m; m=m->next) { for (m=kclist; m; m=m->next) {
try = (size_t)m->addr + m->size; try = kc_vaddr_to_offset((size_t)m->addr + m->size);
if (try > KCORE_BASE + size) if (try > size)
size = try - KCORE_BASE; size = try;
*num_vma = *num_vma + 1; *nphdr = *nphdr + 1;
} }
*elf_buflen = sizeof(struct elfhdr) + *elf_buflen = sizeof(struct elfhdr) +
(*num_vma + 2)*sizeof(struct elf_phdr) + (*nphdr + 2)*sizeof(struct elf_phdr) +
3 * sizeof(struct memelfnote); 3 * sizeof(struct memelfnote) +
sizeof(struct elf_prstatus) +
sizeof(struct elf_prpsinfo) +
sizeof(struct task_struct);
*elf_buflen = PAGE_ALIGN(*elf_buflen); *elf_buflen = PAGE_ALIGN(*elf_buflen);
return size + *elf_buflen; return size + *elf_buflen;
} }
...@@ -184,9 +220,9 @@ static char *storenote(struct memelfnote *men, char *bufp) ...@@ -184,9 +220,9 @@ static char *storenote(struct memelfnote *men, char *bufp)
/* /*
* store an ELF coredump header in the supplied buffer * store an ELF coredump header in the supplied buffer
* num_vma is the number of elements in vmlist * nphdr is the number of elf_phdr to insert
*/ */
static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff) static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
{ {
struct elf_prstatus prstatus; /* NT_PRSTATUS */ struct elf_prstatus prstatus; /* NT_PRSTATUS */
struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */ struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */
...@@ -194,7 +230,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff) ...@@ -194,7 +230,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff)
struct elfhdr *elf; struct elfhdr *elf;
struct memelfnote notes[3]; struct memelfnote notes[3];
off_t offset = 0; off_t offset = 0;
struct vm_struct *m; struct kcore_list *m;
/* setup ELF header */ /* setup ELF header */
elf = (struct elfhdr *) bufp; elf = (struct elfhdr *) bufp;
...@@ -214,7 +250,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff) ...@@ -214,7 +250,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff)
elf->e_flags = 0; elf->e_flags = 0;
elf->e_ehsize = sizeof(struct elfhdr); elf->e_ehsize = sizeof(struct elfhdr);
elf->e_phentsize= sizeof(struct elf_phdr); elf->e_phentsize= sizeof(struct elf_phdr);
elf->e_phnum = 2 + num_vma; elf->e_phnum = nphdr;
elf->e_shentsize= 0; elf->e_shentsize= 0;
elf->e_shnum = 0; elf->e_shnum = 0;
elf->e_shstrndx = 0; elf->e_shstrndx = 0;
...@@ -232,33 +268,17 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff) ...@@ -232,33 +268,17 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff)
nhdr->p_flags = 0; nhdr->p_flags = 0;
nhdr->p_align = 0; nhdr->p_align = 0;
/* setup ELF PT_LOAD program header for the /* setup ELF PT_LOAD program header for every area */
* virtual range 0xc0000000 -> high_memory */ for (m=kclist; m; m=m->next) {
phdr = (struct elf_phdr *) bufp;
bufp += sizeof(struct elf_phdr);
offset += sizeof(struct elf_phdr);
phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X;
phdr->p_offset = PAGE_OFFSET - KCORE_BASE + dataoff;
phdr->p_vaddr = PAGE_OFFSET;
phdr->p_paddr = __pa(PAGE_OFFSET);
phdr->p_filesz = phdr->p_memsz = ((unsigned long)high_memory - PAGE_OFFSET);
phdr->p_align = PAGE_SIZE;
/* setup ELF PT_LOAD program header for every vmalloc'd area */
for (m=vmlist; m; m=m->next) {
if (m->flags & VM_IOREMAP) /* don't dump ioremap'd stuff! (TA) */
continue;
phdr = (struct elf_phdr *) bufp; phdr = (struct elf_phdr *) bufp;
bufp += sizeof(struct elf_phdr); bufp += sizeof(struct elf_phdr);
offset += sizeof(struct elf_phdr); offset += sizeof(struct elf_phdr);
phdr->p_type = PT_LOAD; phdr->p_type = PT_LOAD;
phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_flags = PF_R|PF_W|PF_X;
phdr->p_offset = (size_t)m->addr - KCORE_BASE + dataoff; phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff;
phdr->p_vaddr = (size_t)m->addr; phdr->p_vaddr = (size_t)m->addr;
phdr->p_paddr = __pa(m->addr); phdr->p_paddr = 0;
phdr->p_filesz = phdr->p_memsz = m->size; phdr->p_filesz = phdr->p_memsz = m->size;
phdr->p_align = PAGE_SIZE; phdr->p_align = PAGE_SIZE;
} }
...@@ -294,7 +314,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff) ...@@ -294,7 +314,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff)
strcpy(prpsinfo.pr_fname, "vmlinux"); strcpy(prpsinfo.pr_fname, "vmlinux");
strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ); strncpy(prpsinfo.pr_psargs, saved_command_line, ELF_PRARGSZ);
nhdr->p_filesz = notesize(&notes[1]); nhdr->p_filesz += notesize(&notes[1]);
bufp = storenote(&notes[1], bufp); bufp = storenote(&notes[1], bufp);
/* set up the task structure */ /* set up the task structure */
...@@ -303,7 +323,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff) ...@@ -303,7 +323,7 @@ static void elf_kcore_store_hdr(char *bufp, int num_vma, int dataoff)
notes[2].datasz = sizeof(struct task_struct); notes[2].datasz = sizeof(struct task_struct);
notes[2].data = current; notes[2].data = current;
nhdr->p_filesz = notesize(&notes[2]); nhdr->p_filesz += notesize(&notes[2]);
bufp = storenote(&notes[2], bufp); bufp = storenote(&notes[2], bufp);
} /* end elf_kcore_store_hdr() */ } /* end elf_kcore_store_hdr() */
...@@ -317,13 +337,14 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t ...@@ -317,13 +337,14 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t
ssize_t acc = 0; ssize_t acc = 0;
size_t size, tsz; size_t size, tsz;
size_t elf_buflen; size_t elf_buflen;
int num_vma; int nphdr;
unsigned long start; unsigned long start;
read_lock(&vmlist_lock); read_lock(&kclist_lock);
proc_root_kcore->size = size = get_kcore_size(&num_vma, &elf_buflen); tsz = get_kcore_size(&nphdr, &elf_buflen);
proc_root_kcore->size = size = tsz + elf_buflen;
if (buflen == 0 || *fpos >= size) { if (buflen == 0 || *fpos >= size) {
read_unlock(&vmlist_lock); read_unlock(&kclist_lock);
return 0; return 0;
} }
...@@ -340,12 +361,12 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t ...@@ -340,12 +361,12 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t
tsz = buflen; tsz = buflen;
elf_buf = kmalloc(elf_buflen, GFP_ATOMIC); elf_buf = kmalloc(elf_buflen, GFP_ATOMIC);
if (!elf_buf) { if (!elf_buf) {
read_unlock(&vmlist_lock); read_unlock(&kclist_lock);
return -ENOMEM; return -ENOMEM;
} }
memset(elf_buf, 0, elf_buflen); memset(elf_buf, 0, elf_buflen);
elf_kcore_store_hdr(elf_buf, num_vma, elf_buflen); elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen);
read_unlock(&vmlist_lock); read_unlock(&kclist_lock);
if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { if (copy_to_user(buffer, elf_buf + *fpos, tsz)) {
kfree(elf_buf); kfree(elf_buf);
return -EFAULT; return -EFAULT;
...@@ -360,41 +381,30 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t ...@@ -360,41 +381,30 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t
if (buflen == 0) if (buflen == 0)
return acc; return acc;
} else } else
read_unlock(&vmlist_lock); read_unlock(&kclist_lock);
/* where page 0 not mapped, write zeros into buffer */
#if defined (__i386__) || defined (__mc68000__) || defined(__x86_64__)
if (*fpos < PAGE_SIZE + elf_buflen) {
/* work out how much to clear */
tsz = PAGE_SIZE + elf_buflen - *fpos;
if (buflen < tsz)
tsz = buflen;
/* write zeros to buffer */
if (clear_user(buffer, tsz))
return -EFAULT;
buflen -= tsz;
*fpos += tsz;
buffer += tsz;
acc += tsz;
/* leave now if filled buffer already */
if (buflen == 0)
return tsz;
}
#endif
/* /*
* Fill the remainder of the buffer from kernel VM space. * Check to see if our file offset matches with any of
* We said in the ELF header that the data which starts * the addresses in the elf_phdr on our list.
* at 'elf_buflen' is virtual address KCORE_BASE. --rmk
*/ */
start = KCORE_BASE + (*fpos - elf_buflen); start = kc_offset_to_vaddr(*fpos - elf_buflen);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen; tsz = buflen;
while (buflen) { while (buflen) {
if ((start >= VMALLOC_START) && (start < VMALLOC_END)) { struct kcore_list *m;
read_lock(&kclist_lock);
for (m=kclist; m; m=m->next) {
if (start >= m->addr && start < (m->addr+m->size))
break;
}
read_unlock(&kclist_lock);
if (m == NULL) {
if (clear_user(buffer, tsz))
return -EFAULT;
} else if ((start >= VMALLOC_START) && (start < VMALLOC_END)) {
char * elf_buf; char * elf_buf;
struct vm_struct *m; struct vm_struct *m;
unsigned long curstart = start; unsigned long curstart = start;
...@@ -439,8 +449,7 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t ...@@ -439,8 +449,7 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t
return -EFAULT; return -EFAULT;
} }
kfree(elf_buf); kfree(elf_buf);
} else if ((start > PAGE_OFFSET) && (start < } else {
(unsigned long)high_memory)) {
if (kern_addr_valid(start)) { if (kern_addr_valid(start)) {
if (copy_to_user(buffer, (char *)start, tsz)) if (copy_to_user(buffer, (char *)start, tsz))
return -EFAULT; return -EFAULT;
...@@ -448,9 +457,6 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t ...@@ -448,9 +457,6 @@ static ssize_t read_kcore(struct file *file, char *buffer, size_t buflen, loff_t
if (clear_user(buffer, tsz)) if (clear_user(buffer, tsz))
return -EFAULT; return -EFAULT;
} }
} else {
if (clear_user(buffer, tsz))
return -EFAULT;
} }
buflen -= tsz; buflen -= tsz;
*fpos += tsz; *fpos += tsz;
......
...@@ -209,6 +209,10 @@ ia64_phys_addr_valid (unsigned long addr) ...@@ -209,6 +209,10 @@ ia64_phys_addr_valid (unsigned long addr)
#define VMALLOC_VMADDR(x) ((unsigned long)(x)) #define VMALLOC_VMADDR(x) ((unsigned long)(x))
#define VMALLOC_END (0xa000000000000000 + (1UL << (4*PAGE_SHIFT - 9))) #define VMALLOC_END (0xa000000000000000 + (1UL << (4*PAGE_SHIFT - 9)))
/* fs/proc/kcore.c */
#define kc_vaddr_to_offset(v) ((v) - 0xA000000000000000)
#define kc_offset_to_vaddr(o) ((o) + 0xA000000000000000)
/* /*
* Conversion functions: convert page frame number (pfn) and a protection value to a page * Conversion functions: convert page frame number (pfn) and a protection value to a page
* table entry (pte). * table entry (pte).
......
...@@ -74,6 +74,12 @@ struct proc_dir_entry { ...@@ -74,6 +74,12 @@ struct proc_dir_entry {
kdev_t rdev; kdev_t rdev;
}; };
/*
 * One region of kernel virtual address space exported through /proc/kcore.
 * Entries are registered with kclist_add() and removed with kclist_del();
 * the list itself lives in fs/proc/kcore.c under kclist_lock.
 */
struct kcore_list {
	struct kcore_list *next;	/* singly-linked list of regions */
	unsigned long addr;		/* kernel virtual start address */
	size_t size;			/* length of the region in bytes */
};
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
extern struct proc_dir_entry proc_root; extern struct proc_dir_entry proc_root;
...@@ -179,6 +185,12 @@ static inline void proc_net_remove(const char *name) ...@@ -179,6 +185,12 @@ static inline void proc_net_remove(const char *name)
remove_proc_entry(name,proc_net); remove_proc_entry(name,proc_net);
} }
/*
* fs/proc/kcore.c
*/
extern void kclist_add(struct kcore_list *, void *, size_t);
extern struct kcore_list *kclist_del(void *);
#else #else
#define proc_root_driver NULL #define proc_root_driver NULL
...@@ -223,6 +235,8 @@ static inline struct kcore_list * kclist_del(void *addr) ...@@ -223,6 +235,8 @@ static inline struct kcore_list * kclist_del(void *addr)
return NULL; return NULL;
} }
/*
 * No-op stubs for kernels built without CONFIG_PROC_FS.
 * Fixes: the original kclist_del stub read "{return NULL};" — the
 * semicolon was outside the braces and missing after NULL, which is a
 * compile error; both stubs also left a stray ';' after the body
 * (an empty file-scope declaration).
 */
static inline void kclist_add(struct kcore_list *new, void *addr, size_t size) {}
static inline struct kcore_list *kclist_del(void *addr) { return NULL; }
#endif /* CONFIG_PROC_FS */ #endif /* CONFIG_PROC_FS */
struct proc_inode { struct proc_inode {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment