Commit 1d508f8a authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc

Pull more powerpc updates from Ben Herrenschmidt:
 "Here are some more powerpc bits for 3.17, essentially fixes.

  The biggest series, also aimed at -stable, is from Aneesh and is the
  result of weeks and weeks of debugging to find out why the heck or THP
  implementation was occasionally triggering multi-hit errors in our
  level 1 TLB.  It ended up being a combination of issues including
  subtleties as to how we should invalidate those special 'MPSS' pages
  we use to allow the use of 16M pages inside 4K/64K "base page size"
  segments (you really have to love our MMU !)

  Another interesting one in the "OMG" category is the series from
  Michael adding memory barriers to spin_is_locked().  That's also the
  result of many days of debugging to figure out why the semaphore code
  would occasionally crash in ways that made no sense.  It ended up
  being some creative lock stacking that was defeated by the fact that
  our locks allow a load inside the locked section to be re-ordered with
  the load of the lock value itself (I'm still of two mind about whether
  to kill that once and for all by putting a heavier barrier back into
  our lock implementation...).  The fixes come with a long explanation
  in the cset comments, feel free to read it if you feel like having a
  headache today"

* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc: (25 commits)
  powerpc/thp: Add tracepoints to track hugepage invalidate
  powerpc/mm: Use read barrier when creating real_pte
  powerpc/thp: Use ACCESS_ONCE when loading pmdp
  powerpc/thp: Invalidate with vpn in loop
  powerpc/thp: Handle combo pages in invalidate
  powerpc/thp: Invalidate old 64K based hash page mapping before insert of 4k pte
  powerpc/thp: Don't recompute vsid and ssize in loop on invalidate
  powerpc/thp: Add write barrier after updating the valid bit
  powerpc: reorder per-cpu NUMA information's initialization
  powerpc/perf/hv-24x7: Use kmem_cache_free
  powerpc/pseries/hvcserver: Fix endian issue in hvcs_get_partner_info
  powerpc: Hard disable interrupts in xmon
  powerpc: remove duplicate definition of TEXASR_FS
  powerpc/pseries: Avoid deadlock on removing ddw
  powerpc/pseries: Failure on removing device node
  powerpc/boot: Use correct zlib types for comparison
  powerpc/powernv: Interface to register/unregister opal dump region
  printk: Add function to return log buffer address and size
  powerpc: Add POWER8 features to CPU_FTRS_POSSIBLE/ALWAYS
  powerpc/ppc476: Disable BTAC
  ...
parents 2d0c05e1 9e813308
......@@ -112,10 +112,10 @@ int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen)
r = zlib_inflate(&state->s, Z_FULL_FLUSH);
if (r != Z_OK && r != Z_STREAM_END)
fatal("inflate returned %d msg: %s\n\r", r, state->s.msg);
len = state->s.next_out - (unsigned char *)dst;
len = state->s.next_out - (Byte *)dst;
} else {
/* uncompressed image */
len = min(state->s.avail_in, (unsigned)dstlen);
len = min(state->s.avail_in, (uLong)dstlen);
memcpy(dst, state->s.next_in, len);
state->s.next_in += len;
state->s.avail_in -= len;
......
......@@ -459,7 +459,8 @@ extern const char *powerpc_base_platform;
#define CPU_FTRS_POSSIBLE \
(CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | CPU_FTR_VSX)
CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
CPU_FTRS_PA6T | CPU_FTR_VSX)
#endif
#else
enum {
......@@ -509,7 +510,8 @@ enum {
#define CPU_FTRS_ALWAYS \
(CPU_FTRS_POWER4 & CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & \
CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \
CPU_FTRS_PA6T & CPU_FTRS_POSSIBLE)
CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \
CPU_FTRS_POWER8_DD1 & CPU_FTRS_POSSIBLE)
#endif
#else
enum {
......
......@@ -57,10 +57,10 @@ struct machdep_calls {
void (*hpte_removebolted)(unsigned long ea,
int psize, int ssize);
void (*flush_hash_range)(unsigned long number, int local);
void (*hugepage_invalidate)(struct mm_struct *mm,
void (*hugepage_invalidate)(unsigned long vsid,
unsigned long addr,
unsigned char *hpte_slot_array,
unsigned long addr, int psize);
int psize, int ssize);
/* special for kexec, to be called in real mode, linear mapping is
* destroyed as well */
void (*hpte_clear_all)(void);
......
......@@ -149,6 +149,8 @@ struct opal_sg_list {
#define OPAL_DUMP_INFO2 94
#define OPAL_PCI_EEH_FREEZE_SET 97
#define OPAL_HANDLE_HMI 98
#define OPAL_REGISTER_DUMP_REGION 101
#define OPAL_UNREGISTER_DUMP_REGION 102
#ifndef __ASSEMBLY__
......@@ -920,6 +922,8 @@ int64_t opal_set_param(uint64_t token, uint32_t param_id, uint64_t buffer,
uint64_t length);
int64_t opal_sensor_read(uint32_t sensor_hndl, int token, __be32 *sensor_data);
int64_t opal_handle_hmi(void);
int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
int64_t opal_unregister_dump_region(uint32_t id);
/* Internal functions */
extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
......@@ -974,6 +978,13 @@ struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
unsigned long vmalloc_size);
void opal_free_sg_list(struct opal_sg_list *sg);
/*
* Dump region ID range usable by the OS
*/
#define OPAL_DUMP_REGION_HOST_START 0x80
#define OPAL_DUMP_REGION_LOG_BUF 0x80
#define OPAL_DUMP_REGION_HOST_END 0xFF
#endif /* __ASSEMBLY__ */
#endif /* __OPAL_H */
......@@ -413,7 +413,7 @@ static inline char *get_hpte_slot_array(pmd_t *pmdp)
}
extern void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp);
pmd_t *pmdp, unsigned long old_pmd);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
......
......@@ -46,11 +46,31 @@
* in order to deal with 64K made of 4K HW pages. Thus we override the
* generic accessors and iterators here
*/
#define __real_pte(e,p) ((real_pte_t) { \
(e), (pte_val(e) & _PAGE_COMBO) ? \
(pte_val(*((p) + PTRS_PER_PTE))) : 0 })
#define __rpte_to_hidx(r,index) ((pte_val((r).pte) & _PAGE_COMBO) ? \
(((r).hidx >> ((index)<<2)) & 0xf) : ((pte_val((r).pte) >> 12) & 0xf))
#define __real_pte __real_pte
static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
{
real_pte_t rpte;
rpte.pte = pte;
rpte.hidx = 0;
if (pte_val(pte) & _PAGE_COMBO) {
/*
* Make sure we order the hidx load against the _PAGE_COMBO
* check. The store side ordering is done in __hash_page_4K
*/
smp_rmb();
rpte.hidx = pte_val(*((ptep) + PTRS_PER_PTE));
}
return rpte;
}
static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
{
if ((pte_val(rpte.pte) & _PAGE_COMBO))
return (rpte.hidx >> (index<<2)) & 0xf;
return (pte_val(rpte.pte) >> 12) & 0xf;
}
#define __rpte_to_pte(r) ((r).pte)
#define __rpte_sub_valid(rpte, index) \
(pte_val(rpte.pte) & (_PAGE_HPTE_SUB0 >> (index)))
......
......@@ -213,7 +213,6 @@
#define SPRN_ACOP 0x1F /* Available Coprocessor Register */
#define SPRN_TFIAR 0x81 /* Transaction Failure Inst Addr */
#define SPRN_TEXASR 0x82 /* Transaction EXception & Summary */
#define TEXASR_FS __MASK(63-36) /* Transaction Failure Summary */
#define SPRN_TEXASRU 0x83 /* '' '' '' Upper 32 */
#define TEXASR_FS __MASK(63-36) /* TEXASR Failure Summary */
#define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */
......
......@@ -61,6 +61,7 @@ static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
smp_mb();
return !arch_spin_value_unlocked(*lock);
}
......
......@@ -592,61 +592,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
MASKABLE_EXCEPTION_HV_OOL(0xe62, hmi_exception)
KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
.globl hmi_exception_early
hmi_exception_early:
EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
mr r10,r1 /* Save r1 */
ld r1,PACAEMERGSP(r13) /* Use emergency stack */
subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
std r9,_CCR(r1) /* save CR in stackframe */
mfspr r11,SPRN_HSRR0 /* Save HSRR0 */
std r11,_NIP(r1) /* save HSRR0 in stackframe */
mfspr r12,SPRN_HSRR1 /* Save SRR1 */
std r12,_MSR(r1) /* save SRR1 in stackframe */
std r10,0(r1) /* make stack chain pointer */
std r0,GPR0(r1) /* save r0 in stackframe */
std r10,GPR1(r1) /* save r1 in stackframe */
EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
EXCEPTION_PROLOG_COMMON_3(0xe60)
addi r3,r1,STACK_FRAME_OVERHEAD
bl hmi_exception_realmode
/* Windup the stack. */
/* Clear MSR_RI before setting SRR0 and SRR1. */
li r0,MSR_RI
mfmsr r9 /* get MSR value */
andc r9,r9,r0
mtmsrd r9,1 /* Clear MSR_RI */
/* Move original HSRR0 and HSRR1 into the respective regs */
ld r9,_MSR(r1)
mtspr SPRN_HSRR1,r9
ld r3,_NIP(r1)
mtspr SPRN_HSRR0,r3
ld r9,_CTR(r1)
mtctr r9
ld r9,_XER(r1)
mtxer r9
ld r9,_LINK(r1)
mtlr r9
REST_GPR(0, r1)
REST_8GPRS(2, r1)
REST_GPR(10, r1)
ld r11,_CCR(r1)
mtcr r11
REST_GPR(11, r1)
REST_2GPRS(12, r1)
/* restore original r1. */
ld r1,GPR1(r1)
/*
* Go to virtual mode and pull the HMI event information from
* firmware.
*/
.globl hmi_exception_after_realmode
hmi_exception_after_realmode:
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXGEN)
b hmi_exception_hv
MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
......@@ -1306,6 +1251,61 @@ fwnmi_data_area:
. = 0x8000
#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
.globl hmi_exception_early
hmi_exception_early:
EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
mr r10,r1 /* Save r1 */
ld r1,PACAEMERGSP(r13) /* Use emergency stack */
subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
std r9,_CCR(r1) /* save CR in stackframe */
mfspr r11,SPRN_HSRR0 /* Save HSRR0 */
std r11,_NIP(r1) /* save HSRR0 in stackframe */
mfspr r12,SPRN_HSRR1 /* Save SRR1 */
std r12,_MSR(r1) /* save SRR1 in stackframe */
std r10,0(r1) /* make stack chain pointer */
std r0,GPR0(r1) /* save r0 in stackframe */
std r10,GPR1(r1) /* save r1 in stackframe */
EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
EXCEPTION_PROLOG_COMMON_3(0xe60)
addi r3,r1,STACK_FRAME_OVERHEAD
bl hmi_exception_realmode
/* Windup the stack. */
/* Clear MSR_RI before setting SRR0 and SRR1. */
li r0,MSR_RI
mfmsr r9 /* get MSR value */
andc r9,r9,r0
mtmsrd r9,1 /* Clear MSR_RI */
/* Move original HSRR0 and HSRR1 into the respective regs */
ld r9,_MSR(r1)
mtspr SPRN_HSRR1,r9
ld r3,_NIP(r1)
mtspr SPRN_HSRR0,r3
ld r9,_CTR(r1)
mtctr r9
ld r9,_XER(r1)
mtxer r9
ld r9,_LINK(r1)
mtlr r9
REST_GPR(0, r1)
REST_8GPRS(2, r1)
REST_GPR(10, r1)
ld r11,_CCR(r1)
mtcr r11
REST_GPR(11, r1)
REST_2GPRS(12, r1)
/* restore original r1. */
ld r1,GPR1(r1)
/*
* Go to virtual mode and pull the HMI event information from
* firmware.
*/
.globl hmi_exception_after_realmode
hmi_exception_after_realmode:
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXGEN)
b hmi_exception_hv
#ifdef CONFIG_PPC_POWERNV
_GLOBAL(opal_mc_secondary_handler)
HMT_MEDIUM_PPR_DISCARD
......
......@@ -1210,10 +1210,12 @@ clear_utlb_entry:
/* We configure icbi to invalidate 128 bytes at a time since the
* current 32-bit kernel code isn't too happy with icache != dcache
* block size
* block size. We also disable the BTAC as this can cause errors
* in some circumstances (see IBM Erratum 47).
*/
mfspr r3,SPRN_CCR0
oris r3,r3,0x0020
ori r3,r3,0x0040
mtspr SPRN_CCR0,r3
isync
......
......@@ -1120,37 +1120,41 @@ EXPORT_SYMBOL_GPL(iommu_release_ownership);
int iommu_add_device(struct device *dev)
{
struct iommu_table *tbl;
int ret = 0;
if (WARN_ON(dev->iommu_group)) {
pr_warn("iommu_tce: device %s is already in iommu group %d, skipping\n",
dev_name(dev),
/*
* The sysfs entries should be populated before
* binding IOMMU group. If sysfs entries isn't
* ready, we simply bail.
*/
if (!device_is_registered(dev))
return -ENOENT;
if (dev->iommu_group) {
pr_debug("%s: Skipping device %s with iommu group %d\n",
__func__, dev_name(dev),
iommu_group_id(dev->iommu_group));
return -EBUSY;
}
tbl = get_iommu_table_base(dev);
if (!tbl || !tbl->it_group) {
pr_debug("iommu_tce: skipping device %s with no tbl\n",
dev_name(dev));
pr_debug("%s: Skipping device %s with no tbl\n",
__func__, dev_name(dev));
return 0;
}
pr_debug("iommu_tce: adding %s to iommu group %d\n",
dev_name(dev), iommu_group_id(tbl->it_group));
pr_debug("%s: Adding %s to iommu group %d\n",
__func__, dev_name(dev),
iommu_group_id(tbl->it_group));
if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
pr_err("iommu_tce: unsupported iommu page size.");
pr_err("%s has not been added\n", dev_name(dev));
pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
__func__, IOMMU_PAGE_SIZE(tbl),
PAGE_SIZE, dev_name(dev));
return -EINVAL;
}
ret = iommu_group_add_device(tbl->it_group, dev);
if (ret < 0)
pr_err("iommu_tce: %s has not been added, ret=%d\n",
dev_name(dev), ret);
return ret;
return iommu_group_add_device(tbl->it_group, dev);
}
EXPORT_SYMBOL_GPL(iommu_add_device);
......
......@@ -376,6 +376,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
/*
* numa_node_id() works after this.
*/
set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
set_cpu_numa_mem(cpu, local_memory_node(numa_cpu_lookup_table[cpu]));
}
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
......@@ -723,12 +728,6 @@ void start_secondary(void *unused)
}
traverse_core_siblings(cpu, true);
/*
* numa_node_id() works after this.
*/
set_numa_node(numa_cpu_lookup_table[cpu]);
set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
smp_wmb();
notify_cpu_starting(cpu);
set_cpu_online(cpu, true);
......
......@@ -70,12 +70,16 @@ void __rw_yield(arch_rwlock_t *rw)
void arch_spin_unlock_wait(arch_spinlock_t *lock)
{
smp_mb();
while (lock->slock) {
HMT_low();
if (SHARED_PROCESSOR)
__spin_yield(lock);
}
HMT_medium();
smp_mb();
}
EXPORT_SYMBOL(arch_spin_unlock_wait);
......@@ -412,18 +412,18 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
local_irq_restore(flags);
}
static void native_hugepage_invalidate(struct mm_struct *mm,
static void native_hugepage_invalidate(unsigned long vsid,
unsigned long addr,
unsigned char *hpte_slot_array,
unsigned long addr, int psize)
int psize, int ssize)
{
int ssize = 0, i;
int lock_tlbie;
int i;
struct hash_pte *hptep;
int actual_psize = MMU_PAGE_16M;
unsigned int max_hpte_count, valid;
unsigned long flags, s_addr = addr;
unsigned long hpte_v, want_v, shift;
unsigned long hidx, vpn = 0, vsid, hash, slot;
unsigned long hidx, vpn = 0, hash, slot;
shift = mmu_psize_defs[psize].shift;
max_hpte_count = 1U << (PMD_SHIFT - shift);
......@@ -437,15 +437,6 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
......@@ -465,22 +456,13 @@ static void native_hugepage_invalidate(struct mm_struct *mm,
else
/* Invalidate the hpte. NOTE: this also unlocks it */
hptep->v = 0;
}
/*
* Since this is a hugepage, we just need a single tlbie.
* use the last vpn.
* We need to do tlb invalidate for all the address, tlbie
* instruction compares entry_VA in tlb with the VA specified
* here
*/
lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
if (lock_tlbie)
raw_spin_lock(&native_tlbie_lock);
asm volatile("ptesync":::"memory");
__tlbie(vpn, psize, actual_psize, ssize);
asm volatile("eieio; tlbsync; ptesync":::"memory");
if (lock_tlbie)
raw_spin_unlock(&native_tlbie_lock);
tlbie(vpn, psize, actual_psize, ssize, 0);
}
local_irq_restore(flags);
}
......
......@@ -18,6 +18,57 @@
#include <linux/mm.h>
#include <asm/machdep.h>
static void invalidate_old_hpte(unsigned long vsid, unsigned long addr,
pmd_t *pmdp, unsigned int psize, int ssize)
{
int i, max_hpte_count, valid;
unsigned long s_addr;
unsigned char *hpte_slot_array;
unsigned long hidx, shift, vpn, hash, slot;
s_addr = addr & HPAGE_PMD_MASK;
hpte_slot_array = get_hpte_slot_array(pmdp);
/*
* IF we try to do a HUGE PTE update after a withdraw is done.
* we will find the below NULL. This happens when we do
* split_huge_page_pmd
*/
if (!hpte_slot_array)
return;
if (ppc_md.hugepage_invalidate)
return ppc_md.hugepage_invalidate(vsid, s_addr, hpte_slot_array,
psize, ssize);
/*
* No bluk hpte removal support, invalidate each entry
*/
shift = mmu_psize_defs[psize].shift;
max_hpte_count = HPAGE_PMD_SIZE >> shift;
for (i = 0; i < max_hpte_count; i++) {
/*
* 8 bits per each hpte entries
* 000| [ secondary group (one bit) | hidx (3 bits) | valid bit]
*/
valid = hpte_valid(hpte_slot_array, i);
if (!valid)
continue;
hidx = hpte_hash_index(hpte_slot_array, i);
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += hidx & _PTEIDX_GROUP_IX;
ppc_md.hpte_invalidate(slot, vpn, psize,
MMU_PAGE_16M, ssize, 0);
}
}
int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
pmd_t *pmdp, unsigned long trap, int local, int ssize,
unsigned int psize)
......@@ -33,7 +84,9 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
* atomically mark the linux large page PMD busy and dirty
*/
do {
old_pmd = pmd_val(*pmdp);
pmd_t pmd = ACCESS_ONCE(*pmdp);
old_pmd = pmd_val(pmd);
/* If PMD busy, retry the access */
if (unlikely(old_pmd & _PAGE_BUSY))
return 0;
......@@ -85,6 +138,15 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
vpn = hpt_vpn(ea, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
hpte_slot_array = get_hpte_slot_array(pmdp);
if (psize == MMU_PAGE_4K) {
/*
* invalidate the old hpte entry if we have that mapped via 64K
* base page size. This is because demote_segment won't flush
* hash page table entries.
*/
if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO))
invalidate_old_hpte(vsid, ea, pmdp, MMU_PAGE_64K, ssize);
}
valid = hpte_valid(hpte_slot_array, index);
if (valid) {
......@@ -107,11 +169,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
* safely update this here.
*/
valid = 0;
new_pmd &= ~_PAGE_HPTEFLAGS;
hpte_slot_array[index] = 0;
} else
/* clear the busy bits and set the hash pte bits */
new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
}
}
if (!valid) {
......@@ -119,11 +178,7 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
/* insert new entry */
pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT;
repeat:
hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
/* clear the busy bits and set the hash pte bits */
new_pmd = (new_pmd & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
new_pmd |= _PAGE_HASHPTE;
/* Add in WIMG bits */
rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
......@@ -132,6 +187,8 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
* enable the memory coherence always
*/
rflags |= HPTE_R_M;
repeat:
hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
/* Insert into the hash table, primary slot */
slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
......@@ -172,8 +229,17 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
mark_hpte_slot_valid(hpte_slot_array, index, slot);
}
/*
* No need to use ldarx/stdcx here
* Mark the pte with _PAGE_COMBO, if we are trying to hash it with
* base page size 4k.
*/
if (psize == MMU_PAGE_4K)
new_pmd |= _PAGE_COMBO;
/*
* The hpte valid is stored in the pgtable whose address is in the
* second half of the PMD. Order this against clearing of the busy bit in
* huge pmd.
*/
smp_wmb();
*pmdp = __pmd(new_pmd & ~_PAGE_BUSY);
return 0;
}
......@@ -1049,7 +1049,7 @@ static void __init mark_reserved_regions_for_nid(int nid)
void __init do_init_bootmem(void)
{
int nid;
int nid, cpu;
min_low_pfn = 0;
max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
......@@ -1122,8 +1122,15 @@ void __init do_init_bootmem(void)
reset_numa_cpu_lookup_table();
register_cpu_notifier(&ppc64_numa_nb);
/*
* We need the numa_cpu_lookup_table to be accurate for all CPUs,
* even before we online them, so that we can use cpu_to_{node,mem}
* early in boot, cf. smp_prepare_cpus().
*/
for_each_possible_cpu(cpu) {
cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
(void *)(unsigned long)boot_cpuid);
(void *)(unsigned long)cpu);
}
}
void __init paging_init(void)
......
......@@ -54,6 +54,9 @@
#include "mmu_decl.h"
#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>
/* Some sanity checking */
#if TASK_SIZE_USER64 > PGTABLE_RANGE
#error TASK_SIZE_USER64 exceeds pagetable range
......@@ -537,8 +540,9 @@ unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
old = pmd_val(*pmdp);
*pmdp = __pmd((old & ~clr) | set);
#endif
trace_hugepage_update(addr, old, clr, set);
if (old & _PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp);
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
}
......@@ -642,10 +646,11 @@ void pmdp_splitting_flush(struct vm_area_struct *vma,
* If we didn't had the splitting flag set, go and flush the
* HPTE entries.
*/
trace_hugepage_splitting(address, old);
if (!(old & _PAGE_SPLITTING)) {
/* We need to flush the hpte */
if (old & _PAGE_HASHPTE)
hpte_do_hugepage_flush(vma->vm_mm, address, pmdp);
hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old);
}
/*
* This ensures that generic code that rely on IRQ disabling
......@@ -709,6 +714,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
assert_spin_locked(&mm->page_table_lock);
WARN_ON(!pmd_trans_huge(pmd));
#endif
trace_hugepage_set_pmd(addr, pmd);
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
}
......@@ -723,7 +729,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
* neesd to be flushed.
*/
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp)
pmd_t *pmdp, unsigned long old_pmd)
{
int ssize, i;
unsigned long s_addr;
......@@ -745,12 +751,29 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
if (!hpte_slot_array)
return;
/* get the base page size */
/* get the base page size,vsid and segment size */
#ifdef CONFIG_DEBUG_VM
psize = get_slice_psize(mm, s_addr);
BUG_ON(psize == MMU_PAGE_16M);
#endif
if (old_pmd & _PAGE_COMBO)
psize = MMU_PAGE_4K;
else
psize = MMU_PAGE_64K;
if (!is_kernel_addr(s_addr)) {
ssize = user_segment_size(s_addr);
vsid = get_vsid(mm->context.id, s_addr, ssize);
WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(s_addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
if (ppc_md.hugepage_invalidate)
return ppc_md.hugepage_invalidate(mm, hpte_slot_array,
s_addr, psize);
return ppc_md.hugepage_invalidate(vsid, s_addr,
hpte_slot_array,
psize, ssize);
/*
* No bluk hpte removal support, invalidate each entry
*/
......@@ -768,15 +791,6 @@ void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
......
......@@ -30,6 +30,8 @@
#include <asm/tlb.h>
#include <asm/bug.h>
#include <trace/events/thp.h>
DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
/*
......@@ -213,10 +215,12 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
if (ptep == NULL)
continue;
pte = pte_val(*ptep);
if (hugepage_shift)
trace_hugepage_invalidate(start, pte_val(pte));
if (!(pte & _PAGE_HASHPTE))
continue;
if (unlikely(hugepage_shift && pmd_trans_huge(*(pmd_t *)pte)))
hpte_do_hugepage_flush(mm, start, (pmd_t *)pte);
hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte);
else
hpte_need_flush(mm, start, ptep, pte, 0);
}
......
......@@ -581,42 +581,10 @@ static void setup_mmu_htw(void)
/*
* Early initialization of the MMU TLB code
*/
static void __early_init_mmu(int boot_cpu)
static void early_init_this_mmu(void)
{
unsigned int mas4;
/* XXX This will have to be decided at runtime, but right
* now our boot and TLB miss code hard wires it. Ideally
* we should find out a suitable page size and patch the
* TLB miss code (either that or use the PACA to store
* the value we want)
*/
mmu_linear_psize = MMU_PAGE_1G;
/* XXX This should be decided at runtime based on supported
* page sizes in the TLB, but for now let's assume 16M is
* always there and a good fit (which it probably is)
*
* Freescale booke only supports 4K pages in TLB0, so use that.
*/
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
mmu_vmemmap_psize = MMU_PAGE_4K;
else
mmu_vmemmap_psize = MMU_PAGE_16M;
/* XXX This code only checks for TLB 0 capabilities and doesn't
* check what page size combos are supported by the HW. It
* also doesn't handle the case where a separate array holds
* the IND entries from the array loaded by the PT.
*/
if (boot_cpu) {
/* Look for supported page sizes */
setup_page_sizes();
/* Look for HW tablewalk support */
setup_mmu_htw();
}
/* Set MAS4 based on page table setting */
mas4 = 0x4 << MAS4_WIMGED_SHIFT;
......@@ -650,11 +618,6 @@ static void __early_init_mmu(int boot_cpu)
}
mtspr(SPRN_MAS4, mas4);
/* Set the global containing the top of the linear mapping
* for use by the TLB miss code
*/
linear_map_top = memblock_end_of_DRAM();
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
unsigned int num_cams;
......@@ -662,10 +625,49 @@ static void __early_init_mmu(int boot_cpu)
/* use a quarter of the TLBCAM for bolted linear map */
num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
linear_map_top = map_mem_in_cams(linear_map_top, num_cams);
}
#endif
/* limit memory so we dont have linear faults */
memblock_enforce_memory_limit(linear_map_top);
/* A sync won't hurt us after mucking around with
* the MMU configuration
*/
mb();
}
static void __init early_init_mmu_global(void)
{
/* XXX This will have to be decided at runtime, but right
* now our boot and TLB miss code hard wires it. Ideally
* we should find out a suitable page size and patch the
* TLB miss code (either that or use the PACA to store
* the value we want)
*/
mmu_linear_psize = MMU_PAGE_1G;
/* XXX This should be decided at runtime based on supported
* page sizes in the TLB, but for now let's assume 16M is
* always there and a good fit (which it probably is)
*
* Freescale booke only supports 4K pages in TLB0, so use that.
*/
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E))
mmu_vmemmap_psize = MMU_PAGE_4K;
else
mmu_vmemmap_psize = MMU_PAGE_16M;
/* XXX This code only checks for TLB 0 capabilities and doesn't
* check what page size combos are supported by the HW. It
* also doesn't handle the case where a separate array holds
* the IND entries from the array loaded by the PT.
*/
/* Look for supported page sizes */
setup_page_sizes();
/* Look for HW tablewalk support */
setup_mmu_htw();
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
if (book3e_htw_mode == PPC_HTW_NONE) {
extlb_level_exc = EX_TLB_SIZE;
patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
......@@ -675,22 +677,41 @@ static void __early_init_mmu(int boot_cpu)
}
#endif
/* A sync won't hurt us after mucking around with
* the MMU configuration
/* Set the global containing the top of the linear mapping
* for use by the TLB miss code
*/
mb();
linear_map_top = memblock_end_of_DRAM();
}
static void __init early_mmu_set_memory_limit(void)
{
#ifdef CONFIG_PPC_FSL_BOOK3E
if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
/*
* Limit memory so we dont have linear faults.
* Unlike memblock_set_current_limit, which limits
* memory available during early boot, this permanently
* reduces the memory available to Linux. We need to
* do this because highmem is not supported on 64-bit.
*/
memblock_enforce_memory_limit(linear_map_top);
}
#endif
memblock_set_current_limit(linear_map_top);
}
/* boot cpu only */
void __init early_init_mmu(void)
{
__early_init_mmu(1);
early_init_mmu_global();
early_init_this_mmu();
early_mmu_set_memory_limit();
}
void early_init_mmu_secondary(void)
{
__early_init_mmu(0);
early_init_this_mmu();
}
void setup_initial_memory_limit(phys_addr_t first_memblock_base,
......
......@@ -223,7 +223,7 @@ static ssize_t catalog_read(struct file *filp, struct kobject *kobj,
pr_err("h_get_24x7_catalog_page(ver=%lld, page=%lld) failed:"
" rc=%ld\n",
catalog_version_num, page_offset, hret);
kfree(page);
kmem_cache_free(hv_page_cache, page);
pr_devel("catalog_read: offset=%lld(%lld) count=%zu(%zu) catalog_len=%zu(%zu) => %zd\n",
offset, page_offset, count, page_count, catalog_len,
......
......@@ -245,3 +245,5 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
......@@ -605,6 +605,24 @@ static int opal_sysfs_init(void)
return 0;
}
static void __init opal_dump_region_init(void)
{
void *addr;
uint64_t size;
int rc;
/* Register kernel log buffer */
addr = log_buf_addr_get();
size = log_buf_len_get();
rc = opal_register_dump_region(OPAL_DUMP_REGION_LOG_BUF,
__pa(addr), size);
/* Don't warn if this is just an older OPAL that doesn't
* know about that call
*/
if (rc && rc != OPAL_UNSUPPORTED)
pr_warn("DUMP: Failed to register kernel log buffer. "
"rc = %d\n", rc);
}
static int __init opal_init(void)
{
struct device_node *np, *consoles;
......@@ -654,6 +672,8 @@ static int __init opal_init(void)
/* Create "opal" kobject under /sys/firmware */
rc = opal_sysfs_init();
if (rc == 0) {
/* Setup dump region interface */
opal_dump_region_init();
/* Setup error log interface */
rc = opal_elog_init();
/* Setup code update interface */
......@@ -694,6 +714,9 @@ void opal_shutdown(void)
else
mdelay(10);
}
/* Unregister memory dump region */
opal_unregister_dump_region(OPAL_DUMP_REGION_LOG_BUF);
}
/* Export this so that test modules can use it */
......
......@@ -857,7 +857,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
pe = &phb->ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
set_iommu_table_base(&pdev->dev, &pe->tce32_table);
set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
}
static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
......
......@@ -146,7 +146,7 @@ static inline int pseries_remove_memblock(unsigned long base,
}
static inline int pseries_remove_mem_node(struct device_node *np)
{
return -EOPNOTSUPP;
return 0;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
......
......@@ -163,8 +163,8 @@ int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head,
return retval;
}
last_p_partition_ID = pi_buff[0];
last_p_unit_address = pi_buff[1];
last_p_partition_ID = be64_to_cpu(pi_buff[0]);
last_p_unit_address = be64_to_cpu(pi_buff[1]);
/* This indicates that there are no further partners */
if (last_p_partition_ID == ~0UL
......
......@@ -721,13 +721,13 @@ static int __init disable_ddw_setup(char *str)
early_param("disable_ddw", disable_ddw_setup);
static void remove_ddw(struct device_node *np)
static void remove_ddw(struct device_node *np, bool remove_prop)
{
struct dynamic_dma_window_prop *dwp;
struct property *win64;
const u32 *ddw_avail;
u64 liobn;
int len, ret;
int len, ret = 0;
ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len);
win64 = of_find_property(np, DIRECT64_PROPNAME, NULL);
......@@ -761,6 +761,7 @@ static void remove_ddw(struct device_node *np)
np->full_name, ret, ddw_avail[2], liobn);
delprop:
if (remove_prop)
ret = of_remove_property(np, win64);
if (ret)
pr_warning("%s: failed to remove direct window property: %d\n",
......@@ -805,7 +806,7 @@ static int find_existing_ddw_windows(void)
window = kzalloc(sizeof(*window), GFP_KERNEL);
if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
kfree(window);
remove_ddw(pdn);
remove_ddw(pdn, true);
continue;
}
......@@ -1045,7 +1046,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
kfree(window);
out_clear_window:
remove_ddw(pdn);
remove_ddw(pdn, true);
out_free_prop:
kfree(win64->name);
......@@ -1255,7 +1256,14 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
switch (action) {
case OF_RECONFIG_DETACH_NODE:
remove_ddw(np);
/*
* Removing the property will invoke the reconfig
* notifier again, which causes dead-lock on the
* read-write semaphore of the notifier chain. So
* we have to remove the property when releasing
* the device node.
*/
remove_ddw(np, false);
if (pci && pci->iommu_table)
iommu_free_table(pci->iommu_table, np->full_name);
......
......@@ -431,16 +431,17 @@ static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
}
static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
unsigned long addr,
unsigned char *hpte_slot_array,
unsigned long addr, int psize)
int psize, int ssize)
{
int ssize = 0, i, index = 0;
int i, index = 0;
unsigned long s_addr = addr;
unsigned int max_hpte_count, valid;
unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
unsigned long shift, hidx, vpn = 0, vsid, hash, slot;
unsigned long shift, hidx, vpn = 0, hash, slot;
shift = mmu_psize_defs[psize].shift;
max_hpte_count = 1U << (PMD_SHIFT - shift);
......@@ -453,15 +454,6 @@ static void pSeries_lpar_hugepage_invalidate(struct mm_struct *mm,
/* get the vpn */
addr = s_addr + (i * (1ul << shift));
if (!is_kernel_addr(addr)) {
ssize = user_segment_size(addr);
vsid = get_vsid(mm->context.id, addr, ssize);
WARN_ON(vsid == 0);
} else {
vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
ssize = mmu_kernel_ssize;
}
vpn = hpt_vpn(addr, vsid, ssize);
hash = hpt_hash(vpn, shift, ssize);
if (hidx & _PTEIDX_SECONDARY)
......
......@@ -24,6 +24,7 @@
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <asm/ptrace.h>
#include <asm/string.h>
......@@ -374,6 +375,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
#endif
local_irq_save(flags);
hard_irq_disable();
bp = in_breakpoint_table(regs->nip, &offset);
if (bp != NULL) {
......@@ -558,6 +560,7 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
#endif
insert_cpu_bpts();
touch_nmi_watchdog();
local_irq_restore(flags);
return cmd != 'X' && cmd != EOF;
......
......@@ -10,6 +10,9 @@
extern const char linux_banner[];
extern const char linux_proc_banner[];
extern char *log_buf_addr_get(void);
extern u32 log_buf_len_get(void);
static inline int printk_get_level(const char *buffer)
{
if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
......
#undef TRACE_SYSTEM
#define TRACE_SYSTEM thp
#if !defined(_TRACE_THP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_THP_H
#include <linux/types.h>
#include <linux/tracepoint.h>
TRACE_EVENT(hugepage_invalidate,
TP_PROTO(unsigned long addr, unsigned long pte),
TP_ARGS(addr, pte),
TP_STRUCT__entry(
__field(unsigned long, addr)
__field(unsigned long, pte)
),
TP_fast_assign(
__entry->addr = addr;
__entry->pte = pte;
),
TP_printk("hugepage invalidate at addr 0x%lx and pte = 0x%lx",
__entry->addr, __entry->pte)
);
TRACE_EVENT(hugepage_set_pmd,
TP_PROTO(unsigned long addr, unsigned long pmd),
TP_ARGS(addr, pmd),
TP_STRUCT__entry(
__field(unsigned long, addr)
__field(unsigned long, pmd)
),
TP_fast_assign(
__entry->addr = addr;
__entry->pmd = pmd;
),
TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd)
);
TRACE_EVENT(hugepage_update,
TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set),
TP_ARGS(addr, pte, clr, set),
TP_STRUCT__entry(
__field(unsigned long, addr)
__field(unsigned long, pte)
__field(unsigned long, clr)
__field(unsigned long, set)
),
TP_fast_assign(
__entry->addr = addr;
__entry->pte = pte;
__entry->clr = clr;
__entry->set = set;
),
TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
);
TRACE_EVENT(hugepage_splitting,
TP_PROTO(unsigned long addr, unsigned long pte),
TP_ARGS(addr, pte),
TP_STRUCT__entry(
__field(unsigned long, addr)
__field(unsigned long, pte)
),
TP_fast_assign(
__entry->addr = addr;
__entry->pte = pte;
),
TP_printk("hugepage splitting at addr 0x%lx and pte = 0x%lx",
__entry->addr, __entry->pte)
);
#endif /* _TRACE_THP_H */
/* This part must be outside protection */
#include <trace/define_trace.h>
......@@ -272,6 +272,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
/* Return log buffer address */
char *log_buf_addr_get(void)
{
return log_buf;
}
/* Return log buffer size */
u32 log_buf_len_get(void)
{
return log_buf_len;
}
/* human readable text of the record */
static char *log_text(const struct printk_log *msg)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment