Commit 9f36f9c8 authored by Dave Airlie

Merge tag 'drm-amdkfd-next-2018-03-27' of git://people.freedesktop.org/~gabbayo/linux into drm-next

- GPUVM support for dGPUs
- KFD events support for dGPUs
- Fix live-lock situation when restoring multiple evicted processes
- Fix VM page table allocation on large-bar systems
- Fix for build failure on frv architecture

* tag 'drm-amdkfd-next-2018-03-27' of git://people.freedesktop.org/~gabbayo/linux:
  drm/amdkfd: Use ordered workqueue to restore processes
  drm/amdgpu: Fix acquiring VM on large-BAR systems
  drm/amdkfd: Add module option for testing large-BAR functionality
  drm/amdkfd: Kmap event page for dGPUs
  drm/amdkfd: Add ioctls for GPUVM memory management
  drm/amdkfd: Add TC flush on VMID deallocation for Hawaii
  drm/amdkfd: Allocate CWSR trap handler memory for dGPUs
  drm/amdkfd: Add per-process IDR for buffer handles
  drm/amdkfd: Aperture setup for dGPUs
  drm/amdkfd: Remove limit on number of GPUs
  drm/amdkfd: Populate DRM render device minor
  drm/amdkfd: Create KFD VMs on demand
  drm/amdgpu: Add kfd2kgd interface to acquire an existing VM
  drm/amdgpu: Add helper to turn an existing VM into a compute VM
  drm/amdgpu: Fix initial validation of PD BO for KFD VMs
  drm/amdgpu: Move KFD-specific fields into struct amdgpu_vm
  drm/amdkfd: fix uninitialized variable use
  drm/amdkfd: add missing include of mm.h
parents cb17aa52 1679ae8f
@@ -26,6 +26,7 @@
 #define AMDGPU_AMDKFD_H_INCLUDED
 #include <linux/types.h>
+#include <linux/mm.h>
 #include <linux/mmu_context.h>
 #include <kgd_kfd_interface.h>
 #include <drm/ttm/ttm_execbuf_util.h>
@@ -92,27 +93,6 @@ struct amdkfd_process_info {
 	struct amdgpu_amdkfd_fence *eviction_fence;
 };
-/* struct amdkfd_vm -
- * For Memory Eviction KGD requires a mechanism to keep track of all KFD BOs
- * belonging to a KFD process. All the VMs belonging to the same process point
- * to the same amdkfd_process_info.
- */
-struct amdkfd_vm {
-	/* Keep base as the first parameter for pointer compatibility between
-	 * amdkfd_vm and amdgpu_vm.
-	 */
-	struct amdgpu_vm base;
-	/* List node in amdkfd_process_info.vm_list_head */
-	struct list_head vm_list_node;
-	struct amdgpu_device *adev;
-	/* Points to the KFD process VM info */
-	struct amdkfd_process_info *process_info;
-	uint64_t pd_phys_addr;
-};
 int amdgpu_amdkfd_init(void);
 void amdgpu_amdkfd_fini(void);
@@ -165,6 +145,12 @@ uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd);
 int amdgpu_amdkfd_gpuvm_create_process_vm(struct kgd_dev *kgd, void **vm,
 					  void **process_info,
 					  struct dma_fence **ef);
+int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct kgd_dev *kgd,
+					   struct file *filp,
+					   void **vm, void **process_info,
+					   struct dma_fence **ef);
+void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
+				    struct amdgpu_vm *vm);
 void amdgpu_amdkfd_gpuvm_destroy_process_vm(struct kgd_dev *kgd, void *vm);
 uint32_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *vm);
 int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
...
@@ -205,6 +205,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.get_cu_info = get_cu_info,
 	.get_vram_usage = amdgpu_amdkfd_get_vram_usage,
 	.create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
+	.acquire_process_vm = amdgpu_amdkfd_gpuvm_acquire_process_vm,
 	.destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
 	.get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
...
@@ -165,6 +165,7 @@ static const struct kfd2kgd_calls kfd2kgd = {
 	.get_cu_info = get_cu_info,
 	.get_vram_usage = amdgpu_amdkfd_get_vram_usage,
 	.create_process_vm = amdgpu_amdkfd_gpuvm_create_process_vm,
+	.acquire_process_vm = amdgpu_amdkfd_gpuvm_acquire_process_vm,
 	.destroy_process_vm = amdgpu_amdkfd_gpuvm_destroy_process_vm,
 	.get_process_page_dir = amdgpu_amdkfd_gpuvm_get_process_page_dir,
 	.set_vm_context_page_table_base = set_vm_context_page_table_base,
...
@@ -32,6 +32,7 @@
 #include <drm/amdgpu_drm.h>
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
+#include "amdgpu_amdkfd.h"
 /*
  * GPUVM
@@ -2405,8 +2406,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (vm->use_cpu_for_update)
 		flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
 	else
-		flags |= (AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
-			  AMDGPU_GEM_CREATE_SHADOW);
+		flags |= AMDGPU_GEM_CREATE_SHADOW;
 	size = amdgpu_vm_bo_size(adev, adev->vm_manager.root_level);
 	r = amdgpu_bo_create(adev, size, align, AMDGPU_GEM_DOMAIN_VRAM, flags,
@@ -2461,6 +2461,73 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	return r;
 }
+/**
+ * amdgpu_vm_make_compute - Turn a GFX VM into a compute VM
+ *
+ * This only works on GFX VMs that don't have any BOs added and no
+ * page tables allocated yet.
+ *
+ * Changes the following VM parameters:
+ * - use_cpu_for_update
+ * - pte_supports_ats
+ * - pasid (old PASID is released, because compute manages its own PASIDs)
+ *
+ * Reinitializes the page directory to reflect the changed ATS
+ * setting. May leave behind an unused shadow BO for the page
+ * directory when switching from SDMA updates to CPU updates.
+ *
+ * Returns 0 for success, -errno for errors.
+ */
+int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+{
+	bool pte_support_ats = (adev->asic_type == CHIP_RAVEN);
+	int r;
+
+	r = amdgpu_bo_reserve(vm->root.base.bo, true);
+	if (r)
+		return r;
+
+	/* Sanity checks */
+	if (!RB_EMPTY_ROOT(&vm->va.rb_root) || vm->root.entries) {
+		r = -EINVAL;
+		goto error;
+	}
+
+	/* Check if PD needs to be reinitialized and do it before
+	 * changing any other state, in case it fails.
+	 */
+	if (pte_support_ats != vm->pte_support_ats) {
+		r = amdgpu_vm_clear_bo(adev, vm, vm->root.base.bo,
+				       adev->vm_manager.root_level,
+				       pte_support_ats);
+		if (r)
+			goto error;
+	}
+
+	/* Update VM state */
+	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
+				    AMDGPU_VM_USE_CPU_FOR_COMPUTE);
+	vm->pte_support_ats = pte_support_ats;
+	DRM_DEBUG_DRIVER("VM update mode is %s\n",
+			 vm->use_cpu_for_update ? "CPU" : "SDMA");
+	WARN_ONCE((vm->use_cpu_for_update & !amdgpu_vm_is_large_bar(adev)),
+		  "CPU update of VM recommended only for large BAR system\n");
+
+	if (vm->pasid) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
+		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
+		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
+
+		vm->pasid = 0;
+	}
+
+error:
+	amdgpu_bo_unreserve(vm->root.base.bo);
+	return r;
+}
+
 /**
  * amdgpu_vm_free_levels - free PD/PT levels
  *
@@ -2508,6 +2575,8 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 	u64 fault;
 	int i, r;
+	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
+
 	/* Clear pending page faults from IH when the VM is destroyed */
 	while (kfifo_get(&vm->faults, &fault))
 		amdgpu_ih_clear_fault(adev, fault);
...
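Note: this new helper is what lets KFD reuse a VM that user mode already created by opening a DRM render node, instead of creating a second VM per process. A minimal sketch of that hand-off, assuming the drm_file/amdgpu_fpriv lookup shown here; the function name is hypothetical, and the real amdgpu_amdkfd_gpuvm_acquire_process_vm additionally sets up the KFD process info and eviction fence:

	/* Hypothetical sketch only: convert the GFX VM behind a DRM file
	 * descriptor into a compute VM. Error handling is trimmed.
	 */
	static int example_acquire_vm(struct amdgpu_device *adev,
				      struct file *filp)
	{
		struct drm_file *drm_priv = filp->private_data;
		struct amdgpu_fpriv *fpriv = drm_priv->driver_priv;

		/* Reuse the page directory amdgpu_vm_init() already built */
		return amdgpu_vm_make_compute(adev, &fpriv->vm);
	}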
@@ -207,6 +207,15 @@ struct amdgpu_vm {
 	/* Limit non-retry fault storms */
 	unsigned int fault_credit;
+
+	/* Points to the KFD process VM info */
+	struct amdkfd_process_info *process_info;
+
+	/* List node in amdkfd_process_info.vm_list_head */
+	struct list_head vm_list_node;
+
+	/* Valid while the PD is reserved or fenced */
+	uint64_t pd_phys_addr;
 };
 struct amdgpu_vm_manager {
@@ -251,6 +260,7 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev);
 void amdgpu_vm_manager_fini(struct amdgpu_device *adev);
 int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 		   int vm_context, unsigned int pasid);
+int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
 				  unsigned int pasid);
...
[One file's diff is collapsed in this view and omitted here.]
@@ -882,7 +882,7 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
 	crat_table->length = sizeof(struct crat_header);
 	status = acpi_get_table("DSDT", 0, &acpi_table);
-	if (status == AE_NOT_FOUND)
+	if (status != AE_OK)
 		pr_warn("DSDT table not found for OEM information\n");
 	else {
 		crat_table->oem_revision = acpi_table->revision;
@@ -1117,6 +1117,9 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
 	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
 			sub_type_hdr->length);
+	if (debug_largebar)
+		local_mem_info.local_mem_size_private = 0;
+
 	if (local_mem_info.local_mem_size_private == 0)
 		ret = kfd_fill_gpu_memory_affinity(&avail_size,
 				kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
...
@@ -142,12 +142,31 @@ static int allocate_vmid(struct device_queue_manager *dqm,
 	return 0;
 }
+static int flush_texture_cache_nocpsch(struct kfd_dev *kdev,
+				struct qcm_process_device *qpd)
+{
+	uint32_t len;
+
+	if (!qpd->ib_kaddr)
+		return -ENOMEM;
+
+	len = pm_create_release_mem(qpd->ib_base, (uint32_t *)qpd->ib_kaddr);
+
+	return kdev->kfd2kgd->submit_ib(kdev->kgd, KGD_ENGINE_MEC1, qpd->vmid,
+				qpd->ib_base, (uint32_t *)qpd->ib_kaddr, len);
+}
+
 static void deallocate_vmid(struct device_queue_manager *dqm,
 			    struct qcm_process_device *qpd,
 			    struct queue *q)
 {
 	int bit = qpd->vmid - dqm->dev->vm_info.first_vmid_kfd;
+
+	/* On GFX v7, CP doesn't flush TC at dequeue */
+	if (q->device->device_info->asic_family == CHIP_HAWAII)
+		if (flush_texture_cache_nocpsch(q->device, qpd))
+			pr_err("Failed to flush TC\n");
+
 	kfd_flush_tlb(qpd_to_pdd(qpd));
 	/* Release the vmid mapping */
@@ -792,11 +811,12 @@ static void uninitialize(struct device_queue_manager *dqm)
 static int start_nocpsch(struct device_queue_manager *dqm)
 {
 	init_interrupts(dqm);
-	return 0;
+	return pm_init(&dqm->packets, dqm);
 }
 
 static int stop_nocpsch(struct device_queue_manager *dqm)
 {
+	pm_uninit(&dqm->packets);
 	return 0;
 }
...
@@ -52,6 +52,7 @@ struct kfd_event_waiter {
 struct kfd_signal_page {
 	uint64_t *kernel_address;
 	uint64_t __user *user_address;
+	bool need_to_free_pages;
 };
@@ -79,6 +80,7 @@ static struct kfd_signal_page *allocate_signal_page(struct kfd_process *p)
 	       KFD_SIGNAL_EVENT_LIMIT * 8);
 	page->kernel_address = backing_store;
+	page->need_to_free_pages = true;
 	pr_debug("Allocated new event signal page at %p, for process %p\n",
 		 page, p);
@@ -269,6 +271,7 @@ static void shutdown_signal_page(struct kfd_process *p)
 	struct kfd_signal_page *page = p->signal_page;
 	if (page) {
+		if (page->need_to_free_pages)
 		free_pages((unsigned long)page->kernel_address,
 			   get_order(KFD_SIGNAL_EVENT_LIMIT * 8));
 		kfree(page);
@@ -292,6 +295,30 @@ static bool event_can_be_cpu_signaled(const struct kfd_event *ev)
 	return ev->type == KFD_EVENT_TYPE_SIGNAL;
 }
+int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
+		       uint64_t size)
+{
+	struct kfd_signal_page *page;
+
+	if (p->signal_page)
+		return -EBUSY;
+
+	page = kzalloc(sizeof(*page), GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* Initialize all events to unsignaled */
+	memset(kernel_address, (uint8_t) UNSIGNALED_EVENT_SLOT,
+	       KFD_SIGNAL_EVENT_LIMIT * 8);
+
+	page->kernel_address = kernel_address;
+
+	p->signal_page = page;
+	p->signal_mapped_size = size;
+
+	return 0;
+}
+
 int kfd_event_create(struct file *devkfd, struct kfd_process *p,
 		     uint32_t event_type, bool auto_reset, uint32_t node_id,
 		     uint32_t *event_id, uint32_t *event_trigger_data,
...
@@ -278,21 +278,28 @@
 #define MAKE_GPUVM_APP_BASE(gpu_num) \
 	(((uint64_t)(gpu_num) << 61) + 0x1000000000000L)
-#define MAKE_GPUVM_APP_LIMIT(base) \
-	(((uint64_t)(base) & \
-	0xFFFFFF0000000000UL) | 0xFFFFFFFFFFL)
+#define MAKE_GPUVM_APP_LIMIT(base, size) \
+	(((uint64_t)(base) & 0xFFFFFF0000000000UL) + (size) - 1)
-#define MAKE_SCRATCH_APP_BASE(gpu_num) \
-	(((uint64_t)(gpu_num) << 61) + 0x100000000L)
+#define MAKE_SCRATCH_APP_BASE() \
+	(((uint64_t)(0x1UL) << 61) + 0x100000000L)
 #define MAKE_SCRATCH_APP_LIMIT(base) \
 	(((uint64_t)base & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
-#define MAKE_LDS_APP_BASE(gpu_num) \
-	(((uint64_t)(gpu_num) << 61) + 0x0)
+#define MAKE_LDS_APP_BASE() \
+	(((uint64_t)(0x1UL) << 61) + 0x0)
 #define MAKE_LDS_APP_LIMIT(base) \
 	(((uint64_t)(base) & 0xFFFFFFFF00000000UL) | 0xFFFFFFFF)
+
+/* User mode manages most of the SVM aperture address space. The low
+ * 16MB are reserved for kernel use (CWSR trap handler and kernel IB
+ * for now).
+ */
+#define SVM_USER_BASE 0x1000000ull
+#define SVM_CWSR_BASE (SVM_USER_BASE - KFD_CWSR_TBA_TMA_SIZE)
+#define SVM_IB_BASE (SVM_CWSR_BASE - PAGE_SIZE)
+
 int kfd_init_apertures(struct kfd_process *process)
 {
 	uint8_t id = 0;
@@ -314,7 +321,7 @@ int kfd_init_apertures(struct kfd_process *process)
 		return -1;
 	}
 	/*
-	 * For 64 bit process aperture will be statically reserved in
+	 * For 64 bit process apertures will be statically reserved in
 	 * the x86_64 non canonical process address space
 	 * amdkfd doesn't currently support apertures for 32 bit process
 	 */
@@ -323,23 +330,35 @@ int kfd_init_apertures(struct kfd_process *process)
 			pdd->gpuvm_base = pdd->gpuvm_limit = 0;
 			pdd->scratch_base = pdd->scratch_limit = 0;
 		} else {
-			/*
-			 * node id couldn't be 0 - the three MSB bits of
-			 * aperture shoudn't be 0
-			 */
+			/* Same LDS and scratch apertures can be used
+			 * on all GPUs. This allows using more dGPUs
+			 * than placement options for apertures.
+			 */
-			pdd->lds_base = MAKE_LDS_APP_BASE(id + 1);
+			pdd->lds_base = MAKE_LDS_APP_BASE();
 			pdd->lds_limit = MAKE_LDS_APP_LIMIT(pdd->lds_base);
-			pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
-			pdd->gpuvm_limit =
-				MAKE_GPUVM_APP_LIMIT(pdd->gpuvm_base);
-			pdd->scratch_base = MAKE_SCRATCH_APP_BASE(id + 1);
+			pdd->scratch_base = MAKE_SCRATCH_APP_BASE();
 			pdd->scratch_limit =
 				MAKE_SCRATCH_APP_LIMIT(pdd->scratch_base);
+
+			if (dev->device_info->needs_iommu_device) {
+				/* APUs: GPUVM aperture in
+				 * non-canonical address space
+				 */
+				pdd->gpuvm_base = MAKE_GPUVM_APP_BASE(id + 1);
+				pdd->gpuvm_limit = MAKE_GPUVM_APP_LIMIT(
+					pdd->gpuvm_base,
+					dev->shared_resources.gpuvm_size);
+			} else {
+				/* dGPUs: SVM aperture starting at 0
+				 * with small reserved space for kernel
+				 */
+				pdd->gpuvm_base = SVM_USER_BASE;
+				pdd->gpuvm_limit =
+					dev->shared_resources.gpuvm_size - 1;
+				pdd->qpd.cwsr_base = SVM_CWSR_BASE;
+				pdd->qpd.ib_base = SVM_IB_BASE;
+			}
 		}
 		dev_dbg(kfd_device, "node id %u\n", id);
...
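A worked example of the reserved kernel region at the top of the low 16 MB, assuming 4 KiB pages and a two-page CWSR TBA/TMA area (both sizes are assumptions for illustration; the real values come from PAGE_SIZE and KFD_CWSR_TBA_TMA_SIZE):

	/* Illustration only, with assumed sizes:
	 * PAGE_SIZE == 0x1000, KFD_CWSR_TBA_TMA_SIZE == 2 * PAGE_SIZE
	 */
	SVM_USER_BASE == 0x1000000                      /* 16 MiB  */
	SVM_CWSR_BASE == 0x1000000 - 0x2000 == 0xFFE000 /* TBA/TMA */
	SVM_IB_BASE   == 0xFFE000  - 0x1000 == 0xFFD000 /* kernel IB */

so a dGPU process sees gpuvm_base = SVM_USER_BASE and gpuvm_limit = gpuvm_size - 1, with the trap handler and kernel IB tucked just below the user-managed range.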
@@ -71,6 +71,11 @@ module_param(send_sigterm, int, 0444);
 MODULE_PARM_DESC(send_sigterm,
 	"Send sigterm to HSA process on unhandled exception (0 = disable, 1 = enable)");
+int debug_largebar;
+module_param(debug_largebar, int, 0444);
+MODULE_PARM_DESC(debug_largebar,
+	"Debug large-bar flag used to simulate large-bar capability on non-large bar machine (0 = disable, 1 = enable)");
+
 int ignore_crat;
 module_param(ignore_crat, int, 0444);
 MODULE_PARM_DESC(ignore_crat,
@@ -128,7 +133,9 @@ static int __init kfd_module_init(void)
 	if (err < 0)
 		goto err_topology;
-	kfd_process_create_wq();
+	err = kfd_process_create_wq();
+	if (err < 0)
+		goto err_create_wq;
 	kfd_debugfs_init();
@@ -138,6 +145,8 @@ static int __init kfd_module_init(void)
 	return 0;
+err_create_wq:
+	kfd_topology_shutdown();
 err_topology:
 	kfd_chardev_exit();
 err_ioctl:
...
@@ -356,6 +356,43 @@ static int pm_create_runlist_ib(struct packet_manager *pm,
 	return retval;
 }
+/* pm_create_release_mem - Create a RELEASE_MEM packet and return the size
+ * of this packet
+ * @gpu_addr - GPU address of the packet. It's a virtual address.
+ * @buffer - buffer to fill up with the packet. It's a CPU kernel pointer
+ * Return - length of the packet
+ */
+uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer)
+{
+	struct pm4_mec_release_mem *packet;
+
+	WARN_ON(!buffer);
+
+	packet = (struct pm4_mec_release_mem *)buffer;
+	memset(buffer, 0, sizeof(*packet));
+
+	packet->header.u32All = build_pm4_header(IT_RELEASE_MEM,
+						 sizeof(*packet));
+
+	packet->bitfields2.event_type = CACHE_FLUSH_AND_INV_TS_EVENT;
+	packet->bitfields2.event_index = event_index___release_mem__end_of_pipe;
+	packet->bitfields2.tcl1_action_ena = 1;
+	packet->bitfields2.tc_action_ena = 1;
+	packet->bitfields2.cache_policy = cache_policy___release_mem__lru;
+	packet->bitfields2.atc = 0;
+
+	packet->bitfields3.data_sel = data_sel___release_mem__send_32_bit_low;
+	packet->bitfields3.int_sel =
+		int_sel___release_mem__send_interrupt_after_write_confirm;
+
+	packet->bitfields4.address_lo_32b = (gpu_addr & 0xffffffff) >> 2;
+	packet->address_hi = upper_32_bits(gpu_addr);
+
+	packet->data_lo = 0;
+
+	return sizeof(*packet) / sizeof(unsigned int);
+}
+
 int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
 {
 	pm->dqm = dqm;
...
@@ -104,6 +104,12 @@ extern int cwsr_enable;
  */
 extern int send_sigterm;
+/*
+ * This kernel module is used to simulate large bar machine on non-large bar
+ * enabled machines.
+ */
+extern int debug_largebar;
+
 /*
  * Ignore CRAT table during KFD initialization, can be used to work around
  * broken CRAT tables on some AMD systems
@@ -488,8 +494,13 @@ struct qcm_process_device {
 	/* CWSR memory */
 	void *cwsr_kaddr;
+	uint64_t cwsr_base;
 	uint64_t tba_addr;
 	uint64_t tma_addr;
+
+	/* IB memory */
+	uint64_t ib_base;
+	void *ib_kaddr;
 };
 /* KFD Memory Eviction */
@@ -504,6 +515,14 @@ struct qcm_process_device {
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
 					       struct dma_fence *fence);
+/* 8 byte handle containing GPU ID in the most significant 4 bytes and
+ * idr_handle in the least significant 4 bytes
+ */
+#define MAKE_HANDLE(gpu_id, idr_handle) \
+	(((uint64_t)(gpu_id) << 32) + idr_handle)
+#define GET_GPU_ID(handle) (handle >> 32)
+#define GET_IDR_HANDLE(handle) (handle & 0xFFFFFFFF)
+
 enum kfd_pdd_bound {
 	PDD_UNBOUND = 0,
 	PDD_BOUND,
@@ -536,8 +555,12 @@ struct kfd_process_device {
 	uint64_t scratch_limit;
 	/* VM context for GPUVM allocations */
+	struct file *drm_file;
 	void *vm;
+
+	/* GPUVM allocations storage */
+	struct idr alloc_idr;
+
 	/* Flag used to tell the pdd has dequeued from the dqm.
 	 * This is used to prevent dev->dqm->ops.process_termination() from
 	 * being called twice when it is already called in IOMMU callback
@@ -651,7 +674,7 @@ struct amdkfd_ioctl_desc {
 	const char *name;
 };
-void kfd_process_create_wq(void);
+int kfd_process_create_wq(void);
 void kfd_process_destroy_wq(void);
 struct kfd_process *kfd_create_process(struct file *filep);
 struct kfd_process *kfd_get_process(const struct task_struct *);
@@ -661,6 +684,8 @@ void kfd_unref_process(struct kfd_process *p);
 void kfd_suspend_all_processes(void);
 int kfd_resume_all_processes(void);
+int kfd_process_device_init_vm(struct kfd_process_device *pdd,
+			       struct file *drm_file);
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
 						      struct kfd_process *p);
 struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
@@ -671,6 +696,14 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 int kfd_reserved_mem_mmap(struct kfd_process *process,
 			  struct vm_area_struct *vma);
+/* KFD process API for creating and translating handles */
+int kfd_process_device_create_obj_handle(struct kfd_process_device *pdd,
+					 void *mem);
+void *kfd_process_device_translate_handle(struct kfd_process_device *p,
+					  int handle);
+void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
+					  int handle);
+
 /* Process device data iterator */
 struct kfd_process_device *kfd_get_first_process_device_data(
 						struct kfd_process *p);
@@ -816,6 +849,8 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type,
 void pm_release_ib(struct packet_manager *pm);
+uint32_t pm_create_release_mem(uint64_t gpu_addr, uint32_t *buffer);
+
 uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
 /* Events */
@@ -837,6 +872,8 @@ void kfd_signal_iommu_event(struct kfd_dev *dev,
 void kfd_signal_hw_exception_event(unsigned int pasid);
 int kfd_set_event(struct kfd_process *p, uint32_t event_id);
 int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
+int kfd_event_page_set(struct kfd_process *p, void *kernel_address,
+		       uint64_t size);
 int kfd_event_create(struct file *devkfd, struct kfd_process *p,
 		     uint32_t event_type, bool auto_reset, uint32_t node_id,
 		     uint32_t *event_id, uint32_t *event_trigger_data,
...
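The 8-byte buffer handle introduced above is plain bit packing, so its round trip can be checked without any kernel context. A self-contained sketch mirroring the MAKE_HANDLE/GET_GPU_ID/GET_IDR_HANDLE macros (the gpu_id and idr values are arbitrary examples):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		/* MAKE_HANDLE(gpu_id, idr_handle) */
		uint64_t handle = ((uint64_t)0x1002 << 32) + 7;

		assert((handle >> 32) == 0x1002);      /* GET_GPU_ID */
		assert((handle & 0xFFFFFFFF) == 7);    /* GET_IDR_HANDLE */
		return 0;
	}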
[One file's diff is collapsed in this view and omitted here.]
@@ -441,6 +441,8 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
 			dev->node_props.device_id);
 	sysfs_show_32bit_prop(buffer, "location_id",
 			dev->node_props.location_id);
+	sysfs_show_32bit_prop(buffer, "drm_render_minor",
+			dev->node_props.drm_render_minor);
 	if (dev->gpu) {
 		log_max_watch_addr =
@@ -1214,6 +1216,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
 		dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd);
 	dev->node_props.max_engine_clk_ccompute =
 		cpufreq_quick_get_max(0) / 1000;
+	dev->node_props.drm_render_minor =
+		gpu->shared_resources.drm_render_minor;
 	kfd_fill_mem_clk_max_info(dev);
 	kfd_fill_iolink_non_crat_info(dev);
...
@@ -71,6 +71,7 @@ struct kfd_node_properties {
 	uint32_t location_id;
 	uint32_t max_engine_clk_fcompute;
 	uint32_t max_engine_clk_ccompute;
+	int32_t drm_render_minor;
 	uint16_t marketing_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
 };
...
@@ -130,6 +130,7 @@ struct tile_config {
 /*
  * Allocation flag domains
+ * NOTE: This must match the corresponding definitions in kfd_ioctl.h.
  */
 #define ALLOC_MEM_FLAGS_VRAM	(1 << 0)
 #define ALLOC_MEM_FLAGS_GTT	(1 << 1)
@@ -138,6 +139,7 @@ struct tile_config {
 /*
  * Allocation flags attributes/access options.
+ * NOTE: This must match the corresponding definitions in kfd_ioctl.h.
  */
 #define ALLOC_MEM_FLAGS_WRITABLE	(1 << 31)
 #define ALLOC_MEM_FLAGS_EXECUTABLE	(1 << 30)
@@ -336,6 +338,8 @@ struct kfd2kgd_calls {
 	int (*create_process_vm)(struct kgd_dev *kgd, void **vm,
 			void **process_info, struct dma_fence **ef);
+	int (*acquire_process_vm)(struct kgd_dev *kgd, struct file *filp,
+			void **vm, void **process_info, struct dma_fence **ef);
 	void (*destroy_process_vm)(struct kgd_dev *kgd, void *vm);
 	uint32_t (*get_process_page_dir)(void *vm);
 	void (*set_vm_context_page_table_base)(struct kgd_dev *kgd,
...
@@ -107,8 +107,6 @@ struct kfd_ioctl_get_clock_counters_args {
 	__u32 pad;
 };
-#define NUM_OF_SUPPORTED_GPUS 7
-
 struct kfd_process_device_apertures {
 	__u64 lds_base;		/* from KFD */
 	__u64 lds_limit;	/* from KFD */
@@ -120,6 +118,12 @@ struct kfd_process_device_apertures {
 	__u32 pad;
 };
+/*
+ * AMDKFD_IOC_GET_PROCESS_APERTURES is deprecated. Use
+ * AMDKFD_IOC_GET_PROCESS_APERTURES_NEW instead, which supports an
+ * unlimited number of GPUs.
+ */
+#define NUM_OF_SUPPORTED_GPUS 7
 struct kfd_ioctl_get_process_apertures_args {
 	struct kfd_process_device_apertures
 			process_apertures[NUM_OF_SUPPORTED_GPUS];/* from KFD */
@@ -129,6 +133,19 @@ struct kfd_ioctl_get_process_apertures_args {
 	__u32 pad;
 };
+struct kfd_ioctl_get_process_apertures_new_args {
+	/* User allocated. Pointer to struct kfd_process_device_apertures
+	 * filled in by Kernel
+	 */
+	__u64 kfd_process_device_apertures_ptr;
+	/* to KFD - indicates amount of memory present in
+	 * kfd_process_device_apertures_ptr
+	 * from KFD - Number of entries filled by KFD.
+	 */
+	__u32 num_of_nodes;
+	__u32 pad;
+};
+
 #define MAX_ALLOWED_NUM_POINTS    100
 #define MAX_ALLOWED_AW_BUFF_SIZE 4096
 #define MAX_ALLOWED_WAC_BUFF_SIZE  128
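The new query is a two-way contract: user mode sizes the aperture array, and KFD reports back how many entries it filled. A hedged userspace sketch of that convention (the function name is hypothetical; kfd_fd is assumed to be an open /dev/kfd handle and num_nodes a GPU count discovered from the topology sysfs):

	#include <stdint.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	int query_apertures(int kfd_fd, uint32_t num_nodes)
	{
		struct kfd_ioctl_get_process_apertures_new_args args = {0};
		struct kfd_process_device_apertures *apes;
		int ret;

		apes = calloc(num_nodes, sizeof(*apes));
		if (!apes)
			return -1;

		args.kfd_process_device_apertures_ptr =
			(uint64_t)(uintptr_t)apes;
		args.num_of_nodes = num_nodes;	/* capacity going in */

		ret = ioctl(kfd_fd, AMDKFD_IOC_GET_PROCESS_APERTURES_NEW,
			    &args);
		/* On success args.num_of_nodes is the entry count filled */
		free(apes);
		return ret ? -1 : (int)args.num_of_nodes;
	}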
@@ -269,6 +286,86 @@ struct kfd_ioctl_set_trap_handler_args {
 	__u32 pad;
 };
+struct kfd_ioctl_acquire_vm_args {
+	__u32 drm_fd;	/* to KFD */
+	__u32 gpu_id;	/* to KFD */
+};
+
+/* Allocation flags: memory types */
+#define KFD_IOC_ALLOC_MEM_FLAGS_VRAM		(1 << 0)
+#define KFD_IOC_ALLOC_MEM_FLAGS_GTT		(1 << 1)
+#define KFD_IOC_ALLOC_MEM_FLAGS_USERPTR		(1 << 2)
+#define KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL	(1 << 3)
+/* Allocation flags: attributes/access options */
+#define KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE	(1 << 31)
+#define KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE	(1 << 30)
+#define KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC		(1 << 29)
+#define KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE	(1 << 28)
+#define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM	(1 << 27)
+#define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT	(1 << 26)
+
+/* Allocate memory for later SVM (shared virtual memory) mapping.
+ *
+ * @va_addr: virtual address of the memory to be allocated
+ *           all later mappings on all GPUs will use this address
+ * @size: size in bytes
+ * @handle: buffer handle returned to user mode, used to refer to
+ *          this allocation for mapping, unmapping and freeing
+ * @mmap_offset: for CPU-mapping the allocation by mmapping a render node
+ *               for userptrs this is overloaded to specify the CPU address
+ * @gpu_id: device identifier
+ * @flags: memory type and attributes. See KFD_IOC_ALLOC_MEM_FLAGS above
+ */
+struct kfd_ioctl_alloc_memory_of_gpu_args {
+	__u64 va_addr;		/* to KFD */
+	__u64 size;		/* to KFD */
+	__u64 handle;		/* from KFD */
+	__u64 mmap_offset;	/* to KFD (userptr), from KFD (mmap offset) */
+	__u32 gpu_id;		/* to KFD */
+	__u32 flags;
+};
+
+/* Free memory allocated with kfd_ioctl_alloc_memory_of_gpu
+ *
+ * @handle: memory handle returned by alloc
+ */
+struct kfd_ioctl_free_memory_of_gpu_args {
+	__u64 handle;		/* to KFD */
+};
+
+/* Map memory to one or more GPUs
+ *
+ * @handle: memory handle returned by alloc
+ * @device_ids_array_ptr: array of gpu_ids (__u32 per device)
+ * @n_devices: number of devices in the array
+ * @n_success: number of devices mapped successfully
+ *
+ * @n_success returns information to the caller how many devices from
+ * the start of the array have mapped the buffer successfully. It can
+ * be passed into a subsequent retry call to skip those devices. For
+ * the first call the caller should initialize it to 0.
+ *
+ * If the ioctl completes with return code 0 (success), n_success ==
+ * n_devices.
+ */
+struct kfd_ioctl_map_memory_to_gpu_args {
+	__u64 handle;			/* to KFD */
+	__u64 device_ids_array_ptr;	/* to KFD */
+	__u32 n_devices;		/* to KFD */
+	__u32 n_success;		/* to/from KFD */
+};
+
+/* Unmap memory from one or more GPUs
+ *
+ * same arguments as for mapping
+ */
+struct kfd_ioctl_unmap_memory_from_gpu_args {
+	__u64 handle;			/* to KFD */
+	__u64 device_ids_array_ptr;	/* to KFD */
+	__u32 n_devices;		/* to KFD */
+	__u32 n_success;		/* to/from KFD */
+};
+
 #define AMDKFD_IOCTL_BASE 'K'
 #define AMDKFD_IO(nr)			_IO(AMDKFD_IOCTL_BASE, nr)
 #define AMDKFD_IOR(nr, type)		_IOR(AMDKFD_IOCTL_BASE, nr, type)
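The n_success field is what makes partial mapping failures restartable. A hedged userspace sketch of the retry loop the comment above describes (the function name is hypothetical, and treating EAGAIN as the retryable errno is an assumption; real runtimes such as ROCm's libhsakmt wrap this ioctl):

	#include <errno.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	int map_to_gpus(int kfd_fd, uint64_t handle,
			uint32_t *gpu_ids, uint32_t n_devices)
	{
		struct kfd_ioctl_map_memory_to_gpu_args args = {
			.handle = handle,
			.device_ids_array_ptr = (uint64_t)(uintptr_t)gpu_ids,
			.n_devices = n_devices,
			.n_success = 0,	/* must be 0 on the first call */
		};

		while (ioctl(kfd_fd, AMDKFD_IOC_MAP_MEMORY_TO_GPU, &args)) {
			if (errno != EAGAIN)	/* assumed retryable errno */
				return -1;
			/* args.n_success devices are already mapped and
			 * get skipped on the retry.
			 */
		}
		return 0;	/* success implies n_success == n_devices */
	}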
@@ -332,7 +429,26 @@ struct kfd_ioctl_set_trap_handler_args {
 #define AMDKFD_IOC_SET_TRAP_HANDLER		\
 		AMDKFD_IOW(0x13, struct kfd_ioctl_set_trap_handler_args)
+#define AMDKFD_IOC_GET_PROCESS_APERTURES_NEW	\
+		AMDKFD_IOWR(0x14,		\
+			struct kfd_ioctl_get_process_apertures_new_args)
+
+#define AMDKFD_IOC_ACQUIRE_VM			\
+		AMDKFD_IOW(0x15, struct kfd_ioctl_acquire_vm_args)
+
+#define AMDKFD_IOC_ALLOC_MEMORY_OF_GPU		\
+		AMDKFD_IOWR(0x16, struct kfd_ioctl_alloc_memory_of_gpu_args)
+
+#define AMDKFD_IOC_FREE_MEMORY_OF_GPU		\
+		AMDKFD_IOW(0x17, struct kfd_ioctl_free_memory_of_gpu_args)
+
+#define AMDKFD_IOC_MAP_MEMORY_TO_GPU		\
+		AMDKFD_IOWR(0x18, struct kfd_ioctl_map_memory_to_gpu_args)
+
+#define AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU	\
+		AMDKFD_IOWR(0x19, struct kfd_ioctl_unmap_memory_from_gpu_args)
+
 #define AMDKFD_COMMAND_START		0x01
-#define AMDKFD_COMMAND_END		0x14
+#define AMDKFD_COMMAND_END		0x1A
 #endif