Commit 4f4d89af authored by Dave Airlie's avatar Dave Airlie

Merge tag 'drm-amdkfd-next-2015-01-09' of git://people.freedesktop.org/~gabbayo/linux into drm-next

- Add support for SDMA usermode queues
- Replace logic of sub-allocating from GART buffer in amdkfd. Instead
  of using radeon_sa module, use a new module that is more suited for
  this purpose
- Add the number of watch points to amdkfd topology
- Split a function that did two things into two seperate functions.

* tag 'drm-amdkfd-next-2015-01-09' of git://people.freedesktop.org/~gabbayo/linux:
  drm/amd: Remove old radeon_sa funcs from kfd-->kgd interface
  drm/radeon: Remove old radeon_sa usage from kfd-->kgd interface
  drm/amdkfd: Using new gtt sa in amdkfd
  drm/amdkfd: Allocate gart memory using new interface
  drm/amdkfd: Fixed calculation of gart buffer size
  drm/amdkfd: Add kfd gtt sub-allocator functions
  drm/amdkfd: Add gtt sa related data to kfd_dev struct
  drm/radeon: Impl. new gtt allocate/free functions
  drm/amd: Add new kfd-->kgd interface for gart usage
  drm/radeon: Enable sdma preemption
  drm/amdkfd: Pass queue type to pqm_create_queue()
  drm/amdkfd: Identify SDMA queue in create queue ioctl
  drm/amdkfd: Add SDMA user-mode queues support to QCM
  drm/amdkfd: Add SDMA mqd support
  drm/radeon: Implement SDMA interface functions
  drm/amd: Add SDMA functions to kfd-->kgd interface
  drm/amdkfd: Process-device data creation and lookup split
  drm/amdkfd: Add number of watch points to topology
parents 426959c9 6bbcde98
......@@ -192,6 +192,8 @@ static int set_queue_properties_from_user(struct queue_properties *q_properties,
if (args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE ||
args->queue_type == KFD_IOC_QUEUE_TYPE_COMPUTE_AQL)
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
else if (args->queue_type == KFD_IOC_QUEUE_TYPE_SDMA)
q_properties->type = KFD_QUEUE_TYPE_SDMA;
else
return -ENOTSUPP;
......@@ -258,8 +260,8 @@ static long kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
p->pasid,
dev->id);
err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, 0,
KFD_QUEUE_TYPE_COMPUTE, &queue_id);
err = pqm_create_queue(&p->pqm, dev, filep, &q_properties,
0, q_properties.type, &queue_id);
if (err != 0)
goto err_create_queue;
......
......@@ -26,12 +26,14 @@
#include <linux/slab.h>
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
#include "kfd_pm4_headers.h"
#define MQD_SIZE_ALIGNED 768
static const struct kfd_device_info kaveri_device_info = {
.max_pasid_bits = 16,
.ih_ring_entry_size = 4 * sizeof(uint32_t),
.num_of_watch_points = 4,
.mqd_size_aligned = MQD_SIZE_ALIGNED
};
......@@ -66,6 +68,10 @@ static const struct kfd_deviceid supported_devices[] = {
{ 0x131D, &kaveri_device_info }, /* Kaveri */
};
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
unsigned int chunk_size);
static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
static const struct kfd_device_info *lookup_device_info(unsigned short did)
{
size_t i;
......@@ -173,16 +179,39 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
max_num_of_queues_per_process *
kfd->device_info->mqd_size_aligned;
/* add another 512KB for all other allocations on gart */
/*
* calculate max size of runlist packet.
* There can be only 2 packets at once
*/
size += (max_num_of_processes * sizeof(struct pm4_map_process) +
max_num_of_processes * max_num_of_queues_per_process *
sizeof(struct pm4_map_queues) + sizeof(struct pm4_runlist)) * 2;
/* Add size of HIQ & DIQ */
size += KFD_KERNEL_QUEUE_SIZE * 2;
/* add another 512KB for all other allocations on gart (HPD, fences) */
size += 512 * 1024;
if (kfd2kgd->init_sa_manager(kfd->kgd, size)) {
if (kfd2kgd->init_gtt_mem_allocation(kfd->kgd, size, &kfd->gtt_mem,
&kfd->gtt_start_gpu_addr, &kfd->gtt_start_cpu_ptr)) {
dev_err(kfd_device,
"Error initializing sa manager for device (%x:%x)\n",
kfd->pdev->vendor, kfd->pdev->device);
"Could not allocate %d bytes for device (%x:%x)\n",
size, kfd->pdev->vendor, kfd->pdev->device);
goto out;
}
dev_info(kfd_device,
"Allocated %d bytes on gart for device(%x:%x)\n",
size, kfd->pdev->vendor, kfd->pdev->device);
/* Initialize GTT sa with 512 byte chunk size */
if (kfd_gtt_sa_init(kfd, size, 512) != 0) {
dev_err(kfd_device,
"Error initializing gtt sub-allocator\n");
goto kfd_gtt_sa_init_error;
}
kfd_doorbell_init(kfd);
if (kfd_topology_add_device(kfd) != 0) {
......@@ -241,7 +270,9 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
kfd_interrupt_error:
kfd_topology_remove_device(kfd);
kfd_topology_add_device_error:
kfd2kgd->fini_sa_manager(kfd->kgd);
kfd_gtt_sa_fini(kfd);
kfd_gtt_sa_init_error:
kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem);
dev_err(kfd_device,
"device (%x:%x) NOT added due to errors\n",
kfd->pdev->vendor, kfd->pdev->device);
......@@ -256,6 +287,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
amd_iommu_free_device(kfd->pdev);
kfd_interrupt_exit(kfd);
kfd_topology_remove_device(kfd);
kfd_gtt_sa_fini(kfd);
kfd2kgd->free_gtt_mem(kfd->kgd, kfd->gtt_mem);
}
kfree(kfd);
......@@ -306,3 +339,185 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
spin_unlock(&kfd->interrupt_lock);
}
}
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
unsigned int chunk_size)
{
unsigned int num_of_bits;
BUG_ON(!kfd);
BUG_ON(!kfd->gtt_mem);
BUG_ON(buf_size < chunk_size);
BUG_ON(buf_size == 0);
BUG_ON(chunk_size == 0);
kfd->gtt_sa_chunk_size = chunk_size;
kfd->gtt_sa_num_of_chunks = buf_size / chunk_size;
num_of_bits = kfd->gtt_sa_num_of_chunks / BITS_PER_BYTE;
BUG_ON(num_of_bits == 0);
kfd->gtt_sa_bitmap = kzalloc(num_of_bits, GFP_KERNEL);
if (!kfd->gtt_sa_bitmap)
return -ENOMEM;
pr_debug("kfd: gtt_sa_num_of_chunks = %d, gtt_sa_bitmap = %p\n",
kfd->gtt_sa_num_of_chunks, kfd->gtt_sa_bitmap);
mutex_init(&kfd->gtt_sa_lock);
return 0;
}
static void kfd_gtt_sa_fini(struct kfd_dev *kfd)
{
mutex_destroy(&kfd->gtt_sa_lock);
kfree(kfd->gtt_sa_bitmap);
}
static inline uint64_t kfd_gtt_sa_calc_gpu_addr(uint64_t start_addr,
unsigned int bit_num,
unsigned int chunk_size)
{
return start_addr + bit_num * chunk_size;
}
static inline uint32_t *kfd_gtt_sa_calc_cpu_addr(void *start_addr,
unsigned int bit_num,
unsigned int chunk_size)
{
return (uint32_t *) ((uint64_t) start_addr + bit_num * chunk_size);
}
int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
struct kfd_mem_obj **mem_obj)
{
unsigned int found, start_search, cur_size;
BUG_ON(!kfd);
if (size == 0)
return -EINVAL;
if (size > kfd->gtt_sa_num_of_chunks * kfd->gtt_sa_chunk_size)
return -ENOMEM;
*mem_obj = kmalloc(sizeof(struct kfd_mem_obj), GFP_KERNEL);
if ((*mem_obj) == NULL)
return -ENOMEM;
pr_debug("kfd: allocated mem_obj = %p for size = %d\n", *mem_obj, size);
start_search = 0;
mutex_lock(&kfd->gtt_sa_lock);
kfd_gtt_restart_search:
/* Find the first chunk that is free */
found = find_next_zero_bit(kfd->gtt_sa_bitmap,
kfd->gtt_sa_num_of_chunks,
start_search);
pr_debug("kfd: found = %d\n", found);
/* If there wasn't any free chunk, bail out */
if (found == kfd->gtt_sa_num_of_chunks)
goto kfd_gtt_no_free_chunk;
/* Update fields of mem_obj */
(*mem_obj)->range_start = found;
(*mem_obj)->range_end = found;
(*mem_obj)->gpu_addr = kfd_gtt_sa_calc_gpu_addr(
kfd->gtt_start_gpu_addr,
found,
kfd->gtt_sa_chunk_size);
(*mem_obj)->cpu_ptr = kfd_gtt_sa_calc_cpu_addr(
kfd->gtt_start_cpu_ptr,
found,
kfd->gtt_sa_chunk_size);
pr_debug("kfd: gpu_addr = %p, cpu_addr = %p\n",
(uint64_t *) (*mem_obj)->gpu_addr, (*mem_obj)->cpu_ptr);
/* If we need only one chunk, mark it as allocated and get out */
if (size <= kfd->gtt_sa_chunk_size) {
pr_debug("kfd: single bit\n");
set_bit(found, kfd->gtt_sa_bitmap);
goto kfd_gtt_out;
}
/* Otherwise, try to see if we have enough contiguous chunks */
cur_size = size - kfd->gtt_sa_chunk_size;
do {
(*mem_obj)->range_end =
find_next_zero_bit(kfd->gtt_sa_bitmap,
kfd->gtt_sa_num_of_chunks, ++found);
/*
* If next free chunk is not contiguous than we need to
* restart our search from the last free chunk we found (which
* wasn't contiguous to the previous ones
*/
if ((*mem_obj)->range_end != found) {
start_search = found;
goto kfd_gtt_restart_search;
}
/*
* If we reached end of buffer, bail out with error
*/
if (found == kfd->gtt_sa_num_of_chunks)
goto kfd_gtt_no_free_chunk;
/* Check if we don't need another chunk */
if (cur_size <= kfd->gtt_sa_chunk_size)
cur_size = 0;
else
cur_size -= kfd->gtt_sa_chunk_size;
} while (cur_size > 0);
pr_debug("kfd: range_start = %d, range_end = %d\n",
(*mem_obj)->range_start, (*mem_obj)->range_end);
/* Mark the chunks as allocated */
for (found = (*mem_obj)->range_start;
found <= (*mem_obj)->range_end;
found++)
set_bit(found, kfd->gtt_sa_bitmap);
kfd_gtt_out:
mutex_unlock(&kfd->gtt_sa_lock);
return 0;
kfd_gtt_no_free_chunk:
pr_debug("kfd: allocation failed with mem_obj = %p\n", mem_obj);
mutex_unlock(&kfd->gtt_sa_lock);
kfree(mem_obj);
return -ENOMEM;
}
int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
{
unsigned int bit;
BUG_ON(!kfd);
BUG_ON(!mem_obj);
pr_debug("kfd: free mem_obj = %p, range_start = %d, range_end = %d\n",
mem_obj, mem_obj->range_start, mem_obj->range_end);
mutex_lock(&kfd->gtt_sa_lock);
/* Mark the chunks as free */
for (bit = mem_obj->range_start;
bit <= mem_obj->range_end;
bit++)
clear_bit(bit, kfd->gtt_sa_bitmap);
mutex_unlock(&kfd->gtt_sa_lock);
kfree(mem_obj);
return 0;
}
......@@ -36,6 +36,9 @@
#define KFD_VMID_START_OFFSET (8)
#define VMID_PER_DEVICE CIK_VMID_NUM
#define KFD_DQM_FIRST_PIPE (0)
#define CIK_SDMA_QUEUES (4)
#define CIK_SDMA_QUEUES_PER_ENGINE (2)
#define CIK_SDMA_ENGINE_NUM (2)
struct device_process_node {
struct qcm_process_device *qpd;
......@@ -130,8 +133,10 @@ struct device_queue_manager {
struct list_head queues;
unsigned int processes_count;
unsigned int queue_count;
unsigned int sdma_queue_count;
unsigned int next_pipe_to_allocate;
unsigned int *allocated_queues;
unsigned int sdma_bitmap;
unsigned int vmid_bitmap;
uint64_t pipelines_addr;
struct kfd_mem_obj *pipeline_mem;
......
......@@ -137,10 +137,6 @@ int kfd_doorbell_mmap(struct kfd_process *process, struct vm_area_struct *vma)
if (dev == NULL)
return -EINVAL;
/* Find if pdd exists for combination of process and gpu id */
if (!kfd_get_process_device_data(dev, process, 0))
return -EINVAL;
/* Calculate physical address of doorbell */
address = kfd_get_process_doorbells(dev, process);
......
......@@ -303,10 +303,11 @@ int kfd_init_apertures(struct kfd_process *process)
while ((dev = kfd_topology_enum_kfd_devices(id)) != NULL &&
id < NUM_OF_SUPPORTED_GPUS) {
pdd = kfd_get_process_device_data(dev, process, 1);
if (!pdd)
pdd = kfd_create_process_device_data(dev, process);
if (pdd == NULL) {
pr_err("Failed to create process device data\n");
return -1;
}
/*
* For 64 bit process aperture will be statically reserved in
* the x86_64 non canonical process address space
......
......@@ -72,11 +72,7 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
if (prop.doorbell_ptr == NULL)
goto err_get_kernel_doorbell;
retval = kfd2kgd->allocate_mem(dev->kgd,
queue_size,
PAGE_SIZE,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->pq);
retval = kfd_gtt_sa_allocate(dev, queue_size, &kq->pq);
if (retval != 0)
goto err_pq_allocate_vidmem;
......@@ -84,11 +80,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
kq->pq_kernel_addr = kq->pq->cpu_ptr;
kq->pq_gpu_addr = kq->pq->gpu_addr;
retval = kfd2kgd->allocate_mem(dev->kgd,
sizeof(*kq->rptr_kernel),
32,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->rptr_mem);
retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->rptr_kernel),
&kq->rptr_mem);
if (retval != 0)
goto err_rptr_allocate_vidmem;
......@@ -96,11 +89,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
kq->rptr_kernel = kq->rptr_mem->cpu_ptr;
kq->rptr_gpu_addr = kq->rptr_mem->gpu_addr;
retval = kfd2kgd->allocate_mem(dev->kgd,
sizeof(*kq->wptr_kernel),
32,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->wptr_mem);
retval = kfd_gtt_sa_allocate(dev, sizeof(*kq->wptr_kernel),
&kq->wptr_mem);
if (retval != 0)
goto err_wptr_allocate_vidmem;
......@@ -145,11 +135,8 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
} else {
/* allocate fence for DIQ */
retval = kfd2kgd->allocate_mem(dev->kgd,
sizeof(uint32_t),
32,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &kq->fence_mem_obj);
retval = kfd_gtt_sa_allocate(dev, sizeof(uint32_t),
&kq->fence_mem_obj);
if (retval != 0)
goto err_alloc_fence;
......@@ -165,11 +152,11 @@ static bool initialize(struct kernel_queue *kq, struct kfd_dev *dev,
err_init_mqd:
uninit_queue(kq->queue);
err_init_queue:
kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->wptr_mem);
kfd_gtt_sa_free(dev, kq->wptr_mem);
err_wptr_allocate_vidmem:
kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->rptr_mem);
kfd_gtt_sa_free(dev, kq->rptr_mem);
err_rptr_allocate_vidmem:
kfd2kgd->free_mem(dev->kgd, (struct kgd_mem *) kq->pq);
kfd_gtt_sa_free(dev, kq->pq);
err_pq_allocate_vidmem:
pr_err("kfd: error init pq\n");
kfd_release_kernel_doorbell(dev, prop.doorbell_ptr);
......@@ -190,10 +177,12 @@ static void uninitialize(struct kernel_queue *kq)
QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS,
kq->queue->pipe,
kq->queue->queue);
else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj);
kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->rptr_mem);
kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->wptr_mem);
kfd2kgd->free_mem(kq->dev->kgd, (struct kgd_mem *) kq->pq);
kfd_gtt_sa_free(kq->dev, kq->rptr_mem);
kfd_gtt_sa_free(kq->dev, kq->wptr_mem);
kfd_gtt_sa_free(kq->dev, kq->pq);
kfd_release_kernel_doorbell(kq->dev,
kq->queue->properties.doorbell_ptr);
uninit_queue(kq->queue);
......
......@@ -26,6 +26,7 @@
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#include "cik_regs.h"
#include "../../radeon/cikd.h"
#include "../../radeon/cik_reg.h"
inline void busy_wait(unsigned long ms)
......@@ -51,11 +52,8 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
pr_debug("kfd: In func %s\n", __func__);
retval = kfd2kgd->allocate_mem(mm->dev->kgd,
sizeof(struct cik_mqd),
256,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) mqd_mem_obj);
retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd),
mqd_mem_obj);
if (retval != 0)
return -ENOMEM;
......@@ -111,18 +109,60 @@ static int init_mqd(struct mqd_manager *mm, void **mqd,
return retval;
}
static int init_mqd_sdma(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj **mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
int retval;
struct cik_sdma_rlc_registers *m;
BUG_ON(!mm || !mqd || !mqd_mem_obj);
retval = kfd_gtt_sa_allocate(mm->dev,
sizeof(struct cik_sdma_rlc_registers),
mqd_mem_obj);
if (retval != 0)
return -ENOMEM;
m = (struct cik_sdma_rlc_registers *) (*mqd_mem_obj)->cpu_ptr;
memset(m, 0, sizeof(struct cik_sdma_rlc_registers));
*mqd = m;
if (gart_addr != NULL)
*gart_addr = (*mqd_mem_obj)->gpu_addr;
retval = mm->update_mqd(mm, m, q);
return retval;
}
static void uninit_mqd(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
BUG_ON(!mm || !mqd);
kfd2kgd->free_mem(mm->dev->kgd, (struct kgd_mem *) mqd_mem_obj);
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
}
static void uninit_mqd_sdma(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
BUG_ON(!mm || !mqd);
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
}
static int load_mqd(struct mqd_manager *mm, void *mqd, uint32_t pipe_id,
uint32_t queue_id, uint32_t __user *wptr)
{
return kfd2kgd->hqd_load(mm->dev->kgd, mqd, pipe_id, queue_id, wptr);
}
static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
uint32_t __user *wptr)
{
return kfd2kgd->hqd_sdma_load(mm->dev->kgd, mqd);
}
static int update_mqd(struct mqd_manager *mm, void *mqd,
......@@ -170,6 +210,41 @@ static int update_mqd(struct mqd_manager *mm, void *mqd,
return 0;
}
static int update_mqd_sdma(struct mqd_manager *mm, void *mqd,
struct queue_properties *q)
{
struct cik_sdma_rlc_registers *m;
BUG_ON(!mm || !mqd || !q);
m = get_sdma_mqd(mqd);
m->sdma_rlc_rb_cntl =
SDMA_RB_SIZE((ffs(q->queue_size / sizeof(unsigned int)))) |
SDMA_RB_VMID(q->vmid) |
SDMA_RPTR_WRITEBACK_ENABLE |
SDMA_RPTR_WRITEBACK_TIMER(6);
m->sdma_rlc_rb_base = lower_32_bits(q->queue_address >> 8);
m->sdma_rlc_rb_base_hi = upper_32_bits(q->queue_address >> 8);
m->sdma_rlc_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
m->sdma_rlc_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
m->sdma_rlc_doorbell = SDMA_OFFSET(q->doorbell_off) | SDMA_DB_ENABLE;
m->sdma_rlc_virtual_addr = q->sdma_vm_addr;
m->sdma_engine_id = q->sdma_engine_id;
m->sdma_queue_id = q->sdma_queue_id;
q->is_active = false;
if (q->queue_size > 0 &&
q->queue_address != 0 &&
q->queue_percent > 0) {
m->sdma_rlc_rb_cntl |= SDMA_RB_ENABLE;
q->is_active = true;
}
return 0;
}
static int destroy_mqd(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
......@@ -179,6 +254,18 @@ static int destroy_mqd(struct mqd_manager *mm, void *mqd,
pipe_id, queue_id);
}
/*
* preempt type here is ignored because there is only one way
* to preempt sdma queue
*/
static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id)
{
return kfd2kgd->hqd_sdma_destroy(mm->dev->kgd, mqd, timeout);
}
static bool is_occupied(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id)
......@@ -189,6 +276,13 @@ static bool is_occupied(struct mqd_manager *mm, void *mqd,
}
static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id)
{
return kfd2kgd->hqd_sdma_is_occupied(mm->dev->kgd, mqd);
}
/*
* HIQ MQD Implementation, concrete implementation for HIQ MQD implementation.
* The HIQ queue in Kaveri is using the same MQD structure as all the user mode
......@@ -207,11 +301,8 @@ static int init_mqd_hiq(struct mqd_manager *mm, void **mqd,
pr_debug("kfd: In func %s\n", __func__);
retval = kfd2kgd->allocate_mem(mm->dev->kgd,
sizeof(struct cik_mqd),
256,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) mqd_mem_obj);
retval = kfd_gtt_sa_allocate(mm->dev, sizeof(struct cik_mqd),
mqd_mem_obj);
if (retval != 0)
return -ENOMEM;
......@@ -301,6 +392,21 @@ static int update_mqd_hiq(struct mqd_manager *mm, void *mqd,
return 0;
}
/*
* SDMA MQD Implementation
*/
struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
{
struct cik_sdma_rlc_registers *m;
BUG_ON(!mqd);
m = (struct cik_sdma_rlc_registers *)mqd;
return m;
}
struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
struct kfd_dev *dev)
{
......@@ -335,6 +441,14 @@ struct mqd_manager *mqd_manager_init(enum KFD_MQD_TYPE type,
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
break;
case KFD_MQD_TYPE_CIK_SDMA:
mqd->init_mqd = init_mqd_sdma;
mqd->uninit_mqd = uninit_mqd_sdma;
mqd->load_mqd = load_mqd_sdma;
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
break;
default:
kfree(mqd);
return NULL;
......
......@@ -97,11 +97,8 @@ static int pm_allocate_runlist_ib(struct packet_manager *pm,
pm_calc_rlib_size(pm, rl_buffer_size, is_over_subscription);
retval = kfd2kgd->allocate_mem(pm->dqm->dev->kgd,
*rl_buffer_size,
PAGE_SIZE,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE,
(struct kgd_mem **) &pm->ib_buffer_obj);
retval = kfd_gtt_sa_allocate(pm->dqm->dev, *rl_buffer_size,
&pm->ib_buffer_obj);
if (retval != 0) {
pr_err("kfd: failed to allocate runlist IB\n");
......@@ -557,8 +554,7 @@ void pm_release_ib(struct packet_manager *pm)
mutex_lock(&pm->lock);
if (pm->allocated) {
kfd2kgd->free_mem(pm->dqm->dev->kgd,
(struct kgd_mem *) pm->ib_buffer_obj);
kfd_gtt_sa_free(pm->dqm->dev, pm->ib_buffer_obj);
pm->allocated = false;
}
mutex_unlock(&pm->lock);
......
......@@ -107,9 +107,17 @@ enum cache_policy {
struct kfd_device_info {
unsigned int max_pasid_bits;
size_t ih_ring_entry_size;
uint8_t num_of_watch_points;
uint16_t mqd_size_aligned;
};
struct kfd_mem_obj {
uint32_t range_start;
uint32_t range_end;
uint64_t gpu_addr;
uint32_t *cpu_ptr;
};
struct kfd_dev {
struct kgd_dev *kgd;
......@@ -135,6 +143,14 @@ struct kfd_dev {
struct kgd2kfd_shared_resources shared_resources;
void *gtt_mem;
uint64_t gtt_start_gpu_addr;
void *gtt_start_cpu_ptr;
void *gtt_sa_bitmap;
struct mutex gtt_sa_lock;
unsigned int gtt_sa_chunk_size;
unsigned int gtt_sa_num_of_chunks;
void *interrupt_ring;
size_t interrupt_ring_size;
atomic_t interrupt_ring_rptr;
......@@ -162,12 +178,6 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd);
extern const struct kfd2kgd_calls *kfd2kgd;
struct kfd_mem_obj {
void *bo;
uint64_t gpu_addr;
uint32_t *cpu_ptr;
};
enum kfd_mempool {
KFD_MEMPOOL_SYSTEM_CACHEABLE = 1,
KFD_MEMPOOL_SYSTEM_WRITECOMBINE = 2,
......@@ -285,6 +295,10 @@ struct queue_properties {
bool is_active;
/* Not relevant for user mode queues in cp scheduling */
unsigned int vmid;
/* Relevant only for sdma queues*/
uint32_t sdma_engine_id;
uint32_t sdma_queue_id;
uint32_t sdma_vm_addr;
};
/**
......@@ -327,6 +341,8 @@ struct queue {
uint32_t pipe;
uint32_t queue;
unsigned int sdma_id;
struct kfd_process *process;
struct kfd_dev *device;
};
......@@ -472,8 +488,9 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
struct kfd_process *p);
void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid);
struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
struct kfd_process *p,
int create_pdd);
struct kfd_process *p);
struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
struct kfd_process *p);
/* Process device data iterator */
struct kfd_process_device *kfd_get_first_process_device_data(struct kfd_process *p);
......@@ -501,6 +518,13 @@ unsigned int kfd_queue_id_to_doorbell(struct kfd_dev *kfd,
struct kfd_process *process,
unsigned int queue_id);
/* GTT Sub-Allocator */
int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size,
struct kfd_mem_obj **mem_obj);
int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj);
extern struct device *kfd_device;
/* Topology */
......@@ -528,6 +552,8 @@ int kfd_init_apertures(struct kfd_process *process);
/* Queue Context Management */
inline uint32_t lower_32(uint64_t x);
inline uint32_t upper_32(uint64_t x);
struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd);
inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m);
int init_queue(struct queue **q, struct queue_properties properties);
void uninit_queue(struct queue *q);
......
......@@ -311,24 +311,29 @@ static struct kfd_process *create_process(const struct task_struct *thread)
}
struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
struct kfd_process *p,
int create_pdd)
struct kfd_process *p)
{
struct kfd_process_device *pdd = NULL;
list_for_each_entry(pdd, &p->per_device_data, per_device_list)
if (pdd->dev == dev)
return pdd;
if (create_pdd) {
pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
if (pdd != NULL) {
pdd->dev = dev;
INIT_LIST_HEAD(&pdd->qpd.queues_list);
INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
pdd->qpd.dqm = dev->dqm;
list_add(&pdd->per_device_list, &p->per_device_data);
}
break;
return pdd;
}
struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
struct kfd_process *p)
{
struct kfd_process_device *pdd = NULL;
pdd = kzalloc(sizeof(*pdd), GFP_KERNEL);
if (pdd != NULL) {
pdd->dev = dev;
INIT_LIST_HEAD(&pdd->qpd.queues_list);
INIT_LIST_HEAD(&pdd->qpd.priv_queue_list);
pdd->qpd.dqm = dev->dqm;
list_add(&pdd->per_device_list, &p->per_device_data);
}
return pdd;
......@@ -344,11 +349,14 @@ struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
struct kfd_process *p)
{
struct kfd_process_device *pdd = kfd_get_process_device_data(dev, p, 1);
struct kfd_process_device *pdd;
int err;
if (pdd == NULL)
pdd = kfd_get_process_device_data(dev, p);
if (!pdd) {
pr_err("Process device data doesn't exist\n");
return ERR_PTR(-ENOMEM);
}
if (pdd->bound)
return pdd;
......@@ -384,7 +392,7 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
pqm_uninit(&p->pqm);
pdd = kfd_get_process_device_data(dev, p, 0);
pdd = kfd_get_process_device_data(dev, p);
/*
* Just mark pdd as unbound, because we still need it to call
......
......@@ -128,7 +128,6 @@ static int create_cp_queue(struct process_queue_manager *pqm,
/* let DQM handle it*/
q_properties->vmid = 0;
q_properties->queue_id = qid;
q_properties->type = KFD_QUEUE_TYPE_COMPUTE;
retval = init_queue(q, *q_properties);
if (retval != 0)
......@@ -167,8 +166,11 @@ int pqm_create_queue(struct process_queue_manager *pqm,
q = NULL;
kq = NULL;
pdd = kfd_get_process_device_data(dev, pqm->process, 1);
BUG_ON(!pdd);
pdd = kfd_get_process_device_data(dev, pqm->process);
if (!pdd) {
pr_err("Process device data doesn't exist\n");
return -1;
}
retval = find_available_queue_slot(pqm, qid);
if (retval != 0)
......@@ -186,6 +188,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
}
switch (type) {
case KFD_QUEUE_TYPE_SDMA:
case KFD_QUEUE_TYPE_COMPUTE:
/* check if there is over subscription */
if ((sched_policy == KFD_SCHED_POLICY_HWS_NO_OVERSUBSCRIPTION) &&
......@@ -273,8 +276,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
dev = pqn->q->device;
BUG_ON(!dev);
pdd = kfd_get_process_device_data(dev, pqm->process, 1);
BUG_ON(!pdd);
pdd = kfd_get_process_device_data(dev, pqm->process);
if (!pdd) {
pr_err("Process device data doesn't exist\n");
return -1;
}
if (pqn->kq) {
/* destroy kernel queue (DIQ) */
......
......@@ -27,6 +27,7 @@
#include <linux/acpi.h>
#include <linux/hash.h>
#include <linux/cpufreq.h>
#include <linux/log2.h>
#include "kfd_priv.h"
#include "kfd_crat.h"
......@@ -630,10 +631,10 @@ static struct kobj_type cache_type = {
static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
char *buffer)
{
ssize_t ret;
struct kfd_topology_device *dev;
char public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE];
uint32_t i;
uint32_t log_max_watch_addr;
/* Making sure that the buffer is an empty string */
buffer[0] = 0;
......@@ -641,8 +642,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
if (strcmp(attr->name, "gpu_id") == 0) {
dev = container_of(attr, struct kfd_topology_device,
attr_gpuid);
ret = sysfs_show_32bit_val(buffer, dev->gpu_id);
} else if (strcmp(attr->name, "name") == 0) {
return sysfs_show_32bit_val(buffer, dev->gpu_id);
}
if (strcmp(attr->name, "name") == 0) {
dev = container_of(attr, struct kfd_topology_device,
attr_name);
for (i = 0; i < KFD_TOPOLOGY_PUBLIC_NAME_SIZE; i++) {
......@@ -652,80 +655,90 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
break;
}
public_name[KFD_TOPOLOGY_PUBLIC_NAME_SIZE-1] = 0x0;
ret = sysfs_show_str_val(buffer, public_name);
} else {
dev = container_of(attr, struct kfd_topology_device,
attr_props);
sysfs_show_32bit_prop(buffer, "cpu_cores_count",
dev->node_props.cpu_cores_count);
sysfs_show_32bit_prop(buffer, "simd_count",
dev->node_props.simd_count);
if (dev->mem_bank_count < dev->node_props.mem_banks_count) {
pr_warn("kfd: mem_banks_count truncated from %d to %d\n",
dev->node_props.mem_banks_count,
dev->mem_bank_count);
sysfs_show_32bit_prop(buffer, "mem_banks_count",
dev->mem_bank_count);
} else {
sysfs_show_32bit_prop(buffer, "mem_banks_count",
dev->node_props.mem_banks_count);
}
return sysfs_show_str_val(buffer, public_name);
}
sysfs_show_32bit_prop(buffer, "caches_count",
dev->node_props.caches_count);
sysfs_show_32bit_prop(buffer, "io_links_count",
dev->node_props.io_links_count);
sysfs_show_32bit_prop(buffer, "cpu_core_id_base",
dev->node_props.cpu_core_id_base);
sysfs_show_32bit_prop(buffer, "simd_id_base",
dev->node_props.simd_id_base);
sysfs_show_32bit_prop(buffer, "capability",
dev->node_props.capability);
sysfs_show_32bit_prop(buffer, "max_waves_per_simd",
dev->node_props.max_waves_per_simd);
sysfs_show_32bit_prop(buffer, "lds_size_in_kb",
dev->node_props.lds_size_in_kb);
sysfs_show_32bit_prop(buffer, "gds_size_in_kb",
dev->node_props.gds_size_in_kb);
sysfs_show_32bit_prop(buffer, "wave_front_size",
dev->node_props.wave_front_size);
sysfs_show_32bit_prop(buffer, "array_count",
dev->node_props.array_count);
sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine",
dev->node_props.simd_arrays_per_engine);
sysfs_show_32bit_prop(buffer, "cu_per_simd_array",
dev->node_props.cu_per_simd_array);
sysfs_show_32bit_prop(buffer, "simd_per_cu",
dev->node_props.simd_per_cu);
sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu",
dev->node_props.max_slots_scratch_cu);
sysfs_show_32bit_prop(buffer, "vendor_id",
dev->node_props.vendor_id);
sysfs_show_32bit_prop(buffer, "device_id",
dev->node_props.device_id);
sysfs_show_32bit_prop(buffer, "location_id",
dev->node_props.location_id);
if (dev->gpu) {
sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute",
kfd2kgd->get_max_engine_clock_in_mhz(
dev->gpu->kgd));
sysfs_show_64bit_prop(buffer, "local_mem_size",
kfd2kgd->get_vmem_size(dev->gpu->kgd));
sysfs_show_32bit_prop(buffer, "fw_version",
kfd2kgd->get_fw_version(
dev->gpu->kgd,
KGD_ENGINE_MEC1));
dev = container_of(attr, struct kfd_topology_device,
attr_props);
sysfs_show_32bit_prop(buffer, "cpu_cores_count",
dev->node_props.cpu_cores_count);
sysfs_show_32bit_prop(buffer, "simd_count",
dev->node_props.simd_count);
if (dev->mem_bank_count < dev->node_props.mem_banks_count) {
pr_warn("kfd: mem_banks_count truncated from %d to %d\n",
dev->node_props.mem_banks_count,
dev->mem_bank_count);
sysfs_show_32bit_prop(buffer, "mem_banks_count",
dev->mem_bank_count);
} else {
sysfs_show_32bit_prop(buffer, "mem_banks_count",
dev->node_props.mem_banks_count);
}
sysfs_show_32bit_prop(buffer, "caches_count",
dev->node_props.caches_count);
sysfs_show_32bit_prop(buffer, "io_links_count",
dev->node_props.io_links_count);
sysfs_show_32bit_prop(buffer, "cpu_core_id_base",
dev->node_props.cpu_core_id_base);
sysfs_show_32bit_prop(buffer, "simd_id_base",
dev->node_props.simd_id_base);
sysfs_show_32bit_prop(buffer, "capability",
dev->node_props.capability);
sysfs_show_32bit_prop(buffer, "max_waves_per_simd",
dev->node_props.max_waves_per_simd);
sysfs_show_32bit_prop(buffer, "lds_size_in_kb",
dev->node_props.lds_size_in_kb);
sysfs_show_32bit_prop(buffer, "gds_size_in_kb",
dev->node_props.gds_size_in_kb);
sysfs_show_32bit_prop(buffer, "wave_front_size",
dev->node_props.wave_front_size);
sysfs_show_32bit_prop(buffer, "array_count",
dev->node_props.array_count);
sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine",
dev->node_props.simd_arrays_per_engine);
sysfs_show_32bit_prop(buffer, "cu_per_simd_array",
dev->node_props.cu_per_simd_array);
sysfs_show_32bit_prop(buffer, "simd_per_cu",
dev->node_props.simd_per_cu);
sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu",
dev->node_props.max_slots_scratch_cu);
sysfs_show_32bit_prop(buffer, "vendor_id",
dev->node_props.vendor_id);
sysfs_show_32bit_prop(buffer, "device_id",
dev->node_props.device_id);
sysfs_show_32bit_prop(buffer, "location_id",
dev->node_props.location_id);
if (dev->gpu) {
log_max_watch_addr =
__ilog2_u32(dev->gpu->device_info->num_of_watch_points);
if (log_max_watch_addr) {
dev->node_props.capability |=
HSA_CAP_WATCH_POINTS_SUPPORTED;
dev->node_props.capability |=
((log_max_watch_addr <<
HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT) &
HSA_CAP_WATCH_POINTS_TOTALBITS_MASK);
}
ret = sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute",
cpufreq_quick_get_max(0)/1000);
sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute",
kfd2kgd->get_max_engine_clock_in_mhz(
dev->gpu->kgd));
sysfs_show_64bit_prop(buffer, "local_mem_size",
kfd2kgd->get_vmem_size(dev->gpu->kgd));
sysfs_show_32bit_prop(buffer, "fw_version",
kfd2kgd->get_fw_version(
dev->gpu->kgd,
KGD_ENGINE_MEC1));
}
return ret;
return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute",
cpufreq_quick_get_max(0)/1000);
}
static const struct sysfs_ops node_ops = {
......
......@@ -110,17 +110,10 @@ struct kgd2kfd_calls {
/**
* struct kfd2kgd_calls
*
* @init_sa_manager: Initialize an instance of the sa manager, used by
* amdkfd for all system memory allocations that are mapped to the GART
* address space
* @init_gtt_mem_allocation: Allocate a buffer on the gart aperture.
* The buffer can be used for mqds, hpds, kernel queue, fence and runlists
*
* @fini_sa_manager: Releases all memory allocations for amdkfd that are
* handled by kgd sa manager
*
* @allocate_mem: Allocate a buffer from amdkfd's sa manager. The buffer can
* be used for mqds, hpds, kernel queue, fence and runlists
*
* @free_mem: Frees a buffer that was allocated by amdkfd's sa manager
* @free_gtt_mem: Frees a buffer that was allocated on the gart aperture
*
* @get_vmem_size: Retrieves (physical) size of VRAM
*
......@@ -144,10 +137,18 @@ struct kgd2kfd_calls {
* @hqd_load: Loads the mqd structure to a H/W hqd slot. used only for no cp
* sceduling mode.
*
* @hqd_sdma_load: Loads the SDMA mqd structure to a H/W SDMA hqd slot.
* used only for no HWS mode.
*
* @hqd_is_occupies: Checks if a hqd slot is occupied.
*
* @hqd_destroy: Destructs and preempts the queue assigned to that hqd slot.
*
* @hqd_sdma_is_occupied: Checks if an SDMA hqd slot is occupied.
*
* @hqd_sdma_destroy: Destructs and preempts the SDMA queue assigned to that
* SDMA hqd slot.
*
* @get_fw_version: Returns FW versions from the header
*
* This structure contains function pointers to services that the kgd driver
......@@ -155,13 +156,11 @@ struct kgd2kfd_calls {
*
*/
struct kfd2kgd_calls {
/* Memory management. */
int (*init_sa_manager)(struct kgd_dev *kgd, unsigned int size);
void (*fini_sa_manager)(struct kgd_dev *kgd);
int (*allocate_mem)(struct kgd_dev *kgd, size_t size, size_t alignment,
enum kgd_memory_pool pool, struct kgd_mem **mem);
int (*init_gtt_mem_allocation)(struct kgd_dev *kgd, size_t size,
void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr);
void (*free_mem)(struct kgd_dev *kgd, struct kgd_mem *mem);
void (*free_gtt_mem)(struct kgd_dev *kgd, void *mem_obj);
uint64_t (*get_vmem_size)(struct kgd_dev *kgd);
uint64_t (*get_gpu_clock_counter)(struct kgd_dev *kgd);
......@@ -183,18 +182,26 @@ struct kfd2kgd_calls {
int (*hqd_load)(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
uint32_t queue_id, uint32_t __user *wptr);
int (*hqd_sdma_load)(struct kgd_dev *kgd, void *mqd);
bool (*hqd_is_occupies)(struct kgd_dev *kgd, uint64_t queue_address,
uint32_t pipe_id, uint32_t queue_id);
int (*hqd_destroy)(struct kgd_dev *kgd, uint32_t reset_type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id);
bool (*hqd_sdma_is_occupied)(struct kgd_dev *kgd, void *mqd);
int (*hqd_sdma_destroy)(struct kgd_dev *kgd, void *mqd,
unsigned int timeout);
uint16_t (*get_fw_version)(struct kgd_dev *kgd,
enum kgd_engine_type type);
};
bool kgd2kfd_init(unsigned interface_version,
const struct kfd2kgd_calls *f2g,
const struct kgd2kfd_calls **g2f);
const struct kfd2kgd_calls *f2g,
const struct kgd2kfd_calls **g2f);
#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
#endif /* KGD_KFD_INTERFACE_H_INCLUDED */
......@@ -147,10 +147,42 @@
#define CIK_LB_DESKTOP_HEIGHT 0x6b0c
#define KFD_CIK_SDMA_QUEUE_OFFSET 0x200
#define CP_HQD_IQ_RPTR 0xC970u
#define AQL_ENABLE (1U << 0)
#define IDLE (1 << 2)
#define SDMA0_RLC0_RB_CNTL 0xD400u
#define SDMA_RB_VMID(x) (x << 24)
#define SDMA0_RLC0_RB_BASE 0xD404u
#define SDMA0_RLC0_RB_BASE_HI 0xD408u
#define SDMA0_RLC0_RB_RPTR 0xD40Cu
#define SDMA0_RLC0_RB_WPTR 0xD410u
#define SDMA0_RLC0_RB_WPTR_POLL_CNTL 0xD414u
#define SDMA0_RLC0_RB_WPTR_POLL_ADDR_HI 0xD418u
#define SDMA0_RLC0_RB_WPTR_POLL_ADDR_LO 0xD41Cu
#define SDMA0_RLC0_RB_RPTR_ADDR_HI 0xD420u
#define SDMA0_RLC0_RB_RPTR_ADDR_LO 0xD424u
#define SDMA0_RLC0_IB_CNTL 0xD428u
#define SDMA0_RLC0_IB_RPTR 0xD42Cu
#define SDMA0_RLC0_IB_OFFSET 0xD430u
#define SDMA0_RLC0_IB_BASE_LO 0xD434u
#define SDMA0_RLC0_IB_BASE_HI 0xD438u
#define SDMA0_RLC0_IB_SIZE 0xD43Cu
#define SDMA0_RLC0_SKIP_CNTL 0xD440u
#define SDMA0_RLC0_CONTEXT_STATUS 0xD444u
#define SDMA_RLC_IDLE (1 << 2)
#define SDMA0_RLC0_DOORBELL 0xD448u
#define SDMA_OFFSET(x) (x << 0)
#define SDMA_DB_ENABLE (1 << 28)
#define SDMA0_RLC0_VIRTUAL_ADDR 0xD49Cu
#define SDMA_ATC (1 << 0)
#define SDMA_VA_PTR32 (1 << 4)
#define SDMA_VA_SHARED_BASE(x) (x << 8)
#define SDMA0_RLC0_APE1_CNTL 0xD4A0u
#define SDMA0_RLC0_DOORBELL_LOG 0xD4A4u
#define SDMA0_RLC0_WATERMARK 0xD4A8u
#define SDMA0_CNTL 0xD010
#define SDMA1_CNTL 0xD810
struct cik_mqd {
uint32_t header;
......@@ -283,4 +315,137 @@ struct cik_mqd {
uint32_t queue_doorbell_id15;
};
struct cik_sdma_rlc_registers {
uint32_t sdma_rlc_rb_cntl;
uint32_t sdma_rlc_rb_base;
uint32_t sdma_rlc_rb_base_hi;
uint32_t sdma_rlc_rb_rptr;
uint32_t sdma_rlc_rb_wptr;
uint32_t sdma_rlc_rb_wptr_poll_cntl;
uint32_t sdma_rlc_rb_wptr_poll_addr_hi;
uint32_t sdma_rlc_rb_wptr_poll_addr_lo;
uint32_t sdma_rlc_rb_rptr_addr_hi;
uint32_t sdma_rlc_rb_rptr_addr_lo;
uint32_t sdma_rlc_ib_cntl;
uint32_t sdma_rlc_ib_rptr;
uint32_t sdma_rlc_ib_offset;
uint32_t sdma_rlc_ib_base_lo;
uint32_t sdma_rlc_ib_base_hi;
uint32_t sdma_rlc_ib_size;
uint32_t sdma_rlc_skip_cntl;
uint32_t sdma_rlc_context_status;
uint32_t sdma_rlc_doorbell;
uint32_t sdma_rlc_virtual_addr;
uint32_t sdma_rlc_ape1_cntl;
uint32_t sdma_rlc_doorbell_log;
uint32_t reserved_22;
uint32_t reserved_23;
uint32_t reserved_24;
uint32_t reserved_25;
uint32_t reserved_26;
uint32_t reserved_27;
uint32_t reserved_28;
uint32_t reserved_29;
uint32_t reserved_30;
uint32_t reserved_31;
uint32_t reserved_32;
uint32_t reserved_33;
uint32_t reserved_34;
uint32_t reserved_35;
uint32_t reserved_36;
uint32_t reserved_37;
uint32_t reserved_38;
uint32_t reserved_39;
uint32_t reserved_40;
uint32_t reserved_41;
uint32_t reserved_42;
uint32_t reserved_43;
uint32_t reserved_44;
uint32_t reserved_45;
uint32_t reserved_46;
uint32_t reserved_47;
uint32_t reserved_48;
uint32_t reserved_49;
uint32_t reserved_50;
uint32_t reserved_51;
uint32_t reserved_52;
uint32_t reserved_53;
uint32_t reserved_54;
uint32_t reserved_55;
uint32_t reserved_56;
uint32_t reserved_57;
uint32_t reserved_58;
uint32_t reserved_59;
uint32_t reserved_60;
uint32_t reserved_61;
uint32_t reserved_62;
uint32_t reserved_63;
uint32_t reserved_64;
uint32_t reserved_65;
uint32_t reserved_66;
uint32_t reserved_67;
uint32_t reserved_68;
uint32_t reserved_69;
uint32_t reserved_70;
uint32_t reserved_71;
uint32_t reserved_72;
uint32_t reserved_73;
uint32_t reserved_74;
uint32_t reserved_75;
uint32_t reserved_76;
uint32_t reserved_77;
uint32_t reserved_78;
uint32_t reserved_79;
uint32_t reserved_80;
uint32_t reserved_81;
uint32_t reserved_82;
uint32_t reserved_83;
uint32_t reserved_84;
uint32_t reserved_85;
uint32_t reserved_86;
uint32_t reserved_87;
uint32_t reserved_88;
uint32_t reserved_89;
uint32_t reserved_90;
uint32_t reserved_91;
uint32_t reserved_92;
uint32_t reserved_93;
uint32_t reserved_94;
uint32_t reserved_95;
uint32_t reserved_96;
uint32_t reserved_97;
uint32_t reserved_98;
uint32_t reserved_99;
uint32_t reserved_100;
uint32_t reserved_101;
uint32_t reserved_102;
uint32_t reserved_103;
uint32_t reserved_104;
uint32_t reserved_105;
uint32_t reserved_106;
uint32_t reserved_107;
uint32_t reserved_108;
uint32_t reserved_109;
uint32_t reserved_110;
uint32_t reserved_111;
uint32_t reserved_112;
uint32_t reserved_113;
uint32_t reserved_114;
uint32_t reserved_115;
uint32_t reserved_116;
uint32_t reserved_117;
uint32_t reserved_118;
uint32_t reserved_119;
uint32_t reserved_120;
uint32_t reserved_121;
uint32_t reserved_122;
uint32_t reserved_123;
uint32_t reserved_124;
uint32_t reserved_125;
uint32_t reserved_126;
uint32_t reserved_127;
uint32_t sdma_engine_id;
uint32_t sdma_queue_id;
};
#endif
......@@ -282,6 +282,33 @@ static void cik_sdma_rlc_stop(struct radeon_device *rdev)
/* XXX todo */
}
/**
* cik_sdma_ctx_switch_enable - enable/disable sdma engine preemption
*
* @rdev: radeon_device pointer
* @enable: enable/disable preemption.
*
* Halt or unhalt the async dma engines (CIK).
*/
void cik_sdma_ctx_switch_enable(struct radeon_device *rdev, bool enable)
{
uint32_t reg_offset, value;
int i;
for (i = 0; i < 2; i++) {
if (i == 0)
reg_offset = SDMA0_REGISTER_OFFSET;
else
reg_offset = SDMA1_REGISTER_OFFSET;
value = RREG32(SDMA0_CNTL + reg_offset);
if (enable)
value |= AUTO_CTXSW_ENABLE;
else
value &= ~AUTO_CTXSW_ENABLE;
WREG32(SDMA0_CNTL + reg_offset, value);
}
}
/**
* cik_sdma_enable - stop the async dma engines
*
......@@ -312,6 +339,8 @@ void cik_sdma_enable(struct radeon_device *rdev, bool enable)
me_cntl |= SDMA_HALT;
WREG32(SDMA0_ME_CNTL + reg_offset, me_cntl);
}
cik_sdma_ctx_switch_enable(rdev, enable);
}
/**
......
......@@ -34,18 +34,17 @@
#define CIK_PIPE_PER_MEC (4)
struct kgd_mem {
struct radeon_sa_bo *sa_bo;
struct radeon_bo *bo;
uint64_t gpu_addr;
void *ptr;
void *cpu_ptr;
};
static int init_sa_manager(struct kgd_dev *kgd, unsigned int size);
static void fini_sa_manager(struct kgd_dev *kgd);
static int allocate_mem(struct kgd_dev *kgd, size_t size, size_t alignment,
enum kgd_memory_pool pool, struct kgd_mem **mem);
static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr);
static void free_mem(struct kgd_dev *kgd, struct kgd_mem *mem);
static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj);
static uint64_t get_vmem_size(struct kgd_dev *kgd);
static uint64_t get_gpu_clock_counter(struct kgd_dev *kgd);
......@@ -71,19 +70,20 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
uint32_t queue_id, uint32_t __user *wptr);
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd);
static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address,
uint32_t pipe_id, uint32_t queue_id);
static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id);
static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd);
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
unsigned int timeout);
static const struct kfd2kgd_calls kfd2kgd = {
.init_sa_manager = init_sa_manager,
.fini_sa_manager = fini_sa_manager,
.allocate_mem = allocate_mem,
.free_mem = free_mem,
.init_gtt_mem_allocation = alloc_gtt_mem,
.free_gtt_mem = free_gtt_mem,
.get_vmem_size = get_vmem_size,
.get_gpu_clock_counter = get_gpu_clock_counter,
.get_max_engine_clock_in_mhz = get_max_engine_clock_in_mhz,
......@@ -92,8 +92,11 @@ static const struct kfd2kgd_calls kfd2kgd = {
.init_memory = kgd_init_memory,
.init_pipeline = kgd_init_pipeline,
.hqd_load = kgd_hqd_load,
.hqd_sdma_load = kgd_hqd_sdma_load,
.hqd_is_occupies = kgd_hqd_is_occupies,
.hqd_sdma_is_occupied = kgd_hqd_sdma_is_occupied,
.hqd_destroy = kgd_hqd_destroy,
.hqd_sdma_destroy = kgd_hqd_sdma_destroy,
.get_fw_version = get_fw_version
};
......@@ -182,87 +185,78 @@ int radeon_kfd_resume(struct radeon_device *rdev)
return r;
}
static u32 pool_to_domain(enum kgd_memory_pool p)
{
switch (p) {
case KGD_POOL_FRAMEBUFFER: return RADEON_GEM_DOMAIN_VRAM;
default: return RADEON_GEM_DOMAIN_GTT;
}
}
static int init_sa_manager(struct kgd_dev *kgd, unsigned int size)
static int alloc_gtt_mem(struct kgd_dev *kgd, size_t size,
void **mem_obj, uint64_t *gpu_addr,
void **cpu_ptr)
{
struct radeon_device *rdev = (struct radeon_device *)kgd;
struct kgd_mem **mem = (struct kgd_mem **) mem_obj;
int r;
BUG_ON(kgd == NULL);
BUG_ON(gpu_addr == NULL);
BUG_ON(cpu_ptr == NULL);
r = radeon_sa_bo_manager_init(rdev, &rdev->kfd_bo,
size,
RADEON_GPU_PAGE_SIZE,
RADEON_GEM_DOMAIN_GTT,
RADEON_GEM_GTT_WC);
*mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
if ((*mem) == NULL)
return -ENOMEM;
if (r)
r = radeon_bo_create(rdev, size, PAGE_SIZE, true, RADEON_GEM_DOMAIN_GTT,
RADEON_GEM_GTT_WC, NULL, NULL, &(*mem)->bo);
if (r) {
dev_err(rdev->dev,
"failed to allocate BO for amdkfd (%d)\n", r);
return r;
}
r = radeon_sa_bo_manager_start(rdev, &rdev->kfd_bo);
if (r)
radeon_sa_bo_manager_fini(rdev, &rdev->kfd_bo);
return r;
}
static void fini_sa_manager(struct kgd_dev *kgd)
{
struct radeon_device *rdev = (struct radeon_device *)kgd;
BUG_ON(kgd == NULL);
radeon_sa_bo_manager_suspend(rdev, &rdev->kfd_bo);
radeon_sa_bo_manager_fini(rdev, &rdev->kfd_bo);
}
static int allocate_mem(struct kgd_dev *kgd, size_t size, size_t alignment,
enum kgd_memory_pool pool, struct kgd_mem **mem)
{
struct radeon_device *rdev = (struct radeon_device *)kgd;
u32 domain;
int r;
BUG_ON(kgd == NULL);
domain = pool_to_domain(pool);
if (domain != RADEON_GEM_DOMAIN_GTT) {
dev_err(rdev->dev,
"Only allowed to allocate gart memory for kfd\n");
return -EINVAL;
/* map the buffer */
r = radeon_bo_reserve((*mem)->bo, true);
if (r) {
dev_err(rdev->dev, "(%d) failed to reserve bo for amdkfd\n", r);
goto allocate_mem_reserve_bo_failed;
}
*mem = kmalloc(sizeof(struct kgd_mem), GFP_KERNEL);
if ((*mem) == NULL)
return -ENOMEM;
r = radeon_bo_pin((*mem)->bo, RADEON_GEM_DOMAIN_GTT,
&(*mem)->gpu_addr);
if (r) {
dev_err(rdev->dev, "(%d) failed to pin bo for amdkfd\n", r);
goto allocate_mem_pin_bo_failed;
}
*gpu_addr = (*mem)->gpu_addr;
r = radeon_sa_bo_new(rdev, &rdev->kfd_bo, &(*mem)->sa_bo, size,
alignment);
r = radeon_bo_kmap((*mem)->bo, &(*mem)->cpu_ptr);
if (r) {
dev_err(rdev->dev, "failed to get memory for kfd (%d)\n", r);
return r;
dev_err(rdev->dev,
"(%d) failed to map bo to kernel for amdkfd\n", r);
goto allocate_mem_kmap_bo_failed;
}
*cpu_ptr = (*mem)->cpu_ptr;
(*mem)->ptr = radeon_sa_bo_cpu_addr((*mem)->sa_bo);
(*mem)->gpu_addr = radeon_sa_bo_gpu_addr((*mem)->sa_bo);
radeon_bo_unreserve((*mem)->bo);
return 0;
allocate_mem_kmap_bo_failed:
radeon_bo_unpin((*mem)->bo);
allocate_mem_pin_bo_failed:
radeon_bo_unreserve((*mem)->bo);
allocate_mem_reserve_bo_failed:
radeon_bo_unref(&(*mem)->bo);
return r;
}
static void free_mem(struct kgd_dev *kgd, struct kgd_mem *mem)
static void free_gtt_mem(struct kgd_dev *kgd, void *mem_obj)
{
struct radeon_device *rdev = (struct radeon_device *)kgd;
struct kgd_mem *mem = (struct kgd_mem *) mem_obj;
BUG_ON(kgd == NULL);
BUG_ON(mem == NULL);
radeon_sa_bo_free(rdev, &mem->sa_bo, NULL);
radeon_bo_reserve(mem->bo, true);
radeon_bo_kunmap(mem->bo);
radeon_bo_unpin(mem->bo);
radeon_bo_unreserve(mem->bo);
radeon_bo_unref(&(mem->bo));
kfree(mem);
}
......@@ -435,11 +429,28 @@ static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id,
return 0;
}
static inline uint32_t get_sdma_base_addr(struct cik_sdma_rlc_registers *m)
{
uint32_t retval;
retval = m->sdma_engine_id * SDMA1_REGISTER_OFFSET +
m->sdma_queue_id * KFD_CIK_SDMA_QUEUE_OFFSET;
pr_debug("kfd: sdma base address: 0x%x\n", retval);
return retval;
}
static inline struct cik_mqd *get_mqd(void *mqd)
{
return (struct cik_mqd *)mqd;
}
static inline struct cik_sdma_rlc_registers *get_sdma_mqd(void *mqd)
{
return (struct cik_sdma_rlc_registers *)mqd;
}
static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
uint32_t queue_id, uint32_t __user *wptr)
{
......@@ -517,6 +528,45 @@ static int kgd_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id,
return 0;
}
static int kgd_hqd_sdma_load(struct kgd_dev *kgd, void *mqd)
{
struct cik_sdma_rlc_registers *m;
uint32_t sdma_base_addr;
m = get_sdma_mqd(mqd);
sdma_base_addr = get_sdma_base_addr(m);
write_register(kgd,
sdma_base_addr + SDMA0_RLC0_VIRTUAL_ADDR,
m->sdma_rlc_virtual_addr);
write_register(kgd,
sdma_base_addr + SDMA0_RLC0_RB_BASE,
m->sdma_rlc_rb_base);
write_register(kgd,
sdma_base_addr + SDMA0_RLC0_RB_BASE_HI,
m->sdma_rlc_rb_base_hi);
write_register(kgd,
sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_LO,
m->sdma_rlc_rb_rptr_addr_lo);
write_register(kgd,
sdma_base_addr + SDMA0_RLC0_RB_RPTR_ADDR_HI,
m->sdma_rlc_rb_rptr_addr_hi);
write_register(kgd,
sdma_base_addr + SDMA0_RLC0_DOORBELL,
m->sdma_rlc_doorbell);
write_register(kgd,
sdma_base_addr + SDMA0_RLC0_RB_CNTL,
m->sdma_rlc_rb_cntl);
return 0;
}
static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address,
uint32_t pipe_id, uint32_t queue_id)
{
......@@ -538,6 +588,24 @@ static bool kgd_hqd_is_occupies(struct kgd_dev *kgd, uint64_t queue_address,
return retval;
}
static bool kgd_hqd_sdma_is_occupied(struct kgd_dev *kgd, void *mqd)
{
struct cik_sdma_rlc_registers *m;
uint32_t sdma_base_addr;
uint32_t sdma_rlc_rb_cntl;
m = get_sdma_mqd(mqd);
sdma_base_addr = get_sdma_base_addr(m);
sdma_rlc_rb_cntl = read_register(kgd,
sdma_base_addr + SDMA0_RLC0_RB_CNTL);
if (sdma_rlc_rb_cntl & SDMA_RB_ENABLE)
return true;
return false;
}
static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id)
......@@ -566,6 +634,39 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, uint32_t reset_type,
return 0;
}
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
unsigned int timeout)
{
struct cik_sdma_rlc_registers *m;
uint32_t sdma_base_addr;
uint32_t temp;
m = get_sdma_mqd(mqd);
sdma_base_addr = get_sdma_base_addr(m);
temp = read_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL);
temp = temp & ~SDMA_RB_ENABLE;
write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_CNTL, temp);
while (true) {
temp = read_register(kgd, sdma_base_addr +
SDMA0_RLC0_CONTEXT_STATUS);
if (temp & SDMA_RLC_IDLE)
break;
if (timeout == 0)
return -ETIME;
msleep(20);
timeout -= 20;
}
write_register(kgd, sdma_base_addr + SDMA0_RLC0_DOORBELL, 0);
write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_RPTR, 0);
write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_WPTR, 0);
write_register(kgd, sdma_base_addr + SDMA0_RLC0_RB_BASE, 0);
return 0;
}
static uint16_t get_fw_version(struct kgd_dev *kgd, enum kgd_engine_type type)
{
struct radeon_device *rdev = (struct radeon_device *) kgd;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment