Commit 347d81b6 authored by Linus Torvalds

Merge tag 'dma-mapping-5.11' of git://git.infradead.org/users/hch/dma-mapping

Pull dma-mapping updates from Christoph Hellwig:

 - support for a partial IOMMU bypass (Alexey Kardashevskiy)

 - add a DMA API benchmark (Barry Song)

 - misc fixes (Tiezhu Yang, tangjianqiang)

* tag 'dma-mapping-5.11' of git://git.infradead.org/users/hch/dma-mapping:
  selftests/dma: add test application for DMA_MAP_BENCHMARK
  dma-mapping: add benchmark support for streaming DMA APIs
  dma-contiguous: fix a typo error in a comment
  dma-pool: no need to check return value of debugfs_create functions
  powerpc/dma: Fallback to dma_ops when persistent memory present
  dma-mapping: Allow mixing bypass and mapped DMA operation
parents 4f06f210 76793257
......@@ -5297,6 +5297,12 @@ F: include/linux/dma-mapping.h
F: include/linux/dma-map-ops.h
F: kernel/dma/
DMA MAPPING BENCHMARK
M: Barry Song <song.bao.hua@hisilicon.com>
L: iommu@lists.linux-foundation.org
F: kernel/dma/map_benchmark.c
F: tools/testing/selftests/dma/
DMA-BUF HEAPS FRAMEWORK
M: Sumit Semwal <sumit.semwal@linaro.org>
R: Benjamin Gaignard <benjamin.gaignard@linaro.org>
......
......@@ -161,6 +161,7 @@ config PPC
select DCACHE_WORD_ACCESS if PPC64 && CPU_LITTLE_ENDIAN
select DMA_OPS if PPC64
select DMA_OPS_BYPASS if PPC64
select ARCH_HAS_DMA_MAP_DIRECT if PPC64 && PPC_PSERIES
select DYNAMIC_FTRACE if FUNCTION_TRACER
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
......
......@@ -10,6 +10,63 @@
#include <linux/pci.h>
#include <asm/iommu.h>
#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT
/*
 * Evaluates to true when @addr, translated to a bus address with
 * phys_to_dma(), does not exceed the device's bus_dma_limit.
 */
#define can_map_direct(dev, addr) \
	((dev)->bus_dma_limit >= phys_to_dma((dev), (addr)))

/*
 * arch_dma_map_page_direct - may this page mapping use the direct path?
 * @dev:  device the mapping is made for
 * @addr: physical address that is compared against the device limit
 *
 * Return: false when the device has no bus_dma_limit set; otherwise the
 * result of can_map_direct() for @addr.
 */
bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr)
{
	/* A non-zero limit is the rare case; only then is the check needed. */
	if (unlikely(dev->bus_dma_limit))
		return can_map_direct(dev, addr);

	return false;
}
/*
 * Evaluates to true when DMA handle @h is at or above the device's
 * archdata.dma_offset.
 */
#define is_direct_handle(dev, h) ((h) >= (dev)->archdata.dma_offset)

/*
 * arch_dma_unmap_page_direct - was this handle produced by the direct path?
 * @dev:        device the mapping belongs to
 * @dma_handle: DMA address that is compared against archdata.dma_offset
 *
 * Return: false when the device has no bus_dma_limit set; otherwise the
 * result of is_direct_handle() for @dma_handle.
 */
bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle)
{
	/* A non-zero limit is the rare case; only then is the check needed. */
	if (unlikely(dev->bus_dma_limit))
		return is_direct_handle(dev, dma_handle);

	return false;
}
/*
 * arch_dma_map_sg_direct - may a whole scatterlist use the direct path?
 * @dev:   device the mapping is made for
 * @sg:    first scatterlist entry
 * @nents: number of entries to inspect
 *
 * Return: false when the device has no bus_dma_limit set or when any entry
 * fails can_map_direct(); true only if every entry passes.
 */
bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg,
			    int nents)
{
	struct scatterlist *ent;
	int idx;

	if (likely(!dev->bus_dma_limit))
		return false;

	for_each_sg(sg, ent, nents, idx) {
		/*
		 * NOTE(review): if sg_phys() already includes ent->offset,
		 * the offset is counted twice here, which only makes the
		 * check more conservative - confirm before changing.
		 */
		phys_addr_t end = sg_phys(ent) + ent->offset + ent->length;

		if (!can_map_direct(dev, end))
			return false;
	}

	return true;
}
/*
 * arch_dma_unmap_sg_direct - was a whole scatterlist mapped directly?
 * @dev:   device the mapping belongs to
 * @sg:    first scatterlist entry
 * @nents: number of entries to inspect
 *
 * Return: false when the device has no bus_dma_limit set or when any entry
 * fails is_direct_handle(); true only if every entry passes.
 */
bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
			      int nents)
{
	struct scatterlist *ent;
	int idx;

	if (likely(!dev->bus_dma_limit))
		return false;

	for_each_sg(sg, ent, nents, idx) {
		/* The value tested is the entry's DMA address plus its length. */
		dma_addr_t end = ent->dma_address + ent->length;

		if (!is_direct_handle(dev, end))
			return false;
	}

	return true;
}
#endif /* CONFIG_ARCH_HAS_DMA_MAP_DIRECT */
/*
* Generic iommu implementation
*/
......@@ -90,7 +147,17 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
struct iommu_table *tbl = get_iommu_table_base(dev);
if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
dev->dma_ops_bypass = true;
/*
* dma_iommu_bypass_supported() sets dma_max when there is
* 1:1 mapping but it is somehow limited.
* ibm,pmemory is one example.
*/
dev->dma_ops_bypass = dev->bus_dma_limit == 0;
if (!dev->dma_ops_bypass)
dev_warn(dev,
"iommu: 64-bit OK but direct DMA is limited by %llx\n",
dev->bus_dma_limit);
else
dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
return 1;
}
......
......@@ -839,7 +839,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
np, ret);
}
static u64 find_existing_ddw(struct device_node *pdn)
static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
{
struct direct_window *window;
const struct dynamic_dma_window_prop *direct64;
......@@ -851,6 +851,7 @@ static u64 find_existing_ddw(struct device_node *pdn)
if (window->device == pdn) {
direct64 = window->prop;
dma_addr = be64_to_cpu(direct64->dma_base);
*window_shift = be32_to_cpu(direct64->window_shift);
break;
}
}
......@@ -1111,11 +1112,12 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
*/
static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
{
int len, ret;
int len = 0, ret;
int max_ram_len = order_base_2(ddw_memory_hotplug_max());
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
u64 dma_addr, max_addr;
u64 dma_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
......@@ -1123,10 +1125,15 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
bool pmem_present;
dn = of_find_node_by_type(NULL, "ibm,pmemory");
pmem_present = dn != NULL;
of_node_put(dn);
mutex_lock(&direct_window_init_mutex);
dma_addr = find_existing_ddw(pdn);
dma_addr = find_existing_ddw(pdn, &len);
if (dma_addr != 0)
goto out_unlock;
......@@ -1212,14 +1219,29 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
}
/* verify the window * number of ptes will map the partition */
/* check largest block * page size > max memory hotplug addr */
max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
"%llu-sized pages\n", max_addr, query.largest_available_block,
/*
* The "ibm,pmemory" can appear anywhere in the address space.
* Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
* for the upper limit and fallback to max RAM otherwise but this
* disables device::dma_ops_bypass.
*/
len = max_ram_len;
if (pmem_present) {
if (query.largest_available_block >=
(1ULL << (MAX_PHYSMEM_BITS - page_shift)))
len = MAX_PHYSMEM_BITS - page_shift;
else
dev_info(&dev->dev, "Skipping ibm,pmemory");
}
if (query.largest_available_block < (1ULL << (len - page_shift))) {
dev_dbg(&dev->dev,
"can't map partition max 0x%llx with %llu %llu-sized pages\n",
1ULL << len,
query.largest_available_block,
1ULL << page_shift);
goto out_failed;
}
len = order_base_2(max_addr);
win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
if (!win64) {
dev_info(&dev->dev,
......@@ -1299,6 +1321,15 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
out_unlock:
mutex_unlock(&direct_window_init_mutex);
/*
* If we have persistent memory and the window size is only as big
* as RAM, then we failed to create a window to cover persistent
* memory and need to set the DMA limit.
*/
if (pmem_present && dma_addr && (len == max_ram_len))
dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
return dma_addr;
}
......
......@@ -317,6 +317,20 @@ static inline void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
void *arch_dma_set_uncached(void *addr, size_t size);
void arch_dma_clear_uncached(void *addr, size_t size);
#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT
bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr);
bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle);
bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg,
int nents);
bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
int nents);
#else
#define arch_dma_map_page_direct(d, a) (false)
#define arch_dma_unmap_page_direct(d, a) (false)
#define arch_dma_map_sg_direct(d, s, n) (false)
#define arch_dma_unmap_sg_direct(d, s, n) (false)
#endif
#ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
const struct iommu_ops *iommu, bool coherent);
......
......@@ -20,6 +20,10 @@ config DMA_OPS
config DMA_OPS_BYPASS
bool
# Lets platform IOMMU driver choose between bypass and IOMMU
config ARCH_HAS_DMA_MAP_DIRECT
bool
config NEED_SG_DMA_LENGTH
bool
......@@ -220,3 +224,12 @@ config DMA_API_DEBUG_SG
is technically out-of-spec.
If unsure, say N.
config DMA_MAP_BENCHMARK
bool "Enable benchmarking of streaming DMA mapping"
depends on DEBUG_FS
help
Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
performance of dma_(un)map_page.
See tools/testing/selftests/dma/dma_map_benchmark.c
......@@ -9,3 +9,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
obj-$(CONFIG_DMA_REMAP) += remap.o
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
......@@ -20,7 +20,7 @@
* coders, etc.
*
* Such devices often require big memory buffers (a full HD frame
* is, for instance, more then 2 mega pixels large, i.e. more than 6
* is, for instance, more than 2 mega pixels large, i.e. more than 6
* MB of memory), which makes mechanisms such as kmalloc() or
* alloc_page() ineffective.
*
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2020 Hisilicon Limited.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/timekeeping.h>
#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
#define DMA_MAP_MAX_THREADS 1024
#define DMA_MAP_MAX_SECONDS 300
#define DMA_MAP_BIDIRECTIONAL 0
#define DMA_MAP_TO_DEVICE 1
#define DMA_MAP_FROM_DEVICE 2
/*
 * Parameter and result block of the DMA_MAP_BENCHMARK ioctl.
 *
 * map_benchmark_ioctl() copies the whole structure in from user space,
 * validates threads, seconds, node and dma_dir, runs the benchmark and
 * copies the structure back out. The four latency fields are written by
 * do_map_benchmark() when at least one map/unmap loop completed.
 */
struct map_benchmark {
__u64 avg_map_100ns; /* result: average map latency in 100ns */
__u64 map_stddev; /* result: standard deviation of map latency */
__u64 avg_unmap_100ns; /* result: average unmap latency in 100ns */
__u64 unmap_stddev; /* result: standard deviation of unmap latency */
__u32 threads; /* how many threads will do map/unmap in parallel */
__u32 seconds; /* how long the test will last */
__s32 node; /* which numa node this benchmark will run on */
__u32 dma_bits; /* DMA addressing capability */
__u32 dma_dir; /* DMA data direction, one of DMA_MAP_* */
__u64 expansion[10]; /* For future use */
};
/*
 * Per-device benchmark state, allocated in __map_benchmark_probe() and
 * handed to the debugfs file as its private data.
 */
struct map_benchmark_data {
/* copy of the user-supplied parameters; results are written back here */
struct map_benchmark bparam;
/* device whose streaming DMA mappings are being measured */
struct device *dev;
/* the "dma_map_benchmark" debugfs entry, removed on device teardown */
struct dentry *debugfs;
/* DMA direction translated from bparam.dma_dir by the ioctl handler */
enum dma_data_direction dir;
/* running totals, added to by every worker thread after each iteration */
atomic64_t sum_map_100ns;
atomic64_t sum_unmap_100ns;
/* running totals of squared latencies, used for the standard deviation */
atomic64_t sum_sq_map;
atomic64_t sum_sq_unmap;
/* number of completed map/unmap iterations across all threads */
atomic64_t loops;
};
/*
 * map_benchmark_thread - body of one benchmark worker kthread
 * @data: the struct map_benchmark_data shared by all workers
 *
 * Repeatedly maps and unmaps a single page with dma_map_single() /
 * dma_unmap_single(), timing each call with ktime_get(), and adds the
 * latencies (in units of 100ns) and their squares to the shared atomic
 * counters until kthread_should_stop() becomes true.
 *
 * Return: 0 when stopped normally, -ENOMEM if the page cannot be allocated
 * or a mapping fails.
 */
static int map_benchmark_thread(void *data)
{
	struct map_benchmark_data *map = data;
	dma_addr_t dma_addr;
	void *buf;
	int ret = 0;

	buf = (void *)__get_free_page(GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	while (!kthread_should_stop()) {
		u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
		ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
		ktime_t map_delta, unmap_delta;

		/*
		 * for a non-coherent device, if we don't stain them in the
		 * cache, this will give an underestimate of the real-world
		 * overhead of BIDIRECTIONAL or TO_DEVICE mappings;
		 * 66 means everything goes well! 66 is lucky.
		 */
		if (map->dir != DMA_FROM_DEVICE)
			memset(buf, 0x66, PAGE_SIZE);

		map_stime = ktime_get();
		dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
		if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
			pr_err("dma_map_single failed on %s\n",
				dev_name(map->dev));
			ret = -ENOMEM;
			goto out;
		}
		map_etime = ktime_get();
		map_delta = ktime_sub(map_etime, map_stime);

		unmap_stime = ktime_get();
		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
		unmap_etime = ktime_get();
		unmap_delta = ktime_sub(unmap_etime, unmap_stime);

		/* calculate sum and sum of squares */
		map_100ns = div64_ul(map_delta, 100);
		unmap_100ns = div64_ul(unmap_delta, 100);
		map_sq = map_100ns * map_100ns;
		unmap_sq = unmap_100ns * unmap_100ns;

		atomic64_add(map_100ns, &map->sum_map_100ns);
		atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
		atomic64_add(map_sq, &map->sum_sq_map);
		atomic64_add(unmap_sq, &map->sum_sq_unmap);
		atomic64_inc(&map->loops);

		/*
		 * The loop can spin for up to DMA_MAP_MAX_SECONDS. Offer the
		 * CPU to other tasks between iterations so that kernels
		 * without preemption do not starve them. This sits outside
		 * the timed sections and does not affect the measurements.
		 */
		cond_resched();
	}

out:
	free_page((unsigned long)buf);
	return ret;
}
/*
 * do_map_benchmark - run one benchmark with the parameters in map->bparam
 * @map: per-device benchmark state; bparam must already be validated
 *
 * Creates bparam.threads worker kthreads (bound to the CPUs of bparam.node
 * unless it is NUMA_NO_NODE), lets them run for bparam.seconds, stops them
 * and stores the average latency and standard deviation for map and unmap
 * in map->bparam. The result fields are left untouched if no loop completed.
 *
 * Return: 0 on success, -ENOMEM if the thread array cannot be allocated,
 * the kthread creation error, or the exit code of a failed worker.
 */
static int do_map_benchmark(struct map_benchmark_data *map)
{
	struct task_struct **tsk;
	int threads = map->bparam.threads;
	int node = map->bparam.node;
	u64 loops;
	int ret = 0;
	int i;

	tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
	if (!tsk)
		return -ENOMEM;

	get_device(map->dev);

	for (i = 0; i < threads; i++) {
		tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
				node, "dma-map-benchmark/%d", i);
		if (IS_ERR(tsk[i])) {
			pr_err("create dma_map thread failed\n");
			ret = PTR_ERR(tsk[i]);
			/*
			 * Reap the threads created so far. They were never
			 * woken, so stopping them is the only way to release
			 * them; their return value is of no interest.
			 */
			while (--i >= 0)
				kthread_stop(tsk[i]);
			goto out;
		}

		/* Only look up the node's cpumask for a real node id. */
		if (node != NUMA_NO_NODE)
			kthread_bind_mask(tsk[i], cpumask_of_node(node));
	}

	/* clear the old value in the previous benchmark */
	atomic64_set(&map->sum_map_100ns, 0);
	atomic64_set(&map->sum_unmap_100ns, 0);
	atomic64_set(&map->sum_sq_map, 0);
	atomic64_set(&map->sum_sq_unmap, 0);
	atomic64_set(&map->loops, 0);

	for (i = 0; i < threads; i++) {
		/*
		 * A worker returns on its own when dma_map_single() fails.
		 * Hold a reference so the task_struct is still valid when
		 * kthread_stop() is called on it below.
		 */
		get_task_struct(tsk[i]);
		wake_up_process(tsk[i]);
	}

	msleep_interruptible(map->bparam.seconds * 1000);

	/*
	 * Wait for the completion of all benchmark threads. Every thread must
	 * be stopped even if one reports an error, because they all keep using
	 * @map until they exit.
	 */
	for (i = 0; i < threads; i++) {
		int kthread_ret = kthread_stop(tsk[i]);

		put_task_struct(tsk[i]);
		if (kthread_ret)
			ret = kthread_ret;
	}

	if (ret)
		goto out;

	loops = atomic64_read(&map->loops);
	if (likely(loops > 0)) {
		u64 map_variance, unmap_variance;
		u64 sum_map = atomic64_read(&map->sum_map_100ns);
		u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
		u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
		u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);

		/* average latency */
		map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
		map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);

		/* standard deviation of latency: sqrt(E[x^2] - E[x]^2) */
		map_variance = div64_u64(sum_sq_map, loops) -
				map->bparam.avg_map_100ns *
				map->bparam.avg_map_100ns;
		unmap_variance = div64_u64(sum_sq_unmap, loops) -
				map->bparam.avg_unmap_100ns *
				map->bparam.avg_unmap_100ns;
		map->bparam.map_stddev = int_sqrt64(map_variance);
		map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
	}

out:
	put_device(map->dev);
	kfree(tsk);
	return ret;
}
static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct map_benchmark_data *map = file->private_data;
void __user *argp = (void __user *)arg;
u64 old_dma_mask;
int ret;
if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
return -EFAULT;
switch (cmd) {
case DMA_MAP_BENCHMARK:
if (map->bparam.threads == 0 ||
map->bparam.threads > DMA_MAP_MAX_THREADS) {
pr_err("invalid thread number\n");
return -EINVAL;
}
if (map->bparam.seconds == 0 ||
map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
pr_err("invalid duration seconds\n");
return -EINVAL;
}
if (map->bparam.node != NUMA_NO_NODE &&
!node_possible(map->bparam.node)) {
pr_err("invalid numa node\n");
return -EINVAL;
}
switch (map->bparam.dma_dir) {
case DMA_MAP_BIDIRECTIONAL:
map->dir = DMA_BIDIRECTIONAL;
break;
case DMA_MAP_FROM_DEVICE:
map->dir = DMA_FROM_DEVICE;
break;
case DMA_MAP_TO_DEVICE:
map->dir = DMA_TO_DEVICE;
break;
default:
pr_err("invalid DMA direction\n");
return -EINVAL;
}
old_dma_mask = dma_get_mask(map->dev);
ret = dma_set_mask(map->dev,
DMA_BIT_MASK(map->bparam.dma_bits));
if (ret) {
pr_err("failed to set dma_mask on device %s\n",
dev_name(map->dev));
return -EINVAL;
}
ret = do_map_benchmark(map);
/*
* restore the original dma_mask as many devices' dma_mask are
* set by architectures, acpi, busses. When we bind them back
* to their original drivers, those drivers shouldn't see
* dma_mask changed by benchmark
*/
dma_set_mask(map->dev, old_dma_mask);
break;
default:
return -EINVAL;
}
if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
return -EFAULT;
return ret;
}
/*
 * File operations of the "dma_map_benchmark" debugfs file: opening uses
 * simple_open and the benchmark is driven through map_benchmark_ioctl().
 */
static const struct file_operations map_benchmark_fops = {
.open = simple_open,
.unlocked_ioctl = map_benchmark_ioctl,
};
/*
 * Device-managed cleanup action registered by __map_benchmark_probe():
 * removes the debugfs entry recorded in the benchmark state.
 * @data: the struct map_benchmark_data passed to devm_add_action()
 */
static void map_benchmark_remove_debugfs(void *data)
{
	struct map_benchmark_data *bench = data;

	debugfs_remove(bench->debugfs);
}
/*
 * __map_benchmark_probe - common probe path for PCI and platform devices
 * @dev: device being bound to the benchmark driver
 *
 * Allocates the device-managed benchmark state, registers the debugfs
 * cleanup action and creates the "dma_map_benchmark" debugfs file.
 *
 * Return: 0 on success, -ENOMEM if the state cannot be allocated, or the
 * error from devm_add_action() / debugfs_create_file().
 */
static int __map_benchmark_probe(struct device *dev)
{
	struct map_benchmark_data *bench;
	struct dentry *file;
	int err;

	bench = devm_kzalloc(dev, sizeof(*bench), GFP_KERNEL);
	if (!bench)
		return -ENOMEM;
	bench->dev = dev;

	err = devm_add_action(dev, map_benchmark_remove_debugfs, bench);
	if (err) {
		pr_err("Can't add debugfs remove action\n");
		return err;
	}

	/*
	 * The file is created under a fixed name in the debugfs root, so only
	 * one device can be bound to this driver; a 2nd probe will fail.
	 */
	file = debugfs_create_file("dma_map_benchmark", 0600, NULL, bench,
			&map_benchmark_fops);
	if (IS_ERR(file))
		return PTR_ERR(file);

	bench->debugfs = file;
	return 0;
}
/* Platform bus probe: hand the embedded struct device to the common probe. */
static int map_benchmark_platform_probe(struct platform_device *pdev)
{
	struct device *dev = &pdev->dev;

	return __map_benchmark_probe(dev);
}
/*
 * Platform driver named "dma_map_benchmark"; probing goes through
 * map_benchmark_platform_probe().
 */
static struct platform_driver map_benchmark_platform_driver = {
.driver = {
.name = "dma_map_benchmark",
},
.probe = map_benchmark_platform_probe,
};
/*
 * PCI bus probe: hand the embedded struct device to the common probe.
 * @id is not used.
 */
static int
map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct device *dev = &pdev->dev;

	return __map_benchmark_probe(dev);
}
/*
 * PCI driver named "dma_map_benchmark"; probing goes through
 * map_benchmark_pci_probe(). No id_table is set here.
 */
static struct pci_driver map_benchmark_pci_driver = {
.name = "dma_map_benchmark",
.probe = map_benchmark_pci_probe,
};
/*
 * Module init: register the PCI driver first, then the platform driver.
 * If the platform registration fails the PCI driver is unregistered again.
 *
 * Return: 0 on success or the error from the failing registration.
 */
static int __init map_benchmark_init(void)
{
	int err;

	err = pci_register_driver(&map_benchmark_pci_driver);
	if (err)
		goto out;

	err = platform_driver_register(&map_benchmark_platform_driver);
	if (err)
		pci_unregister_driver(&map_benchmark_pci_driver);

out:
	return err;
}
/*
 * Module exit: unregister both drivers in the reverse order of
 * map_benchmark_init().
 */
static void __exit map_benchmark_cleanup(void)
{
platform_driver_unregister(&map_benchmark_platform_driver);
pci_unregister_driver(&map_benchmark_pci_driver);
}
module_init(map_benchmark_init);
module_exit(map_benchmark_cleanup);
MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
MODULE_DESCRIPTION("dma_map benchmark driver");
MODULE_LICENSE("GPL");
......@@ -149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
if (dma_map_direct(dev, ops))
if (dma_map_direct(dev, ops) ||
arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
......@@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
const struct dma_map_ops *ops = get_dma_ops(dev);
BUG_ON(!valid_dma_direction(dir));
if (dma_map_direct(dev, ops))
if (dma_map_direct(dev, ops) ||
arch_dma_unmap_page_direct(dev, addr + size))
dma_direct_unmap_page(dev, addr, size, dir, attrs);
else if (ops->unmap_page)
ops->unmap_page(dev, addr, size, dir, attrs);
......@@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
if (WARN_ON_ONCE(!dev->dma_mask))
return 0;
if (dma_map_direct(dev, ops))
if (dma_map_direct(dev, ops) ||
arch_dma_map_sg_direct(dev, sg, nents))
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
......@@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
BUG_ON(!valid_dma_direction(dir));
debug_dma_unmap_sg(dev, sg, nents, dir);
if (dma_map_direct(dev, ops))
if (dma_map_direct(dev, ops) ||
arch_dma_unmap_sg_direct(dev, sg, nents))
dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
else if (ops->unmap_sg)
ops->unmap_sg(dev, sg, nents, dir, attrs);
......
......@@ -38,9 +38,6 @@ static void __init dma_atomic_pool_debugfs_init(void)
struct dentry *root;
root = debugfs_create_dir("dma_pools", NULL);
if (IS_ERR_OR_NULL(root))
return;
debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);
......
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -I../../../../usr/include/
TEST_GEN_PROGS := dma_map_benchmark
include ../lib.mk
CONFIG_DMA_MAP_BENCHMARK=y
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2020 Hisilicon Limited.
*/
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/types.h>
#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
#define DMA_MAP_MAX_THREADS 1024
#define DMA_MAP_MAX_SECONDS 300
#define DMA_MAP_BIDIRECTIONAL 0
#define DMA_MAP_TO_DEVICE 1
#define DMA_MAP_FROM_DEVICE 2
/*
 * Printable names of the DMA directions, indexed by the DMA_MAP_* value
 * (DMA_MAP_BIDIRECTIONAL = 0, DMA_MAP_TO_DEVICE = 1, DMA_MAP_FROM_DEVICE = 2).
 * The entries are string literals, so both the pointers and the characters
 * are declared const.
 */
static const char *const directions[] = {
	"BIDIRECTIONAL",
	"TO_DEVICE",
	"FROM_DEVICE",
};
/*
 * Argument of the DMA_MAP_BENCHMARK ioctl. main() fills in threads, seconds,
 * node, dma_bits and dma_dir and prints the four latency fields after the
 * ioctl returns.
 * NOTE(review): the layout presumably has to stay identical to the kernel's
 * definition of the same structure - confirm before changing any field.
 */
struct map_benchmark {
__u64 avg_map_100ns; /* result: average map latency in 100ns */
__u64 map_stddev; /* result: standard deviation of map latency */
__u64 avg_unmap_100ns; /* result: average unmap latency in 100ns */
__u64 unmap_stddev; /* result: standard deviation of unmap latency */
__u32 threads; /* how many threads will do map/unmap in parallel */
__u32 seconds; /* how long the test will last */
__s32 node; /* which numa node this benchmark will run on */
__u32 dma_bits; /* DMA addressing capability */
__u32 dma_dir; /* DMA data direction, one of DMA_MAP_* */
__u64 expansion[10]; /* For future use */
};
int main(int argc, char **argv)
{
struct map_benchmark map;
int fd, opt;
/* default single thread, run 20 seconds on NUMA_NO_NODE */
int threads = 1, seconds = 20, node = -1;
/* default dma mask 32bit, bidirectional DMA */
int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
int cmd = DMA_MAP_BENCHMARK;
char *p;
while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
switch (opt) {
case 't':
threads = atoi(optarg);
break;
case 's':
seconds = atoi(optarg);
break;
case 'n':
node = atoi(optarg);
break;
case 'b':
bits = atoi(optarg);
break;
case 'd':
dir = atoi(optarg);
break;
default:
return -1;
}
}
if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
DMA_MAP_MAX_THREADS);
exit(1);
}
if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) {
fprintf(stderr, "invalid number of seconds, must be in 1-%d\n",
DMA_MAP_MAX_SECONDS);
exit(1);
}
/* suppose the mininum DMA zone is 1MB in the world */
if (bits < 20 || bits > 64) {
fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
exit(1);
}
if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE &&
dir != DMA_MAP_FROM_DEVICE) {
fprintf(stderr, "invalid dma direction\n");
exit(1);
}
fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
if (fd == -1) {
perror("open");
exit(1);
}
map.seconds = seconds;
map.threads = threads;
map.node = node;
map.dma_bits = bits;
map.dma_dir = dir;
if (ioctl(fd, cmd, &map)) {
perror("ioctl");
exit(1);
}
printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s\n",
threads, seconds, node, dir[directions]);
printf("average map latency(us):%.1f standard deviation:%.1f\n",
map.avg_map_100ns/10.0, map.map_stddev/10.0);
printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
map.avg_unmap_100ns/10.0, map.unmap_stddev/10.0);
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment