Commit b7176c26 authored by Barry Song, committed by Christoph Hellwig

dma-contiguous: provide the ability to reserve per-numa CMA

Right now, drivers like ARM SMMU are using dma_alloc_coherent() to get
coherent DMA buffers to save their command queues and page tables. As
there is only one default CMA in the whole system, SMMUs on nodes other
than node0 will get remote memory. This leads to significant latency.

This patch provides per-numa CMA so that drivers like SMMU can get local
memory. Tests show localizing CMA can significantly decrease dma_unmap latency.
For instance, before this patch, SMMU on node2 has to wait for more than
560ns for the completion of CMD_SYNC in an empty command queue; with this
patch, it needs 240ns only.

A positive side effect of this patch would be improving performance even
further for those users who are worried about performance more than DMA
security and use iommu.passthrough=1 to skip IOMMU. With local CMA, all
drivers can get local coherent DMA buffers.

Also, this patch changes the default CONFIG_CMA_AREAS to 19 in NUMA, as
1+CONFIG_CMA_AREAS should be quite enough for most servers on the market
even if they enable both hugetlb_cma and pernuma_cma:
2 numa nodes: 2(hugetlb) + 2(pernuma) + 1(default global cma) = 5
4 numa nodes: 4(hugetlb) + 4(pernuma) + 1(default global cma) = 9
8 numa nodes: 8(hugetlb) + 8(pernuma) + 1(default global cma) = 17
Signed-off-by: Barry Song <song.bao.hua@hisilicon.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
parent f75aef39
...@@ -599,6 +599,17 @@ ...@@ -599,6 +599,17 @@
altogether. For more information, see altogether. For more information, see
include/linux/dma-contiguous.h include/linux/dma-contiguous.h
cma_pernuma=nn[MG]
[ARM64,KNL]
Sets the size of kernel per-numa memory area for
contiguous memory allocations. A value of 0 disables
per-numa CMA altogether. If this option is not
specified, the default value is 0.
With per-numa CMA enabled, DMA users on node nid will
first try to allocate a buffer from the pernuma area
located on node nid; if the allocation fails,
they will fall back to the global default memory area.
cmo_free_hint= [PPC] Format: { yes | no } cmo_free_hint= [PPC] Format: { yes | no }
Specify whether pages are marked as being inactive Specify whether pages are marked as being inactive
when they are freed. This is used in CMO environments when they are freed. This is used in CMO environments
......
...@@ -171,6 +171,12 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page, ...@@ -171,6 +171,12 @@ static inline void dma_free_contiguous(struct device *dev, struct page *page,
#endif #endif
#ifdef CONFIG_DMA_PERNUMA_CMA
/* Per-NUMA CMA reservation hook; only declared when CONFIG_DMA_PERNUMA_CMA is set. */
void dma_pernuma_cma_reserve(void);
#else
/* Per-NUMA CMA is compiled out: the reservation hook is an empty stub. */
static inline void dma_pernuma_cma_reserve(void) { }
#endif
#endif #endif
#endif #endif
...@@ -118,6 +118,17 @@ config DMA_CMA ...@@ -118,6 +118,17 @@ config DMA_CMA
If unsure, say "n". If unsure, say "n".
if DMA_CMA if DMA_CMA
config DMA_PERNUMA_CMA
bool "Enable separate DMA Contiguous Memory Area for each NUMA Node"
default NUMA && ARM64
help
Enable this option to get pernuma CMA areas so that devices like
ARM64 SMMU can get local memory by DMA coherent APIs.
You can set the size of pernuma CMA by specifying "cma_pernuma=size"
on the kernel's command line.
comment "Default contiguous memory area size:" comment "Default contiguous memory area size:"
config CMA_SIZE_MBYTES config CMA_SIZE_MBYTES
......
...@@ -69,6 +69,19 @@ static int __init early_cma(char *p) ...@@ -69,6 +69,19 @@ static int __init early_cma(char *p)
} }
early_param("cma", early_cma); early_param("cma", early_cma);
#ifdef CONFIG_DMA_PERNUMA_CMA
static struct cma *dma_contiguous_pernuma_area[MAX_NUMNODES];
static phys_addr_t pernuma_size_bytes __initdata;
/*
 * Parse the "cma_pernuma=nn[MG]" early parameter and store the requested
 * per-node CMA size in pernuma_size_bytes.
 *
 * Returns 0 on success, or -EINVAL when the option was given without a
 * value.
 */
static int __init early_cma_pernuma(char *p)
{
	/*
	 * A bare "cma_pernuma" (no "=<size>") reaches the handler with a NULL
	 * string; memparse() would dereference it, so reject it up front.
	 */
	if (!p)
		return -EINVAL;

	pernuma_size_bytes = memparse(p, &p);
	return 0;
}
early_param("cma_pernuma", early_cma_pernuma);
#endif
#ifdef CONFIG_CMA_SIZE_PERCENTAGE #ifdef CONFIG_CMA_SIZE_PERCENTAGE
static phys_addr_t __init __maybe_unused cma_early_percent_memory(void) static phys_addr_t __init __maybe_unused cma_early_percent_memory(void)
...@@ -96,6 +109,34 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) ...@@ -96,6 +109,34 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void)
#endif #endif
#ifdef CONFIG_DMA_PERNUMA_CMA
/**
 * dma_pernuma_cma_reserve() - reserve one contiguous memory area per NUMA node
 *
 * Does nothing when pernuma_size_bytes is 0 (the "cma_pernuma=" parameter was
 * not given or was set to 0). Otherwise, for every online node, it asks
 * cma_declare_contiguous_nid() for an area of pernuma_size_bytes named
 * "pernuma<nid>", passing that node's slot in dma_contiguous_pernuma_area[]
 * as the output pointer. A failure on one node is logged and the remaining
 * nodes are still processed.
 */
void __init dma_pernuma_cma_reserve(void)
{
	int nid;

	if (!pernuma_size_bytes)
		return;

	for_each_online_node(nid) {
		int ret;
		char name[20];
		struct cma **cma = &dma_contiguous_pernuma_area[nid];

		snprintf(name, sizeof(name), "pernuma%d", nid);
		ret = cma_declare_contiguous_nid(0, pernuma_size_bytes, 0, 0,
						 0, false, name, cma, nid);
		if (ret) {
			/* '\n' terminates the record so it is not merged with the next printk */
			pr_warn("%s: reservation failed: err %d, node %d\n", __func__,
				ret, nid);
			continue;
		}

		pr_debug("%s: reserved %llu MiB on node %d\n", __func__,
			 (unsigned long long)pernuma_size_bytes / SZ_1M, nid);
	}
}
#endif
/** /**
* dma_contiguous_reserve() - reserve area(s) for contiguous memory handling * dma_contiguous_reserve() - reserve area(s) for contiguous memory handling
* @limit: End address of the reserved memory (optional, 0 for any). * @limit: End address of the reserved memory (optional, 0 for any).
...@@ -228,23 +269,44 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp) ...@@ -228,23 +269,44 @@ static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
* @size: Requested allocation size. * @size: Requested allocation size.
* @gfp: Allocation flags. * @gfp: Allocation flags.
* *
* This function allocates contiguous memory buffer for specified device. It * tries to use device specific contiguous memory area if available, or it
* tries to use device specific contiguous memory area if available, or the * tries to use per-numa cma, if the allocation fails, it will fallback to
* default global one. * try default global one.
* *
* Note that it byapss one-page size of allocations from the global area as * Note that it bypass one-page size of allocations from the per-numa and
* the addresses within one page are always contiguous, so there is no need * global area as the addresses within one page are always contiguous, so
* to waste CMA pages for that kind; it also helps reduce fragmentations. * there is no need to waste CMA pages for that kind; it also helps reduce
* fragmentations.
*/ */
struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{ {
#ifdef CONFIG_DMA_PERNUMA_CMA
int nid = dev_to_node(dev);
#endif
/* CMA can be used only in the context which permits sleeping */ /* CMA can be used only in the context which permits sleeping */
if (!gfpflags_allow_blocking(gfp)) if (!gfpflags_allow_blocking(gfp))
return NULL; return NULL;
if (dev->cma_area) if (dev->cma_area)
return cma_alloc_aligned(dev->cma_area, size, gfp); return cma_alloc_aligned(dev->cma_area, size, gfp);
if (size <= PAGE_SIZE || !dma_contiguous_default_area) if (size <= PAGE_SIZE)
return NULL;
#ifdef CONFIG_DMA_PERNUMA_CMA
if (nid != NUMA_NO_NODE && !(gfp & (GFP_DMA | GFP_DMA32))) {
struct cma *cma = dma_contiguous_pernuma_area[nid];
struct page *page;
if (cma) {
page = cma_alloc_aligned(cma, size, gfp);
if (page)
return page;
}
}
#endif
if (!dma_contiguous_default_area)
return NULL; return NULL;
return cma_alloc_aligned(dma_contiguous_default_area, size, gfp); return cma_alloc_aligned(dma_contiguous_default_area, size, gfp);
} }
...@@ -261,8 +323,26 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) ...@@ -261,8 +323,26 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
*/ */
void dma_free_contiguous(struct device *dev, struct page *page, size_t size) void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
{ {
if (!cma_release(dev_get_cma_area(dev), page, unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
PAGE_ALIGN(size) >> PAGE_SHIFT))
/* if dev has its own cma, free page from there */
if (dev->cma_area) {
if (cma_release(dev->cma_area, page, count))
return;
} else {
/*
* otherwise, page is from either per-numa cma or default cma
*/
#ifdef CONFIG_DMA_PERNUMA_CMA
if (cma_release(dma_contiguous_pernuma_area[page_to_nid(page)],
page, count))
return;
#endif
if (cma_release(dma_contiguous_default_area, page, count))
return;
}
/* not in any cma, free from buddy */
__free_pages(page, get_order(size)); __free_pages(page, get_order(size));
} }
......
...@@ -516,13 +516,14 @@ config CMA_DEBUGFS ...@@ -516,13 +516,14 @@ config CMA_DEBUGFS
config CMA_AREAS config CMA_AREAS
int "Maximum count of the CMA areas" int "Maximum count of the CMA areas"
depends on CMA depends on CMA
default 19 if NUMA
default 7 default 7
help help
CMA allows to create CMA areas for particular purpose, mainly, CMA allows to create CMA areas for particular purpose, mainly,
used as device private area. This parameter sets the maximum used as device private area. This parameter sets the maximum
number of CMA area in the system. number of CMA area in the system.
If unsure, leave the default value "7". If unsure, leave the default value "7" in UMA and "19" in NUMA.
config MEM_SOFT_DIRTY config MEM_SOFT_DIRTY
bool "Track memory changes" bool "Track memory changes"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment