From 8d8d53cf8fd028310b1189165b939cde124895d7 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy
Date: Thu, 29 Oct 2020 12:52:40 +1100
Subject: [PATCH 1/6] dma-mapping: Allow mixing bypass and mapped DMA operation

At the moment we allow bypassing DMA ops only when we can do this for
the entire RAM. However there are configs with mixed-type memory where
we could still allow bypassing the IOMMU in most cases; POWERPC with
persistent memory is one example.

This adds an arch hook to determine where bypass can still work and,
where it can, invokes the direct DMA API. The following patch checks
the bus limit on POWERPC to allow or disallow direct mapping.

This adds an ARCH_HAS_DMA_MAP_DIRECT config option to make the
arch_xxx hooks no-ops by default.

Signed-off-by: Alexey Kardashevskiy
Signed-off-by: Christoph Hellwig
---
 include/linux/dma-map-ops.h | 14 ++++++++++++++
 kernel/dma/Kconfig          |  4 ++++
 kernel/dma/mapping.c        | 12 ++++++++----
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index a5f89fc4d6df..38c8a4558e08 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -314,6 +314,20 @@ static inline void arch_dma_mark_clean(phys_addr_t paddr, size_t size)
 void *arch_dma_set_uncached(void *addr, size_t size);
 void arch_dma_clear_uncached(void *addr, size_t size);
 
+#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT
+bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr);
+bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle);
+bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg,
+		int nents);
+bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
+		int nents);
+#else
+#define arch_dma_map_page_direct(d, a)		(false)
+#define arch_dma_unmap_page_direct(d, a)	(false)
+#define arch_dma_map_sg_direct(d, s, n)		(false)
+#define arch_dma_unmap_sg_direct(d, s, n)	(false)
+#endif
+
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		const struct iommu_ops *iommu, bool coherent);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c99de4a21458..43d106598e82 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -20,6 +20,10 @@ config DMA_OPS
 config DMA_OPS_BYPASS
 	bool
 
+# Lets the platform IOMMU driver choose between bypass and IOMMU
+config ARCH_HAS_DMA_MAP_DIRECT
+	bool
+
 config NEED_SG_DMA_LENGTH
 	bool
 
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 51bb8fa8eb89..f87a89d08654 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
 	if (WARN_ON_ONCE(!dev->dma_mask))
 		return DMA_MAPPING_ERROR;
 
-	if (dma_map_direct(dev, ops))
+	if (dma_map_direct(dev, ops) ||
+	    arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size))
 		addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
 	else
 		addr = ops->map_page(dev, page, offset, size, dir, attrs);
@@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	BUG_ON(!valid_dma_direction(dir));
-	if (dma_map_direct(dev, ops))
+	if (dma_map_direct(dev, ops) ||
+	    arch_dma_unmap_page_direct(dev, addr + size))
 		dma_direct_unmap_page(dev, addr, size, dir, attrs);
 	else if (ops->unmap_page)
 		ops->unmap_page(dev, addr, size, dir, attrs);
@@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
 	if (WARN_ON_ONCE(!dev->dma_mask))
 		return 0;
 
-	if (dma_map_direct(dev, ops))
+	if (dma_map_direct(dev, ops) ||
+	    arch_dma_map_sg_direct(dev, sg, nents))
 		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
 	else
 		ents = ops->map_sg(dev, sg, nents, dir, attrs);
@@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
 
 	BUG_ON(!valid_dma_direction(dir));
 	debug_dma_unmap_sg(dev, sg, nents, dir);
-	if (dma_map_direct(dev, ops))
+	if (dma_map_direct(dev, ops) ||
+	    arch_dma_unmap_sg_direct(dev, sg, nents))
 		dma_direct_unmap_sg(dev, sg, nents, dir, attrs);
 	else if (ops->unmap_sg)
 		ops->unmap_sg(dev, sg, nents, dir, attrs);

From bf6e2d562bbc4d115cf322b0bca57fe5bbd26f48 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy
Date: Thu, 29 Oct 2020 12:52:41 +1100
Subject: [PATCH 2/6] powerpc/dma: Fallback to dma_ops when persistent memory
 present

So far we have been using huge DMA windows to map all the RAM
available. The RAM is normally mapped to the VM address space
contiguously, and there is always a reasonable upper limit for possible
future hot plugged RAM, which makes it easy to map all RAM via IOMMU.

Now there is persistent memory ("ibm,pmemory" in the FDT) which (unlike
normal RAM) can be mapped anywhere in the VM space beyond the maximum
RAM size, and since it can be used for DMA, it requires extending the
huge window up to MAX_PHYSMEM_BITS, which requires hypervisor support
for:
1. huge TCE tables;
2. multilevel TCE tables;
3. huge IOMMU pages.

Certain hypervisors cannot do all of these, so the only option left is
restricting the huge DMA window to include only RAM and falling back to
the default DMA window for persistent memory.

This defines arch_dma_map_direct/etc to allow the generic DMA code to
perform additional checks on whether direct DMA is still possible.

This checks if the system has persistent memory. If it does not, the
DMA bypass mode is selected, i.e.
* dev->bus_dma_limit = 0
* dev->dma_ops_bypass = true <- this avoids calling dma_ops for mapping.

If there is such memory, this creates an identity mapping only for RAM
and sets dev->bus_dma_limit to let the generic code decide whether to
call into the direct DMA or the indirect DMA ops.

This should not change the existing behaviour when there is no
persistent memory, as dev->dma_ops_bypass is expected to be set.
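
Taken together, the two patches reduce to the following per-device
policy. This is an editorial sketch under the patches above, not
upstream code; sketch_configure_bypass() and the window_base/window_size
parameters are invented names standing in for the DDW geometry that
enable_ddw() derives from the device tree:

/* Kernel-context sketch: how dma_ops_bypass and bus_dma_limit interact. */
static void sketch_configure_bypass(struct device *dev, bool pmem_present,
				    u64 window_base, u64 window_size)
{
	if (!pmem_present) {
		/* The 1:1 window covers all addressable memory: full bypass,
		 * dma_ops are never consulted for this device.
		 */
		dev->bus_dma_limit = 0;
		dev->dma_ops_bypass = true;
	} else {
		/* The 1:1 window covers RAM only; persistent memory must go
		 * through the default IOMMU window, so every mapping is
		 * checked against the bus limit via the arch_dma_*_direct()
		 * hooks from patch 1.
		 */
		dev->bus_dma_limit = window_base + window_size;
		dev->dma_ops_bypass = false;
	}
}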
Signed-off-by: Alexey Kardashevskiy
Acked-by: Michael Ellerman
Signed-off-by: Christoph Hellwig
---
 arch/powerpc/Kconfig                   |  1 +
 arch/powerpc/kernel/dma-iommu.c        | 71 +++++++++++++++++++++++++-
 arch/powerpc/platforms/pseries/iommu.c | 51 ++++++++++++++----
 3 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e9f13fe08492..b2d4580acf79 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -159,6 +159,7 @@ config PPC
 	select DCACHE_WORD_ACCESS		if PPC64 && CPU_LITTLE_ENDIAN
 	select DMA_OPS				if PPC64
 	select DMA_OPS_BYPASS			if PPC64
+	select ARCH_HAS_DMA_MAP_DIRECT		if PPC64 && PPC_PSERIES
 	select DYNAMIC_FTRACE			if FUNCTION_TRACER
 	select EDAC_ATOMIC_SCRUB
 	select EDAC_SUPPORT
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index a1c744194018..111249fd619d 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -10,6 +10,63 @@
 #include <linux/pci.h>
 #include <asm/iommu.h>
 
+#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT
+#define can_map_direct(dev, addr) \
+	((dev)->bus_dma_limit >= phys_to_dma((dev), (addr)))
+
+bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr)
+{
+	if (likely(!dev->bus_dma_limit))
+		return false;
+
+	return can_map_direct(dev, addr);
+}
+
+#define is_direct_handle(dev, h) ((h) >= (dev)->archdata.dma_offset)
+
+bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle)
+{
+	if (likely(!dev->bus_dma_limit))
+		return false;
+
+	return is_direct_handle(dev, dma_handle);
+}
+
+bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg,
+		int nents)
+{
+	struct scatterlist *s;
+	int i;
+
+	if (likely(!dev->bus_dma_limit))
+		return false;
+
+	for_each_sg(sg, s, nents, i) {
+		if (!can_map_direct(dev, sg_phys(s) + s->offset + s->length))
+			return false;
+	}
+
+	return true;
+}
+
+bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
+		int nents)
+{
+	struct scatterlist *s;
+	int i;
+
+	if (likely(!dev->bus_dma_limit))
+		return false;
+
+	for_each_sg(sg, s, nents, i) {
+		if (!is_direct_handle(dev, s->dma_address + s->length))
+			return false;
+	}
+
+	return true;
+}
+#endif /* CONFIG_ARCH_HAS_DMA_MAP_DIRECT */
+
 /*
  * Generic iommu implementation
  */
@@ -90,8 +147,18 @@ int dma_iommu_dma_supported(struct device *dev, u64 mask)
 	struct iommu_table *tbl = get_iommu_table_base(dev);
 
 	if (dev_is_pci(dev) && dma_iommu_bypass_supported(dev, mask)) {
-		dev->dma_ops_bypass = true;
-		dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
+		/*
+		 * dma_iommu_bypass_supported() sets the bus limit when there
+		 * is a 1:1 mapping but that mapping is limited in range;
+		 * ibm,pmemory is one example.
+		 */
+		dev->dma_ops_bypass = dev->bus_dma_limit == 0;
+		if (!dev->dma_ops_bypass)
+			dev_warn(dev,
+				 "iommu: 64-bit OK but direct DMA is limited by %llx\n",
+				 dev->bus_dma_limit);
+		else
+			dev_dbg(dev, "iommu: 64-bit OK, using fixed ops\n");
 		return 1;
 	}
 
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index e4198700ed1a..9fc5217f0c8e 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -839,7 +839,7 @@ static void remove_ddw(struct device_node *np, bool remove_prop)
 			np, ret);
 }
 
-static u64 find_existing_ddw(struct device_node *pdn)
+static u64 find_existing_ddw(struct device_node *pdn, int *window_shift)
 {
 	struct direct_window *window;
 	const struct dynamic_dma_window_prop *direct64;
@@ -851,6 +851,7 @@ static u64 find_existing_ddw(struct device_node *pdn)
 		if (window->device == pdn) {
 			direct64 = window->prop;
 			dma_addr = be64_to_cpu(direct64->dma_base);
+			*window_shift = be32_to_cpu(direct64->window_shift);
 			break;
 		}
 	}
@@ -1111,11 +1112,12 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
  */
 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-	int len, ret;
+	int len = 0, ret;
+	int max_ram_len = order_base_2(ddw_memory_hotplug_max());
 	struct ddw_query_response query;
 	struct ddw_create_response create;
 	int page_shift;
-	u64 dma_addr, max_addr;
+	u64 dma_addr;
 	struct device_node *dn;
 	u32 ddw_avail[DDW_APPLICABLE_SIZE];
 	struct direct_window *window;
@@ -1123,10 +1125,15 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	struct dynamic_dma_window_prop *ddwprop;
 	struct failed_ddw_pdn *fpdn;
 	bool default_win_removed = false;
+	bool pmem_present;
+
+	dn = of_find_node_by_type(NULL, "ibm,pmemory");
+	pmem_present = dn != NULL;
+	of_node_put(dn);
 
 	mutex_lock(&direct_window_init_mutex);
 
-	dma_addr = find_existing_ddw(pdn);
+	dma_addr = find_existing_ddw(pdn, &len);
 	if (dma_addr != 0)
 		goto out_unlock;
 
@@ -1212,14 +1219,29 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 	}
 	/* verify the window * number of ptes will map the partition */
 	/* check largest block * page size > max memory hotplug addr */
-	max_addr = ddw_memory_hotplug_max();
-	if (query.largest_available_block < (max_addr >> page_shift)) {
-		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu "
-			"%llu-sized pages\n", max_addr, query.largest_available_block,
-			1ULL << page_shift);
+	/*
+	 * The "ibm,pmemory" node can appear anywhere in the address space.
+	 * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS
+	 * for the upper limit and fall back to max RAM otherwise, but this
+	 * disables device::dma_ops_bypass.
+	 */
+	len = max_ram_len;
+	if (pmem_present) {
+		if (query.largest_available_block >=
+		    (1ULL << (MAX_PHYSMEM_BITS - page_shift)))
+			len = MAX_PHYSMEM_BITS - page_shift;
+		else
+			dev_info(&dev->dev, "Skipping ibm,pmemory");
+	}
+
+	if (query.largest_available_block < (1ULL << (len - page_shift))) {
+		dev_dbg(&dev->dev,
+			"can't map partition max 0x%llx with %llu %llu-sized pages\n",
+			1ULL << len,
+			query.largest_available_block,
+			1ULL << page_shift);
 		goto out_failed;
 	}
-	len = order_base_2(max_addr);
 	win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
 	if (!win64) {
 		dev_info(&dev->dev,
@@ -1299,6 +1321,15 @@ out_failed:
 
 out_unlock:
 	mutex_unlock(&direct_window_init_mutex);
+
+	/*
+	 * If we have persistent memory and the window size is only as big
+	 * as RAM, then we failed to create a window to cover persistent
+	 * memory and need to set the DMA limit.
+	 */
+	if (pmem_present && dma_addr && (len == max_ram_len))
+		dev->dev.bus_dma_limit = dma_addr + (1ULL << len);
+
 	return dma_addr;
 }

From 94035edcb4e3bbc9f445bee706722ef64e044095 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang
Date: Sat, 7 Nov 2020 18:03:12 +0800
Subject: [PATCH 3/6] dma-pool: no need to check return value of debugfs_create
 functions

When calling debugfs functions, there is no need to ever check the
return value. The function can work or not, but the code logic should
never do something different based on this.

Signed-off-by: Tiezhu Yang
Reviewed-by: Robin Murphy
Signed-off-by: Christoph Hellwig
---
 kernel/dma/pool.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index d4637f72239b..5f84e6cdb78e 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -38,9 +38,6 @@ static void __init dma_atomic_pool_debugfs_init(void)
 	struct dentry *root;
 
 	root = debugfs_create_dir("dma_pools", NULL);
-	if (IS_ERR_OR_NULL(root))
-		return;
-
 	debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma);
 	debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32);
 	debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel);

From 819b70ad620119d21a9e4be6ad665ece26fc0db8 Mon Sep 17 00:00:00 2001
From: tangjianqiang
Date: Tue, 24 Nov 2020 18:40:19 +0800
Subject: [PATCH 4/6] dma-contiguous: fix a typo error in a comment

Fix a typo in the cma description comment: "then" -> "than".

Signed-off-by: tangjianqiang
Signed-off-by: Christoph Hellwig
---
 kernel/dma/contiguous.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 16b95ff12e4d..3d63d91cba5c 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -20,7 +20,7 @@
  *   coders, etc.
  *
  *   Such devices often require big memory buffers (a full HD frame
- *   is, for instance, more then 2 mega pixels large, i.e. more than 6
+ *   is, for instance, more than 2 mega pixels large, i.e. more than 6
  *   MB of memory), which makes mechanisms such as kmalloc() or
  *   alloc_page() ineffective.
 *

From 65789daa8087e125927230ccb7e1eab13999b0cf Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Mon, 16 Nov 2020 19:08:47 +1300
Subject: [PATCH 5/6] dma-mapping: add benchmark support for streaming DMA
 APIs

Nowadays, there are increasing requirements to benchmark the
performance of dma_map and dma_unmap, particularly while the device is
attached to an IOMMU.

This patch enables such support. Users can run a specified number of
threads to do dma_map_page and dma_unmap_page on a specific NUMA node
for a specified duration.
Then dma_map_benchmark will calculate the average latency for map and
unmap (the arithmetic behind the reported standard deviation is
sketched in a note after this patch).

A difficulty for this benchmark is that the dma_map/unmap APIs must run
on a particular device, and each device might have a different backend,
IOMMU or non-IOMMU. So we use driver_override to bind dma_map_benchmark
to a particular device:

For platform devices:
echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override
echo xxx > /sys/bus/platform/drivers/xxx/unbind
echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind

For PCI devices:
echo dma_map_benchmark > /sys/bus/pci/devices/0000:00:01.0/driver_override
echo 0000:00:01.0 > /sys/bus/pci/drivers/xxx/unbind
echo 0000:00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind

Cc: Will Deacon
Cc: Shuah Khan
Cc: Christoph Hellwig
Cc: Marek Szyprowski
Cc: Robin Murphy
Signed-off-by: Barry Song
[hch: folded in two fixes from Colin Ian King]
Signed-off-by: Christoph Hellwig
---
 kernel/dma/Kconfig         |   9 +
 kernel/dma/Makefile        |   1 +
 kernel/dma/map_benchmark.c | 361 +++++++++++++++++++++++++++++++++++++
 3 files changed, 371 insertions(+)
 create mode 100644 kernel/dma/map_benchmark.c

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 43d106598e82..ba50d4588bc5 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -229,3 +229,12 @@ config DMA_API_DEBUG_SG
 	  is technically out-of-spec.
 
 	  If unsure, say N.
+
+config DMA_MAP_BENCHMARK
+	bool "Enable benchmarking of streaming DMA mapping"
+	depends on DEBUG_FS
+	help
+	  Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
+	  performance of dma_(un)map_page.
+
+	  See tools/testing/selftests/dma/dma_map_benchmark.c
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index dc755ab68aab..7aa6b26b1348 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_DMA_API_DEBUG)		+= debug.o
 obj-$(CONFIG_SWIOTLB)			+= swiotlb.o
 obj-$(CONFIG_DMA_COHERENT_POOL)		+= pool.o
 obj-$(CONFIG_DMA_REMAP)			+= remap.o
+obj-$(CONFIG_DMA_MAP_BENCHMARK)		+= map_benchmark.o
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
new file mode 100644
index 000000000000..b1496e744c68
--- /dev/null
+++ b/kernel/dma/map_benchmark.c
@@ -0,0 +1,361 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Hisilicon Limited.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/math64.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/timekeeping.h>
+
+#define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
+#define DMA_MAP_MAX_THREADS	1024
+#define DMA_MAP_MAX_SECONDS	300
+
+#define DMA_MAP_BIDIRECTIONAL	0
+#define DMA_MAP_TO_DEVICE	1
+#define DMA_MAP_FROM_DEVICE	2
+
+struct map_benchmark {
+	__u64 avg_map_100ns;	/* average map latency in 100ns */
+	__u64 map_stddev;	/* standard deviation of map latency */
+	__u64 avg_unmap_100ns;	/* as above */
+	__u64 unmap_stddev;
+	__u32 threads;		/* how many threads will do map/unmap in parallel */
+	__u32 seconds;		/* how long the test will last */
+	__s32 node;		/* which NUMA node this benchmark will run on */
+	__u32 dma_bits;		/* DMA addressing capability */
+	__u32 dma_dir;		/* DMA data direction */
+	__u64 expansion[10];	/* for future use */
+};
+
+struct map_benchmark_data {
+	struct map_benchmark bparam;
+	struct device *dev;
+	struct dentry *debugfs;
+	enum dma_data_direction dir;
+	atomic64_t sum_map_100ns;
+	atomic64_t sum_unmap_100ns;
+	atomic64_t sum_sq_map;
+	atomic64_t sum_sq_unmap;
+	atomic64_t loops;
+};
+
+static int map_benchmark_thread(void *data)
+{
+	void *buf;
+	dma_addr_t dma_addr;
+	struct map_benchmark_data *map = data;
+	int ret = 0;
+
+	buf = (void *)__get_free_page(GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	while (!kthread_should_stop()) {
+		u64 map_100ns, unmap_100ns, map_sq, unmap_sq;
+		ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
+		ktime_t map_delta, unmap_delta;
+
+		/*
+		 * For a non-coherent device, if we don't dirty the buffer in
+		 * the cache, this will give an underestimate of the
+		 * real-world overhead of BIDIRECTIONAL or TO_DEVICE
+		 * mappings; 0x66 is an arbitrary (lucky) fill pattern.
+		 */
+		if (map->dir != DMA_FROM_DEVICE)
+			memset(buf, 0x66, PAGE_SIZE);
+
+		map_stime = ktime_get();
+		dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir);
+		if (unlikely(dma_mapping_error(map->dev, dma_addr))) {
+			pr_err("dma_map_single failed on %s\n",
+				dev_name(map->dev));
+			ret = -ENOMEM;
+			goto out;
+		}
+		map_etime = ktime_get();
+		map_delta = ktime_sub(map_etime, map_stime);
+
+		unmap_stime = ktime_get();
+		dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir);
+		unmap_etime = ktime_get();
+		unmap_delta = ktime_sub(unmap_etime, unmap_stime);
+
+		/* calculate sum and sum of squares */
+
+		map_100ns = div64_ul(map_delta, 100);
+		unmap_100ns = div64_ul(unmap_delta, 100);
+		map_sq = map_100ns * map_100ns;
+		unmap_sq = unmap_100ns * unmap_100ns;
+
+		atomic64_add(map_100ns, &map->sum_map_100ns);
+		atomic64_add(unmap_100ns, &map->sum_unmap_100ns);
+		atomic64_add(map_sq, &map->sum_sq_map);
+		atomic64_add(unmap_sq, &map->sum_sq_unmap);
+		atomic64_inc(&map->loops);
+	}
+
+out:
+	free_page((unsigned long)buf);
+	return ret;
+}
+
+static int do_map_benchmark(struct map_benchmark_data *map)
+{
+	struct task_struct **tsk;
+	int threads = map->bparam.threads;
+	int node = map->bparam.node;
+	const cpumask_t *cpu_mask = cpumask_of_node(node);
+	u64 loops;
+	int ret = 0;
+	int i;
+
+	tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL);
+	if (!tsk)
+		return -ENOMEM;
+
+	get_device(map->dev);
+
+	for (i = 0; i < threads; i++) {
+		tsk[i] = kthread_create_on_node(map_benchmark_thread, map,
+				map->bparam.node, "dma-map-benchmark/%d", i);
+		if (IS_ERR(tsk[i])) {
+			pr_err("create dma_map thread failed\n");
+			ret = PTR_ERR(tsk[i]);
+			goto out;
+		}
+
+		if (node != NUMA_NO_NODE)
+			kthread_bind_mask(tsk[i], cpu_mask);
+	}
+
+	/* clear old values from the previous benchmark run */
+	atomic64_set(&map->sum_map_100ns, 0);
+	atomic64_set(&map->sum_unmap_100ns, 0);
+	atomic64_set(&map->sum_sq_map, 0);
+	atomic64_set(&map->sum_sq_unmap, 0);
+	atomic64_set(&map->loops, 0);
+
+	for (i = 0; i < threads; i++)
+		wake_up_process(tsk[i]);
+
+	msleep_interruptible(map->bparam.seconds * 1000);
+
+	/* wait for the completion of benchmark threads */
+	for (i = 0; i < threads; i++) {
+		ret = kthread_stop(tsk[i]);
+		if (ret)
+			goto out;
+	}
+
+	loops = atomic64_read(&map->loops);
+	if (likely(loops > 0)) {
+		u64 map_variance, unmap_variance;
+		u64 sum_map = atomic64_read(&map->sum_map_100ns);
+		u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns);
+		u64 sum_sq_map = atomic64_read(&map->sum_sq_map);
+		u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap);
+
+		/* average latency */
+		map->bparam.avg_map_100ns = div64_u64(sum_map, loops);
+		map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops);
+
+		/* standard deviation of latency */
+		map_variance = div64_u64(sum_sq_map, loops) -
+				map->bparam.avg_map_100ns *
+				map->bparam.avg_map_100ns;
+		unmap_variance = div64_u64(sum_sq_unmap, loops) -
+				map->bparam.avg_unmap_100ns *
+				map->bparam.avg_unmap_100ns;
+		map->bparam.map_stddev = int_sqrt64(map_variance);
+		map->bparam.unmap_stddev = int_sqrt64(unmap_variance);
+	}
+
+out:
+	put_device(map->dev);
+	kfree(tsk);
+	return ret;
+}
+
+static long map_benchmark_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct map_benchmark_data *map = file->private_data;
+	void __user *argp = (void __user *)arg;
+	u64 old_dma_mask;
+	int ret;
+
+	if (copy_from_user(&map->bparam, argp, sizeof(map->bparam)))
+		return -EFAULT;
+
+	switch (cmd) {
+	case DMA_MAP_BENCHMARK:
+		if (map->bparam.threads == 0 ||
+		    map->bparam.threads > DMA_MAP_MAX_THREADS) {
+			pr_err("invalid thread number\n");
+			return -EINVAL;
+		}
+
+		if (map->bparam.seconds == 0 ||
+		    map->bparam.seconds > DMA_MAP_MAX_SECONDS) {
+			pr_err("invalid duration seconds\n");
+			return -EINVAL;
+		}
+
+		if (map->bparam.node != NUMA_NO_NODE &&
+		    !node_possible(map->bparam.node)) {
+			pr_err("invalid numa node\n");
+			return -EINVAL;
+		}
+
+		switch (map->bparam.dma_dir) {
+		case DMA_MAP_BIDIRECTIONAL:
+			map->dir = DMA_BIDIRECTIONAL;
+			break;
+		case DMA_MAP_FROM_DEVICE:
+			map->dir = DMA_FROM_DEVICE;
+			break;
+		case DMA_MAP_TO_DEVICE:
+			map->dir = DMA_TO_DEVICE;
+			break;
+		default:
+			pr_err("invalid DMA direction\n");
+			return -EINVAL;
+		}
+
+		old_dma_mask = dma_get_mask(map->dev);
+
+		ret = dma_set_mask(map->dev,
+				   DMA_BIT_MASK(map->bparam.dma_bits));
+		if (ret) {
+			pr_err("failed to set dma_mask on device %s\n",
+				dev_name(map->dev));
+			return -EINVAL;
+		}
+
+		ret = do_map_benchmark(map);
+
+		/*
+		 * Restore the original dma_mask: many devices' dma_mask is
+		 * set by architectures, ACPI or busses, and when they are
+		 * bound back to their original drivers those drivers should
+		 * not see a dma_mask changed by the benchmark.
+		 */
+		dma_set_mask(map->dev, old_dma_mask);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (copy_to_user(argp, &map->bparam, sizeof(map->bparam)))
+		return -EFAULT;
+
+	return ret;
+}
+
+static const struct file_operations map_benchmark_fops = {
+	.open			= simple_open,
+	.unlocked_ioctl		= map_benchmark_ioctl,
+};
+
+static void map_benchmark_remove_debugfs(void *data)
+{
+	struct map_benchmark_data *map = (struct map_benchmark_data *)data;
+
+	debugfs_remove(map->debugfs);
+}
+
+static int __map_benchmark_probe(struct device *dev)
+{
+	struct dentry *entry;
+	struct map_benchmark_data *map;
+	int ret;
+
+	map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+	map->dev = dev;
+
+	ret = devm_add_action(dev, map_benchmark_remove_debugfs, map);
+	if (ret) {
+		pr_err("Can't add debugfs remove action\n");
+		return ret;
+	}
+
+	/*
+	 * We only permit one device to be bound to this driver; a second
+	 * probe will fail.
+	 */
+	entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map,
+			&map_benchmark_fops);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+	map->debugfs = entry;
+
+	return 0;
+}
+
+static int map_benchmark_platform_probe(struct platform_device *pdev)
+{
+	return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct platform_driver map_benchmark_platform_driver = {
+	.driver		= {
+		.name	= "dma_map_benchmark",
+	},
+	.probe = map_benchmark_platform_probe,
+};
+
+static int
+map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	return __map_benchmark_probe(&pdev->dev);
+}
+
+static struct pci_driver map_benchmark_pci_driver = {
+	.name	= "dma_map_benchmark",
+	.probe	= map_benchmark_pci_probe,
+};
+
+static int __init map_benchmark_init(void)
+{
+	int ret;
+
+	ret = pci_register_driver(&map_benchmark_pci_driver);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&map_benchmark_platform_driver);
+	if (ret) {
+		pci_unregister_driver(&map_benchmark_pci_driver);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void __exit map_benchmark_cleanup(void)
+{
+	platform_driver_unregister(&map_benchmark_platform_driver);
+	pci_unregister_driver(&map_benchmark_pci_driver);
+}
+
+module_init(map_benchmark_init);
+module_exit(map_benchmark_cleanup);
+
+MODULE_AUTHOR("Barry Song");
+MODULE_DESCRIPTION("dma_map benchmark driver");
+MODULE_LICENSE("GPL");
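
A note on the statistics above: do_map_benchmark() keeps running sums
and sums of squares so it can report both the mean and the standard
deviation without storing individual samples, using the identity
Var(X) = E[X^2] - (E[X])^2 in 100ns units. A minimal user-space
restatement of that arithmetic (stddev_100ns() is a hypothetical helper,
mirroring the kernel's div64_u64() and int_sqrt64() steps):

#include <math.h>
#include <stdint.h>

/* Standard deviation from the running sums kept by the kernel threads. */
static uint64_t stddev_100ns(uint64_t sum, uint64_t sum_sq, uint64_t loops)
{
	uint64_t mean = sum / loops;

	/* mean of squares minus square of mean */
	return (uint64_t)sqrt((double)(sum_sq / loops - mean * mean));
}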
From 7679325702c90aecd393cd7cde685576c14489c0 Mon Sep 17 00:00:00 2001
From: Barry Song
Date: Mon, 16 Nov 2020 19:08:48 +1300
Subject: [PATCH 6/6] selftests/dma: add test application for DMA_MAP_BENCHMARK

This patch provides the test application for DMA_MAP_BENCHMARK.

Before running the test application, we need to bind a device to the
dma_map_benchmark driver. For example, unbind "xxx" from its original
driver and bind it to dma_map_benchmark:

echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override
echo xxx > /sys/bus/platform/drivers/xxx/unbind
echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind

Another example for PCI devices:

echo dma_map_benchmark > /sys/bus/pci/devices/0000:00:01.0/driver_override
echo 0000:00:01.0 > /sys/bus/pci/drivers/xxx/unbind
echo 0000:00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind

The below command will run 16 threads on NUMA node 0 for 10 seconds on
the device bound to the dma_map_benchmark platform_driver or pci_driver
(a build recipe for the selftest follows at the end of this series):

./dma_map_benchmark -t 16 -s 10 -n 0
dma mapping benchmark: threads:16 seconds:10
average map latency(us):1.1 standard deviation:1.9
average unmap latency(us):0.5 standard deviation:0.8

Cc: Will Deacon
Cc: Shuah Khan
Cc: Christoph Hellwig
Cc: Marek Szyprowski
Cc: Robin Murphy
Signed-off-by: Barry Song
Signed-off-by: Christoph Hellwig
---
 MAINTAINERS                                   |   6 +
 tools/testing/selftests/dma/Makefile          |   6 +
 tools/testing/selftests/dma/config            |   1 +
 .../testing/selftests/dma/dma_map_benchmark.c | 123 ++++++++++++++++++
 4 files changed, 136 insertions(+)
 create mode 100644 tools/testing/selftests/dma/Makefile
 create mode 100644 tools/testing/selftests/dma/config
 create mode 100644 tools/testing/selftests/dma/dma_map_benchmark.c

diff --git a/MAINTAINERS b/MAINTAINERS
index a008b70f3c16..c88abd1d323b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5237,6 +5237,12 @@ F:	include/linux/dma-mapping.h
 F:	include/linux/dma-map-ops.h
 F:	kernel/dma/
 
+DMA MAPPING BENCHMARK
+M:	Barry Song
+L:	iommu@lists.linux-foundation.org
+F:	kernel/dma/map_benchmark.c
+F:	tools/testing/selftests/dma/
+
 DMA-BUF HEAPS FRAMEWORK
 M:	Sumit Semwal
 R:	Benjamin Gaignard
diff --git a/tools/testing/selftests/dma/Makefile b/tools/testing/selftests/dma/Makefile
new file mode 100644
index 000000000000..aa8e8b5b3864
--- /dev/null
+++ b/tools/testing/selftests/dma/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -I../../../../usr/include/
+
+TEST_GEN_PROGS := dma_map_benchmark
+
+include ../lib.mk
diff --git a/tools/testing/selftests/dma/config b/tools/testing/selftests/dma/config
new file mode 100644
index 000000000000..6102ee3c43cd
--- /dev/null
+++ b/tools/testing/selftests/dma/config
@@ -0,0 +1 @@
+CONFIG_DMA_MAP_BENCHMARK=y
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c
new file mode 100644
index 000000000000..7065163a8388
--- /dev/null
+++ b/tools/testing/selftests/dma/dma_map_benchmark.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Hisilicon Limited.
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+
+#define DMA_MAP_BENCHMARK	_IOWR('d', 1, struct map_benchmark)
+#define DMA_MAP_MAX_THREADS	1024
+#define DMA_MAP_MAX_SECONDS	300
+
+#define DMA_MAP_BIDIRECTIONAL	0
+#define DMA_MAP_TO_DEVICE	1
+#define DMA_MAP_FROM_DEVICE	2
+
+static char *directions[] = {
+	"BIDIRECTIONAL",
+	"TO_DEVICE",
+	"FROM_DEVICE",
+};
+
+struct map_benchmark {
+	__u64 avg_map_100ns;	/* average map latency in 100ns */
+	__u64 map_stddev;	/* standard deviation of map latency */
+	__u64 avg_unmap_100ns;	/* as above */
+	__u64 unmap_stddev;
+	__u32 threads;		/* how many threads will do map/unmap in parallel */
+	__u32 seconds;		/* how long the test will last */
+	__s32 node;		/* which NUMA node this benchmark will run on */
+	__u32 dma_bits;		/* DMA addressing capability */
+	__u32 dma_dir;		/* DMA data direction */
+	__u64 expansion[10];	/* for future use */
+};
+
+int main(int argc, char **argv)
+{
+	struct map_benchmark map;
+	int fd, opt;
+	/* default: single thread, run 20 seconds on NUMA_NO_NODE */
+	int threads = 1, seconds = 20, node = -1;
+	/* default: 32-bit dma mask, bidirectional DMA */
+	int bits = 32, dir = DMA_MAP_BIDIRECTIONAL;
+	int cmd = DMA_MAP_BENCHMARK;
+
+	while ((opt = getopt(argc, argv, "t:s:n:b:d:")) != -1) {
+		switch (opt) {
+		case 't':
+			threads = atoi(optarg);
+			break;
+		case 's':
+			seconds = atoi(optarg);
+			break;
+		case 'n':
+			node = atoi(optarg);
+			break;
+		case 'b':
+			bits = atoi(optarg);
+			break;
+		case 'd':
+			dir = atoi(optarg);
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
+		fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
+			DMA_MAP_MAX_THREADS);
+		exit(1);
+	}
+
+	if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) {
+		fprintf(stderr, "invalid number of seconds, must be in 1-%d\n",
+			DMA_MAP_MAX_SECONDS);
+		exit(1);
+	}
+
+	/* assume the minimum DMA zone anywhere in the world is 1MB */
+	if (bits < 20 || bits > 64) {
+		fprintf(stderr, "invalid dma mask bit, must be in 20-64\n");
+		exit(1);
+	}
+
+	if (dir != DMA_MAP_BIDIRECTIONAL && dir != DMA_MAP_TO_DEVICE &&
+	    dir != DMA_MAP_FROM_DEVICE) {
+		fprintf(stderr, "invalid dma direction\n");
+		exit(1);
+	}
+
+	fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
+	if (fd == -1) {
+		perror("open");
+		exit(1);
+	}
+
+	map.seconds = seconds;
+	map.threads = threads;
+	map.node = node;
+	map.dma_bits = bits;
+	map.dma_dir = dir;
+	if (ioctl(fd, cmd, &map)) {
+		perror("ioctl");
+		exit(1);
+	}
+
+	printf("dma mapping benchmark: threads:%d seconds:%d node:%d dir:%s\n",
+	       threads, seconds, node, directions[dir]);
+	printf("average map latency(us):%.1f standard deviation:%.1f\n",
+	       map.avg_map_100ns / 10.0, map.map_stddev / 10.0);
+	printf("average unmap latency(us):%.1f standard deviation:%.1f\n",
+	       map.avg_unmap_100ns / 10.0, map.unmap_stddev / 10.0);
+
+	return 0;
+}
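
To build the selftest, the standard kselftest flow applies. The
commands below assume a kernel tree built with CONFIG_DMA_MAP_BENCHMARK=y,
a device already bound to dma_map_benchmark as described in the commit
message, and root privileges so debugfs is accessible:

make -C tools/testing/selftests TARGETS=dma
./tools/testing/selftests/dma/dma_map_benchmark -t 8 -s 5 -b 64 -d 1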