diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 95fec5968362..4e3e9362afeb 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -161,13 +161,11 @@ device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use:: - long hmm_range_fault(struct hmm_range *range, unsigned int flags); + long hmm_range_fault(struct hmm_range *range); -With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table -entries and will not trigger a page fault on missing or non-present entries. -Without that flag, it does trigger a page fault on missing or read-only entries -if write access is requested (see below). Page faults use the generic mm page -fault code path just like a CPU page fault. +It will trigger a page fault on missing or read-only entries if write access is +requested (see below). Page faults use the generic mm page fault code path just +like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each entry in that array corresponds to an address in the virtual range. HMM @@ -197,7 +195,7 @@ The usage pattern is:: again: range.notifier_seq = mmu_interval_read_begin(&interval_sub); down_read(&mm->mmap_sem); - ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT); + ret = hmm_range_fault(&range); if (ret) { up_read(&mm->mmap_sem); if (ret == -EBUSY) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 79b1202b1c62..f44f6b27950f 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -563,6 +563,7 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, mig.end = end; mig.src = &src_pfn; mig.dst = &dst_pfn; + mig.src_owner = &kvmppc_uvmem_pgmap; mutex_lock(&kvm->arch.uvmem_lock); /* The requested page is already paged-out, nothing to do */ @@ -779,6 +780,8 @@ int kvmppc_uvmem_init(void) kvmppc_uvmem_pgmap.type = MEMORY_DEVICE_PRIVATE; kvmppc_uvmem_pgmap.res = *res; kvmppc_uvmem_pgmap.ops = &kvmppc_uvmem_ops; + /* just one global instance: */ + kvmppc_uvmem_pgmap.owner = &kvmppc_uvmem_pgmap; addr = memremap_pages(&kvmppc_uvmem_pgmap, NUMA_NO_NODE); if (IS_ERR(addr)) { ret = PTR_ERR(addr); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 9f44ba7d9d97..6309ff72bd78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -770,7 +770,6 @@ struct amdgpu_ttm_tt { static const uint64_t hmm_range_flags[HMM_PFN_FLAG_MAX] = { (1 << 0), /* HMM_PFN_VALID */ (1 << 1), /* HMM_PFN_WRITE */ - 0 /* HMM_PFN_DEVICE_PRIVATE */ }; static const uint64_t hmm_range_values[HMM_PFN_VALUE_MAX] = { @@ -851,7 +850,7 @@ retry: range->notifier_seq = mmu_interval_read_begin(&bo->notifier); down_read(&mm->mmap_sem); - r = hmm_range_fault(range, 0); + r = hmm_range_fault(range); up_read(&mm->mmap_sem); if (unlikely(r <= 0)) { /* diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 0ad5d87b5a8e..ad89e09a0be3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -176,6 +177,7 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) .end = vmf->address + PAGE_SIZE, .src = &src, .dst = &dst, + .src_owner = drm->dev, }; /* @@ -526,6 +528,7 @@ nouveau_dmem_init(struct nouveau_drm *drm) drm->dmem->pagemap.type = 
MEMORY_DEVICE_PRIVATE; drm->dmem->pagemap.res = *res; drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops; + drm->dmem->pagemap.owner = drm->dev; if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap))) goto out_free; @@ -669,12 +672,6 @@ out: return ret; } -static inline bool -nouveau_dmem_page(struct nouveau_drm *drm, struct page *page) -{ - return is_device_private_page(page) && drm->dmem == page_to_dmem(page); -} - void nouveau_dmem_convert_pfn(struct nouveau_drm *drm, struct hmm_range *range) @@ -690,18 +687,12 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, if (page == NULL) continue; - if (!(range->pfns[i] & range->flags[HMM_PFN_DEVICE_PRIVATE])) { + if (!is_device_private_page(page)) continue; - } - - if (!nouveau_dmem_page(drm, page)) { - WARN(1, "Some unknown device memory !\n"); - range->pfns[i] = 0; - continue; - } addr = nouveau_dmem_page_addr(page); range->pfns[i] &= ((1UL << range->pfn_shift) - 1); range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift; + range->pfns[i] |= NVIF_VMM_PFNMAP_V0_VRAM; } } diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index df9bf1fd1bc0..e3797b2d4d17 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -367,7 +367,6 @@ static const u64 nouveau_svm_pfn_flags[HMM_PFN_FLAG_MAX] = { [HMM_PFN_VALID ] = NVIF_VMM_PFNMAP_V0_V, [HMM_PFN_WRITE ] = NVIF_VMM_PFNMAP_V0_W, - [HMM_PFN_DEVICE_PRIVATE] = NVIF_VMM_PFNMAP_V0_VRAM, }; static const u64 @@ -541,7 +540,7 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, range.default_flags = 0; range.pfn_flags_mask = -1UL; down_read(&mm->mmap_sem); - ret = hmm_range_fault(&range, 0); + ret = hmm_range_fault(&range); up_read(&mm->mmap_sem); if (ret <= 0) { if (ret == 0 || ret == -EBUSY) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ddf9f7144c43..7475051100c7 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -3,58 +3,8 @@ * Copyright 2013 Red Hat Inc. * * Authors: Jérôme Glisse - */ -/* - * Heterogeneous Memory Management (HMM) * - * See Documentation/vm/hmm.rst for reasons and overview of what HMM is and it - * is for. Here we focus on the HMM API description, with some explanation of - * the underlying implementation. - * - * Short description: HMM provides a set of helpers to share a virtual address - * space between CPU and a device, so that the device can access any valid - * address of the process (while still obeying memory protection). HMM also - * provides helpers to migrate process memory to device memory, and back. Each - * set of functionality (address space mirroring, and migration to and from - * device memory) can be used independently of the other. - * - * - * HMM address space mirroring API: - * - * Use HMM address space mirroring if you want to mirror a range of the CPU - * page tables of a process into a device page table. Here, "mirror" means "keep - * synchronized". Prerequisites: the device must provide the ability to write- - * protect its page tables (at PAGE_SIZE granularity), and must be able to - * recover from the resulting potential page faults. - * - * HMM guarantees that at any point in time, a given virtual address points to - * either the same memory in both CPU and device page tables (that is: CPU and - * device page tables each point to the same pages), or that one page table (CPU - * or device) points to no entry, while the other still points to the old page - * for the address. 
The latter case happens when the CPU page table update - * happens first, and then the update is mirrored over to the device page table. - * This does not cause any issue, because the CPU page table cannot start - * pointing to a new page until the device page table is invalidated. - * - * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any - * updates to each device driver that has registered a mirror. It also provides - * some API calls to help with taking a snapshot of the CPU page table, and to - * synchronize with any updates that might happen concurrently. - * - * - * HMM migration to and from device memory: - * - * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with - * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page - * of the device memory, and allows the device driver to manage its memory - * using those struct pages. Having struct pages for device memory makes - * migration easier. Because that memory is not addressable by the CPU it must - * never be pinned to the device; in other words, any CPU page fault can always - * cause the device memory to be migrated (copied/moved) back to regular memory. - * - * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that - * allows use of a device DMA engine to perform the copy operation between - * regular system memory and device memory. + * See Documentation/vm/hmm.rst for reasons and overview of what HMM is. */ #ifndef LINUX_HMM_H #define LINUX_HMM_H @@ -74,7 +24,6 @@ * Flags: * HMM_PFN_VALID: pfn is valid. It has, at least, read permission. * HMM_PFN_WRITE: CPU page table has write permission set - * HMM_PFN_DEVICE_PRIVATE: private device memory (ZONE_DEVICE) * * The driver provides a flags array for mapping page protections to device * PTE bits. If the driver valid bit for an entry is bit 3, @@ -86,7 +35,6 @@ enum hmm_pfn_flag_e { HMM_PFN_VALID = 0, HMM_PFN_WRITE, - HMM_PFN_DEVICE_PRIVATE, HMM_PFN_FLAG_MAX }; @@ -122,9 +70,6 @@ enum hmm_pfn_value_e { * * @notifier: a mmu_interval_notifier that includes the start/end * @notifier_seq: result of mmu_interval_read_begin() - * @hmm: the core HMM structure this range is active against - * @vma: the vm area struct for the range - * @list: all range lock are on a list * @start: range virtual start address (inclusive) * @end: range virtual end address (exclusive) * @pfns: array of pfns (big enough for the range) @@ -132,8 +77,8 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... 
see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter - * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) - * @valid: pfns array did not change since it has been fill by an HMM function + * @pfn_shift: pfn shift value (should be <= PAGE_SHIFT) + * @dev_private_owner: owner of device private pages */ struct hmm_range { struct mmu_interval_notifier *notifier; @@ -146,6 +91,7 @@ struct hmm_range { uint64_t default_flags; uint64_t pfn_flags_mask; uint8_t pfn_shift; + void *dev_private_owner; }; /* @@ -171,71 +117,10 @@ static inline struct page *hmm_device_entry_to_page(const struct hmm_range *rang return pfn_to_page(entry >> range->pfn_shift); } -/* - * hmm_device_entry_to_pfn() - return pfn value store in a device entry - * @range: range use to decode device entry value - * @entry: device entry to extract pfn from - * Return: pfn value if device entry is valid, -1UL otherwise - */ -static inline unsigned long -hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) -{ - if (pfn == range->values[HMM_PFN_NONE]) - return -1UL; - if (pfn == range->values[HMM_PFN_ERROR]) - return -1UL; - if (pfn == range->values[HMM_PFN_SPECIAL]) - return -1UL; - if (!(pfn & range->flags[HMM_PFN_VALID])) - return -1UL; - return (pfn >> range->pfn_shift); -} - -/* - * hmm_device_entry_from_page() - create a valid device entry for a page - * @range: range use to encode HMM pfn value - * @page: page for which to create the device entry - * Return: valid device entry for the page - */ -static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, - struct page *page) -{ - return (page_to_pfn(page) << range->pfn_shift) | - range->flags[HMM_PFN_VALID]; -} - -/* - * hmm_device_entry_from_pfn() - create a valid device entry value from pfn - * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the device entry - * Return: valid device entry for the pfn - */ -static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, - unsigned long pfn) -{ - return (pfn << range->pfn_shift) | - range->flags[HMM_PFN_VALID]; -} - -/* - * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. - */ -#define HMM_FAULT_ALLOW_RETRY (1 << 0) - -/* Don't fault in missing PTEs, just snapshot the current state. */ -#define HMM_FAULT_SNAPSHOT (1 << 1) - -#ifdef CONFIG_HMM_MIRROR /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ -long hmm_range_fault(struct hmm_range *range, unsigned int flags); -#else -static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags) -{ - return -EOPNOTSUPP; -} -#endif +long hmm_range_fault(struct hmm_range *range); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 6fefb09af7c3..60d97e8fd3c0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -103,6 +103,9 @@ struct dev_pagemap_ops { * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table + * @owner: an opaque pointer identifying the entity that manages this + * instance. Used by various helpers to make sure that no + * foreign ZONE_DEVICE memory is accessed. 
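
As a rough illustration of the slimmed-down structure, a minimal driver-side sketch (not taken from the patch) of filling in struct hmm_range now that the HMM_PFN_DEVICE_PRIVATE slot is gone and dev_private_owner has been added; the flag/value tables and the "mydev" owner pointer are assumptions::

#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/mmu_notifier.h>

/* Driver-chosen encoding; only VALID and WRITE remain as flag slots. */
static const uint64_t mydev_hmm_flags[HMM_PFN_FLAG_MAX] = {
    [HMM_PFN_VALID] = 1UL << 0,
    [HMM_PFN_WRITE] = 1UL << 1,
};

static const uint64_t mydev_hmm_values[HMM_PFN_VALUE_MAX] = {
    [HMM_PFN_ERROR]   = 0xfffffffffffffffeUL,
    [HMM_PFN_NONE]    = 0,
    [HMM_PFN_SPECIAL] = 0xfffffffffffffffcUL,
};

static void mydev_init_range(struct hmm_range *range, void *owner,
                             struct mmu_interval_notifier *notifier,
                             unsigned long start, unsigned long end,
                             uint64_t *pfns)
{
    *range = (struct hmm_range) {
        .notifier = notifier,
        .start    = start,
        .end      = end,
        .pfns     = pfns,
        .flags    = mydev_hmm_flags,
        .values   = mydev_hmm_values,
        .pfn_shift = PAGE_SHIFT,
        /*
         * Device private pages whose pgmap->owner equals this pointer
         * are reported as device entries instead of being faulted back
         * to system memory.
         */
        .dev_private_owner = owner,
    };
    /* default_flags/pfn_flags_mask left zero here: snapshot only. */
}

pfn_shift only has to leave room for the driver's flag bits (and stay <= PAGE_SHIFT), which is what the amdgpu and nouveau tables above keep doing.
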
*/ struct dev_pagemap { struct vmem_altmap altmap; @@ -113,6 +116,7 @@ struct dev_pagemap { enum memory_type type; unsigned int flags; const struct dev_pagemap_ops *ops; + void *owner; }; static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 72120061b7d4..3e546cbf03dd 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -196,6 +196,14 @@ struct migrate_vma { unsigned long npages; unsigned long start; unsigned long end; + + /* + * Set to the owner value also stored in page->pgmap->owner for + * migrating out of device private memory. If set only device + * private pages with this owner are migrated. If not set + * device private pages are not migrated at all. + */ + void *src_owner; }; int migrate_vma_setup(struct migrate_vma *args); diff --git a/mm/hmm.c b/mm/hmm.c index 72e5a6d9a417..280585833adf 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -28,41 +28,25 @@ struct hmm_vma_walk { struct hmm_range *range; - struct dev_pagemap *pgmap; unsigned long last; - unsigned int flags; }; -static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, - bool write_fault, uint64_t *pfn) +enum { + HMM_NEED_FAULT = 1 << 0, + HMM_NEED_WRITE_FAULT = 1 << 1, + HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT, +}; + +/* + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn + * @range: range use to encode HMM pfn value + * @pfn: pfn value for which to create the device entry + * Return: valid device entry for the pfn + */ +static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, + unsigned long pfn) { - unsigned int flags = FAULT_FLAG_REMOTE; - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; - vm_fault_t ret; - - if (!vma) - goto err; - - if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) - flags |= FAULT_FLAG_ALLOW_RETRY; - if (write_fault) - flags |= FAULT_FLAG_WRITE; - - ret = handle_mm_fault(vma, addr, flags); - if (ret & VM_FAULT_RETRY) { - /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ - return -EAGAIN; - } - if (ret & VM_FAULT_ERROR) - goto err; - - return -EBUSY; - -err: - *pfn = range->values[HMM_PFN_ERROR]; - return -EFAULT; + return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } static int hmm_pfns_fill(unsigned long addr, unsigned long end, @@ -79,56 +63,43 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end, } /* - * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) + * hmm_vma_fault() - fault in a range lacking valid pmd or pte(s) * @addr: range virtual start address (inclusive) * @end: range virtual end address (exclusive) - * @fault: should we fault or not ? - * @write_fault: write fault ? + * @required_fault: HMM_NEED_* flags * @walk: mm_walk structure - * Return: 0 on success, -EBUSY after page fault, or page fault error + * Return: -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. 
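
To show how the new src_owner field is meant to be used, a hedged sketch of a ->migrate_to_ram() fault handler in the same shape as the nouveau and kvmppc hunks above; the mydev_pagemap object and the data-copy step are assumptions, not part of the patch::

#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/migrate.h>

static struct dev_pagemap mydev_pagemap;

static vm_fault_t mydev_migrate_to_ram(struct vm_fault *vmf)
{
    unsigned long src = 0, dst = 0;
    struct migrate_vma args = {
        .vma   = vmf->vma,
        .start = vmf->address,
        .end   = vmf->address + PAGE_SIZE,
        .src   = &src,
        .dst   = &dst,
        /* same pointer stored in pgmap->owner at memremap_pages() time */
        .src_owner = &mydev_pagemap,
    };

    /* Only device private pages owned by this driver are collected. */
    if (migrate_vma_setup(&args))
        return VM_FAULT_SIGBUS;
    if (!(src & MIGRATE_PFN_MIGRATE))
        return 0;

    /* ... allocate a system page, copy the device data, fill dst ... */

    migrate_vma_pages(&args);
    migrate_vma_finalize(&args);
    return 0;
}
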
*/ -static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, - bool fault, bool write_fault, - struct mm_walk *walk) +static int hmm_vma_fault(unsigned long addr, unsigned long end, + unsigned int required_fault, struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = range->pfns; - unsigned long i; + struct vm_area_struct *vma = walk->vma; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + WARN_ON_ONCE(!required_fault); hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) - return -EPERM; - - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = range->values[HMM_PFN_NONE]; - if (fault || write_fault) { - int ret; - - ret = hmm_vma_do_fault(walk, addr, write_fault, - &pfns[i]); - if (ret != -EBUSY) - return ret; - } + if (required_fault & HMM_NEED_WRITE_FAULT) { + if (!(vma->vm_flags & VM_WRITE)) + return -EPERM; + fault_flags |= FAULT_FLAG_WRITE; } - return (fault || write_fault) ? -EBUSY : 0; + for (; addr < end; addr += PAGE_SIZE) + if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR) + return -EFAULT; + return -EBUSY; } -static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - uint64_t pfns, uint64_t cpu_flags, - bool *fault, bool *write_fault) +static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags) { struct hmm_range *range = hmm_vma_walk->range; - if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) - return; - /* * So we not only consider the individual per page request we also * consider the default flags requested for the range. The API can @@ -143,46 +114,44 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) - return; - /* If this is device memory then only fault if explicitly requested */ - if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { - /* Do we fault on device memory ? */ - if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { - *write_fault = pfns & range->flags[HMM_PFN_WRITE]; - *fault = true; - } - return; - } + return 0; - /* If CPU page table is not valid then we need to fault */ - *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); /* Need to write fault ? */ if ((pfns & range->flags[HMM_PFN_WRITE]) && - !(cpu_flags & range->flags[HMM_PFN_WRITE])) { - *write_fault = true; - *fault = true; - } + !(cpu_flags & range->flags[HMM_PFN_WRITE])) + return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT; + + /* If CPU page table is not valid then we need to fault */ + if (!(cpu_flags & range->flags[HMM_PFN_VALID])) + return HMM_NEED_FAULT; + return 0; } -static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, - const uint64_t *pfns, unsigned long npages, - uint64_t cpu_flags, bool *fault, - bool *write_fault) +static unsigned int +hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags) { + struct hmm_range *range = hmm_vma_walk->range; + unsigned int required_fault = 0; unsigned long i; - if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { - *fault = *write_fault = false; - return; - } + /* + * If the default flags do not request to fault pages, and the mask does + * not allow for individual pages to be faulted, then + * hmm_pte_need_fault() will always return 0. 
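
The early return added to hmm_range_need_fault() formalises what HMM_FAULT_SNAPSHOT used to express: if neither default_flags nor pfn_flags_mask can contribute the driver's VALID bit, no page can ever be faulted. A small sketch of the two resulting caller policies, using the assumed flag table from the earlier sketch::

#include <linux/hmm.h>

/* "flags" is the driver's HMM_PFN_FLAG_MAX-sized encoding table. */
static void mydev_set_fault_policy(struct hmm_range *range,
                                   const uint64_t *flags, bool snapshot_only)
{
    if (snapshot_only) {
        /*
         * Nothing can request HMM_PFN_VALID, so the new early exit
         * applies and the walk never faults anything in (the old
         * HMM_FAULT_SNAPSHOT behaviour).
         */
        range->default_flags = 0;
        range->pfn_flags_mask = 0;
    } else {
        /*
         * Fault every page of the range with write permission,
         * regardless of what is pre-loaded in range->pfns[].
         */
        range->default_flags = flags[HMM_PFN_VALID] |
                               flags[HMM_PFN_WRITE];
        range->pfn_flags_mask = 0;
    }
}

A third option, which the nouveau_svm.c hunk above keeps using, is default_flags = 0 with pfn_flags_mask = -1UL, so each pre-loaded pfns[] entry decides for itself whether it gets faulted in or merely snapshotted.
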
+ */ + if (!((range->default_flags | range->pfn_flags_mask) & + range->flags[HMM_PFN_VALID])) + return 0; - *fault = *write_fault = false; for (i = 0; i < npages; ++i) { - hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, - fault, write_fault); - if ((*write_fault)) - return; + required_fault |= + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags); + if (required_fault == HMM_NEED_ALL_BITS) + return required_fault; } + return required_fault; } static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, @@ -190,16 +159,23 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - bool fault, write_fault; + unsigned int required_fault; unsigned long i, npages; uint64_t *pfns; i = (addr - range->start) >> PAGE_SHIFT; npages = (end - addr) >> PAGE_SHIFT; pfns = &range->pfns[i]; - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - 0, &fault, &write_fault); - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0); + if (!walk->vma) { + if (required_fault) + return -EFAULT; + return hmm_pfns_fill(addr, end, range, HMM_PFN_ERROR); + } + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); + hmm_vma_walk->last = addr; + return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE); } static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) @@ -218,31 +194,19 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; - bool fault, write_fault; + unsigned int required_fault; uint64_t cpu_flags; npages = (end - addr) >> PAGE_SHIFT; cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); - hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, - &fault, &write_fault); - - if (pmd_protnone(pmd) || fault || write_fault) - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + required_fault = + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags); + if (required_fault) + return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { - if (pmd_devmap(pmd)) { - hmm_vma_walk->pgmap = get_dev_pagemap(pfn, - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) - return -EBUSY; - } + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - } - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } hmm_vma_walk->last = end; return 0; } @@ -252,6 +216,14 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, uint64_t *pfns, pmd_t pmd); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline bool hmm_is_device_private_entry(struct hmm_range *range, + swp_entry_t entry) +{ + return is_device_private_entry(entry) && + device_private_entry_to_page(entry)->pgmap->owner == + range->dev_private_owner; +} + static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) @@ -267,102 +239,81 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - bool fault, write_fault; + unsigned 
int required_fault; uint64_t cpu_flags; pte_t pte = *ptep; uint64_t orig_pfn = *pfn; - *pfn = range->values[HMM_PFN_NONE]; - fault = write_fault = false; - if (pte_none(pte)) { - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, - &fault, &write_fault); - if (fault || write_fault) + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + if (required_fault) goto fault; + *pfn = range->values[HMM_PFN_NONE]; return 0; } if (!pte_present(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - if (!non_swap_entry(entry)) { - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) - goto fault; + /* + * Never fault in device private pages pages, but just report + * the PFN even if not present. + */ + if (hmm_is_device_private_entry(range, entry)) { + *pfn = hmm_device_entry_from_pfn(range, + device_private_entry_to_pfn(entry)); + *pfn |= range->flags[HMM_PFN_VALID]; + if (is_write_device_private_entry(entry)) + *pfn |= range->flags[HMM_PFN_WRITE]; return 0; } - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. - */ - if (is_device_private_entry(entry)) { - cpu_flags = range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_DEVICE_PRIVATE]; - cpu_flags |= is_write_device_private_entry(entry) ? - range->flags[HMM_PFN_WRITE] : 0; - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) - goto fault; - *pfn = hmm_device_entry_from_pfn(range, - swp_offset(entry)); - *pfn |= cpu_flags; + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0); + if (!required_fault) { + *pfn = range->values[HMM_PFN_NONE]; return 0; } + if (!non_swap_entry(entry)) + goto fault; + if (is_migration_entry(entry)) { - if (fault || write_fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(walk->mm, pmdp, addr); - return -EBUSY; - } - return 0; + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(walk->mm, pmdp, addr); + return -EBUSY; } /* Report error for everything else */ - *pfn = range->values[HMM_PFN_ERROR]; + pte_unmap(ptep); return -EFAULT; - } else { - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); } - if (fault || write_fault) + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + if (required_fault) goto fault; - if (pte_devmap(pte)) { - hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) - return -EBUSY; - } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { - if (!is_zero_pfn(pte_pfn(pte))) { - *pfn = range->values[HMM_PFN_SPECIAL]; + /* + * Since each architecture defines a struct page for the zero page, just + * fall through and treat it like a normal page. + */ + if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) { + if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) { + pte_unmap(ptep); return -EFAULT; } - /* - * Since each architecture defines a struct page for the zero - * page, just fall through and treat it like a normal page. 
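
Because hmm_vma_handle_pte() now only reports device private entries whose pgmap->owner matches range->dev_private_owner, consumers no longer need the removed HMM_PFN_DEVICE_PRIVATE flag; checking the page itself is enough, as the reworked nouveau_dmem_convert_pfn() above does. A generic sketch, where mydev_vram_addr() is a made-up helper::

#include <linux/mm.h>
#include <linux/hmm.h>

unsigned long mydev_vram_addr(struct page *page);    /* hypothetical */

static void mydev_convert_pfns(struct hmm_range *range)
{
    unsigned long i, npages = (range->end - range->start) >> PAGE_SHIFT;

    for (i = 0; i < npages; i++) {
        struct page *page;

        page = hmm_device_entry_to_page(range, range->pfns[i]);
        if (!page || !is_device_private_page(page))
            continue;    /* ordinary system memory, or no mapping */

        /* keep the low flag bits, swap in the device address */
        range->pfns[i] &= (1UL << range->pfn_shift) - 1;
        range->pfns[i] |= (mydev_vram_addr(page) >> PAGE_SHIFT) <<
                          range->pfn_shift;
    }
}
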
- */ + *pfn = range->values[HMM_PFN_SPECIAL]; + return 0; } *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + return hmm_vma_fault(addr, end, required_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -372,8 +323,9 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - uint64_t *pfns = range->pfns; - unsigned long addr = start, i; + uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT]; + unsigned long npages = (end - start) >> PAGE_SHIFT; + unsigned long addr = start; pte_t *ptep; pmd_t pmd; @@ -383,24 +335,19 @@ again: return hmm_vma_walk_hole(start, end, -1, walk); if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { - bool fault, write_fault; - unsigned long npages; - uint64_t *pfns; - - i = (addr - range->start) >> PAGE_SHIFT; - npages = (end - addr) >> PAGE_SHIFT; - pfns = &range->pfns[i]; - - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - 0, &fault, &write_fault); - if (fault || write_fault) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } - return 0; - } else if (!pmd_present(pmd)) + return hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + } + + if (!pmd_present(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* @@ -417,8 +364,7 @@ again: if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - i = (addr - range->start) >> PAGE_SHIFT; - return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); + return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd); } /* @@ -427,31 +373,23 @@ again: * entry pointing to pte directory or it is a bad pmd that will not * recover. */ - if (pmd_bad(pmd)) + if (pmd_bad(pmd)) { + if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) + return -EFAULT; return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + } ptep = pte_offset_map(pmdp, addr); - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { + for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) { int r; - r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns); if (r) { - /* hmm_vma_handle_pte() did unmap pte directory */ + /* hmm_vma_handle_pte() did pte_unmap() */ hmm_vma_walk->last = addr; return r; } } - if (hmm_vma_walk->pgmap) { - /* - * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() - * so that we can leverage get_dev_pagemap() optimization which - * will not re-take a reference on a pgmap if we already have - * one. 
- */ - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; @@ -487,18 +425,18 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pud = READ_ONCE(*pudp); if (pud_none(pud)) { - ret = hmm_vma_walk_hole(start, end, -1, walk); - goto out_unlock; + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); } if (pud_huge(pud) && pud_devmap(pud)) { unsigned long i, npages, pfn; + unsigned int required_fault; uint64_t *pfns, cpu_flags; - bool fault, write_fault; if (!pud_present(pud)) { - ret = hmm_vma_walk_hole(start, end, -1, walk); - goto out_unlock; + spin_unlock(ptl); + return hmm_vma_walk_hole(start, end, -1, walk); } i = (addr - range->start) >> PAGE_SHIFT; @@ -506,29 +444,17 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, pfns = &range->pfns[i]; cpu_flags = pud_to_hmm_pfn_flags(range, pud); - hmm_range_need_fault(hmm_vma_walk, pfns, npages, - cpu_flags, &fault, &write_fault); - if (fault || write_fault) { - ret = hmm_vma_walk_hole_(addr, end, fault, - write_fault, walk); - goto out_unlock; + required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, + npages, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) { - hmm_vma_walk->pgmap = get_dev_pagemap(pfn, - hmm_vma_walk->pgmap); - if (unlikely(!hmm_vma_walk->pgmap)) { - ret = -EBUSY; - goto out_unlock; - } + for (i = 0; i < npages; ++i, ++pfn) pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; - } - if (hmm_vma_walk->pgmap) { - put_dev_pagemap(hmm_vma_walk->pgmap); - hmm_vma_walk->pgmap = NULL; - } hmm_vma_walk->last = end; goto out_unlock; } @@ -554,24 +480,20 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; uint64_t orig_pfn, cpu_flags; - bool fault, write_fault; + unsigned int required_fault; spinlock_t *ptl; pte_t entry; - int ret = 0; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); i = (start - range->start) >> PAGE_SHIFT; orig_pfn = range->pfns[i]; - range->pfns[i] = range->values[HMM_PFN_NONE]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); - fault = write_fault = false; - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); - if (fault || write_fault) { - ret = -ENOENT; - goto unlock; + required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags); + if (required_fault) { + spin_unlock(ptl); + return hmm_vma_fault(addr, end, required_fault, walk); } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); @@ -579,14 +501,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; hmm_vma_walk->last = end; - -unlock: spin_unlock(ptl); - - if (ret == -ENOENT) - return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); - - return ret; + return 0; } #else #define hmm_vma_walk_hugetlb_entry NULL @@ -599,40 +515,32 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + vma->vm_flags & VM_READ) + return 0; + /* - * Skip vma ranges that don't have struct page backing them or - * map I/O devices 
directly. + * vma ranges that don't have struct page backing them or map I/O + * devices directly cannot be handled by hmm_range_fault(). + * + * If the vma does not allow read access, then assume that it does not + * allow write access either. HMM does not support architectures that + * allow write without read. + * + * If a fault is requested for an unsupported range then it is a hard + * failure. */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) + if (hmm_range_need_fault(hmm_vma_walk, + range->pfns + + ((start - range->start) >> PAGE_SHIFT), + (end - start) >> PAGE_SHIFT, 0)) return -EFAULT; - /* - * If the vma does not allow read access, then assume that it does not - * allow write access either. HMM does not support architectures - * that allow write without read. - */ - if (!(vma->vm_flags & VM_READ)) { - bool fault, write_fault; + hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); + hmm_vma_walk->last = end; - /* - * Check to see if a fault is requested for any page in the - * range. - */ - hmm_range_need_fault(hmm_vma_walk, range->pfns + - ((start - range->start) >> PAGE_SHIFT), - (end - start) >> PAGE_SHIFT, - 0, &fault, &write_fault); - if (fault || write_fault) - return -EFAULT; - - hmm_pfns_fill(start, end, range, HMM_PFN_NONE); - hmm_vma_walk->last = end; - - /* Skip this vma and continue processing the next vma. */ - return 1; - } - - return 0; + /* Skip this vma and continue processing the next vma. */ + return 1; } static const struct mm_walk_ops hmm_walk_ops = { @@ -645,8 +553,7 @@ static const struct mm_walk_ops hmm_walk_ops = { /** * hmm_range_fault - try to fault some address in a virtual address range - * @range: range being faulted - * @flags: HMM_FAULT_* flags + * @range: argument structure * * Return: the number of valid pages in range->pfns[] (from range start * address), which may be zero. On error one of the following status codes @@ -657,26 +564,19 @@ static const struct mm_walk_ops hmm_walk_ops = { * -ENOMEM: Out of memory. * -EPERM: Invalid permission (e.g., asking for write and range is read * only). - * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. * -EBUSY: The range has been invalidated and the caller needs to wait for * the invalidation to finish. - * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access - * that range) number of valid pages in range->pfns[] (from - * range start address). + * -EFAULT: A page was requested to be valid and could not be made valid + * ie it has no backing VMA or it is illegal to access * - * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs - * and caller does not ask for migration. - * - * On error, for one virtual address in the range, the function will mark the - * corresponding HMM pfn entry with an error flag. + * This is similar to get_user_pages(), except that it can read the page tables + * without mutating them (ie causing faults). 
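
Tying the return codes together, a hedged sketch of the caller-side loop around the new prototype, in the shape the hmm.rst example above uses; struct mydev, its pt_lock and mydev_program_ptes() are assumptions::

#include <linux/mm.h>
#include <linux/hmm.h>
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct mydev {
    struct mutex pt_lock;        /* protects the device page table */
};

void mydev_program_ptes(struct mydev *mydev, struct hmm_range *range);

static long mydev_populate(struct mydev *mydev, struct hmm_range *range)
{
    struct mm_struct *mm = range->notifier->mm;
    long ret;

again:
    range->notifier_seq = mmu_interval_read_begin(range->notifier);
    down_read(&mm->mmap_sem);
    ret = hmm_range_fault(range);
    up_read(&mm->mmap_sem);
    if (ret < 0) {
        if (ret == -EBUSY)
            goto again;        /* invalidated while walking */
        return ret;
    }

    mutex_lock(&mydev->pt_lock);
    if (mmu_interval_read_retry(range->notifier, range->notifier_seq)) {
        mutex_unlock(&mydev->pt_lock);
        goto again;            /* invalidated before we could commit */
    }
    mydev_program_ptes(mydev, range);
    mutex_unlock(&mydev->pt_lock);
    return ret;                /* number of valid pfns[] entries */
}
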
*/ -long hmm_range_fault(struct hmm_range *range, unsigned int flags) +long hmm_range_fault(struct hmm_range *range) { struct hmm_vma_walk hmm_vma_walk = { .range = range, .last = range->start, - .flags = flags, }; struct mm_struct *mm = range->notifier->mm; int ret; diff --git a/mm/memremap.c b/mm/memremap.c index 09b5b7adc773..9b2c97ceb775 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -181,6 +181,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "Missing migrate_to_ram method\n"); return ERR_PTR(-EINVAL); } + if (!pgmap->owner) { + WARN(1, "Missing owner\n"); + return ERR_PTR(-EINVAL); + } break; case MEMORY_DEVICE_FS_DAX: if (!IS_ENABLED(CONFIG_ZONE_DEVICE) || diff --git a/mm/migrate.c b/mm/migrate.c index b1092876e537..7605d2c23433 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2241,7 +2241,7 @@ again: arch_enter_lazy_mmu_mode(); for (; addr < end; addr += PAGE_SIZE, ptep++) { - unsigned long mpfn, pfn; + unsigned long mpfn = 0, pfn; struct page *page; swp_entry_t entry; pte_t pte; @@ -2255,8 +2255,6 @@ again: } if (!pte_present(pte)) { - mpfn = 0; - /* * Only care about unaddressable device page special * page table entry. Other special swap entries are not @@ -2267,11 +2265,16 @@ again: goto next; page = device_private_entry_to_page(entry); + if (page->pgmap->owner != migrate->src_owner) + goto next; + mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { + if (migrate->src_owner) + goto next; pfn = pte_pfn(pte); if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE;
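
Finally, since memremap_pages() now refuses a MEMORY_DEVICE_PRIVATE pgmap without an owner, registration looks roughly like the kvmppc_uvmem_init() hunk above. One sketch, reusing the hypothetical names from the earlier sketches and leaving the resource size and the ops implementations as assumptions::

#include <linux/mm.h>
#include <linux/err.h>
#include <linux/ioport.h>
#include <linux/memremap.h>

static struct dev_pagemap mydev_pagemap;

static void mydev_page_free(struct page *page);               /* hypothetical */
static vm_fault_t mydev_migrate_to_ram(struct vm_fault *vmf); /* hypothetical */

static const struct dev_pagemap_ops mydev_pagemap_ops = {
    .page_free      = mydev_page_free,
    .migrate_to_ram = mydev_migrate_to_ram,
};

static int mydev_register_vram(unsigned long size)
{
    struct resource *res;
    void *addr;

    res = request_free_mem_region(&iomem_resource, size, "mydev");
    if (IS_ERR(res))
        return PTR_ERR(res);

    mydev_pagemap.type = MEMORY_DEVICE_PRIVATE;
    mydev_pagemap.res = *res;
    mydev_pagemap.ops = &mydev_pagemap_ops;
    /*
     * Any stable, driver-unique pointer works; the same value is later
     * matched against hmm_range.dev_private_owner and
     * migrate_vma.src_owner.
     */
    mydev_pagemap.owner = &mydev_pagemap;

    addr = memremap_pages(&mydev_pagemap, NUMA_NO_NODE);
    if (IS_ERR(addr))
        return PTR_ERR(addr);
    return 0;
}
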