mirror of
https://github.com/torvalds/linux.git
synced 2024-12-25 04:11:49 +00:00
314e51b985
A long time ago, in v2.4, VM_RESERVED kept the swapout process off a VMA. It has since lost its original meaning, but it still has some effects:

   | effect                 | alternative flags
  -+------------------------+---------------------------------------------
  1| account as reserved_vm | VM_IO
  2| skip in core dump      | VM_IO, VM_DONTDUMP
  3| do not merge or expand | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP
  4| do not mlock           | VM_IO, VM_DONTEXPAND, VM_HUGETLB, VM_PFNMAP

This patch removes the reserved_vm counter from mm_struct. It seems that nobody cares about it: it is not exported to userspace directly, it only reduces the total_vm shown in proc.

Thus VM_RESERVED can be replaced with VM_IO or with the pair VM_DONTEXPAND | VM_DONTDUMP.

remap_pfn_range() and io_remap_pfn_range() set VM_IO|VM_DONTEXPAND|VM_DONTDUMP.
remap_vmalloc_range() sets VM_DONTEXPAND | VM_DONTDUMP.

[akpm@linux-foundation.org: drivers/vfio/pci/vfio_pci.c fixup]
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
466 lines
11 KiB
C
466 lines
11 KiB
C
/**************************************************************************
|
|
*
|
|
* Copyright (c) 2006-2009 VMware, Inc., Palo Alto, CA., USA
|
|
* All Rights Reserved.
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the
|
|
* "Software"), to deal in the Software without restriction, including
|
|
* without limitation the rights to use, copy, modify, merge, publish,
|
|
* distribute, sub license, and/or sell copies of the Software, and to
|
|
* permit persons to whom the Software is furnished to do so, subject to
|
|
* the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice (including the
|
|
* next paragraph) shall be included in all copies or substantial portions
|
|
* of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
|
|
* THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
|
|
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
|
|
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
|
|
* USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
*
|
|
**************************************************************************/
|
|
/*
|
|
* Authors: Thomas Hellstrom <thellstrom-at-vmware-dot-com>
|
|
*/
|
|
|
|
/* Prefix applied to this file's pr_*() log messages (e.g. the pr_err() in ttm_bo_mmap()). */
#define pr_fmt(fmt) "[TTM] " fmt
|
|
|
|
#include <ttm/ttm_module.h>
|
|
#include <ttm/ttm_bo_driver.h>
|
|
#include <ttm/ttm_placement.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/module.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
/*
 * Upper bound on the number of pages ttm_bo_vm_fault() inserts per fault:
 * the faulting page plus the pages that follow it, mapped speculatively.
 */
#define TTM_BO_VM_NUM_PREFAULT 16
|
|
|
|
static struct ttm_buffer_object *ttm_bo_vm_lookup_rb(struct ttm_bo_device *bdev,
|
|
unsigned long page_start,
|
|
unsigned long num_pages)
|
|
{
|
|
struct rb_node *cur = bdev->addr_space_rb.rb_node;
|
|
unsigned long cur_offset;
|
|
struct ttm_buffer_object *bo;
|
|
struct ttm_buffer_object *best_bo = NULL;
|
|
|
|
while (likely(cur != NULL)) {
|
|
bo = rb_entry(cur, struct ttm_buffer_object, vm_rb);
|
|
cur_offset = bo->vm_node->start;
|
|
if (page_start >= cur_offset) {
|
|
cur = cur->rb_right;
|
|
best_bo = bo;
|
|
if (page_start == cur_offset)
|
|
break;
|
|
} else
|
|
cur = cur->rb_left;
|
|
}
|
|
|
|
if (unlikely(best_bo == NULL))
|
|
return NULL;
|
|
|
|
if (unlikely((best_bo->vm_node->start + best_bo->num_pages) <
|
|
(page_start + num_pages)))
|
|
return NULL;
|
|
|
|
return best_bo;
|
|
}
|
|
|
|
static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
|
{
|
|
struct ttm_buffer_object *bo = (struct ttm_buffer_object *)
|
|
vma->vm_private_data;
|
|
struct ttm_bo_device *bdev = bo->bdev;
|
|
unsigned long page_offset;
|
|
unsigned long page_last;
|
|
unsigned long pfn;
|
|
struct ttm_tt *ttm = NULL;
|
|
struct page *page;
|
|
int ret;
|
|
int i;
|
|
unsigned long address = (unsigned long)vmf->virtual_address;
|
|
int retval = VM_FAULT_NOPAGE;
|
|
struct ttm_mem_type_manager *man =
|
|
&bdev->man[bo->mem.mem_type];
|
|
|
|
/*
|
|
* Work around locking order reversal in fault / nopfn
|
|
* between mmap_sem and bo_reserve: Perform a trylock operation
|
|
* for reserve, and if it fails, retry the fault after scheduling.
|
|
*/
|
|
|
|
ret = ttm_bo_reserve(bo, true, true, false, 0);
|
|
if (unlikely(ret != 0)) {
|
|
if (ret == -EBUSY)
|
|
set_need_resched();
|
|
return VM_FAULT_NOPAGE;
|
|
}
|
|
|
|
if (bdev->driver->fault_reserve_notify) {
|
|
ret = bdev->driver->fault_reserve_notify(bo);
|
|
switch (ret) {
|
|
case 0:
|
|
break;
|
|
case -EBUSY:
|
|
set_need_resched();
|
|
case -ERESTARTSYS:
|
|
retval = VM_FAULT_NOPAGE;
|
|
goto out_unlock;
|
|
default:
|
|
retval = VM_FAULT_SIGBUS;
|
|
goto out_unlock;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Wait for buffer data in transit, due to a pipelined
|
|
* move.
|
|
*/
|
|
|
|
spin_lock(&bdev->fence_lock);
|
|
if (test_bit(TTM_BO_PRIV_FLAG_MOVING, &bo->priv_flags)) {
|
|
ret = ttm_bo_wait(bo, false, true, false);
|
|
spin_unlock(&bdev->fence_lock);
|
|
if (unlikely(ret != 0)) {
|
|
retval = (ret != -ERESTARTSYS) ?
|
|
VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
|
|
goto out_unlock;
|
|
}
|
|
} else
|
|
spin_unlock(&bdev->fence_lock);
|
|
|
|
ret = ttm_mem_io_lock(man, true);
|
|
if (unlikely(ret != 0)) {
|
|
retval = VM_FAULT_NOPAGE;
|
|
goto out_unlock;
|
|
}
|
|
ret = ttm_mem_io_reserve_vm(bo);
|
|
if (unlikely(ret != 0)) {
|
|
retval = VM_FAULT_SIGBUS;
|
|
goto out_io_unlock;
|
|
}
|
|
|
|
page_offset = ((address - vma->vm_start) >> PAGE_SHIFT) +
|
|
bo->vm_node->start - vma->vm_pgoff;
|
|
page_last = ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) +
|
|
bo->vm_node->start - vma->vm_pgoff;
|
|
|
|
if (unlikely(page_offset >= bo->num_pages)) {
|
|
retval = VM_FAULT_SIGBUS;
|
|
goto out_io_unlock;
|
|
}
|
|
|
|
/*
|
|
* Strictly, we're not allowed to modify vma->vm_page_prot here,
|
|
* since the mmap_sem is only held in read mode. However, we
|
|
* modify only the caching bits of vma->vm_page_prot and
|
|
* consider those bits protected by
|
|
* the bo->mutex, as we should be the only writers.
|
|
* There shouldn't really be any readers of these bits except
|
|
* within vm_insert_mixed()? fork?
|
|
*
|
|
* TODO: Add a list of vmas to the bo, and change the
|
|
* vma->vm_page_prot when the object changes caching policy, with
|
|
* the correct locks held.
|
|
*/
|
|
if (bo->mem.bus.is_iomem) {
|
|
vma->vm_page_prot = ttm_io_prot(bo->mem.placement,
|
|
vma->vm_page_prot);
|
|
} else {
|
|
ttm = bo->ttm;
|
|
vma->vm_page_prot = (bo->mem.placement & TTM_PL_FLAG_CACHED) ?
|
|
vm_get_page_prot(vma->vm_flags) :
|
|
ttm_io_prot(bo->mem.placement, vma->vm_page_prot);
|
|
|
|
/* Allocate all page at once, most common usage */
|
|
if (ttm->bdev->driver->ttm_tt_populate(ttm)) {
|
|
retval = VM_FAULT_OOM;
|
|
goto out_io_unlock;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Speculatively prefault a number of pages. Only error on
|
|
* first page.
|
|
*/
|
|
for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) {
|
|
if (bo->mem.bus.is_iomem)
|
|
pfn = ((bo->mem.bus.base + bo->mem.bus.offset) >> PAGE_SHIFT) + page_offset;
|
|
else {
|
|
page = ttm->pages[page_offset];
|
|
if (unlikely(!page && i == 0)) {
|
|
retval = VM_FAULT_OOM;
|
|
goto out_io_unlock;
|
|
} else if (unlikely(!page)) {
|
|
break;
|
|
}
|
|
pfn = page_to_pfn(page);
|
|
}
|
|
|
|
ret = vm_insert_mixed(vma, address, pfn);
|
|
/*
|
|
* Somebody beat us to this PTE or prefaulting to
|
|
* an already populated PTE, or prefaulting error.
|
|
*/
|
|
|
|
if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0)))
|
|
break;
|
|
else if (unlikely(ret != 0)) {
|
|
retval =
|
|
(ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
|
|
goto out_io_unlock;
|
|
}
|
|
|
|
address += PAGE_SIZE;
|
|
if (unlikely(++page_offset >= page_last))
|
|
break;
|
|
}
|
|
out_io_unlock:
|
|
ttm_mem_io_unlock(man);
|
|
out_unlock:
|
|
ttm_bo_unreserve(bo);
|
|
return retval;
|
|
}
|
|
|
|
static void ttm_bo_vm_open(struct vm_area_struct *vma)
|
|
{
|
|
struct ttm_buffer_object *bo =
|
|
(struct ttm_buffer_object *)vma->vm_private_data;
|
|
|
|
(void)ttm_bo_reference(bo);
|
|
}
|
|
|
|
static void ttm_bo_vm_close(struct vm_area_struct *vma)
|
|
{
|
|
struct ttm_buffer_object *bo = (struct ttm_buffer_object *)vma->vm_private_data;
|
|
|
|
ttm_bo_unref(&bo);
|
|
vma->vm_private_data = NULL;
|
|
}
|
|
|
|
static const struct vm_operations_struct ttm_bo_vm_ops = {
|
|
.fault = ttm_bo_vm_fault,
|
|
.open = ttm_bo_vm_open,
|
|
.close = ttm_bo_vm_close
|
|
};
|
|
|
|
int ttm_bo_mmap(struct file *filp, struct vm_area_struct *vma,
|
|
struct ttm_bo_device *bdev)
|
|
{
|
|
struct ttm_bo_driver *driver;
|
|
struct ttm_buffer_object *bo;
|
|
int ret;
|
|
|
|
read_lock(&bdev->vm_lock);
|
|
bo = ttm_bo_vm_lookup_rb(bdev, vma->vm_pgoff,
|
|
(vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
|
|
if (likely(bo != NULL))
|
|
ttm_bo_reference(bo);
|
|
read_unlock(&bdev->vm_lock);
|
|
|
|
if (unlikely(bo == NULL)) {
|
|
pr_err("Could not find buffer object to map\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
driver = bo->bdev->driver;
|
|
if (unlikely(!driver->verify_access)) {
|
|
ret = -EPERM;
|
|
goto out_unref;
|
|
}
|
|
ret = driver->verify_access(bo, filp);
|
|
if (unlikely(ret != 0))
|
|
goto out_unref;
|
|
|
|
vma->vm_ops = &ttm_bo_vm_ops;
|
|
|
|
/*
|
|
* Note: We're transferring the bo reference to
|
|
* vma->vm_private_data here.
|
|
*/
|
|
|
|
vma->vm_private_data = bo;
|
|
vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND | VM_DONTDUMP;
|
|
return 0;
|
|
out_unref:
|
|
ttm_bo_unref(&bo);
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(ttm_bo_mmap);
|
|
|
|
int ttm_fbdev_mmap(struct vm_area_struct *vma, struct ttm_buffer_object *bo)
|
|
{
|
|
if (vma->vm_pgoff != 0)
|
|
return -EACCES;
|
|
|
|
vma->vm_ops = &ttm_bo_vm_ops;
|
|
vma->vm_private_data = ttm_bo_reference(bo);
|
|
vma->vm_flags |= VM_IO | VM_MIXEDMAP | VM_DONTEXPAND;
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(ttm_fbdev_mmap);
|
|
|
|
|
|
/*
 * ttm_bo_io - read from or write to a buffer object through a file offset
 *
 * @bdev:  device whose address space *@f_pos refers to
 * @filp:  file performing the access; handed to the driver's access check
 * @wbuf:  user buffer to copy from when @write is true
 * @rbuf:  user buffer to copy to when @write is false
 * @count: number of bytes requested
 * @f_pos: byte offset in the device address space; advanced by the number
 *         of bytes transferred
 * @write: true to write to the object, false to read from it
 *
 * The transfer is clamped so that it never runs past the end of the object
 * containing *@f_pos; a short transfer is reported through the return value.
 *
 * Returns the number of bytes transferred (0 for an empty request), or a
 * negative error code:
 *   -EFAULT  no object is mapped at *@f_pos
 *   -EPERM   the driver provides no verify_access() hook
 *   -EFBIG   *@f_pos lies beyond the object, or the user copy failed
 *   -EAGAIN  the object could not be reserved because it is busy
 *   other    as returned by verify_access(), ttm_bo_reserve() or
 *            ttm_bo_kmap()
 */
ssize_t ttm_bo_io(struct ttm_bo_device *bdev, struct file *filp,
		  const char __user *wbuf, char __user *rbuf, size_t count,
		  loff_t *f_pos, bool write)
{
	struct ttm_buffer_object *bo;
	struct ttm_bo_driver *driver;
	struct ttm_bo_kmap_obj map;
	unsigned long dev_offset = (*f_pos >> PAGE_SHIFT);
	unsigned long kmap_offset;
	unsigned long kmap_num;
	size_t io_size;
	unsigned int page_offset;
	char *virtual;
	int ret;
	bool dummy;

	/* Look up and pin the object under the address-space lock. */
	read_lock(&bdev->vm_lock);
	bo = ttm_bo_vm_lookup_rb(bdev, dev_offset, 1);
	if (likely(bo != NULL))
		ttm_bo_reference(bo);
	read_unlock(&bdev->vm_lock);

	if (unlikely(bo == NULL))
		return -EFAULT;

	driver = bo->bdev->driver;
	if (unlikely(!driver->verify_access)) {
		ret = -EPERM;
		goto out_unref;
	}

	ret = driver->verify_access(bo, filp);
	if (unlikely(ret != 0))
		goto out_unref;

	/* Translate the device-space page index into an index within the bo. */
	kmap_offset = dev_offset - bo->vm_node->start;
	if (unlikely(kmap_offset >= bo->num_pages)) {
		ret = -EFBIG;
		goto out_unref;
	}

	/* Clamp the transfer to the bytes left in the object. */
	page_offset = *f_pos & ~PAGE_MASK;
	io_size = bo->num_pages - kmap_offset;
	io_size = (io_size << PAGE_SHIFT) - page_offset;
	if (count < io_size)
		io_size = count;

	/* Nothing to copy: do not reserve or map anything. */
	if (unlikely(io_size == 0)) {
		ret = 0;
		goto out_unref;
	}

	/*
	 * Number of pages spanned by the clamped transfer, counted from
	 * kmap_offset.  It is derived from bo-relative quantities only:
	 * *f_pos is an offset in the device address space and @count may
	 * reach past the end of the object, so neither may be used to size
	 * the mapping.
	 */
	kmap_num = (page_offset + io_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	ret = ttm_bo_reserve(bo, true, false, false, 0);

	switch (ret) {
	case 0:
		break;
	case -EBUSY:
		ret = -EAGAIN;
		goto out_unref;
	default:
		goto out_unref;
	}

	ret = ttm_bo_kmap(bo, kmap_offset, kmap_num, &map);
	if (unlikely(ret != 0)) {
		ttm_bo_unreserve(bo);
		goto out_unref;
	}

	virtual = ttm_kmap_obj_virtual(&map, &dummy);
	virtual += page_offset;

	/* Both helpers return the number of bytes that could NOT be copied. */
	if (write)
		ret = copy_from_user(virtual, wbuf, io_size);
	else
		ret = copy_to_user(rbuf, virtual, io_size);

	ttm_bo_kunmap(&map);
	ttm_bo_unreserve(bo);
	ttm_bo_unref(&bo);

	/*
	 * NOTE(review): -EFAULT is the conventional result for a failed user
	 * copy; -EFBIG is kept so existing callers see the same error code.
	 */
	if (unlikely(ret != 0))
		return -EFBIG;

	*f_pos += io_size;

	return io_size;
out_unref:
	ttm_bo_unref(&bo);
	return ret;
}
|
|
|
|
/*
 * ttm_bo_fbdev_io - read from or write to a given buffer object
 *
 * @bo:    buffer object to access; the caller's reference is left untouched
 * @wbuf:  user buffer to copy from when @write is true
 * @rbuf:  user buffer to copy to when @write is false
 * @count: number of bytes requested
 * @f_pos: byte offset within @bo; advanced by the number of bytes
 *         transferred
 * @write: true to write to the object, false to read from it
 *
 * The transfer is clamped so that it never runs past the end of @bo; a
 * short transfer is reported through the return value.
 *
 * Returns the number of bytes transferred (0 for an empty request), or a
 * negative error code:
 *   -EFBIG   *@f_pos lies beyond the object
 *   -EAGAIN  the object could not be reserved because it is busy
 *   -EFAULT  the user copy failed
 *   other    as returned by ttm_bo_reserve() or ttm_bo_kmap()
 */
ssize_t ttm_bo_fbdev_io(struct ttm_buffer_object *bo, const char __user *wbuf,
			char __user *rbuf, size_t count, loff_t *f_pos,
			bool write)
{
	struct ttm_bo_kmap_obj map;
	unsigned long kmap_offset;
	unsigned long kmap_num;
	size_t io_size;
	unsigned int page_offset;
	char *virtual;
	int ret;
	bool dummy;

	kmap_offset = (*f_pos >> PAGE_SHIFT);
	if (unlikely(kmap_offset >= bo->num_pages))
		return -EFBIG;

	/* Clamp the transfer to the bytes left in the object. */
	page_offset = *f_pos & ~PAGE_MASK;
	io_size = bo->num_pages - kmap_offset;
	io_size = (io_size << PAGE_SHIFT) - page_offset;
	if (count < io_size)
		io_size = count;

	/* Nothing to copy: do not reserve or map anything. */
	if (unlikely(io_size == 0))
		return 0;

	/*
	 * Number of pages spanned by the clamped transfer, counted from
	 * kmap_offset.  Sizing the mapping from @count instead would ask for
	 * pages beyond the end of the object whenever the request is longer
	 * than what is left in it.
	 */
	kmap_num = (page_offset + io_size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	ret = ttm_bo_reserve(bo, true, false, false, 0);

	switch (ret) {
	case 0:
		break;
	case -EBUSY:
		return -EAGAIN;
	default:
		return ret;
	}

	ret = ttm_bo_kmap(bo, kmap_offset, kmap_num, &map);
	if (unlikely(ret != 0)) {
		ttm_bo_unreserve(bo);
		return ret;
	}

	virtual = ttm_kmap_obj_virtual(&map, &dummy);
	virtual += page_offset;

	/* Both helpers return the number of bytes that could NOT be copied. */
	if (write)
		ret = copy_from_user(virtual, wbuf, io_size);
	else
		ret = copy_to_user(rbuf, virtual, io_size);

	ttm_bo_kunmap(&map);
	ttm_bo_unreserve(bo);

	/*
	 * No ttm_bo_unref() here: this function never takes a reference on
	 * @bo, so dropping one would unbalance the caller's refcount (and
	 * only on the success path, since the error returns above do not).
	 */

	/*
	 * A non-zero residual is a positive byte count; returning it as-is
	 * would look like a successful transfer of that many bytes.
	 */
	if (unlikely(ret != 0))
		return -EFAULT;

	*f_pos += io_size;

	return io_size;
}
|