mirror of
https://github.com/torvalds/linux.git
synced 2024-12-28 05:41:55 +00:00
e067eba587
Patch series "userfaultfd tmpfs/hugetlbfs/non-cooperative", v2. These userfaultfd features are finished and are ready for larger exposure in -mm and upstream merging: 1) tmpfs non present userfault; 2) hugetlbfs non present userfault; 3) non cooperative userfault for fork/madvise/mremap. qemu development code is already exercising 2) and container postcopy live migration needs 3). 1) is not currently used but there's a self test and we know some qemu user for various reasons uses tmpfs as backing for KVM so it'll need it too to use postcopy live migration with tmpfs memory. All review feedback from the previous submission has been handled and the fixes are included. There's no outstanding issue AFAIK. Upstream code just did a s/fe/vmf/ conversion in the page faults and this has been converted as well incrementally. In addition to the previous submissions, this also wakes up stuck userfaults during UFFDIO_UNREGISTER. The non cooperative testcase actually reproduced this problem by getting stuck instead of quitting cleanly in some rare case as it could call UFFDIO_UNREGISTER while some userfault could be still in flight. The other option would have been to keep leaving it up to userland to serialize itself and to patch the testcase instead but the wakeup during unregister I think is preferable. David also asked for the UFFD_FEATURE_MISSING_HUGETLBFS and UFFD_FEATURE_MISSING_SHMEM feature flags to be added so QEMU can avoid probing whether the hugetlbfs/shmem missing support is available by calling UFFDIO_REGISTER. QEMU already checks HUGETLBFS_MAGIC with fstatfs so if UFFD_FEATURE_MISSING_HUGETLBFS is also set, it knows UFFDIO_REGISTER will succeed (or if it fails, it's for some other more concerning reason). There's no reason to worry about adding too many feature flags. There are 64 available and in the worst case we'll have to bump the API if someday we're really going to run out of them.
The round-trip network latency of hugetlbfs userfaults during postcopy live migration is still on the order of a dozen milliseconds on 10GBit if at 2MB hugepage granularity so it's working perfectly and it should provide for higher bandwidth or lower CPU usage (which makes it interesting to add an option in the future to support THP granularity too for anonymous memory; UFFDIO_COPY would then have to create THP if alignment/len allows for it). 1GB hugetlbfs granularity will require big changes in hugetlbfs to work so it's deferred for later. This patch (of 42): This adds proper documentation (inline) to avoid the risk of further misunderstandings about the semantics of _IOW/_IOR and it also reminds whoever will bump the UFFDIO_API in the future to change the two ioctls to _IOW. This was found while implementing strace support for those ioctls, otherwise we could have never found it by just reviewing kernel code and testing it. _IOC_READ or _IOC_WRITE alters nothing but the ioctl number itself, so it's only worth fixing if the UFFDIO_API is bumped someday. Link: http://lkml.kernel.org/r/20161216144821.5183-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Reported-by: "Dmitry V. Levin" <ldv@altlinux.org> Cc: Michael Rapoport <RAPOPORT@il.ibm.com> Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Pavel Emelyanov <xemul@parallels.com> Cc: Hillf Danton <hillf.zj@alibaba-inc.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
107 lines
3.4 KiB
C
107 lines
3.4 KiB
C
#ifndef _UAPI_ASM_GENERIC_IOCTL_H
|
|
#define _UAPI_ASM_GENERIC_IOCTL_H
|
|
|
|
/* ioctl command encoding: 32 bits total, command in lower 16 bits,
 * size of the parameter structure in the lower 14 bits of the
 * upper 16 bits.
 * Encoding the size of the parameter structure in the ioctl request
 * is useful for catching programs compiled with old versions
 * and to avoid overwriting user space outside the user buffer area.
 * The highest 2 bits are reserved for indicating the ``access mode''.
 * NOTE: This limits the max parameter size to 16kB -1 !
 */
|
|
|
|
/*
 * The following is for compatibility across the various Linux
 * platforms.  The generic ioctl numbering scheme doesn't really enforce
 * a type field.  De facto, however, the top 8 bits of the lower 16
 * bits are indeed used as a type field, so we might just as well make
 * this explicit here.  Please be sure to use the decoding macros
 * below from now on.
 *
 * _IOC_NRBITS:   width, in bits, of the "nr" field of a command.
 * _IOC_TYPEBITS: width, in bits, of the "type" field of a command.
 */
#define _IOC_NRBITS	8
#define _IOC_TYPEBITS	8
|
|
|
|
/*
 * Let any architecture override either of the following before
 * including this file.
 *
 * _IOC_SIZEBITS: width, in bits, of the "size" field (default 14).
 * _IOC_DIRBITS:  width, in bits, of the "dir" field (default 2).
 */

#ifndef _IOC_SIZEBITS
# define _IOC_SIZEBITS	14
#endif

#ifndef _IOC_DIRBITS
# define _IOC_DIRBITS	2
#endif
|
|
|
|
/*
 * Field masks: each one is a run of low-order one bits exactly as wide
 * as the matching _IOC_*BITS value.  The decoding macros apply them
 * after shifting the wanted field down to bit 0.
 */
#define _IOC_NRMASK	((1 << _IOC_NRBITS)-1)
#define _IOC_TYPEMASK	((1 << _IOC_TYPEBITS)-1)
#define _IOC_SIZEMASK	((1 << _IOC_SIZEBITS)-1)
#define _IOC_DIRMASK	((1 << _IOC_DIRBITS)-1)
|
|
|
|
/*
 * Bit position of each field inside a command.  The fields are packed
 * back to back starting at bit 0, in the order nr, type, size, dir;
 * every shift is the previous field's shift plus that field's width.
 */
#define _IOC_NRSHIFT	0
#define _IOC_TYPESHIFT	(_IOC_NRSHIFT+_IOC_NRBITS)
#define _IOC_SIZESHIFT	(_IOC_TYPESHIFT+_IOC_TYPEBITS)
#define _IOC_DIRSHIFT	(_IOC_SIZESHIFT+_IOC_SIZEBITS)
|
|
|
|
/*
 * Direction bits, which any architecture can choose to override
 * before including this file.
 *
 * NOTE: _IOC_WRITE means userland is writing and kernel is
 * reading. _IOC_READ means userland is reading and kernel is writing.
 *
 * The two flags are distinct bits, so they can be OR-ed together to
 * describe a transfer in both directions.
 */

#ifndef _IOC_NONE
# define _IOC_NONE	0U
#endif

#ifndef _IOC_WRITE
# define _IOC_WRITE	1U
#endif

#ifndef _IOC_READ
# define _IOC_READ	2U
#endif
|
|
|
|
/*
 * Assemble an ioctl command from its four fields: each argument is
 * shifted to its field position and the results are OR-ed together.
 * No masking is applied here.
 */
#define _IOC(dir,type,nr,size) \
	(((dir)  << _IOC_DIRSHIFT) | \
	 ((type) << _IOC_TYPESHIFT) | \
	 ((nr)   << _IOC_NRSHIFT) | \
	 ((size) << _IOC_SIZESHIFT))
|
|
|
|
/*
 * Size-argument helper used by _IOR/_IOW/_IOWR.  When __KERNEL__ is not
 * defined it is plain sizeof; this header defines nothing for it when
 * __KERNEL__ is set.
 */
#ifndef __KERNEL__
#define _IOC_TYPECHECK(t) (sizeof(t))
#endif
|
|
|
|
/*
 * Used to create numbers.
 *
 * NOTE: _IOW means userland is writing and kernel is reading. _IOR
 * means userland is reading and kernel is writing.
 *
 * The "size" argument of _IOR/_IOW/_IOWR is the parameter's type (it is
 * handed to _IOC_TYPECHECK), not a byte count.  _IO encodes no
 * direction and a size of 0.  The *_BAD variants apply sizeof to the
 * argument directly instead of going through _IOC_TYPECHECK.
 */
#define _IO(type,nr)		_IOC(_IOC_NONE,(type),(nr),0)
#define _IOR(type,nr,size)	_IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size)))
#define _IOW(type,nr,size)	_IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size)))
#define _IOWR(type,nr,size)	_IOC(_IOC_READ|_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size)))
#define _IOR_BAD(type,nr,size)	_IOC(_IOC_READ,(type),(nr),sizeof(size))
#define _IOW_BAD(type,nr,size)	_IOC(_IOC_WRITE,(type),(nr),sizeof(size))
#define _IOWR_BAD(type,nr,size)	_IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
|
|
|
|
/*
 * used to decode ioctl numbers..
 *
 * Each macro shifts the wanted field of "nr" (a complete command value)
 * down to bit 0 and masks off everything above it.
 */
#define _IOC_DIR(nr)		(((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
#define _IOC_TYPE(nr)		(((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
#define _IOC_NR(nr)		(((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
#define _IOC_SIZE(nr)		(((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
|
|
|
|
/*
 * ...and for the drivers/sound files...
 *
 * IOC_IN, IOC_OUT and IOC_INOUT are the direction flags already shifted
 * into the "dir" field position; IOCSIZE_MASK and IOCSIZE_SHIFT are the
 * in-place mask and the bit position of the "size" field.
 */

#define IOC_IN		(_IOC_WRITE << _IOC_DIRSHIFT)
#define IOC_OUT		(_IOC_READ << _IOC_DIRSHIFT)
#define IOC_INOUT	((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT)
#define IOCSIZE_MASK	(_IOC_SIZEMASK << _IOC_SIZESHIFT)
#define IOCSIZE_SHIFT	(_IOC_SIZESHIFT)
|
|
|
|
#endif /* _UAPI_ASM_GENERIC_IOCTL_H */
|