net: poll/select low latency socket support
select/poll busy-poll support.

Split the sysctl value into two separate ones, one for read and one for poll.
Updated Documentation/sysctl/net.txt.

Add a new poll flag, POLL_LL. When this flag is set, sock_poll will call
sk_poll_ll if possible. sock_poll sets this flag in its return value to
indicate to select/poll when a socket that can busy poll is found.

When poll/select have nothing to report, call the low-level sock_poll again
until we are out of time or we find something.

Once the system call finds something, it stops setting POLL_LL, so it can
return the result to the user ASAP.

Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit 2d48d67fa8
parent e4f2379db6
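For context, a minimal user-space sketch (not part of the patch) of how the feature is meant to be used: enable busy-poll reads on one socket via the SO_LL option from an earlier patch in this series, then wait for data with poll(). The fallback value 46 for SO_LL is a guess based on the asm-generic socket header of that era and is for illustration only.

/*
 * Illustrative sketch: per-socket busy-poll reads plus poll() busy polling.
 * SO_LL comes from an earlier patch in this series; 46 is only a fallback
 * guess for illustration.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SO_LL
#define SO_LL 46
#endif

int main(void)
{
        unsigned int busy_usecs = 50;   /* the value the documentation recommends */
        int fd = socket(AF_INET, SOCK_DGRAM, 0);
        struct sockaddr_in addr = { 0 };
        struct pollfd pfd;

        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = htonl(INADDR_ANY);
        addr.sin_port = htons(12345);
        bind(fd, (struct sockaddr *)&addr, sizeof(addr));

        /* per-socket override of the sysctl default (low_latency_read) */
        setsockopt(fd, SOL_SOCKET, SO_LL, &busy_usecs, sizeof(busy_usecs));

        pfd.fd = fd;
        pfd.events = POLLIN;

        /*
         * With this patch, poll() busy-polls the device queue for up to
         * low_latency_poll microseconds before it goes to sleep.
         */
        if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN))
                printf("data ready\n");

        close(fd);
        return 0;
}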
Documentation/sysctl/net.txt
@@ -50,11 +50,25 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+low_latency_read
+----------------
+Low latency busy poll timeout for socket reads. (needs CONFIG_NET_LL_RX_POLL)
+Approximate time in us to spin waiting for packets on the device queue.
+This sets the default value of the SO_LL socket option.
+Can be set or overridden per socket by setting socket option SO_LL.
+Recommended value is 50. May increase power usage.
+Default: 0 (off)
+
 low_latency_poll
 ----------------
-Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
+Low latency busy poll timeout for poll and select. (needs CONFIG_NET_LL_RX_POLL)
 Approximate time in us to spin waiting for packets on the device queue.
-Recommended value is 50. May increase power usage.
+Recommended value depends on the number of sockets you poll on.
+For several sockets 50, for several hundreds 100.
+For more than that you probably want to use epoll.
+Note that only sockets with SO_LL set will be busy polled, so you want to either
+selectively set SO_LL on those sockets or set sysctl.net.low_latency_read globally.
+May increase power usage.
 
 Default: 0 (off)
 
 rmem_default
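The two new sysctls are registered in the net.core table (see the net/core/sysctl_net_core.c hunk below), so they should appear as /proc/sys/net/core/low_latency_read and /proc/sys/net/core/low_latency_poll. A small illustrative sketch, not part of the patch, for setting them from a program under that assumption:

/*
 * Illustrative sketch: write the new sysctls through procfs.  Paths assume
 * the entries land in the usual net.core table.
 */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fputs(val, f);
        return fclose(f);
}

int main(void)
{
        /* default SO_LL value inherited by new sockets: 50 us, as recommended */
        write_sysctl("/proc/sys/net/core/low_latency_read", "50");
        /* global budget used by poll()/select() busy polling */
        write_sysctl("/proc/sys/net/core/low_latency_poll", "50");
        return 0;
}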
fs/select.c (34 changed lines)
@@ -27,6 +27,7 @@
 #include <linux/rcupdate.h>
 #include <linux/hrtimer.h>
 #include <linux/sched/rt.h>
+#include <net/ll_poll.h>
 
 #include <asm/uaccess.h>
 
@@ -384,9 +385,10 @@ get_max:
 #define POLLEX_SET (POLLPRI)
 
 static inline void wait_key_set(poll_table *wait, unsigned long in,
-                                unsigned long out, unsigned long bit)
+                                unsigned long out, unsigned long bit,
+                                unsigned int ll_flag)
 {
-        wait->_key = POLLEX_SET;
+        wait->_key = POLLEX_SET | ll_flag;
         if (in & bit)
                 wait->_key |= POLLIN_SET;
         if (out & bit)
@@ -400,6 +402,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
         poll_table *wait;
         int retval, i, timed_out = 0;
         unsigned long slack = 0;
+        unsigned int ll_flag = POLL_LL;
+        u64 ll_time = ll_end_time();
 
         rcu_read_lock();
         retval = max_select_fd(n, fds);
@@ -422,6 +426,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
         retval = 0;
         for (;;) {
                 unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
+                bool can_ll = false;
 
                 inp = fds->in; outp = fds->out; exp = fds->ex;
                 rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;
@@ -449,7 +454,8 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
                                         f_op = f.file->f_op;
                                         mask = DEFAULT_POLLMASK;
                                         if (f_op && f_op->poll) {
-                                                wait_key_set(wait, in, out, bit);
+                                                wait_key_set(wait, in, out,
+                                                             bit, ll_flag);
                                                 mask = (*f_op->poll)(f.file, wait);
                                         }
                                         fdput(f);
@@ -468,6 +474,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
                                                 retval++;
                                                 wait->_qproc = NULL;
                                         }
+                                        if (mask & POLL_LL)
+                                                can_ll = true;
+                                        /* got something, stop busy polling */
+                                        if (retval)
+                                                ll_flag = 0;
                                 }
                         }
                         if (res_in)
@@ -486,6 +497,9 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
                         break;
                 }
 
+                if (can_ll && can_poll_ll(ll_time))
+                        continue;
+
                 /*
                  * If this is the first loop and we have a timeout
                  * given, then we convert to ktime_t and set the to
@@ -717,7 +731,8 @@ struct poll_list {
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
  * if pwait->_qproc is non-NULL.
  */
-static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
+static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
+                                     bool *can_ll, unsigned int ll_flag)
 {
         unsigned int mask;
         int fd;
@@ -731,7 +746,10 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
                         mask = DEFAULT_POLLMASK;
                         if (f.file->f_op && f.file->f_op->poll) {
                                 pwait->_key = pollfd->events|POLLERR|POLLHUP;
+                                pwait->_key |= ll_flag;
                                 mask = f.file->f_op->poll(f.file, pwait);
+                                if (mask & POLL_LL)
+                                        *can_ll = true;
                         }
                         /* Mask out unneeded events. */
                         mask &= pollfd->events | POLLERR | POLLHUP;
@@ -750,6 +768,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
         ktime_t expire, *to = NULL;
         int timed_out = 0, count = 0;
         unsigned long slack = 0;
+        unsigned int ll_flag = POLL_LL;
+        u64 ll_time = ll_end_time();
 
         /* Optimise the no-wait case */
         if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -762,6 +782,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
 
         for (;;) {
                 struct poll_list *walk;
+                bool can_ll = false;
 
                 for (walk = list; walk != NULL; walk = walk->next) {
                         struct pollfd * pfd, * pfd_end;
@@ -776,9 +797,10 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
                                  * this. They'll get immediately deregistered
                                  * when we break out and return.
                                  */
-                                if (do_pollfd(pfd, pt)) {
+                                if (do_pollfd(pfd, pt, &can_ll, ll_flag)) {
                                         count++;
                                         pt->_qproc = NULL;
+                                        ll_flag = 0;
                                 }
                         }
                 }
@@ -795,6 +817,8 @@ static int do_poll(unsigned int nfds, struct poll_list *list,
                 if (count || timed_out)
                         break;
 
+                if (can_ll && can_poll_ll(ll_time))
+                        continue;
                 /*
                  * If this is the first loop and we have a timeout
                  * given, then we convert to ktime_t and set the to
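The fs/select.c changes above boil down to: poll every fd once per pass with POLL_LL in the key, remember whether any fd advertised busy-poll capability, and, when nothing is ready, loop again instead of sleeping until the busy-poll budget expires. A rough user-space analogue of that control flow, illustrative only; the helper name wait_for_input() is hypothetical:

/*
 * Illustrative analogue of the new busy-poll loop: re-check readiness
 * without sleeping until a time budget runs out, then fall back to a
 * normal blocking wait.
 */
#include <poll.h>
#include <time.h>

static unsigned long long now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int wait_for_input(struct pollfd *pfd, unsigned int budget_us, int timeout_ms)
{
        unsigned long long end = now_ns() + (unsigned long long)budget_us * 1000;

        do {
                /* one non-blocking pass, like one busy-poll iteration */
                int n = poll(pfd, 1, 0);

                if (n != 0)
                        return n;       /* found something (or an error): report ASAP */
        } while (now_ns() < end);       /* budget exhausted, like can_poll_ll() failing */

        /* nothing found within the budget: sleep as poll() normally would */
        return poll(pfd, 1, timeout_ms);
}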
include/net/ll_poll.h
@@ -30,6 +30,7 @@
 #ifdef CONFIG_NET_LL_RX_POLL
 
 struct napi_struct;
+extern unsigned int sysctl_net_ll_read __read_mostly;
 extern unsigned int sysctl_net_ll_poll __read_mostly;
 
 /* return values from ndo_ll_poll */
@@ -38,17 +39,18 @@ extern unsigned int sysctl_net_ll_poll __read_mostly;
 
 /* we can use sched_clock() because we don't care much about precision
  * we only care that the average is bounded
+ * we don't mind a ~2.5% imprecision so <<10 instead of *1000
+ * sk->sk_ll_usec is a u_int so this can't overflow
  */
-static inline u64 ll_end_time(struct sock *sk)
+static inline u64 ll_sk_end_time(struct sock *sk)
 {
-        u64 end_time = ACCESS_ONCE(sk->sk_ll_usec);
+        return ((u64)ACCESS_ONCE(sk->sk_ll_usec) << 10) + sched_clock();
+}
 
-        /* we don't mind a ~2.5% imprecision
-         * sk->sk_ll_usec is a u_int so this can't overflow
-         */
-        end_time = (end_time << 10) + sched_clock();
-
-        return end_time;
+/* in poll/select we use the global sysctl_net_ll_poll value */
+static inline u64 ll_end_time(void)
+{
+        return ((u64)ACCESS_ONCE(sysctl_net_ll_poll) << 10) + sched_clock();
 }
 
 static inline bool sk_valid_ll(struct sock *sk)
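The new comment says the shift by 10 replaces a multiply by 1000 at the cost of roughly 2.5% error: usec << 10 is usec * 1024, so the busy-poll budget runs about 2.4% longer than the configured microsecond value. A quick stand-alone check of that arithmetic, illustrative and not part of the patch:

/* Check the <<10 approximation used by ll_end_time(). */
#include <stdio.h>

int main(void)
{
        unsigned long long usec = 50;                   /* sysctl value in microseconds */
        unsigned long long exact = usec * 1000;         /* intended nanoseconds */
        unsigned long long approx = usec << 10;         /* what the kernel computes */

        printf("exact=%llu approx=%llu error=%.1f%%\n",
               exact, approx, 100.0 * (double)(approx - exact) / (double)exact);
        return 0;
}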
@@ -62,10 +64,13 @@ static inline bool can_poll_ll(u64 end_time)
         return !time_after64(sched_clock(), end_time);
 }
 
+/* when used in sock_poll() nonblock is known at compile time to be true
+ * so the loop and end_time will be optimized out
+ */
 static inline bool sk_poll_ll(struct sock *sk, int nonblock)
 {
+        u64 end_time = nonblock ? 0 : ll_sk_end_time(sk);
         const struct net_device_ops *ops;
-        u64 end_time = ll_end_time(sk);
         struct napi_struct *napi;
         int rc = false;
 
@@ -84,7 +89,6 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
                 goto out;
 
         do {
-
                 rc = ops->ndo_ll_poll(napi);
 
                 if (rc == LL_FLUSH_FAILED)
@@ -95,8 +99,8 @@ static inline bool sk_poll_ll(struct sock *sk, int nonblock)
                         NET_ADD_STATS_BH(sock_net(sk),
                                          LINUX_MIB_LOWLATENCYRXPACKETS, rc);
 
-        } while (skb_queue_empty(&sk->sk_receive_queue)
-                 && can_poll_ll(end_time) && !nonblock);
+        } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
+                 can_poll_ll(end_time));
 
         rc = !skb_queue_empty(&sk->sk_receive_queue);
 out:
@@ -118,7 +122,12 @@ static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
 
 #else /* CONFIG_NET_LL_RX_POLL */
 
-static inline u64 ll_end_time(struct sock *sk)
+static inline u64 sk_ll_end_time(struct sock *sk)
+{
+        return 0;
+}
+
+static inline u64 ll_end_time(void)
 {
         return 0;
 }
include/uapi/asm-generic/poll.h
@@ -30,6 +30,8 @@
 
 #define POLLFREE        0x4000  /* currently only for epoll */
 
+#define POLL_LL         0x8000
+
 struct pollfd {
         int fd;
         short events;
net/core/sock.c
@@ -2307,7 +2307,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
 #ifdef CONFIG_NET_LL_RX_POLL
         sk->sk_napi_id          =       0;
-        sk->sk_ll_usec          =       sysctl_net_ll_poll;
+        sk->sk_ll_usec          =       sysctl_net_ll_read;
 #endif
 
         /*
net/core/sysctl_net_core.c
@@ -306,6 +306,14 @@ static struct ctl_table net_core_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec
         },
+        {
+                .procname       = "low_latency_read",
+                .data           = &sysctl_net_ll_read,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec
+        },
+#
 #endif
 #endif /* CONFIG_NET */
         {
net/socket.c (14 changed lines)
@@ -107,6 +107,7 @@
 #include <net/ll_poll.h>
 
 #ifdef CONFIG_NET_LL_RX_POLL
+unsigned int sysctl_net_ll_read __read_mostly;
 unsigned int sysctl_net_ll_poll __read_mostly;
 #endif
 
@@ -1147,13 +1148,24 @@ EXPORT_SYMBOL(sock_create_lite);
 /* No kernel lock held - perfect */
 static unsigned int sock_poll(struct file *file, poll_table *wait)
 {
+        unsigned int ll_flag = 0;
         struct socket *sock;
 
         /*
          *      We can't return errors to poll, so it's either yes or no.
          */
         sock = file->private_data;
-        return sock->ops->poll(file, sock, wait);
+
+        if (sk_valid_ll(sock->sk)) {
+                /* this socket can poll_ll so tell the system call */
+                ll_flag = POLL_LL;
+
+                /* once, only if requested by syscall */
+                if (wait && (wait->_key & POLL_LL))
+                        sk_poll_ll(sock->sk, 1);
+        }
+
+        return ll_flag | sock->ops->poll(file, sock, wait);
 }
 
 static int sock_mmap(struct file *file, struct vm_area_struct *vma)