This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both read and write, because they would otherwise be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that calls bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a220 ("net: filter: constify detection of pkt_type_offset"). This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely and have read access for the most part with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with bpf_skb_store_bytes() as well.
The advantage of direct write is that the writes are inlined, whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() even for small sizes; the cost of the helper call itself, with its sanity checks, is avoided as well. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when the use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in the verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those that are part of bpf_helper_changes_skb_data(), since they can change underlying data, and those that write into memory, as this could happen for packet typed args when still cloned. The bpf_csum_update() helper accommodates the fact that we need to fix up checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net>
		
			
				
	
	
		
			170 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			170 lines
		
	
	
		
			4.3 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 | |
|  *
 | |
|  * This program is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of version 2 of the GNU General Public
 | |
|  * License as published by the Free Software Foundation.
 | |
|  *
 | |
|  * This program is distributed in the hope that it will be useful, but
 | |
|  * WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 | |
|  * General Public License for more details.
 | |
|  */
 | |
| #include <linux/bpf.h>
 | |
| #include <linux/rcupdate.h>
 | |
| #include <linux/random.h>
 | |
| #include <linux/smp.h>
 | |
| #include <linux/ktime.h>
 | |
| #include <linux/sched.h>
 | |
| #include <linux/uidgid.h>
 | |
| #include <linux/filter.h>
 | |
| 
 | |
| /* If kernel subsystem is allowing eBPF programs to call this function,
 | |
|  * inside its own verifier_ops->get_func_proto() callback it should return
 | |
|  * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
 | |
|  *
 | |
|  * Different map implementations will rely on rcu in map methods
 | |
|  * lookup/update/delete, therefore eBPF programs must run under rcu lock
 | |
|  * if program is allowed to access maps, so check rcu_read_lock_held in
 | |
|  * all three functions.
 | |
|  */
 | |
| BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key)
 | |
| {
 | |
| 	WARN_ON_ONCE(!rcu_read_lock_held());
 | |
| 	return (unsigned long) map->ops->map_lookup_elem(map, key);
 | |
| }
 | |
| 
 | |
| const struct bpf_func_proto bpf_map_lookup_elem_proto = {
 | |
| 	.func		= bpf_map_lookup_elem,
 | |
| 	.gpl_only	= false,
 | |
| 	.pkt_access	= true,
 | |
| 	.ret_type	= RET_PTR_TO_MAP_VALUE_OR_NULL,
 | |
| 	.arg1_type	= ARG_CONST_MAP_PTR,
 | |
| 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 | |
| };
 | |
| 
 | |
| BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key,
 | |
| 	   void *, value, u64, flags)
 | |
| {
 | |
| 	WARN_ON_ONCE(!rcu_read_lock_held());
 | |
| 	return map->ops->map_update_elem(map, key, value, flags);
 | |
| }
 | |
| 
 | |
| const struct bpf_func_proto bpf_map_update_elem_proto = {
 | |
| 	.func		= bpf_map_update_elem,
 | |
| 	.gpl_only	= false,
 | |
| 	.pkt_access	= true,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| 	.arg1_type	= ARG_CONST_MAP_PTR,
 | |
| 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 | |
| 	.arg3_type	= ARG_PTR_TO_MAP_VALUE,
 | |
| 	.arg4_type	= ARG_ANYTHING,
 | |
| };
 | |
| 
 | |
| BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key)
 | |
| {
 | |
| 	WARN_ON_ONCE(!rcu_read_lock_held());
 | |
| 	return map->ops->map_delete_elem(map, key);
 | |
| }
 | |
| 
 | |
| const struct bpf_func_proto bpf_map_delete_elem_proto = {
 | |
| 	.func		= bpf_map_delete_elem,
 | |
| 	.gpl_only	= false,
 | |
| 	.pkt_access	= true,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| 	.arg1_type	= ARG_CONST_MAP_PTR,
 | |
| 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 | |
| };
 | |
| 
 | |
| const struct bpf_func_proto bpf_get_prandom_u32_proto = {
 | |
| 	.func		= bpf_user_rnd_u32,
 | |
| 	.gpl_only	= false,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| };
 | |
| 
 | |
/* bpf_get_smp_processor_id - eBPF helper returning the value of
 * smp_processor_id() at the time of the call.
 */
BPF_CALL_0(bpf_get_smp_processor_id)
{
	/* Returned directly so the conversion to the helper's return type
	 * happens exactly once, straight from smp_processor_id()'s type.
	 */
	return smp_processor_id();
}
 | |
| 
 | |
| const struct bpf_func_proto bpf_get_smp_processor_id_proto = {
 | |
| 	.func		= bpf_get_smp_processor_id,
 | |
| 	.gpl_only	= false,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| };
 | |
| 
 | |
/* bpf_ktime_get_ns - eBPF helper returning the monotonic clock in
 * nanoseconds, as reported by ktime_get_mono_fast_ns().
 */
BPF_CALL_0(bpf_ktime_get_ns)
{
	/* The "fast" accessor is used because it is NMI safe. */
	return ktime_get_mono_fast_ns();
}
 | |
| 
 | |
| const struct bpf_func_proto bpf_ktime_get_ns_proto = {
 | |
| 	.func		= bpf_ktime_get_ns,
 | |
| 	.gpl_only	= true,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| };
 | |
| 
 | |
| BPF_CALL_0(bpf_get_current_pid_tgid)
 | |
| {
 | |
| 	struct task_struct *task = current;
 | |
| 
 | |
| 	if (unlikely(!task))
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	return (u64) task->tgid << 32 | task->pid;
 | |
| }
 | |
| 
 | |
| const struct bpf_func_proto bpf_get_current_pid_tgid_proto = {
 | |
| 	.func		= bpf_get_current_pid_tgid,
 | |
| 	.gpl_only	= false,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| };
 | |
| 
 | |
| BPF_CALL_0(bpf_get_current_uid_gid)
 | |
| {
 | |
| 	struct task_struct *task = current;
 | |
| 	kuid_t uid;
 | |
| 	kgid_t gid;
 | |
| 
 | |
| 	if (unlikely(!task))
 | |
| 		return -EINVAL;
 | |
| 
 | |
| 	current_uid_gid(&uid, &gid);
 | |
| 	return (u64) from_kgid(&init_user_ns, gid) << 32 |
 | |
| 		     from_kuid(&init_user_ns, uid);
 | |
| }
 | |
| 
 | |
| const struct bpf_func_proto bpf_get_current_uid_gid_proto = {
 | |
| 	.func		= bpf_get_current_uid_gid,
 | |
| 	.gpl_only	= false,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| };
 | |
| 
 | |
| BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
 | |
| {
 | |
| 	struct task_struct *task = current;
 | |
| 
 | |
| 	if (unlikely(!task))
 | |
| 		goto err_clear;
 | |
| 
 | |
| 	strncpy(buf, task->comm, size);
 | |
| 
 | |
| 	/* Verifier guarantees that size > 0. For task->comm exceeding
 | |
| 	 * size, guarantee that buf is %NUL-terminated. Unconditionally
 | |
| 	 * done here to save the size test.
 | |
| 	 */
 | |
| 	buf[size - 1] = 0;
 | |
| 	return 0;
 | |
| err_clear:
 | |
| 	memset(buf, 0, size);
 | |
| 	return -EINVAL;
 | |
| }
 | |
| 
 | |
| const struct bpf_func_proto bpf_get_current_comm_proto = {
 | |
| 	.func		= bpf_get_current_comm,
 | |
| 	.gpl_only	= false,
 | |
| 	.ret_type	= RET_INTEGER,
 | |
| 	.arg1_type	= ARG_PTR_TO_RAW_STACK,
 | |
| 	.arg2_type	= ARG_CONST_STACK_SIZE,
 | |
| };
 |