From 62b8cea62e8bad0511260faab8e8de04c76a69af Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 7 Feb 2019 11:29:24 -0800 Subject: [PATCH 01/36] tools/bpf: add missing strings.h include Few files in libbpf are using bzero() function (defined in strings.h header), but don't include corresponding header. When libbpf is added as a dependency to pahole, this undeterministically causes warnings on some machines: bpf.c:225:2: warning: implicit declaration of function 'bzero' [-Wimplicit-function-declaration] bzero(&attr, sizeof(attr)); ^~~~~ Signed-off-by: Andrii Nakryiko Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 1 + tools/lib/bpf/btf.c | 1 + tools/lib/bpf/libbpf.c | 1 + 3 files changed, 3 insertions(+) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 3defad77dc7a..92fd27fe0599 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -22,6 +22,7 @@ */ #include +#include #include #include #include diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index ab6528c935a1..4324eb47d214 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 47969aa0faf8..8d64ada5f728 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include From a4021a3579c52d5a5131820aeb94f531a7b082a7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 7 Feb 2019 09:34:51 -0800 Subject: [PATCH 02/36] tools/bpf: add log_level to bpf_load_program_attr The kernel verifier has three levels of logs: 0: no logs 1: logs mostly useful > 1: verbose Current libbpf API functions bpf_load_program_xattr() and bpf_load_program() cannot specify log_level. The bcc, however, provides an interface for user to specify log_level 2 for verbose output. This patch added log_level into structure bpf_load_program_attr, so users, including bcc, can use bpf_load_program_xattr() to change log_level. The supported log_level is 0, 1, and 2. The bpf selftest test_sock.c is modified to enable log_level = 2. If the "verbose" in test_sock.c is changed to true, the test will output logs like below: $ ./test_sock func#0 @0 0: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 0: (bf) r6 = r1 1: R1=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 1: (61) r7 = *(u32 *)(r6 +28) invalid bpf_context access off=28 size=4 Test case: bind4 load with invalid access: src_ip6 .. [PASS] ... Test case: bind6 allow all .. [PASS] Summary: 16 PASSED, 0 FAILED Some test_sock tests are negative tests and verbose verifier log will be printed out as shown in the above. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 22 +++++++++++++++++----- tools/lib/bpf/bpf.h | 1 + tools/testing/selftests/bpf/test_sock.c | 9 ++++++++- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 92fd27fe0599..a5261f39e2bd 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -215,10 +215,15 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, { void *finfo = NULL, *linfo = NULL; union bpf_attr attr; + __u32 log_level; __u32 name_len; int fd; - if (!load_attr) + if (!load_attr || !log_buf != !log_buf_sz) + return -EINVAL; + + log_level = load_attr->log_level; + if (log_level > 2 || (log_level && !log_buf)) return -EINVAL; name_len = load_attr->name ? 
strlen(load_attr->name) : 0; @@ -229,9 +234,16 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, attr.insn_cnt = (__u32)load_attr->insns_cnt; attr.insns = ptr_to_u64(load_attr->insns); attr.license = ptr_to_u64(load_attr->license); - attr.log_buf = ptr_to_u64(NULL); - attr.log_size = 0; - attr.log_level = 0; + + attr.log_level = log_level; + if (log_level) { + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_buf_sz; + } else { + attr.log_buf = ptr_to_u64(NULL); + attr.log_size = 0; + } + attr.kern_version = load_attr->kern_version; attr.prog_ifindex = load_attr->prog_ifindex; attr.prog_btf_fd = load_attr->prog_btf_fd; @@ -287,7 +299,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, goto done; } - if (!log_buf || !log_buf_sz) + if (log_level || !log_buf) goto done; /* Try again with log */ diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index ed09eed2dc3b..6ffdd79bea89 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -85,6 +85,7 @@ struct bpf_load_program_attr { __u32 line_info_rec_size; const void *line_info; __u32 line_info_cnt; + __u32 log_level; }; /* Flags to direct loading requirements */ diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index 561ffb6d6433..fb679ac3d4b0 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -20,6 +20,7 @@ #define MAX_INSNS 512 char bpf_log_buf[BPF_LOG_BUF_SIZE]; +static bool verbose = false; struct sock_test { const char *descr; @@ -325,6 +326,7 @@ static int load_sock_prog(const struct bpf_insn *prog, enum bpf_attach_type attach_type) { struct bpf_load_program_attr attr; + int ret; memset(&attr, 0, sizeof(struct bpf_load_program_attr)); attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK; @@ -332,8 +334,13 @@ static int load_sock_prog(const struct bpf_insn *prog, attr.insns = prog; attr.insns_cnt = probe_prog_length(attr.insns); attr.license = "GPL"; + attr.log_level = 2; - return bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE); + ret = bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE); + if (verbose && ret < 0) + fprintf(stderr, "%s\n", bpf_log_buf); + + return ret; } static int attach_sock_prog(int cgfd, int progfd, From d29d87f7e61226c339d1212beff6b82f653acd67 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Feb 2019 11:19:36 -0800 Subject: [PATCH 03/36] btf: separate btf creation and loading This change splits out previous btf__new functionality of constructing struct btf and loading it into kernel into two: - btf__new() just creates and initializes struct btf - btf__load() attempts to load existing struct btf into kernel btf__free will still close BTF fd, if it was ever loaded successfully into kernel. This change allows users of libbpf to manipulate BTF using its API, without the need to unnecessarily load it into kernel. One of the intended use cases is pahole, which will do DWARF to BTF conversion and then use libbpf to do type deduplication, while then handling ELF sections overwriting and other concerns on its own. 
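
For illustration, a minimal caller sketch of the split API. The
btf__dedup() arguments and the libbpf_get_error() error handling are
assumptions of this sketch, not part of the patch; btf__new(),
btf__load() and btf__free() are the APIs introduced/kept here:

    #include <stdbool.h>
    #include <bpf/btf.h>
    #include <bpf/libbpf.h>

    static int dedup_and_maybe_load(__u8 *raw, __u32 raw_size, bool do_load)
    {
            struct btf *btf = btf__new(raw, raw_size); /* parse only */
            int err = libbpf_get_error(btf);

            if (err)
                    return err;

            /* purely in-memory manipulation, no kernel involvement */
            err = btf__dedup(btf, NULL, NULL);
            if (!err && do_load)
                    err = btf__load(btf); /* explicit, optional load */

            btf__free(btf); /* closes the BTF fd if it was loaded */
            return err;
    }
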
Fixes: 2d3feca8c44f ("bpf: btf: print map dump and lookup with btf info") Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 54 ++++++++++++++++++++++------------------ tools/lib/bpf/btf.h | 1 + tools/lib/bpf/libbpf.c | 2 +- tools/lib/bpf/libbpf.map | 1 + 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 4324eb47d214..46db0a3b5cb7 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -367,8 +367,6 @@ void btf__free(struct btf *btf) struct btf *btf__new(__u8 *data, __u32 size) { - __u32 log_buf_size = 0; - char *log_buf = NULL; struct btf *btf; int err; @@ -378,15 +376,6 @@ struct btf *btf__new(__u8 *data, __u32 size) btf->fd = -1; - log_buf = malloc(BPF_LOG_BUF_SIZE); - if (!log_buf) { - err = -ENOMEM; - goto done; - } - - *log_buf = 0; - log_buf_size = BPF_LOG_BUF_SIZE; - btf->data = malloc(size); if (!btf->data) { err = -ENOMEM; @@ -396,17 +385,6 @@ struct btf *btf__new(__u8 *data, __u32 size) memcpy(btf->data, data, size); btf->data_size = size; - btf->fd = bpf_load_btf(btf->data, btf->data_size, - log_buf, log_buf_size, false); - - if (btf->fd == -1) { - err = -errno; - pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno); - if (log_buf && *log_buf) - pr_warning("%s\n", log_buf); - goto done; - } - err = btf_parse_hdr(btf); if (err) goto done; @@ -418,8 +396,6 @@ struct btf *btf__new(__u8 *data, __u32 size) err = btf_parse_type_sec(btf); done: - free(log_buf); - if (err) { btf__free(btf); return ERR_PTR(err); @@ -428,6 +404,36 @@ done: return btf; } +int btf__load(struct btf *btf) +{ + __u32 log_buf_size = BPF_LOG_BUF_SIZE; + char *log_buf = NULL; + int err = 0; + + if (btf->fd >= 0) + return -EEXIST; + + log_buf = malloc(log_buf_size); + if (!log_buf) + return -ENOMEM; + + *log_buf = 0; + + btf->fd = bpf_load_btf(btf->data, btf->data_size, + log_buf, log_buf_size, false); + if (btf->fd < 0) { + err = -errno; + pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno); + if (*log_buf) + pr_warning("%s\n", log_buf); + goto done; + } + +done: + free(log_buf); + return err; +} + int btf__fd(const struct btf *btf) { return btf->fd; diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b393da90cc85..f55b7bc98d9e 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -57,6 +57,7 @@ struct btf_ext_header { LIBBPF_API void btf__free(struct btf *btf); LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size); +LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, const char *type_name); LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 8d64ada5f728..e3c39edfb9d3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -836,7 +836,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj, int flags) obj->efile.maps_shndx = idx; else if (strcmp(name, BTF_ELF_SEC) == 0) { obj->btf = btf__new(data->d_buf, data->d_size); - if (IS_ERR(obj->btf)) { + if (IS_ERR(obj->btf) || btf__load(obj->btf)) { pr_warning("Error loading ELF section %s: %ld. 
Ignored and continue.\n", BTF_ELF_SEC, PTR_ERR(obj->btf)); obj->btf = NULL; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 89c1149e32ee..f5372df143f4 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -137,6 +137,7 @@ LIBBPF_0.0.2 { btf__get_map_kv_tids; btf__get_nr_types; btf__get_strings; + btf__load; btf_ext__free; btf_ext__func_info_rec_size; btf_ext__line_info_rec_size; From 02c874460f3d9213096323ac8a937fb486a4e70d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Feb 2019 11:19:37 -0800 Subject: [PATCH 04/36] btf: expose API to work with raw btf data This patch exposes new API btf__get_raw_data() that allows to get a copy of raw BTF data out of struct btf. This is useful for external programs that need to manipulate raw data, e.g., pahole using btf__dedup() to deduplicate BTF type info and then writing it back to file. Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 6 ++++++ tools/lib/bpf/btf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 8 insertions(+) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 46db0a3b5cb7..4fba0aa989df 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -439,6 +439,12 @@ int btf__fd(const struct btf *btf) return btf->fd; } +const void *btf__get_raw_data(const struct btf *btf, __u32 *size) +{ + *size = btf->data_size; + return btf->data; +} + void btf__get_strings(const struct btf *btf, const char **strings, __u32 *str_len) { diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index f55b7bc98d9e..10fe412461fe 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -66,6 +66,7 @@ LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__fd(const struct btf *btf); +LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); LIBBPF_API void btf__get_strings(const struct btf *btf, const char **strings, __u32 *str_len); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index f5372df143f4..9e10467f8cbb 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -136,6 +136,7 @@ LIBBPF_0.0.2 { btf__dedup; btf__get_map_kv_tids; btf__get_nr_types; + btf__get_raw_data; btf__get_strings; btf__load; btf_ext__free; From ae4ab4b4117d23da49f04a7e1fe82a41e6074eeb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Feb 2019 11:19:38 -0800 Subject: [PATCH 05/36] btf: expose API to work with raw btf_ext data This patch changes struct btf_ext to retain original data in sequential block of memory, which makes it possible to expose btf_ext__get_raw_data() interface similar to btf__get_raw_data(), allowing users of libbpf to get access to raw representation of .BTF.ext section. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 102 +++++++++++++++++++++------------------ tools/lib/bpf/btf.h | 2 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 57 insertions(+), 48 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 4fba0aa989df..f6b724ed1bdd 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -42,9 +42,8 @@ struct btf { struct btf_ext_info { /* - * info points to a deep copy of the individual info section - * (e.g. 
func_info and line_info) from the .BTF.ext. - * It does not include the __u32 rec_size. + * info points to the individual info section (e.g. func_info and + * line_info) from the .BTF.ext. It does not include the __u32 rec_size. */ void *info; __u32 rec_size; @@ -52,8 +51,13 @@ struct btf_ext_info { }; struct btf_ext { + union { + struct btf_ext_header *hdr; + void *data; + }; struct btf_ext_info func_info; struct btf_ext_info line_info; + __u32 data_size; }; struct btf_ext_info_sec { @@ -596,7 +600,7 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, return 0; } -struct btf_ext_sec_copy_param { +struct btf_ext_sec_setup_param { __u32 off; __u32 len; __u32 min_rec_size; @@ -604,20 +608,14 @@ struct btf_ext_sec_copy_param { const char *desc; }; -static int btf_ext_copy_info(struct btf_ext *btf_ext, - __u8 *data, __u32 data_size, - struct btf_ext_sec_copy_param *ext_sec) +static int btf_ext_setup_info(struct btf_ext *btf_ext, + struct btf_ext_sec_setup_param *ext_sec) { - const struct btf_ext_header *hdr = (struct btf_ext_header *)data; const struct btf_ext_info_sec *sinfo; struct btf_ext_info *ext_info; __u32 info_left, record_size; /* The start of the info sec (including the __u32 record_size). */ - const void *info; - - /* data and data_size do not include btf_ext_header from now on */ - data = data + hdr->hdr_len; - data_size -= hdr->hdr_len; + void *info; if (ext_sec->off & 0x03) { pr_debug(".BTF.ext %s section is not aligned to 4 bytes\n", @@ -625,16 +623,15 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, return -EINVAL; } - if (data_size < ext_sec->off || - ext_sec->len > data_size - ext_sec->off) { + info = btf_ext->data + btf_ext->hdr->hdr_len + ext_sec->off; + info_left = ext_sec->len; + + if (btf_ext->data + btf_ext->data_size < info + ext_sec->len) { pr_debug("%s section (off:%u len:%u) is beyond the end of the ELF section .BTF.ext\n", - ext_sec->desc, ext_sec->off, ext_sec->len); + ext_sec->desc, ext_sec->off, ext_sec->len); return -EINVAL; } - info = data + ext_sec->off; - info_left = ext_sec->len; - /* At least a record size */ if (info_left < sizeof(__u32)) { pr_debug(".BTF.ext %s record size not found\n", ext_sec->desc); @@ -646,7 +643,7 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, if (record_size < ext_sec->min_rec_size || record_size & 0x03) { pr_debug("%s section in .BTF.ext has invalid record size %u\n", - ext_sec->desc, record_size); + ext_sec->desc, record_size); return -EINVAL; } @@ -692,42 +689,35 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, ext_info = ext_sec->ext_info; ext_info->len = ext_sec->len - sizeof(__u32); ext_info->rec_size = record_size; - ext_info->info = malloc(ext_info->len); - if (!ext_info->info) - return -ENOMEM; - memcpy(ext_info->info, info + sizeof(__u32), ext_info->len); + ext_info->info = info + sizeof(__u32); return 0; } -static int btf_ext_copy_func_info(struct btf_ext *btf_ext, - __u8 *data, __u32 data_size) +static int btf_ext_setup_func_info(struct btf_ext *btf_ext) { - const struct btf_ext_header *hdr = (struct btf_ext_header *)data; - struct btf_ext_sec_copy_param param = { - .off = hdr->func_info_off, - .len = hdr->func_info_len, + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->func_info_off, + .len = btf_ext->hdr->func_info_len, .min_rec_size = sizeof(struct bpf_func_info_min), .ext_info = &btf_ext->func_info, .desc = "func_info" }; - return btf_ext_copy_info(btf_ext, data, data_size, ¶m); + return btf_ext_setup_info(btf_ext, ¶m); } -static int 
btf_ext_copy_line_info(struct btf_ext *btf_ext, - __u8 *data, __u32 data_size) +static int btf_ext_setup_line_info(struct btf_ext *btf_ext) { - const struct btf_ext_header *hdr = (struct btf_ext_header *)data; - struct btf_ext_sec_copy_param param = { - .off = hdr->line_info_off, - .len = hdr->line_info_len, + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->line_info_off, + .len = btf_ext->hdr->line_info_len, .min_rec_size = sizeof(struct bpf_line_info_min), .ext_info = &btf_ext->line_info, .desc = "line_info", }; - return btf_ext_copy_info(btf_ext, data, data_size, ¶m); + return btf_ext_setup_info(btf_ext, ¶m); } static int btf_ext_parse_hdr(__u8 *data, __u32 data_size) @@ -767,9 +757,7 @@ void btf_ext__free(struct btf_ext *btf_ext) { if (!btf_ext) return; - - free(btf_ext->func_info.info); - free(btf_ext->line_info.info); + free(btf_ext->data); free(btf_ext); } @@ -786,13 +774,23 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size) if (!btf_ext) return ERR_PTR(-ENOMEM); - err = btf_ext_copy_func_info(btf_ext, data, size); - if (err) { - btf_ext__free(btf_ext); - return ERR_PTR(err); + btf_ext->data_size = size; + btf_ext->data = malloc(size); + if (!btf_ext->data) { + err = -ENOMEM; + goto done; } + memcpy(btf_ext->data, data, size); - err = btf_ext_copy_line_info(btf_ext, data, size); + err = btf_ext_setup_func_info(btf_ext); + if (err) + goto done; + + err = btf_ext_setup_line_info(btf_ext); + if (err) + goto done; + +done: if (err) { btf_ext__free(btf_ext); return ERR_PTR(err); @@ -801,6 +799,12 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size) return btf_ext; } +const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) +{ + *size = btf_ext->data_size; + return btf_ext->data; +} + static int btf_ext_reloc_info(const struct btf *btf, const struct btf_ext_info *ext_info, const char *sec_name, __u32 insns_cnt, @@ -849,7 +853,8 @@ static int btf_ext_reloc_info(const struct btf *btf, return -ENOENT; } -int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ext, +int btf_ext__reloc_func_info(const struct btf *btf, + const struct btf_ext *btf_ext, const char *sec_name, __u32 insns_cnt, void **func_info, __u32 *cnt) { @@ -857,7 +862,8 @@ int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ex insns_cnt, func_info, cnt); } -int btf_ext__reloc_line_info(const struct btf *btf, const struct btf_ext *btf_ext, +int btf_ext__reloc_line_info(const struct btf *btf, + const struct btf_ext *btf_ext, const char *sec_name, __u32 insns_cnt, void **line_info, __u32 *cnt) { diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 10fe412461fe..0306b54d54eb 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -78,6 +78,8 @@ LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext* btf_ext, + __u32 *size); LIBBPF_API int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ext, const char *sec_name, __u32 insns_cnt, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 9e10467f8cbb..eb78c7c261d9 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -141,6 +141,7 @@ LIBBPF_0.0.2 { btf__load; btf_ext__free; btf_ext__func_info_rec_size; + btf_ext__get_raw_data; btf_ext__line_info_rec_size; btf_ext__new; btf_ext__reloc_func_info; From 
49b57e0d01db73c99f86d68480fb9b4014bb1060 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 8 Feb 2019 11:19:39 -0800 Subject: [PATCH 06/36] tools/bpf: remove btf__get_strings() superseded by raw data API Now that we have btf__get_raw_data() it's trivial for tests to iterate over all strings for testing purposes, which eliminates the need for btf__get_strings() API. Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 7 ----- tools/lib/bpf/btf.h | 2 -- tools/lib/bpf/libbpf.map | 1 - tools/testing/selftests/bpf/test_btf.c | 39 +++++++++++++++++--------- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index f6b724ed1bdd..6953fedb88ff 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -449,13 +449,6 @@ const void *btf__get_raw_data(const struct btf *btf, __u32 *size) return btf->data; } -void btf__get_strings(const struct btf *btf, const char **strings, - __u32 *str_len) -{ - *strings = btf->strings; - *str_len = btf->hdr->str_len; -} - const char *btf__name_by_offset(const struct btf *btf, __u32 offset) { if (offset < btf->hdr->str_len) diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 0306b54d54eb..94bbc249b0f1 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -67,8 +67,6 @@ LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__fd(const struct btf *btf); LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); -LIBBPF_API void btf__get_strings(const struct btf *btf, const char **strings, - __u32 *str_len); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index eb78c7c261d9..5fc8222209f8 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -137,7 +137,6 @@ LIBBPF_0.0.2 { btf__get_map_kv_tids; btf__get_nr_types; btf__get_raw_data; - btf__get_strings; btf__load; btf_ext__free; btf_ext__func_info_rec_size; diff --git a/tools/testing/selftests/bpf/test_btf.c b/tools/testing/selftests/bpf/test_btf.c index 447acc34db94..bbcacba39590 100644 --- a/tools/testing/selftests/bpf/test_btf.c +++ b/tools/testing/selftests/bpf/test_btf.c @@ -5882,15 +5882,17 @@ static void dump_btf_strings(const char *strs, __u32 len) static int do_test_dedup(unsigned int test_num) { const struct btf_dedup_test *test = &dedup_tests[test_num - 1]; - int err = 0, i; - __u32 test_nr_types, expect_nr_types, test_str_len, expect_str_len; - void *raw_btf; - unsigned int raw_btf_size; + __u32 test_nr_types, expect_nr_types, test_btf_size, expect_btf_size; + const struct btf_header *test_hdr, *expect_hdr; struct btf *test_btf = NULL, *expect_btf = NULL; + const void *test_btf_data, *expect_btf_data; const char *ret_test_next_str, *ret_expect_next_str; const char *test_strs, *expect_strs; const char *test_str_cur, *test_str_end; const char *expect_str_cur, *expect_str_end; + unsigned int raw_btf_size; + void *raw_btf; + int err = 0, i; fprintf(stderr, "BTF dedup test[%u] (%s):", test_num, test->descr); @@ -5927,23 +5929,34 @@ static int do_test_dedup(unsigned int test_num) goto done; } - btf__get_strings(test_btf, &test_strs, &test_str_len); - btf__get_strings(expect_btf, &expect_strs, 
&expect_str_len); - if (CHECK(test_str_len != expect_str_len, - "test_str_len:%u != expect_str_len:%u", - test_str_len, expect_str_len)) { + test_btf_data = btf__get_raw_data(test_btf, &test_btf_size); + expect_btf_data = btf__get_raw_data(expect_btf, &expect_btf_size); + if (CHECK(test_btf_size != expect_btf_size, + "test_btf_size:%u != expect_btf_size:%u", + test_btf_size, expect_btf_size)) { + err = -1; + goto done; + } + + test_hdr = test_btf_data; + test_strs = test_btf_data + test_hdr->str_off; + expect_hdr = expect_btf_data; + expect_strs = expect_btf_data + expect_hdr->str_off; + if (CHECK(test_hdr->str_len != expect_hdr->str_len, + "test_hdr->str_len:%u != expect_hdr->str_len:%u", + test_hdr->str_len, expect_hdr->str_len)) { fprintf(stderr, "\ntest strings:\n"); - dump_btf_strings(test_strs, test_str_len); + dump_btf_strings(test_strs, test_hdr->str_len); fprintf(stderr, "\nexpected strings:\n"); - dump_btf_strings(expect_strs, expect_str_len); + dump_btf_strings(expect_strs, expect_hdr->str_len); err = -1; goto done; } test_str_cur = test_strs; - test_str_end = test_strs + test_str_len; + test_str_end = test_strs + test_hdr->str_len; expect_str_cur = expect_strs; - expect_str_end = expect_strs + expect_str_len; + expect_str_end = expect_strs + expect_hdr->str_len; while (test_str_cur < test_str_end && expect_str_cur < expect_str_end) { size_t test_len, expect_len; From 5f4566498dee5e38e36a015a968c22ed21568f0b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 8 Feb 2019 22:25:54 -0800 Subject: [PATCH 07/36] bpf: Fix narrow load on a bpf_sock returned from sk_lookup() By adding this test to test_verifier: { "reference tracking: access sk->src_ip4 (narrow load)", .insns = { BPF_SK_LOOKUP, BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, src_ip4) + 2), BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), BPF_EMIT_CALL(BPF_FUNC_sk_release), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, The above test loads 2 bytes from sk->src_ip4 where sk is obtained by bpf_sk_lookup_tcp(). It hits an internal verifier error from convert_ctx_accesses(): [root@arch-fb-vm1 bpf]# ./test_verifier 665 665 Failed to load prog 'Invalid argument'! 0: (b7) r2 = 0 1: (63) *(u32 *)(r10 -8) = r2 2: (7b) *(u64 *)(r10 -16) = r2 3: (7b) *(u64 *)(r10 -24) = r2 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 7: (bf) r2 = r10 8: (07) r2 += -48 9: (b7) r3 = 36 10: (b7) r4 = 0 11: (b7) r5 = 0 12: (85) call bpf_sk_lookup_tcp#84 13: (bf) r6 = r0 14: (15) if r0 == 0x0 goto pc+3 R0=sock(id=1,off=0,imm=0) R6=sock(id=1,off=0,imm=0) R10=fp0,call_-1 fp-8=????0000 fp-16=0000mmmm fp-24=mmmmmmmm fp-32=mmmmmmmm fp-40=mmmmmmmm fp-48=mmmmmmmm refs=1 15: (69) r2 = *(u16 *)(r0 +26) 16: (bf) r1 = r6 17: (85) call bpf_sk_release#86 18: (95) exit from 14 to 18: safe processed 20 insns (limit 131072), stack depth 48 bpf verifier is misconfigured Summary: 0 PASSED, 0 SKIPPED, 1 FAILED The bpf_sock_is_valid_access() is expecting src_ip4 can be narrowly loaded (meaning load any 1 or 2 bytes of the src_ip4) by marking info->ctx_field_size. However, this marked ctx_field_size is not used. This patch fixes it. Due to the recent refactoring in test_verifier, this new test will be added to the bpf-next branch (together with the bpf_tcp_sock patchset) to avoid merge conflict. 
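
For reference, the same access pattern in restricted C (a sketch only,
not the in-tree test; the bpf_sk_lookup_tcp()/bpf_sk_release()
declarations are assumed to come from the selftests' bpf_helpers.h):

    #include <linux/bpf.h>
    #include "bpf_helpers.h"

    SEC("classifier")
    int narrow_src_ip4(struct __sk_buff *skb)
    {
            struct bpf_sock_tuple tuple = {};
            struct bpf_sock *sk;
            __u16 half = 0;

            sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                                   BPF_F_CURRENT_NETNS, 0);
            if (!sk)
                    return 0;
            /* clang folds this into a single 2-byte (BPF_H) load at
             * offsetof(struct bpf_sock, src_ip4) + 2, i.e. exactly the
             * access that used to hit the convert_ctx_accesses() error.
             */
            half = *(__u16 *)((char *)&sk->src_ip4 + 2);
            bpf_sk_release(sk);
            return half ? 1 : 0;
    }

    char _license[] SEC("license") = "GPL";
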
Fixes: c64b7983288e ("bpf: Add PTR_TO_SOCKET verifier type") Cc: Joe Stringer Signed-off-by: Martin KaFai Lau Acked-by: Joe Stringer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b63bc77af2d1..516dfc6d78de 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1640,12 +1640,13 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } -static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, enum bpf_access_type t) +static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, + u32 regno, int off, int size, + enum bpf_access_type t) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; - struct bpf_insn_access_aux info; + struct bpf_insn_access_aux info = {}; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1659,6 +1660,8 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + return 0; } @@ -2055,7 +2058,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "cannot write into socket\n"); return -EACCES; } - err = check_sock_access(env, regno, off, size, t); + err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { From 46f8bc92758c6259bcf945e9216098661c1587cd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:20 -0800 Subject: [PATCH 08/36] bpf: Add a bpf_sock pointer to __sk_buff and a bpf_sk_fullsock helper In kernel, it is common to check "skb->sk && sk_fullsock(skb->sk)" before accessing the fields in sock. For example, in __netdev_pick_tx: static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* ... */ struct sock *sk = skb->sk; if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); /* ... */ return queue_index; } This patch adds a "struct bpf_sock *sk" pointer to the "struct __sk_buff" where a few of the convert_ctx_access() in filter.c has already been accessing the skb->sk sock_common's fields, e.g. sock_ops_convert_ctx_access(). "__sk_buff->sk" is a PTR_TO_SOCK_COMMON_OR_NULL in the verifier. Some of the fileds in "bpf_sock" will not be directly accessible through the "__sk_buff->sk" pointer. It is limited by the new "bpf_sock_common_is_valid_access()". e.g. The existing "type", "protocol", "mark" and "priority" in bpf_sock are not allowed. The newly added "struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)" can be used to get a sk with all accessible fields in "bpf_sock". This helper is added to both cg_skb and sched_(cls|act). int cg_skb_foo(struct __sk_buff *skb) { struct bpf_sock *sk; sk = skb->sk; if (!sk) return 1; sk = bpf_sk_fullsock(sk); if (!sk) return 1; if (sk->family != AF_INET6 || sk->protocol != IPPROTO_TCP) return 1; /* some_traffic_shaping(); */ return 1; } (1) The sk is read only (2) There is no new "struct bpf_sock_common" introduced. (3) Future kernel sock's members could be added to bpf_sock only instead of repeatedly adding at multiple places like currently in bpf_sock_ops_md, bpf_sock_addr_md, sk_reuseport_md...etc. 
(4) After "sk = skb->sk", the reg holding sk is in type PTR_TO_SOCK_COMMON_OR_NULL. (5) After bpf_sk_fullsock(), the return type will be in type PTR_TO_SOCKET_OR_NULL which is the same as the return type of bpf_sk_lookup_xxx(). However, bpf_sk_fullsock() does not take refcnt. The acquire_reference_state() is only depending on the return type now. To avoid it, a new is_acquire_function() is checked before calling acquire_reference_state(). (6) The WARN_ON in "release_reference_state()" is no longer an internal verifier bug. When reg->id is not found in state->refs[], it means the bpf_prog does something wrong like "bpf_sk_release(bpf_sk_fullsock(skb->sk))" where reference has never been acquired by calling "bpf_sk_fullsock(skb->sk)". A -EINVAL and a verbose are done instead of WARN_ON. A test is added to the test_verifier in a later patch. Since the WARN_ON in "release_reference_state()" is no longer needed, "__release_reference_state()" is folded into "release_reference_state()" also. Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 ++++ include/uapi/linux/bpf.h | 12 +++- kernel/bpf/verifier.c | 132 +++++++++++++++++++++++++++------------ net/core/filter.c | 42 +++++++++++++ 4 files changed, 157 insertions(+), 41 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bd169a7bcc93..a60463b45b54 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -194,6 +194,7 @@ enum bpf_arg_type { ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ + ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ }; /* type of values returned from helper functions */ @@ -256,6 +257,8 @@ enum bpf_reg_type { PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ + PTR_TO_SOCK_COMMON, /* reg points to sock_common */ + PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -920,6 +923,9 @@ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info); bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, @@ -928,6 +934,12 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size); #else +static inline bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} static inline bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1777fa0c61e4..5d79cba74ddc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2329,6 +2329,14 @@ union bpf_attr { * "**y**". * Return * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in bpf_sock can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2425,7 +2433,8 @@ union bpf_attr { FN(msg_pop_data), \ FN(rc_pointer_rel), \ FN(spin_lock), \ - FN(spin_unlock), + FN(spin_unlock), \ + FN(sk_fullsock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2545,6 +2554,7 @@ struct __sk_buff { __u64 tstamp; __u32 wire_len; __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); }; struct bpf_tunnel_key { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 516dfc6d78de..b755d55a3791 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -331,10 +331,17 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } +static bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL; + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_SOCK_COMMON_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -377,6 +384,12 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } +static bool is_acquire_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -392,6 +405,8 @@ static const char * const reg_type_str[] = { [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", }; static char slot_type_char[] = { @@ -618,13 +633,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) } /* release function corresponding to acquire_reference_state(). Idempotent. */ -static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +static int release_reference_state(struct bpf_func_state *state, int ptr_id) { int i, last_idx; - if (!ptr_id) - return -EFAULT; - last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { @@ -636,21 +648,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id) return 0; } } - return -EFAULT; -} - -/* variation on the above for cases where we expect that there must be an - * outstanding reference for the specified ptr_id. 
- */ -static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) -{ - struct bpf_func_state *state = cur_func(env); - int err; - - err = __release_reference_state(state, ptr_id); - if (WARN_ON_ONCE(err != 0)) - verbose(env, "verifier internal error: can't release reference\n"); - return err; + return -EINVAL; } static int transfer_reference_state(struct bpf_func_state *dst, @@ -1209,6 +1207,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: return true; default: return false; @@ -1647,6 +1647,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; struct bpf_insn_access_aux info = {}; + bool valid; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1654,15 +1655,28 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } - if (!bpf_sock_is_valid_access(off, size, t, &info)) { - verbose(env, "invalid bpf_sock access off=%d size=%d\n", - off, size); - return -EACCES; + switch (reg->type) { + case PTR_TO_SOCK_COMMON: + valid = bpf_sock_common_is_valid_access(off, size, t, &info); + break; + case PTR_TO_SOCKET: + valid = bpf_sock_is_valid_access(off, size, t, &info); + break; + default: + valid = false; } - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; - return 0; + if (valid) { + env->insn_aux_data[insn_idx].ctx_field_size = + info.ctx_field_size; + return 0; + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", + regno, reg_type_str[reg->type], off, size); + + return -EACCES; } static bool __is_pointer_value(bool allow_ptr_leaks, @@ -1688,8 +1702,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); - return reg->type == PTR_TO_CTX || - reg->type == PTR_TO_SOCKET; + return reg->type == PTR_TO_CTX; +} + +static bool is_sk_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return type_is_sk_pointer(reg->type); } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -1800,6 +1820,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCKET: pointer_desc = "sock "; break; + case PTR_TO_SOCK_COMMON: + pointer_desc = "sock_common "; + break; default: break; } @@ -2003,11 +2026,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. 
*/ - if (reg_type == SCALAR_VALUE) + if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, value_regno); - else + } else { mark_reg_known_zero(env, regs, value_regno); + if (reg_type_may_be_null(reg_type)) + regs[value_regno].id = ++env->id_gen; + } regs[value_regno].type = reg_type; } @@ -2053,9 +2079,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_SOCKET) { + } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "cannot write into socket\n"); + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -2102,7 +2129,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg)) { + is_flow_key_reg(env, insn->dst_reg) || + is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -2369,6 +2397,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { + expected_type = PTR_TO_SOCK_COMMON; + /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ + if (!type_is_sk_pointer(type)) + goto err_type; } else if (arg_type == ARG_PTR_TO_SOCKET) { expected_type = PTR_TO_SOCKET; if (type != expected_type) @@ -2783,7 +2816,7 @@ static int release_reference(struct bpf_verifier_env *env, for (i = 0; i <= vstate->curframe; i++) release_reg_references(env, vstate->frame[i], meta->ptr_id); - return release_reference_state(env, meta->ptr_id); + return release_reference_state(cur_func(env), meta->ptr_id); } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -3049,8 +3082,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } } else if (is_release_function(func_id)) { err = release_reference(env, &meta); - if (err) + if (err) { + verbose(env, "func %s#%d reference has not been acquired before\n", + func_id_name(func_id), func_id); return err; + } } regs = cur_regs(env); @@ -3099,12 +3135,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { - int id = acquire_reference_state(env, insn_idx); - if (id < 0) - return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = id; + if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For release_reference() */ + regs[BPF_REG_0].id = id; + } else { + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = ++env->id_gen; + } } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3364,6 +3407,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ 
-4597,6 +4642,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; + } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { + reg->type = PTR_TO_SOCK_COMMON; } if (is_null || !(reg_is_refcounted(reg) || reg_may_point_to_spin_lock(reg))) { @@ -4621,7 +4668,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, int i, j; if (reg_is_refcounted_or_null(®s[regno]) && is_null) - __release_reference_state(state, id); + release_reference_state(state, id); for (i = 0; i < MAX_BPF_REG; i++) mark_ptr_or_null_reg(state, ®s[i], id, is_null); @@ -5790,6 +5837,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6110,6 +6159,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_CTX: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: return false; default: return true; @@ -7112,6 +7163,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = ops->convert_ctx_access; break; case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; default: diff --git a/net/core/filter.c b/net/core/filter.c index 3a49f68eda10..401d2e0aebf8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1793,6 +1793,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sk_fullsock_proto = { + .func = bpf_sk_fullsock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { @@ -5406,6 +5420,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; default: return sk_filter_func_proto(func_id, prog); } @@ -5477,6 +5493,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; @@ -5764,6 +5782,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; + case offsetof(struct __sk_buff, sk): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + break; default: /* Only narrow read access allowed for now. 
*/ if (type == BPF_WRITE) { @@ -5950,6 +5973,18 @@ static bool __sock_filter_check_size(int off, int size, return size == size_default; } +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range_till(struct bpf_sock, type, priority): + return false; + default: + return bpf_sock_is_valid_access(off, size, type, info); + } +} + bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -6748,6 +6783,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + break; } return insn - insn_buf; From aa65d6960a98fc15a96ce361b26e9fd55c9bccc5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:21 -0800 Subject: [PATCH 09/36] bpf: Add state, dst_ip4, dst_ip6 and dst_port to bpf_sock This patch adds "state", "dst_ip4", "dst_ip6" and "dst_port" to the bpf_sock. The userspace has already been using "state", e.g. inet_diag (ss -t) and getsockopt(TCP_INFO). This patch also allows narrow load on the following existing fields: "family", "type", "protocol" and "src_port". Unlike IP address, the load offset is resticted to the first byte for them but it can be relaxed later if there is a use case. This patch also folds __sock_filter_check_size() into bpf_sock_is_valid_access() since it is not called by any where else. All bpf_sock checking is in one place. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 17 ++++--- net/core/filter.c | 99 +++++++++++++++++++++++++++++++--------- 2 files changed, 85 insertions(+), 31 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5d79cba74ddc..d8f91777c5b6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2606,15 +2606,14 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_port; /* Allows 4-byte read. 
- * Stored in host byte order - */ + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __u32 dst_port; /* network byte order */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; }; struct bpf_sock_tuple { diff --git a/net/core/filter.c b/net/core/filter.c index 401d2e0aebf8..01bb64bf2b5e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5958,21 +5958,6 @@ full_access: return true; } -static bool __sock_filter_check_size(int off, int size, - struct bpf_insn_access_aux *info) -{ - const int size_default = sizeof(__u32); - - switch (off) { - case bpf_ctx_range(struct bpf_sock, src_ip4): - case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): - bpf_ctx_record_field_size(info, size_default); - return bpf_ctx_narrow_access_ok(off, size, size_default); - } - - return size == size_default; -} - bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) @@ -5988,13 +5973,29 @@ bool bpf_sock_common_is_valid_access(int off, int size, bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; - if (!__sock_filter_check_size(off, size, info)) - return false; - return true; + + switch (off) { + case offsetof(struct bpf_sock, state): + case offsetof(struct bpf_sock, family): + case offsetof(struct bpf_sock, type): + case offsetof(struct bpf_sock, protocol): + case offsetof(struct bpf_sock, dst_port): + case offsetof(struct bpf_sock, src_port): + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + case bpf_ctx_range(struct bpf_sock, dst_ip4): + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, @@ -6838,24 +6839,32 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_family)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_family), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_family, + FIELD_SIZEOF(struct sock_common, + skc_family), + target_size)); break; case offsetof(struct bpf_sock, type): + BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + *target_size = 2; break; case offsetof(struct bpf_sock, protocol): + BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); + *target_size = 1; break; case offsetof(struct bpf_sock, src_ip4): @@ -6867,6 +6876,15 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, target_size)); break; + case offsetof(struct bpf_sock, 
dst_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_daddr, + FIELD_SIZEOF(struct sock_common, + skc_daddr), + target_size)); + break; + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; @@ -6885,6 +6903,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, #endif break; + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, dst_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_v6_daddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]), + target_size) + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + *target_size = 4; +#endif + break; + case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), @@ -6894,6 +6929,26 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_num), target_size)); break; + + case offsetof(struct bpf_sock, dst_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_dport), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_dport, + FIELD_SIZEOF(struct sock_common, + skc_dport), + target_size)); + break; + + case offsetof(struct bpf_sock, state): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_state), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_state, + FIELD_SIZEOF(struct sock_common, + skc_state), + target_size)); + break; } return insn - insn_buf; From 9b1f3d6e5af295a72deb5e3f04db07a6a58be72e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:23 -0800 Subject: [PATCH 10/36] bpf: Refactor sock_ops_convert_ctx_access The next patch will introduce a new "struct bpf_tcp_sock" which exposes the same tcp_sock's fields already exposed in "struct bpf_sock_ops". This patch refactor the existing convert_ctx_access() codes for "struct bpf_sock_ops" to get them ready to be reused for "struct bpf_tcp_sock". The "rtt_min" is not refactored in this patch because its handling is different from other fields. The SOCK_OPS_GET_TCP_SOCK_FIELD is new. All other SOCK_OPS_XXX_FIELD changes are code move only. 
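
The shape of the pattern, as a standalone illustration (all names below
are made up for the demo; only CONVERT_COMMON_TCP_SOCK_FIELDS itself is
from this patch): one field table is dispatched on the field's offset
and parameterized by a converter macro, so a second context struct
(bpf_tcp_sock, in the next patch) can reuse the same table with its own
converter instead of duplicating every case:

    #include <stdio.h>
    #include <stddef.h>

    struct md_a { int snd_cwnd; int srtt_us; };
    struct md_b { int snd_cwnd; int srtt_us; };

    #define COMMON_FIELDS(md_type, CONVERT)                 \
    do {                                                    \
            switch (off) {                                  \
            case offsetof(md_type, snd_cwnd):               \
                    CONVERT(snd_cwnd); break;               \
            case offsetof(md_type, srtt_us):                \
                    CONVERT(srtt_us); break;                \
            }                                               \
    } while (0)

    #define CONVERT_A(f) printf("md_a: read tcp_sock." #f "\n")
    #define CONVERT_B(f) printf("md_b: read tcp_sock." #f "\n")

    static void convert_a(size_t off) { COMMON_FIELDS(struct md_a, CONVERT_A); }
    static void convert_b(size_t off) { COMMON_FIELDS(struct md_b, CONVERT_B); }

    int main(void)
    {
            convert_a(offsetof(struct md_a, srtt_us));
            convert_b(offsetof(struct md_b, snd_cwnd));
            return 0;
    }
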
Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 287 ++++++++++++++++++++-------------------------- 1 file changed, 127 insertions(+), 160 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 01bb64bf2b5e..c0d7b9ef279f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5030,6 +5030,54 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \ +do { \ + switch (si->off) { \ + case offsetof(md_type, snd_cwnd): \ + CONVERT(snd_cwnd); break; \ + case offsetof(md_type, srtt_us): \ + CONVERT(srtt_us); break; \ + case offsetof(md_type, snd_ssthresh): \ + CONVERT(snd_ssthresh); break; \ + case offsetof(md_type, rcv_nxt): \ + CONVERT(rcv_nxt); break; \ + case offsetof(md_type, snd_nxt): \ + CONVERT(snd_nxt); break; \ + case offsetof(md_type, snd_una): \ + CONVERT(snd_una); break; \ + case offsetof(md_type, mss_cache): \ + CONVERT(mss_cache); break; \ + case offsetof(md_type, ecn_flags): \ + CONVERT(ecn_flags); break; \ + case offsetof(md_type, rate_delivered): \ + CONVERT(rate_delivered); break; \ + case offsetof(md_type, rate_interval_us): \ + CONVERT(rate_interval_us); break; \ + case offsetof(md_type, packets_out): \ + CONVERT(packets_out); break; \ + case offsetof(md_type, retrans_out): \ + CONVERT(retrans_out); break; \ + case offsetof(md_type, total_retrans): \ + CONVERT(total_retrans); break; \ + case offsetof(md_type, segs_in): \ + CONVERT(segs_in); break; \ + case offsetof(md_type, data_segs_in): \ + CONVERT(data_segs_in); break; \ + case offsetof(md_type, segs_out): \ + CONVERT(segs_out); break; \ + case offsetof(md_type, data_segs_out): \ + CONVERT(data_segs_out); break; \ + case offsetof(md_type, lost_out): \ + CONVERT(lost_out); break; \ + case offsetof(md_type, sacked_out): \ + CONVERT(sacked_out); break; \ + case offsetof(md_type, bytes_received): \ + CONVERT(bytes_received); break; \ + case offsetof(md_type, bytes_acked): \ + CONVERT(bytes_acked); break; \ + } \ +} while (0) + #ifdef CONFIG_INET static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) @@ -7196,6 +7244,85 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn = insn_buf; int off; +/* Helper macro for adding read access to tcp_sock or sock fields. */ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \ + SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. 
However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other register. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + + CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops, + SOCK_OPS_GET_TCP_SOCK_FIELD); + + if (insn > insn_buf) + return insn - insn_buf; + switch (si->off) { case offsetof(struct bpf_sock_ops, op) ... offsetof(struct bpf_sock_ops, replylong[3]): @@ -7353,175 +7480,15 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, FIELD_SIZEOF(struct minmax_sample, t)); break; -/* Helper macro for adding read access to tcp_sock or sock fields. */ -#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ - do { \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, \ - is_fullsock), \ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ - *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, sk),\ - si->dst_reg, si->src_reg, \ - offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ - OBJ_FIELD), \ - si->dst_reg, si->dst_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ - } while (0) - -/* Helper macro for adding write access to tcp_sock or sock fields. - * The macro is called with two registers, dst_reg which contains a pointer - * to ctx (context) and src_reg which contains the value that should be - * stored. However, we need an additional register since we cannot overwrite - * dst_reg because it may be used later in the program. - * Instead we "borrow" one of the other register. We first save its value - * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore - * it at the end of the macro. 
- */ -#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ - do { \ - int reg = BPF_REG_9; \ - BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ - FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ - if (si->dst_reg == reg || si->src_reg == reg) \ - reg--; \ - if (si->dst_reg == reg || si->src_reg == reg) \ - reg--; \ - *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ - offsetof(struct bpf_sock_ops_kern, \ - temp)); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, \ - is_fullsock), \ - reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ - *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ - struct bpf_sock_ops_kern, sk),\ - reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, sk));\ - *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ - reg, si->src_reg, \ - offsetof(OBJ, OBJ_FIELD)); \ - *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ - offsetof(struct bpf_sock_ops_kern, \ - temp)); \ - } while (0) - -#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ - do { \ - if (TYPE == BPF_WRITE) \ - SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ - else \ - SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ - } while (0) - - case offsetof(struct bpf_sock_ops, snd_cwnd): - SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, srtt_us): - SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); - break; - case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, struct tcp_sock); break; - case offsetof(struct bpf_sock_ops, snd_ssthresh): - SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rcv_nxt): - SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, snd_nxt): - SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, snd_una): - SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, mss_cache): - SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, ecn_flags): - SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rate_delivered): - SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, rate_interval_us): - SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, packets_out): - SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, retrans_out): - SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, total_retrans): - SOCK_OPS_GET_FIELD(total_retrans, total_retrans, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, segs_in): - SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, data_segs_in): - SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, segs_out): - SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, data_segs_out): - SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, 
lost_out): - SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, sacked_out): - SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); - break; - case offsetof(struct bpf_sock_ops, sk_txhash): SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type); break; - - case offsetof(struct bpf_sock_ops, bytes_received): - SOCK_OPS_GET_FIELD(bytes_received, bytes_received, - struct tcp_sock); - break; - - case offsetof(struct bpf_sock_ops, bytes_acked): - SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); - break; - } return insn - insn_buf; } From 655a51e536c09d15ffa3603b1b6fce2b45b85a1f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:24 -0800 Subject: [PATCH 11/36] bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock This patch adds a helper function BPF_FUNC_tcp_sock, which is currently available for cg_skb and sched_(cls|act): struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk); int cg_skb_foo(struct __sk_buff *skb) { struct bpf_tcp_sock *tp; struct bpf_sock *sk; __u32 snd_cwnd; sk = skb->sk; if (!sk) return 1; tp = bpf_tcp_sock(sk); if (!tp) return 1; snd_cwnd = tp->snd_cwnd; /* ... */ return 1; } A 'struct bpf_tcp_sock' is also added to the uapi bpf.h to provide read-only access. bpf_tcp_sock has all the existing tcp_sock fields that have already been exposed by bpf_sock_ops, i.e. no new tcp_sock fields are exposed in bpf.h. This helper returns a pointer to the tcp_sock. If it is not a tcp_sock or it cannot be traced back to a tcp_sock by sk_to_full_sk(), it returns NULL. Hence, the caller needs to check for NULL before accessing it. The current use case is to expose members from tcp_sock to allow a cg_skb bpf prog to provide per-cgroup traffic policing/shaping.
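To make the policing use case concrete, here is a hedged sketch of a cgroup egress program built on the same pattern as cg_skb_foo above. The 1 MB quota, the program name, and the SEC() layout are invented for illustration; the bpf_tcp_sock() declaration is assumed to come from the selftests' bpf_helpers.h, which is extended later in this series:

#include <linux/bpf.h>
#include "bpf_helpers.h"

/* Invented threshold: after ~1MB has been acked on a connection,
 * start dropping its egress skbs so the sender backs off.
 */
#define BYTES_QUOTA (1 << 20)

SEC("cgroup_skb/egress")
int egress_policer(struct __sk_buff *skb)
{
	struct bpf_tcp_sock *tp;
	struct bpf_sock *sk;

	sk = skb->sk;
	if (!sk)		/* no socket attached: allow */
		return 1;

	tp = bpf_tcp_sock(sk);
	if (!tp)		/* not (traceable to) a TCP socket: allow */
		return 1;

	/* Returning 0 drops the packet once the quota is exceeded. */
	return tp->bytes_acked < BYTES_QUOTA ? 1 : 0;
}

char _license[] SEC("license") = "GPL";

Note the two NULL checks: the verifier rejects the program without them, since skb->sk is PTR_TO_SOCK_COMMON_OR_NULL and the helper returns PTR_TO_TCP_SOCK_OR_NULL.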
Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 +++++++++++++++ include/uapi/linux/bpf.h | 51 +++++++++++++++++++++++++- kernel/bpf/verifier.c | 31 +++++++++++++++- net/core/filter.c | 79 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 188 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a60463b45b54..7f58828755fd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -204,6 +204,7 @@ enum bpf_return_type { RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ + RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -259,6 +260,8 @@ enum bpf_reg_type { PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ + PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ + PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -956,4 +959,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, } #endif +#ifdef CONFIG_INET +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); +#else +static inline bool bpf_tcp_sock_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} + +static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + return 0; +} +#endif /* CONFIG_INET */ + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d8f91777c5b6..25c8c0e62ecf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2337,6 +2337,15 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or NULL in * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Return + * A **struct bpf_tcp_sock** pointer on success, or NULL in + * case of failure. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2434,7 +2443,8 @@ union bpf_attr { FN(rc_pointer_rel), \ FN(spin_lock), \ FN(spin_unlock), \ - FN(sk_fullsock), + FN(sk_fullsock), \ + FN(tcp_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2616,6 +2626,45 @@ struct bpf_sock { __u32 state; }; +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. 
+ */ +}; + struct bpf_sock_tuple { union { struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b755d55a3791..1b9496c41383 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -334,14 +334,16 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON; + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL; + type == PTR_TO_SOCK_COMMON_OR_NULL || + type == PTR_TO_TCP_SOCK_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -407,6 +409,8 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", [PTR_TO_SOCK_COMMON] = "sock_common", [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", }; static char slot_type_char[] = { @@ -1209,6 +1213,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return true; default: return false; @@ -1662,6 +1668,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_SOCKET: valid = bpf_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_TCP_SOCK: + valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -1823,6 +1832,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON: pointer_desc = "sock_common "; break; + case PTR_TO_TCP_SOCK: + pointer_desc = "tcp_sock "; + break; default: break; } @@ -3148,6 +3160,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* For mark_ptr_or_null_reg() */ regs[BPF_REG_0].id = ++env->id_gen; } + } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3409,6 +3425,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -4644,6 +4662,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCKET; } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { reg->type = PTR_TO_SOCK_COMMON; + } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { + reg->type = PTR_TO_TCP_SOCK; } if (is_null || !(reg_is_refcounted(reg) || reg_may_point_to_spin_lock(reg))) { @@ -5839,6 +5859,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6161,6 +6183,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case 
PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return false; default: return true; @@ -7166,6 +7190,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; default: continue; } diff --git a/net/core/filter.c b/net/core/filter.c index c0d7b9ef279f..353735575204 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5315,6 +5315,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_tcp_sock, bytes_received): + case offsetof(struct bpf_tcp_sock, bytes_acked): + return size == sizeof(__u64); + default: + return size == sizeof(__u32); + } +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct tcp_sock, FIELD)); \ + } while (0)

+ CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, + BPF_TCP_SOCK_GET_COMMON); + + if (insn > insn_buf) + return insn - insn_buf; + + switch (si->off) { + case offsetof(struct bpf_tcp_sock, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct tcp_sock, rtt_min) + + offsetof(struct minmax_sample, v)); + break; + } + + return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_tcp_sock_proto = { + .func = bpf_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5470,6 +5543,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif default: return sk_filter_func_proto(func_id, prog); } @@ -5560,6 +5637,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; #endif default: return bpf_base_func_proto(func_id); From 281f9e7572075879c0af4be792eaf31c17e7f894 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:25 -0800 Subject: [PATCH 12/36] bpf: Sync bpf.h to tools/ This patch syncs the uapi bpf.h to tools/.
Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 72 ++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 7 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1777fa0c61e4..25c8c0e62ecf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2329,6 +2329,23 @@ union bpf_attr { * "**y**". * Return * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in bpf_sock can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Return + * A **struct bpf_tcp_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2425,7 +2442,9 @@ union bpf_attr { FN(msg_pop_data), \ FN(rc_pointer_rel), \ FN(spin_lock), \ - FN(spin_unlock), + FN(spin_unlock), \ + FN(sk_fullsock), \ + FN(tcp_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2545,6 +2564,7 @@ struct __sk_buff { __u64 tstamp; __u32 wire_len; __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); }; struct bpf_tunnel_key { @@ -2596,14 +2616,52 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __u32 dst_port; /* network byte order */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. */ - __u32 src_port; /* Allows 4-byte read. - * Stored in host byte order + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. 
*/ }; From fb47d1d931f8419645db15ef5fc0dc7a857c8f4e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:26 -0800 Subject: [PATCH 13/36] bpf: Add skb->sk, bpf_sk_fullsock and bpf_tcp_sock tests to test_verifier This patch tests accessing skb->sk and the new helpers, bpf_sk_fullsock and bpf_tcp_sock. The errstr of some existing "reference tracking" tests is changed with s/bpf_sock/sock/ and s/socket/sock/ where "sock" is from the verifier's reg_type_str[]. Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_util.h | 9 + .../selftests/bpf/verifier/ref_tracking.c | 4 +- tools/testing/selftests/bpf/verifier/sock.c | 384 ++++++++++++++++++ tools/testing/selftests/bpf/verifier/unpriv.c | 2 +- 4 files changed, 396 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/verifier/sock.c diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h index 315a44fa32af..197347031038 100644 --- a/tools/testing/selftests/bpf/bpf_util.h +++ b/tools/testing/selftests/bpf/bpf_util.h @@ -48,4 +48,13 @@ static inline unsigned int bpf_num_possible_cpus(void) # define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif +#ifndef sizeof_field +#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) +#endif + +#ifndef offsetofend +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER)) +#endif + #endif /* __BPF_UTIL__ */ diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index dc2cc823df2b..3ed3593bd8b6 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -547,7 +547,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .errstr = "cannot write into socket", + .errstr = "cannot write into sock", .result = REJECT, }, { @@ -562,7 +562,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, - .errstr = "invalid bpf_sock access off=0 size=8", + .errstr = "invalid sock access off=0 size=8", .result = REJECT, }, { diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c new file mode 100644 index 000000000000..0ddfdf76aba5 --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -0,0 +1,384 @@ +{ + "skb->sk: no NULL check", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid mem access 'sock_common_or_null'", +}, +{ + "skb->sk: sk->family [non fullsock field]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_sock, family)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "skb->sk: sk->type [fullsock field]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_sock, type)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type
= BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid sock_common access", +}, +{ + "bpf_sk_fullsock(skb->sk): no !skb->sk check", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "type=sock_common_or_null expected=sock_common", +}, +{ + "sk_fullsock(skb->sk): no NULL check on ret", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid mem access 'sock_or_null'", +}, +{ + "sk_fullsock(skb->sk): sk->type [fullsock field]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->family [non fullsock field]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, family)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->state [narrow load]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, state)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->dst_port [narrow load]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->dst_port [load 2nd byte]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + 
BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid sock access", +}, +{ + "sk_fullsock(skb->sk): sk->dst_ip6 [load 2nd byte]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_ip6[0]) + 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->type [narrow load]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->protocol [narrow load]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, protocol)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): beyond last field", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, state)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid sock access", +}, +{ + "bpf_tcp_sock(skb->sk): no !skb->sk check", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_EMIT_CALL(BPF_FUNC_tcp_sock), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "type=sock_common_or_null expected=sock_common", +}, +{ + "bpf_tcp_sock(skb->sk): no NULL check on ret", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_tcp_sock), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, snd_cwnd)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid mem access 'tcp_sock_or_null'", +}, +{ + "bpf_tcp_sock(skb->sk): tp->snd_cwnd", + .insns = { + BPF_LDX_MEM(BPF_DW, 
BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_tcp_sock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, snd_cwnd)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "bpf_tcp_sock(skb->sk): tp->bytes_acked", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_tcp_sock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, bytes_acked)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "bpf_tcp_sock(skb->sk): beyond last field", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_tcp_sock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_tcp_sock, bytes_acked)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid tcp_sock access", +}, +{ + "bpf_tcp_sock(bpf_sk_fullsock(skb->sk)): tp->snd_cwnd", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_tcp_sock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, snd_cwnd)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "bpf_sk_release(skb->sk)", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "type=sock_common expected=sock", +}, +{ + "bpf_sk_release(bpf_sk_fullsock(skb->sk))", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "reference has not been acquired before", +}, +{ + "bpf_sk_release(bpf_tcp_sock(skb->sk))", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_tcp_sock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), + BPF_EXIT_INSN(), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + 
BPF_EMIT_CALL(BPF_FUNC_sk_release), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + .result = REJECT, + .errstr = "type=tcp_sock expected=sock", +}, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index 3e046695fad7..dbaf5be947b2 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -365,7 +365,7 @@ }, .result = REJECT, //.errstr = "same insn cannot be used with different pointers", - .errstr = "cannot write into socket", + .errstr = "cannot write into sock", .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { From e0b27b3f97b8fce620331baad563833617c1f303 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:28 -0800 Subject: [PATCH 14/36] bpf: Add test_sock_fields for skb->sk and bpf_tcp_sock This patch adds a C program to show the usage of skb->sk and bpf_tcp_sock. Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 6 +- tools/testing/selftests/bpf/bpf_helpers.h | 4 + .../testing/selftests/bpf/test_sock_fields.c | 327 ++++++++++++++++++ .../selftests/bpf/test_sock_fields_kern.c | 152 ++++++++ 4 files changed, 487 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/test_sock_fields.c create mode 100644 tools/testing/selftests/bpf/test_sock_fields_kern.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 383d2ff13fc7..c7e1e3255448 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -23,7 +23,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \ test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \ - test_netcnt test_tcpnotify_user + test_netcnt test_tcpnotify_user test_sock_fields BPF_OBJ_FILES = \ test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ @@ -35,7 +35,8 @@ BPF_OBJ_FILES = \ sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \ - xdp_dummy.o test_map_in_map.o test_spin_lock.o test_map_lock.o + xdp_dummy.o test_map_in_map.o test_spin_lock.o test_map_lock.o \ + test_sock_fields_kern.o # Objects are built with default compilation flags and with sub-register # code-gen enabled.
@@ -111,6 +112,7 @@ $(OUTPUT)/test_progs: trace_helpers.c $(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c $(OUTPUT)/test_cgroup_storage: cgroup_helpers.c $(OUTPUT)/test_netcnt: cgroup_helpers.c +$(OUTPUT)/test_sock_fields: cgroup_helpers.c .PHONY: force diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 6a0ce0f055c5..d9999f1ed1d2 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -176,6 +176,10 @@ static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) BPF_FUNC_spin_lock; static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) BPF_FUNC_spin_unlock; +static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_fullsock; +static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_tcp_sock; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/tools/testing/selftests/bpf/test_sock_fields.c b/tools/testing/selftests/bpf/test_sock_fields.c new file mode 100644 index 000000000000..9bb58369b481 --- /dev/null +++ b/tools/testing/selftests/bpf/test_sock_fields.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cgroup_helpers.h" + +enum bpf_array_idx { + SRV_IDX, + CLI_IDX, + __NR_BPF_ARRAY_IDX, +}; + +#define CHECK(condition, tag, format...) ({ \ + int __ret = !!(condition); \ + if (__ret) { \ + printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \ + printf(format); \ + printf("\n"); \ + exit(-1); \ + } \ +}) + +#define TEST_CGROUP "/test-bpf-sock-fields" +#define DATA "Hello BPF!" 
+#define DATA_LEN sizeof(DATA) + +static struct sockaddr_in6 srv_sa6, cli_sa6; +static int linum_map_fd; +static int addr_map_fd; +static int tp_map_fd; +static int sk_map_fd; +static __u32 srv_idx = SRV_IDX; +static __u32 cli_idx = CLI_IDX; + +static void init_loopback6(struct sockaddr_in6 *sa6) +{ + memset(sa6, 0, sizeof(*sa6)); + sa6->sin6_family = AF_INET6; + sa6->sin6_addr = in6addr_loopback; +} + +static void print_sk(const struct bpf_sock *sk) +{ + char src_ip4[24], dst_ip4[24]; + char src_ip6[64], dst_ip6[64]; + + inet_ntop(AF_INET, &sk->src_ip4, src_ip4, sizeof(src_ip4)); + inet_ntop(AF_INET6, &sk->src_ip6, src_ip6, sizeof(src_ip6)); + inet_ntop(AF_INET, &sk->dst_ip4, dst_ip4, sizeof(dst_ip4)); + inet_ntop(AF_INET6, &sk->dst_ip6, dst_ip6, sizeof(dst_ip6)); + + printf("state:%u bound_dev_if:%u family:%u type:%u protocol:%u mark:%u priority:%u " + "src_ip4:%x(%s) src_ip6:%x:%x:%x:%x(%s) src_port:%u " + "dst_ip4:%x(%s) dst_ip6:%x:%x:%x:%x(%s) dst_port:%u\n", + sk->state, sk->bound_dev_if, sk->family, sk->type, sk->protocol, + sk->mark, sk->priority, + sk->src_ip4, src_ip4, + sk->src_ip6[0], sk->src_ip6[1], sk->src_ip6[2], sk->src_ip6[3], + src_ip6, sk->src_port, + sk->dst_ip4, dst_ip4, + sk->dst_ip6[0], sk->dst_ip6[1], sk->dst_ip6[2], sk->dst_ip6[3], + dst_ip6, ntohs(sk->dst_port)); +} + +static void print_tp(const struct bpf_tcp_sock *tp) +{ + printf("snd_cwnd:%u srtt_us:%u rtt_min:%u snd_ssthresh:%u rcv_nxt:%u " + "snd_nxt:%u snd:una:%u mss_cache:%u ecn_flags:%u " + "rate_delivered:%u rate_interval_us:%u packets_out:%u " + "retrans_out:%u total_retrans:%u segs_in:%u data_segs_in:%u " + "segs_out:%u data_segs_out:%u lost_out:%u sacked_out:%u " + "bytes_received:%llu bytes_acked:%llu\n", + tp->snd_cwnd, tp->srtt_us, tp->rtt_min, tp->snd_ssthresh, + tp->rcv_nxt, tp->snd_nxt, tp->snd_una, tp->mss_cache, + tp->ecn_flags, tp->rate_delivered, tp->rate_interval_us, + tp->packets_out, tp->retrans_out, tp->total_retrans, + tp->segs_in, tp->data_segs_in, tp->segs_out, + tp->data_segs_out, tp->lost_out, tp->sacked_out, + tp->bytes_received, tp->bytes_acked); +} + +static void check_result(void) +{ + struct bpf_tcp_sock srv_tp, cli_tp; + struct bpf_sock srv_sk, cli_sk; + __u32 linum, idx0 = 0; + int err; + + err = bpf_map_lookup_elem(linum_map_fd, &idx0, &linum); + CHECK(err == -1, "bpf_map_lookup_elem(linum_map_fd)", + "err:%d errno:%d", err, errno); + + err = bpf_map_lookup_elem(sk_map_fd, &srv_idx, &srv_sk); + CHECK(err == -1, "bpf_map_lookup_elem(sk_map_fd, &srv_idx)", + "err:%d errno:%d", err, errno); + err = bpf_map_lookup_elem(tp_map_fd, &srv_idx, &srv_tp); + CHECK(err == -1, "bpf_map_lookup_elem(tp_map_fd, &srv_idx)", + "err:%d errno:%d", err, errno); + + err = bpf_map_lookup_elem(sk_map_fd, &cli_idx, &cli_sk); + CHECK(err == -1, "bpf_map_lookup_elem(sk_map_fd, &cli_idx)", + "err:%d errno:%d", err, errno); + err = bpf_map_lookup_elem(tp_map_fd, &cli_idx, &cli_tp); + CHECK(err == -1, "bpf_map_lookup_elem(tp_map_fd, &cli_idx)", + "err:%d errno:%d", err, errno); + + printf("srv_sk: "); + print_sk(&srv_sk); + printf("\n"); + + printf("cli_sk: "); + print_sk(&cli_sk); + printf("\n"); + + printf("srv_tp: "); + print_tp(&srv_tp); + printf("\n"); + + printf("cli_tp: "); + print_tp(&cli_tp); + printf("\n"); + + CHECK(srv_sk.state == 10 || + !srv_sk.state || + srv_sk.family != AF_INET6 || + srv_sk.protocol != IPPROTO_TCP || + memcmp(srv_sk.src_ip6, &in6addr_loopback, + sizeof(srv_sk.src_ip6)) || + memcmp(srv_sk.dst_ip6, &in6addr_loopback, + sizeof(srv_sk.dst_ip6)) || + srv_sk.src_port != 
ntohs(srv_sa6.sin6_port) || + srv_sk.dst_port != cli_sa6.sin6_port, + "Unexpected srv_sk", "Check srv_sk output. linum:%u", linum); + + CHECK(cli_sk.state == 10 || + !cli_sk.state || + cli_sk.family != AF_INET6 || + cli_sk.protocol != IPPROTO_TCP || + memcmp(cli_sk.src_ip6, &in6addr_loopback, + sizeof(cli_sk.src_ip6)) || + memcmp(cli_sk.dst_ip6, &in6addr_loopback, + sizeof(cli_sk.dst_ip6)) || + cli_sk.src_port != ntohs(cli_sa6.sin6_port) || + cli_sk.dst_port != srv_sa6.sin6_port, + "Unexpected cli_sk", "Check cli_sk output. linum:%u", linum); + + CHECK(srv_tp.data_segs_out != 1 || + srv_tp.data_segs_in || + srv_tp.snd_cwnd != 10 || + srv_tp.total_retrans || + srv_tp.bytes_acked != DATA_LEN, + "Unexpected srv_tp", "Check srv_tp output. linum:%u", linum); + + CHECK(cli_tp.data_segs_out || + cli_tp.data_segs_in != 1 || + cli_tp.snd_cwnd != 10 || + cli_tp.total_retrans || + cli_tp.bytes_received != DATA_LEN, + "Unexpected cli_tp", "Check cli_tp output. linum:%u", linum); +} + +static void test(void) +{ + int listen_fd, cli_fd, accept_fd, epfd, err; + struct epoll_event ev; + socklen_t addrlen; + + addrlen = sizeof(struct sockaddr_in6); + ev.events = EPOLLIN; + + epfd = epoll_create(1); + CHECK(epfd == -1, "epoll_create()", "epfd:%d errno:%d", epfd, errno); + + /* Prepare listen_fd */ + listen_fd = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0); + CHECK(listen_fd == -1, "socket()", "listen_fd:%d errno:%d", + listen_fd, errno); + + init_loopback6(&srv_sa6); + err = bind(listen_fd, (struct sockaddr *)&srv_sa6, sizeof(srv_sa6)); + CHECK(err, "bind(listen_fd)", "err:%d errno:%d", err, errno); + + err = getsockname(listen_fd, (struct sockaddr *)&srv_sa6, &addrlen); + CHECK(err, "getsockname(listen_fd)", "err:%d errno:%d", err, errno); + + err = listen(listen_fd, 1); + CHECK(err, "listen(listen_fd)", "err:%d errno:%d", err, errno); + + /* Prepare cli_fd */ + cli_fd = socket(AF_INET6, SOCK_STREAM | SOCK_NONBLOCK, 0); + CHECK(cli_fd == -1, "socket()", "cli_fd:%d errno:%d", cli_fd, errno); + + init_loopback6(&cli_sa6); + err = bind(cli_fd, (struct sockaddr *)&cli_sa6, sizeof(cli_sa6)); + CHECK(err, "bind(cli_fd)", "err:%d errno:%d", err, errno); + + err = getsockname(cli_fd, (struct sockaddr *)&cli_sa6, &addrlen); + CHECK(err, "getsockname(cli_fd)", "err:%d errno:%d", + err, errno); + + /* Update addr_map with srv_sa6 and cli_sa6 */ + err = bpf_map_update_elem(addr_map_fd, &srv_idx, &srv_sa6, 0); + CHECK(err, "map_update", "err:%d errno:%d", err, errno); + + err = bpf_map_update_elem(addr_map_fd, &cli_idx, &cli_sa6, 0); + CHECK(err, "map_update", "err:%d errno:%d", err, errno); + + /* Connect from cli_sa6 to srv_sa6 */ + err = connect(cli_fd, (struct sockaddr *)&srv_sa6, addrlen); + printf("srv_sa6.sin6_port:%u cli_sa6.sin6_port:%u\n\n", + ntohs(srv_sa6.sin6_port), ntohs(cli_sa6.sin6_port)); + CHECK(err && errno != EINPROGRESS, + "connect(cli_fd)", "err:%d errno:%d", err, errno); + + ev.data.fd = listen_fd; + err = epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev); + CHECK(err, "epoll_ctl(EPOLL_CTL_ADD, listen_fd)", "err:%d errno:%d", + err, errno); + + /* Accept the connection */ + /* Have some timeout in accept(listen_fd). Just in case. 
*/ + err = epoll_wait(epfd, &ev, 1, 1000); + CHECK(err != 1 || ev.data.fd != listen_fd, + "epoll_wait(listen_fd)", + "err:%d errno:%d ev.data.fd:%d listen_fd:%d", + err, errno, ev.data.fd, listen_fd); + + accept_fd = accept(listen_fd, NULL, NULL); + CHECK(accept_fd == -1, "accept(listen_fd)", "accept_fd:%d errno:%d", + accept_fd, errno); + close(listen_fd); + + /* Send some data from accept_fd to cli_fd */ + err = send(accept_fd, DATA, DATA_LEN, 0); + CHECK(err != DATA_LEN, "send(accept_fd)", "err:%d errno:%d", + err, errno); + + /* Have some timeout in recv(cli_fd). Just in case. */ + ev.data.fd = cli_fd; + err = epoll_ctl(epfd, EPOLL_CTL_ADD, cli_fd, &ev); + CHECK(err, "epoll_ctl(EPOLL_CTL_ADD, cli_fd)", "err:%d errno:%d", + err, errno); + + err = epoll_wait(epfd, &ev, 1, 1000); + CHECK(err != 1 || ev.data.fd != cli_fd, + "epoll_wait(cli_fd)", "err:%d errno:%d ev.data.fd:%d cli_fd:%d", + err, errno, ev.data.fd, cli_fd); + + err = recv(cli_fd, NULL, 0, MSG_TRUNC); + CHECK(err, "recv(cli_fd)", "err:%d errno:%d", err, errno); + + close(epfd); + close(accept_fd); + close(cli_fd); + + check_result(); +} + +int main(int argc, char **argv) +{ + struct bpf_prog_load_attr attr = { + .file = "test_sock_fields_kern.o", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .expected_attach_type = BPF_CGROUP_INET_EGRESS, + }; + int cgroup_fd, prog_fd, err; + struct bpf_object *obj; + struct bpf_map *map; + + err = setup_cgroup_environment(); + CHECK(err, "setup_cgroup_environment()", "err:%d errno:%d", + err, errno); + + atexit(cleanup_cgroup_environment); + + /* Create a cgroup, get fd, and join it */ + cgroup_fd = create_and_get_cgroup(TEST_CGROUP); + CHECK(cgroup_fd == -1, "create_and_get_cgroup()", + "cgroup_fd:%d errno:%d", cgroup_fd, errno); + + err = join_cgroup(TEST_CGROUP); + CHECK(err, "join_cgroup", "err:%d errno:%d", err, errno); + + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + CHECK(err, "bpf_prog_load_xattr()", "err:%d", err); + + err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0); + CHECK(err == -1, "bpf_prog_attach(CPF_CGROUP_INET_EGRESS)", + "err:%d errno%d", err, errno); + close(cgroup_fd); + + map = bpf_object__find_map_by_name(obj, "addr_map"); + CHECK(!map, "cannot find addr_map", "(null)"); + addr_map_fd = bpf_map__fd(map); + + map = bpf_object__find_map_by_name(obj, "sock_result_map"); + CHECK(!map, "cannot find sock_result_map", "(null)"); + sk_map_fd = bpf_map__fd(map); + + map = bpf_object__find_map_by_name(obj, "tcp_sock_result_map"); + CHECK(!map, "cannot find tcp_sock_result_map", "(null)"); + tp_map_fd = bpf_map__fd(map); + + map = bpf_object__find_map_by_name(obj, "linum_map"); + CHECK(!map, "cannot find linum_map", "(null)"); + linum_map_fd = bpf_map__fd(map); + + test(); + + bpf_object__close(obj); + cleanup_cgroup_environment(); + + printf("PASS\n"); + + return 0; +} diff --git a/tools/testing/selftests/bpf/test_sock_fields_kern.c b/tools/testing/selftests/bpf/test_sock_fields_kern.c new file mode 100644 index 000000000000..de1a43e8f610 --- /dev/null +++ b/tools/testing/selftests/bpf/test_sock_fields_kern.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include +#include +#include + +#include "bpf_helpers.h" +#include "bpf_endian.h" + +enum bpf_array_idx { + SRV_IDX, + CLI_IDX, + __NR_BPF_ARRAY_IDX, +}; + +struct bpf_map_def SEC("maps") addr_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct sockaddr_in6), + .max_entries = __NR_BPF_ARRAY_IDX, +}; + +struct 
bpf_map_def SEC("maps") sock_result_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct bpf_sock), + .max_entries = __NR_BPF_ARRAY_IDX, +}; + +struct bpf_map_def SEC("maps") tcp_sock_result_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct bpf_tcp_sock), + .max_entries = __NR_BPF_ARRAY_IDX, +}; + +struct bpf_map_def SEC("maps") linum_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 1, +}; + +static bool is_loopback6(__u32 *a6) +{ + return !a6[0] && !a6[1] && !a6[2] && a6[3] == bpf_htonl(1); +} + +static void skcpy(struct bpf_sock *dst, + const struct bpf_sock *src) +{ + dst->bound_dev_if = src->bound_dev_if; + dst->family = src->family; + dst->type = src->type; + dst->protocol = src->protocol; + dst->mark = src->mark; + dst->priority = src->priority; + dst->src_ip4 = src->src_ip4; + dst->src_ip6[0] = src->src_ip6[0]; + dst->src_ip6[1] = src->src_ip6[1]; + dst->src_ip6[2] = src->src_ip6[2]; + dst->src_ip6[3] = src->src_ip6[3]; + dst->src_port = src->src_port; + dst->dst_ip4 = src->dst_ip4; + dst->dst_ip6[0] = src->dst_ip6[0]; + dst->dst_ip6[1] = src->dst_ip6[1]; + dst->dst_ip6[2] = src->dst_ip6[2]; + dst->dst_ip6[3] = src->dst_ip6[3]; + dst->dst_port = src->dst_port; + dst->state = src->state; +} + +static void tpcpy(struct bpf_tcp_sock *dst, + const struct bpf_tcp_sock *src) +{ + dst->snd_cwnd = src->snd_cwnd; + dst->srtt_us = src->srtt_us; + dst->rtt_min = src->rtt_min; + dst->snd_ssthresh = src->snd_ssthresh; + dst->rcv_nxt = src->rcv_nxt; + dst->snd_nxt = src->snd_nxt; + dst->snd_una = src->snd_una; + dst->mss_cache = src->mss_cache; + dst->ecn_flags = src->ecn_flags; + dst->rate_delivered = src->rate_delivered; + dst->rate_interval_us = src->rate_interval_us; + dst->packets_out = src->packets_out; + dst->retrans_out = src->retrans_out; + dst->total_retrans = src->total_retrans; + dst->segs_in = src->segs_in; + dst->data_segs_in = src->data_segs_in; + dst->segs_out = src->segs_out; + dst->data_segs_out = src->data_segs_out; + dst->lost_out = src->lost_out; + dst->sacked_out = src->sacked_out; + dst->bytes_received = src->bytes_received; + dst->bytes_acked = src->bytes_acked; +} + +#define RETURN { \ + linum = __LINE__; \ + bpf_map_update_elem(&linum_map, &idx0, &linum, 0); \ + return 1; \ +} + +SEC("cgroup_skb/egress") +int read_sock_fields(struct __sk_buff *skb) +{ + __u32 srv_idx = SRV_IDX, cli_idx = CLI_IDX, idx; + struct sockaddr_in6 *srv_sa6, *cli_sa6; + struct bpf_tcp_sock *tp, *tp_ret; + struct bpf_sock *sk, *sk_ret; + __u32 linum, idx0 = 0; + + sk = skb->sk; + if (!sk || sk->state == 10) + RETURN; + + sk = bpf_sk_fullsock(sk); + if (!sk || sk->family != AF_INET6 || sk->protocol != IPPROTO_TCP || + !is_loopback6(sk->src_ip6)) + RETURN; + + tp = bpf_tcp_sock(sk); + if (!tp) + RETURN; + + srv_sa6 = bpf_map_lookup_elem(&addr_map, &srv_idx); + cli_sa6 = bpf_map_lookup_elem(&addr_map, &cli_idx); + if (!srv_sa6 || !cli_sa6) + RETURN; + + if (sk->src_port == bpf_ntohs(srv_sa6->sin6_port)) + idx = srv_idx; + else if (sk->src_port == bpf_ntohs(cli_sa6->sin6_port)) + idx = cli_idx; + else + RETURN; + + sk_ret = bpf_map_lookup_elem(&sock_result_map, &idx); + tp_ret = bpf_map_lookup_elem(&tcp_sock_result_map, &idx); + if (!sk_ret || !tp_ret) + RETURN; + + skcpy(sk_ret, sk); + tpcpy(tp_ret, tp); + + RETURN; +} + +char _license[] SEC("license") = "GPL"; From 1727a9dce6775a4ea77a611e9ea51cac1d093519 Mon Sep 17 00:00:00 2001 From: Jiong 
Wang Date: Mon, 11 Feb 2019 12:01:18 +0000 Subject: [PATCH 15/36] selftests: bpf: add "alu32" to .gitignore "alu32" is a build dir and contains various files for BPF sub-register code-gen testing. This patch tells git to ignore it. Suggested-by: Yonghong Song Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index dd093bd91aa9..e47168d1257d 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -29,3 +29,4 @@ test_netcnt test_section_names test_tcpnotify_user test_libbpf +alu32 From 4836b4637ef080c2764c44ee40ed354cdb991d79 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Mon, 11 Feb 2019 12:01:19 +0000 Subject: [PATCH 16/36] selftests: bpf: extend sub-register mode compilation to all bpf object files At the moment, we only do extra sub-register mode compilation on bpf object files used by "test_progs". These object files are actually loaded and executed. This patch further extends sub-register mode compilation to all bpf object files, even those without corresponding runtime tests. This helps test LLVM sub-register code-gen: the kernel bpf selftests have many more C testcases of reasonable size and complexity than the LLVM testsuite, which only contains unit tests. There was some file duplication inside BPF_OBJ_FILES_DUAL_COMPILE, which is removed now. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c7e1e3255448..f2c11474f762 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -36,20 +36,14 @@ BPF_OBJ_FILES = \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \ xdp_dummy.o test_map_in_map.o test_spin_lock.o test_map_lock.o \ - test_sock_fields_kern.o - -# Objects are built with default compilation flags and with sub-register -# code-gen enabled.
-BPF_OBJ_FILES_DUAL_COMPILE = \ - test_pkt_access.o test_pkt_access.o test_xdp.o test_adjust_tail.o \ - test_l4lb.o test_l4lb_noinline.o test_xdp_noinline.o test_tcp_estats.o \ + test_pkt_access.o test_xdp.o test_adjust_tail.o test_l4lb.o \ + test_l4lb_noinline.o test_xdp_noinline.o test_tcp_estats.o \ test_obj_id.o test_pkt_md_access.o test_tracepoint.o \ - test_stacktrace_map.o test_stacktrace_map.o test_stacktrace_build_id.o \ - test_stacktrace_build_id.o test_get_stack_rawtp.o \ - test_get_stack_rawtp.o test_tracepoint.o test_sk_lookup_kern.o \ - test_queue_map.o test_stack_map.o + test_stacktrace_map.o test_stacktrace_build_id.o \ + test_get_stack_rawtp.o test_sk_lookup_kern.o test_queue_map.o \ + test_stack_map.o test_sock_fields_kern.o -TEST_GEN_FILES = $(BPF_OBJ_FILES) $(BPF_OBJ_FILES_DUAL_COMPILE) +TEST_GEN_FILES = $(BPF_OBJ_FILES) # Also test sub-register code-gen if LLVM + kernel both has eBPF v3 processor # support which is the first version to contain both ALU32 and JMP32 # instructions. @@ -59,7 +53,7 @@ SUBREG_CODEGEN := $(shell echo "int cal(int a) { return a > 0; }" | \ $(LLC) -mattr=+alu32 -mcpu=probe 2>&1 | \ grep 'if w') ifneq ($(SUBREG_CODEGEN),) -TEST_GEN_FILES += $(patsubst %.o,alu32/%.o, $(BPF_OBJ_FILES_DUAL_COMPILE)) +TEST_GEN_FILES += $(patsubst %.o,alu32/%.o, $(BPF_OBJ_FILES)) endif # Order correspond to 'make run_tests' order From bd4aed0ee73ca873bef3cb3ec746dd796f03df28 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Mon, 11 Feb 2019 12:01:20 +0000 Subject: [PATCH 17/36] selftests: bpf: centre kernel bpf objects under new subdir "progs" At the moment, all kernel bpf objects are listed under BPF_OBJ_FILES. Listing them manually sometimes causes patch conflicts when people add new testcases simultaneously. It is better to centre all the related source files under a subdir "progs", and then auto-generate the object file list.
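For illustration, with the auto-generated list a new test is simply a C file dropped under progs/; no Makefile edit is needed. A minimal skeleton might look like the sketch below (the file, section, and program names are hypothetical, not part of this patch):

/* progs/test_example.c (hypothetical): picked up automatically by the
 * $(wildcard progs/*.c) rule introduced above.
 */
#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("xdp_noop")
int xdp_noop_prog(struct xdp_md *ctx)
{
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";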
Suggested-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 26 ++++--------------- .../selftests/bpf/{ => progs}/bpf_flow.c | 0 .../selftests/bpf/{ => progs}/connect4_prog.c | 0 .../selftests/bpf/{ => progs}/connect6_prog.c | 0 .../selftests/bpf/{ => progs}/dev_cgroup.c | 0 .../bpf/{ => progs}/get_cgroup_id_kern.c | 0 .../selftests/bpf/{ => progs}/netcnt_prog.c | 0 .../bpf/{ => progs}/sample_map_ret0.c | 0 .../selftests/bpf/{ => progs}/sample_ret0.c | 0 .../selftests/bpf/{ => progs}/sendmsg4_prog.c | 0 .../selftests/bpf/{ => progs}/sendmsg6_prog.c | 0 .../bpf/{ => progs}/socket_cookie_prog.c | 0 .../bpf/{ => progs}/sockmap_parse_prog.c | 0 .../bpf/{ => progs}/sockmap_tcp_msg_prog.c | 0 .../bpf/{ => progs}/sockmap_verdict_prog.c | 0 .../bpf/{ => progs}/test_adjust_tail.c | 0 .../bpf/{ => progs}/test_btf_haskv.c | 0 .../selftests/bpf/{ => progs}/test_btf_nokv.c | 0 .../bpf/{ => progs}/test_get_stack_rawtp.c | 0 .../selftests/bpf/{ => progs}/test_l4lb.c | 0 .../bpf/{ => progs}/test_l4lb_noinline.c | 0 .../bpf/{ => progs}/test_lirc_mode2_kern.c | 0 .../bpf/{ => progs}/test_lwt_seg6local.c | 0 .../bpf/{ => progs}/test_map_in_map.c | 0 .../selftests/bpf/{ => progs}/test_map_lock.c | 0 .../selftests/bpf/{ => progs}/test_obj_id.c | 0 .../bpf/{ => progs}/test_pkt_access.c | 0 .../bpf/{ => progs}/test_pkt_md_access.c | 0 .../bpf/{ => progs}/test_queue_map.c | 0 .../{ => progs}/test_select_reuseport_kern.c | 0 .../bpf/{ => progs}/test_sk_lookup_kern.c | 0 .../bpf/{ => progs}/test_skb_cgroup_id_kern.c | 0 .../bpf/{ => progs}/test_sock_fields_kern.c | 0 .../bpf/{ => progs}/test_sockhash_kern.c | 0 .../bpf/{ => progs}/test_sockmap_kern.c | 0 .../bpf/{ => progs}/test_spin_lock.c | 0 .../bpf/{ => progs}/test_stack_map.c | 0 .../{ => progs}/test_stacktrace_build_id.c | 0 .../bpf/{ => progs}/test_stacktrace_map.c | 0 .../bpf/{ => progs}/test_tcp_estats.c | 0 .../bpf/{ => progs}/test_tcpbpf_kern.c | 0 .../bpf/{ => progs}/test_tcpnotify_kern.c | 0 .../bpf/{ => progs}/test_tracepoint.c | 0 .../bpf/{ => progs}/test_tunnel_kern.c | 0 .../selftests/bpf/{ => progs}/test_xdp.c | 0 .../selftests/bpf/{ => progs}/test_xdp_meta.c | 0 .../bpf/{ => progs}/test_xdp_noinline.c | 0 .../bpf/{ => progs}/test_xdp_redirect.c | 0 .../selftests/bpf/{ => progs}/test_xdp_vlan.c | 0 .../selftests/bpf/{ => progs}/xdp_dummy.c | 0 50 files changed, 5 insertions(+), 21 deletions(-) rename tools/testing/selftests/bpf/{ => progs}/bpf_flow.c (100%) rename tools/testing/selftests/bpf/{ => progs}/connect4_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/connect6_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/dev_cgroup.c (100%) rename tools/testing/selftests/bpf/{ => progs}/get_cgroup_id_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/netcnt_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/sample_map_ret0.c (100%) rename tools/testing/selftests/bpf/{ => progs}/sample_ret0.c (100%) rename tools/testing/selftests/bpf/{ => progs}/sendmsg4_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/sendmsg6_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/socket_cookie_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/sockmap_parse_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/sockmap_tcp_msg_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/sockmap_verdict_prog.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_adjust_tail.c (100%) 
rename tools/testing/selftests/bpf/{ => progs}/test_btf_haskv.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_btf_nokv.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_get_stack_rawtp.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_l4lb.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_l4lb_noinline.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_lirc_mode2_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_lwt_seg6local.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_map_in_map.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_map_lock.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_obj_id.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_pkt_access.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_pkt_md_access.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_queue_map.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_select_reuseport_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_sk_lookup_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_skb_cgroup_id_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_sock_fields_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_sockhash_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_sockmap_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_spin_lock.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_stack_map.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_stacktrace_build_id.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_stacktrace_map.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_tcp_estats.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_tcpbpf_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_tcpnotify_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_tracepoint.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_tunnel_kern.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_xdp.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_xdp_meta.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_xdp_noinline.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_xdp_redirect.c (100%) rename tools/testing/selftests/bpf/{ => progs}/test_xdp_vlan.c (100%) rename tools/testing/selftests/bpf/{ => progs}/xdp_dummy.c (100%) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f2c11474f762..575746e63544 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -25,24 +25,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \ test_netcnt test_tcpnotify_user test_sock_fields -BPF_OBJ_FILES = \ - test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ - sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o \ - test_tcpnotify_kern.o sample_map_ret0.o test_tcpbpf_kern.o \ - sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o \ - test_btf_haskv.o test_btf_nokv.o test_sockmap_kern.o \ - test_tunnel_kern.o test_sockhash_kern.o test_lwt_seg6local.o \ - sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ - get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_xdp_vlan.o \ - xdp_dummy.o test_map_in_map.o 
test_spin_lock.o test_map_lock.o \ - test_pkt_access.o test_xdp.o test_adjust_tail.o test_l4lb.o \ - test_l4lb_noinline.o test_xdp_noinline.o test_tcp_estats.o \ - test_obj_id.o test_pkt_md_access.o test_tracepoint.o \ - test_stacktrace_map.o test_stacktrace_build_id.o \ - test_get_stack_rawtp.o test_sk_lookup_kern.o test_queue_map.o \ - test_stack_map.o test_sock_fields_kern.o - +BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c))) TEST_GEN_FILES = $(BPF_OBJ_FILES) # Also test sub-register code-gen if LLVM + kernel both has eBPF v3 processor @@ -184,7 +167,8 @@ $(ALU32_BUILD_DIR)/test_progs_32: test_progs.c $(ALU32_BUILD_DIR) \ $(CC) $(CFLAGS) -o $(ALU32_BUILD_DIR)/test_progs_32 $< \ trace_helpers.c $(OUTPUT)/libbpf.a $(LDLIBS) -$(ALU32_BUILD_DIR)/%.o: %.c $(ALU32_BUILD_DIR) $(ALU32_BUILD_DIR)/test_progs_32 +$(ALU32_BUILD_DIR)/%.o: progs/%.c $(ALU32_BUILD_DIR) \ + $(ALU32_BUILD_DIR)/test_progs_32 $(CLANG) $(CLANG_FLAGS) \ -O2 -target bpf -emit-llvm -c $< -o - | \ $(LLC) -march=bpf -mattr=+alu32 -mcpu=$(CPU) $(LLC_FLAGS) \ @@ -196,7 +180,7 @@ endif # Have one program compiled without "-target bpf" to test whether libbpf loads # it successfully -$(OUTPUT)/test_xdp.o: test_xdp.c +$(OUTPUT)/test_xdp.o: progs/test_xdp.c $(CLANG) $(CLANG_FLAGS) \ -O2 -emit-llvm -c $< -o - | \ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ @@ -204,7 +188,7 @@ ifeq ($(DWARF2BTF),y) $(BTF_PAHOLE) -J $@ endif -$(OUTPUT)/%.o: %.c +$(OUTPUT)/%.o: progs/%.c $(CLANG) $(CLANG_FLAGS) \ -O2 -target bpf -emit-llvm -c $< -o - | \ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@ diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c similarity index 100% rename from tools/testing/selftests/bpf/bpf_flow.c rename to tools/testing/selftests/bpf/progs/bpf_flow.c diff --git a/tools/testing/selftests/bpf/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c similarity index 100% rename from tools/testing/selftests/bpf/connect4_prog.c rename to tools/testing/selftests/bpf/progs/connect4_prog.c diff --git a/tools/testing/selftests/bpf/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c similarity index 100% rename from tools/testing/selftests/bpf/connect6_prog.c rename to tools/testing/selftests/bpf/progs/connect6_prog.c diff --git a/tools/testing/selftests/bpf/dev_cgroup.c b/tools/testing/selftests/bpf/progs/dev_cgroup.c similarity index 100% rename from tools/testing/selftests/bpf/dev_cgroup.c rename to tools/testing/selftests/bpf/progs/dev_cgroup.c diff --git a/tools/testing/selftests/bpf/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c similarity index 100% rename from tools/testing/selftests/bpf/get_cgroup_id_kern.c rename to tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c diff --git a/tools/testing/selftests/bpf/netcnt_prog.c b/tools/testing/selftests/bpf/progs/netcnt_prog.c similarity index 100% rename from tools/testing/selftests/bpf/netcnt_prog.c rename to tools/testing/selftests/bpf/progs/netcnt_prog.c diff --git a/tools/testing/selftests/bpf/sample_map_ret0.c b/tools/testing/selftests/bpf/progs/sample_map_ret0.c similarity index 100% rename from tools/testing/selftests/bpf/sample_map_ret0.c rename to tools/testing/selftests/bpf/progs/sample_map_ret0.c diff --git a/tools/testing/selftests/bpf/sample_ret0.c b/tools/testing/selftests/bpf/progs/sample_ret0.c similarity index 100% rename from tools/testing/selftests/bpf/sample_ret0.c rename to 
tools/testing/selftests/bpf/progs/sample_ret0.c diff --git a/tools/testing/selftests/bpf/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c similarity index 100% rename from tools/testing/selftests/bpf/sendmsg4_prog.c rename to tools/testing/selftests/bpf/progs/sendmsg4_prog.c diff --git a/tools/testing/selftests/bpf/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c similarity index 100% rename from tools/testing/selftests/bpf/sendmsg6_prog.c rename to tools/testing/selftests/bpf/progs/sendmsg6_prog.c diff --git a/tools/testing/selftests/bpf/socket_cookie_prog.c b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c similarity index 100% rename from tools/testing/selftests/bpf/socket_cookie_prog.c rename to tools/testing/selftests/bpf/progs/socket_cookie_prog.c diff --git a/tools/testing/selftests/bpf/sockmap_parse_prog.c b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c similarity index 100% rename from tools/testing/selftests/bpf/sockmap_parse_prog.c rename to tools/testing/selftests/bpf/progs/sockmap_parse_prog.c diff --git a/tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c similarity index 100% rename from tools/testing/selftests/bpf/sockmap_tcp_msg_prog.c rename to tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c diff --git a/tools/testing/selftests/bpf/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c similarity index 100% rename from tools/testing/selftests/bpf/sockmap_verdict_prog.c rename to tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c diff --git a/tools/testing/selftests/bpf/test_adjust_tail.c b/tools/testing/selftests/bpf/progs/test_adjust_tail.c similarity index 100% rename from tools/testing/selftests/bpf/test_adjust_tail.c rename to tools/testing/selftests/bpf/progs/test_adjust_tail.c diff --git a/tools/testing/selftests/bpf/test_btf_haskv.c b/tools/testing/selftests/bpf/progs/test_btf_haskv.c similarity index 100% rename from tools/testing/selftests/bpf/test_btf_haskv.c rename to tools/testing/selftests/bpf/progs/test_btf_haskv.c diff --git a/tools/testing/selftests/bpf/test_btf_nokv.c b/tools/testing/selftests/bpf/progs/test_btf_nokv.c similarity index 100% rename from tools/testing/selftests/bpf/test_btf_nokv.c rename to tools/testing/selftests/bpf/progs/test_btf_nokv.c diff --git a/tools/testing/selftests/bpf/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c similarity index 100% rename from tools/testing/selftests/bpf/test_get_stack_rawtp.c rename to tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c diff --git a/tools/testing/selftests/bpf/test_l4lb.c b/tools/testing/selftests/bpf/progs/test_l4lb.c similarity index 100% rename from tools/testing/selftests/bpf/test_l4lb.c rename to tools/testing/selftests/bpf/progs/test_l4lb.c diff --git a/tools/testing/selftests/bpf/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c similarity index 100% rename from tools/testing/selftests/bpf/test_l4lb_noinline.c rename to tools/testing/selftests/bpf/progs/test_l4lb_noinline.c diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_lirc_mode2_kern.c rename to tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.c b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c 
similarity index 100% rename from tools/testing/selftests/bpf/test_lwt_seg6local.c rename to tools/testing/selftests/bpf/progs/test_lwt_seg6local.c diff --git a/tools/testing/selftests/bpf/test_map_in_map.c b/tools/testing/selftests/bpf/progs/test_map_in_map.c similarity index 100% rename from tools/testing/selftests/bpf/test_map_in_map.c rename to tools/testing/selftests/bpf/progs/test_map_in_map.c diff --git a/tools/testing/selftests/bpf/test_map_lock.c b/tools/testing/selftests/bpf/progs/test_map_lock.c similarity index 100% rename from tools/testing/selftests/bpf/test_map_lock.c rename to tools/testing/selftests/bpf/progs/test_map_lock.c diff --git a/tools/testing/selftests/bpf/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c similarity index 100% rename from tools/testing/selftests/bpf/test_obj_id.c rename to tools/testing/selftests/bpf/progs/test_obj_id.c diff --git a/tools/testing/selftests/bpf/test_pkt_access.c b/tools/testing/selftests/bpf/progs/test_pkt_access.c similarity index 100% rename from tools/testing/selftests/bpf/test_pkt_access.c rename to tools/testing/selftests/bpf/progs/test_pkt_access.c diff --git a/tools/testing/selftests/bpf/test_pkt_md_access.c b/tools/testing/selftests/bpf/progs/test_pkt_md_access.c similarity index 100% rename from tools/testing/selftests/bpf/test_pkt_md_access.c rename to tools/testing/selftests/bpf/progs/test_pkt_md_access.c diff --git a/tools/testing/selftests/bpf/test_queue_map.c b/tools/testing/selftests/bpf/progs/test_queue_map.c similarity index 100% rename from tools/testing/selftests/bpf/test_queue_map.c rename to tools/testing/selftests/bpf/progs/test_queue_map.c diff --git a/tools/testing/selftests/bpf/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_select_reuseport_kern.c rename to tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c diff --git a/tools/testing/selftests/bpf/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_sk_lookup_kern.c rename to tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_skb_cgroup_id_kern.c rename to tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c diff --git a/tools/testing/selftests/bpf/test_sock_fields_kern.c b/tools/testing/selftests/bpf/progs/test_sock_fields_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_sock_fields_kern.c rename to tools/testing/selftests/bpf/progs/test_sock_fields_kern.c diff --git a/tools/testing/selftests/bpf/test_sockhash_kern.c b/tools/testing/selftests/bpf/progs/test_sockhash_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_sockhash_kern.c rename to tools/testing/selftests/bpf/progs/test_sockhash_kern.c diff --git a/tools/testing/selftests/bpf/test_sockmap_kern.c b/tools/testing/selftests/bpf/progs/test_sockmap_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_sockmap_kern.c rename to tools/testing/selftests/bpf/progs/test_sockmap_kern.c diff --git a/tools/testing/selftests/bpf/test_spin_lock.c b/tools/testing/selftests/bpf/progs/test_spin_lock.c similarity index 100% rename from tools/testing/selftests/bpf/test_spin_lock.c rename to 
tools/testing/selftests/bpf/progs/test_spin_lock.c diff --git a/tools/testing/selftests/bpf/test_stack_map.c b/tools/testing/selftests/bpf/progs/test_stack_map.c similarity index 100% rename from tools/testing/selftests/bpf/test_stack_map.c rename to tools/testing/selftests/bpf/progs/test_stack_map.c diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c similarity index 100% rename from tools/testing/selftests/bpf/test_stacktrace_build_id.c rename to tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c similarity index 100% rename from tools/testing/selftests/bpf/test_stacktrace_map.c rename to tools/testing/selftests/bpf/progs/test_stacktrace_map.c diff --git a/tools/testing/selftests/bpf/test_tcp_estats.c b/tools/testing/selftests/bpf/progs/test_tcp_estats.c similarity index 100% rename from tools/testing/selftests/bpf/test_tcp_estats.c rename to tools/testing/selftests/bpf/progs/test_tcp_estats.c diff --git a/tools/testing/selftests/bpf/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_tcpbpf_kern.c rename to tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c diff --git a/tools/testing/selftests/bpf/test_tcpnotify_kern.c b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_tcpnotify_kern.c rename to tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c diff --git a/tools/testing/selftests/bpf/test_tracepoint.c b/tools/testing/selftests/bpf/progs/test_tracepoint.c similarity index 100% rename from tools/testing/selftests/bpf/test_tracepoint.c rename to tools/testing/selftests/bpf/progs/test_tracepoint.c diff --git a/tools/testing/selftests/bpf/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c similarity index 100% rename from tools/testing/selftests/bpf/test_tunnel_kern.c rename to tools/testing/selftests/bpf/progs/test_tunnel_kern.c diff --git a/tools/testing/selftests/bpf/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c similarity index 100% rename from tools/testing/selftests/bpf/test_xdp.c rename to tools/testing/selftests/bpf/progs/test_xdp.c diff --git a/tools/testing/selftests/bpf/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c similarity index 100% rename from tools/testing/selftests/bpf/test_xdp_meta.c rename to tools/testing/selftests/bpf/progs/test_xdp_meta.c diff --git a/tools/testing/selftests/bpf/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c similarity index 100% rename from tools/testing/selftests/bpf/test_xdp_noinline.c rename to tools/testing/selftests/bpf/progs/test_xdp_noinline.c diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.c b/tools/testing/selftests/bpf/progs/test_xdp_redirect.c similarity index 100% rename from tools/testing/selftests/bpf/test_xdp_redirect.c rename to tools/testing/selftests/bpf/progs/test_xdp_redirect.c diff --git a/tools/testing/selftests/bpf/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c similarity index 100% rename from tools/testing/selftests/bpf/test_xdp_vlan.c rename to tools/testing/selftests/bpf/progs/test_xdp_vlan.c diff --git a/tools/testing/selftests/bpf/xdp_dummy.c b/tools/testing/selftests/bpf/progs/xdp_dummy.c similarity index 100% rename from 
tools/testing/selftests/bpf/xdp_dummy.c rename to tools/testing/selftests/bpf/progs/xdp_dummy.c From 64e39ee2c84bd8eac1bd3ea370c4ada5163befac Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Mon, 11 Feb 2019 12:01:21 +0000 Subject: [PATCH 18/36] selftests: bpf: relax sub-register mode compilation criteria Sub-register mode compilation was enabled only when there is eBPF "v3" processor support both at compilation time inside LLVM and at runtime inside the kernel. Given that the build and test servers are often separate machines, this patch removes the runtime support criterion. Suggested-by: Alexei Starovoitov Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 575746e63544..c3edf47da05d 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -28,12 +28,11 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c))) TEST_GEN_FILES = $(BPF_OBJ_FILES) -# Also test sub-register code-gen if LLVM + kernel both has eBPF v3 processor -# support which is the first version to contain both ALU32 and JMP32 -# instructions. +# Also test sub-register code-gen if LLVM has eBPF v3 processor support which +# contains both ALU32 and JMP32 instructions. SUBREG_CODEGEN := $(shell echo "int cal(int a) { return a > 0; }" | \ $(CLANG) -target bpf -O2 -emit-llvm -S -x c - -o - | \ - $(LLC) -mattr=+alu32 -mcpu=probe 2>&1 | \ + $(LLC) -mattr=+alu32 -mcpu=v3 2>&1 | \ grep 'if w') ifneq ($(SUBREG_CODEGEN),) TEST_GEN_FILES += $(patsubst %.o,alu32/%.o, $(BPF_OBJ_FILES)) From ebbed0f46ed9d3ae23291d67cd52d18abb8501bc Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Tue, 12 Feb 2019 10:25:12 +0900 Subject: [PATCH 19/36] tools: bpftool: doc, add text about feature-subcommand This patch adds missing information about the feature subcommand to bpftool.rst. Signed-off-by: Prashant Bhole Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- tools/bpf/bpftool/Documentation/bpftool.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index 27153bb816ac..4f2188845dd8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** **version** - *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** } + *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** } *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } | { **-j** | **--json** } [{ **-p** | **--pretty** }] } @@ -34,6 +34,8 @@ SYNOPSIS *NET-COMMANDS* := { **show** | **list** | **help** } + *FEATURE-COMMANDS* := { **probe** | **help** } + DESCRIPTION =========== *bpftool* allows for inspection and simple modification of BPF objects From dd27c2e3d0a05c01ff14bb672d1a3f0fdd8f98fc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 12 Feb 2019 00:20:39 -0800 Subject: [PATCH 20/36] bpf: offload: add priv field for drivers Currently bpf_offload_dev does not have any priv pointer, forcing the drivers to work backwards from the netdev in program metadata. This is not great given programs are conceptually associated with the offload device, and it means one or two unnecessary dereferences.
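For illustration, the driver-side pattern a device-level priv pointer enables looks roughly like the hedged sketch below; the struct and function names are hypothetical, while the two API calls match the hunks that follow:

#include <linux/bpf.h>
#include <linux/err.h>

struct my_bpf_priv {	/* hypothetical driver state */
	int some_state;
};

static const struct bpf_prog_offload_ops my_bpf_dev_ops; /* callbacks elided */

static int my_driver_init(struct my_bpf_priv *priv)
{
	struct bpf_offload_dev *bpf_dev;

	/* stash driver state when registering the offload device... */
	bpf_dev = bpf_offload_dev_create(&my_bpf_dev_ops, priv);
	return PTR_ERR_OR_ZERO(bpf_dev);
}

static int my_verifier_prep(struct bpf_prog *prog)
{
	/* ...and get it back directly, with no netdev dereference chain */
	struct my_bpf_priv *priv =
		bpf_offload_dev_priv(prog->aux->offload->offdev);

	return priv->some_state ? 0 : -EINVAL;
}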
Add a priv pointer to bpf_offload_dev. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.c | 2 +- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 4 +--- drivers/net/netdevsim/bpf.c | 5 +++-- include/linux/bpf.h | 3 ++- kernel/bpf/offload.c | 10 +++++++++- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index dccae0319204..275de9f4c61c 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -465,7 +465,7 @@ static int nfp_bpf_init(struct nfp_app *app) app->ctrl_mtu = nfp_bpf_ctrl_cmsg_mtu(bpf); } - bpf->bpf_dev = bpf_offload_dev_create(&nfp_bpf_dev_ops); + bpf->bpf_dev = bpf_offload_dev_create(&nfp_bpf_dev_ops, bpf); err = PTR_ERR_OR_ZERO(bpf->bpf_dev); if (err) goto err_free_neutral_maps; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 55c7dbf8b421..15dce97650a5 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -185,8 +185,6 @@ static void nfp_prog_free(struct nfp_prog *nfp_prog) static int nfp_bpf_verifier_prep(struct bpf_prog *prog) { - struct nfp_net *nn = netdev_priv(prog->aux->offload->netdev); - struct nfp_app *app = nn->app; struct nfp_prog *nfp_prog; int ret; @@ -197,7 +195,7 @@ static int nfp_bpf_verifier_prep(struct bpf_prog *prog) INIT_LIST_HEAD(&nfp_prog->insns); nfp_prog->type = prog->type; - nfp_prog->bpf = app->priv; + nfp_prog->bpf = bpf_offload_dev_priv(prog->aux->offload->offdev); ret = nfp_prog_prepare(nfp_prog, prog->insnsi, prog->len); if (ret) diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 172b271c8bd2..f92c43453ec6 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -248,7 +248,7 @@ static int nsim_bpf_create_prog(struct netdevsim *ns, struct bpf_prog *prog) static int nsim_bpf_verifier_prep(struct bpf_prog *prog) { - struct netdevsim *ns = netdev_priv(prog->aux->offload->netdev); + struct netdevsim *ns = bpf_offload_dev_priv(prog->aux->offload->offdev); if (!ns->bpf_bind_accept) return -EOPNOTSUPP; @@ -589,7 +589,8 @@ int nsim_bpf_init(struct netdevsim *ns) if (IS_ERR_OR_NULL(ns->sdev->ddir_bpf_bound_progs)) return -ENOMEM; - ns->sdev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops); + ns->sdev->bpf_dev = bpf_offload_dev_create(&nsim_bpf_dev_ops, + ns); err = PTR_ERR_OR_ZERO(ns->sdev->bpf_dev); if (err) return err; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7f58828755fd..de18227b3d95 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -773,8 +773,9 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, bool bpf_offload_prog_map_match(struct bpf_prog *prog, struct bpf_map *map); struct bpf_offload_dev * -bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops); +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv); void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev); +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev); int bpf_offload_dev_netdev_register(struct bpf_offload_dev *offdev, struct net_device *netdev); void bpf_offload_dev_netdev_unregister(struct bpf_offload_dev *offdev, diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 39dba8c90331..ba635209ae9a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -35,6 
+35,7 @@ static DECLARE_RWSEM(bpf_devs_lock); struct bpf_offload_dev { const struct bpf_prog_offload_ops *ops; struct list_head netdevs; + void *priv; }; struct bpf_offload_netdev { @@ -669,7 +670,7 @@ unlock: EXPORT_SYMBOL_GPL(bpf_offload_dev_netdev_unregister); struct bpf_offload_dev * -bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) +bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops, void *priv) { struct bpf_offload_dev *offdev; int err; @@ -688,6 +689,7 @@ bpf_offload_dev_create(const struct bpf_prog_offload_ops *ops) return ERR_PTR(-ENOMEM); offdev->ops = ops; + offdev->priv = priv; INIT_LIST_HEAD(&offdev->netdevs); return offdev; @@ -700,3 +702,9 @@ void bpf_offload_dev_destroy(struct bpf_offload_dev *offdev) kfree(offdev); } EXPORT_SYMBOL_GPL(bpf_offload_dev_destroy); + +void *bpf_offload_dev_priv(struct bpf_offload_dev *offdev) +{ + return offdev->priv; +} +EXPORT_SYMBOL_GPL(bpf_offload_dev_priv); From 3e0bd37ce0e4a574df6d87a901e13bcb46e10301 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:35 -0800 Subject: [PATCH 21/36] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap This patch adds all needed plumbing in preparation for allowing bpf programs to do IP encapping via bpf_lwt_push_encap. The actual implementation is added in the next patch in the patchset. Of note: - bpf_lwt_push_encap can now be called from BPF_PROG_TYPE_LWT_XMIT prog types in addition to BPF_PROG_TYPE_LWT_IN; - if the skb being encapped has GSO set, encapsulation is limited to IPIP/IP+GRE/IP+GUE (both IPv4 and IPv6); - as route lookups are different for ingress vs egress, the single external bpf_lwt_push_encap BPF helper is routed internally to either bpf_lwt_in_push_encap or bpf_lwt_xmit_push_encap BPF_CALLs, depending on prog type. v8 changes: fixed a typo. Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 26 ++++++++++++++++++++-- net/core/filter.c | 48 +++++++++++++++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 25c8c0e62ecf..bcdd2474eee7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2016,6 +2016,19 @@ union bpf_attr { * Only works if *skb* contains an IPv6 packet. Insert a * Segment Routing Header (**struct ipv6_sr_hdr**) inside * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to LWT_BPF_MAX_HEADROOM total + * bytes in all prepended headers. Please note that + * if skb_is_gso(skb) is true, no more than two headers + * can be prepended, and the inner header, if present, + * should be either GRE or UDP/GUE. + * + * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of + * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called + * by bpf programs of types BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off { /* Encapsulation type for BPF_FUNC_lwt_push_encap helper.
*/ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6, - BPF_LWT_ENCAP_SEG6_INLINE + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, }; #define __bpf_md_ptr(type, name) \ @@ -2606,7 +2620,15 @@ enum bpf_ret_code { BPF_DROP = 2, /* 3-6 reserved */ BPF_REDIRECT = 7, - /* >127 are reserved for prog type specific return codes */ + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, }; struct bpf_sock { diff --git a/net/core/filter.c b/net/core/filter.c index 353735575204..12c88c21b6b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4815,7 +4815,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len } #endif /* CONFIG_IPV6_SEG6_BPF */ -BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress) +{ + return -EINVAL; /* Implemented in the next patch. */ +} +#endif + +BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { @@ -4823,14 +4831,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); +#endif +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); #endif default: return -EINVAL; } } -static const struct bpf_func_proto bpf_lwt_push_encap_proto = { - .func = bpf_lwt_push_encap, +BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, + void *, hdr, u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, false /* egress */); +#endif + default: + return -EINVAL; + } +} + +static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { + .func = bpf_lwt_in_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { + .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -5417,7 +5452,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif - func == bpf_lwt_push_encap) + func == bpf_lwt_in_push_encap || + func == bpf_lwt_xmit_push_encap) return true; return false; @@ -5815,7 +5851,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: - return &bpf_lwt_push_encap_proto; + return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } @@ -5851,6 +5887,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } From 52f278774e796a553be0c869dcaaee6f259ca795 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 
Feb 2019 11:53:36 -0800 Subject: [PATCH 22/36] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap Implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers to packets (e.g. IP/GRE, GUE, IPIP). This is useful when thousands of different short-lived flows should be encapped, each with a different, dynamically determined destination. Although lwtunnels can be used in some of these scenarios, the ability to dynamically generate encap headers adds more flexibility, e.g. when routing depends on the state of the host (reflected in global bpf maps). v7 changes: - added a call to skb_clear_hash(); - removed calls to skb_set_transport_header(); - refuse to encap GSO-enabled packets. v8 changes: - fix build errors when LWT is not enabled. Note: the next patch in the patchset will deal with GSO-enabled packets, which are currently rejected at the encapping attempt. Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- include/net/lwtunnel.h | 2 ++ net/core/filter.c | 3 +- net/core/lwt_bpf.c | 65 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 1 deletion(-) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 33fd9ba7e0e5..671113bcb2cc 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); int lwtunnel_xmit(struct sk_buff *skb); +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress); static inline void lwtunnel_set_redirect(struct dst_entry *dst) { diff --git a/net/core/filter.c b/net/core/filter.c index 12c88c21b6b8..a78deb2656e1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -73,6 +73,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -4819,7 +4820,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) { - return -EINVAL; /* Implemented in the next patch. */ + return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); } #endif diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index a648568c5e8f..e5a9850d9f48 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -390,6 +390,71 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = { .owner = THIS_MODULE, }; +static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) +{ + /* Handling of GSO-enabled packets is added in the next patch.
*/ + return -EOPNOTSUPP; +} + +int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) +{ + struct iphdr *iph; + bool ipv4; + int err; + + if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM)) + return -EINVAL; + + /* validate protocol and length */ + iph = (struct iphdr *)hdr; + if (iph->version == 4) { + ipv4 = true; + if (unlikely(len < iph->ihl * 4)) + return -EINVAL; + } else if (iph->version == 6) { + ipv4 = false; + if (unlikely(len < sizeof(struct ipv6hdr))) + return -EINVAL; + } else { + return -EINVAL; + } + + if (ingress) + err = skb_cow_head(skb, len + skb->mac_len); + else + err = skb_cow_head(skb, + len + LL_RESERVED_SPACE(skb_dst(skb)->dev)); + if (unlikely(err)) + return err; + + /* push the encap headers and fix pointers */ + skb_reset_inner_headers(skb); + skb->encapsulation = 1; + skb_push(skb, len); + if (ingress) + skb_postpush_rcsum(skb, iph, len); + skb_reset_network_header(skb); + memcpy(skb_network_header(skb), hdr, len); + bpf_compute_data_pointers(skb); + skb_clear_hash(skb); + + if (ipv4) { + skb->protocol = htons(ETH_P_IP); + iph = ip_hdr(skb); + + if (!iph->check) + iph->check = ip_fast_csum((unsigned char *)iph, + iph->ihl); + } else { + skb->protocol = htons(ETH_P_IPV6); + } + + if (skb_is_gso(skb)) + return handle_gso_encap(skb, ipv4, len); + + return 0; +} + static int __init bpf_lwt_init(void) { return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); From ca78801a81e04a31f8088e96b2649a9cbace5499 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:37 -0800 Subject: [PATCH 23/36] bpf: handle GSO in bpf_lwt_push_encap This patch adds handling of GSO packets in bpf_lwt_push_ip_encap() (called from bpf_lwt_push_encap): * IPIP, GRE, and UDP encapsulation types are deduced by looking into iphdr->protocol or ipv6hdr->nexthdr; * SCTP GSO packets are not supported (matching what bpf_skb_proto_4_to_6 and similar helpers do); * UDP_L4 GSO packets are also not supported (although they are not blocked in bpf_skb_proto_4_to_6 and similar), as skb_decrease_gso_size() will break them; * SKB_GSO_DODGY bit is set. Note: it may be possible to support SCTP and UDP_L4 gso packets; but as these cases seem not to be well handled by other tunneling/encapping code paths, the solution should be generic enough to apply to all tunneling/encapping code. v8 changes: - make sure that if GRE or UDP encap is detected, there are enough pushed bytes to cover both IP[v6] + GRE|UDP headers; - do not reject double-encapped packets; - whitelist TCP GSO packets rather than block SCTP GSO and UDP GSO.
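To make the UDP leg of that whitelist concrete: a program may push an IPv4+UDP (GUE-style) outer header, which the handle_gso_encap() hunk below classifies as SKB_GSO_UDP_TUNNEL when the UDP checksum is zero, or SKB_GSO_UDP_TUNNEL_CSUM otherwise. A hedged sketch, not part of this patch — the section name, port, and elided addresses are made up; bpf_lwt_push_encap() and the return codes are the real API, and the includes mirror the GRE selftest later in this series:

SEC("encap_gue")
int bpf_lwt_encap_gue(struct __sk_buff *skb)
{
	struct encap_hdr {
		struct iphdr iph;
		struct udphdr udph;
	} hdr;
	int err;

	memset(&hdr, 0, sizeof(struct encap_hdr));

	hdr.iph.ihl = 5;
	hdr.iph.version = 4;
	hdr.iph.ttl = 0x40;
	hdr.iph.protocol = 17; /* IPPROTO_UDP */
	/* saddr/daddr elided; set them as in the GRE test programs */
	hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr));

	hdr.udph.dest = bpf_htons(6080); /* made-up GUE port */
	hdr.udph.len = bpf_htons(skb->len + sizeof(struct udphdr));
	hdr.udph.check = 0; /* zero csum => SKB_GSO_UDP_TUNNEL on GSO skbs */

	err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
				 sizeof(struct encap_hdr));
	if (err)
		return BPF_DROP;

	return BPF_LWT_REROUTE;
}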
Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- net/core/lwt_bpf.c | 67 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index e5a9850d9f48..079871fc020f 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -16,6 +16,7 @@ #include #include #include +#include struct bpf_lwt_prog { struct bpf_prog *prog; @@ -390,10 +391,72 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = { .owner = THIS_MODULE, }; +static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type, + int encap_len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + + gso_type |= SKB_GSO_DODGY; + shinfo->gso_type |= gso_type; + skb_decrease_gso_size(shinfo, encap_len); + shinfo->gso_segs = 0; + return 0; +} + static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len) { - /* Handling of GSO-enabled packets is added in the next patch. */ - return -EOPNOTSUPP; + int next_hdr_offset; + void *next_hdr; + __u8 protocol; + + /* SCTP and UDP_L4 gso need more nuanced handling than what + * handle_gso_type() does above: skb_decrease_gso_size() is not enough. + * So at the moment only TCP GSO packets are let through. + */ + if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) + return -ENOTSUPP; + + if (ipv4) { + protocol = ip_hdr(skb)->protocol; + next_hdr_offset = sizeof(struct iphdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } else { + protocol = ipv6_hdr(skb)->nexthdr; + next_hdr_offset = sizeof(struct ipv6hdr); + next_hdr = skb_network_header(skb) + next_hdr_offset; + } + + switch (protocol) { + case IPPROTO_GRE: + next_hdr_offset += sizeof(struct gre_base_hdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM) + return handle_gso_type(skb, SKB_GSO_GRE_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_GRE, encap_len); + + case IPPROTO_UDP: + next_hdr_offset += sizeof(struct udphdr); + if (next_hdr_offset > encap_len) + return -EINVAL; + + if (((struct udphdr *)next_hdr)->check) + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM, + encap_len); + return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len); + + case IPPROTO_IP: + case IPPROTO_IPV6: + if (ipv4) + return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len); + else + return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len); + + default: + return -EPROTONOSUPPORT; + } } int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress) From 9b0a6a9dbab0ae092d033e67dc2701e8a7b09cdb Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:38 -0800 Subject: [PATCH 24/36] ipv6_stub: add ipv6_route_input stub/proxy. Proxy ip6_route_input via ipv6_stub, for later use by lwt bpf ip encap (see the next patch in the patchset). 
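For illustration, the point of routing this through ipv6_stub: a caller compiles and degrades gracefully whether IPv6 is built in, modular, or absent. A minimal hedged sketch (the wrapper name is hypothetical):

#include <linux/skbuff.h>
#include <net/addrconf.h>

/* When IPv6 is unavailable, the eafnosupport_ipv6_route_input() default
 * installed below makes this return -EAFNOSUPPORT instead of creating a
 * hard symbol dependency on the ipv6 module.
 */
static int route_input_via_stub(struct sk_buff *skb)
{
	return ipv6_stub->ipv6_route_input(skb);
}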
Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- include/net/addrconf.h | 1 + net/ipv6/addrconf_core.c | 6 ++++++ net/ipv6/af_inet6.c | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 20d523ee2fec..269ec27385e9 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -248,6 +248,7 @@ struct ipv6_stub { const struct in6_addr *addr); int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); + int (*ipv6_route_input)(struct sk_buff *skb); struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); struct fib6_info *(*fib6_lookup)(struct net *net, int oif, diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index 5cd0029d930e..6c79af056d9b 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -134,6 +134,11 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, return -EAFNOSUPPORT; } +static int eafnosupport_ipv6_route_input(struct sk_buff *skb) +{ + return -EAFNOSUPPORT; +} + static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id) { return NULL; @@ -170,6 +175,7 @@ eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, + .ipv6_route_input = eafnosupport_ipv6_route_input, .fib6_get_table = eafnosupport_fib6_get_table, .fib6_table_lookup = eafnosupport_fib6_table_lookup, .fib6_lookup = eafnosupport_fib6_lookup, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d99753b5e39b..2f45d2a3e3a3 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -900,10 +900,17 @@ static struct pernet_operations inet6_net_ops = { .exit = inet6_net_exit, }; +static int ipv6_route_input(struct sk_buff *skb) +{ + ip6_route_input(skb); + return skb_dst(skb)->error; +} + static const struct ipv6_stub ipv6_stub_impl = { .ipv6_sock_mc_join = ipv6_sock_mc_join, .ipv6_sock_mc_drop = ipv6_sock_mc_drop, .ipv6_dst_lookup = ip6_dst_lookup, + .ipv6_route_input = ipv6_route_input, .fib6_get_table = fib6_get_table, .fib6_table_lookup = fib6_table_lookup, .fib6_lookup = fib6_lookup, From 3bd0b15281af776650e1550be4ea655b8cfa10c8 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:39 -0800 Subject: [PATCH 25/36] bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c This patch builds on top of the previous patch in the patchset, which added BPF_LWT_ENCAP_IP mode to bpf_lwt_push_encap. As the encapping can result in the skb needing to go via a different interface/route/dst, bpf programs can indicate this by returning BPF_LWT_REROUTE, which triggers a new route lookup for the skb. v8 changes: fix kbuild errors when LWTUNNEL_BPF is builtin, but IPV6 is a module: as LWTUNNEL_BPF can only be either Y or N, call IPV6 routing functions only if they are built-in. v9 changes: - fixed a kbuild test robot compiler warning; - call IPV6 routing functions via ipv6_stub. v10 changes: removed unnecessary IS_ENABLED and pr_warn_once. v11 changes: fixed a potential dst leak. 
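For illustration, the contract this return code sets up for LWT programs — note in the bpf_xmit() hunk below that changing the L3 header and then returning BPF_OK is now treated as an error — shown as a hedged sketch (program and section names hypothetical, encap body elided):

#include <linux/bpf.h>
#include "bpf_helpers.h"

SEC("lwt_xmit")
int reroute_after_encap(struct __sk_buff *skb)
{
	/* ...push an outer header via bpf_lwt_push_encap(skb,
	 * BPF_LWT_ENCAP_IP, ...) as in the previous patches...
	 */

	/* The L3 header changed, so request a fresh route lookup
	 * instead of returning BPF_OK on the stale dst.
	 */
	return BPF_LWT_REROUTE;
}

char _license[] SEC("license") = "GPL";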
Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- net/core/lwt_bpf.c | 126 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 2 deletions(-) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 079871fc020f..32251f3fcda0 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -17,6 +17,7 @@ #include #include #include +#include struct bpf_lwt_prog { struct bpf_prog *prog; @@ -56,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, switch (ret) { case BPF_OK: + case BPF_LWT_REROUTE: break; case BPF_REDIRECT: @@ -88,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, return ret; } +static int bpf_lwt_input_reroute(struct sk_buff *skb) +{ + int err = -EINVAL; + + if (skb->protocol == htons(ETH_P_IP)) { + struct iphdr *iph = ip_hdr(skb); + + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb_dst(skb)->dev); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + err = ipv6_stub->ipv6_route_input(skb); + } else { + err = -EAFNOSUPPORT; + } + + if (err) + goto err; + return dst_input(skb); + +err: + kfree_skb(skb); + return err; +} + static int bpf_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -99,11 +125,11 @@ static int bpf_input(struct sk_buff *skb) ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); if (ret < 0) return ret; + if (ret == BPF_LWT_REROUTE) + return bpf_lwt_input_reroute(skb); } if (unlikely(!dst->lwtstate->orig_input)) { - pr_warn_once("orig_input not set on dst for prog %s\n", - bpf->out.name); kfree_skb(skb); return -EINVAL; } @@ -148,6 +174,91 @@ static int xmit_check_hhlen(struct sk_buff *skb) return 0; } +static int bpf_lwt_xmit_reroute(struct sk_buff *skb) +{ + struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); + int oif = l3mdev ? l3mdev->ifindex : 0; + struct dst_entry *dst = NULL; + struct sock *sk; + struct net *net; + bool ipv4; + int err; + + if (skb->protocol == htons(ETH_P_IP)) + ipv4 = true; + else if (skb->protocol == htons(ETH_P_IPV6)) + ipv4 = false; + else + return -EAFNOSUPPORT; + + sk = sk_to_full_sk(skb->sk); + if (sk) { + if (sk->sk_bound_dev_if) + oif = sk->sk_bound_dev_if; + net = sock_net(sk); + } else { + net = dev_net(skb_dst(skb)->dev); + } + + if (ipv4) { + struct iphdr *iph = ip_hdr(skb); + struct flowi4 fl4 = {}; + struct rtable *rt; + + fl4.flowi4_oif = oif; + fl4.flowi4_mark = skb->mark; + fl4.flowi4_uid = sock_net_uid(net, sk); + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; + fl4.flowi4_proto = iph->protocol; + fl4.daddr = iph->daddr; + fl4.saddr = iph->saddr; + + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return -EINVAL; + dst = &rt->dst; + } else { + struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct flowi6 fl6 = {}; + + fl6.flowi6_oif = oif; + fl6.flowi6_mark = skb->mark; + fl6.flowi6_uid = sock_net_uid(net, sk); + fl6.flowlabel = ip6_flowinfo(iph6); + fl6.flowi6_proto = iph6->nexthdr; + fl6.daddr = iph6->daddr; + fl6.saddr = iph6->saddr; + + err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); + if (err || IS_ERR(dst)) + return -EINVAL; + } + if (unlikely(dst->error)) { + dst_release(dst); + return -EINVAL; + } + + /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it + * was done for the previous dst, so we are doing it here again, in + * case the new dst needs much more space. The call below is a noop + * if there is enough header space in skb. 
+ */ + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + return err; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); + if (unlikely(err)) + return err; + + /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ + return LWTUNNEL_XMIT_DONE; +} + static int bpf_xmit(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -155,11 +266,20 @@ static int bpf_xmit(struct sk_buff *skb) bpf = bpf_lwt_lwtunnel(dst->lwtstate); if (bpf->xmit.prog) { + __be16 proto = skb->protocol; int ret; ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); switch (ret) { case BPF_OK: + /* If the header changed, e.g. via bpf_lwt_push_encap, + * BPF_LWT_REROUTE below should have been used if the + * protocol was also changed. + */ + if (skb->protocol != proto) { + kfree_skb(skb); + return -EINVAL; + } /* If the header was expanded, headroom might be too * small for L2 header to come, expand as needed. */ @@ -170,6 +290,8 @@ static int bpf_xmit(struct sk_buff *skb) return LWTUNNEL_XMIT_CONTINUE; case BPF_REDIRECT: return LWTUNNEL_XMIT_DONE; + case BPF_LWT_REROUTE: + return bpf_lwt_xmit_reroute(skb); default: return ret; } From 755db4771c96cf49e60e188a2c90c4d2a1ea063d Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:40 -0800 Subject: [PATCH 26/36] bpf: sync /include/.../bpf.h with tools/include/.../bpf.h This patch copies changes in bpf.h done by a previous patch in this patchset from the kernel uapi include dir into tools uapi include dir. Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 25c8c0e62ecf..bcdd2474eee7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2016,6 +2016,19 @@ union bpf_attr { * Only works if *skb* contains an IPv6 packet. Insert a * Segment Routing Header (**struct ipv6_sr_hdr**) inside * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to LWT_BPF_MAX_HEADROOM total + * bytes in all prepended headers. Please note that + * if skb_is_gso(skb) is true, no more than two headers + * can be prepended, and the inner header, if present, + * should be either GRE or UDP/GUE. + * + * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of + * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called + * by bpf programs of types BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off { /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6, - BPF_LWT_ENCAP_SEG6_INLINE + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, }; #define __bpf_md_ptr(type, name) \ @@ -2606,7 +2620,15 @@ enum bpf_ret_code { BPF_DROP = 2, /* 3-6 reserved */ BPF_REDIRECT = 7, - /* >127 are reserved for prog type specific return codes */ + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. 
+ * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, }; struct bpf_sock { From 0fde56e4385b09a67dd25321f607d4c942282de2 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:41 -0800 Subject: [PATCH 27/36] selftests: bpf: add test_lwt_ip_encap selftest This patch adds a bpf self-test to cover BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap. Covered: - encapping in LWT_IN and LWT_XMIT - IPv4 and IPv6 A follow-up patch will add GSO and VRF-enabled tests. Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/progs/test_lwt_ip_encap.c | 85 +++++ .../selftests/bpf/test_lwt_ip_encap.sh | 311 ++++++++++++++++++ 3 files changed, 398 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c3edf47da05d..ccffaa0a0787 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -50,7 +50,8 @@ TEST_PROGS := test_kmod.sh \ test_lirc_mode2.sh \ test_skb_cgroup_id.sh \ test_flow_dissector.sh \ - test_xdp_vlan.sh + test_xdp_vlan.sh \ + test_lwt_ip_encap.sh TEST_PROGS_EXTENDED := with_addr.sh \ with_tunnels.sh \ diff --git a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c new file mode 100644 index 000000000000..c957d6dfe6d7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +struct grehdr { + __be16 flags; + __be16 protocol; +}; + +SEC("encap_gre") +int bpf_lwt_encap_gre(struct __sk_buff *skb) +{ + struct encap_hdr { + struct iphdr iph; + struct grehdr greh; + } hdr; + int err; + + memset(&hdr, 0, sizeof(struct encap_hdr)); + + hdr.iph.ihl = 5; + hdr.iph.version = 4; + hdr.iph.ttl = 0x40; + hdr.iph.protocol = 47; /* IPPROTO_GRE */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + hdr.iph.saddr = 0x640110ac; /* 172.16.1.100 */ + hdr.iph.daddr = 0x641010ac; /* 172.16.16.100 */ +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + hdr.iph.saddr = 0xac100164; /* 172.16.1.100 */ + hdr.iph.daddr = 0xac101064; /* 172.16.16.100 */ +#else +#error "Fix your compiler's __BYTE_ORDER__?!" 
+#endif + hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr)); + + hdr.greh.protocol = skb->protocol; + + err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, + sizeof(struct encap_hdr)); + if (err) + return BPF_DROP; + + return BPF_LWT_REROUTE; +} + +SEC("encap_gre6") +int bpf_lwt_encap_gre6(struct __sk_buff *skb) +{ + struct encap_hdr { + struct ipv6hdr ip6hdr; + struct grehdr greh; + } hdr; + int err; + + memset(&hdr, 0, sizeof(struct encap_hdr)); + + hdr.ip6hdr.version = 6; + hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr)); + hdr.ip6hdr.nexthdr = 47; /* IPPROTO_GRE */ + hdr.ip6hdr.hop_limit = 0x40; + /* fb01::1 */ + hdr.ip6hdr.saddr.s6_addr[0] = 0xfb; + hdr.ip6hdr.saddr.s6_addr[1] = 1; + hdr.ip6hdr.saddr.s6_addr[15] = 1; + /* fb10::1 */ + hdr.ip6hdr.daddr.s6_addr[0] = 0xfb; + hdr.ip6hdr.daddr.s6_addr[1] = 0x10; + hdr.ip6hdr.daddr.s6_addr[15] = 1; + + hdr.greh.protocol = skb->protocol; + + err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr, + sizeof(struct encap_hdr)); + if (err) + return BPF_DROP; + + return BPF_LWT_REROUTE; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh new file mode 100755 index 000000000000..4ca714e23ab0 --- /dev/null +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh @@ -0,0 +1,311 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Setup/topology: +# +# NS1 NS2 NS3 +# veth1 <---> veth2 veth3 <---> veth4 (the top route) +# veth5 <---> veth6 veth7 <---> veth8 (the bottom route) +# +# each vethN gets IPv[4|6]_N address +# +# IPv*_SRC = IPv*_1 +# IPv*_DST = IPv*_4 +# +# all tests test pings from IPv*_SRC to IPv*_DST +# +# by default, routes are configured to allow packets to go +# IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route) +# +# a GRE device is installed in NS3 with IPv*_GRE, and +# NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8 +# (the bottom route) +# +# Tests: +# +# 1. routes NS2->IPv*_DST are brought down, so the only way a ping +# from IP*_SRC to IP*_DST can work is via IPv*_GRE +# +# 2a. in an egress test, a bpf LWT_XMIT program is installed on veth1 +# that encaps the packets with an IP/GRE header to route to IPv*_GRE +# +# ping: SRC->[encap at veth1:egress]->GRE:decap->DST +# ping replies go DST->SRC directly +# +# 2b. 
in an ingress test, a bpf LWT_IN program is installed on veth2 +# that encaps the packets with an IP/GRE header to route to IPv*_GRE +# +# ping: SRC->[encap at veth2:ingress]->GRE:decap->DST +# ping replies go DST->SRC directly + +set -e # exit on error + +if [[ $EUID -ne 0 ]]; then + echo "This script must be run as root" + echo "FAIL" + exit 1 +fi + +readonly NS1="ns1-$(mktemp -u XXXXXX)" +readonly NS2="ns2-$(mktemp -u XXXXXX)" +readonly NS3="ns3-$(mktemp -u XXXXXX)" + +readonly IPv4_1="172.16.1.100" +readonly IPv4_2="172.16.2.100" +readonly IPv4_3="172.16.3.100" +readonly IPv4_4="172.16.4.100" +readonly IPv4_5="172.16.5.100" +readonly IPv4_6="172.16.6.100" +readonly IPv4_7="172.16.7.100" +readonly IPv4_8="172.16.8.100" +readonly IPv4_GRE="172.16.16.100" + +readonly IPv4_SRC=$IPv4_1 +readonly IPv4_DST=$IPv4_4 + +readonly IPv6_1="fb01::1" +readonly IPv6_2="fb02::1" +readonly IPv6_3="fb03::1" +readonly IPv6_4="fb04::1" +readonly IPv6_5="fb05::1" +readonly IPv6_6="fb06::1" +readonly IPv6_7="fb07::1" +readonly IPv6_8="fb08::1" +readonly IPv6_GRE="fb10::1" + +readonly IPv6_SRC=$IPv6_1 +readonly IPv6_DST=$IPv6_4 + +setup() { +set -e # exit on error + # create devices and namespaces + ip netns add "${NS1}" + ip netns add "${NS2}" + ip netns add "${NS3}" + + ip link add veth1 type veth peer name veth2 + ip link add veth3 type veth peer name veth4 + ip link add veth5 type veth peer name veth6 + ip link add veth7 type veth peer name veth8 + + ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1 + ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1 + + ip link set veth1 netns ${NS1} + ip link set veth2 netns ${NS2} + ip link set veth3 netns ${NS2} + ip link set veth4 netns ${NS3} + ip link set veth5 netns ${NS1} + ip link set veth6 netns ${NS2} + ip link set veth7 netns ${NS2} + ip link set veth8 netns ${NS3} + + # configure addresses: the top route (1-2-3-4) + ip -netns ${NS1} addr add ${IPv4_1}/24 dev veth1 + ip -netns ${NS2} addr add ${IPv4_2}/24 dev veth2 + ip -netns ${NS2} addr add ${IPv4_3}/24 dev veth3 + ip -netns ${NS3} addr add ${IPv4_4}/24 dev veth4 + ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1 + ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2 + ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3 + ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4 + + # configure addresses: the bottom route (5-6-7-8) + ip -netns ${NS1} addr add ${IPv4_5}/24 dev veth5 + ip -netns ${NS2} addr add ${IPv4_6}/24 dev veth6 + ip -netns ${NS2} addr add ${IPv4_7}/24 dev veth7 + ip -netns ${NS3} addr add ${IPv4_8}/24 dev veth8 + ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5 + ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6 + ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7 + ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8 + + + ip -netns ${NS1} link set dev veth1 up + ip -netns ${NS2} link set dev veth2 up + ip -netns ${NS2} link set dev veth3 up + ip -netns ${NS3} link set dev veth4 up + ip -netns ${NS1} link set dev veth5 up + ip -netns ${NS2} link set dev veth6 up + ip -netns ${NS2} link set dev veth7 up + ip -netns ${NS3} link set dev veth8 up + + # configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default; + # the bottom route to specific bottom addresses + + # NS1 + # top route + ip -netns ${NS1} route add ${IPv4_2}/32 dev veth1 + ip -netns ${NS1} route add default dev veth1 via ${IPv4_2} # go top by default + ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1 + ip -netns ${NS1} -6 route add default dev
veth1 via ${IPv6_2} # go top by default + # bottom route + ip -netns ${NS1} route add ${IPv4_6}/32 dev veth5 + ip -netns ${NS1} route add ${IPv4_7}/32 dev veth5 via ${IPv4_6} + ip -netns ${NS1} route add ${IPv4_8}/32 dev veth5 via ${IPv4_6} + ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5 + ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6} + ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6} + + # NS2 + # top route + ip -netns ${NS2} route add ${IPv4_1}/32 dev veth2 + ip -netns ${NS2} route add ${IPv4_4}/32 dev veth3 + ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2 + ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3 + # bottom route + ip -netns ${NS2} route add ${IPv4_5}/32 dev veth6 + ip -netns ${NS2} route add ${IPv4_8}/32 dev veth7 + ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6 + ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7 + + # NS3 + # top route + ip -netns ${NS3} route add ${IPv4_3}/32 dev veth4 + ip -netns ${NS3} route add ${IPv4_1}/32 dev veth4 via ${IPv4_3} + ip -netns ${NS3} route add ${IPv4_2}/32 dev veth4 via ${IPv4_3} + ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4 + ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3} + ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3} + # bottom route + ip -netns ${NS3} route add ${IPv4_7}/32 dev veth8 + ip -netns ${NS3} route add ${IPv4_5}/32 dev veth8 via ${IPv4_7} + ip -netns ${NS3} route add ${IPv4_6}/32 dev veth8 via ${IPv4_7} + ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8 + ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7} + ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7} + + # configure IPv4 GRE device in NS3, and a route to it via the "bottom" route + ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255 + ip -netns ${NS3} link set gre_dev up + ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev + ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6} + ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8} + + + # configure IPv6 GRE device in NS3, and a route to it via the "bottom" route + ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255 + ip -netns ${NS3} link set gre6_dev up + ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev + ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} + ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} + + # rp_filter gets confused by what these tests are doing, so disable it + ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 + ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 +} + +cleanup() { + ip netns del ${NS1} 2> /dev/null + ip netns del ${NS2} 2> /dev/null + ip netns del ${NS3} 2> /dev/null +} + +trap cleanup EXIT + +test_ping() { + local readonly PROTO=$1 + local readonly EXPECTED=$2 + local RET=0 + + set +e + if [ "${PROTO}" == "IPv4" ] ; then + ip netns exec ${NS1} ping -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null + RET=$? + elif [ "${PROTO}" == "IPv6" ] ; then + ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null + RET=$? 
+ else + echo "test_ping: unknown PROTO: ${PROTO}" + exit 1 + fi + set -e + + if [ "0" != "${RET}" ]; then + RET=1 + fi + + if [ "${EXPECTED}" != "${RET}" ] ; then + echo "FAIL: test_ping: ${RET}" + exit 1 + fi +} + +test_egress() { + local readonly ENCAP=$1 + echo "starting egress ${ENCAP} encap test" + setup + + # need to wait a bit for IPv6 to autoconf, otherwise + # ping6 sometimes fails with "unable to bind to address" + + # by default, pings work + test_ping IPv4 0 + test_ping IPv6 0 + + # remove NS2->DST routes, ping fails + ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 + ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 + test_ping IPv4 1 + test_ping IPv6 1 + + # install replacement routes (LWT/eBPF), pings succeed + if [ "${ENCAP}" == "IPv4" ] ; then + ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1 + ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre dev veth1 + elif [ "${ENCAP}" == "IPv6" ] ; then + ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1 + ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1 + else + echo "FAIL: unknown encap ${ENCAP}" + fi + test_ping IPv4 0 + test_ping IPv6 0 + + cleanup + echo "PASS" +} + +test_ingress() { + local readonly ENCAP=$1 + echo "starting ingress ${ENCAP} encap test" + setup + + # need to wait a bit for IPv6 to autoconf, otherwise + # ping6 sometimes fails with "unable to bind to address" + + # by default, pings work + test_ping IPv4 0 + test_ping IPv6 0 + + # remove NS2->DST routes, pings fail + ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 + ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 + test_ping IPv4 1 + test_ping IPv6 1 + + # install replacement routes (LWT/eBPF), pings succeed + if [ "${ENCAP}" == "IPv4" ] ; then + ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2 + ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre dev veth2 + elif [ "${ENCAP}" == "IPv6" ] ; then + ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2 + ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj test_lwt_ip_encap.o sec encap_gre6 dev veth2 + else + echo "FAIL: unknown encap ${ENCAP}" + fi + test_ping IPv4 0 + test_ping IPv6 0 + + cleanup + echo "PASS" +} + +test_egress IPv4 +test_egress IPv6 + +test_ingress IPv4 +test_ingress IPv6 + +echo "all tests passed" From fb405883c189dd30f2fab2b3e2c954f34f000ac3 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Thu, 14 Feb 2019 10:39:31 -0800 Subject: [PATCH 28/36] bpf: fix memory leak in bpf_lwt_xmit_reroute On error the skb should be freed. Tested with diff/steps provided by David Ahern. v2: surface routing errors to the user instead of a generic EINVAL, as suggested by David Ahern. 
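In sketch form, the shape of the fix (the check and lookup helpers here are placeholders standing in for the v4/v6 logic in the diff below):

static int xmit_reroute_sketch(struct sk_buff *skb)
{
	int err = -EAFNOSUPPORT;

	if (!l3_proto_supported(skb))	/* placeholder for the protocol check */
		goto err;

	err = lookup_new_dst(skb);	/* placeholder for the v4/v6 dst lookups */
	if (err)
		goto err;

	return LWTUNNEL_XMIT_DONE;

err:
	kfree_skb(skb);		/* every error path now frees the skb... */
	return err;		/* ...and surfaces the real error code */
}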
Reported-by: David Ahern Fixes: 3bd0b15281af ("bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c") Signed-off-by: Peter Oskolkov Reviewed-by: David Ahern Signed-off-by: Alexei Starovoitov --- net/core/lwt_bpf.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 32251f3fcda0..a5c8c79d468a 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -179,17 +179,17 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); int oif = l3mdev ? l3mdev->ifindex : 0; struct dst_entry *dst = NULL; + int err = -EAFNOSUPPORT; struct sock *sk; struct net *net; bool ipv4; - int err; if (skb->protocol == htons(ETH_P_IP)) ipv4 = true; else if (skb->protocol == htons(ETH_P_IPV6)) ipv4 = false; else - return -EAFNOSUPPORT; + goto err; sk = sk_to_full_sk(skb->sk); if (sk) { @@ -215,8 +215,10 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.saddr = iph->saddr; rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return -EINVAL; + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto err; + } dst = &rt->dst; } else { struct ipv6hdr *iph6 = ipv6_hdr(skb); @@ -231,12 +233,17 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl6.saddr = iph6->saddr; err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6); - if (err || IS_ERR(dst)) - return -EINVAL; + if (unlikely(err)) + goto err; + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + goto err; + } } if (unlikely(dst->error)) { + err = dst->error; dst_release(dst); - return -EINVAL; + goto err; } /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it @@ -246,17 +253,21 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) */ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) - return err; + goto err; skb_dst_drop(skb); skb_dst_set(skb, dst); err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb); if (unlikely(err)) - return err; + goto err; /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */ return LWTUNNEL_XMIT_DONE; + +err: + kfree_skb(skb); + return err; } static int bpf_xmit(struct sk_buff *skb) From 1ad9cbb890f059dd233868654bb9d9e4430b095c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 Feb 2019 10:25:53 -0800 Subject: [PATCH 29/36] tools/bpf: replace bzero with memset bzero() call is deprecated and superseded by memset(). Signed-off-by: Andrii Nakryiko Reported-by: David Laight Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/bpf.c | 48 +++++++++++++++++++++--------------------- tools/lib/bpf/btf.c | 5 ++--- tools/lib/bpf/libbpf.c | 5 ++--- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index a5261f39e2bd..9cd015574e83 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -22,7 +22,7 @@ */ #include -#include +#include #include #include #include @@ -228,7 +228,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, name_len = load_attr->name ? 
strlen(load_attr->name) : 0; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; attr.insn_cnt = (__u32)load_attr->insns_cnt; @@ -340,7 +340,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.prog_type = type; attr.insn_cnt = (__u32)insns_cnt; attr.insns = ptr_to_u64(insns); @@ -360,7 +360,7 @@ int bpf_map_update_elem(int fd, const void *key, const void *value, { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); @@ -373,7 +373,7 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); @@ -385,7 +385,7 @@ int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); @@ -398,7 +398,7 @@ int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); @@ -410,7 +410,7 @@ int bpf_map_delete_elem(int fd, const void *key) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); @@ -421,7 +421,7 @@ int bpf_map_get_next_key(int fd, const void *key, void *next_key) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.next_key = ptr_to_u64(next_key); @@ -433,7 +433,7 @@ int bpf_obj_pin(int fd, const char *pathname) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.pathname = ptr_to_u64((void *)pathname); attr.bpf_fd = fd; @@ -444,7 +444,7 @@ int bpf_obj_get(const char *pathname) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.pathname = ptr_to_u64((void *)pathname); return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); @@ -455,7 +455,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; @@ -468,7 +468,7 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_type = type; @@ -479,7 +479,7 @@ int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; @@ -493,7 +493,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, union bpf_attr attr; int ret; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.query.target_fd = target_fd; attr.query.attach_type = type; attr.query.query_flags = query_flags; @@ -514,7 +514,7 @@ int bpf_prog_test_run(int 
prog_fd, int repeat, void *data, __u32 size, union bpf_attr attr; int ret; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.test.prog_fd = prog_fd; attr.test.data_in = ptr_to_u64(data); attr.test.data_out = ptr_to_u64(data_out); @@ -539,7 +539,7 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr) if (!test_attr->data_out && test_attr->data_size_out > 0) return -EINVAL; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.test.prog_fd = test_attr->prog_fd; attr.test.data_in = ptr_to_u64(test_attr->data_in); attr.test.data_out = ptr_to_u64(test_attr->data_out); @@ -559,7 +559,7 @@ int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id) union bpf_attr attr; int err; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.start_id = start_id; err = sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)); @@ -574,7 +574,7 @@ int bpf_map_get_next_id(__u32 start_id, __u32 *next_id) union bpf_attr attr; int err; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.start_id = start_id; err = sys_bpf(BPF_MAP_GET_NEXT_ID, &attr, sizeof(attr)); @@ -588,7 +588,7 @@ int bpf_prog_get_fd_by_id(__u32 id) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.prog_id = id; return sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); @@ -598,7 +598,7 @@ int bpf_map_get_fd_by_id(__u32 id) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_id = id; return sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); @@ -608,7 +608,7 @@ int bpf_btf_get_fd_by_id(__u32 id) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.btf_id = id; return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr)); @@ -619,7 +619,7 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) union bpf_attr attr; int err; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.info.bpf_fd = prog_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); @@ -635,7 +635,7 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.raw_tracepoint.name = ptr_to_u64(name); attr.raw_tracepoint.prog_fd = prog_fd; diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 6953fedb88ff..ade1c32fb083 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -484,7 +483,7 @@ int btf__get_from_id(__u32 id, struct btf **btf) goto exit_free; } - bzero(ptr, last_size); + memset(ptr, 0, last_size); btf_info.btf = ptr_to_u64(ptr); err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); @@ -498,7 +497,7 @@ int btf__get_from_id(__u32 id, struct btf **btf) goto exit_free; } ptr = temp_ptr; - bzero(ptr, last_size); + memset(ptr, 0, last_size); btf_info.btf = ptr_to_u64(ptr); err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); } diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e3c39edfb9d3..6ef7e6e4cbd3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -308,7 +307,7 @@ bpf_program__init(void *data, size_t size, char *section_name, int idx, return -EINVAL; } - bzero(prog, sizeof(*prog)); + memset(prog, 0, sizeof(*prog)); prog->section_name = strdup(section_name); if (!prog->section_name) { @@ -1577,7 +1576,7 @@ bpf_program__load(struct 
bpf_program *prog, struct bpf_prog_prep_result result; bpf_program_prep_t preprocessor = prog->preprocessor; - bzero(&result, sizeof(result)); + memset(&result, 0, sizeof(result)); err = preprocessor(prog, i, prog->insns, prog->insns_cnt, &result); if (err) { From d931206476b83d4d516f9425cc9e6a1cf361fb2b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 Feb 2019 10:25:54 -0800 Subject: [PATCH 30/36] tools: sync uapi/linux/if_link.h header Syncing if_link.h that got out of sync. Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index d6533828123a..5b225ff63b48 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -925,6 +925,7 @@ enum { enum { LINK_XSTATS_TYPE_UNSPEC, LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, __LINK_XSTATS_TYPE_MAX }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) From f8ebfaf6684b03084858d8c55f81867e5171af08 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Wed, 13 Feb 2019 18:07:29 +0100 Subject: [PATCH 31/36] net: bpf: remove XDP_QUERY_XSK_UMEM enumerator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c9b47cc1fabc ("xsk: fix bug when trying to use both copy and zero-copy on one queue id") moved the umem query code to the AF_XDP core, and therefore removed the need to query the netdevice for a umem. This patch removes XDP_QUERY_XSK_UMEM and all code that implement that behavior, which is just dead code. Signed-off-by: Jan Sokolowski Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 -- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 28 ------------------- drivers/net/ethernet/intel/i40e/i40e_xsk.h | 2 -- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 -- .../ethernet/intel/ixgbe/ixgbe_txrx_common.h | 2 -- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 17 ----------- include/linux/netdevice.h | 7 ++--- 7 files changed, 3 insertions(+), 59 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 44856a84738d..5e74a5127849 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12128,9 +12128,6 @@ static int i40e_xdp(struct net_device *dev, case XDP_QUERY_PROG: xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; return 0; - case XDP_QUERY_XSK_UMEM: - return i40e_xsk_umem_query(vsi, &xdp->xsk.umem, - xdp->xsk.queue_id); case XDP_SETUP_XSK_UMEM: return i40e_xsk_umem_setup(vsi, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 96d849460d9b..e190a2c2b9ff 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -154,34 +154,6 @@ static int i40e_xsk_umem_disable(struct i40e_vsi *vsi, u16 qid) return 0; } -/** - * i40e_xsk_umem_query - Queries a certain ring/qid for its UMEM - * @vsi: Current VSI - * @umem: UMEM associated to the ring, if any - * @qid: Rx ring to associate UMEM to - * - * This function will store, if any, the UMEM associated to certain ring. 
- * - * Returns 0 on success, <0 on failure - **/ -int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem, - u16 qid) -{ - struct net_device *netdev = vsi->netdev; - struct xdp_umem *queried_umem; - - if (vsi->type != I40E_VSI_MAIN) - return -EINVAL; - - queried_umem = xdp_get_umem_from_qid(netdev, qid); - - if (!queried_umem) - return -EINVAL; - - *umem = queried_umem; - return 0; -} - /** * i40e_xsk_umem_setup - Enable/disassociate a UMEM to/from a ring/qid * @vsi: Current VSI diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index 9038c5d5cf08..8cc0a2e7d9a2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -10,8 +10,6 @@ struct zero_copy_allocator; int i40e_queue_pair_disable(struct i40e_vsi *vsi, int queue_pair); int i40e_queue_pair_enable(struct i40e_vsi *vsi, int queue_pair); -int i40e_xsk_umem_query(struct i40e_vsi *vsi, struct xdp_umem **umem, - u16 qid); int i40e_xsk_umem_setup(struct i40e_vsi *vsi, struct xdp_umem *umem, u16 qid); void i40e_zca_free(struct zero_copy_allocator *alloc, unsigned long handle); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index b53087a980ef..38c430b94ae3 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10280,9 +10280,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) xdp->prog_id = adapter->xdp_prog ? adapter->xdp_prog->aux->id : 0; return 0; - case XDP_QUERY_XSK_UMEM: - return ixgbe_xsk_umem_query(adapter, &xdp->xsk.umem, - xdp->xsk.queue_id); case XDP_SETUP_XSK_UMEM: return ixgbe_xsk_umem_setup(adapter, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h index 53d4089f5644..d93a690aff74 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_txrx_common.h @@ -30,8 +30,6 @@ void ixgbe_txrx_ring_enable(struct ixgbe_adapter *adapter, int ring); struct xdp_umem *ixgbe_xsk_umem(struct ixgbe_adapter *adapter, struct ixgbe_ring *ring); -int ixgbe_xsk_umem_query(struct ixgbe_adapter *adapter, struct xdp_umem **umem, - u16 qid); int ixgbe_xsk_umem_setup(struct ixgbe_adapter *adapter, struct xdp_umem *umem, u16 qid); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 65c3e2c979d4..98870707b51a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -174,23 +174,6 @@ static int ixgbe_xsk_umem_disable(struct ixgbe_adapter *adapter, u16 qid) return 0; } -int ixgbe_xsk_umem_query(struct ixgbe_adapter *adapter, struct xdp_umem **umem, - u16 qid) -{ - if (qid >= adapter->num_rx_queues) - return -EINVAL; - - if (adapter->xsk_umems) { - if (qid >= adapter->num_xsk_umems) - return -EINVAL; - *umem = adapter->xsk_umems[qid]; - return 0; - } - - *umem = NULL; - return 0; -} - int ixgbe_xsk_umem_setup(struct ixgbe_adapter *adapter, struct xdp_umem *umem, u16 qid) { diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1d95e634f3fe..6aedaf1e9a25 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -868,7 +868,6 @@ enum bpf_netdev_command { /* BPF program for offload callbacks, invoked at program load time. 
*/ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, - XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM, }; @@ -895,10 +894,10 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_QUERY_XSK_UMEM, XDP_SETUP_XSK_UMEM */ + /* XDP_SETUP_XSK_UMEM */ struct { - struct xdp_umem *umem; /* out for query*/ - u16 queue_id; /* in for query */ + struct xdp_umem *umem; + u16 queue_id; } xsk; }; }; From 1a11a4c74f73adb840d61371c3bb560ed4d7a87f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 Feb 2019 15:01:42 -0800 Subject: [PATCH 32/36] libbpf: Introduce bpf_map__resize Add bpf_map__resize() to change max_entries for a map. Quite often the necessary map size is unknown at compile time and can be calculated only at run time. Currently the following approach is used to do so: * bpf_object__open_buffer() to open Elf file from a buffer; * bpf_object__find_map_by_name() to find relevant map; * bpf_map__def() to get map attributes and create struct bpf_create_map_attr from them; * update max_entries in bpf_create_map_attr; * bpf_create_map_xattr() to create new map with updated max_entries; * bpf_map__reuse_fd() to replace the map in bpf_object with newly created one. And after all this bpf_object can finally be loaded. The map will have the new size. It 1) is quite a lot of steps; 2) doesn't take BTF into account. For "2)" even more steps should be made and some of them require changes to libbpf (e.g. to get struct btf * from bpf_object). Instead the whole problem can be solved by introducing a simple bpf_map__resize() API that checks the map and sets new max_entries if the map is not loaded yet. So the new steps are: * bpf_object__open_buffer() to open Elf file from a buffer; * bpf_object__find_map_by_name() to find relevant map; * bpf_map__resize() to update max_entries. That's much simpler and works with BTF. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 14 ++++++++++++++ tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 1 + 3 files changed, 16 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 6ef7e6e4cbd3..9597d4dace34 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1113,6 +1113,20 @@ err_free_new_name: return -errno; } +int bpf_map__resize(struct bpf_map *map, __u32 max_entries) +{ + if (!map || !max_entries) + return -EINVAL; + + /* If map already created, its attributes can't be changed. */ + if (map->fd >= 0) + return -EBUSY; + + map->def.max_entries = max_entries; + + return 0; +} + static int bpf_object__probe_name(struct bpf_object *obj) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 69a7c25eaccc..987fd92661d6 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -294,6 +294,7 @@ LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, bpf_map_clear_priv_t clear_priv); LIBBPF_API void *bpf_map__priv(struct bpf_map *map); LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); +LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries); LIBBPF_API bool bpf_map__is_offload_neutral(struct bpf_map *map); LIBBPF_API void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 5fc8222209f8..16f342c3d4bc 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -130,6 +130,7 @@ LIBBPF_0.0.2 { bpf_probe_helper; bpf_probe_map_type; bpf_probe_prog_type; + bpf_map__resize; bpf_map_lookup_elem_flags; bpf_object__find_map_fd_by_name; bpf_get_link_xdp_id;
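In sketch form, the new flow described in the commit message above (the ELF buffer, object name, map name and size are illustrative, and error handling is elided):

	struct bpf_object *obj;
	struct bpf_map *map;

	obj = bpf_object__open_buffer(elf_buf, elf_buf_sz, "example");
	map = bpf_object__find_map_by_name(obj, "example_map");
	bpf_map__resize(map, runtime_max_entries);	/* before load: map->fd < 0 */
	bpf_object__load(obj);				/* map is created with the new size */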
From 789f6bab849e04ea029c09b81dc8401dc0268cf9 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 Feb 2019 15:01:43 -0800 Subject: [PATCH 33/36] libbpf: Introduce bpf_object__btf Add a new accessor for bpf_object to get the opaque struct btf * from it. struct btf * is needed for all operations with BTF and it's present in bpf_object. The only thing missing is a way to get it. An example use-case is to get BTF key_type_id and value_type_id for a map in bpf_object. It can be done with btf__get_map_kv_tids() but that function requires struct btf *. A similar API can be added for struct btf_ext, but there is no use-case for it yet. Signed-off-by: Andrey Ignatov Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- tools/lib/bpf/libbpf.c | 5 +++++ tools/lib/bpf/libbpf.h | 3 +++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 9 insertions(+) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 9597d4dace34..b38dcbe7460a 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -2331,6 +2331,11 @@ unsigned int bpf_object__kversion(struct bpf_object *obj) return obj ? obj->kern_version : 0; } +struct btf *bpf_object__btf(struct bpf_object *obj) +{ + return obj ? obj->btf : NULL; +} + int bpf_object__btf_fd(const struct bpf_object *obj) { return obj->btf ? btf__fd(obj->btf) : -1; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 987fd92661d6..6c0168f8bba5 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -89,6 +89,9 @@ LIBBPF_API int bpf_object__load(struct bpf_object *obj); LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); + +struct btf; +LIBBPF_API struct btf *bpf_object__btf(struct bpf_object *obj); LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); LIBBPF_API struct bpf_program * diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 16f342c3d4bc..99dfa710c818 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -132,6 +132,7 @@ LIBBPF_0.0.2 { bpf_probe_prog_type; bpf_map__resize; bpf_map_lookup_elem_flags; + bpf_object__btf; bpf_object__find_map_fd_by_name; bpf_get_link_xdp_id; btf__dedup;
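In sketch form, the use-case from the commit message, combining the new accessor with the btf__get_map_kv_tids() API mentioned above (the map name and key/value types are illustrative):

	struct btf *btf = bpf_object__btf(obj);
	__u32 key_type_id, value_type_id;

	if (btf)	/* NULL if the object carries no BTF */
		btf__get_map_kv_tids(btf, "example_map",
				     sizeof(struct example_key),
				     sizeof(struct example_value),
				     &key_type_id, &value_type_id);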
From b251f9f63a3bea7864bf627960926d978e97814d Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Fri, 15 Feb 2019 09:51:35 -0800 Subject: [PATCH 34/36] bpf: make LWTUNNEL_BPF dependent on INET Lightweight tunnels are L3 constructs that are used with IP/IP6. For example, lwtunnel_xmit is called from ip_output.c and ip6_output.c only. Make the dependency explicit at least for LWT-BPF, as it now calls into IP routing. V2: added "Reported-by" below. Reported-by: Randy Dunlap Signed-off-by: Peter Oskolkov Acked-by: Randy Dunlap # build-tested Signed-off-by: Daniel Borkmann --- net/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/Kconfig b/net/Kconfig index 5cb9de1aaf88..62da6148e9f8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -403,7 +403,7 @@ config LWTUNNEL config LWTUNNEL_BPF bool "Execute BPF program as route nexthop action" - depends on LWTUNNEL + depends on LWTUNNEL && INET default y if LWTUNNEL=y ---help--- Allows to run BPF programs as a nexthop action following a route From 9d6b3584a7a9d7fa872cb17ceda7a3b208f441eb Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Fri, 15 Feb 2019 15:49:33 -0800 Subject: [PATCH 35/36] selftests: bpf: test_lwt_ip_encap: add negative tests. As requested by David Ahern: - add negative tests (no routes, explicitly unreachable destinations) to exercise error handling code paths; - do not exit on test failures, but instead print a summary of passed/failed tests at the end. Future patches will add TSO and VRF tests.
Signed-off-by: Peter Oskolkov Reviewed-by: David Ahern Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/test_lwt_ip_encap.sh | 111 ++++++++++++++---- 1 file changed, 88 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh index 4ca714e23ab0..612632c1425f 100755 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh @@ -38,8 +38,6 @@ # ping: SRC->[encap at veth2:ingress]->GRE:decap->DST # ping replies go DST->SRC directly -set -e # exit on error - if [[ $EUID -ne 0 ]]; then echo "This script must be run as root" echo "FAIL" @@ -76,8 +74,37 @@ readonly IPv6_GRE="fb10::1" readonly IPv6_SRC=$IPv6_1 readonly IPv6_DST=$IPv6_4 -setup() { -set -e # exit on error +TEST_STATUS=0 +TESTS_SUCCEEDED=0 +TESTS_FAILED=0 + +process_test_results() +{ + if [[ "${TEST_STATUS}" -eq 0 ]] ; then + echo "PASS" + TESTS_SUCCEEDED=$((TESTS_SUCCEEDED+1)) + else + echo "FAIL" + TESTS_FAILED=$((TESTS_FAILED+1)) + fi +} + +print_test_summary_and_exit() +{ + echo "passed tests: ${TESTS_SUCCEEDED}" + echo "failed tests: ${TESTS_FAILED}" + if [ "${TESTS_FAILED}" -eq "0" ] ; then + exit 0 + else + exit 1 + fi +} + +setup() +{ + set -e # exit on error + TEST_STATUS=0 + # create devices and namespaces ip netns add "${NS1}" ip netns add "${NS2}" @@ -178,7 +205,7 @@ set -e # exit on error # configure IPv4 GRE device in NS3, and a route to it via the "bottom" route ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255 ip -netns ${NS3} link set gre_dev up - ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev + ip -netns ${NS3} addr add ${IPv4_GRE} nodad dev gre_dev ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6} ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8} @@ -194,9 +221,13 @@ set -e # exit on error ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 + + sleep 1 # reduce flakiness + set +e } -cleanup() { +cleanup() +{ ip netns del ${NS1} 2> /dev/null ip netns del ${NS2} 2> /dev/null ip netns del ${NS3} 2> /dev/null @@ -204,12 +235,28 @@ cleanup() { trap cleanup EXIT -test_ping() { +remove_routes_to_gredev() +{ + ip -netns ${NS1} route del ${IPv4_GRE} dev veth5 + ip -netns ${NS2} route del ${IPv4_GRE} dev veth7 + ip -netns ${NS1} -6 route del ${IPv6_GRE}/128 dev veth5 + ip -netns ${NS2} -6 route del ${IPv6_GRE}/128 dev veth7 +} + +add_unreachable_routes_to_gredev() +{ + ip -netns ${NS1} route add unreachable ${IPv4_GRE}/32 + ip -netns ${NS2} route add unreachable ${IPv4_GRE}/32 + ip -netns ${NS1} -6 route add unreachable ${IPv6_GRE}/128 + ip -netns ${NS2} -6 route add unreachable ${IPv6_GRE}/128 +} + +test_ping() +{ local readonly PROTO=$1 local readonly EXPECTED=$2 local RET=0 - set +e if [ "${PROTO}" == "IPv4" ] ; then ip netns exec ${NS1} ping -c 1 -W 1 -I ${IPv4_SRC} ${IPv4_DST} 2>&1 > /dev/null RET=$? @@ -217,29 +264,26 @@ test_ping() { ip netns exec ${NS1} ping6 -c 1 -W 6 -I ${IPv6_SRC} ${IPv6_DST} 2>&1 > /dev/null RET=$? 
else - echo " test_ping: unknown PROTO: ${PROTO}" - exit 1 + echo " test_ping: unknown PROTO: ${PROTO}" + TEST_STATUS=1 fi - set -e if [ "0" != "${RET}" ]; then RET=1 fi if [ "${EXPECTED}" != "${RET}" ] ; then - echo "FAIL: test_ping: ${RET}" - exit 1 + echo " test_ping failed: expected: ${EXPECTED}; got ${RET}" + TEST_STATUS=1 fi } -test_egress() { +test_egress() +{ local readonly ENCAP=$1 echo "starting egress ${ENCAP} encap test" setup - # need to wait a bit for IPv6 to autoconf, otherwise - # ping6 sometimes fails with "unable to bind to address" - # by default, pings work test_ping IPv4 0 test_ping IPv6 0 @@ -258,16 +302,28 @@ test_egress() { ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1 ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj test_lwt_ip_encap.o sec encap_gre6 dev veth1 else - echo "FAIL: unknown encap ${ENCAP}" + echo " unknown encap ${ENCAP}" + TEST_STATUS=1 fi test_ping IPv4 0 test_ping IPv6 0 + # a negative test: remove routes to GRE devices: ping fails + remove_routes_to_gredev + test_ping IPv4 1 + test_ping IPv6 1 + + # another negative test + add_unreachable_routes_to_gredev + test_ping IPv4 1 + test_ping IPv6 1 + cleanup - echo "PASS" + process_test_results } -test_ingress() { +test_ingress() +{ local readonly ENCAP=$1 echo "starting ingress ${ENCAP} encap test" setup @@ -298,14 +354,23 @@ test_ingress() { test_ping IPv4 0 test_ping IPv6 0 + # a negative test: remove routes to GRE devices: ping fails + remove_routes_to_gredev + test_ping IPv4 1 + test_ping IPv6 1 + + # another negative test + add_unreachable_routes_to_gredev + test_ping IPv4 1 + test_ping IPv6 1 + cleanup - echo "PASS" + process_test_results } test_egress IPv4 test_egress IPv6 - test_ingress IPv4 test_ingress IPv6 -echo "all tests passed" +print_test_summary_and_exit From 5aab392c55c96f9bb26d9294f965f156a87ee81c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 15 Feb 2019 19:52:18 -0800 Subject: [PATCH 36/36] tools/libbpf: support bigger BTF data sizes While it's understandable why the kernel limits the number of BTF types to 65535 and the size of the string section to 64KB, for libbpf as a user-space library these limits are too restrictive. E.g., pahole converting DWARF to BTF type information for the Linux kernel generates more than 3 million BTF types and more than 3MB of strings, before deduplication. So to allow btf__dedup() to do its work, we need to be able to load bigger BTF sections using btf__new(). Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- tools/lib/bpf/btf.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index ade1c32fb083..68b50e9bbde1 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -16,7 +16,8 @@ #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ?
(a) : (b)) -#define BTF_MAX_NR_TYPES 65535 +#define BTF_MAX_NR_TYPES 0x7fffffff +#define BTF_MAX_STR_OFFSET 0x7fffffff #define IS_MODIFIER(k) (((k) == BTF_KIND_TYPEDEF) || \ ((k) == BTF_KIND_VOLATILE) || \ @@ -175,7 +176,7 @@ static int btf_parse_str_sec(struct btf *btf) const char *start = btf->nohdr_data + hdr->str_off; const char *end = start + btf->hdr->str_len; - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || start[0] || end[-1]) { pr_debug("Invalid BTF string section\n"); return -EINVAL; @@ -1882,7 +1883,7 @@ static int btf_dedup_prim_types(struct btf_dedup *d) */ static inline bool is_type_mapped(struct btf_dedup *d, uint32_t type_id) { - return d->map[type_id] <= BTF_MAX_TYPE; + return d->map[type_id] <= BTF_MAX_NR_TYPES; } /* @@ -2033,7 +2034,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, canon_id = resolve_fwd_id(d, canon_id); hypot_type_id = d->hypot_map[canon_id]; - if (hypot_type_id <= BTF_MAX_TYPE) + if (hypot_type_id <= BTF_MAX_NR_TYPES) return hypot_type_id == cand_id; if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) @@ -2252,7 +2253,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) __u32 h; /* already deduped or is in process of deduping (loop detected) */ - if (d->map[type_id] <= BTF_MAX_TYPE) + if (d->map[type_id] <= BTF_MAX_NR_TYPES) return 0; t = d->btf->types[type_id]; @@ -2329,7 +2330,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) if (d->map[type_id] == BTF_IN_PROGRESS_ID) return -ELOOP; - if (d->map[type_id] <= BTF_MAX_TYPE) + if (d->map[type_id] <= BTF_MAX_NR_TYPES) return resolve_type_id(d, type_id); t = d->btf->types[type_id]; @@ -2509,7 +2510,7 @@ static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) resolved_type_id = resolve_type_id(d, type_id); new_type_id = d->hypot_map[resolved_type_id]; - if (new_type_id > BTF_MAX_TYPE) + if (new_type_id > BTF_MAX_NR_TYPES) return -EINVAL; return new_type_id; }
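In sketch form, the pahole-style flow that the relaxed limits unblock (the raw BTF buffer is illustrative; in this version of libbpf, btf__new() returns an ERR_PTR-encoded pointer on failure, hence libbpf_get_error()):

	struct btf *btf = btf__new(raw_btf_data, raw_btf_size);

	if (!libbpf_get_error(btf)) {
		btf__dedup(btf, NULL, NULL);	/* user-space dedup, no 64KB/65535 caps */
		btf__load(btf);			/* kernel-side limits still apply here */
		btf__free(btf);
	}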