Commit cb86d0f8 authored by Daniel Borkmann
Browse files

Merge branch 'bpf-per-cpu-cgroup-storage'



Roman Gushchin says:

====================
This patchset implements per-cpu cgroup local storage and provides
an example of how per-cpu and shared cgroup local storage can be used
for efficient accounting of network traffic.

v4->v3:
  1) incorporated Alexei's feedback

v3->v2:
  1) incorporated Song's feedback
  2) rebased on top of current bpf-next

v2->v1:
  1) added a selftest implementing network counters
  2) added a missing free() in cgroup local storage selftest
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 5bf7a60b 371e4fcc
Loading
Loading
Loading
Loading
+43 −12
Original line number Original line Diff line number Diff line
@@ -2,6 +2,7 @@
#ifndef _BPF_CGROUP_H
#ifndef _BPF_CGROUP_H
#define _BPF_CGROUP_H
#define _BPF_CGROUP_H


#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/errno.h>
#include <linux/jump_label.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
#include <linux/percpu.h>
@@ -22,7 +23,11 @@ struct bpf_cgroup_storage;
extern struct static_key_false cgroup_bpf_enabled_key;
extern struct static_key_false cgroup_bpf_enabled_key;
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)


DECLARE_PER_CPU(void*, bpf_cgroup_storage);
DECLARE_PER_CPU(struct bpf_cgroup_storage*,
		bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]);

/*
 * Iterate @stype over every cgroup storage type, counting from 0 up to
 * (but not including) MAX_BPF_CGROUP_STORAGE_TYPE. @stype must be an
 * assignable variable supplied by the caller; the statement that follows
 * the macro invocation becomes the loop body.
 */
#define for_each_cgroup_storage_type(stype) \
	for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++)


struct bpf_cgroup_storage_map;
struct bpf_cgroup_storage_map;


@@ -32,7 +37,10 @@ struct bpf_storage_buffer {
};
};


struct bpf_cgroup_storage {
struct bpf_cgroup_storage {
	union {
		struct bpf_storage_buffer *buf;
		struct bpf_storage_buffer *buf;
		void __percpu *percpu_buf;
	};
	struct bpf_cgroup_storage_map *map;
	struct bpf_cgroup_storage_map *map;
	struct bpf_cgroup_storage_key key;
	struct bpf_cgroup_storage_key key;
	struct list_head list;
	struct list_head list;
@@ -43,7 +51,7 @@ struct bpf_cgroup_storage {
struct bpf_prog_list {
struct bpf_prog_list {
	struct list_head node;
	struct list_head node;
	struct bpf_prog *prog;
	struct bpf_prog *prog;
	struct bpf_cgroup_storage *storage;
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
};
};


struct bpf_prog_array;
struct bpf_prog_array;
@@ -101,18 +109,26 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
				      short access, enum bpf_attach_type type);
				      short access, enum bpf_attach_type type);


static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage)
static inline enum bpf_cgroup_storage_type cgroup_storage_type(
	struct bpf_map *map)
{
{
	struct bpf_storage_buffer *buf;
	if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
		return BPF_CGROUP_STORAGE_PERCPU;

	return BPF_CGROUP_STORAGE_SHARED;
}


	if (!storage)
static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage
		return;
					  *storage[MAX_BPF_CGROUP_STORAGE_TYPE])
{
	enum bpf_cgroup_storage_type stype;


	buf = READ_ONCE(storage->buf);
	for_each_cgroup_storage_type(stype)
	this_cpu_write(bpf_cgroup_storage, &buf->data[0]);
		this_cpu_write(bpf_cgroup_storage[stype], storage[stype]);
}
}


struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
					enum bpf_cgroup_storage_type stype);
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
			     struct cgroup *cgroup,
			     struct cgroup *cgroup,
@@ -121,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);


int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
				     void *value, u64 flags);

/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
({									      \
({									      \
@@ -265,15 +285,24 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
	return -EINVAL;
	return -EINVAL;
}
}


static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {}
static inline void bpf_cgroup_storage_set(
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {}
static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
					    struct bpf_map *map) { return 0; }
					    struct bpf_map *map) { return 0; }
static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
					      struct bpf_map *map) {}
					      struct bpf_map *map) {}
static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
	struct bpf_prog *prog) { return 0; }
	struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
static inline void bpf_cgroup_storage_free(
static inline void bpf_cgroup_storage_free(
	struct bpf_cgroup_storage *storage) {}
	struct bpf_cgroup_storage *storage) {}
/*
 * Stub variant of bpf_percpu_cgroup_storage_copy(): ignores all of its
 * arguments and reports success (0) without copying anything.
 * NOTE(review): presumably the branch built without CONFIG_CGROUP_BPF --
 * confirm against the enclosing #if/#else.
 */
static inline int
bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value)
{
	return 0;
}
/*
 * Stub variant of bpf_percpu_cgroup_storage_update(): ignores the map,
 * key, value and flags and reports success (0) without storing anything.
 * NOTE(review): presumably the branch built without CONFIG_CGROUP_BPF --
 * confirm against the enclosing #if/#else.
 */
static inline int
bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value,
				 u64 flags)
{
	return 0;
}


#define cgroup_bpf_enabled (0)
#define cgroup_bpf_enabled (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
@@ -293,6 +322,8 @@ static inline void bpf_cgroup_storage_free(
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })


/*
 * Stub variant: expands to a loop whose condition is constant false, so
 * the body that follows never runs and @stype is never read or written.
 */
#define for_each_cgroup_storage_type(stype) for (; false; )

#endif /* CONFIG_CGROUP_BPF */
#endif /* CONFIG_CGROUP_BPF */


#endif /* _BPF_CGROUP_H */
#endif /* _BPF_CGROUP_H */
+10 −2
Original line number Original line Diff line number Diff line
@@ -272,6 +272,14 @@ struct bpf_prog_offload {
	u32			jited_len;
	u32			jited_len;
};
};


/*
 * Kinds of cgroup local storage. The values index per-type arrays that
 * are sized by MAX_BPF_CGROUP_STORAGE_TYPE, so they must stay dense and
 * start at zero.
 */
enum bpf_cgroup_storage_type {
	BPF_CGROUP_STORAGE_SHARED = 0,	/* any map that is not the per-cpu kind */
	BPF_CGROUP_STORAGE_PERCPU,	/* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE maps */
	__BPF_CGROUP_STORAGE_MAX	/* number of real types; keep last */
};

/* Number of storage types; used as the length of per-type arrays. */
#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX

struct bpf_prog_aux {
struct bpf_prog_aux {
	atomic_t refcnt;
	atomic_t refcnt;
	u32 used_map_cnt;
	u32 used_map_cnt;
@@ -289,7 +297,7 @@ struct bpf_prog_aux {
	struct bpf_prog *prog;
	struct bpf_prog *prog;
	struct user_struct *user;
	struct user_struct *user;
	u64 load_time; /* ns since boottime */
	u64 load_time; /* ns since boottime */
	struct bpf_map *cgroup_storage;
	struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
	char name[BPF_OBJ_NAME_LEN];
	char name[BPF_OBJ_NAME_LEN];
#ifdef CONFIG_SECURITY
#ifdef CONFIG_SECURITY
	void *security;
	void *security;
@@ -358,7 +366,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 */
 */
struct bpf_prog_array_item {
struct bpf_prog_array_item {
	struct bpf_prog *prog;
	struct bpf_prog *prog;
	struct bpf_cgroup_storage *cgroup_storage;
	struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE];
};
};


struct bpf_prog_array {
struct bpf_prog_array {
+1 −0
Original line number Original line Diff line number Diff line
@@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
#endif
#endif
#ifdef CONFIG_CGROUP_BPF
#ifdef CONFIG_CGROUP_BPF
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
#endif
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
+1 −0
Original line number Original line Diff line number Diff line
@@ -127,6 +127,7 @@ enum bpf_map_type {
	BPF_MAP_TYPE_SOCKHASH,
	BPF_MAP_TYPE_SOCKHASH,
	BPF_MAP_TYPE_CGROUP_STORAGE,
	BPF_MAP_TYPE_CGROUP_STORAGE,
	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
	BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
	BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
};
};


enum bpf_prog_type {
enum bpf_prog_type {
+52 −22
Original line number Original line Diff line number Diff line
@@ -25,6 +25,7 @@ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 */
 */
void cgroup_bpf_put(struct cgroup *cgrp)
void cgroup_bpf_put(struct cgroup *cgrp)
{
{
	enum bpf_cgroup_storage_type stype;
	unsigned int type;
	unsigned int type;


	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
@@ -34,8 +35,10 @@ void cgroup_bpf_put(struct cgroup *cgrp)
		list_for_each_entry_safe(pl, tmp, progs, node) {
		list_for_each_entry_safe(pl, tmp, progs, node) {
			list_del(&pl->node);
			list_del(&pl->node);
			bpf_prog_put(pl->prog);
			bpf_prog_put(pl->prog);
			bpf_cgroup_storage_unlink(pl->storage);
			for_each_cgroup_storage_type(stype) {
			bpf_cgroup_storage_free(pl->storage);
				bpf_cgroup_storage_unlink(pl->storage[stype]);
				bpf_cgroup_storage_free(pl->storage[stype]);
			}
			kfree(pl);
			kfree(pl);
			static_branch_dec(&cgroup_bpf_enabled_key);
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
		}
@@ -97,6 +100,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   enum bpf_attach_type type,
				   struct bpf_prog_array __rcu **array)
				   struct bpf_prog_array __rcu **array)
{
{
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_array *progs;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct bpf_prog_list *pl;
	struct cgroup *p = cgrp;
	struct cgroup *p = cgrp;
@@ -125,7 +129,9 @@ static int compute_effective_progs(struct cgroup *cgrp,
				continue;
				continue;


			progs->items[cnt].prog = pl->prog;
			progs->items[cnt].prog = pl->prog;
			progs->items[cnt].cgroup_storage = pl->storage;
			for_each_cgroup_storage_type(stype)
				progs->items[cnt].cgroup_storage[stype] =
					pl->storage[stype];
			cnt++;
			cnt++;
		}
		}
	} while ((p = cgroup_parent(p)));
	} while ((p = cgroup_parent(p)));
@@ -232,7 +238,9 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
{
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_prog *old_prog = NULL;
	struct bpf_cgroup_storage *storage, *old_storage = NULL;
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_list *pl;
	struct bpf_prog_list *pl;
	bool pl_was_allocated;
	bool pl_was_allocated;
	int err;
	int err;
@@ -254,34 +262,44 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
		return -E2BIG;
		return -E2BIG;


	storage = bpf_cgroup_storage_alloc(prog);
	for_each_cgroup_storage_type(stype) {
	if (IS_ERR(storage))
		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
		if (IS_ERR(storage[stype])) {
			storage[stype] = NULL;
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
			return -ENOMEM;
		}
	}


	if (flags & BPF_F_ALLOW_MULTI) {
	if (flags & BPF_F_ALLOW_MULTI) {
		list_for_each_entry(pl, progs, node) {
		list_for_each_entry(pl, progs, node) {
			if (pl->prog == prog) {
			if (pl->prog == prog) {
				/* disallow attaching the same prog twice */
				/* disallow attaching the same prog twice */
				bpf_cgroup_storage_free(storage);
				for_each_cgroup_storage_type(stype)
					bpf_cgroup_storage_free(storage[stype]);
				return -EINVAL;
				return -EINVAL;
			}
			}
		}
		}


		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl) {
		if (!pl) {
			bpf_cgroup_storage_free(storage);
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
			return -ENOMEM;
		}
		}


		pl_was_allocated = true;
		pl_was_allocated = true;
		pl->prog = prog;
		pl->prog = prog;
		pl->storage = storage;
		for_each_cgroup_storage_type(stype)
			pl->storage[stype] = storage[stype];
		list_add_tail(&pl->node, progs);
		list_add_tail(&pl->node, progs);
	} else {
	} else {
		if (list_empty(progs)) {
		if (list_empty(progs)) {
			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
			if (!pl) {
			if (!pl) {
				bpf_cgroup_storage_free(storage);
				for_each_cgroup_storage_type(stype)
					bpf_cgroup_storage_free(storage[stype]);
				return -ENOMEM;
				return -ENOMEM;
			}
			}
			pl_was_allocated = true;
			pl_was_allocated = true;
@@ -289,12 +307,15 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
		} else {
		} else {
			pl = list_first_entry(progs, typeof(*pl), node);
			pl = list_first_entry(progs, typeof(*pl), node);
			old_prog = pl->prog;
			old_prog = pl->prog;
			old_storage = pl->storage;
			for_each_cgroup_storage_type(stype) {
			bpf_cgroup_storage_unlink(old_storage);
				old_storage[stype] = pl->storage[stype];
				bpf_cgroup_storage_unlink(old_storage[stype]);
			}
			pl_was_allocated = false;
			pl_was_allocated = false;
		}
		}
		pl->prog = prog;
		pl->prog = prog;
		pl->storage = storage;
		for_each_cgroup_storage_type(stype)
			pl->storage[stype] = storage[stype];
	}
	}


	cgrp->bpf.flags[type] = flags;
	cgrp->bpf.flags[type] = flags;
@@ -304,21 +325,27 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
		goto cleanup;
		goto cleanup;


	static_branch_inc(&cgroup_bpf_enabled_key);
	static_branch_inc(&cgroup_bpf_enabled_key);
	if (old_storage)
	for_each_cgroup_storage_type(stype) {
		bpf_cgroup_storage_free(old_storage);
		if (!old_storage[stype])
			continue;
		bpf_cgroup_storage_free(old_storage[stype]);
	}
	if (old_prog) {
	if (old_prog) {
		bpf_prog_put(old_prog);
		bpf_prog_put(old_prog);
		static_branch_dec(&cgroup_bpf_enabled_key);
		static_branch_dec(&cgroup_bpf_enabled_key);
	}
	}
	bpf_cgroup_storage_link(storage, cgrp, type);
	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_link(storage[stype], cgrp, type);
	return 0;
	return 0;


cleanup:
cleanup:
	/* and cleanup the prog list */
	/* and cleanup the prog list */
	pl->prog = old_prog;
	pl->prog = old_prog;
	bpf_cgroup_storage_free(pl->storage);
	for_each_cgroup_storage_type(stype) {
	pl->storage = old_storage;
		bpf_cgroup_storage_free(pl->storage[stype]);
	bpf_cgroup_storage_link(old_storage, cgrp, type);
		pl->storage[stype] = old_storage[stype];
		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
	}
	if (pl_was_allocated) {
	if (pl_was_allocated) {
		list_del(&pl->node);
		list_del(&pl->node);
		kfree(pl);
		kfree(pl);
@@ -339,6 +366,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type, u32 unused_flags)
			enum bpf_attach_type type, u32 unused_flags)
{
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct list_head *progs = &cgrp->bpf.progs[type];
	enum bpf_cgroup_storage_type stype;
	u32 flags = cgrp->bpf.flags[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_prog *old_prog = NULL;
	struct bpf_prog_list *pl;
	struct bpf_prog_list *pl;
@@ -385,8 +413,10 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,


	/* now can actually delete it from this cgroup list */
	/* now can actually delete it from this cgroup list */
	list_del(&pl->node);
	list_del(&pl->node);
	bpf_cgroup_storage_unlink(pl->storage);
	for_each_cgroup_storage_type(stype) {
	bpf_cgroup_storage_free(pl->storage);
		bpf_cgroup_storage_unlink(pl->storage[stype]);
		bpf_cgroup_storage_free(pl->storage[stype]);
	}
	kfree(pl);
	kfree(pl);
	if (list_empty(progs))
	if (list_empty(progs))
		/* last program was detached, reset flags to zero */
		/* last program was detached, reset flags to zero */
Loading