Skip to content
Snippets Groups Projects
sock.c 89.6 KiB
Newer Older
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) ||
		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
Linus Torvalds's avatar
Linus Torvalds committed
			break;
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
Linus Torvalds's avatar
Linus Torvalds committed
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
Linus Torvalds's avatar
Linus Torvalds committed
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
Linus Torvalds's avatar
Linus Torvalds committed
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, valbool, optname);
	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		ret = sock_set_timestamping(sk, optname, val);
	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		if (sock->ops->set_rcvlowat)
			ret = sock->ops->set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
	case SO_RCVTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
				       optlen, optname == SO_RCVTIMEO_OLD);
	case SO_SNDTIMEO_NEW:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
				       optlen, optname == SO_SNDTIMEO_OLD);
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_ATTACH_FILTER: {
		struct sock_fprog fprog;
		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
			ret = sk_attach_filter(&fprog, sk);
		break;
	case SO_ATTACH_BPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
	case SO_ATTACH_REUSEPORT_CBPF: {
		struct sock_fprog fprog;
		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
			ret = sk_reuseport_attach_filter(&fprog, sk);
		break;
	case SO_ATTACH_REUSEPORT_EBPF:
		ret = -EINVAL;
		if (optlen == sizeof(u32)) {
			u32 ufd;

			ret = -EFAULT;
			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
				break;

			ret = sk_reuseport_attach_bpf(ufd, sk);
		}
		break;

	case SO_DETACH_REUSEPORT_BPF:
		ret = reuseport_detach_prog(sk);
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_LOCK_FILTER:
		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
			ret = -EPERM;
		else
			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {

		__sock_set_mark(sk, val);
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			ret = sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	case SO_SELECT_ERR_QUEUE:
		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		/* allow unprivileged users to decrease the value */
		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else {
			if (val < 0)
				ret = -EINVAL;
			else
				sk->sk_ll_usec = val;
		}
		break;
	case SO_PREFER_BUSY_POLL:
		if (valbool && !capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
		break;
	case SO_BUSY_POLL_BUDGET:
		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
		} else {
			if (val < 0 || val > U16_MAX)
				ret = -EINVAL;
			else
				WRITE_ONCE(sk->sk_busy_poll_budget, val);
		}
		break;

	case SO_MAX_PACING_RATE:
		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;

		if (sizeof(ulval) != sizeof(val) &&
		    optlen >= sizeof(ulval) &&
		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
			cmpxchg(&sk->sk_pacing_status,
				SK_PACING_NONE,
				SK_PACING_NEEDED);
		sk->sk_max_pacing_rate = ulval;
		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
	case SO_INCOMING_CPU:
		WRITE_ONCE(sk->sk_incoming_cpu, val);
	case SO_CNX_ADVICE:
		if (val == 1)
			dst_negative_advice(sk);
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
Willem de Bruijn's avatar
Willem de Bruijn committed
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
		}
		if (optlen != sizeof(struct sock_txtime)) {
		} else if (copy_from_sockptr(&sk_txtime, optval,
			   sizeof(struct sock_txtime))) {
			ret = -EFAULT;
		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
			ret = -EINVAL;
			break;
		}
		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
		 * scheduler has enough safe guards.
		 */
		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		sock_valbool_flag(sk, SOCK_TXTIME, true);
		sk->sk_clockid = sk_txtime.clockid;
		sk->sk_txtime_deadline_mode =
			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
		sk->sk_txtime_report_errors =
			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
	case SO_BINDTOIFINDEX:
		ret = sock_bindtoindex_locked(sk, val);
	default:
		ret = -ENOPROTOOPT;
		break;
Linus Torvalds's avatar
Linus Torvalds committed
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
stephen hemminger's avatar
stephen hemminger committed
/*
 * cred_to_ucred - fill a struct ucred from a pid and an optional cred
 * @pid: pid to report; translated with pid_vnr()
 * @cred: credentials to report, may be NULL
 * @ucred: output structure
 *
 * uid and gid are preset to -1 and only overwritten when @cred is
 * non-NULL, in which case the effective uid/gid are mapped into the
 * caller's user namespace.
 */
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
			  struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}

/*
 * groups_to_user - copy a group list to a user-space gid_t array
 * @dst: user buffer receiving one gid_t per group
 * @src: group list to export
 *
 * Each gid is mapped into the caller's user namespace before being
 * stored. Returns 0 on success, or -EFAULT as soon as one store to
 * @dst fails.
 */
static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
	struct user_namespace *ns = current_user_ns();
	int idx = 0;

	while (idx < src->ngroups) {
		if (put_user(from_kgid_munged(ns, src->gid[idx]), dst + idx))
			return -EFAULT;
		idx++;
	}

	return 0;
}

Linus Torvalds's avatar
Linus Torvalds committed
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct  __kernel_sock_timeval stm;
Linus Torvalds's avatar
Linus Torvalds committed
	} v;
	int lv = sizeof(int);
Linus Torvalds's avatar
Linus Torvalds committed
	int len;
	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
Linus Torvalds's avatar
Linus Torvalds committed
		return -EINVAL;
	memset(&v, 0, sizeof(v));
	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
Eric Dumazet's avatar
Eric Dumazet committed
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

Tom Herbert's avatar
Tom Herbert committed
	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
Eric Dumazet's avatar
Eric Dumazet committed
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
Eric Dumazet's avatar
Eric Dumazet committed
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
Eric Dumazet's avatar
Eric Dumazet committed
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
	case SO_TIMESTAMPING_OLD:
		v.val = sk->sk_tsflags;
	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_SNDLOWAT:
		v.val = 1;
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}
Linus Torvalds's avatar
Linus Torvalds committed

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);
Linus Torvalds's avatar
Linus Torvalds committed

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

Eric Dumazet's avatar
Eric Dumazet committed
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
	case SO_WIFI_STATUS:
Eric Dumazet's avatar
Eric Dumazet committed
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
Eric Dumazet's avatar
Eric Dumazet committed
		v.val = sock_flag(sk, SOCK_NOFCS);
	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;
	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
	case SO_MAX_PACING_RATE:
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	case SO_NETNS_COOKIE:
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
Linus Torvalds's avatar
Linus Torvalds committed
	}
Linus Torvalds's avatar
Linus Torvalds committed
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
	/* Bytes in front of the do-not-copy window. */
	const size_t head_len = offsetof(struct sock, sk_dontcopy_begin);
	/* Offset at which copying resumes after the window. */
	const size_t tail_off = offsetof(struct sock, sk_dontcopy_end);
#ifdef CONFIG_SECURITY_NETWORK
	/* The second memcpy() below overwrites nsk->sk_security; keep it. */
	void *saved_security = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));

	/* Head: everything up to sk_dontcopy_begin. */
	memcpy(nsk, osk, head_len);

	/* Tail: from sk_dontcopy_end to the end of the protocol's object. */
	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - tail_off);

#ifdef CONFIG_SECURITY_NETWORK
	/* Put nsk's own security pointer back before cloning the state. */
	nsk->sk_security = saved_security;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * sk_prot_alloc - allocate the memory backing a new sock
 * @prot: protocol descriptor; supplies the slab cache (or obj_size for
 *        the kmalloc fallback) and the owning module
 * @priority: allocation flags
 * @family: protocol family, handed to security_sk_alloc()
 *
 * On success the returned sock has its security blob allocated and a
 * reference held on @prot->owner. Returns NULL when the memory
 * allocation, security_sk_alloc() or try_module_get() fails; everything
 * obtained up to that point is released again.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/* __GFP_ZERO is deliberately not passed to the slab allocator. */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		/*
		 * NOTE(review): the guard for this call is missing from the
		 * copy under review (the call was left over-indented with no
		 * condition). Restored as want_init_on_alloc(); confirm
		 * against the authoritative tree.
		 */
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	/* Success, or NULL from the kmalloc() path. */
	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	cgroup_sk_free(&sk->sk_cgrp_data);
	mem_cgroup_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
Linus Torvalds's avatar
Linus Torvalds committed
/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
Linus Torvalds's avatar
Linus Torvalds committed
{
Linus Torvalds's avatar
Linus Torvalds committed

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
Linus Torvalds's avatar
Linus Torvalds committed
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		if (likely(sk->sk_net_refcnt)) {
		refcount_set(&sk->sk_wmem_alloc, 1);
		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
Linus Torvalds's avatar
Linus Torvalds committed
	}
Linus Torvalds's avatar
Linus Torvalds committed
}
EXPORT_SYMBOL(sk_alloc);
Linus Torvalds's avatar
Linus Torvalds committed

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
Linus Torvalds's avatar
Linus Torvalds committed
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
Linus Torvalds's avatar
Linus Torvalds committed
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
Linus Torvalds's avatar
Linus Torvalds committed
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
Linus Torvalds's avatar
Linus Torvalds committed

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

Linus Torvalds's avatar
Linus Torvalds committed
	if (atomic_read(&sk->sk_omem_alloc))
Joe Perches's avatar
Joe Perches committed
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));
Linus Torvalds's avatar
Linus Torvalds committed

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
Linus Torvalds's avatar
Linus Torvalds committed
}
/*
 * sk_destruct - destroy a sock, immediately or via call_rcu()
 * @sk: sock to destroy
 *
 * Destruction goes through call_rcu() when the sock carries
 * SOCK_RCU_FREE or still has a reuseport callback attached (which is
 * detached here first); otherwise __sk_destruct() runs synchronously.
 */
void sk_destruct(struct sock *sk)
{
	bool defer_to_rcu = sock_flag(sk, SOCK_RCU_FREE);

	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
		reuseport_detach_sock(sk);
		defer_to_rcu = true;
	}

	if (!defer_to_rcu) {
		__sk_destruct(&sk->sk_rcu);
		return;
	}

	call_rcu(&sk->sk_rcu, __sk_destruct);
}

/*
 * __sk_free - final release step for a sock
 * @sk: sock being released
 *
 * Decrements the per-netns in-use counter for sockets flagged
 * sk_net_refcnt. Such a socket is handed to
 * sock_diag_broadcast_destroy() when
 * sock_diag_has_destroy_listeners() reports listeners; in every other
 * case sk_destruct() is called directly.
 */
static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) {
		sock_diag_broadcast_destroy(sk);
		return;
	}

	sk_destruct(sk);
}

/*
 * sk_free - drop the base sk_wmem_alloc reference of a sock
 * @sk: sock to release
 *
 * __sk_free() is called only when this drop takes sk_wmem_alloc to
 * zero.
 */
void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
Linus Torvalds's avatar
Linus Torvalds committed

/*
 * sk_init_common - initialise the queues and callback lock of a sock
 * @sk: sock to initialise; sk->sk_family must already be set because
 *      it indexes the per-family lockdep key and name tables
 *
 * Initialises the receive, write and error skb queues and the callback
 * rwlock, and gives each of the four locks its per-address-family
 * lockdep class and name.
 */
static void sk_init_common(struct sock *sk)
{
	const int family = sk->sk_family;

	/* Receive queue and its lock class. */
	skb_queue_head_init(&sk->sk_receive_queue);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + family,
			af_family_rlock_key_strings[family]);

	/* Write queue and its lock class. */
	skb_queue_head_init(&sk->sk_write_queue);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + family,
			af_family_wlock_key_strings[family]);

	/* Error queue and its lock class. */
	skb_queue_head_init(&sk->sk_error_queue);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + family,
			af_family_elock_key_strings[family]);

	/* Callback rwlock and its lock class. */
	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + family,
			af_family_clock_key_strings[family]);
}

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	sock_copy(newsk, sk);
	newsk->sk_prot_creator = prot;
	/* SANITY */
	if (likely(newsk->sk_net_refcnt))
		get_net(sock_net(newsk));
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;
	atomic_set(&newsk->sk_rmem_alloc, 0);
	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);
	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);
	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);
	sock_reset_flag(newsk, SOCK_DONE);
	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;
	cgroup_sk_clone(&newsk->sk_cgrp_data);
	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.