Newer
Older
if (skb_get_dst_pending_confirm(skb)) {
struct sock *sk = skb->sk;
unsigned long now = jiffies;
/* avoid dirtying neighbour */
if (READ_ONCE(n->confirmed) != now)
WRITE_ONCE(n->confirmed, now);
if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
hannes@stressinduktion.org
committed
bool sk_mc_loop(struct sock *sk);
static inline bool sk_can_gso(const struct sock *sk)
{
return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}
void sk_setup_caps(struct sock *sk, struct dst_entry *dst);
static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags)
{
sk->sk_route_nocaps |= flags;
sk->sk_route_caps &= ~flags;
}
static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, char *to,
int copy, int offset)
{
if (skb->ip_summed == CHECKSUM_NONE) {
Al Viro
committed
if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
skb->csum = csum_block_add(skb->csum, csum, offset);
} else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
Al Viro
committed
if (!copy_from_iter_full_nocache(to, copy, from))
Al Viro
committed
} else if (!copy_from_iter_full(to, copy, from))
return -EFAULT;
return 0;
}
static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, int copy)
int err, offset = skb->len;
err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy),
copy, offset);
__skb_trim(skb, offset);
return err;
}
static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
struct sk_buff *skb,
struct page *page,
int off, int copy)
{
int err;
err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off,
copy, skb->len);
if (err)
return err;
skb->len += copy;
skb->data_len += copy;
skb->truesize += copy;
sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
return 0;
}
/**
* sk_wmem_alloc_get - returns write allocations
* @sk: socket
*
* Return: sk_wmem_alloc minus initial offset of one
*/
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
return refcount_read(&sk->sk_wmem_alloc) - 1;
}
/**
* sk_rmem_alloc_get - returns read allocations
* @sk: socket
*
*/
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
return atomic_read(&sk->sk_rmem_alloc);
}
/**
* sk_has_allocations - check if allocations are outstanding
* @sk: socket
*
* Return: true if socket has write or read allocations
static inline bool sk_has_allocations(const struct sock *sk)
{
return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}
* skwq_has_sleeper - check if there are any waiting processes
* Return: true if socket_wq has waiting processes
* The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
* barrier call. They were added due to the race found within the tcp code.
*
* Consider following tcp code paths::
* CPU1 CPU2
* sys_select receive packet
* ... ...
* __add_wait_queue update tp->rcv_nxt
* ... ...
* tp->rcv_nxt check sock_def_readable
* ... {
* schedule rcu_read_lock();
* wq = rcu_dereference(sk->sk_wq);
* if (wq && waitqueue_active(&wq->wait))
* wake_up_interruptible(&wq->wait)
* ...
* }
*
* The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
* in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1
* could then endup calling schedule and sleep forever if there are no more
* data on the socket.
static inline bool skwq_has_sleeper(struct socket_wq *wq)
return wq && wq_has_sleeper(&wq->wait);
}
/**
* sock_poll_wait - place memory barrier behind the poll_wait call.
* @filp: file
* @p: poll_table
*
* See the comments in the wq_has_sleeper function.
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
poll_table *p)
if (!poll_does_not_wait(p)) {
/* We need to be sure we are in sync with the
* socket flags modification.
*
* This memory barrier is paired in the wq_has_sleeper.
smp_mb();
}
}
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
if (sk->sk_txhash) {
skb->l4_hash = 1;
skb->hash = sk->sk_txhash;
}
}
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);
* Queue a received datagram if it will fit. Stream and sequenced
* protocols can't normally use this as they need to fit buffers in
* and play with them.
*
* Inlined as it's very short and called for pretty much every
* packet ever received.
*/
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
skb->sk = sk;
skb->destructor = sock_rfree;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
sk_mem_charge(sk, skb->truesize);
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
unsigned long expires);
void sk_stop_timer(struct sock *sk, struct timer_list *timer);
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb, unsigned int flags,
void (*destructor)(struct sock *sk,
struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);
/*
* Recover an error report and clear atomically
*/
int err;
if (likely(!sk->sk_err))
return 0;
err = xchg(&sk->sk_err, 0);
return -err;
}
static inline unsigned long sock_wspace(struct sock *sk)
{
int amt = 0;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);
/* Note:
* We use sk->sk_wq_raw, from contexts knowing this
* pointer is not NULL and cannot disappear/change.
*/
static inline void sk_set_bit(int nr, struct sock *sk)
if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
!sock_flag(sk, SOCK_FASYNC))
set_bit(nr, &sk->sk_wq_raw->flags);
}
static inline void sk_clear_bit(int nr, struct sock *sk)
{
if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
!sock_flag(sk, SOCK_FASYNC))
clear_bit(nr, &sk->sk_wq_raw->flags);
static inline void sk_wake_async(const struct sock *sk, int how, int band)
if (sock_flag(sk, SOCK_FASYNC)) {
rcu_read_lock();
sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
rcu_read_unlock();
}
/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
* need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak.
* Note: for send buffers, TCP works better if we can build two skbs at
* minimum.
#define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))
#define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2)
#define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
u32 val;
if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
return;
val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
bool force_schedule);
/**
* sk_page_frag - return an appropriate page_frag
* @sk: socket
*
* Use the per task page_frag instead of the per socket one for
* optimization when we know that we're in the normal context and owns
* everything that's associated with %current.
*
* gfpflags_allow_blocking() isn't enough here as direct reclaim may nest
* inside other socket operations and end up recursing into sk_page_frag()
* while it's already in use.
*
* Return: a per task page_frag if context allows that,
* otherwise a per socket one.
*/
static inline struct page_frag *sk_page_frag(struct sock *sk)
if (gfpflags_normal_context(sk->sk_allocation))
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
/*
* Default write policy as shown to user space via poll/select/SIGIO
*/
static inline bool sock_writeable(const struct sock *sk)
return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
return noblock ? 0 : sk->sk_rcvtimeo;
}
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
return noblock ? 0 : sk->sk_sndtimeo;
}
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len);
return v ?: 1;
}
/* Alas, with timeout socket operations are not restartable.
* Compare this to poll().
*/
static inline int sock_intr_errno(long timeo)
{
return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
}
struct sock_skb_cb {
u32 dropcount;
};
/* Store sock_skb_cb at the end of skb->cb[] so protocol families
* using skb->cb[] would keep using it directly and utilize its
* alignement guarantee.
*/
#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
sizeof(struct sock_skb_cb)))
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
SOCK_SKB_CB_OFFSET))
Eyal Birger
committed
#define sock_skb_cb_check_size(size) \
BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)
Eyal Birger
committed
static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
atomic_read(&sk->sk_drops) : 0;
static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
{
int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
atomic_add(segs, &sk->sk_drops);
}
static inline ktime_t sock_read_timestamp(struct sock *sk)
{
#if BITS_PER_LONG==32
unsigned int seq;
ktime_t kt;
do {
seq = read_seqbegin(&sk->sk_stamp_seq);
kt = sk->sk_stamp;
} while (read_seqretry(&sk->sk_stamp_seq, seq));
return kt;
#else
return READ_ONCE(sk->sk_stamp);
#endif
}
static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
{
#if BITS_PER_LONG==32
write_seqlock(&sk->sk_stamp_seq);
sk->sk_stamp = kt;
write_sequnlock(&sk->sk_stamp_seq);
#else
WRITE_ONCE(sk->sk_stamp, kt);
void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb);
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb);
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
/*
* generate control messages if
* - receive time stamping in software requested
* - software time stamp available and wanted
* - hardware time stamps available and wanted
*/
if (sock_flag(sk, SOCK_RCVTSTAMP) ||
(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
(kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
(hwtstamps->hwtstamp &&
(sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
__sock_recv_timestamp(msg, sk, skb);
else
if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid)
__sock_recv_wifi_status(msg, sk, skb);
void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb);
#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb)
{
#define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \
(1UL << SOCK_RCVTSTAMP))
#define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
SOF_TIMESTAMPING_RAW_HARDWARE)
if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY)
__sock_recv_ts_and_drops(msg, sk, skb);
else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
sock_write_timestamp(sk, skb->tstamp);
else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP))
void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags);
* _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
* @sk: socket sending this packet
* @tsflags: timestamping flags to use
* @tx_flags: completed with instructions for time stamping
* @tskey: filled in with next sk_tskey (not for TCP, which uses seqno)
* Note: callers should take care of initial ``*tx_flags`` value (usually 0)
static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags,
__u8 *tx_flags, __u32 *tskey)
if (unlikely(tsflags)) {
__sock_tx_timestamp(tsflags, tx_flags);
if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey &&
tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
*tskey = sk->sk_tskey++;
}
if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
*tx_flags |= SKBTX_WIFI_STATUS;
}
static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags,
__u8 *tx_flags)
{
_sock_tx_timestamp(sk, tsflags, tx_flags, NULL);
}
static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
{
_sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags,
&skb_shinfo(skb)->tskey);
}
DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
/**
* sk_eat_skb - Release a skb if it is no longer needed
* @sk: socket to eat this skb from
* @skb: socket buffer to eat
*
* This routine must be called with interrupts disabled or with the socket
* locked so that the sk_buff queue operation is ok.
*/
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
if (static_branch_unlikely(&tcp_rx_skb_cache_key) &&
!sk->sk_rx_skb_cache) {
sk->sk_rx_skb_cache = skb;
skb_orphan(skb);
return;
}
static inline
struct net *sock_net(const struct sock *sk)
{
}
static inline
void sock_net_set(struct sock *sk, struct net *net)
static inline bool
skb_sk_is_prefetched(struct sk_buff *skb)
{
#ifdef CONFIG_INET
return skb->destructor == sock_pfree;
#else
return false;
#endif /* CONFIG_INET */
}
/* This helper checks if a socket is a full socket,
* ie _not_ a timewait or request socket.
*/
static inline bool sk_fullsock(const struct sock *sk)
{
return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV);
}
static inline bool
sk_is_refcounted(struct sock *sk)
{
/* Only full sockets have sk->sk_flags. */
return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}
* skb_steal_sock - steal a socket from an sk_buff
* @skb: sk_buff to steal the socket from
* @refcounted: is set to true if the socket is reference-counted
*/
static inline struct sock *
skb_steal_sock(struct sk_buff *skb, bool *refcounted)
{
if (skb->sk) {
struct sock *sk = skb->sk;
if (skb_sk_is_prefetched(skb))
*refcounted = sk_is_refcounted(sk);
skb->destructor = NULL;
skb->sk = NULL;
return sk;
}
return NULL;
}
/* Checks if this SKB belongs to an HW offloaded socket
* and whether any SW fallbacks are required based on dev.
* Check decrypted mark in case skb_orphan() cleared socket.
*/
static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
struct net_device *dev)
{
#ifdef CONFIG_SOCK_VALIDATE_XMIT
struct sock *sk = skb->sk;
if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
skb = sk->sk_validate_xmit_skb(sk, dev, skb);
#ifdef CONFIG_TLS_DEVICE
} else if (unlikely(skb->decrypted)) {
pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
kfree_skb(skb);
skb = NULL;
#endif
}
#endif
return skb;
}
/* This helper checks if a socket is a LISTEN or NEW_SYN_RECV
* SYNACK messages can be attached to either ones (depending on SYNCOOKIE)
*/
static inline bool sk_listener(const struct sock *sk)
{
return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV);
}
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
int type);
bool sk_ns_capable(const struct sock *sk,
struct user_namespace *user_ns, int cap);
bool sk_capable(const struct sock *sk, int cap);
bool sk_net_capable(const struct sock *sk, int cap);
void sk_get_meminfo(const struct sock *sk, u32 *meminfo);
/* Take into consideration the size of the struct sk_buff overhead in the
* determination of these values, since that is non-constant across
* platforms. This makes socket queueing behavior and performance
* not depend upon such differences.
*/
#define _SK_MEM_PACKETS 256
#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;
extern int sysctl_tstamp_allow_data;
extern int sysctl_optmem_max;
extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_wmem ? */
if (proto->sysctl_wmem_offset)
return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
return *proto->sysctl_wmem;
}
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
/* Does this proto have per netns sysctl_rmem ? */
if (proto->sysctl_rmem_offset)
return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
return *proto->sysctl_rmem;
}
/* Default TCP Small queue budget is ~1 ms of data (1sec >> 10)
* Some wifi drivers need to tweak it to get more chunks.
* They can use this helper from their ndo_start_xmit()
*/
static inline void sk_pacing_shift_update(struct sock *sk, int val)
{
if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val)
WRITE_ONCE(sk->sk_pacing_shift, val);
/* if a socket is bound to a device, check that the given device
* index is either the same or that the socket is bound to an L3
* master device and the given device index is also enslaved to
* that L3 master
*/
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
int mdif;
if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)
return true;
mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif);
if (mdif && mdif == sk->sk_bound_dev_if)
return true;
return false;
}
void sock_def_readable(struct sock *sk);
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
void sock_enable_timestamps(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
void sock_set_rcvbuf(struct sock *sk, int val);
void sock_set_mark(struct sock *sk, u32 val);
void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);