Linux Kernel Networking Hot Path and Low-Level Mechanisms
Table of Contents
- Softirq and NAPI
- RCU (Read-Copy-Update) in Networking
- Per-CPU Variables
- Memory Barriers and Ordering
- Cache Line Optimization
- Branch Prediction
- Interrupt Coalescing
1. Softirq and NAPI
File: /Users/sphinx/github/linux/net/core/dev.c
1.1 struct softnet_data - Per-CPU Softirq State
File: /Users/sphinx/github/linux/include/linux/netdevice.h, lines 3516-3570
struct softnet_data {
struct list_head poll_list;
struct sk_buff_head process_queue;
local_lock_t process_queue_bh_lock;
/* stats */
unsigned int processed;
unsigned int time_squeeze;
#ifdef CONFIG_RPS
struct softnet_data *rps_ipi_list;
#endif
unsigned int received_rps;
bool in_net_rx_action;
bool in_napi_threaded_poll;
/* written and read only by owning cpu: */
struct netdev_xmit xmit;
#ifdef CONFIG_RPS
unsigned int input_queue_head ____cacheline_aligned_in_smp;
unsigned int input_queue_tail;
#endif
struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
Declaration: /Users/sphinx/github/linux/net/core/dev.c, line 462
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
.process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
};
Key design points:
- DEFINE_PER_CPU_ALIGNED ensures the structure is cache-line aligned per CPU
- poll_list links NAPI structs scheduled for polling
- process_queue holds incoming SKBs awaiting processing
- input_pkt_queue is the backlog queue for received packets
- in_net_rx_action flag tracks whether net_rx_action is currently running on this CPU
1.2 net_rx_action() - Softirq Handler for Receive
File: /Users/sphinx/github/linux/net/core/dev.c, lines 7890-7956
static __latent_entropy void net_rx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int budget = READ_ONCE(net_hotdata.netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
sd->in_net_rx_action = true;
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
for (;;) {
struct napi_struct *n;
skb_defer_free_flush();
if (list_empty(&list)) {
if (list_empty(&repoll)) {
sd->in_net_rx_action = false;
barrier();
if (!list_empty(&sd->poll_list))
goto start;
if (!sd_has_rps_ipi_waiting(sd))
goto end;
}
break;
}
n = list_first_entry(&list, struct napi_struct, poll_list);
budget -= napi_poll(n, &repoll);
if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) {
WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
break;
}
}
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
list_splice_tail(&repoll, &list);
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
else
sd->in_net_rx_action = false;
net_rps_action_and_irq_enable(sd);
end:
bpf_net_ctx_clear(bpf_net_ctx);
}
The softirq handler:
- Sets in_net_rx_action = true to track execution context
- Splices the per-CPU poll_list to a local list (safe from IRQ re-addition)
- Iterates through scheduled NAPIs calling their poll() functions
- Tracks budget (packet count) and time_limit (jiffies-based) to bound latency
- Stops polling early if the budget is exhausted or the time limit is reached, incrementing time_squeeze and re-raising NET_RX_SOFTIRQ if work remains
- Handles RPS (Receive Packet Steering) IPIs via net_rps_action_and_irq_enable()
1.3 napi_schedule() / napi_disable() - NAPI Enable/Disable
__napi_schedule() - Lines 6689-6696
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n);
local_irq_restore(flags);
}
____napi_schedule() - Lines 4942-4976
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
struct task_struct *thread;
lockdep_assert_irqs_disabled();
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
thread = READ_ONCE(napi->thread);
if (thread) {
if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
goto use_local_napi;
set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
}
use_local_napi:
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
if (!sd->in_net_rx_action)
raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
1.4 netif_receive_skb() - Main Receive Entry
File: /Users/sphinx/github/linux/net/core/dev.c
netif_receive_skb() - Line 6433
int netif_receive_skb(struct sk_buff *skb)
{
int ret;
trace_netif_receive_skb_entry(skb);
ret = netif_receive_skb_internal(skb);
trace_netif_receive_skb_exit(ret);
return ret;
}
__netif_receive_skb_core() - Lines 5951-6050
Main receive processing function that:
- Validates packet headers
- Handles VLAN untagging
- Delivers to protocol handlers via ptype_all and dev->ptype_all
- Handles XDP programs via do_xdp_generic()
1.5 How NAPI Polls vs Interrupt-Driven Receive
Interrupt-Driven Problems:
- High interrupt rate under heavy load (packets per interrupt = 1)
- Interrupt overhead dominates CPU time
- Cache thrashing as CPU switches contexts rapidly
NAPI Solution - Polling with Interrupt Coalescence:
- Initial interrupt signals packet arrival
- Driver calls napi_schedule() to queue NAPI
- Softirq net_rx_action() polls the device via napi->poll()
- Device interrupt is disabled during polling
- Polling continues until budget exhausted or time_limit reached
- napi_complete_done() ends polling, after which device interrupts are re-enabled
1.6 gro_normal_list() - GRO Batch Completion
File: /Users/sphinx/github/linux/include/net/gro.h, lines 519-526
static inline void gro_normal_list(struct gro_node *gro)
{
if (!gro->rx_count)
return;
netif_receive_skb_list_internal(&gro->rx_list);
INIT_LIST_HEAD(&gro->rx_list);
gro->rx_count = 0;
}
GRO batches packets that don't need immediate processing and delivers them as a list for better throughput.
1.7 process_backlog() - Backlog Processing
File: /Users/sphinx/github/linux/net/core/dev.c, lines 6623-6680
This is the poll() function for the backlog NAPI that:
- Dequeues from process_queue (or splices from input_pkt_queue)
- Uses local_lock_nested_bh to protect the process queue
- Returns when quota packets have been processed or the queue is empty
2. RCU (Read-Copy-Update) in Networking
2.1 RCU Primitives
File: /Users/sphinx/github/linux/include/linux/rcupdate.h
static inline void __rcu_read_lock(void)
{
preempt_disable();
}
static inline void __rcu_read_unlock(void)
{
if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
rcu_read_unlock_strict();
preempt_enable();
}
2.2 RCU Usage in Conntrack - nf_conntrack_find_get()
File: /Users/sphinx/github/linux/net/netfilter/nf_conntrack_core.c, lines 774-827
struct nf_conntrack_tuple_hash *
nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple)
{
unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL);
struct nf_conntrack_tuple_hash *thash;
rcu_read_lock();
thash = __nf_conntrack_find_get(net, zone, tuple,
hash_conntrack_raw(tuple, zone_id, net));
if (thash)
goto out_unlock;
rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY);
if (rid != zone_id)
thash = __nf_conntrack_find_get(net, zone, tuple,
hash_conntrack_raw(tuple, rid, net));
out_unlock:
rcu_read_unlock();
return thash;
}
Key RCU usage pattern:
- rcu_read_lock() is held while traversing the conntrack hash
- refcount_inc_not_zero() safely increments the reference count
- smp_acquire__after_ctrl_dep() provides an acquire barrier after the control dependency
- Tuple key is re-checked after refcount increment to ensure validity
- rcu_read_unlock() ends the read-side critical section
2.3 RCU Grace Periods in Conntrack Garbage Collection
The conntrack subsystem uses synchronize_rcu() to wait for RCU grace periods before freeing conntrack entries. call_rcu() is used for deferred freeing after the grace period.
3. Per-CPU Variables
3.1 DECLARE_PER_CPU() / DEFINE_PER_CPU()
Lines 110-114:
#define DECLARE_PER_CPU(type, name) \
DECLARE_PER_CPU_SECTION(type, name, "")
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
3.2 DEFINE_PER_CPU_ALIGNED() for Cache Line Alignment
#define DEFINE_PER_CPU_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
Usage in networking - /Users/sphinx/github/linux/net/core/dev.c, line 462:
DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
.process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
};
3.3 struct u64_stats_sync - Synchronized 64-bit Counters
File: /Users/sphinx/github/linux/include/linux/u64_stats_sync.h
struct u64_stats_sync {
#if BITS_PER_LONG == 32
seqcount_t seq;
#endif
};
Usage pattern:
/* Writer (must hold exclusive access) */
u64_stats_update_begin(&stats->syncp);
u64_stats_add(&stats->bytes64, len);
u64_stats_inc(&stats->packets64);
u64_stats_update_end(&stats->syncp);
/* Reader (can be preempted, no locking required on 64-bit) */
do {
start = u64_stats_fetch_begin(&stats->syncp);
tbytes = u64_stats_read(&stats->bytes64);
tpackets = u64_stats_read(&stats->packets64);
} while (u64_stats_fetch_retry(&stats->syncp, start));
4. Memory Barriers and Ordering
4.1 SMP Memory Barrier Definitions
Lines 96-124:
#ifdef CONFIG_SMP
#ifndef smp_mb
#define smp_mb() do { kcsan_mb(); __smp_mb(); } while (0)
#endif
#ifndef smp_rmb
#define smp_rmb() do { kcsan_rmb(); __smp_rmb(); } while (0)
#endif
#ifndef smp_wmb
#define smp_wmb() do { kcsan_wmb(); __smp_wmb(); } while (0)
#endif
#else
#define smp_mb() barrier()
#endif
4.2 smp_load_acquire() / smp_store_release()
#define __smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \
__smp_mb(); \
WRITE_ONCE(*p, v); \
} while (0)
#define __smp_load_acquire(p) \
({ \
__unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
__smp_mb(); \
(typeof(*p))___p1; \
})
4.3 Memory Barriers in SKB
if (!IS_ENABLED(CONFIG_DEBUG_NET) && likely(refcount_read(&skb->users) == 1))
smp_rmb();
This barrier ensures that:
- The refcount check is properly ordered with subsequent data accesses
- Later loads are not speculatively re-ordered ahead of the refcount read (smp_rmb() orders reads only)
4.4 Sequence Counting in Conntrack
In conntrack, sequence counting provides lockless readers:
- Writer: Increments the sequence number before and after modifying shared state
- Reader: Reads the sequence number before and after critical reads
- If the sequence numbers match, the reader knows the data was consistent
5. Cache Line Optimization
5.1 L1_CACHE_BYTES
#define L1_CACHE_ALIGN(x) __ALIGN_KERNEL(x, L1_CACHE_BYTES)
#define NET_SKB_PAD max(32, L1_CACHE_BYTES)
5.2 ____cacheline_aligned_in_smp in struct sock
File: /Users/sphinx/github/linux/include/net/sock.h, lines 401-497
The socket structure uses __cacheline_group_begin/end annotations to organize fields into cache-line groups:
struct sock {
__cacheline_group_begin(sock_write_rx);
atomic_t sk_drops;
__s32 sk_peek_off;
struct sk_buff_head sk_error_queue;
struct sk_buff_head sk_receive_queue;
__cacheline_group_end(sock_write_rx);
__cacheline_group_begin(sock_read_rx);
struct dst_entry __rcu *sk_rx_dst;
int sk_rx_dst_ifindex;
__cacheline_group_end(sock_read_rx);
__cacheline_group_begin(sock_write_rxtx);
socket_lock_t sk_lock;
__cacheline_group_end(sock_write_rxtx);
};
5.3 Socket Lock Cache Line Bouncing - sk_lock
static inline void lock_sock_fast(struct sock *sk)
{
mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
spin_lock_bh(&sk->sk_lock.slock);
sk->sk_lock.owned = 1;
}
The sk_lock combines:
- A spinlock (sk_lock.slock) for fast-path locking with BH disabled
- Mutex-like semantics for sleepable operations (tracked by lockdep through sk_lock.dep_map)
- An owned flag to track lock ownership state
5.4 struct softnet_data Cache Line Alignment
unsigned int input_queue_head ____cacheline_aligned_in_smp;
Fields accessed across CPUs (like input_queue_head) are explicitly aligned to prevent false sharing.
6. Branch Prediction
6.1 likely() / unlikely() Macros
# define likely(x) __branch_check__(x, 1, __builtin_constant_p(x))
# define unlikely(x) __branch_check__(x, 0, __builtin_constant_p(x))
/* Default implementation */
# define likely(x) __builtin_expect(!!(x), 1)
# define unlikely(x) __builtin_expect(!!(x), 0)
6.2 Usage in Packet Processing
if (likely(refcount_inc_not_zero(&ct->ct_general.use))) {
smp_acquire__after_ctrl_dep();
if (likely(nf_ct_key_equal(h, tuple, zone, net)))
return h;
}
6.3 How These Affect CPU Branch Prediction
Modern CPUs use:
- Static branch prediction: Uses the hint from compiled code
- Dynamic branch prediction: Learns from runtime behavior via BTB (Branch Target Buffer)
The __builtin_expect() allows the compiler to:
- Position code to minimize branch mispredictions
- Optimize instruction cache placement
- Enable better pipelining by reducing stalls
7. Interrupt Coalescing
7.1 netif_napi_add_weight_locked() - NAPI Registration
Lines 7534-7578:
void netif_napi_add_weight_locked(struct net_device *dev,
struct napi_struct *napi,
int (*poll)(struct napi_struct *, int),
int weight)
{
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
INIT_LIST_HEAD(&napi->poll_list);
INIT_HLIST_NODE(&napi->napi_hash_node);
hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
gro_init(&napi->gro);
napi->poll = poll;
napi->weight = weight;
napi->dev = dev;
napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
}
7.2 napi_complete_done() - End of Polling
Lines 6750-6817:
bool napi_complete_done(struct napi_struct *n, int work_done)
{
unsigned long flags, val, new, timeout = 0;
bool ret = true;
if (unlikely(n->state & (NAPIF_STATE_NPSVC | NAPIF_STATE_IN_BUSY_POLL)))
return false;
if (work_done) {
if (n->gro.bitmask)
timeout = napi_get_gro_flush_timeout(n);
n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
if (n->defer_hard_irqs_count > 0) {
n->defer_hard_irqs_count--;
timeout = napi_get_gro_flush_timeout(n);
if (timeout)
ret = false;
}
gro_flush_normal(&n->gro, !!timeout);
/* ... state management ... */
if (timeout)
hrtimer_start(&n->timer, ns_to_ktime(timeout),
HRTIMER_MODE_REL_PINNED);
return ret;
}
7.3 How Adaptive IRQ Coalescing Works
NAPI Deferral Mechanism:
Deferred Hard IRQs (defer_hard_irqs):
- Set via napi_set_defer_hard_irqs(napi, count)
- Each time napi_complete_done() is called with work_done > 0, defer_hard_irqs_count is reloaded from the configured value
- While the count is > 0, each call decrements it and, if a GRO flush timeout is set, the function returns false
- This causes the device to not re-enable interrupts immediately, batching more packets
GRO Flush Timeout (gro_flush_timeout):
- After polling completes, a timer can be set
- If packets arrive before the timer fires, they are merged via GRO
- This further reduces interrupt frequency
Summary
The Linux networking stack employs numerous low-level optimizations:
- NAPI switches between interrupt and polling modes based on load, naturally coalescing interrupts
- RCU provides lockless read access to shared data structures like conntrack tables
- Per-CPU variables eliminate most locking for hot path statistics
- Memory barriers ensure proper ordering without expensive locks
- Cache line alignment prevents false sharing in both struct sock and struct softnet_data
- Branch prediction hints (likely/unlikely) guide CPU pipeline optimization
- Adaptive IRQ coalescing via NAPI deferral and GRO timeout dynamically adjusts interrupt rate