Using modern tooling to find a tough Linux kernel bug

Background on KLPs and TCP Retransmits

KLP

  1. The patch application process is entered by “inserting” the patch into the kernel as a module via insmod .
  2. The KLP may reference symbols (which can be both exported, or non-exported) in other modules, or in vmlinux (the main kernel binary). The module has no way of knowing the addresses of these symbols before it’s loaded, so relocations are performed on the module when the patch is being applied so that it points to all the correct addresses in the live kernel. This requires looking up the addresses of any referenced symbols via the referenced modules' symbol tables.
  3. Each task (i.e. thread) in the system is “transitioned” to using the new patch. Transitioning is simply a process of verifying that the task doesn’t have any patched functions in its callstack. If it does, the livepatching subsystem schedules some work to re-check the task again at a later time.
  4. Once each task is fully transitioned, the patch is “enabled”, and the insmod call is completed.

TCP Retransmits

Detecting the retransmits

# funccount-bpfcc -T -i 1 't:tcp:tcp_retransmit_skb'
Tracing 1 functions for "b't:tcp:tcp_retransmit_skb'"... Hit Ctrl-C to end.

FUNC COUNT 09:58:44
tcp:tcp_retransmit_skb 12
09:58:45
FUNC COUNT
tcp:tcp_retransmit_skb 42
09:58:46
FUNC COUNT
tcp:tcp_retransmit_skb 31
09:58:47
FUNC COUNT
tcp:tcp_retransmit_skb 3631
09:58:48
FUNC COUNT
tcp:tcp_retransmit_skb 5185
09:58:49
FUNC COUNT
tcp:tcp_retransmit_skb 653
09:58:50
FUNC COUNT
tcp:tcp_retransmit_skb 19
09:58:51
FUNC COUNT
tcp:tcp_retransmit_skb 18
^C
Detaching…
[ 35927.457289] livepatch: enabling patch 'my_kernel_livepatch'
[ 35928.460032] livepatch: 'livepatch_sample': starting patching
transition
[ 35929.411708] livepatch: 'my_kernel_livepatch': patching complete

Debugging the cause of the TCP retransmits

# perf record --a -g --all-kernel -- kpatch load --all
loading patch module: /var/lib/kpatch/.../my_patch.ko
waiting (up to 15 seconds) for patch transition to complete...
transition complete (2 seconds)
[ perf record: Woken up 45 times to write data ]
[ perf record: Captured and wrote 16.260 MB perf.data (81590 samples) ]
# ls
perf.data
# perf report
Processing events...
- 40.84% swapper [kernel.vmlinux] [k] intel_idle
- 0xffffffff...
- 39.78% start_secondary
cpu_startup_entry
do_idle
cpuidle_enter
cpuidle_enter_state
intel_idle
- 1.06% start_kernel
cpu_startup_entry
do_idle
cpuidle_enter
cpuidle_enter_state
intel_idle
- 10.16% insmod [kernel.vmlinux] [k] kallsyms_expand_symbol.constprop.1
0x20646...
syscall
entry_SYSCALL_64
do_syscall_64
__do_sys_finit_module
load_module
do_init_module
do_one_initcall
0xffffffff...
klp_enable_patch
- klp_init_object_loaded
- 9.75% klp_resolve_symbols.isra.11
klp_find_object_symbol
- kallsyms_on_each_symbol
kallsyms_expand_symbol.constprop.1
+ 1.31% systemctl [kernel.vmlinux] [k] __lock_acquire.isra.36
+ 1.27% swapper [kernel.vmlinux] [k] __lock_acquire.isra.36
+ 0.67% swapper [kernel.vmlinux] [k] lock_release
+ 0.65% systemctl [kernel.vmlinux] [k] lock_release
+ 0.62% chef-client [kernel.vmlinux] [k] release_pages
+ 0.60% python3.8 [kernel.vmlinux] [k] __lock_acquire.isra.36
+ 0.56% chef-client [kernel.vmlinux] [k] page_remove_rmap
+ 0.54% swapper [kernel.vmlinux] [k] check_preemption_disabled
+ 0.54% systemctl [kernel.vmlinux] [k] native_sched_clock
+ 0.51% systemctl [kernel.vmlinux] [k] check_preemption_disabled
...
/*
* Iterate over all symbols in vmlinux. For symbols from modules use
* module_kallsyms_on_each_symbol instead.
*/
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
unsigned long i;
unsigned int off;
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
}
return 0;
}

Step 2: Figure out where interrupts are being disabled

bpftrace -e \
'kprobe:klp_enable_patch {
@klp_cpu = cpu;
@klp_started = true;
}
kprobe:task_call_func+0x39 {
if (@klp_started && @klp_cpu == cpu) {
@stash[cpu] = nsecs;
}
}
kprobe:task_call_func+0x93 {
if (@klp_started && klp_cpu == cpu) {
$delta = nsecs - @stash[cpu];
@ = hist($delta);
}
}
END {
clear(@stash);
clear(@klp_started);
}'
# bpftrace -e ' … '
Attaching 4 probes...
@:
[1K, 2K) 66 | |
[2K, 4K) 4530 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[4K, 8K) 75 | |
[8K, 16K) 11 | |
[16K, 32K) 7 | |
[32K, 64K) 4 | |
[64K, 128K) 1 | |
@klp_cpu: 24
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ))

Step 3: Look at how the kernel is polling for packets

TRACE_EVENT(napi_poll,    TP_PROTO(struct napi_struct *napi, int work, int budget),    TP_ARGS(napi, work, budget),    TP_STRUCT__entry(
__field( struct napi_struct *, napi)
__string( dev_name, napi->dev ? napi->dev->name : NO_DEV)
__field( int, work)
__field( int, budget)
),
TP_fast_assign(
__entry->napi = napi;
__assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
__entry->work = work;
__entry->budget = budget;
),
TP_printk("napi poll on napi struct %p for device %s work %d budget %d",
__entry->napi, __get_str(dev_name),
__entry->work, __entry->budget)
);
bpftrace -e \
'kprobe:klp_enable_patch {
@klp_cpu = cpu;
@klp_started = true;
}
t:napi:napi_poll {
if (@klp_started && klp_cpu == cpu) {
@napi_stacks[kstack, args->work] = count();
}
}
END {
clear(@klp_started);
}'
# bpftrace -e ' … '
Attaching 3 probes...
@cpu_invoked: 19
...
@napi_stacks[
net_rx_action+673
net_rx_action+673
__softirqentry_text_start+194
run_ksoftirqd+26
smpboot_thread_fn+184
kthread+267
ret_from_fork+31
, 64]: 45
...
# export cpu=1
# perf record -e 'sched:*' -C $cpu taskset -c $cpu kpatch load --all
loading patch module: /var/lib/.../my_patch.ko
waiting (up to 15 seconds) for patch transition to complete...
transition complete (2 seconds)
# ls
perf.data
# perf script[...]
insmod 2530776 [001] 53287.660039:
sched:sched_waking: comm=ksoftirqd/1 pid=16 prio=120 target_cpu=001
insmod 2530776 [001] 53287.660041:
sched:sched_stat_runtime: comm=insmod pid=2530776 runtime=202945 [ns]
vruntime=219869947880235 [ns]
insmod 2530776 [001] 53287.660041:
sched:sched_wakeup: comm=ksoftirqd/1 pid=16 prio=120 target_cpu=001
[...]
kworker/1:0-eve 2401391 [001] 53289.050664:
sched:sched_switch: prev_comm=kworker/1:0 prev_pid=2401391
prev_prio=120 prev_state=I ==> next_comm=ksoftirqd/1 next_pid=16
next_prio=120
# cat /boot/config-`uname -r` | grep PREEMPT_NONE
CONFIG_PREEMPT_NONE=y

Step 4: Fixing the bug

--
kernel/kallsyms.c | 1 +
kernel/module.c | 2 ++
2 files changed, 3 insertions(+)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 0ba87982d017..2a9afe484aec 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -223,6 +223,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
+ cond_resched();
}
return 0;
}
diff --git a/kernel/module.c b/kernel/module.c
index 40ec9a030eec..c96160f7f3f5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4462,6 +4462,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
mod, kallsyms_symbol_value(sym));
if (ret != 0)
goto out;
+
+ cond_resched();
}
}
out:
--
2.30.2
# funccount-bpfcc -T -i 1 't:tcp:tcp_retransmit_skb
Tracing 1 functions for "b't:tcp:tcp_retransmit_skb'"... Hit Ctrl-C to end.

FUNC COUNT 11:01:52
tcp:tcp_retransmit_skb 35
11:01:53
FUNC COUNT
tcp:tcp_retransmit_skb 31
11:01:54
FUNC COUNT
tcp:tcp_retransmit_skb 42
11:01:55
FUNC COUNT
tcp:tcp_retransmit_skb 11
11:01:56
FUNC COUNT
tcp:tcp_retransmit_skb 15
11:01:57
FUNC COUNT
tcp:tcp_retransmit_skb 44
Detaching…

Takeaways

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
Byte Lab

Byte Lab

Byte Lab is a technical blog that discusses systems engineering, and the tech industry.