0%

使用 GDB 调试 Linux 内核模块

记录使用 GDB 调试内核模块的相关内容

Linux Kernel Config

Linux内核调试需要打开如下内核配置

  • Kernel hacking
    • Kernel debugging
    • Detect Soft Lockups
    • Collect scheduler debugging info
    • Compile the kernel with debug info
    • Self test for the backtrace code

Linux module prepare

编译 Linux 模块时需要打开调试信息并关闭优化:-g -O0 -ggdb

另外,以下两个编译选项用于支持 GDB 的 watch(观察点)功能:

  • -fno-exceptions
  • -fstrict-volatile-bitfields

GDB

(cskygdb) help add-symbol-file
Load symbols from FILE, assuming FILE has been dynamically loaded.
Usage: add-symbol-file FILE ADDR [-s <SECT> <SECT_ADDR> -s <SECT> <SECT_ADDR> ...]
ADDR is the starting address of the file's text.
The optional arguments are section-name section-address pairs and
should be specified if the data and bss segments are not contiguous
with the text.  SECT is a section name to be loaded at SECT_ADDR.

加载地址是ko的text段位置,不是ko位置,计算加载位置如下

$ lsmod | grep xx
xx 1495536 0 - Live 0xc016d000
$ objdump --section-headers xx.o | grep text
0 .text         000003d0  00000000  00000000  00000034  2**2

因此,.text 段的加载地址 = 模块基址 0xc016d000 + .text 段在文件中的偏移 0x34 = 0xc016d034。在 gdb 中加载符号如下:

(gdb) add-symbol-file /path/to/xx.ko 0xc016d034
add symbol table from file "/path/to/xx.ko" at
    .text_addr = 0xc016d034
(y or n) y
Reading symbols from /path/to/xx.ko...done

如果需要调试静态符号,需要加载 .bss

(gdb) add-symbol-file /path/to/xx.ko 0xc016d034 -s .bss 0xc016e034
add symbol table from file "/path/to/xx.ko" at
    .text_addr = 0xc016d034
    .bss_addr = 0xc016e034
(y or n) y
Reading symbols from /path/to/xx.ko...done

Linux modules

ko 加载时触发 module_init(xx_module_init),卸载时触发 module_exit(xx_module_exit)

/*
 * Module load hook (passed to module_init()).
 *
 * Steps, in order:
 *   1. run the sub-module init, whose result is stored in device_count;
 *   2. reserve device_count char-device numbers starting at (XX_MAJOR, 0);
 *   3. initialise and add the character device;
 *   4. create the device class and one class device per minor;
 *   5. create the proc entries.
 *
 * Returns 0 on success. On failure returns the negative value from
 * xx_init(), or the error code of the step that failed, after undoing
 * the earlier steps.
 */
static int __init xx_module_init(void)
{
    dev_t first_dev;
    int err = -1, minor;

    /* Run the sub-module init. */
    device_count = xx_init();
    if (device_count < 0)
        return device_count;

    first_dev = MKDEV(XX_MAJOR, 0);

    err = register_chrdev_region(first_dev, device_count, XX_DEVICE_NAME);
    if (err != 0) {
        printk(KERN_ERR "xx-core: unable to get major %d\n", XX_MAJOR);
        /* Nothing else has been registered yet: only the devices need undoing. */
        xx_devices_cleanup();

        return err;
    }

    cdev_init(&cdev, &xx_fops);

    err = cdev_add(&cdev, first_dev, device_count);
    if (err != 0) {
        printk(KERN_ERR "xx-core: unable register character device\n");
        goto unwind;
    }

    xx_class = class_create(THIS_MODULE, XX_CLASS_NAME);
    if (IS_ERR(xx_class)) {
        err = PTR_ERR(xx_class);
        goto unwind;
    }

    /* One class device per minor number. */
    for (minor = 0; minor < device_count; minor++) {
        CLASS_DEV_CREATE(xx_class, NULL, MKDEV(XX_MAJOR, minor), minor);
    }

    xx_create_proc_entries();
    printk(KERN_EMERG "%s: ok! \n", __FUNCTION__);
    return 0;

unwind:
    /* Shared failure path: drop the cdev, release the region, clean up devices. */
    cdev_del(&cdev);
    unregister_chrdev_region(first_dev, device_count);
    xx_devices_cleanup();

    return err;
}
/*
 * Module unload hook (passed to module_exit()).
 *
 * Destroys the per-minor class devices, cleans up the devices, destroys
 * the class, removes the character device and finally releases the
 * reserved device-number region.
 */
static void __exit xx_module_exit(void)
{
    int minor = 0;

    printk(KERN_EMERG "%s: av_devices.count = %d\n", __FUNCTION__, device_count);

    /* Remove every class device, one per minor number. */
    while (minor < device_count) {
        CLASS_DEV_DESTROY(xx_class, MKDEV(XX_MAJOR, minor));
        minor++;
    }

    xx_devices_cleanup();
    class_destroy(xx_class);
    cdev_del(&cdev);
    unregister_chrdev_region(MKDEV(XX_MAJOR, 0), device_count);

    printk(KERN_EMERG "%s: ok! \n", __FUNCTION__);
}
/*
 * Uninitialise and unregister every device on device_list.
 *
 * Each pass handles the current head of device_list and then re-reads
 * the head, so the loop ends once device_list becomes NULL.
 * NOTE(review): this relies on device_unregister() removing the entry
 * from device_list; otherwise the loop would not terminate -- confirm.
 *
 * The loop is pre-checked so that an empty list is a no-op instead of
 * passing NULL to device_uninit()/device_unregister() when XX_ASSERT is
 * compiled out.
 */
void xx_devices_cleanup(void)
{
    struct xx_device *pos = device_list;

    /* Kept as a debug-time check: callers are expected to have devices. */
    XX_ASSERT(pos != NULL);

    while (pos != NULL) {
        /* uninit mainly runs the sub-module close and cleanup */
        device_uninit(pos);
        device_unregister(pos);

        pos = device_list;
    }
}

problem

出问题代码

/*
 * Simplified sketch of the low-level open hook (the problematic version).
 *
 * xx is only set up here while xx.thread is NULL: xx.ops is pointed at
 * xx_ops and __xx_init() is called. If xx.thread is already non-NULL the
 * block is skipped.
 */
xx_open()
{
    //something

    /* Set up only when no thread exists yet. */
    if(xx.thread == NULL) {
        xx.ops = &xx_ops;
        __xx_init(a, b);
    }
}

/*
 * Simplified sketch of the low-level close hook (the problematic version).
 *
 * When xx.thread is non-NULL, calls __xx_stop() and then __xx_exit(xx).
 * NOTE(review): according to the accompanying write-up this teardown
 * leaves the shared xx state cleared -- presumably inside __xx_exit();
 * not visible here, confirm.
 */
xx_close()
{
    //something

    /* Tear down only when a thread exists. */
    if(xx.thread != NULL) {
        __xx_stop(a, b);
        __xx_exit(xx);
    }
}

在上层的封装中,并不是每次调用 open / close 都会触发底层驱动的 xx_open / xx_close,
而是由打开次数来决定的:只有 open 时当前模块尚未打开,才会调用 xx_open。
因此在上述代码中,在某些情况下,其他模块(例如 demux1)触发了 xx_close,
从而将 xx 结构体清零,而另一个模块(demux0)并没有关闭;
之后操作 demux0 时,调用 open 不会再触发 xx_open,此时 xx 为空,导致死机。

修改为:将 xx 相关信息的初始化和清理分别放到 xx_init / xx_cleanup 中,这样 xx_open / xx_close 对 xx 无影响

Ref

  1. Linux 系统内核的调试
  2. linux下用GDB调试可加载模块