Linux kernel 初探

编译内核

安装依赖：

sudo apt-get update
sudo apt-get install git fakeroot build-essential ncurses-dev xz-utils libssl-dev bc

下载kernel源代码：https://www.kernel.org/ 。

解压后进入目录，执行下面命令进行配置：

make menuconfig

配置的时候基本什么都不需要改动，直接Save，然后Exit。

然后运行下面的命令进行内核编译，该过程会花费较长时间。

make bzImage

编译好之后，在./arch/x86/boot/拿到bzImage，从源码根目录拿到vmlinux。

Setup is 17628 bytes (padded to 17920 bytes).
System is 8485 kB
CRC 7bdf0988
Kernel: arch/x86/boot/bzImage is ready  (#1)

添加自定义syscall

在源码根目录创建一个新的目录（模块），以经典的helloworld为例。

ex@Ex:~/test/temp/linux-5.1.7$ cd helloworld/
ex@Ex:~/test/temp/linux-5.1.7/helloworld$ tree
.
├── helloworld.c
└── Makefile

0 directories, 2 files
ex@Ex:~/test/temp/linux-5.1.7/helloworld$ cat helloworld.c 
#include <linux/kernel.h>

/*
 * Demo system call: writes a greeting to the kernel log.
 *
 * Takes no arguments and always returns 0.
 */
asmlinkage long sys_helloworld(void)
{
    long status = 0;

    printk("hello world\n");
    return status;
}
ex@Ex:~/test/temp/linux-5.1.7/helloworld$ cat Makefile 
obj-y=helloworld.o

编辑源码根目录下的Makefile，加入helloworld模块。

...
PHONY += prepare0

ifeq ($(KBUILD_EXTMOD),)
core-y        += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/ helloworld/

vmlinux-dirs    := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
             $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
             $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))

vmlinux-alldirs    := $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
             $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
...

然后编辑include/linux/syscalls.h，添加helloworld函数原型。

asmlinkage long sys_helloworld(void);

增加在文件末尾即可。

然后再修改arch/x86/entry/syscalls/syscall_32.tbl和arch/x86/entry/syscalls/syscall_64.tbl，添加自定义的系统调用号。

i386：

1000    i386    helloworld  sys_helloworld

amd64：

1000    common helloworld sys_helloworld

最后重新编译，生成新的内核即可。

编译busybox

先到官网上下载源码：https://busybox.net/ 。

下载完成后解压进入源码根目录输入make menuconfig进行配置。

最好在配置时进入Settings，勾上Build static binary (no shared libs)，这样就不会依赖libc文件。

ex@Ex:~/test/temp/busybox-1.31.0/_install$ ldd bin/busybox 
    not a dynamic executable

如果不勾选的话，需要自行配置libc库，这样步骤会很繁琐。

然后输入make install -j4进行编译，busybox编译要比kernel快很多。

编译完成后会生成一个_install的目录，这就是我们需要的环境。

先进行一些简单的初始化：

cd _install
mkdir proc
mkdir sys
mkdir lib64
mkdir -p lib/x86_64-linux-gnu/
mkdir etc
mkdir home
echo "root:x:0:0:root:/root:/bin/sh" > etc/passwd
echo "root:x:0:" > etc/group
touch etc/shadow
touch etc/gshadow
touch init
chmod +x init

然后把libc和ld准备好；否则程序只能静态编译后才能运行，调试生成的程序时会不太方便。

在生成的init初始化脚本中，加入如下内容：

#!/bin/sh
# Init script placed at the root of the busybox filesystem image.
echo "{==DBG==} INIT SCRIPT"
# Create the mount point used by the tmpfs mount below.
mkdir /tmp
# Mount the kernel-provided virtual filesystems.
mount -t proc none /proc
mount -t sysfs none /sys
mount -t debugfs none /sys/kernel/debug
mount -t tmpfs none /tmp
mount -t devtmpfs devtmpfs /dev

# Placeholder: load a kernel module here if one is needed.
# insmod /xxx.ko # load ko
mdev -s # We need this to find /dev/sda later
# /proc/uptime's first field is the number of seconds since boot.
echo -e "{==DBG==} Boot took $(cut -d' ' -f1 /proc/uptime) seconds"

# Start an interactive shell as uid/gid 1000 on the controlling terminal.
setsid /bin/cttyhack setuidgid 1000 /bin/sh #normal user
# Alternative: replace this script with a shell that keeps root privileges.
# exec /bin/sh #root

# Reached once the shell above exits: power off immediately.
poweroff -d 0  -f

然后在_install目录里运行下面的命令进行打包：

find . | cpio -o --format=newc > ../rootfs.img

qemu

通过上面两步，我们得到了含有helloworld syscall的kernel bzImage和用busybox打包的fs（附带了ld和libc）。

接下来只要用qemu启动就ok了。

在这之前，可以先写一个测试程序来测试我们写的syscall。

// compiled: gcc helloworld.c -o helloworld
#include <stdio.h>
#include <unistd.h>

/*
 * Invoke system call number 1000 and report whether it succeeded.
 *
 * Prints "start" before and "end" after the call. If the call fails (for
 * example because the running kernel has no such system call) the reason is
 * written to stderr and the exit status is 1; otherwise it is 0.
 */
int main(void)
{
    long ret;

    puts("start");
    ret = syscall(1000);
    if (ret == -1)
    {
        /* syscall() returns -1 and sets errno on failure. */
        perror("syscall(1000)");
    }
    puts("end");
    return ret == -1 ? 1 : 0;
}

将生成可执行二进制文件helloworld放在_install目录下，重新进行打包。

然后再用qemu启动：

qemu-system-x86_64 -cpu kvm64,+smep -kernel ./bzImage -initrd rootfs.img -nographic -append "console=ttyS0"

运行实例：

/ $ id
uid=1000 gid=1000 groups=1000
/ $ ./helloworld 
start
[   29.085005] hello world
end
/ $

驱动

register_chrdev

int register_chrdev (unsigned int major, const  char *name, struct file_operations*fops);

在这里，我们指定要注册的设备名称和主设备号，并把设备与file_operations结构关联起来。如果major参数传入零，该函数会自行分配一个主设备号，并把它作为返回值（正数）返回；如果major参数非零，返回零表示成功。两种情况下，返回负数都表示出错。两个设备编号（主设备号和次设备号）均在0-255范围内指定。

我们将设备名称作为name参数的字符串值传递（如果模块只注册单个设备，此字符串也可以使用模块的名称）。之后，该字符串用于在/proc/devices文件中标识该设备。读取、写入和保存等设备文件操作由存储在file_operations结构中的函数指针处理。这些函数由模块实现，并且指向标识该模块的module结构的指针也存储在file_operations结构中。

来自源码：linux-5.2.7/include/linux/fs.h:1791

struct file_operations {
    struct module *owner;
    loff_t (*llseek) (struct file *, loff_t, int);
    ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
    ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
    ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
    ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
    int (*iopoll)(struct kiocb *kiocb, bool spin);
    int (*iterate) (struct file *, struct dir_context *);
    int (*iterate_shared) (struct file *, struct dir_context *);
    __poll_t (*poll) (struct file *, struct poll_table_struct *);
    long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
    long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
    int (*mmap) (struct file *, struct vm_area_struct *);
    unsigned long mmap_supported_flags;
    int (*open) (struct inode *, struct file *);
    int (*flush) (struct file *, fl_owner_t id);
    int (*release) (struct inode *, struct file *);
    int (*fsync) (struct file *, loff_t, loff_t, int datasync);
    int (*fasync) (int, struct file *, int);
    int (*lock) (struct file *, int, struct file_lock *);
    ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
    unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
    int (*check_flags)(int);
    int (*flock) (struct file *, int, struct file_lock *);
    ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
    ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
    int (*setlease)(struct file *, long, struct file_lock **, void **);
    long (*fallocate)(struct file *file, int mode, loff_t offset,
              loff_t len);
    void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
    unsigned (*mmap_capabilities)(struct file *);
#endif
    ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
            loff_t, size_t, unsigned int);
    loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                   struct file *file_out, loff_t pos_out,
                   loff_t len, unsigned int remap_flags);
    int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;

如果file_operations结构包含一些不需要的函数，您仍然可以使用该文件而不实现它们。指向未实现函数的指针可以简单地设置为零。之后，系统将负责该功能的实现并使其正常运行。

字符设备模块使用insmod加载，加载完毕后需要在/dev目录下使用mknod命令建立相应的文件结点。

编写驱动程序：

memory.c

#include <linux/init.h>

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/proc_fs.h>
#include <linux/fcntl.h>

#include <linux/uaccess.h>

MODULE_LICENSE("Dual BSD/GPL");

/*
 * Forward declarations. The handlers are referenced by memory_fops and by
 * module_init()/module_exit() before their definitions appear further down.
 */
int memory_open(struct inode *inode, struct file *filp);
int memory_release(struct inode *inode, struct file *filp);
ssize_t memory_read(struct file *filp, char *buf, size_t count, loff_t *f_pos);
ssize_t memory_write(struct file *filp, const char *buf, size_t count, loff_t *f_pos);
void memory_exit(void);
int memory_init(void);

struct file_operations memory_fops = {
    read : memory_read,
    write : memory_write,
    open : memory_open,
    release : memory_release
};

/* Register the functions run when the module is loaded and unloaded. */
module_init(memory_init);
module_exit(memory_exit);

/* Major number passed to register_chrdev() and unregister_chrdev(). */
int memory_major = 60;

/* Storage area of the device; allocated in memory_init(). */
char *memory_buffer;

/* Number of bytes currently accounted as stored in memory_buffer. */
int used = 0;

/* Size in bytes of the memory_buffer allocation. */
#define LENGTH 0x1000

int memory_init(void)
{
    int result;

    result = register_chrdev(memory_major, "memory", &memory_fops);
    if (result < 0)
    {
        printk("<1>memory: can't obtain major number %d\n", memory_major);
        return result;
    }

    memory_buffer = kmalloc(LENGTH, GFP_KERNEL);
    if (!memory_buffer)
    {
        result = -ENOMEM;
        goto fail;
    }
    memset(memory_buffer, 0, LENGTH);

    printk("<1>Inserting memory module\n");
    return 0;

fail:
    memory_exit();
    return result;
}

/*
 * Module unload entry point (also used by memory_init() to undo a partial
 * setup): unregisters the character device and frees the buffer.
 */
void memory_exit(void)
{
    unregister_chrdev(memory_major, "memory");

    /* kfree() ignores a NULL pointer, so no guard is needed. */
    kfree(memory_buffer);

    printk("<1>Removing memory module\n");
}

/* open() handler: logs the event and always reports success. */
int memory_open(struct inode *inode, struct file *filp)
{
    int status = 0;

    printk("<1>Open\n");
    return status;
}

/* release() handler: logs the event and always reports success. */
int memory_release(struct inode *inode, struct file *filp)
{
    int status = 0;

    printk("<1>Release\n");
    return status;
}

/*
 * read() handler: hands out up to `count` of the stored bytes.
 *
 * Data is taken from the front of memory_buffer and removed from it, so
 * successive reads return successive bytes. f_pos is not used.
 *
 * Returns the number of bytes copied, 0 when the buffer is empty (or
 * `count` is 0), or -EFAULT if the user buffer cannot be written.
 */
ssize_t memory_read(struct file *filp, char *buf,
                    size_t count, loff_t *f_pos)
{
    size_t bytes;

    if (used <= 0 || count == 0)
    {
        return 0;
    }

    bytes = count < (size_t)used ? count : (size_t)used;

    /* copy_to_user() returns the number of bytes it could NOT copy. */
    if (copy_to_user(buf, memory_buffer, bytes))
    {
        return -EFAULT;
    }

    used -= bytes;
    if (used > 0)
    {
        /* Move the unread remainder to the front for the next read. */
        memmove(memory_buffer, memory_buffer + bytes, (size_t)used);
    }

    return (ssize_t)bytes;
}

/*
 * write() handler: appends up to `count` bytes after the data already
 * stored in memory_buffer. f_pos is not used.
 *
 * Returns the number of bytes accepted (possibly fewer than `count` when
 * the buffer fills up), 0 when the buffer is already full, or -EFAULT if
 * the user buffer cannot be read.
 */
ssize_t memory_write(struct file *filp, const char *buf,
                     size_t count, loff_t *f_pos)
{
    size_t room;
    size_t bytes;

    if (used < 0 || used >= LENGTH)
    {
        return 0;
    }

    /* Compare against the free space so `used + count` cannot wrap. */
    room = (size_t)(LENGTH - used);
    bytes = count < room ? count : room;

    /* copy_from_user() returns the number of bytes it could NOT copy. */
    if (copy_from_user(memory_buffer + used, buf, bytes))
    {
        return -EFAULT;
    }

    used += bytes;

    return (ssize_t)bytes;
}

上面的驱动可以看成一个简单的字符仓库，如果放满了字符就放不进去，如果是空的也拿不出来。

驱动源码并不能用gcc直接进行编译，需要生成一个Makefile来进行编译。

# Builds memory.c into the out-of-tree module memorys.ko.
TARGET_MODULE:=memorys
PWD:=$(shell pwd)
# Alternative: build against the headers of the running kernel.
# KERNELDIR := /lib/modules/$(shell uname -r)/build
KERNELDIR:=./linux-4.15

$(TARGET_MODULE)-objs := memory.o
obj-m := $(TARGET_MODULE).o

.PHONY: all clean

# Recipe lines must start with a TAB character, not spaces.
all:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules

clean:
	rm -rf *.o *~ core .depend .*.cmd *.ko *.mod.c .tmp_versions *.order *.symvers

对应的内核要编译相对应的驱动才能载入，否则会失败。

编译好会生成一个memorys.ko的驱动。

这时我们可以把驱动复制到_install根目录，然后在我们的init脚本中加入下面两条命令，重新生成镜像。

insmod /memorys.ko
mknod /dev/memorys c 60 0

60 为我们设置的主设备号

运行实例：

/ # ls
bin         lib         memorys.ko  sbin        usr
dev         lib64       proc        sys
init        linuxrc     root        tmp
/ # echo 1234567890 > /dev/memorys
[   25.850176] <1>Open
[   25.855288] <1>Release
/ # cat /dev/memorys
[   31.262535] <1>Open
1234567890
[   31.266417] <1>Release
/ #

上面的编译方式是早期驱动开发常用的。

根据新的资料，我重新编写了一个自动挂载的驱动，代码如下：

test_src.c

#include <linux/init.h>

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/proc_fs.h>
#include <linux/fcntl.h>
#include <linux/cdev.h>
#include <linux/device.h>

#include <linux/uaccess.h>

MODULE_LICENSE("Dual BSD/GPL");

/*
 * Forward declarations. The handlers are referenced by test_fops and by
 * module_init()/module_exit() before their definitions appear further down.
 */
int test_open(struct inode *inode, struct file *filp);
int test_release(struct inode *inode, struct file *filp);
ssize_t test_read(struct file *filp, char *buf, size_t count, loff_t *f_pos);
ssize_t test_write(struct file *filp, const char *buf, size_t count, loff_t *f_pos);
long test_ioctl (struct file *filp, unsigned int cmd, unsigned long arg);
void test_exit(void);
int test_init(void);

/* File operations installed for the "test" character device. */
struct file_operations test_fops = {
    .owner          = THIS_MODULE,
    .open           = test_open,
    .release        = test_release,
    .read           = test_read,
    .write          = test_write,
    .unlocked_ioctl = test_ioctl,
};

/* Register the functions run when the module is loaded and unloaded. */
module_init(test_init);
module_exit(test_exit);

/* Storage area of the device; allocated in test_init(). */
char *test_buffer;

/* Number of bytes currently accounted as stored in test_buffer. */
int used = 0;

/* Size in bytes of the test_buffer allocation. */
#define LENGTH 0x1000

/* Device number filled in by alloc_chrdev_region() in test_init(). */
dev_t test_major;
/* Character device object initialised with test_fops in test_init(). */
struct cdev test_cdev;
/* Device class created in test_init() and passed to device_create(). */
struct class *test_class;

int test_init(void)
{
    int result;

    if(alloc_chrdev_region(&test_major, 0, 1, "test") >= 0)
    {
        cdev_init(&test_cdev, &test_fops);
        test_cdev.owner = THIS_MODULE;
        result = cdev_add(&test_cdev, test_major, 1);
        if(result)
        {
            printk(KERN_ERR "cedv_add error\n");
            unregister_chrdev_region(test_major, 1LL);
            return -1;
        }
        else
        {
            test_class = class_create(THIS_MODULE, "test");
            if(test_class)
            {
                result = device_create(test_class, NULL, test_major, NULL, "test");
                if(result)
                {
                    printk(KERN_INFO "Register success\n");
                    test_buffer = kmalloc(LENGTH, GFP_KERNEL);
                    return 0;
                }
                else
                {
                    printk(KERN_ERR "device_create error\n");            
                    class_destroy(test_class);
                    cdev_del(&test_cdev);
                    unregister_chrdev_region(test_major, 1LL);
                    return -1;
                }
            }
            else
            {
                printk(KERN_ERR "class_create error\n");
                cdev_del(&test_cdev);
                unregister_chrdev_region(test_major, 1LL);
                return -1;
            }
        }

    }
}

/*
 * Module unload entry point: removes the device node and class, deletes
 * the cdev, releases the device number and frees the buffer.
 */
void test_exit(void)
{
    device_destroy(test_class, test_major);
    class_destroy(test_class);
    cdev_del(&test_cdev);
    /*
     * This releases the region obtained from alloc_chrdev_region(). No
     * unregister_chrdev() call is needed: register_chrdev() was never used,
     * and test_major is a dev_t rather than a bare major number.
     */
    unregister_chrdev_region(test_major, 1);

    /* kfree() ignores a NULL pointer, so no guard is needed. */
    kfree(test_buffer);

    printk(KERN_INFO "Removing test module\n");
}

/* Incremented on every call to test_open(). */
int open_times = 0;
/* Incremented on every call to test_release(). */
int release_times = 0;

/*
 * ioctl() handler stub: no command is implemented, and every request
 * reports success by returning 0.
 */
long test_ioctl (struct file *filp, unsigned int cmd, unsigned long arg)
{
    const long status = 0;

    return status;
}

/*
 * open() handler: logs how many opens happened before this one, counts the
 * current open, and always reports success.
 */
int test_open(struct inode *inode, struct file *filp)
{
    /* Log the open counter (the original printed release_times here). */
    printk(KERN_INFO "Open %d times\n", open_times);
    open_times++;
    return 0;
}

/*
 * release() handler: logs how many releases happened before this one,
 * counts the current release, and always reports success.
 */
int test_release(struct inode *inode, struct file *filp)
{
    int previous = release_times;

    printk(KERN_INFO "Release %d times\n", previous);
    release_times = previous + 1;
    return 0;
}

/*
 * read() handler: hands out up to `count` of the stored bytes.
 *
 * Data is taken from the front of test_buffer and removed from it, so
 * successive reads return successive bytes. f_pos is not used.
 *
 * Returns the number of bytes copied, 0 when the buffer is empty (or
 * `count` is 0), or -EFAULT if the user buffer cannot be written.
 */
ssize_t test_read(struct file *filp, char *buf,
                    size_t count, loff_t *f_pos)
{
    size_t bytes;

    if (used <= 0 || count == 0)
    {
        return 0;
    }

    bytes = count < (size_t)used ? count : (size_t)used;

    /* copy_to_user() returns the number of bytes it could NOT copy. */
    if (copy_to_user(buf, test_buffer, bytes))
    {
        return -EFAULT;
    }

    used -= bytes;
    if (used > 0)
    {
        /* Move the unread remainder to the front for the next read. */
        memmove(test_buffer, test_buffer + bytes, (size_t)used);
    }

    return (ssize_t)bytes;
}

/*
 * write() handler: appends up to `count` bytes after the data already
 * stored in test_buffer. f_pos is not used.
 *
 * Returns the number of bytes accepted (possibly fewer than `count` when
 * the buffer fills up), 0 when the buffer is already full, or -EFAULT if
 * the user buffer cannot be read.
 */
ssize_t test_write(struct file *filp, const char *buf,
                     size_t count, loff_t *f_pos)
{
    size_t room;
    size_t bytes;

    if (used < 0 || used >= LENGTH)
    {
        return 0;
    }

    /* Compare against the free space so `used + count` cannot wrap. */
    room = (size_t)(LENGTH - used);
    bytes = count < room ? count : room;

    /* copy_from_user() returns the number of bytes it could NOT copy. */
    if (copy_from_user(test_buffer + used, buf, bytes))
    {
        return -EFAULT;
    }

    used += bytes;

    return (ssize_t)bytes;
}

其对应的Makefile如下：

# Builds test_src.c into the out-of-tree module test.ko.
TARGET_MODULE:=test
PWD:=$(shell pwd)
# Alternative: build against the headers of the running kernel.
# KERNELDIR := /lib/modules/$(shell uname -r)/build
KERNELDIR:=./linux-5.2.7

$(TARGET_MODULE)-objs := test_src.o
obj-m := $(TARGET_MODULE).o

.PHONY: all clean

# Recipe lines must start with a TAB character, not spaces.
all:
	$(MAKE) -C $(KERNELDIR) M=$(PWD) modules

clean:
	rm -rf *.o *~ core .depend .*.cmd *.ko *.mod.c .tmp_versions *.order *.symvers

调试

一般来说加nokaslr把kaslr关了调试起来会方便一些。否则gdb将找不到ELF基地址（毕竟不是本地）。

-append "console=ttyS0 nokaslr"

但是调试驱动时，即使关闭了kaslr，gdb也无法确定其基地址，这时候我们需要用add-symbol-file来手动添加基地址。

下面我写了一个程序方便快速读取驱动的基地址信息：

vmmap.c

// musl-gcc -static vmmap.c -O3 -s -o vmmap
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <dirent.h>

/*
 * Cut `str` at its first newline, if it has one, by overwriting that
 * newline with a terminator. A string without a newline is left untouched.
 */
inline static void truncate_string(char *str)
{
    char *newline = strchr(str, '\n');

    if(newline != NULL)
    {
        *newline = '\0';
    }
}

/* Upper bound on the number of section files remembered from the directory. */
enum { MAX_SECTIONS = 0x100 };

/*
 * Read the first line of file `name` (relative to the current directory)
 * into `buf` and drop its trailing newline.
 *
 * Exits the process with status 1 if the file cannot be opened or read.
 */
static void read_section_address(const char *name, char *buf, int size)
{
    FILE *fp = fopen(name, "rb");

    if(fp == NULL)
    {
        fprintf(stderr, "fopen error: %m\n");
        exit(1);
    }

    if(fgets(buf, size, fp) == NULL)
    {
        fprintf(stderr, "read error: %s\n", name);
        fclose(fp);
        exit(1);
    }
    fclose(fp);

    truncate_string(buf);
}

/*
 * Usage: vmmap <directory>
 *
 * Reads the first line of every regular file in <directory>. Prints the
 * line from the file named ".text" first, then "-s <name> <line>" for each
 * other file, all on one output line.
 *
 * Returns 0 on success. Exits with status 1 on a missing argument, an
 * inaccessible directory, a missing ".text" file or an unreadable file.
 */
int main(int argc, char const *argv[])
{
    /*
     * Names are copied out of readdir()'s result, because the returned
     * entry may be overwritten by the next readdir() call on the stream.
     */
    static char names[MAX_SECTIONS][sizeof(((struct dirent *)0)->d_name)];
    struct dirent *entry;
    DIR *dir;
    int i, num = 0, text_position = -1;
    char buf[0x400];

    setvbuf(stdin, NULL, _IONBF, 0);
    setvbuf(stdout, NULL, _IONBF, 0);
    setvbuf(stderr, NULL, _IONBF, 0);

    if(argc < 2)
    {
        fprintf(stderr, "Usage: ./vmmap file-path\n");
        exit(1);
    }

    if(chdir(argv[1]) == -1)
    {
        fprintf(stderr, "chdir error: %m\n");
        exit(1);
    }

    /*
     * The working directory is now the target, so open "." here. Opening
     * argv[1] again would resolve a relative path against the new directory.
     */
    dir = opendir(".");

    if(dir == NULL)
    {
        fprintf(stderr, "opendir error: %m\n");
        exit(1);
    }

    while(num < MAX_SECTIONS)
    {
        entry = readdir(dir);
        if(entry == NULL)
        {
            break;
        }

        if(entry->d_type != DT_REG)
        {
            continue;
        }

        if(!strcmp(".text", entry->d_name))
        {
            text_position = num;
        }
        snprintf(names[num], sizeof(names[num]), "%s", entry->d_name);
        num++;
    }
    closedir(dir);

    if(text_position == -1)
    {
        fprintf(stderr, "Error: don't find .text\n");
        exit(1);
    }

    /* The .text line goes first, without a "-s" prefix. */
    read_section_address(names[text_position], buf, (int)sizeof(buf));
    printf(" %s ", buf);

    for(i = 0; i < num; i++)
    {
        if(i == text_position)
        {
            continue;
        }

        read_section_address(names[i], buf, (int)sizeof(buf));
        printf("-s %s %s ", names[i], buf);
    }
    puts("");

    return 0;
}

使用方法如下：

/ # /vmmap /sys/module/test/sections
 0xffffffffc00fd000 -s .note.Linux 0xffffffffc00fe138 -s .strtab 0xffffffffc01026a8 -s __mcount_loc 0xffffffffc00fe024 -s .bss 0xffffffffc00ff480 -s .gnu.linkonce.this_module 0xffffffffc00ff140 -s .symtab 0xffffffffc0102000 -s .note.gnu.build-id 0xffffffffc00fe000 -s .data 0xffffffffc00ff000 -s __bug_table 0xffffffffc00ff100 -s .rodata.str1.1 0xffffffffc00fe05c -s .rodata.str1.8 0xffffffffc00fe110

可以写一个脚本来快速连接：

#!/bin/sh
gdb -q \
-ex "file ./vmlinux" \
-ex "add-symbol-file ./test.ko 0x.... -s .bss 0x...." \
-ex "target remote localhost:1000"

那么结合上面写的vmmap其脚本就是下面这个样子：

#!/bin/sh
gdb -q \
-ex "file ./vmlinux" \
-ex "add-symbol-file ./test.ko 0xffffffffc00fd000 -s .note.Linux 0xffffffffc00fe138 -s .strtab 0xffffffffc01026a8 -s __mcount_loc 0xffffffffc00fe024 -s .bss 0xffffffffc00ff480 -s .gnu.linkonce.this_module 0xffffffffc00ff140 -s .symtab 0xffffffffc0102000 -s .note.gnu.build-id 0xffffffffc00fe000 -s .data 0xffffffffc00ff000 -s __bug_table 0xffffffffc00ff100 -s .rodata.str1.1 0xffffffffc00fe05c -s .rodata.str1.8 0xffffffffc00fe110" \
-ex "target remote localhost:1000"

资料来源：