Linux IO 多路复用 —— Epoll

Epoll 是 Linux 平台上独有的一组编程接口，用于监听多个文件描述符上的 IO 事件。Epoll 相对于 select/poll 的优势在于即使监听了大量的文件描述符，性能也非常好。Epoll API 支持两种监听方式：edge-triggered (EPOLLET) 和 level-triggered (default)。

Edge-triggered 模式下，只有当文件描述符上产生新事件（状态发生变化）时，才会被 epoll_wait 返回。例如，监听一个 socket，假如第一次 epoll_wait 返回了该 socket，此时可读数据为 2 字节，但是只读取了 1 字节，那么下一次 epoll_wait 将不会返回该文件描述符了。换句话说，缓冲区中还有数据可读不是一个事件。

Level-triggered 不同，只要该 sock 还是可读的，将持续返回。

在使用 ET 模式时，必须使用非阻塞文件描述符，防止阻塞读/阻塞写将处理多个文件描述符的任务饿死。最好以以下模式调用 ET 模式的 epoll_wait 接口：

使用非阻塞的文件描述符
只有当 read/write 返回 EAGAIN 时挂起并等待；当 read/write 返回的数据长度小于请求的数据长度时，就可以确定缓冲中已经没有数据了，也就可以认为事件已经完成了。

Epoll API#

Linux 提供了以下几个函数，用于创建、管理和使用 epoll 实例：

int epoll_create(int size);、int epoll_create1(int flags);
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
int epoll_pwait(int epfd, struct epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask);

epoll_create/epoll_create1#

epoll_create 将创建一个 epoll 实例，并且返回一个代表该实例的文件描述符。在 epoll_create1 中，epoll 的大小限制被取消了。flags 可以为 EPOLL_CLOEXEC，即为新的文件描述符设置 close-on-exec (FD_CLOEXEC)，这个标志表示当进程成功调用 execve 之后，该文件描述符是否要在新程序中被自动关闭。

epoll_ctl#

epoll_ctl 用于控制 epoll 实例上的监听的文件描述符，其中 epfd 就是 epoll 文件描述符，op 是指可以做的操作 (operation)，一共有三种：

EPOLL_CTL_ADD
EPOLL_CTL_MOD
EPOLL_CTL_DEL

顾名思义，添加、修改和删除。

后面的参数就是要操作的文件描述符 fd，以及设置好的想要监听的事件集合，后者存放在 struct epoll_event 中：

/*
 * User data attached to a registered file descriptor. It is a union, so only
 * one of the members holds a meaningful value at any time.
 */
typedef union epoll_data {
    void        *ptr;
    int          fd;
    uint32_t     u32;
    uint64_t     u64;
} epoll_data_t;

/* One entry passed to epoll_ctl() and filled in by epoll_wait(). */
struct epoll_event {
    uint32_t     events;      /* Epoll events */
    epoll_data_t data;        /* User data variable */
};

struct epoll_event 中的 events 是个位掩码，表明当前监听的事件，列举几个比较重要的：

EPOLLIN/EPOLLOUT，文件可读/写
EPOLLRDHUP，对端关闭连接，或者关闭了连接的写半部（半关闭）
EPOLLERR，默认参数，文件描述符上发生错误
EPOLLHUP，默认参数，文件被挂断，在 socket/pipe 上代表对端关闭了连接
EPOLLET，开启 edge-triggered，默认是 level-triggered
EPOLLONESHOT，一次触发后自动禁用对该文件描述符的监听，需要用 EPOLL_CTL_MOD 重新启用
EPOLLWAKEUP，如果 EPOLLONESHOT 和 EPOLLET 清除了，并且进程拥有 CAP_BLOCK_SUSPEND 权限，那么这个标志能够保证事件在挂起或者处理的时候，系统不会挂起或休眠

epoll_wait/epoll_pwait#

epoll_wait 阻塞并等待文件描述符上的事件，需要保证 events 数组的大小不小于 maxevents。epoll_wait 将阻塞直到：

一个文件描述符产生事件
被信号打断
超时 (timeout)

并返回当前事件的数量。

epoll_pwait 多设置一个 sigmask，代表不想被这些信号打断，其余的相当于 epoll_wait。

性能测试#

对 poll/select 和 epoll 在监听不同数量文件描述符时的系统调用消耗对比，参考自参考文献表中的第一个网站。

# operations  |  poll  |  select   | epoll
10            |   0.61 |    0.73   | 0.41
100           |   2.9  |    3.0    | 0.42
1000          |  35    |   35      | 0.53
10000         | 990    |  930      | 0.66

参考文献#

[1] https://jvns.ca/blog/2017/06/03/async-io-on-linux--select--poll--and-epoll/

[2] Linux Programmer’s Manual: man epoll/epoll_create/epoll_ctl/epoll_wait

基于 epoll 的简易服务器#

以下使用 C 语言实现了一个简单的服务器（listen 的 backlog 设为 100），对每个新建的连接，将它加入 epoll 队列中。当 IO 事件到达时，处理对应的客户端的 IO 事件。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <sys/epoll.h>

/* Capacity of the 'events' array handed to epoll_wait(). */
#define MAX_EVENTS 10
/* TCP port the listening socket is bound to. */
#define LISTEN_PORT 1234
/* Maximum number of bytes requested by a single read() when echoing. */
#define BUF_LEN 512
/* Backlog value passed to listen(). */
#define MAX_CONN 100

/*
 * ev     - scratch descriptor filled in before each epoll_ctl() call.
 * events - result array filled in by epoll_wait().
 */
struct epoll_event ev, events[MAX_EVENTS];
/*
 * listen_sock - the listening socket.
 * conn_sock   - the most recently accepted connection.
 * nfds        - not written by the code in this file (epoll_loop uses a
 *               local counter instead).
 * epollfd     - the epoll instance.
 */
int listen_sock, conn_sock, nfds, epollfd;
/* Address the listening socket is bound to. */
struct sockaddr_in server;

/* Logging goes straight to stdout. */
#define log(...) printf(__VA_ARGS__)

void response_to_conn(int conn_sock) {
24
    char buf[BUF_LEN + 1];
25

26
    int read_len = 0;
27
    while ((read_len = read(conn_sock, buf, BUF_LEN)) > 0) {
28
        buf[read_len] = '\0';
29

30
        int cursor = 0;
31
        while (cursor < read_len) {
32
            // writing to a pipe or socket whose reading end is closed
33
            // will lead to a SIGPIPE
34
            int len = write(conn_sock, buf + cursor, read_len - cursor);
35
            if (len < 0) {
36
                perror("write");
37
                return;
38
            }
39
            cursor += len;
40
        }
41

42
        // there are no data so we do not have to do another read
43
        if (read_len < BUF_LEN) {
44
            break;
45
        }
46
    }
47

48
    // must make sure that the next read will block this non-blocking
49
    // socket, then we think the event is fully consumed.
50
    if (read_len < 0 && errno == EAGAIN) {
51
        return;
52
    }
53
    // end of file
54
    if (read_len == 0) {
55
        return;
56
    }
57
}
58

59
/* Code to set up listening socket, 'listen_sock' */
60
void listen_and_bind() {
61
    if ((listen_sock = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
62
        perror("socket");
63
        exit(EXIT_FAILURE);
64
    }
65
    int option = 1;
66
    setsockopt(listen_sock, SOL_SOCKET, SO_REUSEADDR, &option, sizeof(option));
67

68
    server.sin_family = AF_INET;
69
    server.sin_addr.s_addr = INADDR_ANY;
70
    server.sin_port = htons(LISTEN_PORT);
71
    if (bind(listen_sock, (struct sockaddr *)&server, sizeof(server)) == -1) {
72
        perror("bind");
73
        exit(EXIT_FAILURE);
74
    }
75

76
    listen(listen_sock, MAX_CONN);
77
}
78

79
void create_epoll() {
80
    epollfd = epoll_create1(0);
81
    if (epollfd == -1) {
82
        perror("epoll_create1");
83
        exit(EXIT_FAILURE);
84
    }
85

86
    ev.events = EPOLLIN;
87
    ev.data.fd = listen_sock;
88
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, listen_sock, &ev) == -1) {
89
        perror("epoll_ctl: listen_sock");
90
        exit(EXIT_FAILURE);
91
    }
92
}
93

94
/*
 * Add O_NONBLOCK to the file status flags of fd, keeping the flags that are
 * already set. Errors from fcntl() are reported with perror() and otherwise
 * ignored; the function returns nothing.
 */
void set_fd_nonblocking(int fd) {
    int flags = fcntl(fd, F_GETFL, 0);
    if (flags == -1) {
        perror("getfl");
        return;
    }
    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) < 0) {
        perror("setfl");
        return;
    }
}

void epoll_loop() {
107
    for (;;) {
108
        int nfds = epoll_wait(epollfd, events, MAX_EVENTS, -1);
109
        if (nfds == -1) {
110
            perror("epoll_wait");
111
            exit(EXIT_FAILURE);
112
        }
113

114
        log("get %d events from epoll_wait!\n", nfds);
115

116
        for (int n = 0; n < nfds; ++ n) {
117
            if (events[n].data.fd == listen_sock) {
118
                struct sockaddr_in local;
119
                socklen_t addrlen;
120
                conn_sock = accept(listen_sock, (struct sockaddr *) &local, &addrlen);
121
                if (conn_sock == -1) {
122
                    perror("accept");
123
                    exit(EXIT_FAILURE);
124
                }
125

126
                log("accept a new connection!\n");
127

128
                // set non-blocking
129
                set_fd_nonblocking(conn_sock);
130

131
                ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
132
                ev.data.fd = conn_sock;
133
                if (epoll_ctl(epollfd, EPOLL_CTL_ADD, conn_sock, &ev) == -1) {
134
                    perror("epoll_ctl: conn_sock");
135
                    exit(EXIT_FAILURE);
136
                }
137
            } else {
138
                if (events[n].events & (EPOLLRDHUP | EPOLLERR)) {
139
                    log("detect a closed/broken connection!\n");
140
                    epoll_ctl(epollfd, EPOLL_CTL_DEL, events[n].data.fd, NULL);
141
                    close(events[n].data.fd);
142
                } else response_to_conn(events[n].data.fd);
143
            }
144
        }
145
    }
146
}
147

148
int main(int argc, char **argv) {
149
    log("listenning on port 1234!\n");
150
    listen_and_bind();
151

152
    log("creating epoll!\n");
153
    create_epoll();
154

155
    log("starting loop on epoll!\n");
156
    epoll_loop();
157

158
    return 0;
159
}