slirp4netns — How does it work
slirp4netns provides user-mode networking (“slirp”) for unprivileged network namespaces. This is heavily used in rootless-containers.
Let us examine how it actually works. You can do this without reading the source code (though you really should read it too), using a few simple Linux tools. What you find will expose you to some really nice Linux features you may not be aware of.
Create a process with its own user, network and mount namespaces
$ unshare --user --map-root-user --net --mount
[root@incensed-gawain ~]# echo $$
2646
Run the slirp process on the host and connect it to the process namespace
strace the process to examine what happens
strace -f slirp4netns --configure --mtu=65520 2646 tap0
It creates a socketpair and clones into the child
The fd is still available in the child process and accessible across the network namespace boundary
The parent waits for the child to exit, then reads what the child sent back on the socketpair
socketpair(AF_UNIX, SOCK_STREAM, 0, [3, 4]) = 0
clone(child_stack=NULL, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f5bc798b810) = 2667
wait4(2667, strace: Process 2667 attached
<unfinished ...>
Child
Creates the tap interface
[pid 2667] openat(AT_FDCWD, "/proc/2646/ns/user", O_RDONLY) = 5
[pid 2667] openat(AT_FDCWD, "/proc/2646/ns/net", O_RDONLY) = 6
[pid 2667] setns(5, CLONE_NEWUSER) = 0
[pid 2667] setns(6, CLONE_NEWNET) = 0
[pid 2667] close(5) = 0
[pid 2667] close(6) = 0
[pid 2667] openat(AT_FDCWD, "/dev/net/tun", O_RDWR) = 5
The tap fd in the child is 5
[pid 2667] ioctl(5, TUNSETIFF, 0x7ffd60075390) = 0
[pid 2667] socket(AF_INET, SOCK_DGRAM, IPPROTO_IP) = 6
[pid 2667] ioctl(6, SIOCSIFFLAGS, {ifr_name="tap0", ifr_flags=IFF_UP|IFF_RUNNING}) = 0
[pid 2667] ioctl(6, SIOCSIFMTU, {ifr_name="tap0", ifr_mtu=65520}) = 0
[pid 2667] ioctl(6, SIOCSIFADDR, {ifr_name="tap0", ifr_addr={sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("10.0.2.100")}}) = 0
[pid 2667] ioctl(6, SIOCSIFNETMASK, {ifr_name="tap0", ifr_netmask={sa_family=AF_INET, sin_port=htons(0), sin_addr=inet_addr("255.255.255.0")}}) = 0
[pid 2667] ioctl(6, SIOCADDRT, 0x7ffd60075390) = 0
Send the fd 5 back to the parent process running on the host as ancillary (control) data on the socketpair, i.e. out of band from the regular byte stream
https://linux.die.net/man/2/sendmsg
MSG_OOB
Sends out-of-band data on sockets that support this notion (e.g., of type SOCK_STREAM); the underlying protocol must also support out-of-band data.
Note that in the call traced below the flags argument is 0 rather than MSG_OOB: the fd travels as an SCM_RIGHTS control message in msg_control, next to a single byte of ordinary data.
[pid 2667] sendmsg(4, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0", iov_len=1}], msg_iovlen=1, msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, cmsg_data=[5]}], msg_controllen=20, msg_flags=0}, 0) = 1
[pid 2667] write(2, "sent tapfd=5 for tap0\n", 22sent tapfd=5 for tap0
) = 22
[pid 2667] close(4) = 0
[pid 2667] exit_group(0) = ?
[pid 2667] +++ exited with 0 +++
Parent
Picks up the fd 5. The parent reads from this fd to get packets from the container.
That is how network traffic makes it across the network ns even though tap interfaces cannot cross a network namespace boundary
<... wait4 resumed> [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], 0, NULL) = 2667
--- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_EXITED, si_pid=2667, si_uid=1000, si_status=0, si_utime=0, si_stime=0} ---
recvmsg(3, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0", iov_len=1}], msg_iovlen=1, msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, cmsg_data=[5]}], msg_controllen=24, msg_flags=0}, 0) = 1
write(2, "received tapfd=5\n", 17received tapfd=5
) = 17
close(3) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(0x88, 0x1), ...}) = 0
write(1, "Starting slirp\n", 15Starting slirp
) = 15
write(1, "* MTU: 65520\n", 25* MTU: 65520
) = 25
write(1, "* Network: 10.0.2.0\n", 28* Network: 10.0.2.0
) = 28
write(1, "* Netmask: 255.255.255.0"..., 33* Netmask: 255.255.255.0
) = 33
write(1, "* Gateway: 10.0.2.2\n", 28* Gateway: 10.0.2.2
) = 28
write(1, "* DNS: 10.0.2.3\n", 28* DNS: 10.0.2.3
) = 28
write(1, "* Recommended IP: 10.0.2.100\n", 30* Recommended IP: 10.0.2.100
) = 30
write(1, "WARNING: 127.0.0.1:* on the host"..., 127WARNING: 127.0.0.1:* on the host is accessible as 10.0.2.2 (set --disable-host-loopback to prohibit connecting to 127.0.0.1:*)
) = 127
rt_sigaction(SIGPIPE, {sa_handler=SIG_IGN, sa_mask=[PIPE], sa_flags=SA_RESTORER|SA_RESTART, sa_restorer=0x7f5bc73e2f30}, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0
poll([{fd=5, events=POLLIN|POLLHUP}], 1, 1000) = 1 ([{fd=5, revents=POLLIN}])
read(5, "33\0\0\0\26\372N1\230}\325\206\335`\0\0\0\0$\0\1\0\0\0\0\0\0\0\0\0\0"..., 65536) = 90
brk(NULL) = 0xe03000
brk(0xe2e000) = 0xe2e000
poll([{fd=5, events=POLLIN|POLLHUP}], 1, 1000) = 1 ([{fd=5, revents=POLLIN}])