Skip to content

Commit

Permalink
criu: Support C/R of pidfds
Browse files Browse the repository at this point in the history
Process file descriptors (pidfds) were introduced to provide a stable
handle on a process. They solve the problem of pid recycling.

For a detailed explanation, see https://lwn.net/Articles/801319/ and
http://www.corsix.org/content/what-is-a-pidfd

Before Linux 6.9, anonymous inodes were used for the implementation of
pidfds. So, we detect them in a fashion similar to other fd types that
use anonymous inodes by calling `readlink()`.
After 6.9, pidfs (a file system for pidfds) was introduced.
After this change, pidfs inodes have no file type in st_mode in
userspace.
(https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/pidfs.c?h=v6.11-rc2#n285)
We use `PID_FS_MAGIC` to detect pidfds on kernels >= 6.9.

For pidfds that refer to dead processes, we lose the pid of the process
as the Pid and NSpid fields in /proc/<pid>/fdinfo/<pidfd> change to -1.
So, we create a temporary process for each unique inode and open pidfds
that refer to this process. After all pidfds have been opened we kill
this temporary process.

Fixes: checkpoint-restore#2258 checkpoint-restore#2459

Signed-off-by: Bhavik Sachdev <[email protected]>
  • Loading branch information
bsach64 committed Aug 14, 2024
1 parent aa88b35 commit ea95ecc
Show file tree
Hide file tree
Showing 12 changed files with 297 additions and 1 deletion.
1 change: 1 addition & 0 deletions criu/Makefile.crtools
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ obj-$(CONFIG_COMPAT) += vdso-compat.o
CFLAGS_REMOVE_vdso-compat.o += $(CFLAGS-ASAN) $(CFLAGS-GCOV)
obj-y += pidfd-store.o
obj-y += hugetlb.o
obj-y += pidfd.o

PROTOBUF_GEN := scripts/protobuf-gen.sh

Expand Down
3 changes: 2 additions & 1 deletion criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
#include "timens.h"
#include "bpfmap.h"
#include "apparmor.h"
#include "pidfd.h"

#include "parasite-syscall.h"
#include "files-reg.h"
Expand Down Expand Up @@ -280,7 +281,7 @@ static struct collect_image_info *cinfos_files[] = {
&unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo,
&netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo,
&tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo,
&fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo,
&fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &pidfd_cinfo
};

/* These images are required to restore namespaces */
Expand Down
12 changes: 12 additions & 0 deletions criu/files.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include "kerndat.h"
#include "fdstore.h"
#include "bpfmap.h"
#include "pidfd.h"

#include "protobuf.h"
#include "util.h"
Expand Down Expand Up @@ -544,6 +545,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
ops = &signalfd_dump_ops;
else if (is_timerfd_link(link))
ops = &timerfd_dump_ops;
else if (is_pidfd_link(link))
ops = &pidfd_dump_ops;
#ifdef CONFIG_HAS_LIBBPF
else if (is_bpfmap_link(link))
ops = &bpfmap_dump_ops;
Expand Down Expand Up @@ -582,6 +585,11 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
return do_dump_gen_file(&p, lfd, ops, e);
}

if (p.fs_type == PID_FS_MAGIC) {
ops = &pidfd_dump_ops;
return do_dump_gen_file(&p, lfd, ops, e);
}

/*
* For debug purpose -- at least show the link
* file pointing to when reporting unsupported file.
Expand Down Expand Up @@ -1778,6 +1786,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i)
case FD_TYPES__MEMFD:
ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo);
break;
case FD_TYPES__PIDFD:
ret = collect_one_file_entry(fe, fe->pidfd->id, &fe->pidfd->base, &pidfd_cinfo);
break;
#ifdef CONFIG_HAS_LIBBPF
case FD_TYPES__BPFMAP:
ret = collect_one_file_entry(fe, fe->bpf->id, &fe->bpf->base, &bpfmap_cinfo);
Expand All @@ -1800,5 +1811,6 @@ int prepare_files(void)
{
init_fdesc_hash();
init_sk_info_hash();
init_dead_pidfd_hash();
return collect_image(&files_cinfo);
}
1 change: 1 addition & 0 deletions criu/image-desc.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF),
FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF),
FD_ENTRY(APPARMOR, "apparmor"),
FD_ENTRY(PIDFD, "pidfd"),

[CR_FD_STATS] = {
.fmt = "stats-%s",
Expand Down
4 changes: 4 additions & 0 deletions criu/include/fs-magic.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,8 @@
#define OVERLAYFS_SUPER_MAGIC 0x794c7630
#endif

/*
 * Filesystem magic compared against fd_parms.fs_type in dump_one_file()
 * to recognise pidfs-backed pidfds. The value spells "PIDF" in ASCII.
 * Defined here only when the system headers do not already provide it.
 */
#ifndef PID_FS_MAGIC
#define PID_FS_MAGIC 0x50494446
#endif

#endif /* __CR_FS_MAGIC_H__ */
1 change: 1 addition & 0 deletions criu/include/image-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ enum {
CR_FD_PIPES,
CR_FD_TTY_FILES,
CR_FD_MEMFD_FILE,
CR_FD_PIDFD,

CR_FD_AUTOFS,

Expand Down
1 change: 1 addition & 0 deletions criu/include/magic.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@
#define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */
#define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */
#define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */
#define PIDFD_MAGIC 0x54435556 /* Ufa */

#define IFADDR_MAGIC RAW_IMAGE_MAGIC
#define ROUTE_MAGIC RAW_IMAGE_MAGIC
Expand Down
16 changes: 16 additions & 0 deletions criu/include/pidfd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#ifndef __CR_PIDFD_H__
#define __CR_PIDFD_H__

#include "files.h"
#include "pidfd.pb-c.h"

extern const struct fdtype_ops pidfd_dump_ops;
extern struct collect_image_info pidfd_cinfo;
extern int is_pidfd_link(char *link);
extern void init_dead_pidfd_hash(void);
/*
 * Scratch data used while dumping one pidfd; handed to parse_fdinfo()
 * with type FD_TYPES__PIDFD by dump_one_pidfd().
 */
struct pidfd_dump_info {
/* Image entry being built; id, flags and fown are set by dump_one_pidfd() */
PidfdEntry pidfe;
/*
 * Pid the pidfd refers to (presumably filled in by parse_fdinfo() --
 * confirm there). dump_one_pidfd() skips the process-tree check when
 * this is -1.
 */
pid_t pid;
};

#endif /* __CR_PIDFD_H__ */
1 change: 1 addition & 0 deletions criu/include/protobuf-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ enum {
PB_BPFMAP_FILE,
PB_BPFMAP_DATA,
PB_APPARMOR,
PB_PIDFD,

/* PB_AUTOGEN_STOP */

Expand Down
228 changes: 228 additions & 0 deletions criu/pidfd.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
#include "files.h"
#include "imgset.h"
#include "pidfd.h"
#include "fdinfo.h"
#include "pidfd.pb-c.h"
#include "protobuf.h"
#include "pstree.h"
#include <stdlib.h>
#include <sys/wait.h>
#include <signal.h>
#include "common/bug.h"

#undef LOG_PREFIX
#define LOG_PREFIX "pidfd: "

/*
 * Restore-side state for one pidfd image entry: the decoded protobuf
 * message together with the generic file descriptor it is registered as.
 */
struct pidfd_info {
/* Image entry; assigned from the collected message in collect_one_pidfd() */
PidfdEntry *pidfe;
/* Embedded descriptor; open_one_pidfd() maps it back with container_of() */
struct file_desc d;
};

/*
 * Bookkeeping for pidfds whose image entry has nspid == -1, shared by
 * all such pidfds that carry the same inode number.
 */
struct dead_pidfd {
/* Inode number taken from the image entry; also the hash key */
unsigned int ino;
/* Pid of the temporary stand-in process, -1 until one has been forked */
int pid;
/* Collected pidfds with this inode that have not been opened yet */
size_t count;
/* Link in the dead_pidfd_hash bucket chain */
struct hlist_node hash;
};

/* Number of buckets; an entry lives in bucket ino % DEAD_PIDFD_HASH_SIZE */
#define DEAD_PIDFD_HASH_SIZE 32
/* Bucket heads of the dead-pidfd table, reset by init_dead_pidfd_hash() */
static struct hlist_head dead_pidfd_hash[DEAD_PIDFD_HASH_SIZE];

void init_dead_pidfd_hash(void)
{
for (int i = 0; i < DEAD_PIDFD_HASH_SIZE; i++)
INIT_HLIST_HEAD(&dead_pidfd_hash[i]);
}

static struct dead_pidfd *lookup_dead_pidfd(unsigned int ino)
{
struct dead_pidfd *dead;
struct hlist_head *chain;

chain = &dead_pidfd_hash[ino % DEAD_PIDFD_HASH_SIZE];
hlist_for_each_entry(dead, chain, hash) {
if (dead->ino == ino)
return dead;
}
return NULL;
}

static void free_dead_pidfd_hash(void)
{
int i;
struct dead_pidfd *dead;
struct hlist_node *tmp;

for (i = 0; i < DEAD_PIDFD_HASH_SIZE; i++) {
hlist_for_each_entry_safe(dead, tmp, &dead_pidfd_hash[i], hash) {
xfree(dead);
}
INIT_HLIST_HEAD(&dead_pidfd_hash[i]);
}
}

/*
 * Check whether @link names a pidfd backed by an anonymous inode.
 *
 * Anonymous inodes were used for pidfds before pidfs was introduced
 * in Linux 6.9, so this covers the older kernels only.
 *
 * Returns whatever is_anon_link_type() reports for the "[pidfd]" type.
 */
int is_pidfd_link(char *link)
{
	int matches = is_anon_link_type(link, "[pidfd]");

	return matches;
}

/*
 * Log the key fields of a pidfd image entry at info level.
 *
 * @action: prefix describing what is being done with the entry
 * @pidfe:  entry whose id, flags, nspid and ino are printed
 */
static void pr_info_pidfd(char *action, PidfdEntry *pidfe)
{
	pr_info("%s: id %#08x flags %u NSpid %d ino %u\n",
		action,
		pidfe->id,
		pidfe->flags,
		pidfe->nspid,
		pidfe->ino);
}

/*
 * Dump one pidfd into the files image.
 *
 * @pidfd: descriptor whose fdinfo is parsed
 * @id:    file id recorded in the image entry
 * @p:     generic fd parameters; flags and fown are taken from here
 *
 * Returns -1 when fdinfo parsing fails or when the pidfd refers to a
 * process outside the dumped process tree, otherwise the result of
 * pb_write_one().
 */
static int dump_one_pidfd(int pidfd, u32 id, const struct fd_parms *p)
{
	struct pidfd_dump_info pidfd_info = { .pidfe = PIDFD_ENTRY__INIT };
	FileEntry fe = FILE_ENTRY__INIT;

	if (parse_fdinfo(pidfd, FD_TYPES__PIDFD, &pidfd_info))
		return -1;

	/*
	 * Check if the pid the pidfd refers to is part of the process
	 * tree. This ensures the process will exist on restore. A pid
	 * of -1 is let through without this check.
	 */
	if (pidfd_info.pid != -1 && !pstree_item_by_real(pidfd_info.pid)) {
		/* Terminate the message with a newline like the other log lines */
		pr_err("pidfd pid %d is not a part of process tree..\n",
		       pidfd_info.pid);
		return -1;
	}

	pidfd_info.pidfe.id = id;
	/* The access-mode bits are dropped from the flags that get stored */
	pidfd_info.pidfe.flags = (p->flags & ~O_RDWR);
	pidfd_info.pidfe.fown = (FownEntry *)&p->fown;

	fe.type = FD_TYPES__PIDFD;
	fe.id = pidfd_info.pidfe.id;
	fe.pidfd = &pidfd_info.pidfe;

	pr_info_pidfd("Dumping", &pidfd_info.pidfe);
	return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE);
}

/*
 * Dump operations for pidfds. dump_one_file() selects these either via
 * is_pidfd_link() or when the fd's fs_type equals PID_FS_MAGIC.
 */
const struct fdtype_ops pidfd_dump_ops = {
.type = FD_TYPES__PIDFD,
.dump = dump_one_pidfd,
};

/*
 * Thin wrapper around the raw pidfd_open system call.
 *
 * @pid:   process to obtain a pidfd for
 * @flags: flags passed through to the kernel unchanged
 *
 * Returns the value of syscall(): a new descriptor, or -1 on failure.
 */
static int pidfd_open(pid_t pid, int flags)
{
	long ret = syscall(__NR_pidfd_open, pid, flags);

	return (int)ret;
}

/*
 * Fork a placeholder child that does nothing until it is killed.
 *
 * open_one_pidfd() opens pidfds on this child and later sends it
 * SIGKILL, so the child only has to stay alive in the meantime.
 *
 * Returns the child's pid in the parent, or -1 if fork() failed.
 * Never returns in the child.
 */
static int create_tmp_process(void)
{
	int tmp_process;

	tmp_process = fork();
	if (tmp_process < 0) {
		pr_perror("Could not fork");
		return -1;
	}

	if (tmp_process == 0) {
		/*
		 * Block instead of spinning so the child does not burn CPU
		 * while it waits. pause() may return after a caught
		 * signal, hence the loop.
		 */
		while (1)
			pause();
	}

	return tmp_process;
}

/*
 * Restore one pidfd described by @d and return it through @new_fd.
 *
 * When the image entry carries a valid nspid the pidfd is opened
 * directly on that pid. Otherwise (nspid == -1) the pidfd is opened on
 * a temporary process shared by all entries with the same inode; the
 * process is forked on first use and killed and reaped once the last
 * pidfd for that inode has been opened.
 *
 * Returns 0 on success with *new_fd set. On failure returns -1, closes
 * any pidfd opened here and frees the whole dead-pidfd hash table.
 */
static int open_one_pidfd(struct file_desc *d, int *new_fd)
{
	struct pidfd_info *info;
	struct dead_pidfd *dead;
	int pidfd = -1;

	info = container_of(d, struct pidfd_info, d);

	if (info->pidfe->nspid != -1) {
		pidfd = pidfd_open(info->pidfe->nspid, info->pidfe->flags);
		if (pidfd < 0)
			goto err_close;
		goto out;
	}

	/* collect_one_pidfd() registers an entry for every nspid == -1 pidfd */
	dead = lookup_dead_pidfd(info->pidfe->ino);
	BUG_ON(!dead);

	dead->count--;
	if (dead->pid == -1) {
		dead->pid = create_tmp_process();
		if (dead->pid < 0)
			goto err_close;
	}

	pidfd = pidfd_open(dead->pid, info->pidfe->flags);
	if (pidfd < 0)
		goto err_close;

	if (dead->count == 0) {
		int status;

		/*
		 * Last pidfd for this inode: kill and reap the stand-in
		 * process. Only inspect the status when waitpid() actually
		 * reaped our child, and require death by SIGKILL.
		 */
		if (kill(dead->pid, SIGKILL))
			goto err_close;
		if (waitpid(dead->pid, &status, 0) != dead->pid)
			goto err_close;
		if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL)
			goto err_close;
		hlist_del(&dead->hash);
		xfree(dead);
	}
out:
	if (rst_file_params(pidfd, info->pidfe->fown, info->pidfe->flags))
		goto err_close;

	*new_fd = pidfd;
	return 0;

err_close:
	/* Report first so that cleanup cannot disturb errno */
	pr_perror("Can't create pidfd %#08x NSpid: %d flags: %u",
		  info->pidfe->id, info->pidfe->nspid, info->pidfe->flags);
	/* Do not leak a descriptor that was opened before the failure */
	if (pidfd >= 0)
		close(pidfd);
	free_dead_pidfd_hash();
	return -1;
}

/*
 * Restore-side descriptor operations for pidfds; registered for every
 * collected entry by collect_one_pidfd() through file_desc_add().
 */
static struct file_desc_ops pidfd_desc_ops = {
.type = FD_TYPES__PIDFD,
.open = open_one_pidfd
};

/*
 * Collect one pidfd image entry.
 *
 * @obj: private area used as a struct pidfd_info
 * @msg: decoded protobuf message for the entry
 * @i:   image the entry came from (unused here)
 *
 * Entries with nspid == -1 are also counted in the dead-pidfd hash
 * table, one record per inode number, so that open_one_pidfd() knows
 * how many pidfds share a temporary process.
 *
 * Returns -1 if a new record cannot be allocated, otherwise the
 * result of file_desc_add().
 */
static int collect_one_pidfd(void *obj, ProtobufCMessage *msg, struct cr_img *i)
{
	struct pidfd_info *info = obj;
	PidfdEntry *pidfe = pb_msg(msg, PidfdEntry);
	struct dead_pidfd *dead;

	info->pidfe = pidfe;
	pr_info_pidfd("Collected ", pidfe);

	if (pidfe->nspid == -1) {
		dead = lookup_dead_pidfd(pidfe->ino);
		if (dead) {
			/* Another pidfd on an inode that is already tracked */
			dead->count++;
		} else {
			dead = xmalloc(sizeof(*dead));
			if (!dead)
				return -1;

			INIT_HLIST_NODE(&dead->hash);
			dead->ino = pidfe->ino;
			dead->count = 1;
			dead->pid = -1;

			hlist_add_head(&dead->hash,
				       &dead_pidfd_hash[dead->ino % DEAD_PIDFD_HASH_SIZE]);
		}
	}

	return file_desc_add(&info->d, pidfe->id, &pidfd_desc_ops);
}

/*
 * Collector description for pidfd image entries. collect_one_pidfd()
 * treats its private object as a struct pidfd_info, which is why
 * priv_size is the size of that structure.
 */
struct collect_image_info pidfd_cinfo = {
.fd_type = CR_FD_PIDFD,
.pb_type = PB_PIDFD,
.priv_size = sizeof(struct pidfd_info),
.collect = collect_one_pidfd,
};
Loading

0 comments on commit ea95ecc

Please sign in to comment.