From 75215c972581d3934e76a57690cf838d7ceab399 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 18 Jun 2025 22:53:38 +0200
Subject: pidfs: move to anonymous struct

Move the pidfs entries to an anonymous struct.

Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-4-98f3456fd552@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pid.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 453ae6d8a68d..00646a692dd4 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -52,14 +52,15 @@ struct upid {
 	struct pid_namespace *ns;
 };
 
-struct pid
-{
+struct pid {
 	refcount_t count;
 	unsigned int level;
 	spinlock_t lock;
-	struct dentry *stashed;
-	u64 ino;
-	struct rb_node pidfs_node;
+	struct {
+		u64 ino;
+		struct rb_node pidfs_node;
+		struct dentry *stashed;
+	};
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
 	struct hlist_head inodes;
-- 
cgit v1.2.3


From 8ec7c826d97b390879df2a03dfb035c70af86779 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 18 Jun 2025 22:53:39 +0200
Subject: pidfs: persist information

Persist exit and coredump information independent of whether anyone
currently holds a pidfd for the struct pid.

The current scheme allocated pidfs dentries on-demand repeatedly.
This scheme is reaching it's limits as it makes it impossible to pin
information that needs to be available after the task has exited or
coredumped and that should not be lost simply because the pidfd got
closed temporarily. The next opener should still see the stashed
information.

This is also a prerequisite for supporting extended attributes on
pidfds to allow attaching meta information to them.

If someone opens a pidfd for a struct pid a pidfs dentry is allocated
and stashed in pid->stashed. Once the last pidfd for the struct pid is
closed the pidfs dentry is released and removed from pid->stashed.

So if 10 callers create a pidfs dentry for the same struct pid
sequentially, i.e., each closing the pidfd before the other creates a
new one then a new pidfs dentry is allocated every time.

Because multiple tasks acquiring and releasing a pidfd for the same
struct pid can race with each another a task may still find a valid
pidfs entry from the previous task in pid->stashed and reuse it. Or it
might find a dead dentry in there and fail to reuse it and so stashes a
new pidfs dentry. Multiple tasks may race to stash a new pidfs dentry
but only one will succeed, the other ones will put their dentry.

The current scheme aims to ensure that a pidfs dentry for a struct pid
can only be created if the task is still alive or if a pidfs dentry
already existed before the task was reaped and so exit information has
been was stashed in the pidfs inode.

That's great except that it's buggy. If a pidfs dentry is stashed in
pid->stashed after pidfs_exit() but before __unhash_process() is called
we will return a pidfd for a reaped task without exit information being
available.

The pidfds_pid_valid() check does not guard against this race as it
doens't sync at all with pidfs_exit(). The pid_has_task() check might be
successful simply because we're before __unhash_process() but after
pidfs_exit().

Introduce a new scheme where the lifetime of information associated with
a pidfs entry (coredump and exit information) isn't bound to the
lifetime of the pidfs inode but the struct pid itself.

The first time a pidfs dentry is allocated for a struct pid a struct
pidfs_attr will be allocated which will be used to store exit and
coredump information.

If all pidfs for the pidfs dentry are closed the dentry and inode can be
cleaned up but the struct pidfs_attr will stick until the struct pid
itself is freed. This will ensure minimal memory usage while persisting
relevant information.

The new scheme has various advantages. First, it allows to close the
race where we end up handing out a pidfd for a reaped task for which no
exit information is available. Second, it minimizes memory usage.
Third, it allows to remove complex lifetime tracking via dentries when
registering a struct pid with pidfs. There's no need to get or put a
reference. Instead, the lifetime of exit and coredump information
associated with a struct pid is bound to the lifetime of struct pid
itself.

Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-5-98f3456fd552@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pidfs.c            | 212 ++++++++++++++++++++++++++++++++++----------------
 include/linux/pid.h   |   3 +
 include/linux/pidfs.h |   1 +
 kernel/pid.c          |   2 +-
 4 files changed, 151 insertions(+), 67 deletions(-)

(limited to 'include/linux')

diff --git a/fs/pidfs.c b/fs/pidfs.c
index ff2560b34ed1..6a907457b1fe 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -25,7 +25,10 @@
 #include "internal.h"
 #include "mount.h"
 
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
 static struct kmem_cache *pidfs_cachep __ro_after_init;
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
 
 /*
  * Stashes information that userspace needs to access even after the
@@ -37,6 +40,11 @@ struct pidfs_exit_info {
 	__u32 coredump_mask;
 };
 
+struct pidfs_attr {
+	struct pidfs_exit_info __pei;
+	struct pidfs_exit_info *exit_info;
+};
+
 struct pidfs_inode {
 	struct pidfs_exit_info __pei;
 	struct pidfs_exit_info *exit_info;
@@ -125,6 +133,7 @@ void pidfs_add_pid(struct pid *pid)
 
 	pid->ino = pidfs_ino_nr;
 	pid->stashed = NULL;
+	pid->attr = NULL;
 	pidfs_ino_nr++;
 
 	write_seqcount_begin(&pidmap_lock_seq);
@@ -139,6 +148,18 @@ void pidfs_remove_pid(struct pid *pid)
 	write_seqcount_end(&pidmap_lock_seq);
 }
 
+void pidfs_free_pid(struct pid *pid)
+{
+	/*
+	 * Any dentry must've been wiped from the pid by now.
+	 * Otherwise there's a reference count bug.
+	 */
+	VFS_WARN_ON_ONCE(pid->stashed);
+
+	if (!IS_ERR(pid->attr))
+		kfree(pid->attr);
+}
+
 #ifdef CONFIG_PROC_FS
 /**
  * pidfd_show_fdinfo - print information about a pidfd
@@ -261,13 +282,13 @@ static __u32 pidfs_coredump_mask(unsigned long mm_flags)
 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
-	struct inode *inode = file_inode(file);
 	struct pid *pid = pidfd_pid(file);
 	size_t usize = _IOC_SIZE(cmd);
 	struct pidfd_info kinfo = {};
 	struct pidfs_exit_info *exit_info;
 	struct user_namespace *user_ns;
 	struct task_struct *task;
+	struct pidfs_attr *attr;
 	const struct cred *c;
 	__u64 mask;
 
@@ -286,8 +307,9 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 	if (!pid_in_current_pidns(pid))
 		return -ESRCH;
 
+	attr = READ_ONCE(pid->attr);
 	if (mask & PIDFD_INFO_EXIT) {
-		exit_info = READ_ONCE(pidfs_i(inode)->exit_info);
+		exit_info = READ_ONCE(attr->exit_info);
 		if (exit_info) {
 			kinfo.mask |= PIDFD_INFO_EXIT;
 #ifdef CONFIG_CGROUPS
@@ -300,7 +322,7 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
 
 	if (mask & PIDFD_INFO_COREDUMP) {
 		kinfo.mask |= PIDFD_INFO_COREDUMP;
-		kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask);
+		kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
 	}
 
 	task = get_pid_task(pid, PIDTYPE_PID);
@@ -552,41 +574,61 @@ struct pid *pidfd_pid(const struct file *file)
  * task has been reaped which cannot happen until we're out of
  * release_task().
  *
- * If this struct pid is referred to by a pidfd then
- * stashed_dentry_get() will return the dentry and inode for that struct
- * pid. Since we've taken a reference on it there's now an additional
- * reference from the exit path on it. Which is fine. We're going to put
- * it again in a second and we know that the pid is kept alive anyway.
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfs for reaped tasks without having
+ * exit information available.
  *
- * Worst case is that we've filled in the info and immediately free the
- * dentry and inode afterwards since the pidfd has been closed. Since
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
  * pidfs_exit() currently is placed after exit_task_work() we know that
- * it cannot be us aka the exiting task holding a pidfd to ourselves.
+ * it cannot be us aka the exiting task holding a pidfd to itself.
  */
 void pidfs_exit(struct task_struct *tsk)
 {
-	struct dentry *dentry;
+	struct pid *pid = task_pid(tsk);
+	struct pidfs_attr *attr;
+	struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+	struct cgroup *cgrp;
+#endif
 
 	might_sleep();
 
-	dentry = stashed_dentry_get(&task_pid(tsk)->stashed);
-	if (dentry) {
-		struct inode *inode = d_inode(dentry);
-		struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei;
-#ifdef CONFIG_CGROUPS
-		struct cgroup *cgrp;
+	guard(spinlock_irq)(&pid->wait_pidfd.lock);
+	attr = pid->attr;
+	if (!attr) {
+		/*
+		 * No one ever held a pidfd for this struct pid.
+		 * Mark it as dead so no one can add a pidfs
+		 * entry anymore. We're about to be reaped and
+		 * so no exit information would be available.
+		 */
+		pid->attr = PIDFS_PID_DEAD;
+		return;
+	}
 
-		rcu_read_lock();
-		cgrp = task_dfl_cgroup(tsk);
-		exit_info->cgroupid = cgroup_id(cgrp);
-		rcu_read_unlock();
+	/*
+	 * If @pid->attr is set someone might still legitimately hold a
+	 * pidfd to @pid or someone might concurrently still be getting
+	 * a reference to an already stashed dentry from @pid->stashed.
+	 * So defer cleaning @pid->attr until the last reference to @pid
+	 * is put
+	 */
+
+	exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(tsk);
+	exit_info->cgroupid = cgroup_id(cgrp);
+	rcu_read_unlock();
 #endif
-		exit_info->exit_code = tsk->exit_code;
+	exit_info->exit_code = tsk->exit_code;
 
-		/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
-		smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei);
-		dput(dentry);
-	}
+	/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+	smp_store_release(&attr->exit_info, &attr->__pei);
 }
 
 #ifdef CONFIG_COREDUMP
@@ -594,16 +636,15 @@ void pidfs_coredump(const struct coredump_params *cprm)
 {
 	struct pid *pid = cprm->pid;
 	struct pidfs_exit_info *exit_info;
-	struct dentry *dentry;
-	struct inode *inode;
+	struct pidfs_attr *attr;
 	__u32 coredump_mask = 0;
 
-	dentry = pid->stashed;
-	if (WARN_ON_ONCE(!dentry))
-		return;
+	attr = READ_ONCE(pid->attr);
 
-	inode = d_inode(dentry);
-	exit_info = &pidfs_i(inode)->__pei;
+	VFS_WARN_ON_ONCE(!attr);
+	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+	exit_info = &attr->__pei;
 	/* Note how we were coredumped. */
 	coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
 	/* Note that we actually did coredump. */
@@ -663,7 +704,7 @@ static struct inode *pidfs_alloc_inode(struct super_block *sb)
 
 static void pidfs_free_inode(struct inode *inode)
 {
-	kmem_cache_free(pidfs_cachep, pidfs_i(inode));
+	kfree(pidfs_i(inode));
 }
 
 static const struct super_operations pidfs_sops = {
@@ -831,8 +872,13 @@ static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path,
 	 * recorded and published can be handled correctly.
 	 */
 	if (unlikely(!pid_has_task(pid, type))) {
-		struct inode *inode = d_inode(path->dentry);
-		return !!READ_ONCE(pidfs_i(inode)->exit_info);
+		struct pidfs_attr *attr;
+
+		attr = READ_ONCE(pid->attr);
+		if (!attr)
+			return false;
+		if (!READ_ONCE(attr->exit_info))
+			return false;
 	}
 
 	return true;
@@ -878,9 +924,67 @@ static void pidfs_put_data(void *data)
 	put_pid(pid);
 }
 
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs. Needs to be paired with
+ * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+	struct pidfs_attr *new_attr __free(kfree) = NULL;
+	struct pidfs_attr *attr;
+
+	might_sleep();
+
+	if (!pid)
+		return 0;
+
+	attr = READ_ONCE(pid->attr);
+	if (unlikely(attr == PIDFS_PID_DEAD))
+		return PTR_ERR(PIDFS_PID_DEAD);
+	if (attr)
+		return 0;
+
+	new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+	if (!new_attr)
+		return -ENOMEM;
+
+	/* Synchronize with pidfs_exit(). */
+	guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+	attr = pid->attr;
+	if (unlikely(attr == PIDFS_PID_DEAD))
+		return PTR_ERR(PIDFS_PID_DEAD);
+	if (unlikely(attr))
+		return 0;
+
+	pid->attr = no_free_ptr(new_attr);
+	return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+					 struct dentry *dentry)
+{
+	int ret;
+	struct pid *pid = d_inode(dentry)->i_private;
+
+	VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+	ret = pidfs_register_pid(pid);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return stash_dentry(stashed, dentry);
+}
+
 static const struct stashed_operations pidfs_stashed_ops = {
-	.init_inode = pidfs_init_inode,
-	.put_data = pidfs_put_data,
+	.stash_dentry	= pidfs_stash_dentry,
+	.init_inode	= pidfs_init_inode,
+	.put_data	= pidfs_put_data,
 };
 
 static int pidfs_init_fs_context(struct fs_context *fc)
@@ -936,33 +1040,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
-/**
- * pidfs_register_pid - register a struct pid in pidfs
- * @pid: pid to pin
- *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
- *
- * Return: On success zero, on error a negative error code is returned.
- */
-int pidfs_register_pid(struct pid *pid)
-{
-	struct path path __free(path_put) = {};
-	int ret;
-
-	might_sleep();
-
-	if (!pid)
-		return 0;
-
-	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
-	if (unlikely(ret))
-		return ret;
-	/* Keep the dentry and only put the reference to the mount. */
-	path.dentry = NULL;
-	return 0;
-}
-
 /**
  * pidfs_get_pid - pin a struct pid through pidfs
  * @pid: pid to pin
@@ -1008,6 +1085,9 @@ void __init pidfs_init(void)
 					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
 					  SLAB_ACCOUNT | SLAB_PANIC),
 					 pidfs_inode_init_once);
+	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
+					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
 	pidfs_mnt = kern_mount(&pidfs_type);
 	if (IS_ERR(pidfs_mnt))
 		panic("Failed to mount pidfs pseudo filesystem");
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 00646a692dd4..003a1027d219 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -47,6 +47,8 @@
 
 #define RESERVED_PIDS 300
 
+struct pidfs_attr;
+
 struct upid {
 	int nr;
 	struct pid_namespace *ns;
@@ -60,6 +62,7 @@ struct pid {
 		u64 ino;
 		struct rb_node pidfs_node;
 		struct dentry *stashed;
+		struct pidfs_attr *attr;
 	};
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 77e7db194914..8f6ed59bb3fb 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -16,5 +16,6 @@ extern const struct dentry_operations pidfs_dentry_operations;
 int pidfs_register_pid(struct pid *pid);
 void pidfs_get_pid(struct pid *pid);
 void pidfs_put_pid(struct pid *pid);
+void pidfs_free_pid(struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 8317bcbc7cf7..07db7d8d066c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -100,7 +100,7 @@ void put_pid(struct pid *pid)
 
 	ns = pid->numbers[pid->level].ns;
 	if (refcount_dec_and_test(&pid->count)) {
-		WARN_ON_ONCE(pid->stashed);
+		pidfs_free_pid(pid);
 		kmem_cache_free(ns->pid_cachep, pid);
 		put_pid_ns(ns);
 	}
-- 
cgit v1.2.3


From 804d6794497e6f3992d156e07d01e22b037ce09e Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 18 Jun 2025 22:53:42 +0200
Subject: pidfs: remove pidfs_{get,put}_pid()

Now that we stash persistent information in struct pid there's no need
to play volatile games with pinning struct pid via dentries in pidfs.

Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-8-98f3456fd552@kernel.org
Reviewed-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/coredump.c         |  6 ------
 fs/pidfs.c            | 35 +----------------------------------
 include/linux/pidfs.h |  2 --
 net/unix/af_unix.c    |  5 -----
 4 files changed, 1 insertion(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/fs/coredump.c b/fs/coredump.c
index f217ebf2b3b6..55d6a713a0fb 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -898,12 +898,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		retval = kernel_connect(socket, (struct sockaddr *)(&addr),
 					addr_len, O_NONBLOCK | SOCK_COREDUMP);
 
-		/*
-		 * ... Make sure to only put our reference after connect() took
-		 * its own reference keeping the pidfs entry alive ...
-		 */
-		pidfs_put_pid(cprm.pid);
-
 		if (retval) {
 			if (retval == -EAGAIN)
 				coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index c49c53d6ae51..bc2342cf4492 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -895,8 +895,7 @@ static void pidfs_put_data(void *data)
  * pidfs_register_pid - register a struct pid in pidfs
  * @pid: pid to pin
  *
- * Register a struct pid in pidfs. Needs to be paired with
- * pidfs_put_pid() to not risk leaking the pidfs dentry and inode.
+ * Register a struct pid in pidfs.
  *
  * Return: On success zero, on error a negative error code is returned.
  */
@@ -1007,38 +1006,6 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	return pidfd_file;
 }
 
-/**
- * pidfs_get_pid - pin a struct pid through pidfs
- * @pid: pid to pin
- *
- * Similar to pidfs_register_pid() but only valid if the caller knows
- * there's a reference to the @pid through a dentry already that can't
- * go away.
- */
-void pidfs_get_pid(struct pid *pid)
-{
-	if (!pid)
-		return;
-	WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed));
-}
-
-/**
- * pidfs_put_pid - drop a pidfs reference
- * @pid: pid to drop
- *
- * Drop a reference to @pid via pidfs. This is only safe if the
- * reference has been taken via pidfs_get_pid().
- */
-void pidfs_put_pid(struct pid *pid)
-{
-	might_sleep();
-
-	if (!pid)
-		return;
-	VFS_WARN_ON_ONCE(!pid->stashed);
-	dput(pid->stashed);
-}
-
 void __init pidfs_init(void)
 {
 	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 8f6ed59bb3fb..3e08c33da2df 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -14,8 +14,6 @@ void pidfs_coredump(const struct coredump_params *cprm);
 #endif
 extern const struct dentry_operations pidfs_dentry_operations;
 int pidfs_register_pid(struct pid *pid);
-void pidfs_get_pid(struct pid *pid);
-void pidfs_put_pid(struct pid *pid);
 void pidfs_free_pid(struct pid *pid);
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2e2e9997a68e..129388c309b0 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -646,9 +646,6 @@ static void unix_sock_destructor(struct sock *sk)
 		return;
 	}
 
-	if (sk->sk_peer_pid)
-		pidfs_put_pid(sk->sk_peer_pid);
-
 	if (u->addr)
 		unix_release_addr(u->addr);
 
@@ -769,7 +766,6 @@ static void drop_peercred(struct unix_peercred *peercred)
 	swap(peercred->peer_pid, pid);
 	swap(peercred->peer_cred, cred);
 
-	pidfs_put_pid(pid);
 	put_pid(pid);
 	put_cred(cred);
 }
@@ -802,7 +798,6 @@ static void copy_peercred(struct sock *sk, struct sock *peersk)
 
 	spin_lock(&sk->sk_peer_lock);
 	sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
-	pidfs_get_pid(sk->sk_peer_pid);
 	sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 	spin_unlock(&sk->sk_peer_lock);
 }
-- 
cgit v1.2.3


From a683a5b2ba23598ad343e5ec10a4ef4077497fc9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Jul 2025 06:34:37 +0100
Subject: fold fs_struct->{lock,seq} into a seqlock

	The combination of spinlock_t lock and seqcount_spinlock_t seq
in struct fs_struct is an open-coded seqlock_t (see linux/seqlock_types.h).
	Combine and switch to equivalent seqlock_t primitives.  AFAICS,
that does end up with the same sequence of underlying operations in all
cases.
	While we are at it, get_fs_pwd() is open-coded verbatim in
get_path_from_fd(); rather than applying conversion to it, replace with
the call of get_fs_pwd() there.  Not worth splitting the commit for that,
IMO...

	A bit of historical background - conversion of seqlock_t to
use of seqcount_spinlock_t happened several months after the same
had been done to struct fs_struct; switching fs_struct to seqlock_t
could've been done immediately after that, but it looks like nobody
had gotten around to that until now.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/20250702053437.GC1880847@ZenIV
Acked-by: Ahmed S. Darwish <darwi@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/d_path.c               |  8 ++++----
 fs/exec.c                 |  4 ++--
 fs/fhandle.c              |  6 +-----
 fs/fs_struct.c            | 36 ++++++++++++++----------------------
 fs/namei.c                |  8 ++++----
 include/linux/fs_struct.h | 11 +++++------
 kernel/fork.c             | 10 +++++-----
 7 files changed, 35 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/d_path.c b/fs/d_path.c
index 5f4da5c8d5db..bb365511066b 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
 	unsigned seq;
 
 	do {
-		seq = read_seqcount_begin(&fs->seq);
+		seq = read_seqbegin(&fs->seq);
 		*root = fs->root;
-	} while (read_seqcount_retry(&fs->seq, seq));
+	} while (read_seqretry(&fs->seq, seq));
 }
 
 /**
@@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
 	unsigned seq;
 
 	do {
-		seq = read_seqcount_begin(&fs->seq);
+		seq = read_seqbegin(&fs->seq);
 		*root = fs->root;
 		*pwd = fs->pwd;
-	} while (read_seqcount_retry(&fs->seq, seq));
+	} while (read_seqretry(&fs->seq, seq));
 }
 
 /*
diff --git a/fs/exec.c b/fs/exec.c
index 1f5fdd2e096e..871078ddb220 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1510,7 +1510,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	 * state is protected by cred_guard_mutex we hold.
 	 */
 	n_fs = 1;
-	spin_lock(&p->fs->lock);
+	read_seqlock_excl(&p->fs->seq);
 	rcu_read_lock();
 	for_other_threads(p, t) {
 		if (t->fs == p->fs)
@@ -1523,7 +1523,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
 	else
 		p->fs->in_exec = 1;
-	spin_unlock(&p->fs->lock);
+	read_sequnlock_excl(&p->fs->seq);
 }
 
 static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index b1363ead6c5e..7c236f64cdea 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -180,11 +180,7 @@ static int get_path_anchor(int fd, struct path *root)
 	}
 
 	if (fd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		spin_lock(&fs->lock);
-		*root = fs->pwd;
-		path_get(root);
-		spin_unlock(&fs->lock);
+		get_fs_pwd(current->fs, root);
 		return 0;
 	}
 
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..28be762ac1c6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path)
 	struct path old_root;
 
 	path_get(path);
-	spin_lock(&fs->lock);
-	write_seqcount_begin(&fs->seq);
+	write_seqlock(&fs->seq);
 	old_root = fs->root;
 	fs->root = *path;
-	write_seqcount_end(&fs->seq);
-	spin_unlock(&fs->lock);
+	write_sequnlock(&fs->seq);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
@@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
 	struct path old_pwd;
 
 	path_get(path);
-	spin_lock(&fs->lock);
-	write_seqcount_begin(&fs->seq);
+	write_seqlock(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
-	write_seqcount_end(&fs->seq);
-	spin_unlock(&fs->lock);
+	write_sequnlock(&fs->seq);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
@@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
 		fs = p->fs;
 		if (fs) {
 			int hits = 0;
-			spin_lock(&fs->lock);
-			write_seqcount_begin(&fs->seq);
+			write_seqlock(&fs->seq);
 			hits += replace_path(&fs->root, old_root, new_root);
 			hits += replace_path(&fs->pwd, old_root, new_root);
-			write_seqcount_end(&fs->seq);
 			while (hits--) {
 				count++;
 				path_get(new_root);
 			}
-			spin_unlock(&fs->lock);
+			write_sequnlock(&fs->seq);
 		}
 		task_unlock(p);
 	}
@@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		spin_lock(&fs->lock);
+		read_seqlock_excl(&fs->seq);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		spin_unlock(&fs->lock);
+		read_sequnlock_excl(&fs->seq);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
@@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		spin_lock_init(&fs->lock);
-		seqcount_spinlock_init(&fs->seq, &fs->lock);
+		seqlock_init(&fs->seq);
 		fs->umask = old->umask;
 
-		spin_lock(&old->lock);
+		read_seqlock_excl(&old->seq);
 		fs->root = old->root;
 		path_get(&fs->root);
 		fs->pwd = old->pwd;
 		path_get(&fs->pwd);
-		spin_unlock(&old->lock);
+		read_sequnlock_excl(&old->seq);
 	}
 	return fs;
 }
@@ -140,10 +133,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	spin_lock(&fs->lock);
+	read_seqlock_excl(&fs->seq);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	spin_unlock(&fs->lock);
+	read_sequnlock_excl(&fs->seq);
 	task_unlock(current);
 
 	if (kill)
@@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
-	.seq		= SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
+	.seq		= __SEQLOCK_UNLOCKED(init_fs.seq),
 	.umask		= 0022,
 };
diff --git a/fs/namei.c b/fs/namei.c
index 4bb889fc980b..f2fcaf84e111 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1012,10 +1012,10 @@ static int set_root(struct nameidata *nd)
 		unsigned seq;
 
 		do {
-			seq = read_seqcount_begin(&fs->seq);
+			seq = read_seqbegin(&fs->seq);
 			nd->root = fs->root;
 			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
+		} while (read_seqretry(&fs->seq, seq));
 	} else {
 		get_fs_root(fs, &nd->root);
 		nd->state |= ND_ROOT_GRABBED;
@@ -2580,11 +2580,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			unsigned seq;
 
 			do {
-				seq = read_seqcount_begin(&fs->seq);
+				seq = read_seqbegin(&fs->seq);
 				nd->path = fs->pwd;
 				nd->inode = nd->path.dentry->d_inode;
 				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-			} while (read_seqcount_retry(&fs->seq, seq));
+			} while (read_seqretry(&fs->seq, seq));
 		} else {
 			get_fs_pwd(current->fs, &nd->path);
 			nd->inode = nd->path.dentry->d_inode;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 783b48dedb72..baf200ab5c77 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -8,8 +8,7 @@
 
 struct fs_struct {
 	int users;
-	spinlock_t lock;
-	seqcount_spinlock_t seq;
+	seqlock_t seq;
 	int umask;
 	int in_exec;
 	struct path root, pwd;
@@ -26,18 +25,18 @@ extern int unshare_fs_struct(void);
 
 static inline void get_fs_root(struct fs_struct *fs, struct path *root)
 {
-	spin_lock(&fs->lock);
+	read_seqlock_excl(&fs->seq);
 	*root = fs->root;
 	path_get(root);
-	spin_unlock(&fs->lock);
+	read_sequnlock_excl(&fs->seq);
 }
 
 static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
 {
-	spin_lock(&fs->lock);
+	read_seqlock_excl(&fs->seq);
 	*pwd = fs->pwd;
 	path_get(pwd);
-	spin_unlock(&fs->lock);
+	read_sequnlock_excl(&fs->seq);
 }
 
 extern bool current_chrooted(void);
diff --git a/kernel/fork.c b/kernel/fork.c
index 1ee8eb11f38b..6318a25a16ba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1542,14 +1542,14 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	struct fs_struct *fs = current->fs;
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
-		spin_lock(&fs->lock);
+		read_seqlock_excl(&fs->seq);
 		/* "users" and "in_exec" locked for check_unsafe_exec() */
 		if (fs->in_exec) {
-			spin_unlock(&fs->lock);
+			read_sequnlock_excl(&fs->seq);
 			return -EAGAIN;
 		}
 		fs->users++;
-		spin_unlock(&fs->lock);
+		read_sequnlock_excl(&fs->seq);
 		return 0;
 	}
 	tsk->fs = copy_fs_struct(fs);
@@ -3149,13 +3149,13 @@ int ksys_unshare(unsigned long unshare_flags)
 
 		if (new_fs) {
 			fs = current->fs;
-			spin_lock(&fs->lock);
+			read_seqlock_excl(&fs->seq);
 			current->fs = new_fs;
 			if (--fs->users)
 				new_fs = NULL;
 			else
 				new_fs = fs;
-			spin_unlock(&fs->lock);
+			read_sequnlock_excl(&fs->seq);
 		}
 
 		if (new_fd)
-- 
cgit v1.2.3