Merge tag 'kthread-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks

Pull kthread updates from Frederic Weisbecker:

"The kthread code provides an infrastructure which manages the preferred affinity of unbound kthreads (node or custom cpumask) against housekeeping (CPU isolation) constraints and CPU hotplug events.

One crucial missing piece is the handling of cpuset: when an isolated partition is created, deleted, or its CPUs updated, all the unbound kthreads in the top cpuset become indifferently affine to _all_ the non-isolated CPUs, possibly breaking their preferred affinity along the way.

Solve this by moving the kthreads affinity update from cpuset to the consolidated kthread affinity code, so that preferred affinities are honoured and applied against the updated cpuset isolated partitions.

The dispatch of the new isolated cpumasks to timers, workqueues and kthreads is performed by housekeeping, as per Tejun's nice suggestion.

As a welcome side effect, HK_TYPE_DOMAIN then integrates both the set from boot-defined domain isolation (through isolcpus=) and cpuset isolated partitions. Housekeeping cpumasks are now modifiable with a specific RCU-based synchronization.
A big step toward making nohz_full= also mutable through cpuset in the future"

* tag 'kthread-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks: (33 commits)
  doc: Add housekeeping documentation
  kthread: Document kthread_affine_preferred()
  kthread: Comment on the purpose and placement of kthread_affine_node() call
  kthread: Honour kthreads preferred affinity after cpuset changes
  sched/arm64: Move fallback task cpumask to HK_TYPE_DOMAIN
  sched: Switch the fallback task allowed cpumask to HK_TYPE_DOMAIN
  kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management
  kthread: Include kthreadd to the managed affinity list
  kthread: Include unbound kthreads in the managed affinity list
  kthread: Refine naming of affinity related fields
  PCI: Remove superfluous HK_TYPE_WQ check
  sched/isolation: Remove HK_TYPE_TICK test from cpu_is_isolated()
  cpuset: Remove cpuset_cpu_is_isolated()
  timers/migration: Remove superfluous cpuset isolation test
  cpuset: Propagate cpuset isolation update to timers through housekeeping
  cpuset: Propagate cpuset isolation update to workqueue through housekeeping
  PCI: Flush PCI probe workqueue on cpuset isolated partition change
  sched/isolation: Flush vmstat workqueues on cpuset isolated partition change
  sched/isolation: Flush memcg workqueues on cpuset isolated partition change
  cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-02-09 19:57:30 -0800
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-02-09 19:57:30 -0800
commit: d16738a4e79e55b2c3c9ff4fb7b74a4a24723515 (patch)
tree: 694b05e5b5f00ad2e70f243f84ad921b79cd8dc9 /mm
parent: 0506158ac7363a70f0deb49f71d26ccb57e55990 (diff)
parent: fa39ec4f89f2637ed1cdbcde3656825951787668 (diff)
2 files changed, 38 insertions, 8 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 86f43b7e5f71..783b3b008fef 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -96,6 +96,8 @@ static bool cgroup_memory_nokmem __ro_after_init;
 /* BPF memory accounting disabled? */
 static bool cgroup_memory_nobpf __ro_after_init;
 
+static struct workqueue_struct *memcg_wq __ro_after_init;
+
 static struct kmem_cache *memcg_cachep;
 static struct kmem_cache *memcg_pn_cachep;
 
@@ -2003,6 +2005,19 @@ static bool is_memcg_drain_needed(struct memcg_stock_pcp *stock,
 	return flush;
 }
 
+static void schedule_drain_work(int cpu, struct work_struct *work)
+{
+	/*
+	 * Protect housekeeping cpumask read and work enqueue together
+	 * in the same RCU critical section so that later cpuset isolated
+	 * partition update only needs to wait for an RCU GP and flush the
+	 * pending work on newly isolated CPUs.
+	 */
+	guard(rcu)();
+	if (!cpu_is_isolated(cpu))
+		queue_work_on(cpu, memcg_wq, work);
+}
+
 /*
  * Drains all per-CPU charge caches for given root_memcg resp. subtree
  * of the hierarchy under it.
@@ -2032,8 +2047,8 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
 				      &memcg_st->flags)) {
 			if (cpu == curcpu)
 				drain_local_memcg_stock(&memcg_st->work);
-			else if (!cpu_is_isolated(cpu))
-				schedule_work_on(cpu, &memcg_st->work);
+			else
+				schedule_drain_work(cpu, &memcg_st->work);
 		}
 
 		if (!test_bit(FLUSHING_CACHED_CHARGE, &obj_st->flags) &&
@@ -2042,8 +2057,8 @@ void drain_all_stock(struct mem_cgroup *root_memcg)
 				      &obj_st->flags)) {
 			if (cpu == curcpu)
 				drain_local_obj_stock(&obj_st->work);
-			else if (!cpu_is_isolated(cpu))
-				schedule_work_on(cpu, &obj_st->work);
+			else
+				schedule_drain_work(cpu, &obj_st->work);
 		}
 	}
 	migrate_enable();
@@ -5112,6 +5127,11 @@ void mem_cgroup_sk_uncharge(const struct sock *sk, unsigned int nr_pages)
 	refill_stock(memcg, nr_pages);
 }
 
+void mem_cgroup_flush_workqueue(void)
+{
+	flush_workqueue(memcg_wq);
+}
+
 static int __init cgroup_memory(char *s)
 {
 	char *token;
@@ -5154,6 +5174,9 @@ int __init mem_cgroup_init(void)
 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
 				  memcg_hotplug_cpu_dead);
 
+	memcg_wq = alloc_workqueue("memcg", WQ_PERCPU, 0);
+	WARN_ON(!memcg_wq);
+
 	for_each_possible_cpu(cpu) {
 		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
 			  drain_local_memcg_stock);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 65de88cdf40e..d6e814c82952 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -2124,6 +2124,11 @@ static void vmstat_shepherd(struct work_struct *w);
 
 static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
 
+void vmstat_flush_workqueue(void)
+{
+	flush_workqueue(mm_percpu_wq);
+}
+
 static void vmstat_shepherd(struct work_struct *w)
 {
 	int cpu;
@@ -2144,11 +2149,13 @@ static void vmstat_shepherd(struct work_struct *w)
 		 * infrastructure ever noticing. Skip regular flushing from vmstat_shepherd
 		 * for all isolated CPUs to avoid interference with the isolated workload.
 		 */
-		if (cpu_is_isolated(cpu))
-			continue;
+		scoped_guard(rcu) {
+			if (cpu_is_isolated(cpu))
+				continue;
 
-		if (!delayed_work_pending(dw) && need_update(cpu))
-			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+			if (!delayed_work_pending(dw) && need_update(cpu))
+				queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+		}
 
 		cond_resched();
 	}
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-09 19:57:30 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-09 19:57:30 -0800
commit	d16738a4e79e55b2c3c9ff4fb7b74a4a24723515 (patch)
tree	694b05e5b5f00ad2e70f243f84ad921b79cd8dc9 /mm
parent	0506158ac7363a70f0deb49f71d26ccb57e55990 (diff)
parent	fa39ec4f89f2637ed1cdbcde3656825951787668 (diff)