summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig12
-rw-r--r--mm/damon/core.c41
-rw-r--r--mm/damon/sysfs-schemes.c10
-rw-r--r--mm/damon/sysfs.c9
-rw-r--r--mm/damon/vaddr.c2
-rw-r--r--mm/hugetlb.c147
-rw-r--r--mm/init-mm.c5
-rw-r--r--mm/internal.h8
-rw-r--r--mm/kasan/common.c53
-rw-r--r--mm/kasan/hw_tags.c2
-rw-r--r--mm/kasan/shadow.c4
-rw-r--r--mm/kfence/core.c40
-rw-r--r--mm/kmsan/shadow.c2
-rw-r--r--mm/ksm.c2
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/memfd.c4
-rw-r--r--mm/memfd_luo.c10
-rw-r--r--mm/memory-failure.c170
-rw-r--r--mm/memory.c11
-rw-r--r--mm/memremap.c37
-rw-r--r--mm/migrate.c12
-rw-r--r--mm/mm_init.c12
-rw-r--r--mm/mmu_gather.c33
-rw-r--r--mm/numa_memblks.c2
-rw-r--r--mm/page_alloc.c91
-rw-r--r--mm/page_owner.c2
-rw-r--r--mm/rmap.c45
-rw-r--r--mm/shmem.c54
-rw-r--r--mm/slub.c14
-rw-r--r--mm/swap.h2
-rw-r--r--mm/swap_state.c3
-rw-r--r--mm/vma.c122
-rw-r--r--mm/vma.h3
-rw-r--r--mm/vmalloc.c11
-rw-r--r--mm/vmscan.c13
-rw-r--r--mm/zswap.c2
36 files changed, 677 insertions, 317 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index bd0ea5454af8..a992f2203eb9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1220,10 +1220,14 @@ config ZONE_DEVICE
Device memory hotplug support allows for establishing pmem,
or other device driver discovered memory regions, in the
memmap. This allows pfn_to_page() lookups of otherwise
- "device-physical" addresses which is needed for using a DAX
- mapping in an O_DIRECT operation, among other things.
-
- If FS_DAX is enabled, then say Y.
+ "device-physical" addresses which is needed for DAX, PCI_P2PDMA, and
+ DEVICE_PRIVATE features among others.
+
+ Enabling this option will reduce the entropy of x86 KASLR memory
+ regions. For example - on a 46 bit system, the entropy goes down
+ from 16 bits to 15 bits. The actual reduction in entropy depends
+ on the physical address bits, on processor features, kernel config
+ (5 level page table) and physical memory present on the system.
#
# Helpers to mirror range of the CPU page tables of a process into device page
diff --git a/mm/damon/core.c b/mm/damon/core.c
index f9fc0375890a..84f80a20f233 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1431,6 +1431,35 @@ bool damon_is_running(struct damon_ctx *ctx)
return running;
}
+/*
+ * damon_call_handle_inactive_ctx() - handle DAMON call request that added to
+ * an inactive context.
+ * @ctx: The inactive DAMON context.
+ * @control: Control variable of the call request.
+ *
+ * This function is called in a case that @control is added to @ctx but @ctx is
+ * not running (inactive). See if @ctx handled @control or not, and cleanup
+ * @control if it was not handled.
+ *
+ * Returns 0 if @control was handled by @ctx, negative error code otherwise.
+ */
+static int damon_call_handle_inactive_ctx(
+ struct damon_ctx *ctx, struct damon_call_control *control)
+{
+ struct damon_call_control *c;
+
+ mutex_lock(&ctx->call_controls_lock);
+ list_for_each_entry(c, &ctx->call_controls, list) {
+ if (c == control) {
+ list_del(&control->list);
+ mutex_unlock(&ctx->call_controls_lock);
+ return -EINVAL;
+ }
+ }
+ mutex_unlock(&ctx->call_controls_lock);
+ return 0;
+}
+
/**
* damon_call() - Invoke a given function on DAMON worker thread (kdamond).
* @ctx: DAMON context to call the function for.
@@ -1461,7 +1490,7 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control)
list_add_tail(&control->list, &ctx->call_controls);
mutex_unlock(&ctx->call_controls_lock);
if (!damon_is_running(ctx))
- return -EINVAL;
+ return damon_call_handle_inactive_ctx(ctx, control);
if (control->repeat)
return 0;
wait_for_completion(&control->completion);
@@ -2051,13 +2080,15 @@ static unsigned long damos_get_node_memcg_used_bp(
rcu_read_lock();
memcg = mem_cgroup_from_id(goal->memcg_id);
- rcu_read_unlock();
- if (!memcg) {
+ if (!memcg || !mem_cgroup_tryget(memcg)) {
+ rcu_read_unlock();
if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
return 0;
else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */
return 10000;
}
+ rcu_read_unlock();
+
mem_cgroup_flush_stats(memcg);
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid));
used_pages = lruvec_page_state(lruvec, NR_ACTIVE_ANON);
@@ -2065,6 +2096,8 @@ static unsigned long damos_get_node_memcg_used_bp(
used_pages += lruvec_page_state(lruvec, NR_ACTIVE_FILE);
used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE);
+ mem_cgroup_put(memcg);
+
si_meminfo_node(&i, goal->nid);
if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP)
numerator = used_pages;
@@ -2751,13 +2784,13 @@ done:
if (ctx->ops.cleanup)
ctx->ops.cleanup(ctx);
kfree(ctx->regions_score_histogram);
+ kdamond_call(ctx, true);
pr_debug("kdamond (%d) finishes\n", current->pid);
mutex_lock(&ctx->kdamond_lock);
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
- kdamond_call(ctx, true);
damos_walk_cancel(ctx);
mutex_lock(&damon_lock);
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 30d20f5b3192..3a699dcd5a7f 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -2152,13 +2152,13 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme)
return err;
err = damos_sysfs_set_dests(scheme);
if (err)
- goto put_access_pattern_out;
+ goto rmdir_put_access_pattern_out;
err = damon_sysfs_scheme_set_quotas(scheme);
if (err)
goto put_dests_out;
err = damon_sysfs_scheme_set_watermarks(scheme);
if (err)
- goto put_quotas_access_pattern_out;
+ goto rmdir_put_quotas_access_pattern_out;
err = damos_sysfs_set_filter_dirs(scheme);
if (err)
goto put_watermarks_quotas_access_pattern_out;
@@ -2183,13 +2183,15 @@ put_filters_watermarks_quotas_access_pattern_out:
put_watermarks_quotas_access_pattern_out:
kobject_put(&scheme->watermarks->kobj);
scheme->watermarks = NULL;
-put_quotas_access_pattern_out:
+rmdir_put_quotas_access_pattern_out:
+ damon_sysfs_quotas_rm_dirs(scheme->quotas);
kobject_put(&scheme->quotas->kobj);
scheme->quotas = NULL;
put_dests_out:
kobject_put(&scheme->dests->kobj);
scheme->dests = NULL;
-put_access_pattern_out:
+rmdir_put_access_pattern_out:
+ damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern);
kobject_put(&scheme->access_pattern->kobj);
scheme->access_pattern = NULL;
return err;
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index e2bd2d7becdd..95fd9375a7d8 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -792,7 +792,7 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000);
if (!nr_regions_range) {
err = -ENOMEM;
- goto put_intervals_out;
+ goto rmdir_put_intervals_out;
}
err = kobject_init_and_add(&nr_regions_range->kobj,
@@ -806,6 +806,8 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs)
put_nr_regions_intervals_out:
kobject_put(&nr_regions_range->kobj);
attrs->nr_regions_range = NULL;
+rmdir_put_intervals_out:
+ damon_sysfs_intervals_rm_dirs(intervals);
put_intervals_out:
kobject_put(&intervals->kobj);
attrs->intervals = NULL;
@@ -948,7 +950,7 @@ static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context)
err = damon_sysfs_context_set_targets(context);
if (err)
- goto put_attrs_out;
+ goto rmdir_put_attrs_out;
err = damon_sysfs_context_set_schemes(context);
if (err)
@@ -958,7 +960,8 @@ static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context)
put_targets_attrs_out:
kobject_put(&context->targets->kobj);
context->targets = NULL;
-put_attrs_out:
+rmdir_put_attrs_out:
+ damon_sysfs_attrs_rm_dirs(context->attrs);
kobject_put(&context->attrs->kobj);
context->attrs = NULL;
return err;
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 2750c88e7225..23ed738a0bd6 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -743,7 +743,7 @@ huge_out:
if (!folio)
continue;
if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL))
- return 0;
+ continue;
damos_va_migrate_dests_add(folio, walk->vma, addr, dests,
migration_lists);
nr = folio_nr_pages(folio);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51273baec9e5..a1832da0f623 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4286,6 +4286,11 @@ static int __init hugepages_setup(char *s)
unsigned long tmp;
char *p = s;
+ if (!hugepages_supported()) {
+ pr_warn("HugeTLB: hugepages unsupported, ignoring hugepages=%s cmdline\n", s);
+ return 0;
+ }
+
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
@@ -4366,6 +4371,11 @@ static int __init hugepagesz_setup(char *s)
unsigned long size;
struct hstate *h;
+ if (!hugepages_supported()) {
+ pr_warn("HugeTLB: hugepages unsupported, ignoring hugepagesz=%s cmdline\n", s);
+ return 0;
+ }
+
parsed_valid_hugepagesz = false;
size = (unsigned long)memparse(s, NULL);
@@ -4414,6 +4424,12 @@ static int __init default_hugepagesz_setup(char *s)
unsigned long size;
int i;
+ if (!hugepages_supported()) {
+ pr_warn("HugeTLB: hugepages unsupported, ignoring default_hugepagesz=%s cmdline\n",
+ s);
+ return 0;
+ }
+
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
@@ -5096,7 +5112,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
- bool shared_pmd = false;
+ struct mmu_gather tlb;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
@@ -5106,6 +5122,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
* range.
*/
flush_cache_range(vma, range.start, range.end);
+ tlb_gather_mmu_vma(&tlb, vma);
mmu_notifier_invalidate_range_start(&range);
last_addr_mask = hugetlb_mask_last_page(h);
@@ -5122,8 +5139,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
continue;
- if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
- shared_pmd = true;
+ if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
continue;
@@ -5134,15 +5150,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
break;
move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
+ tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr);
}
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, old_end - len, old_end);
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
+
mmu_notifier_invalidate_range_end(&range);
i_mmap_unlock_write(mapping);
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
return len + old_addr - old_end;
}
@@ -5161,7 +5178,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
bool adjust_reservation;
unsigned long last_addr_mask;
- bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -5184,10 +5200,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(tlb, vma, address, ptep)) {
spin_unlock(ptl);
- tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
- force_flush = true;
address |= last_addr_mask;
continue;
}
@@ -5303,21 +5317,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
tlb_end_vma(tlb, vma);
- /*
- * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
- * could defer the flush until now, since by holding i_mmap_rwsem we
- * guaranteed that the last reference would not be dropped. But we must
- * do the flushing before we return, as otherwise i_mmap_rwsem will be
- * dropped and the last reference to the shared PMDs page might be
- * dropped as well.
- *
- * In theory we could defer the freeing of the PMD pages as well, but
- * huge_pmd_unshare() relies on the exact page_count for the PMD page to
- * detect sharing, so we cannot defer the release of the page either.
- * Instead, do flush now.
- */
- if (force_flush)
- tlb_flush_mmu_tlbonly(tlb);
+ huge_pmd_unshare_flush(tlb, vma);
}
void __hugetlb_zap_begin(struct vm_area_struct *vma,
@@ -6416,11 +6416,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
long pages = 0, psize = huge_page_size(h);
- bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ struct mmu_gather tlb;
/*
* In the case of shared PMDs, the area to flush could be beyond
@@ -6433,6 +6433,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, range.start, range.end);
+ tlb_gather_mmu_vma(&tlb, vma);
mmu_notifier_invalidate_range_start(&range);
hugetlb_vma_lock_write(vma);
@@ -6459,7 +6460,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
}
}
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, address, ptep)) {
+ if (huge_pmd_unshare(&tlb, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6468,7 +6469,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
pages++;
spin_unlock(ptl);
- shared_pmd = true;
address |= last_addr_mask;
continue;
}
@@ -6529,23 +6529,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
+ tlb_remove_huge_tlb_entry(h, &tlb, ptep, address);
}
next:
spin_unlock(ptl);
cond_resched();
}
- /*
- * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
- * may have cleared our pud entry and done put_page on the page table:
- * once we release i_mmap_rwsem, another task can do the final put_page
- * and that page table be reused and filled with junk. If we actually
- * did unshare a page of pmds, flush the range corresponding to the pud.
- */
- if (shared_pmd)
- flush_hugetlb_tlb_range(vma, range.start, range.end);
- else
- flush_hugetlb_tlb_range(vma, start, end);
+
+ tlb_flush_mmu_tlbonly(&tlb);
+ huge_pmd_unshare_flush(&tlb, vma);
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
* downgrading page table protection not changing it to point to a new
@@ -6556,6 +6549,7 @@ next:
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
return pages > 0 ? (pages << h->order) : pages;
}
@@ -6912,18 +6906,27 @@ out:
return pte;
}
-/*
- * unmap huge page backed by shared pte.
+/**
+ * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ * @addr: the address we are trying to unshare.
+ * @ptep: pointer into the (pmd) page table.
*
- * Called with page table lock held.
+ * Called with the page table lock held, the i_mmap_rwsem held in write mode
+ * and the hugetlb vma lock held in write mode.
*
- * returns: 1 successfully unmapped a shared pte page
- * 0 the underlying pte page is not shared, or it is the last user
+ * Note: The caller must call huge_pmd_unshare_flush() before dropping the
+ * i_mmap_rwsem.
+ *
+ * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it
+ * was not a shared PMD table.
*/
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
+ struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
@@ -6935,18 +6938,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
pud_clear(pud);
- /*
- * Once our caller drops the rmap lock, some other process might be
- * using this page table as a normal, non-hugetlb page table.
- * Wait for pending gup_fast() in other threads to finish before letting
- * that happen.
- */
- tlb_remove_table_sync_one();
- ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
+
+ tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr);
+
mm_dec_nr_pmds(mm);
return 1;
}
+/*
+ * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls
+ * @tlb: the current mmu_gather.
+ * @vma: the vma covering the pmd table.
+ *
+ * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table
+ * unsharing with concurrent page table walkers.
+ *
+ * This function must be called after a sequence of huge_pmd_unshare()
+ * calls while still holding the i_mmap_rwsem.
+ */
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ /*
+ * We must synchronize page table unsharing such that nobody will
+ * try reusing a previously-shared page table while it might still
+ * be in use by previous sharers (TLB, GUP_fast).
+ */
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
+ tlb_flush_unshared_tables(tlb);
+}
+
#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -6955,12 +6976,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
+void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+}
+
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
@@ -7227,6 +7252,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
+ struct mmu_gather tlb;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;
@@ -7238,6 +7264,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
return;
flush_cache_range(vma, start, end);
+ tlb_gather_mmu_vma(&tlb, vma);
+
/*
* No need to call adjust_range_if_pmd_sharing_possible(), because
* we have already done the PUD_SIZE alignment.
@@ -7256,10 +7284,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- huge_pmd_unshare(mm, vma, address, ptep);
+ huge_pmd_unshare(&tlb, vma, address, ptep);
spin_unlock(ptl);
}
- flush_hugetlb_tlb_range(vma, start, end);
+ huge_pmd_unshare_flush(&tlb, vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
@@ -7269,6 +7297,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb);
}
/*
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4600e7605cab..c5556bb9d5f0 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -44,7 +44,10 @@ struct mm_struct init_mm = {
.mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq),
#endif
.user_ns = &init_user_ns,
- .cpu_bitmap = CPU_BITS_NONE,
+#ifdef CONFIG_SCHED_MM_CID
+ .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock),
+#endif
+ .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index e430da900430..f35dbcf99a86 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -538,16 +538,8 @@ extern unsigned long highest_memmap_pfn;
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
-#ifdef CONFIG_NUMA
int user_proactive_reclaim(char *buf,
struct mem_cgroup *memcg, pg_data_t *pgdat);
-#else
-static inline int user_proactive_reclaim(char *buf,
- struct mem_cgroup *memcg, pg_data_t *pgdat)
-{
- return 0;
-}
-#endif
/*
* in mm/rmap.c:
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 1d27f1bd260b..b7d05c2a6d93 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -28,6 +28,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
+#include <linux/vmalloc.h>
#include "kasan.h"
#include "../slab.h"
@@ -575,3 +576,55 @@ bool __kasan_check_byte(const void *address, unsigned long ip)
}
return true;
}
+
+#ifdef CONFIG_KASAN_VMALLOC
+void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+ kasan_vmalloc_flags_t flags)
+{
+ unsigned long size;
+ void *addr;
+ int area;
+ u8 tag;
+
+ /*
+ * If KASAN_VMALLOC_KEEP_TAG was set at this point, all vms[] pointers
+ * would be unpoisoned with the KASAN_TAG_KERNEL which would disable
+ * KASAN checks down the line.
+ */
+ if (WARN_ON_ONCE(flags & KASAN_VMALLOC_KEEP_TAG))
+ return;
+
+ size = vms[0]->size;
+ addr = vms[0]->addr;
+ vms[0]->addr = __kasan_unpoison_vmalloc(addr, size, flags);
+ tag = get_tag(vms[0]->addr);
+
+ for (area = 1 ; area < nr_vms ; area++) {
+ size = vms[area]->size;
+ addr = set_tag(vms[area]->addr, tag);
+ vms[area]->addr =
+ __kasan_unpoison_vmalloc(addr, size, flags | KASAN_VMALLOC_KEEP_TAG);
+ }
+}
+
+void __kasan_vrealloc(const void *addr, unsigned long old_size,
+ unsigned long new_size)
+{
+ if (new_size < old_size) {
+ kasan_poison_last_granule(addr, new_size);
+
+ new_size = round_up(new_size, KASAN_GRANULE_SIZE);
+ old_size = round_up(old_size, KASAN_GRANULE_SIZE);
+ if (new_size < old_size)
+ __kasan_poison_vmalloc(addr + new_size,
+ old_size - new_size);
+ } else if (new_size > old_size) {
+ old_size = round_down(old_size, KASAN_GRANULE_SIZE);
+ __kasan_unpoison_vmalloc(addr + old_size,
+ new_size - old_size,
+ KASAN_VMALLOC_PROT_NORMAL |
+ KASAN_VMALLOC_VM_ALLOC |
+ KASAN_VMALLOC_KEEP_TAG);
+ }
+}
+#endif
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 1c373cc4b3fa..cbef5e450954 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -361,7 +361,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
return (void *)start;
}
- tag = kasan_random_tag();
+ tag = (flags & KASAN_VMALLOC_KEEP_TAG) ? get_tag(start) : kasan_random_tag();
start = set_tag(start, tag);
/* Unpoison and initialize memory up to size. */
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 29a751a8a08d..32fbdf759ea2 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -631,7 +631,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
!(flags & KASAN_VMALLOC_PROT_NORMAL))
return (void *)start;
- start = set_tag(start, kasan_random_tag());
+ if (unlikely(!(flags & KASAN_VMALLOC_KEEP_TAG)))
+ start = set_tag(start, kasan_random_tag());
+
kasan_unpoison(start, size, false);
return (void *)start;
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 577a1699c553..4f79ec720752 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -596,7 +596,7 @@ static void rcu_guarded_free(struct rcu_head *h)
static unsigned long kfence_init_pool(void)
{
unsigned long addr, start_pfn;
- int i;
+ int i, rand;
if (!arch_kfence_init_pool())
return (unsigned long)__kfence_pool;
@@ -647,13 +647,27 @@ static unsigned long kfence_init_pool(void)
INIT_LIST_HEAD(&meta->list);
raw_spin_lock_init(&meta->lock);
meta->state = KFENCE_OBJECT_UNUSED;
- meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
- list_add_tail(&meta->list, &kfence_freelist);
+ /* Use addr to randomize the freelist. */
+ meta->addr = i;
/* Protect the right redzone. */
- if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
+ if (unlikely(!kfence_protect(addr + 2 * i * PAGE_SIZE + PAGE_SIZE)))
goto reset_slab;
+ }
+
+ for (i = CONFIG_KFENCE_NUM_OBJECTS; i > 0; i--) {
+ rand = get_random_u32_below(i);
+ swap(kfence_metadata_init[i - 1].addr, kfence_metadata_init[rand].addr);
+ }
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ struct kfence_metadata *meta_1 = &kfence_metadata_init[i];
+ struct kfence_metadata *meta_2 = &kfence_metadata_init[meta_1->addr];
+
+ list_add_tail(&meta_2->list, &kfence_freelist);
+ }
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ kfence_metadata_init[i].addr = addr;
addr += 2 * PAGE_SIZE;
}
@@ -666,6 +680,7 @@ static unsigned long kfence_init_pool(void)
return 0;
reset_slab:
+ addr += 2 * i * PAGE_SIZE;
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
struct page *page;
@@ -823,6 +838,9 @@ static struct notifier_block kfence_check_canary_notifier = {
static struct delayed_work kfence_timer;
#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* Wait queue to wake up allocation-gate timer task. */
+static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
+
static int kfence_reboot_callback(struct notifier_block *nb,
unsigned long action, void *data)
{
@@ -832,7 +850,12 @@ static int kfence_reboot_callback(struct notifier_block *nb,
*/
WRITE_ONCE(kfence_enabled, false);
/* Cancel any pending timer work */
- cancel_delayed_work_sync(&kfence_timer);
+ cancel_delayed_work(&kfence_timer);
+ /*
+ * Wake up any blocked toggle_allocation_gate() so it can complete
+ * early while the system is still able to handle IPIs.
+ */
+ wake_up(&allocation_wait);
return NOTIFY_OK;
}
@@ -842,9 +865,6 @@ static struct notifier_block kfence_reboot_notifier = {
.priority = INT_MAX, /* Run early to stop timers ASAP */
};
-/* Wait queue to wake up allocation-gate timer task. */
-static DECLARE_WAIT_QUEUE_HEAD(allocation_wait);
-
static void wake_up_kfence_timer(struct irq_work *work)
{
wake_up(&allocation_wait);
@@ -873,7 +893,9 @@ static void toggle_allocation_gate(struct work_struct *work)
/* Enable static key, and await allocation to happen. */
static_branch_enable(&kfence_allocation_key);
- wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0);
+ wait_event_idle(allocation_wait,
+ atomic_read(&kfence_allocation_gate) > 0 ||
+ !READ_ONCE(kfence_enabled));
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index e7f554a31bb4..9e1c5f2b7a41 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -207,7 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
if (!kmsan_enabled || kmsan_in_runtime())
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(page_address(page), page_size(page),
+ kmsan_internal_poison_memory(page_address(page), PAGE_SIZE << order,
GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
diff --git a/mm/ksm.c b/mm/ksm.c
index cfc182255c7b..2d89a7c8b4eb 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -650,7 +650,7 @@ static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long en
}
}
out_unlock:
- pte_unmap_unlock(ptep, ptl);
+ pte_unmap_unlock(start_ptep, ptl);
return found;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index be810c1fbfc3..86f43b7e5f71 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5638,6 +5638,6 @@ void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg)
memcg = root_mem_cgroup;
pr_warn("Memory cgroup min protection %lukB -- low protection %lukB",
- K(atomic_long_read(&memcg->memory.children_min_usage)*PAGE_SIZE),
- K(atomic_long_read(&memcg->memory.children_low_usage)*PAGE_SIZE));
+ K(atomic_long_read(&memcg->memory.children_min_usage)),
+ K(atomic_long_read(&memcg->memory.children_low_usage)));
}
diff --git a/mm/memfd.c b/mm/memfd.c
index ab5312aff14b..f032c6052926 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -456,7 +456,7 @@ err_name:
return ERR_PTR(error);
}
-static struct file *alloc_file(const char *name, unsigned int flags)
+struct file *memfd_alloc_file(const char *name, unsigned int flags)
{
unsigned int *file_seals;
struct file *file;
@@ -520,5 +520,5 @@ SYSCALL_DEFINE2(memfd_create,
return PTR_ERR(name);
fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
- return FD_ADD(fd_flags, alloc_file(name, flags));
+ return FD_ADD(fd_flags, memfd_alloc_file(name, flags));
}
diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c
index 4f6ba63b4310..a34fccc23b6a 100644
--- a/mm/memfd_luo.c
+++ b/mm/memfd_luo.c
@@ -78,6 +78,7 @@
#include <linux/liveupdate.h>
#include <linux/shmem_fs.h>
#include <linux/vmalloc.h>
+#include <linux/memfd.h>
#include "internal.h"
static int memfd_luo_preserve_folios(struct file *file,
@@ -443,11 +444,11 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
if (!ser)
return -EINVAL;
- file = shmem_file_setup("", 0, VM_NORESERVE);
-
+ file = memfd_alloc_file("", 0);
if (IS_ERR(file)) {
pr_err("failed to setup file: %pe\n", file);
- return PTR_ERR(file);
+ err = PTR_ERR(file);
+ goto free_ser;
}
vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
@@ -473,7 +474,8 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
put_file:
fput(file);
-
+free_ser:
+ kho_restore_free(ser);
return err;
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index fbc5a01260c8..9fd8355176eb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -692,6 +692,8 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
unsigned long poisoned_pfn, struct to_kill *tk)
{
unsigned long pfn = 0;
+ unsigned long hwpoison_vaddr;
+ unsigned long mask;
if (pte_present(pte)) {
pfn = pte_pfn(pte);
@@ -702,10 +704,12 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift,
pfn = softleaf_to_pfn(entry);
}
- if (!pfn || pfn != poisoned_pfn)
+ mask = ~((1UL << (shift - PAGE_SHIFT)) - 1);
+ if (!pfn || pfn != (poisoned_pfn & mask))
return 0;
- set_to_kill(tk, addr, shift);
+ hwpoison_vaddr = addr + ((poisoned_pfn - pfn) << PAGE_SHIFT);
+ set_to_kill(tk, hwpoison_vaddr, shift);
return 1;
}
@@ -1883,12 +1887,22 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag)
return count;
}
-static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
+#define MF_HUGETLB_FREED 0 /* freed hugepage */
+#define MF_HUGETLB_IN_USED 1 /* in-use hugepage */
+#define MF_HUGETLB_NON_HUGEPAGE 2 /* not a hugepage */
+#define MF_HUGETLB_FOLIO_PRE_POISONED 3 /* folio already poisoned */
+#define MF_HUGETLB_PAGE_PRE_POISONED 4 /* exact page already poisoned */
+#define MF_HUGETLB_RETRY 5 /* hugepage is busy, retry */
+/*
+ * Set hugetlb folio as hwpoisoned, update folio private raw hwpoison list
+ * to keep track of the poisoned pages.
+ */
+static int hugetlb_update_hwpoison(struct folio *folio, struct page *page)
{
struct llist_head *head;
struct raw_hwp_page *raw_hwp;
struct raw_hwp_page *p;
- int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0;
+ int ret = folio_test_set_hwpoison(folio) ? MF_HUGETLB_FOLIO_PRE_POISONED : 0;
/*
* Once the hwpoison hugepage has lost reliable raw error info,
@@ -1896,20 +1910,17 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page)
* so skip to add additional raw error info.
*/
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
- return -EHWPOISON;
+ return MF_HUGETLB_FOLIO_PRE_POISONED;
head = raw_hwp_list_head(folio);
llist_for_each_entry(p, head->first, node) {
if (p->page == page)
- return -EHWPOISON;
+ return MF_HUGETLB_PAGE_PRE_POISONED;
}
raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC);
if (raw_hwp) {
raw_hwp->page = page;
llist_add(&raw_hwp->node, head);
- /* the first error event will be counted in action_result(). */
- if (ret)
- num_poisoned_pages_inc(page_to_pfn(page));
} else {
/*
* Failed to save raw error info. We no longer trace all
@@ -1957,42 +1968,39 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio)
/*
* Called from hugetlb code with hugetlb_lock held.
- *
- * Return values:
- * 0 - free hugepage
- * 1 - in-use hugepage
- * 2 - not a hugepage
- * -EBUSY - the hugepage is busy (try to retry)
- * -EHWPOISON - the hugepage is already hwpoisoned
*/
int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
bool *migratable_cleared)
{
struct page *page = pfn_to_page(pfn);
struct folio *folio = page_folio(page);
- int ret = 2; /* fallback to normal page handling */
bool count_increased = false;
+ int ret, rc;
- if (!folio_test_hugetlb(folio))
+ if (!folio_test_hugetlb(folio)) {
+ ret = MF_HUGETLB_NON_HUGEPAGE;
goto out;
-
- if (flags & MF_COUNT_INCREASED) {
- ret = 1;
+ } else if (flags & MF_COUNT_INCREASED) {
+ ret = MF_HUGETLB_IN_USED;
count_increased = true;
} else if (folio_test_hugetlb_freed(folio)) {
- ret = 0;
+ ret = MF_HUGETLB_FREED;
} else if (folio_test_hugetlb_migratable(folio)) {
- ret = folio_try_get(folio);
- if (ret)
+ if (folio_try_get(folio)) {
+ ret = MF_HUGETLB_IN_USED;
count_increased = true;
+ } else {
+ ret = MF_HUGETLB_FREED;
+ }
} else {
- ret = -EBUSY;
+ ret = MF_HUGETLB_RETRY;
if (!(flags & MF_NO_RETRY))
goto out;
}
- if (folio_set_hugetlb_hwpoison(folio, page)) {
- ret = -EHWPOISON;
+ rc = hugetlb_update_hwpoison(folio, page);
+ if (rc >= MF_HUGETLB_FOLIO_PRE_POISONED) {
+ ret = rc;
goto out;
}
@@ -2017,10 +2025,16 @@ out:
* with basic operations like hugepage allocation/free/demotion.
* So some of prechecks for hwpoison (pinning, and testing/setting
* PageHWPoison) should be done in single hugetlb_lock range.
+ * Returns:
+ * 0 - not hugetlb, or recovered
+ * -EBUSY - not recovered
+ * -EOPNOTSUPP - hwpoison_filter'ed
+ * -EHWPOISON - folio or exact page already poisoned
+ * -EFAULT - kill_accessing_process finds current->mm null
*/
static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
- int res;
+ int res, rv;
struct page *p = pfn_to_page(pfn);
struct folio *folio;
unsigned long page_flags;
@@ -2029,22 +2043,29 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb
*hugetlb = 1;
retry:
res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared);
- if (res == 2) { /* fallback to normal page handling */
+ switch (res) {
+ case MF_HUGETLB_NON_HUGEPAGE: /* fallback to normal page handling */
*hugetlb = 0;
return 0;
- } else if (res == -EHWPOISON) {
- if (flags & MF_ACTION_REQUIRED) {
- folio = page_folio(p);
- res = kill_accessing_process(current, folio_pfn(folio), flags);
- }
- action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
- return res;
- } else if (res == -EBUSY) {
+ case MF_HUGETLB_RETRY:
if (!(flags & MF_NO_RETRY)) {
flags |= MF_NO_RETRY;
goto retry;
}
return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
+ case MF_HUGETLB_FOLIO_PRE_POISONED:
+ case MF_HUGETLB_PAGE_PRE_POISONED:
+ rv = -EHWPOISON;
+ if (flags & MF_ACTION_REQUIRED)
+ rv = kill_accessing_process(current, pfn, flags);
+ if (res == MF_HUGETLB_PAGE_PRE_POISONED)
+ action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED);
+ else
+ action_result(pfn, MF_MSG_HUGE, MF_FAILED);
+ return rv;
+ default:
+ WARN_ON((res != MF_HUGETLB_FREED) && (res != MF_HUGETLB_IN_USED));
+ break;
}
folio = page_folio(p);
@@ -2055,7 +2076,7 @@ retry:
if (migratable_cleared)
folio_set_hugetlb_migratable(folio);
folio_unlock(folio);
- if (res == 1)
+ if (res == MF_HUGETLB_IN_USED)
folio_put(folio);
return -EOPNOTSUPP;
}
@@ -2064,7 +2085,7 @@ retry:
* Handling free hugepage. The possible race with hugepage allocation
* or demotion can be prevented by PageHWPoison flag.
*/
- if (res == 0) {
+ if (res == MF_HUGETLB_FREED) {
folio_unlock(folio);
if (__page_handle_poison(p) > 0) {
page_ref_inc(p);
@@ -2161,6 +2182,9 @@ int register_pfn_address_space(struct pfn_address_space *pfn_space)
{
guard(mutex)(&pfn_space_lock);
+ if (!pfn_space->pfn_to_vma_pgoff)
+ return -EINVAL;
+
if (interval_tree_iter_first(&pfn_space_itree,
pfn_space->node.start,
pfn_space->node.last))
@@ -2183,10 +2207,10 @@ void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
}
EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
-static void add_to_kill_pfn(struct task_struct *tsk,
- struct vm_area_struct *vma,
- struct list_head *to_kill,
- unsigned long pfn)
+static void add_to_kill_pgoff(struct task_struct *tsk,
+ struct vm_area_struct *vma,
+ struct list_head *to_kill,
+ pgoff_t pgoff)
{
struct to_kill *tk;
@@ -2197,12 +2221,12 @@ static void add_to_kill_pfn(struct task_struct *tsk,
}
/* Check for pgoff not backed by struct page */
- tk->addr = vma_address(vma, pfn, 1);
+ tk->addr = vma_address(vma, pgoff, 1);
tk->size_shift = PAGE_SHIFT;
if (tk->addr == -EFAULT)
pr_info("Unable to find address %lx in %s\n",
- pfn, tsk->comm);
+ pgoff, tsk->comm);
get_task_struct(tsk);
tk->tsk = tsk;
@@ -2212,11 +2236,12 @@ static void add_to_kill_pfn(struct task_struct *tsk,
/*
* Collect processes when the error hit a PFN not backed by struct page.
*/
-static void collect_procs_pfn(struct address_space *mapping,
+static void collect_procs_pfn(struct pfn_address_space *pfn_space,
unsigned long pfn, struct list_head *to_kill)
{
struct vm_area_struct *vma;
struct task_struct *tsk;
+ struct address_space *mapping = pfn_space->mapping;
i_mmap_lock_read(mapping);
rcu_read_lock();
@@ -2226,9 +2251,12 @@ static void collect_procs_pfn(struct address_space *mapping,
t = task_early_kill(tsk, true);
if (!t)
continue;
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) {
- if (vma->vm_mm == t->mm)
- add_to_kill_pfn(t, vma, to_kill, pfn);
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) {
+ pgoff_t pgoff;
+
+ if (vma->vm_mm == t->mm &&
+ !pfn_space->pfn_to_vma_pgoff(vma, pfn, &pgoff))
+ add_to_kill_pgoff(t, vma, to_kill, pgoff);
}
}
rcu_read_unlock();
@@ -2264,7 +2292,7 @@ static int memory_failure_pfn(unsigned long pfn, int flags)
struct pfn_address_space *pfn_space =
container_of(node, struct pfn_address_space, node);
- collect_procs_pfn(pfn_space->mapping, pfn, &tokill);
+ collect_procs_pfn(pfn_space, pfn, &tokill);
mf_handled = true;
}
@@ -2383,31 +2411,29 @@ try_again:
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
- if (!(flags & MF_COUNT_INCREASED)) {
- res = get_hwpoison_page(p, flags);
- if (!res) {
- if (is_free_buddy_page(p)) {
- if (take_page_off_buddy(p)) {
- page_ref_inc(p);
- res = MF_RECOVERED;
- } else {
- /* We lost the race, try again */
- if (retry) {
- ClearPageHWPoison(p);
- retry = false;
- goto try_again;
- }
- res = MF_FAILED;
- }
- res = action_result(pfn, MF_MSG_BUDDY, res);
+ res = get_hwpoison_page(p, flags);
+ if (!res) {
+ if (is_free_buddy_page(p)) {
+ if (take_page_off_buddy(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
} else {
- res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
+ /* We lost the race, try again */
+ if (retry) {
+ ClearPageHWPoison(p);
+ retry = false;
+ goto try_again;
+ }
+ res = MF_FAILED;
}
- goto unlock_mutex;
- } else if (res < 0) {
- res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
- goto unlock_mutex;
+ res = action_result(pfn, MF_MSG_BUDDY, res);
+ } else {
+ res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
}
+ goto unlock_mutex;
+ } else if (res < 0) {
+ res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
+ goto unlock_mutex;
}
folio = page_folio(p);
diff --git a/mm/memory.c b/mm/memory.c
index 2a55edc48a65..da360a6eb8a4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1465,7 +1465,11 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
- if (src_vma->vm_flags & VM_COPY_ON_FORK)
+ /*
+ * We check against dst_vma as while sane VMA flags will have been
+ * copied, VM_UFFD_WP may be set only on dst_vma.
+ */
+ if (dst_vma->vm_flags & VM_COPY_ON_FORK)
return true;
/*
* The presence of an anon_vma indicates an anonymous VMA has page
@@ -1963,10 +1967,9 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
do {
next = pud_addr_end(addr, end);
if (pud_trans_huge(*pud)) {
- if (next - addr != HPAGE_PUD_SIZE) {
- mmap_assert_locked(tlb->mm);
+ if (next - addr != HPAGE_PUD_SIZE)
split_huge_pud(vma, pud, addr);
- } else if (zap_huge_pud(tlb, vma, pud, addr))
+ else if (zap_huge_pud(tlb, vma, pud, addr))
goto next;
/* fall through */
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 4c2e0d68eb27..ac7be07e3361 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -427,8 +427,6 @@ void free_zone_device_folio(struct folio *folio)
if (folio_test_anon(folio)) {
for (i = 0; i < nr; i++)
__ClearPageAnonExclusive(folio_page(folio, i));
- } else {
- VM_WARN_ON_ONCE(folio_test_large(folio));
}
/*
@@ -479,10 +477,43 @@ void free_zone_device_folio(struct folio *folio)
}
}
-void zone_device_page_init(struct page *page, unsigned int order)
+void zone_device_page_init(struct page *page, struct dev_pagemap *pgmap,
+ unsigned int order)
{
+ struct page *new_page = page;
+ unsigned int i;
+
VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES);
+ for (i = 0; i < (1UL << order); ++i, ++new_page) {
+ struct folio *new_folio = (struct folio *)new_page;
+
+ /*
+ * new_page could have been part of previous higher order folio
+ * which encodes the order, in page + 1, in the flags bits. We
+ * blindly clear bits which could have set my order field here,
+ * including page head.
+ */
+ new_page->flags.f &= ~0xffUL; /* Clear possible order, page head */
+
+#ifdef NR_PAGES_IN_LARGE_FOLIO
+ /*
+ * This pointer math looks odd, but new_page could have been
+ * part of a previous higher order folio, which sets _nr_pages
+ * in page + 1 (new_page). Therefore, we use pointer casting to
+ * correctly locate the _nr_pages bits within new_page which
+ * could have modified by previous higher order folio.
+ */
+ ((struct folio *)(new_page - 1))->_nr_pages = 0;
+#endif
+
+ new_folio->mapping = NULL;
+ new_folio->pgmap = pgmap; /* Also clear compound head */
+ new_folio->share = 0; /* fsdax only, unused for device private */
+ VM_WARN_ON_FOLIO(folio_ref_count(new_folio), new_folio);
+ VM_WARN_ON_FOLIO(!folio_is_zone_device(new_folio), new_folio);
+ }
+
/*
* Drivers shouldn't be allocating pages after calling
* memunmap_pages().
diff --git a/mm/migrate.c b/mm/migrate.c
index 5169f9717f60..4688b9e38cd2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1458,6 +1458,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
int page_was_mapped = 0;
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
+ enum ttu_flags ttu = 0;
if (folio_ref_count(src) == 1) {
/* page was freed from under us. So we are done. */
@@ -1498,8 +1499,6 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
goto put_anon;
if (folio_mapped(src)) {
- enum ttu_flags ttu = 0;
-
if (!folio_test_anon(src)) {
/*
* In shared mappings, try_to_unmap could potentially
@@ -1516,16 +1515,17 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
try_to_migrate(src, ttu);
page_was_mapped = 1;
-
- if (ttu & TTU_RMAP_LOCKED)
- i_mmap_unlock_write(mapping);
}
if (!folio_mapped(src))
rc = move_to_new_folio(dst, src, mode);
if (page_was_mapped)
- remove_migration_ptes(src, !rc ? dst : src, 0);
+ remove_migration_ptes(src, !rc ? dst : src,
+ ttu ? RMP_LOCKED : 0);
+
+ if (ttu & TTU_RMAP_LOCKED)
+ i_mmap_unlock_write(mapping);
unlock_put_anon:
folio_unlock(dst);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fc2a6f1e518f..2a809cd8e7fa 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2059,7 +2059,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
*/
static unsigned long __init
deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
- struct zone *zone)
+ struct zone *zone, bool can_resched)
{
int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
@@ -2085,10 +2085,10 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
spfn = chunk_end;
- if (irqs_disabled())
- touch_nmi_watchdog();
- else
+ if (can_resched)
cond_resched();
+ else
+ touch_nmi_watchdog();
}
}
@@ -2101,7 +2101,7 @@ deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn,
{
struct zone *zone = arg;
- deferred_init_memmap_chunk(start_pfn, end_pfn, zone);
+ deferred_init_memmap_chunk(start_pfn, end_pfn, zone, true);
}
static unsigned int __init
@@ -2216,7 +2216,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order)
for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1);
nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone);
spfn = epfn, epfn += PAGES_PER_SECTION) {
- nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone);
+ nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone, false);
}
/*
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 247e3f9db6c7..7468ec388455 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -10,6 +10,7 @@
#include <linux/swap.h>
#include <linux/rmap.h>
#include <linux/pgalloc.h>
+#include <linux/hugetlb.h>
#include <asm/tlb.h>
@@ -426,6 +427,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
#endif
tlb->vma_pfn = 0;
+ tlb->fully_unshared_tables = 0;
__tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
}
@@ -460,6 +462,31 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
}
/**
+ * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a
+ * single VMA
+ * @tlb: the mmu_gather structure to initialize
+ * @vma: the vm_area_struct
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for operating on
+ * a single VMA. In contrast to tlb_gather_mmu(), calling this function will
+ * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(),
+ * this function will *not* call flush_cache_range().
+ *
+ * For hugetlb VMAs, this function will also initialize the mmu_gather
+ * page_size accordingly, not requiring a separate call to
+ * tlb_change_page_size().
+ *
+ */
+void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+{
+ tlb_gather_mmu(tlb, vma->vm_mm);
+ tlb_update_vma_flags(tlb, vma);
+ if (is_vm_hugetlb_page(vma))
+ /* All entries have the same size. */
+ tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma)));
+}
+
+/**
* tlb_finish_mmu - finish an mmu_gather structure
* @tlb: the mmu_gather structure to finish
*
@@ -469,6 +496,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
void tlb_finish_mmu(struct mmu_gather *tlb)
{
/*
+ * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
+ * due to complicated locking requirements with page table unsharing.
+ */
+ VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
+
+ /*
* If there are parallel threads are doing PTE changes on same range
* under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
* flush by batching, one thread may end up seeing inconsistent PTEs
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
index 5b009a9cd8b4..8f5735fda0a2 100644
--- a/mm/numa_memblks.c
+++ b/mm/numa_memblks.c
@@ -7,6 +7,8 @@
#include <linux/numa.h>
#include <linux/numa_memblks.h>
+#include <asm/numa.h>
+
int numa_distance_cnt;
static u8 *numa_distance;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 822e05f1a964..cbf758e27aa2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -167,6 +167,33 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { }
pcp_trylock_finish(UP_flags); \
})
+/*
+ * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e.
+ * a potentially remote cpu drain) and get interrupted by an operation that
+ * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP
+ * spinlock assumptions making the trylock a no-op. So we have to turn that
+ * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no
+ * remote cpu's so we can only be locking the only existing local one.
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+static inline void __flags_noop(unsigned long *flags) { }
+#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
+({ \
+ __flags_noop(&(flags)); \
+ spin_lock(&(ptr)->lock); \
+})
+#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
+({ \
+ spin_unlock(&(ptr)->lock); \
+ __flags_noop(&(flags)); \
+})
+#else
+#define pcp_spin_lock_maybe_irqsave(ptr, flags) \
+ spin_lock_irqsave(&(ptr)->lock, flags)
+#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \
+ spin_unlock_irqrestore(&(ptr)->lock, flags)
+#endif
+
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -914,6 +941,17 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
NULL) != NULL;
}
+static void change_pageblock_range(struct page *pageblock_page,
+ int start_order, int migratetype)
+{
+ int nr_pageblocks = 1 << (start_order - pageblock_order);
+
+ while (nr_pageblocks--) {
+ set_pageblock_migratetype(pageblock_page, migratetype);
+ pageblock_page += pageblock_nr_pages;
+ }
+}
+
/*
* Freeing function for a buddy system allocator.
*
@@ -1000,7 +1038,7 @@ static inline void __free_one_page(struct page *page,
* expand() down the line puts the sub-blocks
* on the right freelists.
*/
- set_pageblock_migratetype(buddy, migratetype);
+ change_pageblock_range(buddy, order, migratetype);
}
combined_pfn = buddy_pfn & pfn;
@@ -2147,17 +2185,6 @@ bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *pag
#endif /* CONFIG_MEMORY_ISOLATION */
-static void change_pageblock_range(struct page *pageblock_page,
- int start_order, int migratetype)
-{
- int nr_pageblocks = 1 << (start_order - pageblock_order);
-
- while (nr_pageblocks--) {
- set_pageblock_migratetype(pageblock_page, migratetype);
- pageblock_page += pageblock_nr_pages;
- }
-}
-
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
@@ -2556,6 +2583,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
int high_min, to_drain, to_drain_batched, batch;
+ unsigned long UP_flags;
bool todo = false;
high_min = READ_ONCE(pcp->high_min);
@@ -2575,9 +2603,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
to_drain = pcp->count - pcp->high;
while (to_drain > 0) {
to_drain_batched = min(to_drain, batch);
- spin_lock(&pcp->lock);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
- spin_unlock(&pcp->lock);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
todo = true;
to_drain -= to_drain_batched;
@@ -2594,14 +2622,15 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
+ unsigned long UP_flags;
int to_drain, batch;
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
- spin_lock(&pcp->lock);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- spin_unlock(&pcp->lock);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
}
}
#endif
@@ -2612,10 +2641,11 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+ unsigned long UP_flags;
int count;
do {
- spin_lock(&pcp->lock);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
count = pcp->count;
if (count) {
int to_drain = min(count,
@@ -2624,7 +2654,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
free_pcppages_bulk(zone, to_drain, pcp, 0);
count -= to_drain;
}
- spin_unlock(&pcp->lock);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
} while (count);
}
@@ -5924,7 +5954,7 @@ static int zone_batchsize(struct zone *zone)
* recycled, this leads to the once large chunks of space being
* fragmented and becoming unavailable for high-order allocations.
*/
- return 0;
+ return 1;
#endif
}
@@ -6109,6 +6139,7 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
{
struct per_cpu_pages *pcp;
struct cpu_cacheinfo *cci;
+ unsigned long UP_flags;
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
cci = get_cpu_cacheinfo(cpu);
@@ -6119,12 +6150,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
* This can reduce zone lock contention without hurting
* cache-hot pages sharing.
*/
- spin_lock(&pcp->lock);
+ pcp_spin_lock_maybe_irqsave(pcp, UP_flags);
if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
pcp->flags |= PCPF_FREE_HIGH_BATCH;
else
pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
- spin_unlock(&pcp->lock);
+ pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags);
}
void setup_pcp_cacheinfo(unsigned int cpu)
@@ -6667,11 +6698,19 @@ static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table *
int old_percpu_pagelist_high_fraction;
int ret;
+ /*
+ * Avoid using pcp_batch_high_lock for reads as the value is read
+ * atomically and a race with offlining is harmless.
+ */
+
+ if (!write)
+ return proc_dointvec_minmax(table, write, buffer, length, ppos);
+
mutex_lock(&pcp_batch_high_lock);
old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || ret < 0)
+ if (ret < 0)
goto out;
/* Sanity checking to avoid pcp imbalance */
@@ -7418,20 +7457,16 @@ bool put_page_back_buddy(struct page *page)
}
#endif
-#ifdef CONFIG_ZONE_DMA
-bool has_managed_dma(void)
+bool has_managed_zone(enum zone_type zone)
{
struct pglist_data *pgdat;
for_each_online_pgdat(pgdat) {
- struct zone *zone = &pgdat->node_zones[ZONE_DMA];
-
- if (managed_zone(zone))
+ if (managed_zone(&pgdat->node_zones[zone]))
return true;
}
return false;
}
-#endif /* CONFIG_ZONE_DMA */
#ifdef CONFIG_UNACCEPTED_MEMORY
diff --git a/mm/page_owner.c b/mm/page_owner.c
index a70245684206..b3260f0c17ba 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -952,7 +952,7 @@ static const struct file_operations page_owner_stack_fops = {
.open = page_owner_stack_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_private,
};
static int page_owner_threshold_get(void *data, u64 *val)
diff --git a/mm/rmap.c b/mm/rmap.c
index f955f02d570e..7b9879ef442d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -76,7 +76,7 @@
#include <linux/mm_inline.h>
#include <linux/oom.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <trace/events/migrate.h>
@@ -2008,26 +2008,25 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma))
goto walk_abort;
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+
+ tlb_gather_mmu_vma(&tlb, vma);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
- * The ref count of the PMD page was
- * dropped which is part of the way map
- * counting is done for shared PMDs.
- * Return 'true' here. When there is
- * no other sharing, huge_pmd_unshare
- * returns false and we will unmap the
- * actual page and drop map count
- * to zero.
+ * The PMD table was unmapped,
+ * consequently unmapping the folio.
*/
goto walk_done;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
if (pte_dirty(pteval))
@@ -2404,31 +2403,29 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* fail if unsuccessful.
*/
if (!anon) {
+ struct mmu_gather tlb;
+
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
if (!hugetlb_vma_trylock_write(vma)) {
page_vma_mapped_walk_done(&pvmw);
ret = false;
break;
}
- if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
- hugetlb_vma_unlock_write(vma);
- flush_tlb_range(vma,
- range.start, range.end);
+ tlb_gather_mmu_vma(&tlb, vma);
+ if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) {
+ hugetlb_vma_unlock_write(vma);
+ huge_pmd_unshare_flush(&tlb, vma);
+ tlb_finish_mmu(&tlb);
/*
- * The ref count of the PMD page was
- * dropped which is part of the way map
- * counting is done for shared PMDs.
- * Return 'true' here. When there is
- * no other sharing, huge_pmd_unshare
- * returns false and we will unmap the
- * actual page and drop map count
- * to zero.
+ * The PMD table was unmapped,
+ * consequently unmapping the folio.
*/
page_vma_mapped_walk_done(&pvmw);
break;
}
hugetlb_vma_unlock_write(vma);
+ tlb_finish_mmu(&tlb);
}
/* Nuke the hugetlb page table entry */
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index ec6c01378e9d..79af5f9f8b90 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -962,17 +962,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
* being freed).
*/
static long shmem_free_swap(struct address_space *mapping,
- pgoff_t index, void *radswap)
+ pgoff_t index, pgoff_t end, void *radswap)
{
- int order = xa_get_order(&mapping->i_pages, index);
- void *old;
+ XA_STATE(xas, &mapping->i_pages, index);
+ unsigned int nr_pages = 0;
+ pgoff_t base;
+ void *entry;
- old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
- if (old != radswap)
- return 0;
- free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
+ xas_lock_irq(&xas);
+ entry = xas_load(&xas);
+ if (entry == radswap) {
+ nr_pages = 1 << xas_get_order(&xas);
+ base = round_down(xas.xa_index, nr_pages);
+ if (base < index || base + nr_pages - 1 > end)
+ nr_pages = 0;
+ else
+ xas_store(&xas, NULL);
+ }
+ xas_unlock_irq(&xas);
- return 1 << order;
+ if (nr_pages)
+ free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages);
+
+ return nr_pages;
}
/*
@@ -1124,8 +1136,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
if (xa_is_value(folio)) {
if (unfalloc)
continue;
- nr_swaps_freed += shmem_free_swap(mapping,
- indices[i], folio);
+ nr_swaps_freed += shmem_free_swap(mapping, indices[i],
+ end - 1, folio);
continue;
}
@@ -1191,14 +1203,30 @@ whole_folios:
folio = fbatch.folios[i];
if (xa_is_value(folio)) {
+ int order;
long swaps_freed;
if (unfalloc)
continue;
- swaps_freed = shmem_free_swap(mapping, indices[i], folio);
+ swaps_freed = shmem_free_swap(mapping, indices[i],
+ end - 1, folio);
if (!swaps_freed) {
- /* Swap was replaced by page: retry */
- index = indices[i];
+ pgoff_t base = indices[i];
+
+ order = shmem_confirm_swap(mapping, indices[i],
+ radix_to_swp_entry(folio));
+ /*
+ * If found a large swap entry cross the end or start
+ * border, skip it as the truncate_inode_partial_folio
+ * above should have at least zerod its content once.
+ */
+ if (order > 0) {
+ base = round_down(base, 1 << order);
+ if (base < start || base + (1 << order) > end)
+ continue;
+ }
+ /* Swap was replaced by page or extended, retry */
+ index = base;
break;
}
nr_swaps_freed += swaps_freed;
diff --git a/mm/slub.c b/mm/slub.c
index 861592ac5425..cdc1e652ec52 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5694,8 +5694,12 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node)
if (unlikely(!size))
return ZERO_SIZE_PTR;
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
- /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
+ /*
+ * kmalloc_nolock() in PREEMPT_RT is not supported from
+ * non-preemptible context because local_lock becomes a
+ * sleeping lock on RT.
+ */
return NULL;
retry:
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
@@ -6685,8 +6689,12 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object,
static noinline
void memcg_alloc_abort_single(struct kmem_cache *s, void *object)
{
+ struct slab *slab = virt_to_slab(object);
+
+ alloc_tagging_slab_free_hook(s, slab, &object, 1);
+
if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false)))
- do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_);
+ do_slab_free(s, slab, object, object, 1, _RET_IP_);
}
#endif
diff --git a/mm/swap.h b/mm/swap.h
index d034c13d8dd2..1bd466da3039 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -198,7 +198,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug);
void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug);
/* linux/mm/swap_state.c */
-extern struct address_space swap_space __ro_after_init;
+extern struct address_space swap_space __read_mostly;
static inline struct address_space *swap_address_space(swp_entry_t entry)
{
return &swap_space;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5f97c6ae70a2..44d228982521 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -37,8 +37,7 @@ static const struct address_space_operations swap_aops = {
#endif
};
-/* Set swap_space as read only as swap cache is handled by swap table */
-struct address_space swap_space __ro_after_init = {
+struct address_space swap_space __read_mostly = {
.a_ops = &swap_aops,
};
diff --git a/mm/vma.c b/mm/vma.c
index fc90befd162f..7a908a964d18 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -37,6 +37,8 @@ struct mmap_state {
bool check_ksm_early :1;
/* If we map new, hold the file rmap lock on mapping. */
bool hold_file_rmap_lock :1;
+ /* If .mmap_prepare changed the file, we don't need to pin. */
+ bool file_doesnt_need_get :1;
};
#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
@@ -67,18 +69,13 @@ struct mmap_state {
.state = VMA_MERGE_START, \
}
-/*
- * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
- * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
- * would mean a wider range of folios sharing the root anon_vma lock, and thus
- * potential lock contention, we do not wish to encourage merging such that this
- * scales to a problem.
- */
-static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
+/* Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? */
+static bool vma_is_fork_child(struct vm_area_struct *vma)
{
/*
* The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
+ * parents. This can improve scalability caused by the anon_vma root
+ * lock.
*/
return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
}
@@ -115,11 +112,19 @@ static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
VM_WARN_ON(src && src_anon != src->anon_vma);
/* Case 1 - we will dup_anon_vma() from src into tgt. */
- if (!tgt_anon && src_anon)
- return !vma_had_uncowed_parents(src);
+ if (!tgt_anon && src_anon) {
+ struct vm_area_struct *copied_from = vmg->copied_from;
+
+ if (vma_is_fork_child(src))
+ return false;
+ if (vma_is_fork_child(copied_from))
+ return false;
+
+ return true;
+ }
/* Case 2 - we will simply use tgt's anon_vma. */
if (tgt_anon && !src_anon)
- return !vma_had_uncowed_parents(tgt);
+ return !vma_is_fork_child(tgt);
/* Case 3 - the anon_vma's are already shared. */
return src_anon == tgt_anon;
}
@@ -829,6 +834,8 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
VM_WARN_ON_VMG(middle &&
!(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);
+ /* An existing merge can never be used by the mremap() logic. */
+ VM_WARN_ON_VMG(vmg->copied_from, vmg);
vmg->state = VMA_MERGE_NOMERGE;
@@ -1099,6 +1106,33 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
}
/*
+ * vma_merge_copied_range - Attempt to merge a VMA that is being copied by
+ * mremap()
+ *
+ * @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to
+ * @vmg->end (exclusive), which we try to merge with any adjacent VMAs if
+ * possible.
+ *
+ * vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO
+ * range, i.e. the target range for the VMA.
+ *
+ * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
+ * to the VMA we expanded.
+ *
+ * ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain
+ * the copied-from VMA.
+ */
+static struct vm_area_struct *vma_merge_copied_range(struct vma_merge_struct *vmg)
+{
+ /* We must have a copied-from VMA. */
+ VM_WARN_ON_VMG(!vmg->middle, vmg);
+
+ vmg->copied_from = vmg->middle;
+ vmg->middle = NULL;
+ return vma_merge_new_range(vmg);
+}
+
+/*
* vma_expand - Expand an existing VMA
*
* @vmg: Describes a VMA expansion operation.
@@ -1117,46 +1151,52 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
- bool remove_next = false;
struct vm_area_struct *target = vmg->target;
struct vm_area_struct *next = vmg->next;
+ bool remove_next = false;
vm_flags_t sticky_flags;
-
- sticky_flags = vmg->vm_flags & VM_STICKY;
- sticky_flags |= target->vm_flags & VM_STICKY;
-
- VM_WARN_ON_VMG(!target, vmg);
+ int ret = 0;
mmap_assert_write_locked(vmg->mm);
-
vma_start_write(target);
- if (next && (target != next) && (vmg->end == next->vm_end)) {
- int ret;
- sticky_flags |= next->vm_flags & VM_STICKY;
+ if (next && target != next && vmg->end == next->vm_end)
remove_next = true;
- /* This should already have been checked by this point. */
- VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
- vma_start_write(next);
- /*
- * In this case we don't report OOM, so vmg->give_up_on_mm is
- * safe.
- */
- ret = dup_anon_vma(target, next, &anon_dup);
- if (ret)
- return ret;
- }
+ /* We must have a target. */
+ VM_WARN_ON_VMG(!target, vmg);
+ /* This should have already been checked by this point. */
+ VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg);
/* Not merging but overwriting any part of next is not handled. */
VM_WARN_ON_VMG(next && !remove_next &&
next != target && vmg->end > next->vm_start, vmg);
- /* Only handles expanding */
+ /* Only handles expanding. */
VM_WARN_ON_VMG(target->vm_start < vmg->start ||
target->vm_end > vmg->end, vmg);
+ sticky_flags = vmg->vm_flags & VM_STICKY;
+ sticky_flags |= target->vm_flags & VM_STICKY;
if (remove_next)
- vmg->__remove_next = true;
+ sticky_flags |= next->vm_flags & VM_STICKY;
+
+ /*
+ * If we are removing the next VMA or copying from a VMA
+ * (e.g. mremap()'ing), we must propagate anon_vma state.
+ *
+ * Note that, by convention, callers ignore OOM for this case, so
+ * we don't need to account for vmg->give_up_on_mm here.
+ */
+ if (remove_next)
+ ret = dup_anon_vma(target, next, &anon_dup);
+ if (!ret && vmg->copied_from)
+ ret = dup_anon_vma(target, vmg->copied_from, &anon_dup);
+ if (ret)
+ return ret;
+ if (remove_next) {
+ vma_start_write(next);
+ vmg->__remove_next = true;
+ }
if (commit_merge(vmg))
goto nomem;
@@ -1828,10 +1868,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
- vmg.middle = NULL; /* New VMA range. */
vmg.pgoff = pgoff;
vmg.next = vma_iter_next_rewind(&vmi, NULL);
- new_vma = vma_merge_new_range(&vmg);
+ new_vma = vma_merge_copied_range(&vmg);
if (new_vma) {
/*
@@ -2413,7 +2452,9 @@ static int __mmap_new_file_vma(struct mmap_state *map,
struct vma_iterator *vmi = map->vmi;
int error;
- vma->vm_file = get_file(map->file);
+ vma->vm_file = map->file;
+ if (!map->file_doesnt_need_get)
+ get_file(map->file);
if (!map->file->f_op->mmap)
return 0;
@@ -2601,7 +2642,10 @@ static int call_mmap_prepare(struct mmap_state *map,
/* Update fields permitted to be changed. */
map->pgoff = desc->pgoff;
- map->file = desc->vm_file;
+ if (desc->vm_file != map->file) {
+ map->file_doesnt_need_get = true;
+ map->file = desc->vm_file;
+ }
map->vm_flags = desc->vm_flags;
map->page_prot = desc->page_prot;
/* User-defined fields. */
diff --git a/mm/vma.h b/mm/vma.h
index abada6a64c4e..9d5ee6ac913a 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -106,6 +106,9 @@ struct vma_merge_struct {
struct anon_vma_name *anon_name;
enum vma_merge_state state;
+ /* If copied from (i.e. mremap()'d) the VMA from which we are copying. */
+ struct vm_area_struct *copied_from;
+
/* Flags which callers can use to modify merge behaviour: */
/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ecbac900c35f..e286c2d2068c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4248,7 +4248,7 @@ void *vzalloc_node_noprof(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node_noprof);
/**
- * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents
+ * vrealloc_node_align - reallocate virtually contiguous memory; contents
* remain unchanged
* @p: object to reallocate memory for
* @size: the size to reallocate
@@ -4322,7 +4322,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
if (want_init_on_free() || want_init_on_alloc(flags))
memset((void *)p + size, 0, old_size - size);
vm->requested_size = size;
- kasan_poison_vmalloc(p + size, old_size - size);
+ kasan_vrealloc(p, old_size, size);
return (void *)p;
}
@@ -4330,14 +4330,13 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align
* We already have the bytes available in the allocation; use them.
*/
if (size <= alloced_size) {
- kasan_unpoison_vmalloc(p + old_size, size - old_size,
- KASAN_VMALLOC_PROT_NORMAL);
/*
* No need to zero memory here, as unused memory will have
* already been zeroed at initial allocation time or during
* realloc shrink time.
*/
vm->requested_size = size;
+ kasan_vrealloc(p, old_size, size);
return (void *)p;
}
@@ -5025,9 +5024,7 @@ retry:
* With hardware tag-based KASAN, marking is skipped for
* non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
*/
- for (area = 0; area < nr_vms; area++)
- vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
- vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);
+ kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL);
kfree(vas);
return vms;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 670fe9fae5ba..614ccf39fe3f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -7707,6 +7707,17 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
return ret;
}
+#else
+
+static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask,
+ unsigned long nr_pages,
+ struct scan_control *sc)
+{
+ return 0;
+}
+
+#endif
+
enum {
MEMORY_RECLAIM_SWAPPINESS = 0,
MEMORY_RECLAIM_SWAPPINESS_MAX,
@@ -7814,8 +7825,6 @@ int user_proactive_reclaim(char *buf,
return 0;
}
-#endif
-
/**
* check_move_unevictable_folios - Move evictable folios to appropriate zone
* lru list
diff --git a/mm/zswap.c b/mm/zswap.c
index 5d0f8b13a958..ac9b7a60736b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -787,7 +787,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
return 0;
fail:
- if (acomp)
+ if (!IS_ERR_OR_NULL(acomp))
crypto_free_acomp(acomp);
kfree(buffer);
return ret;