diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 12 | ||||
| -rw-r--r-- | mm/damon/core.c | 41 | ||||
| -rw-r--r-- | mm/damon/sysfs-schemes.c | 10 | ||||
| -rw-r--r-- | mm/damon/sysfs.c | 9 | ||||
| -rw-r--r-- | mm/damon/vaddr.c | 2 | ||||
| -rw-r--r-- | mm/hugetlb.c | 147 | ||||
| -rw-r--r-- | mm/init-mm.c | 5 | ||||
| -rw-r--r-- | mm/internal.h | 8 | ||||
| -rw-r--r-- | mm/kasan/common.c | 53 | ||||
| -rw-r--r-- | mm/kasan/hw_tags.c | 2 | ||||
| -rw-r--r-- | mm/kasan/shadow.c | 4 | ||||
| -rw-r--r-- | mm/kfence/core.c | 40 | ||||
| -rw-r--r-- | mm/kmsan/shadow.c | 2 | ||||
| -rw-r--r-- | mm/ksm.c | 2 | ||||
| -rw-r--r-- | mm/memcontrol.c | 4 | ||||
| -rw-r--r-- | mm/memfd.c | 4 | ||||
| -rw-r--r-- | mm/memfd_luo.c | 10 | ||||
| -rw-r--r-- | mm/memory-failure.c | 170 | ||||
| -rw-r--r-- | mm/memory.c | 11 | ||||
| -rw-r--r-- | mm/memremap.c | 37 | ||||
| -rw-r--r-- | mm/migrate.c | 12 | ||||
| -rw-r--r-- | mm/mm_init.c | 12 | ||||
| -rw-r--r-- | mm/mmu_gather.c | 33 | ||||
| -rw-r--r-- | mm/numa_memblks.c | 2 | ||||
| -rw-r--r-- | mm/page_alloc.c | 91 | ||||
| -rw-r--r-- | mm/page_owner.c | 2 | ||||
| -rw-r--r-- | mm/rmap.c | 45 | ||||
| -rw-r--r-- | mm/shmem.c | 54 | ||||
| -rw-r--r-- | mm/slub.c | 14 | ||||
| -rw-r--r-- | mm/swap.h | 2 | ||||
| -rw-r--r-- | mm/swap_state.c | 3 | ||||
| -rw-r--r-- | mm/vma.c | 122 | ||||
| -rw-r--r-- | mm/vma.h | 3 | ||||
| -rw-r--r-- | mm/vmalloc.c | 11 | ||||
| -rw-r--r-- | mm/vmscan.c | 13 | ||||
| -rw-r--r-- | mm/zswap.c | 2 |
36 files changed, 677 insertions, 317 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index bd0ea5454af8..a992f2203eb9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1220,10 +1220,14 @@ config ZONE_DEVICE Device memory hotplug support allows for establishing pmem, or other device driver discovered memory regions, in the memmap. This allows pfn_to_page() lookups of otherwise - "device-physical" addresses which is needed for using a DAX - mapping in an O_DIRECT operation, among other things. - - If FS_DAX is enabled, then say Y. + "device-physical" addresses which is needed for DAX, PCI_P2PDMA, and + DEVICE_PRIVATE features among others. + + Enabling this option will reduce the entropy of x86 KASLR memory + regions. For example - on a 46 bit system, the entropy goes down + from 16 bits to 15 bits. The actual reduction in entropy depends + on the physical address bits, on processor features, kernel config + (5 level page table) and physical memory present on the system. # # Helpers to mirror range of the CPU page tables of a process into device page diff --git a/mm/damon/core.c b/mm/damon/core.c index f9fc0375890a..84f80a20f233 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1431,6 +1431,35 @@ bool damon_is_running(struct damon_ctx *ctx) return running; } +/* + * damon_call_handle_inactive_ctx() - handle DAMON call request that added to + * an inactive context. + * @ctx: The inactive DAMON context. + * @control: Control variable of the call request. + * + * This function is called in a case that @control is added to @ctx but @ctx is + * not running (inactive). See if @ctx handled @control or not, and cleanup + * @control if it was not handled. + * + * Returns 0 if @control was handled by @ctx, negative error code otherwise. + */ +static int damon_call_handle_inactive_ctx( + struct damon_ctx *ctx, struct damon_call_control *control) +{ + struct damon_call_control *c; + + mutex_lock(&ctx->call_controls_lock); + list_for_each_entry(c, &ctx->call_controls, list) { + if (c == control) { + list_del(&control->list); + mutex_unlock(&ctx->call_controls_lock); + return -EINVAL; + } + } + mutex_unlock(&ctx->call_controls_lock); + return 0; +} + /** * damon_call() - Invoke a given function on DAMON worker thread (kdamond). * @ctx: DAMON context to call the function for. @@ -1461,7 +1490,7 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control) list_add_tail(&control->list, &ctx->call_controls); mutex_unlock(&ctx->call_controls_lock); if (!damon_is_running(ctx)) - return -EINVAL; + return damon_call_handle_inactive_ctx(ctx, control); if (control->repeat) return 0; wait_for_completion(&control->completion); @@ -2051,13 +2080,15 @@ static unsigned long damos_get_node_memcg_used_bp( rcu_read_lock(); memcg = mem_cgroup_from_id(goal->memcg_id); - rcu_read_unlock(); - if (!memcg) { + if (!memcg || !mem_cgroup_tryget(memcg)) { + rcu_read_unlock(); if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) return 0; else /* DAMOS_QUOTA_NODE_MEMCG_FREE_BP */ return 10000; } + rcu_read_unlock(); + mem_cgroup_flush_stats(memcg); lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(goal->nid)); used_pages = lruvec_page_state(lruvec, NR_ACTIVE_ANON); @@ -2065,6 +2096,8 @@ static unsigned long damos_get_node_memcg_used_bp( used_pages += lruvec_page_state(lruvec, NR_ACTIVE_FILE); used_pages += lruvec_page_state(lruvec, NR_INACTIVE_FILE); + mem_cgroup_put(memcg); + si_meminfo_node(&i, goal->nid); if (goal->metric == DAMOS_QUOTA_NODE_MEMCG_USED_BP) numerator = used_pages; @@ -2751,13 +2784,13 @@ done: if (ctx->ops.cleanup) ctx->ops.cleanup(ctx); kfree(ctx->regions_score_histogram); + kdamond_call(ctx, true); pr_debug("kdamond (%d) finishes\n", current->pid); mutex_lock(&ctx->kdamond_lock); ctx->kdamond = NULL; mutex_unlock(&ctx->kdamond_lock); - kdamond_call(ctx, true); damos_walk_cancel(ctx); mutex_lock(&damon_lock); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 30d20f5b3192..3a699dcd5a7f 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -2152,13 +2152,13 @@ static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) return err; err = damos_sysfs_set_dests(scheme); if (err) - goto put_access_pattern_out; + goto rmdir_put_access_pattern_out; err = damon_sysfs_scheme_set_quotas(scheme); if (err) goto put_dests_out; err = damon_sysfs_scheme_set_watermarks(scheme); if (err) - goto put_quotas_access_pattern_out; + goto rmdir_put_quotas_access_pattern_out; err = damos_sysfs_set_filter_dirs(scheme); if (err) goto put_watermarks_quotas_access_pattern_out; @@ -2183,13 +2183,15 @@ put_filters_watermarks_quotas_access_pattern_out: put_watermarks_quotas_access_pattern_out: kobject_put(&scheme->watermarks->kobj); scheme->watermarks = NULL; -put_quotas_access_pattern_out: +rmdir_put_quotas_access_pattern_out: + damon_sysfs_quotas_rm_dirs(scheme->quotas); kobject_put(&scheme->quotas->kobj); scheme->quotas = NULL; put_dests_out: kobject_put(&scheme->dests->kobj); scheme->dests = NULL; -put_access_pattern_out: +rmdir_put_access_pattern_out: + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); kobject_put(&scheme->access_pattern->kobj); scheme->access_pattern = NULL; return err; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index e2bd2d7becdd..95fd9375a7d8 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -792,7 +792,7 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) nr_regions_range = damon_sysfs_ul_range_alloc(10, 1000); if (!nr_regions_range) { err = -ENOMEM; - goto put_intervals_out; + goto rmdir_put_intervals_out; } err = kobject_init_and_add(&nr_regions_range->kobj, @@ -806,6 +806,8 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) put_nr_regions_intervals_out: kobject_put(&nr_regions_range->kobj); attrs->nr_regions_range = NULL; +rmdir_put_intervals_out: + damon_sysfs_intervals_rm_dirs(intervals); put_intervals_out: kobject_put(&intervals->kobj); attrs->intervals = NULL; @@ -948,7 +950,7 @@ static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) err = damon_sysfs_context_set_targets(context); if (err) - goto put_attrs_out; + goto rmdir_put_attrs_out; err = damon_sysfs_context_set_schemes(context); if (err) @@ -958,7 +960,8 @@ static int damon_sysfs_context_add_dirs(struct damon_sysfs_context *context) put_targets_attrs_out: kobject_put(&context->targets->kobj); context->targets = NULL; -put_attrs_out: +rmdir_put_attrs_out: + damon_sysfs_attrs_rm_dirs(context->attrs); kobject_put(&context->attrs->kobj); context->attrs = NULL; return err; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 2750c88e7225..23ed738a0bd6 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -743,7 +743,7 @@ huge_out: if (!folio) continue; if (damos_va_filter_out(s, folio, walk->vma, addr, pte, NULL)) - return 0; + continue; damos_va_migrate_dests_add(folio, walk->vma, addr, dests, migration_lists); nr = folio_nr_pages(folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51273baec9e5..a1832da0f623 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4286,6 +4286,11 @@ static int __init hugepages_setup(char *s) unsigned long tmp; char *p = s; + if (!hugepages_supported()) { + pr_warn("HugeTLB: hugepages unsupported, ignoring hugepages=%s cmdline\n", s); + return 0; + } + if (!parsed_valid_hugepagesz) { pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s); parsed_valid_hugepagesz = true; @@ -4366,6 +4371,11 @@ static int __init hugepagesz_setup(char *s) unsigned long size; struct hstate *h; + if (!hugepages_supported()) { + pr_warn("HugeTLB: hugepages unsupported, ignoring hugepagesz=%s cmdline\n", s); + return 0; + } + parsed_valid_hugepagesz = false; size = (unsigned long)memparse(s, NULL); @@ -4414,6 +4424,12 @@ static int __init default_hugepagesz_setup(char *s) unsigned long size; int i; + if (!hugepages_supported()) { + pr_warn("HugeTLB: hugepages unsupported, ignoring default_hugepagesz=%s cmdline\n", + s); + return 0; + } + parsed_valid_hugepagesz = false; if (parsed_default_hugepagesz) { pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s); @@ -5096,7 +5112,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, unsigned long last_addr_mask; pte_t *src_pte, *dst_pte; struct mmu_notifier_range range; - bool shared_pmd = false; + struct mmu_gather tlb; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr, old_end); @@ -5106,6 +5122,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, * range. */ flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); @@ -5122,8 +5139,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; - if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { - shared_pmd = true; + if (huge_pmd_unshare(&tlb, vma, old_addr, src_pte)) { old_addr |= last_addr_mask; new_addr |= last_addr_mask; continue; @@ -5134,15 +5150,16 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, break; move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); + tlb_remove_huge_tlb_entry(h, &tlb, src_pte, old_addr); } - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, old_end - len, old_end); + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); + mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); return len + old_addr - old_end; } @@ -5161,7 +5178,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); bool adjust_reservation; unsigned long last_addr_mask; - bool force_flush = false; WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); @@ -5184,10 +5200,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(tlb, vma, address, ptep)) { spin_unlock(ptl); - tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE); - force_flush = true; address |= last_addr_mask; continue; } @@ -5303,21 +5317,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } tlb_end_vma(tlb, vma); - /* - * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We - * could defer the flush until now, since by holding i_mmap_rwsem we - * guaranteed that the last reference would not be dropped. But we must - * do the flushing before we return, as otherwise i_mmap_rwsem will be - * dropped and the last reference to the shared PMDs page might be - * dropped as well. - * - * In theory we could defer the freeing of the PMD pages as well, but - * huge_pmd_unshare() relies on the exact page_count for the PMD page to - * detect sharing, so we cannot defer the release of the page either. - * Instead, do flush now. - */ - if (force_flush) - tlb_flush_mmu_tlbonly(tlb); + huge_pmd_unshare_flush(tlb, vma); } void __hugetlb_zap_begin(struct vm_area_struct *vma, @@ -6416,11 +6416,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte_t pte; struct hstate *h = hstate_vma(vma); long pages = 0, psize = huge_page_size(h); - bool shared_pmd = false; struct mmu_notifier_range range; unsigned long last_addr_mask; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + struct mmu_gather tlb; /* * In the case of shared PMDs, the area to flush could be beyond @@ -6433,6 +6433,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, BUG_ON(address >= end); flush_cache_range(vma, range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); mmu_notifier_invalidate_range_start(&range); hugetlb_vma_lock_write(vma); @@ -6459,7 +6460,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, } } ptl = huge_pte_lock(h, mm, ptep); - if (huge_pmd_unshare(mm, vma, address, ptep)) { + if (huge_pmd_unshare(&tlb, vma, address, ptep)) { /* * When uffd-wp is enabled on the vma, unshare * shouldn't happen at all. Warn about it if it @@ -6468,7 +6469,6 @@ long hugetlb_change_protection(struct vm_area_struct *vma, WARN_ON_ONCE(uffd_wp || uffd_wp_resolve); pages++; spin_unlock(ptl); - shared_pmd = true; address |= last_addr_mask; continue; } @@ -6529,23 +6529,16 @@ long hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_pte_clear_uffd_wp(pte); huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); pages++; + tlb_remove_huge_tlb_entry(h, &tlb, ptep, address); } next: spin_unlock(ptl); cond_resched(); } - /* - * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare - * may have cleared our pud entry and done put_page on the page table: - * once we release i_mmap_rwsem, another task can do the final put_page - * and that page table be reused and filled with junk. If we actually - * did unshare a page of pmds, flush the range corresponding to the pud. - */ - if (shared_pmd) - flush_hugetlb_tlb_range(vma, range.start, range.end); - else - flush_hugetlb_tlb_range(vma, start, end); + + tlb_flush_mmu_tlbonly(&tlb); + huge_pmd_unshare_flush(&tlb, vma); /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are * downgrading page table protection not changing it to point to a new @@ -6556,6 +6549,7 @@ next: i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); return pages > 0 ? (pages << h->order) : pages; } @@ -6912,18 +6906,27 @@ out: return pte; } -/* - * unmap huge page backed by shared pte. +/** + * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * @addr: the address we are trying to unshare. + * @ptep: pointer into the (pmd) page table. * - * Called with page table lock held. + * Called with the page table lock held, the i_mmap_rwsem held in write mode + * and the hugetlb vma lock held in write mode. * - * returns: 1 successfully unmapped a shared pte page - * 0 the underlying pte page is not shared, or it is the last user + * Note: The caller must call huge_pmd_unshare_flush() before dropping the + * i_mmap_rwsem. + * + * Returns: 1 if it was a shared PMD table and it got unmapped, or 0 if it + * was not a shared PMD table. */ -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { unsigned long sz = huge_page_size(hstate_vma(vma)); + struct mm_struct *mm = vma->vm_mm; pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); @@ -6935,18 +6938,36 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); pud_clear(pud); - /* - * Once our caller drops the rmap lock, some other process might be - * using this page table as a normal, non-hugetlb page table. - * Wait for pending gup_fast() in other threads to finish before letting - * that happen. - */ - tlb_remove_table_sync_one(); - ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); + + tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); + mm_dec_nr_pmds(mm); return 1; } +/* + * huge_pmd_unshare_flush - Complete a sequence of huge_pmd_unshare() calls + * @tlb: the current mmu_gather. + * @vma: the vma covering the pmd table. + * + * Perform necessary TLB flushes or IPI broadcasts to synchronize PMD table + * unsharing with concurrent page table walkers. + * + * This function must be called after a sequence of huge_pmd_unshare() + * calls while still holding the i_mmap_rwsem. + */ +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + /* + * We must synchronize page table unsharing such that nobody will + * try reusing a previously-shared page table while it might still + * be in use by previous sharers (TLB, GUP_fast). + */ + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + + tlb_flush_unshared_tables(tlb); +} + #else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, @@ -6955,12 +6976,16 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; } -int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { return 0; } +void huge_pmd_unshare_flush(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ +} + void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { @@ -7227,6 +7252,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long sz = huge_page_size(h); struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; + struct mmu_gather tlb; unsigned long address; spinlock_t *ptl; pte_t *ptep; @@ -7238,6 +7264,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, return; flush_cache_range(vma, start, end); + tlb_gather_mmu_vma(&tlb, vma); + /* * No need to call adjust_range_if_pmd_sharing_possible(), because * we have already done the PUD_SIZE alignment. @@ -7256,10 +7284,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - huge_pmd_unshare(mm, vma, address, ptep); + huge_pmd_unshare(&tlb, vma, address, ptep); spin_unlock(ptl); } - flush_hugetlb_tlb_range(vma, start, end); + huge_pmd_unshare_flush(&tlb, vma); if (take_locks) { i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); @@ -7269,6 +7297,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); + tlb_finish_mmu(&tlb); } /* diff --git a/mm/init-mm.c b/mm/init-mm.c index 4600e7605cab..c5556bb9d5f0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -44,7 +44,10 @@ struct mm_struct init_mm = { .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, - .cpu_bitmap = CPU_BITS_NONE, +#ifdef CONFIG_SCHED_MM_CID + .mm_cid.lock = __RAW_SPIN_LOCK_UNLOCKED(init_mm.mm_cid.lock), +#endif + .flexible_array = MM_STRUCT_FLEXIBLE_ARRAY_INIT, INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index e430da900430..f35dbcf99a86 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -538,16 +538,8 @@ extern unsigned long highest_memmap_pfn; bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); -#ifdef CONFIG_NUMA int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat); -#else -static inline int user_proactive_reclaim(char *buf, - struct mem_cgroup *memcg, pg_data_t *pgdat) -{ - return 0; -} -#endif /* * in mm/rmap.c: diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 1d27f1bd260b..b7d05c2a6d93 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -28,6 +28,7 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/bug.h> +#include <linux/vmalloc.h> #include "kasan.h" #include "../slab.h" @@ -575,3 +576,55 @@ bool __kasan_check_byte(const void *address, unsigned long ip) } return true; } + +#ifdef CONFIG_KASAN_VMALLOC +void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags) +{ + unsigned long size; + void *addr; + int area; + u8 tag; + + /* + * If KASAN_VMALLOC_KEEP_TAG was set at this point, all vms[] pointers + * would be unpoisoned with the KASAN_TAG_KERNEL which would disable + * KASAN checks down the line. + */ + if (WARN_ON_ONCE(flags & KASAN_VMALLOC_KEEP_TAG)) + return; + + size = vms[0]->size; + addr = vms[0]->addr; + vms[0]->addr = __kasan_unpoison_vmalloc(addr, size, flags); + tag = get_tag(vms[0]->addr); + + for (area = 1 ; area < nr_vms ; area++) { + size = vms[area]->size; + addr = set_tag(vms[area]->addr, tag); + vms[area]->addr = + __kasan_unpoison_vmalloc(addr, size, flags | KASAN_VMALLOC_KEEP_TAG); + } +} + +void __kasan_vrealloc(const void *addr, unsigned long old_size, + unsigned long new_size) +{ + if (new_size < old_size) { + kasan_poison_last_granule(addr, new_size); + + new_size = round_up(new_size, KASAN_GRANULE_SIZE); + old_size = round_up(old_size, KASAN_GRANULE_SIZE); + if (new_size < old_size) + __kasan_poison_vmalloc(addr + new_size, + old_size - new_size); + } else if (new_size > old_size) { + old_size = round_down(old_size, KASAN_GRANULE_SIZE); + __kasan_unpoison_vmalloc(addr + old_size, + new_size - old_size, + KASAN_VMALLOC_PROT_NORMAL | + KASAN_VMALLOC_VM_ALLOC | + KASAN_VMALLOC_KEEP_TAG); + } +} +#endif diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 1c373cc4b3fa..cbef5e450954 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -361,7 +361,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, return (void *)start; } - tag = kasan_random_tag(); + tag = (flags & KASAN_VMALLOC_KEEP_TAG) ? get_tag(start) : kasan_random_tag(); start = set_tag(start, tag); /* Unpoison and initialize memory up to size. */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 29a751a8a08d..32fbdf759ea2 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -631,7 +631,9 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, !(flags & KASAN_VMALLOC_PROT_NORMAL)) return (void *)start; - start = set_tag(start, kasan_random_tag()); + if (unlikely(!(flags & KASAN_VMALLOC_KEEP_TAG))) + start = set_tag(start, kasan_random_tag()); + kasan_unpoison(start, size, false); return (void *)start; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 577a1699c553..4f79ec720752 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -596,7 +596,7 @@ static void rcu_guarded_free(struct rcu_head *h) static unsigned long kfence_init_pool(void) { unsigned long addr, start_pfn; - int i; + int i, rand; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; @@ -647,13 +647,27 @@ static unsigned long kfence_init_pool(void) INIT_LIST_HEAD(&meta->list); raw_spin_lock_init(&meta->lock); meta->state = KFENCE_OBJECT_UNUSED; - meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */ - list_add_tail(&meta->list, &kfence_freelist); + /* Use addr to randomize the freelist. */ + meta->addr = i; /* Protect the right redzone. */ - if (unlikely(!kfence_protect(addr + PAGE_SIZE))) + if (unlikely(!kfence_protect(addr + 2 * i * PAGE_SIZE + PAGE_SIZE))) goto reset_slab; + } + + for (i = CONFIG_KFENCE_NUM_OBJECTS; i > 0; i--) { + rand = get_random_u32_below(i); + swap(kfence_metadata_init[i - 1].addr, kfence_metadata_init[rand].addr); + } + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + struct kfence_metadata *meta_1 = &kfence_metadata_init[i]; + struct kfence_metadata *meta_2 = &kfence_metadata_init[meta_1->addr]; + + list_add_tail(&meta_2->list, &kfence_freelist); + } + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { + kfence_metadata_init[i].addr = addr; addr += 2 * PAGE_SIZE; } @@ -666,6 +680,7 @@ static unsigned long kfence_init_pool(void) return 0; reset_slab: + addr += 2 * i * PAGE_SIZE; for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { struct page *page; @@ -823,6 +838,9 @@ static struct notifier_block kfence_check_canary_notifier = { static struct delayed_work kfence_timer; #ifdef CONFIG_KFENCE_STATIC_KEYS +/* Wait queue to wake up allocation-gate timer task. */ +static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); + static int kfence_reboot_callback(struct notifier_block *nb, unsigned long action, void *data) { @@ -832,7 +850,12 @@ static int kfence_reboot_callback(struct notifier_block *nb, */ WRITE_ONCE(kfence_enabled, false); /* Cancel any pending timer work */ - cancel_delayed_work_sync(&kfence_timer); + cancel_delayed_work(&kfence_timer); + /* + * Wake up any blocked toggle_allocation_gate() so it can complete + * early while the system is still able to handle IPIs. + */ + wake_up(&allocation_wait); return NOTIFY_OK; } @@ -842,9 +865,6 @@ static struct notifier_block kfence_reboot_notifier = { .priority = INT_MAX, /* Run early to stop timers ASAP */ }; -/* Wait queue to wake up allocation-gate timer task. */ -static DECLARE_WAIT_QUEUE_HEAD(allocation_wait); - static void wake_up_kfence_timer(struct irq_work *work) { wake_up(&allocation_wait); @@ -873,7 +893,9 @@ static void toggle_allocation_gate(struct work_struct *work) /* Enable static key, and await allocation to happen. */ static_branch_enable(&kfence_allocation_key); - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0); + wait_event_idle(allocation_wait, + atomic_read(&kfence_allocation_gate) > 0 || + !READ_ONCE(kfence_enabled)); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index e7f554a31bb4..9e1c5f2b7a41 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -207,7 +207,7 @@ void kmsan_free_page(struct page *page, unsigned int order) if (!kmsan_enabled || kmsan_in_runtime()) return; kmsan_enter_runtime(); - kmsan_internal_poison_memory(page_address(page), page_size(page), + kmsan_internal_poison_memory(page_address(page), PAGE_SIZE << order, GFP_KERNEL & ~(__GFP_RECLAIM), KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); @@ -650,7 +650,7 @@ static int break_ksm_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long en } } out_unlock: - pte_unmap_unlock(ptep, ptl); + pte_unmap_unlock(start_ptep, ptl); return found; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index be810c1fbfc3..86f43b7e5f71 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5638,6 +5638,6 @@ void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) memcg = root_mem_cgroup; pr_warn("Memory cgroup min protection %lukB -- low protection %lukB", - K(atomic_long_read(&memcg->memory.children_min_usage)*PAGE_SIZE), - K(atomic_long_read(&memcg->memory.children_low_usage)*PAGE_SIZE)); + K(atomic_long_read(&memcg->memory.children_min_usage)), + K(atomic_long_read(&memcg->memory.children_low_usage))); } diff --git a/mm/memfd.c b/mm/memfd.c index ab5312aff14b..f032c6052926 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -456,7 +456,7 @@ err_name: return ERR_PTR(error); } -static struct file *alloc_file(const char *name, unsigned int flags) +struct file *memfd_alloc_file(const char *name, unsigned int flags) { unsigned int *file_seals; struct file *file; @@ -520,5 +520,5 @@ SYSCALL_DEFINE2(memfd_create, return PTR_ERR(name); fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0; - return FD_ADD(fd_flags, alloc_file(name, flags)); + return FD_ADD(fd_flags, memfd_alloc_file(name, flags)); } diff --git a/mm/memfd_luo.c b/mm/memfd_luo.c index 4f6ba63b4310..a34fccc23b6a 100644 --- a/mm/memfd_luo.c +++ b/mm/memfd_luo.c @@ -78,6 +78,7 @@ #include <linux/liveupdate.h> #include <linux/shmem_fs.h> #include <linux/vmalloc.h> +#include <linux/memfd.h> #include "internal.h" static int memfd_luo_preserve_folios(struct file *file, @@ -443,11 +444,11 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) if (!ser) return -EINVAL; - file = shmem_file_setup("", 0, VM_NORESERVE); - + file = memfd_alloc_file("", 0); if (IS_ERR(file)) { pr_err("failed to setup file: %pe\n", file); - return PTR_ERR(file); + err = PTR_ERR(file); + goto free_ser; } vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE); @@ -473,7 +474,8 @@ static int memfd_luo_retrieve(struct liveupdate_file_op_args *args) put_file: fput(file); - +free_ser: + kho_restore_free(ser); return err; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fbc5a01260c8..9fd8355176eb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -692,6 +692,8 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, unsigned long poisoned_pfn, struct to_kill *tk) { unsigned long pfn = 0; + unsigned long hwpoison_vaddr; + unsigned long mask; if (pte_present(pte)) { pfn = pte_pfn(pte); @@ -702,10 +704,12 @@ static int check_hwpoisoned_entry(pte_t pte, unsigned long addr, short shift, pfn = softleaf_to_pfn(entry); } - if (!pfn || pfn != poisoned_pfn) + mask = ~((1UL << (shift - PAGE_SHIFT)) - 1); + if (!pfn || pfn != (poisoned_pfn & mask)) return 0; - set_to_kill(tk, addr, shift); + hwpoison_vaddr = addr + ((poisoned_pfn - pfn) << PAGE_SHIFT); + set_to_kill(tk, hwpoison_vaddr, shift); return 1; } @@ -1883,12 +1887,22 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) return count; } -static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) +#define MF_HUGETLB_FREED 0 /* freed hugepage */ +#define MF_HUGETLB_IN_USED 1 /* in-use hugepage */ +#define MF_HUGETLB_NON_HUGEPAGE 2 /* not a hugepage */ +#define MF_HUGETLB_FOLIO_PRE_POISONED 3 /* folio already poisoned */ +#define MF_HUGETLB_PAGE_PRE_POISONED 4 /* exact page already poisoned */ +#define MF_HUGETLB_RETRY 5 /* hugepage is busy, retry */ +/* + * Set hugetlb folio as hwpoisoned, update folio private raw hwpoison list + * to keep track of the poisoned pages. + */ +static int hugetlb_update_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; struct raw_hwp_page *p; - int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; + int ret = folio_test_set_hwpoison(folio) ? MF_HUGETLB_FOLIO_PRE_POISONED : 0; /* * Once the hwpoison hugepage has lost reliable raw error info, @@ -1896,20 +1910,17 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) * so skip to add additional raw error info. */ if (folio_test_hugetlb_raw_hwp_unreliable(folio)) - return -EHWPOISON; + return MF_HUGETLB_FOLIO_PRE_POISONED; head = raw_hwp_list_head(folio); llist_for_each_entry(p, head->first, node) { if (p->page == page) - return -EHWPOISON; + return MF_HUGETLB_PAGE_PRE_POISONED; } raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); if (raw_hwp) { raw_hwp->page = page; llist_add(&raw_hwp->node, head); - /* the first error event will be counted in action_result(). */ - if (ret) - num_poisoned_pages_inc(page_to_pfn(page)); } else { /* * Failed to save raw error info. We no longer trace all @@ -1957,42 +1968,39 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) /* * Called from hugetlb code with hugetlb_lock held. - * - * Return values: - * 0 - free hugepage - * 1 - in-use hugepage - * 2 - not a hugepage - * -EBUSY - the hugepage is busy (try to retry) - * -EHWPOISON - the hugepage is already hwpoisoned */ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); struct folio *folio = page_folio(page); - int ret = 2; /* fallback to normal page handling */ bool count_increased = false; + int ret, rc; - if (!folio_test_hugetlb(folio)) + if (!folio_test_hugetlb(folio)) { + ret = MF_HUGETLB_NON_HUGEPAGE; goto out; - - if (flags & MF_COUNT_INCREASED) { - ret = 1; + } else if (flags & MF_COUNT_INCREASED) { + ret = MF_HUGETLB_IN_USED; count_increased = true; } else if (folio_test_hugetlb_freed(folio)) { - ret = 0; + ret = MF_HUGETLB_FREED; } else if (folio_test_hugetlb_migratable(folio)) { - ret = folio_try_get(folio); - if (ret) + if (folio_try_get(folio)) { + ret = MF_HUGETLB_IN_USED; count_increased = true; + } else { + ret = MF_HUGETLB_FREED; + } } else { - ret = -EBUSY; + ret = MF_HUGETLB_RETRY; if (!(flags & MF_NO_RETRY)) goto out; } - if (folio_set_hugetlb_hwpoison(folio, page)) { - ret = -EHWPOISON; + rc = hugetlb_update_hwpoison(folio, page); + if (rc >= MF_HUGETLB_FOLIO_PRE_POISONED) { + ret = rc; goto out; } @@ -2017,10 +2025,16 @@ out: * with basic operations like hugepage allocation/free/demotion. * So some of prechecks for hwpoison (pinning, and testing/setting * PageHWPoison) should be done in single hugetlb_lock range. + * Returns: + * 0 - not hugetlb, or recovered + * -EBUSY - not recovered + * -EOPNOTSUPP - hwpoison_filter'ed + * -EHWPOISON - folio or exact page already poisoned + * -EFAULT - kill_accessing_process finds current->mm null */ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb) { - int res; + int res, rv; struct page *p = pfn_to_page(pfn); struct folio *folio; unsigned long page_flags; @@ -2029,22 +2043,29 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb *hugetlb = 1; retry: res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); - if (res == 2) { /* fallback to normal page handling */ + switch (res) { + case MF_HUGETLB_NON_HUGEPAGE: /* fallback to normal page handling */ *hugetlb = 0; return 0; - } else if (res == -EHWPOISON) { - if (flags & MF_ACTION_REQUIRED) { - folio = page_folio(p); - res = kill_accessing_process(current, folio_pfn(folio), flags); - } - action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); - return res; - } else if (res == -EBUSY) { + case MF_HUGETLB_RETRY: if (!(flags & MF_NO_RETRY)) { flags |= MF_NO_RETRY; goto retry; } return action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED); + case MF_HUGETLB_FOLIO_PRE_POISONED: + case MF_HUGETLB_PAGE_PRE_POISONED: + rv = -EHWPOISON; + if (flags & MF_ACTION_REQUIRED) + rv = kill_accessing_process(current, pfn, flags); + if (res == MF_HUGETLB_PAGE_PRE_POISONED) + action_result(pfn, MF_MSG_ALREADY_POISONED, MF_FAILED); + else + action_result(pfn, MF_MSG_HUGE, MF_FAILED); + return rv; + default: + WARN_ON((res != MF_HUGETLB_FREED) && (res != MF_HUGETLB_IN_USED)); + break; } folio = page_folio(p); @@ -2055,7 +2076,7 @@ retry: if (migratable_cleared) folio_set_hugetlb_migratable(folio); folio_unlock(folio); - if (res == 1) + if (res == MF_HUGETLB_IN_USED) folio_put(folio); return -EOPNOTSUPP; } @@ -2064,7 +2085,7 @@ retry: * Handling free hugepage. The possible race with hugepage allocation * or demotion can be prevented by PageHWPoison flag. */ - if (res == 0) { + if (res == MF_HUGETLB_FREED) { folio_unlock(folio); if (__page_handle_poison(p) > 0) { page_ref_inc(p); @@ -2161,6 +2182,9 @@ int register_pfn_address_space(struct pfn_address_space *pfn_space) { guard(mutex)(&pfn_space_lock); + if (!pfn_space->pfn_to_vma_pgoff) + return -EINVAL; + if (interval_tree_iter_first(&pfn_space_itree, pfn_space->node.start, pfn_space->node.last)) @@ -2183,10 +2207,10 @@ void unregister_pfn_address_space(struct pfn_address_space *pfn_space) } EXPORT_SYMBOL_GPL(unregister_pfn_address_space); -static void add_to_kill_pfn(struct task_struct *tsk, - struct vm_area_struct *vma, - struct list_head *to_kill, - unsigned long pfn) +static void add_to_kill_pgoff(struct task_struct *tsk, + struct vm_area_struct *vma, + struct list_head *to_kill, + pgoff_t pgoff) { struct to_kill *tk; @@ -2197,12 +2221,12 @@ static void add_to_kill_pfn(struct task_struct *tsk, } /* Check for pgoff not backed by struct page */ - tk->addr = vma_address(vma, pfn, 1); + tk->addr = vma_address(vma, pgoff, 1); tk->size_shift = PAGE_SHIFT; if (tk->addr == -EFAULT) pr_info("Unable to find address %lx in %s\n", - pfn, tsk->comm); + pgoff, tsk->comm); get_task_struct(tsk); tk->tsk = tsk; @@ -2212,11 +2236,12 @@ static void add_to_kill_pfn(struct task_struct *tsk, /* * Collect processes when the error hit a PFN not backed by struct page. */ -static void collect_procs_pfn(struct address_space *mapping, +static void collect_procs_pfn(struct pfn_address_space *pfn_space, unsigned long pfn, struct list_head *to_kill) { struct vm_area_struct *vma; struct task_struct *tsk; + struct address_space *mapping = pfn_space->mapping; i_mmap_lock_read(mapping); rcu_read_lock(); @@ -2226,9 +2251,12 @@ static void collect_procs_pfn(struct address_space *mapping, t = task_early_kill(tsk, true); if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pfn, pfn) { - if (vma->vm_mm == t->mm) - add_to_kill_pfn(t, vma, to_kill, pfn); + vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX) { + pgoff_t pgoff; + + if (vma->vm_mm == t->mm && + !pfn_space->pfn_to_vma_pgoff(vma, pfn, &pgoff)) + add_to_kill_pgoff(t, vma, to_kill, pgoff); } } rcu_read_unlock(); @@ -2264,7 +2292,7 @@ static int memory_failure_pfn(unsigned long pfn, int flags) struct pfn_address_space *pfn_space = container_of(node, struct pfn_address_space, node); - collect_procs_pfn(pfn_space->mapping, pfn, &tokill); + collect_procs_pfn(pfn_space, pfn, &tokill); mf_handled = true; } @@ -2383,31 +2411,29 @@ try_again: * In fact it's dangerous to directly bump up page count from 0, * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. */ - if (!(flags & MF_COUNT_INCREASED)) { - res = get_hwpoison_page(p, flags); - if (!res) { - if (is_free_buddy_page(p)) { - if (take_page_off_buddy(p)) { - page_ref_inc(p); - res = MF_RECOVERED; - } else { - /* We lost the race, try again */ - if (retry) { - ClearPageHWPoison(p); - retry = false; - goto try_again; - } - res = MF_FAILED; - } - res = action_result(pfn, MF_MSG_BUDDY, res); + res = get_hwpoison_page(p, flags); + if (!res) { + if (is_free_buddy_page(p)) { + if (take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; } else { - res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); + /* We lost the race, try again */ + if (retry) { + ClearPageHWPoison(p); + retry = false; + goto try_again; + } + res = MF_FAILED; } - goto unlock_mutex; - } else if (res < 0) { - res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED); - goto unlock_mutex; + res = action_result(pfn, MF_MSG_BUDDY, res); + } else { + res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); } + goto unlock_mutex; + } else if (res < 0) { + res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED); + goto unlock_mutex; } folio = page_folio(p); diff --git a/mm/memory.c b/mm/memory.c index 2a55edc48a65..da360a6eb8a4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1465,7 +1465,11 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, static bool vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { - if (src_vma->vm_flags & VM_COPY_ON_FORK) + /* + * We check against dst_vma as while sane VMA flags will have been + * copied, VM_UFFD_WP may be set only on dst_vma. + */ + if (dst_vma->vm_flags & VM_COPY_ON_FORK) return true; /* * The presence of an anon_vma indicates an anonymous VMA has page @@ -1963,10 +1967,9 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud)) { - if (next - addr != HPAGE_PUD_SIZE) { - mmap_assert_locked(tlb->mm); + if (next - addr != HPAGE_PUD_SIZE) split_huge_pud(vma, pud, addr); - } else if (zap_huge_pud(tlb, vma, pud, addr)) + else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } diff --git a/mm/memremap.c b/mm/memremap.c index 4c2e0d68eb27..ac7be07e3361 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -427,8 +427,6 @@ void free_zone_device_folio(struct folio *folio) if (folio_test_anon(folio)) { for (i = 0; i < nr; i++) __ClearPageAnonExclusive(folio_page(folio, i)); - } else { - VM_WARN_ON_ONCE(folio_test_large(folio)); } /* @@ -479,10 +477,43 @@ void free_zone_device_folio(struct folio *folio) } } -void zone_device_page_init(struct page *page, unsigned int order) +void zone_device_page_init(struct page *page, struct dev_pagemap *pgmap, + unsigned int order) { + struct page *new_page = page; + unsigned int i; + VM_WARN_ON_ONCE(order > MAX_ORDER_NR_PAGES); + for (i = 0; i < (1UL << order); ++i, ++new_page) { + struct folio *new_folio = (struct folio *)new_page; + + /* + * new_page could have been part of previous higher order folio + * which encodes the order, in page + 1, in the flags bits. We + * blindly clear bits which could have set my order field here, + * including page head. + */ + new_page->flags.f &= ~0xffUL; /* Clear possible order, page head */ + +#ifdef NR_PAGES_IN_LARGE_FOLIO + /* + * This pointer math looks odd, but new_page could have been + * part of a previous higher order folio, which sets _nr_pages + * in page + 1 (new_page). Therefore, we use pointer casting to + * correctly locate the _nr_pages bits within new_page which + * could have modified by previous higher order folio. + */ + ((struct folio *)(new_page - 1))->_nr_pages = 0; +#endif + + new_folio->mapping = NULL; + new_folio->pgmap = pgmap; /* Also clear compound head */ + new_folio->share = 0; /* fsdax only, unused for device private */ + VM_WARN_ON_FOLIO(folio_ref_count(new_folio), new_folio); + VM_WARN_ON_FOLIO(!folio_is_zone_device(new_folio), new_folio); + } + /* * Drivers shouldn't be allocating pages after calling * memunmap_pages(). diff --git a/mm/migrate.c b/mm/migrate.c index 5169f9717f60..4688b9e38cd2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1458,6 +1458,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; + enum ttu_flags ttu = 0; if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ @@ -1498,8 +1499,6 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, goto put_anon; if (folio_mapped(src)) { - enum ttu_flags ttu = 0; - if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially @@ -1516,16 +1515,17 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, try_to_migrate(src, ttu); page_was_mapped = 1; - - if (ttu & TTU_RMAP_LOCKED) - i_mmap_unlock_write(mapping); } if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, !rc ? dst : src, 0); + remove_migration_ptes(src, !rc ? dst : src, + ttu ? RMP_LOCKED : 0); + + if (ttu & TTU_RMAP_LOCKED) + i_mmap_unlock_write(mapping); unlock_put_anon: folio_unlock(dst); diff --git a/mm/mm_init.c b/mm/mm_init.c index fc2a6f1e518f..2a809cd8e7fa 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2059,7 +2059,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, */ static unsigned long __init deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, - struct zone *zone) + struct zone *zone, bool can_resched) { int nid = zone_to_nid(zone); unsigned long nr_pages = 0; @@ -2085,10 +2085,10 @@ deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, spfn = chunk_end; - if (irqs_disabled()) - touch_nmi_watchdog(); - else + if (can_resched) cond_resched(); + else + touch_nmi_watchdog(); } } @@ -2101,7 +2101,7 @@ deferred_init_memmap_job(unsigned long start_pfn, unsigned long end_pfn, { struct zone *zone = arg; - deferred_init_memmap_chunk(start_pfn, end_pfn, zone); + deferred_init_memmap_chunk(start_pfn, end_pfn, zone, true); } static unsigned int __init @@ -2216,7 +2216,7 @@ bool __init deferred_grow_zone(struct zone *zone, unsigned int order) for (spfn = first_deferred_pfn, epfn = SECTION_ALIGN_UP(spfn + 1); nr_pages < nr_pages_needed && spfn < zone_end_pfn(zone); spfn = epfn, epfn += PAGES_PER_SECTION) { - nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone); + nr_pages += deferred_init_memmap_chunk(spfn, epfn, zone, false); } /* diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 247e3f9db6c7..7468ec388455 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -10,6 +10,7 @@ #include <linux/swap.h> #include <linux/rmap.h> #include <linux/pgalloc.h> +#include <linux/hugetlb.h> #include <asm/tlb.h> @@ -426,6 +427,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, #endif tlb->vma_pfn = 0; + tlb->fully_unshared_tables = 0; __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } @@ -460,6 +462,31 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) } /** + * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a + * single VMA + * @tlb: the mmu_gather structure to initialize + * @vma: the vm_area_struct + * + * Called to initialize an (on-stack) mmu_gather structure for operating on + * a single VMA. In contrast to tlb_gather_mmu(), calling this function will + * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(), + * this function will *not* call flush_cache_range(). + * + * For hugetlb VMAs, this function will also initialize the mmu_gather + * page_size accordingly, not requiring a separate call to + * tlb_change_page_size(). + * + */ +void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + tlb_gather_mmu(tlb, vma->vm_mm); + tlb_update_vma_flags(tlb, vma); + if (is_vm_hugetlb_page(vma)) + /* All entries have the same size. */ + tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma))); +} + +/** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish * @@ -469,6 +496,12 @@ void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) void tlb_finish_mmu(struct mmu_gather *tlb) { /* + * We expect an earlier huge_pmd_unshare_flush() call to sort this out, + * due to complicated locking requirements with page table unsharing. + */ + VM_WARN_ON_ONCE(tlb->fully_unshared_tables); + + /* * If there are parallel threads are doing PTE changes on same range * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB * flush by batching, one thread may end up seeing inconsistent PTEs diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c index 5b009a9cd8b4..8f5735fda0a2 100644 --- a/mm/numa_memblks.c +++ b/mm/numa_memblks.c @@ -7,6 +7,8 @@ #include <linux/numa.h> #include <linux/numa_memblks.h> +#include <asm/numa.h> + int numa_distance_cnt; static u8 *numa_distance; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822e05f1a964..cbf758e27aa2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -167,6 +167,33 @@ static inline void __pcp_trylock_noop(unsigned long *flags) { } pcp_trylock_finish(UP_flags); \ }) +/* + * With the UP spinlock implementation, when we spin_lock(&pcp->lock) (for i.e. + * a potentially remote cpu drain) and get interrupted by an operation that + * attempts pcp_spin_trylock(), we can't rely on the trylock failure due to UP + * spinlock assumptions making the trylock a no-op. So we have to turn that + * spin_lock() to a spin_lock_irqsave(). This works because on UP there are no + * remote cpu's so we can only be locking the only existing local one. + */ +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +static inline void __flags_noop(unsigned long *flags) { } +#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ +({ \ + __flags_noop(&(flags)); \ + spin_lock(&(ptr)->lock); \ +}) +#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ +({ \ + spin_unlock(&(ptr)->lock); \ + __flags_noop(&(flags)); \ +}) +#else +#define pcp_spin_lock_maybe_irqsave(ptr, flags) \ + spin_lock_irqsave(&(ptr)->lock, flags) +#define pcp_spin_unlock_maybe_irqrestore(ptr, flags) \ + spin_unlock_irqrestore(&(ptr)->lock, flags) +#endif + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -914,6 +941,17 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, NULL) != NULL; } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* * Freeing function for a buddy system allocator. * @@ -1000,7 +1038,7 @@ static inline void __free_one_page(struct page *page, * expand() down the line puts the sub-blocks * on the right freelists. */ - set_pageblock_migratetype(buddy, migratetype); + change_pageblock_range(buddy, order, migratetype); } combined_pfn = buddy_pfn & pfn; @@ -2147,17 +2185,6 @@ bool pageblock_unisolate_and_move_free_pages(struct zone *zone, struct page *pag #endif /* CONFIG_MEMORY_ISOLATION */ -static void change_pageblock_range(struct page *pageblock_page, - int start_order, int migratetype) -{ - int nr_pageblocks = 1 << (start_order - pageblock_order); - - while (nr_pageblocks--) { - set_pageblock_migratetype(pageblock_page, migratetype); - pageblock_page += pageblock_nr_pages; - } -} - static inline bool boost_watermark(struct zone *zone) { unsigned long max_boost; @@ -2556,6 +2583,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) { int high_min, to_drain, to_drain_batched, batch; + unsigned long UP_flags; bool todo = false; high_min = READ_ONCE(pcp->high_min); @@ -2575,9 +2603,9 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->count - pcp->high; while (to_drain > 0) { to_drain_batched = min(to_drain, batch); - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); free_pcppages_bulk(zone, to_drain_batched, pcp, 0); - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); todo = true; to_drain -= to_drain_batched; @@ -2594,14 +2622,15 @@ bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) */ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { + unsigned long UP_flags; int to_drain, batch; batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); free_pcppages_bulk(zone, to_drain, pcp, 0); - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); } } #endif @@ -2612,10 +2641,11 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) static void drain_pages_zone(unsigned int cpu, struct zone *zone) { struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + unsigned long UP_flags; int count; do { - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); count = pcp->count; if (count) { int to_drain = min(count, @@ -2624,7 +2654,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) free_pcppages_bulk(zone, to_drain, pcp, 0); count -= to_drain; } - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); } while (count); } @@ -5924,7 +5954,7 @@ static int zone_batchsize(struct zone *zone) * recycled, this leads to the once large chunks of space being * fragmented and becoming unavailable for high-order allocations. */ - return 0; + return 1; #endif } @@ -6109,6 +6139,7 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; + unsigned long UP_flags; pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); cci = get_cpu_cacheinfo(cpu); @@ -6119,12 +6150,12 @@ static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) * This can reduce zone lock contention without hurting * cache-hot pages sharing. */ - spin_lock(&pcp->lock); + pcp_spin_lock_maybe_irqsave(pcp, UP_flags); if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) pcp->flags |= PCPF_FREE_HIGH_BATCH; else pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - spin_unlock(&pcp->lock); + pcp_spin_unlock_maybe_irqrestore(pcp, UP_flags); } void setup_pcp_cacheinfo(unsigned int cpu) @@ -6667,11 +6698,19 @@ static int percpu_pagelist_high_fraction_sysctl_handler(const struct ctl_table * int old_percpu_pagelist_high_fraction; int ret; + /* + * Avoid using pcp_batch_high_lock for reads as the value is read + * atomically and a race with offlining is harmless. + */ + + if (!write) + return proc_dointvec_minmax(table, write, buffer, length, ppos); + mutex_lock(&pcp_batch_high_lock); old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || ret < 0) + if (ret < 0) goto out; /* Sanity checking to avoid pcp imbalance */ @@ -7418,20 +7457,16 @@ bool put_page_back_buddy(struct page *page) } #endif -#ifdef CONFIG_ZONE_DMA -bool has_managed_dma(void) +bool has_managed_zone(enum zone_type zone) { struct pglist_data *pgdat; for_each_online_pgdat(pgdat) { - struct zone *zone = &pgdat->node_zones[ZONE_DMA]; - - if (managed_zone(zone)) + if (managed_zone(&pgdat->node_zones[zone])) return true; } return false; } -#endif /* CONFIG_ZONE_DMA */ #ifdef CONFIG_UNACCEPTED_MEMORY diff --git a/mm/page_owner.c b/mm/page_owner.c index a70245684206..b3260f0c17ba 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -952,7 +952,7 @@ static const struct file_operations page_owner_stack_fops = { .open = page_owner_stack_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_private, }; static int page_owner_threshold_get(void *data, u64 *val) diff --git a/mm/rmap.c b/mm/rmap.c index f955f02d570e..7b9879ef442d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -76,7 +76,7 @@ #include <linux/mm_inline.h> #include <linux/oom.h> -#include <asm/tlbflush.h> +#include <asm/tlb.h> #define CREATE_TRACE_POINTS #include <trace/events/migrate.h> @@ -2008,26 +2008,25 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* - * The ref count of the PMD page was - * dropped which is part of the way map - * counting is done for shared PMDs. - * Return 'true' here. When there is - * no other sharing, huge_pmd_unshare - * returns false and we will unmap the - * actual page and drop map count - * to zero. + * The PMD table was unmapped, + * consequently unmapping the folio. */ goto walk_done; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) @@ -2404,31 +2403,29 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * fail if unsuccessful. */ if (!anon) { + struct mmu_gather tlb; + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } - if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - hugetlb_vma_unlock_write(vma); - flush_tlb_range(vma, - range.start, range.end); + tlb_gather_mmu_vma(&tlb, vma); + if (huge_pmd_unshare(&tlb, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + huge_pmd_unshare_flush(&tlb, vma); + tlb_finish_mmu(&tlb); /* - * The ref count of the PMD page was - * dropped which is part of the way map - * counting is done for shared PMDs. - * Return 'true' here. When there is - * no other sharing, huge_pmd_unshare - * returns false and we will unmap the - * actual page and drop map count - * to zero. + * The PMD table was unmapped, + * consequently unmapping the folio. */ page_vma_mapped_walk_done(&pvmw); break; } hugetlb_vma_unlock_write(vma); + tlb_finish_mmu(&tlb); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); diff --git a/mm/shmem.c b/mm/shmem.c index ec6c01378e9d..79af5f9f8b90 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -962,17 +962,29 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) * being freed). */ static long shmem_free_swap(struct address_space *mapping, - pgoff_t index, void *radswap) + pgoff_t index, pgoff_t end, void *radswap) { - int order = xa_get_order(&mapping->i_pages, index); - void *old; + XA_STATE(xas, &mapping->i_pages, index); + unsigned int nr_pages = 0; + pgoff_t base; + void *entry; - old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0); - if (old != radswap) - return 0; - free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order); + xas_lock_irq(&xas); + entry = xas_load(&xas); + if (entry == radswap) { + nr_pages = 1 << xas_get_order(&xas); + base = round_down(xas.xa_index, nr_pages); + if (base < index || base + nr_pages - 1 > end) + nr_pages = 0; + else + xas_store(&xas, NULL); + } + xas_unlock_irq(&xas); - return 1 << order; + if (nr_pages) + free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages); + + return nr_pages; } /* @@ -1124,8 +1136,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend, if (xa_is_value(folio)) { if (unfalloc) continue; - nr_swaps_freed += shmem_free_swap(mapping, - indices[i], folio); + nr_swaps_freed += shmem_free_swap(mapping, indices[i], + end - 1, folio); continue; } @@ -1191,14 +1203,30 @@ whole_folios: folio = fbatch.folios[i]; if (xa_is_value(folio)) { + int order; long swaps_freed; if (unfalloc) continue; - swaps_freed = shmem_free_swap(mapping, indices[i], folio); + swaps_freed = shmem_free_swap(mapping, indices[i], + end - 1, folio); if (!swaps_freed) { - /* Swap was replaced by page: retry */ - index = indices[i]; + pgoff_t base = indices[i]; + + order = shmem_confirm_swap(mapping, indices[i], + radix_to_swp_entry(folio)); + /* + * If found a large swap entry cross the end or start + * border, skip it as the truncate_inode_partial_folio + * above should have at least zerod its content once. + */ + if (order > 0) { + base = round_down(base, 1 << order); + if (base < start || base + (1 << order) > end) + continue; + } + /* Swap was replaced by page or extended, retry */ + index = base; break; } nr_swaps_freed += swaps_freed; diff --git a/mm/slub.c b/mm/slub.c index 861592ac5425..cdc1e652ec52 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5694,8 +5694,12 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) if (unlikely(!size)) return ZERO_SIZE_PTR; - if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) - /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) + /* + * kmalloc_nolock() in PREEMPT_RT is not supported from + * non-preemptible context because local_lock becomes a + * sleeping lock on RT. + */ return NULL; retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) @@ -6685,8 +6689,12 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { + struct slab *slab = virt_to_slab(object); + + alloc_tagging_slab_free_hook(s, slab, &object, 1); + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) - do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); + do_slab_free(s, slab, object, object, 1, _RET_IP_); } #endif diff --git a/mm/swap.h b/mm/swap.h index d034c13d8dd2..1bd466da3039 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -198,7 +198,7 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ -extern struct address_space swap_space __ro_after_init; +extern struct address_space swap_space __read_mostly; static inline struct address_space *swap_address_space(swp_entry_t entry) { return &swap_space; diff --git a/mm/swap_state.c b/mm/swap_state.c index 5f97c6ae70a2..44d228982521 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,8 +37,7 @@ static const struct address_space_operations swap_aops = { #endif }; -/* Set swap_space as read only as swap cache is handled by swap table */ -struct address_space swap_space __ro_after_init = { +struct address_space swap_space __read_mostly = { .a_ops = &swap_aops, }; @@ -37,6 +37,8 @@ struct mmap_state { bool check_ksm_early :1; /* If we map new, hold the file rmap lock on mapping. */ bool hold_file_rmap_lock :1; + /* If .mmap_prepare changed the file, we don't need to pin. */ + bool file_doesnt_need_get :1; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ @@ -67,18 +69,13 @@ struct mmap_state { .state = VMA_MERGE_START, \ } -/* - * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain - * more than one anon_vma_chain connecting it to more than one anon_vma. A merge - * would mean a wider range of folios sharing the root anon_vma lock, and thus - * potential lock contention, we do not wish to encourage merging such that this - * scales to a problem. - */ -static bool vma_had_uncowed_parents(struct vm_area_struct *vma) +/* Was this VMA ever forked from a parent, i.e. maybe contains CoW mappings? */ +static bool vma_is_fork_child(struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from - * parents. This can improve scalability caused by anon_vma lock. + * parents. This can improve scalability caused by the anon_vma root + * lock. */ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain); } @@ -115,11 +112,19 @@ static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next) VM_WARN_ON(src && src_anon != src->anon_vma); /* Case 1 - we will dup_anon_vma() from src into tgt. */ - if (!tgt_anon && src_anon) - return !vma_had_uncowed_parents(src); + if (!tgt_anon && src_anon) { + struct vm_area_struct *copied_from = vmg->copied_from; + + if (vma_is_fork_child(src)) + return false; + if (vma_is_fork_child(copied_from)) + return false; + + return true; + } /* Case 2 - we will simply use tgt's anon_vma. */ if (tgt_anon && !src_anon) - return !vma_had_uncowed_parents(tgt); + return !vma_is_fork_child(tgt); /* Case 3 - the anon_vma's are already shared. */ return src_anon == tgt_anon; } @@ -829,6 +834,8 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( VM_WARN_ON_VMG(middle && !(vma_iter_addr(vmg->vmi) >= middle->vm_start && vma_iter_addr(vmg->vmi) < middle->vm_end), vmg); + /* An existing merge can never be used by the mremap() logic. */ + VM_WARN_ON_VMG(vmg->copied_from, vmg); vmg->state = VMA_MERGE_NOMERGE; @@ -1099,6 +1106,33 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) } /* + * vma_merge_copied_range - Attempt to merge a VMA that is being copied by + * mremap() + * + * @vmg: Describes the VMA we are adding, in the copied-to range @vmg->start to + * @vmg->end (exclusive), which we try to merge with any adjacent VMAs if + * possible. + * + * vmg->prev, next, start, end, pgoff should all be relative to the COPIED TO + * range, i.e. the target range for the VMA. + * + * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer + * to the VMA we expanded. + * + * ASSUMPTIONS: Same as vma_merge_new_range(), except vmg->middle must contain + * the copied-from VMA. + */ +static struct vm_area_struct *vma_merge_copied_range(struct vma_merge_struct *vmg) +{ + /* We must have a copied-from VMA. */ + VM_WARN_ON_VMG(!vmg->middle, vmg); + + vmg->copied_from = vmg->middle; + vmg->middle = NULL; + return vma_merge_new_range(vmg); +} + +/* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. @@ -1117,46 +1151,52 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; - bool remove_next = false; struct vm_area_struct *target = vmg->target; struct vm_area_struct *next = vmg->next; + bool remove_next = false; vm_flags_t sticky_flags; - - sticky_flags = vmg->vm_flags & VM_STICKY; - sticky_flags |= target->vm_flags & VM_STICKY; - - VM_WARN_ON_VMG(!target, vmg); + int ret = 0; mmap_assert_write_locked(vmg->mm); - vma_start_write(target); - if (next && (target != next) && (vmg->end == next->vm_end)) { - int ret; - sticky_flags |= next->vm_flags & VM_STICKY; + if (next && target != next && vmg->end == next->vm_end) remove_next = true; - /* This should already have been checked by this point. */ - VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); - vma_start_write(next); - /* - * In this case we don't report OOM, so vmg->give_up_on_mm is - * safe. - */ - ret = dup_anon_vma(target, next, &anon_dup); - if (ret) - return ret; - } + /* We must have a target. */ + VM_WARN_ON_VMG(!target, vmg); + /* This should have already been checked by this point. */ + VM_WARN_ON_VMG(remove_next && !can_merge_remove_vma(next), vmg); /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != target && vmg->end > next->vm_start, vmg); - /* Only handles expanding */ + /* Only handles expanding. */ VM_WARN_ON_VMG(target->vm_start < vmg->start || target->vm_end > vmg->end, vmg); + sticky_flags = vmg->vm_flags & VM_STICKY; + sticky_flags |= target->vm_flags & VM_STICKY; if (remove_next) - vmg->__remove_next = true; + sticky_flags |= next->vm_flags & VM_STICKY; + + /* + * If we are removing the next VMA or copying from a VMA + * (e.g. mremap()'ing), we must propagate anon_vma state. + * + * Note that, by convention, callers ignore OOM for this case, so + * we don't need to account for vmg->give_up_on_mm here. + */ + if (remove_next) + ret = dup_anon_vma(target, next, &anon_dup); + if (!ret && vmg->copied_from) + ret = dup_anon_vma(target, vmg->copied_from, &anon_dup); + if (ret) + return ret; + if (remove_next) { + vma_start_write(next); + vmg->__remove_next = true; + } if (commit_merge(vmg)) goto nomem; @@ -1828,10 +1868,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - vmg.middle = NULL; /* New VMA range. */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); - new_vma = vma_merge_new_range(&vmg); + new_vma = vma_merge_copied_range(&vmg); if (new_vma) { /* @@ -2413,7 +2452,9 @@ static int __mmap_new_file_vma(struct mmap_state *map, struct vma_iterator *vmi = map->vmi; int error; - vma->vm_file = get_file(map->file); + vma->vm_file = map->file; + if (!map->file_doesnt_need_get) + get_file(map->file); if (!map->file->f_op->mmap) return 0; @@ -2601,7 +2642,10 @@ static int call_mmap_prepare(struct mmap_state *map, /* Update fields permitted to be changed. */ map->pgoff = desc->pgoff; - map->file = desc->vm_file; + if (desc->vm_file != map->file) { + map->file_doesnt_need_get = true; + map->file = desc->vm_file; + } map->vm_flags = desc->vm_flags; map->page_prot = desc->page_prot; /* User-defined fields. */ @@ -106,6 +106,9 @@ struct vma_merge_struct { struct anon_vma_name *anon_name; enum vma_merge_state state; + /* If copied from (i.e. mremap()'d) the VMA from which we are copying. */ + struct vm_area_struct *copied_from; + /* Flags which callers can use to modify merge behaviour: */ /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecbac900c35f..e286c2d2068c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4248,7 +4248,7 @@ void *vzalloc_node_noprof(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node_noprof); /** - * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents + * vrealloc_node_align - reallocate virtually contiguous memory; contents * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate @@ -4322,7 +4322,7 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align if (want_init_on_free() || want_init_on_alloc(flags)) memset((void *)p + size, 0, old_size - size); vm->requested_size = size; - kasan_poison_vmalloc(p + size, old_size - size); + kasan_vrealloc(p, old_size, size); return (void *)p; } @@ -4330,14 +4330,13 @@ void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align * We already have the bytes available in the allocation; use them. */ if (size <= alloced_size) { - kasan_unpoison_vmalloc(p + old_size, size - old_size, - KASAN_VMALLOC_PROT_NORMAL); /* * No need to zero memory here, as unused memory will have * already been zeroed at initial allocation time or during * realloc shrink time. */ vm->requested_size = size; + kasan_vrealloc(p, old_size, size); return (void *)p; } @@ -5025,9 +5024,7 @@ retry: * With hardware tag-based KASAN, marking is skipped for * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc(). */ - for (area = 0; area < nr_vms; area++) - vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr, - vms[area]->size, KASAN_VMALLOC_PROT_NORMAL); + kasan_unpoison_vmap_areas(vms, nr_vms, KASAN_VMALLOC_PROT_NORMAL); kfree(vas); return vms; diff --git a/mm/vmscan.c b/mm/vmscan.c index 670fe9fae5ba..614ccf39fe3f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7707,6 +7707,17 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) return ret; } +#else + +static unsigned long __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, + unsigned long nr_pages, + struct scan_control *sc) +{ + return 0; +} + +#endif + enum { MEMORY_RECLAIM_SWAPPINESS = 0, MEMORY_RECLAIM_SWAPPINESS_MAX, @@ -7814,8 +7825,6 @@ int user_proactive_reclaim(char *buf, return 0; } -#endif - /** * check_move_unevictable_folios - Move evictable folios to appropriate zone * lru list diff --git a/mm/zswap.c b/mm/zswap.c index 5d0f8b13a958..ac9b7a60736b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -787,7 +787,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) return 0; fail: - if (acomp) + if (!IS_ERR_OR_NULL(acomp)) crypto_free_acomp(acomp); kfree(buffer); return ret; |
