From c4602f9fa77fc6bb956ca51a23e7a39439e75cb6 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 9 Jun 2025 10:27:26 +0100 Subject: mm/readahead: store folio order in struct file_ra_state Previously the folio order of the previous readahead request was inferred from the folio who's readahead marker was hit. But due to the way we have to round to non-natural boundaries sometimes, this first folio in the readahead block is often smaller than the preferred order for that request. This means that for cases where the initial sync readahead is poorly aligned, the folio order will ramp up much more slowly. So instead, let's store the order in struct file_ra_state so we are not affected by any required alignment. We previously made enough room in the struct for a 16 order field. This should be plenty big enough since we are limited to MAX_PAGECACHE_ORDER anyway, which is certainly never larger than ~20. Since we now pass order in struct file_ra_state, page_cache_ra_order() no longer needs it's new_order parameter, so let's remove that. Worked example: Here we are touching pages 17-256 sequentially just as we did in the previous commit, but now that we are remembering the preferred order explicitly, we no longer have the slow ramp up problem. Note specifically that we no longer have 2 rounds (2x ~128K) of order-2 folios: TYPE STARTOFFS ENDOFFS SIZE STARTPG ENDPG NRPG ORDER RA ----- ---------- ---------- ---------- ------- ------- ----- ----- -- HOLE 0x00000000 0x00001000 4096 0 1 1 FOLIO 0x00001000 0x00002000 4096 1 2 1 0 FOLIO 0x00002000 0x00003000 4096 2 3 1 0 FOLIO 0x00003000 0x00004000 4096 3 4 1 0 FOLIO 0x00004000 0x00005000 4096 4 5 1 0 FOLIO 0x00005000 0x00006000 4096 5 6 1 0 FOLIO 0x00006000 0x00007000 4096 6 7 1 0 FOLIO 0x00007000 0x00008000 4096 7 8 1 0 FOLIO 0x00008000 0x00009000 4096 8 9 1 0 FOLIO 0x00009000 0x0000a000 4096 9 10 1 0 FOLIO 0x0000a000 0x0000b000 4096 10 11 1 0 FOLIO 0x0000b000 0x0000c000 4096 11 12 1 0 FOLIO 0x0000c000 0x0000d000 4096 12 13 1 0 FOLIO 0x0000d000 0x0000e000 4096 13 14 1 0 FOLIO 0x0000e000 0x0000f000 4096 14 15 1 0 FOLIO 0x0000f000 0x00010000 4096 15 16 1 0 FOLIO 0x00010000 0x00011000 4096 16 17 1 0 FOLIO 0x00011000 0x00012000 4096 17 18 1 0 FOLIO 0x00012000 0x00013000 4096 18 19 1 0 FOLIO 0x00013000 0x00014000 4096 19 20 1 0 FOLIO 0x00014000 0x00015000 4096 20 21 1 0 FOLIO 0x00015000 0x00016000 4096 21 22 1 0 FOLIO 0x00016000 0x00017000 4096 22 23 1 0 FOLIO 0x00017000 0x00018000 4096 23 24 1 0 FOLIO 0x00018000 0x00019000 4096 24 25 1 0 FOLIO 0x00019000 0x0001a000 4096 25 26 1 0 FOLIO 0x0001a000 0x0001b000 4096 26 27 1 0 FOLIO 0x0001b000 0x0001c000 4096 27 28 1 0 FOLIO 0x0001c000 0x0001d000 4096 28 29 1 0 FOLIO 0x0001d000 0x0001e000 4096 29 30 1 0 FOLIO 0x0001e000 0x0001f000 4096 30 31 1 0 FOLIO 0x0001f000 0x00020000 4096 31 32 1 0 FOLIO 0x00020000 0x00021000 4096 32 33 1 0 FOLIO 0x00021000 0x00022000 4096 33 34 1 0 FOLIO 0x00022000 0x00024000 8192 34 36 2 1 FOLIO 0x00024000 0x00028000 16384 36 40 4 2 FOLIO 0x00028000 0x0002c000 16384 40 44 4 2 FOLIO 0x0002c000 0x00030000 16384 44 48 4 2 FOLIO 0x00030000 0x00034000 16384 48 52 4 2 FOLIO 0x00034000 0x00038000 16384 52 56 4 2 FOLIO 0x00038000 0x0003c000 16384 56 60 4 2 FOLIO 0x0003c000 0x00040000 16384 60 64 4 2 FOLIO 0x00040000 0x00050000 65536 64 80 16 4 FOLIO 0x00050000 0x00060000 65536 80 96 16 4 FOLIO 0x00060000 0x00080000 131072 96 128 32 5 FOLIO 0x00080000 0x000a0000 131072 128 160 32 5 FOLIO 0x000a0000 0x000c0000 131072 160 192 32 5 FOLIO 0x000c0000 0x000e0000 131072 192 224 32 5 FOLIO 0x000e0000 0x00100000 131072 224 256 32 5 FOLIO 0x00100000 0x00120000 131072 256 288 32 5 FOLIO 0x00120000 0x00140000 131072 288 320 32 5 Y HOLE 0x00140000 0x00800000 7077888 320 2048 1728 Link: https://lkml.kernel.org/r/20250609092729.274960-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Jan Kara Cc: Chaitanya S Prakash Cc: David Hildenbrand Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/internal.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 6b8ed2017743..f91688e2894f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -436,8 +436,7 @@ void zap_page_range_single_batched(struct mmu_gather *tlb, int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio, gfp_t gfp); -void page_cache_ra_order(struct readahead_control *, struct file_ra_state *, - unsigned int order); +void page_cache_ra_order(struct readahead_control *, struct file_ra_state *); void force_page_cache_ra(struct readahead_control *, unsigned long nr); static inline void force_page_cache_readahead(struct address_space *mapping, struct file *file, pgoff_t index, unsigned long nr_to_read) -- cgit v1.2.3 From 96d81e4766f9e88b66a0502b5a7f34a4c20ac754 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 5 Jun 2025 14:51:04 +0100 Subject: mm/pagewalk: split walk_page_range_novma() into kernel/user parts walk_page_range_novma() is rather confusing - it supports two modes, one used often, the other used only for debugging. The first mode is the common case of traversal of kernel page tables, which is what nearly all callers use this for. Secondly it provides an unusual debugging interface that allows for the traversal of page tables in a userland range of memory even for that memory which is not described by a VMA. It is far from certain that such page tables should even exist, but perhaps this is precisely why it is useful as a debugging mechanism. As a result, this is utilised by ptdump only. Historically, things were reversed - ptdump was the only user, and other parts of the kernel evolved to use the kernel page table walking here. Since we have some complicated and confusing locking rules for the novma case, it makes sense to separate the two usages into their own functions. Doing this also provide self-documentation as to the intent of the caller - are they doing something rather unusual or are they simply doing a standard kernel page table walk? We therefore establish two separate functions - walk_page_range_debug() for this single usage, and walk_kernel_page_table_range() for general kernel page table walking. The walk_page_range_debug() function is currently used to traverse both userland and kernel mappings, so we maintain this and in the case of kernel mappings being traversed, we have walk_page_range_debug() invoke walk_kernel_page_table_range() internally. We additionally make walk_page_range_debug() internal to mm. Link: https://lkml.kernel.org/r/20250605135104.90720-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Mike Rapoport (Microsoft) Acked-by: Qi Zheng Reviewed-by: Oscar Salvador Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Barry Song Cc: Huacai Chen Cc: Jann Horn Cc: Jonas Bonn Cc: Liam Howlett Cc: Michal Hocko Cc: Muchun Song Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Stafford Horne Cc: Stefan Kristiansson Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/mm/pageattr.c | 2 +- arch/openrisc/kernel/dma.c | 4 +-- arch/riscv/mm/pageattr.c | 8 ++--- include/linux/pagewalk.h | 7 ++-- mm/hugetlb_vmemmap.c | 2 +- mm/internal.h | 3 ++ mm/pagewalk.c | 77 +++++++++++++++++++++++++++++++------------- mm/ptdump.c | 3 +- 8 files changed, 71 insertions(+), 35 deletions(-) (limited to 'mm/internal.h') diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c index 99165903908a..f5e910b68229 100644 --- a/arch/loongarch/mm/pageattr.c +++ b/arch/loongarch/mm/pageattr.c @@ -118,7 +118,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, pgp return 0; mmap_write_lock(&init_mm); - ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, &masks); + ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL, &masks); mmap_write_unlock(&init_mm); flush_tlb_kernel_range(start, end); diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index 3a7b5baaa450..af932a4ad306 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -72,7 +72,7 @@ void *arch_dma_set_uncached(void *cpu_addr, size_t size) * them and setting the cache-inhibit bit. */ mmap_write_lock(&init_mm); - error = walk_page_range_novma(&init_mm, va, va + size, + error = walk_kernel_page_table_range(va, va + size, &set_nocache_walk_ops, NULL, NULL); mmap_write_unlock(&init_mm); @@ -87,7 +87,7 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size) mmap_write_lock(&init_mm); /* walk_page_range shouldn't be able to fail here */ - WARN_ON(walk_page_range_novma(&init_mm, va, va + size, + WARN_ON(walk_kernel_page_table_range(va, va + size, &clear_nocache_walk_ops, NULL, NULL)); mmap_write_unlock(&init_mm); } diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index d815448758a1..3f76db3d2769 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -299,7 +299,7 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, if (ret) goto unlock; - ret = walk_page_range_novma(&init_mm, lm_start, lm_end, + ret = walk_kernel_page_table_range(lm_start, lm_end, &pageattr_ops, NULL, &masks); if (ret) goto unlock; @@ -317,13 +317,13 @@ static int __set_memory(unsigned long addr, int numpages, pgprot_t set_mask, if (ret) goto unlock; - ret = walk_page_range_novma(&init_mm, lm_start, lm_end, + ret = walk_kernel_page_table_range(lm_start, lm_end, &pageattr_ops, NULL, &masks); if (ret) goto unlock; } - ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, + ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL, &masks); unlock: @@ -335,7 +335,7 @@ unlock: */ flush_tlb_all(); #else - ret = walk_page_range_novma(&init_mm, start, end, &pageattr_ops, NULL, + ret = walk_kernel_page_table_range(start, end, &pageattr_ops, NULL, &masks); mmap_write_unlock(&init_mm); diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 9700a29f8afb..8ac2f6d6d2a3 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -129,10 +129,9 @@ struct mm_walk { int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); -int walk_page_range_novma(struct mm_struct *mm, unsigned long start, - unsigned long end, const struct mm_walk_ops *ops, - pgd_t *pgd, - void *private); +int walk_kernel_page_table_range(unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 27245e86df25..ba0fb1b6a5a8 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -166,7 +166,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); - ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, + ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) diff --git a/mm/internal.h b/mm/internal.h index f91688e2894f..2c0d9f197d81 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1604,6 +1604,9 @@ static inline void accept_page(struct page *page) int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +int walk_page_range_debug(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); /* pt_reclaim.c */ bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index e478777c86e1..ff5299eca687 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -585,8 +585,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } /** - * walk_page_range_novma - walk a range of pagetables not backed by a vma - * @mm: mm_struct representing the target process of page table walk + * walk_kernel_page_table_range - walk a range of kernel pagetables. * @start: start address of the virtual address range * @end: end address of the virtual address range * @ops: operation to call during the walk @@ -596,17 +595,61 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for - * walking the kernel pages tables or page tables for firmware. + * walking kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). */ -int walk_page_range_novma(struct mm_struct *mm, unsigned long start, +int walk_kernel_page_table_range(unsigned long start, unsigned long end, + const struct mm_walk_ops *ops, pgd_t *pgd, void *private) +{ + struct mm_struct *mm = &init_mm; + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .pgd = pgd, + .private = private, + .no_vma = true + }; + + if (start >= end) + return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; + + /* + * Kernel intermediate page tables are usually not freed, so the mmap + * read lock is sufficient. But there are some exceptions. + * E.g. memory hot-remove. In which case, the mmap lock is insufficient + * to prevent the intermediate kernel pages tables belonging to the + * specified address range from being freed. The caller should take + * other actions to prevent this race. + */ + mmap_assert_locked(mm); + + return walk_pgd_range(start, end, &walk); +} + +/** + * walk_page_range_debug - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * + * Similar to walk_page_range() but can walk any page tables even if they are + * not backed by VMAs. Because 'unusual' entries may be walked this function + * will also not lock the PTEs for the pte_entry() callback. + * + * This is for debugging purposes ONLY. + */ +int walk_page_range_debug(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, - pgd_t *pgd, - void *private) + pgd_t *pgd, void *private) { struct mm_walk walk = { .ops = ops, @@ -616,34 +659,24 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, .no_vma = true }; + /* For convenience, we allow traversal of kernel mappings. */ + if (mm == &init_mm) + return walk_kernel_page_table_range(start, end, ops, + pgd, private); if (start >= end || !walk.mm) return -EINVAL; if (!check_ops_valid(ops)) return -EINVAL; /* - * 1) For walking the user virtual address space: - * * The mmap lock protects the page walker from changes to the page * tables during the walk. However a read lock is insufficient to * protect those areas which don't have a VMA as munmap() detaches * the VMAs before downgrading to a read lock and actually tearing * down PTEs/page tables. In which case, the mmap write lock should - * be hold. - * - * 2) For walking the kernel virtual address space: - * - * The kernel intermediate page tables usually do not be freed, so - * the mmap map read lock is sufficient. But there are some exceptions. - * E.g. memory hot-remove. In which case, the mmap lock is insufficient - * to prevent the intermediate kernel pages tables belonging to the - * specified address range from being freed. The caller should take - * other actions to prevent this race. + * be held. */ - if (mm == &init_mm) - mmap_assert_locked(walk.mm); - else - mmap_assert_write_locked(walk.mm); + mmap_assert_write_locked(mm); return walk_pgd_range(start, end, &walk); } diff --git a/mm/ptdump.c b/mm/ptdump.c index 9374f29cdc6f..61a352aa12ed 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -4,6 +4,7 @@ #include #include #include +#include "internal.h" #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) /* @@ -177,7 +178,7 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd) mmap_write_lock(mm); while (range->start != range->end) { - walk_page_range_novma(mm, range->start, range->end, + walk_page_range_debug(mm, range->start, range->end, &ptdump_ops, pgd, st); range++; } -- cgit v1.2.3 From 29ea04095b9697951621dd5a7843108948d056b8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 16 Jun 2025 10:23:45 -0700 Subject: Revert "mm: rename alloc_demote_folio to alloc_migrate_folio" This reverts commit 8f75267d22bdf8e3baf70f2fa7092d8c2f58da71. Commit 8f75267d22bd ("mm: rename alloc_demote_folio to alloc_migrate_folio") was to reflect the fact the function is called for not only demotion, but also general migrations from DAMOS_MIGRATE_{HOT,COLD}. The previous commit made the DAMOS actions to not use alloc_migrate_folio(), though. So, demote_folio_list() is the only caller of alloc_migrate_folio(), and the name could now be rather confusing. Revert the renaming commit. Link: https://lkml.kernel.org/r/20250616172346.67659-3-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Joshua Hahn Cc: David Hildenbrand Cc: Honggyu Kim Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- mm/vmscan.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 2c0d9f197d81..ce37834bcdce 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1226,7 +1226,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -struct folio *alloc_migrate_folio(struct folio *src, unsigned long private); +struct folio *alloc_demote_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); diff --git a/mm/vmscan.c b/mm/vmscan.c index a93a1ba9009e..6bebc91cbf2f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1006,7 +1006,7 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -struct folio *alloc_migrate_folio(struct folio *src, unsigned long private) +struct folio *alloc_demote_folio(struct folio *src, unsigned long private) { struct folio *dst; nodemask_t *allowed_mask; @@ -1069,7 +1069,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_folios, alloc_migrate_folio, NULL, + migrate_pages(demote_folios, alloc_demote_folio, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); -- cgit v1.2.3 From e1b1fe45573aa9919c18c13fcf6ec688534f92e3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 16 Jun 2025 10:23:46 -0700 Subject: Revert "mm: make alloc_demote_folio externally invokable for migration" This reverts commit a00ce85af2a1be494d3b0c9457e8e81cdcce2a89. Commit a00ce85af2a1 ("mm: make alloc_demote_folio externally invokable for migration") was made to let DAMOS_MIGRATE_{HOT,COLD} call the function. But a previous commit made DAMOS_MIGRATE_{HOT,COLD} call alloc_migration_target() instead. Hence there are no more callers of the function outside of vmscan.c. Revert the commit to make the function static again. Link: https://lkml.kernel.org/r/20250616172346.67659-4-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Joshua Hahn Cc: David Hildenbrand Cc: Honggyu Kim Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/vmscan.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index ce37834bcdce..3eb51c31a041 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1226,7 +1226,6 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long); extern void set_pageblock_order(void); -struct folio *alloc_demote_folio(struct folio *src, unsigned long private); unsigned long reclaim_pages(struct list_head *folio_list); unsigned int reclaim_clean_pages_from_list(struct zone *zone, struct list_head *folio_list); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6bebc91cbf2f..620dce753b64 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1006,7 +1006,8 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } -struct folio *alloc_demote_folio(struct folio *src, unsigned long private) +static struct folio *alloc_demote_folio(struct folio *src, + unsigned long private) { struct folio *dst; nodemask_t *allowed_mask; -- cgit v1.2.3 From bfbe71109fa40e8cc05a0f99e6734b7d76ee00b0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 18 Jun 2025 20:42:53 +0100 Subject: mm: update core kernel code to use vm_flags_t consistently The core kernel code is currently very inconsistent in its use of vm_flags_t vs. unsigned long. This prevents us from changing the type of vm_flags_t in the future and is simply not correct, so correct this. While this results in rather a lot of churn, it is a critical pre-requisite for a future planned change to VMA flag type. Additionally, update VMA userland tests to account for the changes. To make review easier and to break things into smaller parts, driver and architecture-specific changes is left for a subsequent commit. The code has been adjusted to cascade the changes across all calling code as far as is needed. We will adjust architecture-specific and driver code in a subsequent patch. Overall, this patch does not introduce any functional change. Link: https://lkml.kernel.org/r/d1588e7bb96d1ea3fe7b9df2c699d5b4592d901d.1750274467.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Kees Cook Acked-by: Mike Rapoport (Microsoft) Acked-by: Jan Kara Acked-by: Christian Brauner Reviewed-by: Vlastimil Babka Acked-by: Oscar Salvador Reviewed-by: Pedro Falcato Acked-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Cc: Jann Horn Cc: Liam R. Howlett Cc: Catalin Marinas Cc: Jarkko Sakkinen Signed-off-by: Andrew Morton --- fs/exec.c | 2 +- fs/userfaultfd.c | 2 +- include/linux/coredump.h | 2 +- include/linux/huge_mm.h | 12 +- include/linux/khugepaged.h | 4 +- include/linux/ksm.h | 4 +- include/linux/memfd.h | 4 +- include/linux/mm.h | 6 +- include/linux/mm_types.h | 2 +- include/linux/mman.h | 4 +- include/linux/rmap.h | 4 +- include/linux/userfaultfd_k.h | 4 +- include/trace/events/fs_dax.h | 6 +- mm/debug.c | 2 +- mm/execmem.c | 8 +- mm/filemap.c | 2 +- mm/gup.c | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 4 +- mm/internal.h | 4 +- mm/khugepaged.c | 4 +- mm/ksm.c | 2 +- mm/madvise.c | 4 +- mm/mapping_dirty_helpers.c | 2 +- mm/memfd.c | 8 +- mm/memory.c | 4 +- mm/mmap.c | 16 +-- mm/mprotect.c | 8 +- mm/mremap.c | 2 +- mm/nommu.c | 12 +- mm/rmap.c | 4 +- mm/shmem.c | 6 +- mm/userfaultfd.c | 14 +-- mm/vma.c | 80 ++++++------ mm/vma.h | 16 +-- mm/vmscan.c | 4 +- tools/testing/vma/vma.c | 266 +++++++++++++++++++-------------------- tools/testing/vma/vma_internal.h | 8 +- 38 files changed, 270 insertions(+), 270 deletions(-) (limited to 'mm/internal.h') diff --git a/fs/exec.c b/fs/exec.c index ba400aafd640..9faf9052bed9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -604,7 +604,7 @@ int setup_arg_pages(struct linux_binprm *bprm, struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; - unsigned long vm_flags; + vm_flags_t vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a2928b0aec6f..48e82e19d831 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1242,7 +1242,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; - unsigned long vm_flags; + vm_flags_t vm_flags; bool found; bool basic_ioctls; unsigned long start, end; diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 76e41805b92d..c504b0faecc2 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -10,7 +10,7 @@ #ifdef CONFIG_COREDUMP struct core_vma_metadata { unsigned long start, end; - unsigned long flags; + vm_flags_t flags; unsigned long dump_size; unsigned long pgoff; struct file *file; diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2f190c90192d..7753daac49f7 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -261,7 +261,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long tva_flags, unsigned long orders); @@ -282,7 +282,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, */ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long tva_flags, unsigned long orders) { @@ -317,7 +317,7 @@ struct thpsize { (1<vmi, vmg->vmi ? vma_iter_addr(vmg->vmi) : 0, vmg->vmi ? vma_iter_end(vmg->vmi) : 0, vmg->prev, vmg->middle, vmg->next, vmg->target, - vmg->start, vmg->end, vmg->flags, + vmg->start, vmg->end, vmg->vm_flags, vmg->file, vmg->anon_vma, vmg->policy, #ifdef CONFIG_USERFAULTFD vmg->uffd_ctx.ctx, diff --git a/mm/execmem.c b/mm/execmem.c index 2b683e7d864d..627e6cf64f4f 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -26,7 +26,7 @@ static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, unsigned long vm_flags) + pgprot_t pgprot, vm_flags_t vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; @@ -82,7 +82,7 @@ struct vm_struct *execmem_vmap(size_t size) } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, - pgprot_t pgprot, unsigned long vm_flags) + pgprot_t pgprot, vm_flags_t vm_flags) { return vmalloc(size); } @@ -256,7 +256,7 @@ out_unlock: static int execmem_cache_populate(struct execmem_range *range, size_t size) { - unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; + vm_flags_t vm_flags = VM_ALLOW_HUGE_VMAP; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; @@ -373,7 +373,7 @@ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; - unsigned long vm_flags = VM_FLUSH_RESET_PERMS; + vm_flags_t vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; void *p; diff --git a/mm/filemap.c b/mm/filemap.c index 3cf955740148..0d0369fb5fa1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3216,7 +3216,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff); struct file *fpin = NULL; - unsigned long vm_flags = vmf->vma->vm_flags; + vm_flags_t vm_flags = vmf->vma->vm_flags; unsigned short mmap_miss; #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/gup.c b/mm/gup.c index cbe8e4b9845b..c08b97e0d344 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2044,7 +2044,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start, { struct vm_area_struct *vma; bool must_unlock = false; - unsigned long vm_flags; + vm_flags_t vm_flags; long i; if (!nr_pages) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1b31985cef11..6411f3107af1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -99,7 +99,7 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) } unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long tva_flags, unsigned long orders) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a7df0b2a561..c7ba95030241 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7465,8 +7465,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the diff --git a/mm/internal.h b/mm/internal.h index 3eb51c31a041..fe83dfca3c72 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -928,7 +928,7 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, +extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes); /* @@ -1358,7 +1358,7 @@ int migrate_device_coherent_folio(struct folio *folio); struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, - unsigned long flags, unsigned long start, + vm_flags_t vm_flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 15203ea7d007..6b09b09c8f82 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -347,7 +347,7 @@ struct attribute_group khugepaged_attr_group = { #endif /* CONFIG_SYSFS */ int hugepage_madvise(struct vm_area_struct *vma, - unsigned long *vm_flags, int advice) + vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -470,7 +470,7 @@ void __khugepaged_enter(struct mm_struct *mm) } void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_pmd_enabled()) { diff --git a/mm/ksm.c b/mm/ksm.c index 18b3690bb69a..ef73b25fd65a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2840,7 +2840,7 @@ int ksm_disable(struct mm_struct *mm) } int ksm_madvise(struct vm_area_struct *vma, unsigned long start, - unsigned long end, int advice, unsigned long *vm_flags) + unsigned long end, int advice, vm_flags_t *vm_flags) { struct mm_struct *mm = vma->vm_mm; int err; diff --git a/mm/madvise.c b/mm/madvise.c index d451438af999..92f427b1b330 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -130,7 +130,7 @@ static int replace_anon_vma_name(struct vm_area_struct *vma, */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags, + unsigned long end, vm_flags_t new_flags, struct anon_vma_name *anon_name) { struct mm_struct *mm = vma->vm_mm; @@ -1258,7 +1258,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; - unsigned long new_flags = vma->vm_flags; + vm_flags_t new_flags = vma->vm_flags; if (unlikely(!can_modify_vma_madv(vma, behavior))) return -EPERM; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 2f8829b3541a..dc1692ff9e58 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -218,7 +218,7 @@ static void wp_clean_post_vma(struct mm_walk *walk) static int wp_clean_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { - unsigned long vm_flags = READ_ONCE(walk->vma->vm_flags); + vm_flags_t vm_flags = READ_ONCE(walk->vma->vm_flags); /* Skip non-applicable VMAs */ if ((vm_flags & (VM_SHARED | VM_MAYWRITE | VM_HUGETLB)) != diff --git a/mm/memfd.c b/mm/memfd.c index 65a107f72e39..b558c4c3bd27 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -332,10 +332,10 @@ static inline bool is_write_sealed(unsigned int seals) return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); } -static int check_write_seal(unsigned long *vm_flags_ptr) +static int check_write_seal(vm_flags_t *vm_flags_ptr) { - unsigned long vm_flags = *vm_flags_ptr; - unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + vm_flags_t vm_flags = *vm_flags_ptr; + vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE); /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) @@ -357,7 +357,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr) return 0; } -int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr) { int err = 0; unsigned int *seals_ptr = memfd_file_seals_ptr(file); diff --git a/mm/memory.c b/mm/memory.c index b0cda5aab398..833426fa5fe0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -797,7 +797,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { - unsigned long vm_flags = dst_vma->vm_flags; + vm_flags_t vm_flags = dst_vma->vm_flags; pte_t orig_pte = ptep_get(src_pte); pte_t pte = orig_pte; struct folio *folio; @@ -6128,7 +6128,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, .gfp_mask = __get_fault_gfp_mask(vma), }; struct mm_struct *mm = vma->vm_mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; diff --git a/mm/mmap.c b/mm/mmap.c index 09c563c95112..8f92cf10b656 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -80,7 +80,7 @@ core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); /* Update vma->vm_page_prot to reflect vma->vm_flags. */ void vma_set_page_prot(struct vm_area_struct *vma) { - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); @@ -228,12 +228,12 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, +bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; - if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; @@ -1207,7 +1207,7 @@ out: return ret; } -int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) +int vm_brk_flags(unsigned long addr, unsigned long request, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; @@ -1224,7 +1224,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) return 0; /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((flags & (~VM_EXEC)) != 0) + if ((vm_flags & (~VM_EXEC)) != 0) return -EINVAL; if (mmap_write_lock_killable(mm)) @@ -1239,7 +1239,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) goto munmap_failed; vma = vma_prev(&vmi); - ret = do_brk_flags(&vmi, vma, addr, len, flags); + ret = do_brk_flags(&vmi, vma, addr, len, vm_flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); @@ -1444,7 +1444,7 @@ static vm_fault_t special_mapping_fault(struct vm_fault *vmf) static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, void *priv, + vm_flags_t vm_flags, void *priv, const struct vm_operations_struct *ops) { int ret; @@ -1496,7 +1496,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma, struct vm_area_struct *_install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, const struct vm_special_mapping *spec) + vm_flags_t vm_flags, const struct vm_special_mapping *spec) { return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, &special_mapping_vmops); diff --git a/mm/mprotect.c b/mm/mprotect.c index 88608d0dc2c2..b873b98ab705 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -596,10 +596,10 @@ static const struct mm_walk_ops prot_none_walk_ops = { int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, - unsigned long start, unsigned long end, unsigned long newflags) + unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - unsigned long oldflags = READ_ONCE(vma->vm_flags); + vm_flags_t oldflags = READ_ONCE(vma->vm_flags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -774,8 +774,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { - unsigned long mask_off_old_flags; - unsigned long newflags; + vm_flags_t mask_off_old_flags; + vm_flags_t newflags; int new_vma_pkey; if (vma->vm_start != tmp) { diff --git a/mm/mremap.c b/mm/mremap.c index 18b215521ada..7e93d3344828 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1025,7 +1025,7 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) struct vm_area_struct *vma = vrm->vma; unsigned long old_addr = vrm->addr; unsigned long old_len = vrm->old_len; - unsigned long dummy = vma->vm_flags; + vm_flags_t dummy = vma->vm_flags; /* * We'd prefer to avoid failure later on in do_munmap: diff --git a/mm/nommu.c b/mm/nommu.c index b624acec6d2e..87e1acab0d64 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -126,7 +126,7 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, + pgprot_t prot, vm_flags_t vm_flags, int node, const void *caller) { return __vmalloc_noprof(size, gfp_mask); @@ -844,12 +844,12 @@ static int validate_mmap_request(struct file *file, * we've determined that we can make the mapping, now translate what we * now know into VMA flags */ -static unsigned long determine_vm_flags(struct file *file, - unsigned long prot, - unsigned long flags, - unsigned long capabilities) +static vm_flags_t determine_vm_flags(struct file *file, + unsigned long prot, + unsigned long flags, + unsigned long capabilities) { - unsigned long vm_flags; + vm_flags_t vm_flags; vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags); diff --git a/mm/rmap.c b/mm/rmap.c index fb63d9256f09..a312cae16bb5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -839,7 +839,7 @@ out: struct folio_referenced_arg { int mapcount; int referenced; - unsigned long vm_flags; + vm_flags_t vm_flags; struct mem_cgroup *memcg; }; @@ -984,7 +984,7 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, - struct mem_cgroup *memcg, unsigned long *vm_flags) + struct mem_cgroup *memcg, vm_flags_t *vm_flags) { bool we_locked = false; struct folio_referenced_arg pra = { diff --git a/mm/shmem.c b/mm/shmem.c index eda35be2a8d9..334b7b4a61a0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -615,7 +615,7 @@ static unsigned int shmem_get_orders_within_size(struct inode *inode, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 0 : BIT(HPAGE_PMD_ORDER); @@ -862,7 +862,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { return 0; } @@ -1753,7 +1753,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, { unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); - unsigned long vm_flags = vma ? vma->vm_flags : 0; + vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9ff970980496..95dd8dea6ee4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1901,11 +1901,11 @@ out: } static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, - vm_flags_t flags) + vm_flags_t vm_flags) { - const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP; + const bool uffd_wp_changed = (vma->vm_flags ^ vm_flags) & VM_UFFD_WP; - vm_flags_reset(vma, flags); + vm_flags_reset(vma, vm_flags); /* * For shared mappings, we want to enable writenotify while * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply @@ -1917,12 +1917,12 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, static void userfaultfd_set_ctx(struct vm_area_struct *vma, struct userfaultfd_ctx *ctx, - unsigned long flags) + vm_flags_t vm_flags) { vma_start_write(vma); vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx}; userfaultfd_set_vm_flags(vma, - (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags); + (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags); } void userfaultfd_reset_ctx(struct vm_area_struct *vma) @@ -1968,14 +1968,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi, /* Assumes mmap write lock taken, and mm_struct pinned. */ int userfaultfd_register_range(struct userfaultfd_ctx *ctx, struct vm_area_struct *vma, - unsigned long vm_flags, + vm_flags_t vm_flags, unsigned long start, unsigned long end, bool wp_async) { VMA_ITERATOR(vmi, ctx->mm, start); struct vm_area_struct *prev = vma_prev(&vmi); unsigned long vma_end; - unsigned long new_flags; + vm_flags_t new_flags; if (vma->vm_start < start) prev = vma; diff --git a/mm/vma.c b/mm/vma.c index 4b6d0be9ba39..b3d880652359 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -15,7 +15,7 @@ struct mmap_state { unsigned long end; pgoff_t pgoff; unsigned long pglen; - unsigned long flags; + vm_flags_t vm_flags; struct file *file; pgprot_t page_prot; @@ -37,7 +37,7 @@ struct mmap_state { bool check_ksm_early; }; -#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ @@ -45,9 +45,9 @@ struct mmap_state { .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ - .flags = flags_, \ + .vm_flags = vm_flags_, \ .file = file_, \ - .page_prot = vm_get_page_prot(flags_), \ + .page_prot = vm_get_page_prot(vm_flags_), \ } #define VMG_MMAP_STATE(name, map_, vma_) \ @@ -56,7 +56,7 @@ struct mmap_state { .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ - .flags = (map_)->flags, \ + .vm_flags = (map_)->vm_flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ @@ -95,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex * the kernel to generate new VMAs when old one could be * extended instead. */ - if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) + if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY) return false; if (vma->vm_file != vmg->file) return false; @@ -843,7 +843,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ - if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) + if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) @@ -973,7 +973,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err || commit_merge(vmg)) goto abort; - khugepaged_enter_vma(vmg->target, vmg->flags); + khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; @@ -1055,7 +1055,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ - if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) + if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); @@ -1093,7 +1093,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) * following VMA if we have VMAs on both sides. */ if (vmg->target && !vma_expand(vmg)) { - khugepaged_enter_vma(vmg->target, vmg->flags); + khugepaged_enter_vma(vmg->target, vmg->vm_flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->target; } @@ -1640,11 +1640,11 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) struct vm_area_struct *vma_modify_flags( struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags) + vm_flags_t vm_flags) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; + vmg.vm_flags = vm_flags; return vma_modify(&vmg); } @@ -1655,12 +1655,12 @@ struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, + vm_flags_t vm_flags, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; + vmg.vm_flags = vm_flags; vmg.anon_name = new_name; return vma_modify(&vmg); @@ -1685,13 +1685,13 @@ struct vm_area_struct struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, + vm_flags_t vm_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); - vmg.flags = new_flags; + vmg.vm_flags = vm_flags; vmg.uffd_ctx = new_ctx; if (give_up_on_oom) vmg.give_up_on_oom = true; @@ -2327,7 +2327,7 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, static void update_ksm_flags(struct mmap_state *map) { - map->flags = ksm_vma_flags(map->mm, map->file, map->flags); + map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags); } /* @@ -2372,11 +2372,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) } /* Check against address space limit. */ - if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) + if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ - if (accountable_mapping(map->file, map->flags)) { + if (accountable_mapping(map->file, map->vm_flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { @@ -2386,7 +2386,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) } vms->nr_accounted = 0; - map->flags |= VM_ACCOUNT; + map->vm_flags |= VM_ACCOUNT; } /* @@ -2430,12 +2430,12 @@ static int __mmap_new_file_vma(struct mmap_state *map, * Drivers should not permit writability when previously it was * disallowed. */ - VM_WARN_ON_ONCE(map->flags != vma->vm_flags && - !(map->flags & VM_MAYWRITE) && + VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags && + !(map->vm_flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); map->file = vma->vm_file; - map->flags = vma->vm_flags; + map->vm_flags = vma->vm_flags; return 0; } @@ -2466,7 +2466,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); - vm_flags_init(vma, map->flags); + vm_flags_init(vma, map->vm_flags); vma->vm_page_prot = map->page_prot; if (vma_iter_prealloc(vmi, vma)) { @@ -2476,7 +2476,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (map->file) error = __mmap_new_file_vma(map, vma); - else if (map->flags & VM_SHARED) + else if (map->vm_flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); @@ -2486,12 +2486,12 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (!map->check_ksm_early) { update_ksm_flags(map); - vm_flags_init(vma, map->flags); + vm_flags_init(vma, map->vm_flags); } #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ - WARN_ON_ONCE(!arch_validate_flags(map->flags)); + WARN_ON_ONCE(!arch_validate_flags(map->vm_flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ @@ -2505,7 +2505,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * call covers the non-merge case. */ if (!vma_is_anonymous(vma)) - khugepaged_enter_vma(vma, map->flags); + khugepaged_enter_vma(vma, map->vm_flags); *vmap = vma; return 0; @@ -2526,7 +2526,7 @@ free_vma: static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; perf_event_mmap(vma); @@ -2579,7 +2579,7 @@ static int call_mmap_prepare(struct mmap_state *map) .pgoff = map->pgoff, .file = map->file, - .vm_flags = map->flags, + .vm_flags = map->vm_flags, .page_prot = map->page_prot, }; @@ -2591,7 +2591,7 @@ static int call_mmap_prepare(struct mmap_state *map) /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; map->file = desc.file; - map->flags = desc.vm_flags; + map->vm_flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ map->vm_ops = desc.vm_ops; @@ -2754,14 +2754,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * @addr: The start address * @len: The length of the increase * @vma: The vma, - * @flags: The VMA Flags + * @vm_flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, unsigned long flags) + unsigned long addr, unsigned long len, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -2769,9 +2769,9 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - flags = ksm_vma_flags(mm, NULL, flags); - if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) + vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + vm_flags = ksm_vma_flags(mm, NULL, vm_flags); + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2785,7 +2785,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { - VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); + VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ @@ -2806,8 +2806,8 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); - vm_flags_init(vma, flags); - vma->vm_page_prot = vm_get_page_prot(flags); + vm_flags_init(vma, vm_flags); + vma->vm_page_prot = vm_get_page_prot(vm_flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -2818,7 +2818,7 @@ out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; - if (flags & VM_LOCKED) + if (vm_flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); return 0; diff --git a/mm/vma.h b/mm/vma.h index f47112a352db..cf6e3a6371b6 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -98,7 +98,7 @@ struct vma_merge_struct { unsigned long end; pgoff_t pgoff; - unsigned long flags; + vm_flags_t vm_flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; @@ -164,13 +164,13 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } -#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \ +#define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ - .flags = flags_, \ + .vm_flags = vm_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } @@ -184,7 +184,7 @@ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, .next = NULL, \ .start = start_, \ .end = end_, \ - .flags = vma_->vm_flags, \ + .vm_flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ @@ -288,7 +288,7 @@ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags); + vm_flags_t vm_flags); /* We are about to modify the VMA's flags and/or anon_name. */ __must_check struct vm_area_struct @@ -297,7 +297,7 @@ __must_check struct vm_area_struct struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, + vm_flags_t vm_flags, struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ @@ -314,7 +314,7 @@ __must_check struct vm_area_struct struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, - unsigned long new_flags, + vm_flags_t vm_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); @@ -375,7 +375,7 @@ static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma } #ifdef CONFIG_MMU -static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 620dce753b64..56d540b8a1d0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -907,7 +907,7 @@ static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; - unsigned long vm_flags; + vm_flags_t vm_flags; referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); @@ -2120,7 +2120,7 @@ static void shrink_active_list(unsigned long nr_to_scan, { unsigned long nr_taken; unsigned long nr_scanned; - unsigned long vm_flags; + vm_flags_t vm_flags; LIST_HEAD(l_hold); /* The folios which were snipped off */ LIST_HEAD(l_active); LIST_HEAD(l_inactive); diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 7fec5b3de83f..656e1c75b711 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -65,7 +65,7 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, unsigned long start, unsigned long end, pgoff_t pgoff, - vm_flags_t flags) + vm_flags_t vm_flags) { struct vm_area_struct *ret = vm_area_alloc(mm); @@ -75,7 +75,7 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, ret->vm_start = start; ret->vm_end = end; ret->vm_pgoff = pgoff; - ret->__vm_flags = flags; + ret->__vm_flags = vm_flags; vma_assert_detached(ret); return ret; @@ -103,9 +103,9 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, unsigned long start, unsigned long end, pgoff_t pgoff, - vm_flags_t flags) + vm_flags_t vm_flags) { - struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, flags); + struct vm_area_struct *vma = alloc_vma(mm, start, end, pgoff, vm_flags); if (vma == NULL) return NULL; @@ -172,7 +172,7 @@ static int expand_existing(struct vma_merge_struct *vmg) * specified new range. */ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t flags) + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags) { vma_iter_set(vmg->vmi, start); @@ -184,7 +184,7 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, vmg->start = start; vmg->end = end; vmg->pgoff = pgoff; - vmg->flags = flags; + vmg->vm_flags = vm_flags; vmg->just_expand = false; vmg->__remove_middle = false; @@ -195,10 +195,10 @@ static void vmg_set_range(struct vma_merge_struct *vmg, unsigned long start, /* Helper function to set both the VMG range and its anon_vma. */ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long start, - unsigned long end, pgoff_t pgoff, vm_flags_t flags, + unsigned long end, pgoff_t pgoff, vm_flags_t vm_flags, struct anon_vma *anon_vma) { - vmg_set_range(vmg, start, end, pgoff, flags); + vmg_set_range(vmg, start, end, pgoff, vm_flags); vmg->anon_vma = anon_vma; } @@ -211,12 +211,12 @@ static void vmg_set_range_anon_vma(struct vma_merge_struct *vmg, unsigned long s static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, struct vma_merge_struct *vmg, unsigned long start, unsigned long end, - pgoff_t pgoff, vm_flags_t flags, + pgoff_t pgoff, vm_flags_t vm_flags, bool *was_merged) { struct vm_area_struct *merged; - vmg_set_range(vmg, start, end, pgoff, flags); + vmg_set_range(vmg, start, end, pgoff, vm_flags); merged = merge_new(vmg); if (merged) { @@ -229,7 +229,7 @@ static struct vm_area_struct *try_merge_new_vma(struct mm_struct *mm, ASSERT_EQ(vmg->state, VMA_MERGE_NOMERGE); - return alloc_and_link_vma(mm, start, end, pgoff, flags); + return alloc_and_link_vma(mm, start, end, pgoff, vm_flags); } /* @@ -301,17 +301,17 @@ static void vma_set_dummy_anon_vma(struct vm_area_struct *vma, static bool test_simple_merge(void) { struct vm_area_struct *vma; - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; - struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, flags); - struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, flags); + struct vm_area_struct *vma_left = alloc_vma(&mm, 0, 0x1000, 0, vm_flags); + struct vm_area_struct *vma_right = alloc_vma(&mm, 0x2000, 0x3000, 2, vm_flags); VMA_ITERATOR(vmi, &mm, 0x1000); struct vma_merge_struct vmg = { .mm = &mm, .vmi = &vmi, .start = 0x1000, .end = 0x2000, - .flags = flags, + .vm_flags = vm_flags, .pgoff = 1, }; @@ -324,7 +324,7 @@ static bool test_simple_merge(void) ASSERT_EQ(vma->vm_start, 0); ASSERT_EQ(vma->vm_end, 0x3000); ASSERT_EQ(vma->vm_pgoff, 0); - ASSERT_EQ(vma->vm_flags, flags); + ASSERT_EQ(vma->vm_flags, vm_flags); detach_free_vma(vma); mtree_destroy(&mm.mm_mt); @@ -335,9 +335,9 @@ static bool test_simple_merge(void) static bool test_simple_modify(void) { struct vm_area_struct *vma; - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; - struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); VMA_ITERATOR(vmi, &mm, 0x1000); ASSERT_FALSE(attach_vma(&mm, init_vma)); @@ -394,9 +394,9 @@ static bool test_simple_modify(void) static bool test_simple_expand(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; - struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, flags); + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x1000, 0, vm_flags); VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { .vmi = &vmi, @@ -422,9 +422,9 @@ static bool test_simple_expand(void) static bool test_simple_shrink(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; - struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, flags); + struct vm_area_struct *vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); VMA_ITERATOR(vmi, &mm, 0); ASSERT_FALSE(attach_vma(&mm, vma)); @@ -443,7 +443,7 @@ static bool test_simple_shrink(void) static bool test_merge_new(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -473,18 +473,18 @@ static bool test_merge_new(void) * 0123456789abc * AA B CC */ - vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); ASSERT_NE(vma_a, NULL); /* We give each VMA a single avc so we can test anon_vma duplication. */ INIT_LIST_HEAD(&vma_a->anon_vma_chain); list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); - vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); ASSERT_NE(vma_b, NULL); INIT_LIST_HEAD(&vma_b->anon_vma_chain); list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); - vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, flags); + vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags); ASSERT_NE(vma_c, NULL); INIT_LIST_HEAD(&vma_c->anon_vma_chain); list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); @@ -495,7 +495,7 @@ static bool test_merge_new(void) * 0123456789abc * AA B ** CC */ - vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, flags, &merged); + vma_d = try_merge_new_vma(&mm, &vmg, 0x7000, 0x9000, 7, vm_flags, &merged); ASSERT_NE(vma_d, NULL); INIT_LIST_HEAD(&vma_d->anon_vma_chain); list_add(&dummy_anon_vma_chain_d.same_vma, &vma_d->anon_vma_chain); @@ -510,7 +510,7 @@ static bool test_merge_new(void) */ vma_a->vm_ops = &vm_ops; /* This should have no impact. */ vma_b->anon_vma = &dummy_anon_vma; - vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x2000, 0x3000, 2, vm_flags, &merged); ASSERT_EQ(vma, vma_a); /* Merge with A, delete B. */ ASSERT_TRUE(merged); @@ -527,7 +527,7 @@ static bool test_merge_new(void) * 0123456789abc * AAAA* DD CC */ - vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x4000, 0x5000, 4, vm_flags, &merged); ASSERT_EQ(vma, vma_a); /* Extend A. */ ASSERT_TRUE(merged); @@ -546,7 +546,7 @@ static bool test_merge_new(void) */ vma_d->anon_vma = &dummy_anon_vma; vma_d->vm_ops = &vm_ops; /* This should have no impact. */ - vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x6000, 0x7000, 6, vm_flags, &merged); ASSERT_EQ(vma, vma_d); /* Prepend. */ ASSERT_TRUE(merged); @@ -564,7 +564,7 @@ static bool test_merge_new(void) * AAAAA*DDD CC */ vma_d->vm_ops = NULL; /* This would otherwise degrade the merge. */ - vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x5000, 0x6000, 5, vm_flags, &merged); ASSERT_EQ(vma, vma_a); /* Merge with A, delete D. */ ASSERT_TRUE(merged); @@ -582,7 +582,7 @@ static bool test_merge_new(void) * AAAAAAAAA *CC */ vma_c->anon_vma = &dummy_anon_vma; - vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0xa000, 0xb000, 0xa, vm_flags, &merged); ASSERT_EQ(vma, vma_c); /* Prepend C. */ ASSERT_TRUE(merged); @@ -599,7 +599,7 @@ static bool test_merge_new(void) * 0123456789abc * AAAAAAAAA*CCC */ - vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, flags, &merged); + vma = try_merge_new_vma(&mm, &vmg, 0x9000, 0xa000, 0x9, vm_flags, &merged); ASSERT_EQ(vma, vma_a); /* Extend A and delete C. */ ASSERT_TRUE(merged); @@ -639,7 +639,7 @@ static bool test_merge_new(void) static bool test_vma_merge_special_flags(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -661,7 +661,7 @@ static bool test_vma_merge_special_flags(void) * 01234 * AAA */ - vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_left = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); ASSERT_NE(vma_left, NULL); /* 1. Set up new VMA with special flag that would otherwise merge. */ @@ -672,12 +672,12 @@ static bool test_vma_merge_special_flags(void) * * This should merge if not for the VM_SPECIAL flag. */ - vmg_set_range(&vmg, 0x3000, 0x4000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x4000, 3, vm_flags); for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = flags | special_flag; - vmg.flags = flags | special_flag; + vma_left->__vm_flags = vm_flags | special_flag; + vmg.vm_flags = vm_flags | special_flag; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); @@ -691,15 +691,15 @@ static bool test_vma_merge_special_flags(void) * * Create a VMA to modify. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); ASSERT_NE(vma, NULL); vmg.middle = vma; for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = flags | special_flag; - vmg.flags = flags | special_flag; + vma_left->__vm_flags = vm_flags | special_flag; + vmg.vm_flags = vm_flags | special_flag; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); @@ -711,7 +711,7 @@ static bool test_vma_merge_special_flags(void) static bool test_vma_merge_with_close(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -791,11 +791,11 @@ static bool test_vma_merge_with_close(void) * PPPPPPNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); vma_next->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); ASSERT_EQ(merge_new(&vmg), vma_prev); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); ASSERT_EQ(vma_prev->vm_start, 0); @@ -816,11 +816,11 @@ static bool test_vma_merge_with_close(void) * proceed. */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); vma->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -844,11 +844,11 @@ static bool test_vma_merge_with_close(void) * proceed. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); vma->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); /* @@ -872,12 +872,12 @@ static bool test_vma_merge_with_close(void) * PPPVVNNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); vma->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -898,12 +898,12 @@ static bool test_vma_merge_with_close(void) * PPPPPNNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x9000, 5, vm_flags); vma_next->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -920,15 +920,15 @@ static bool test_vma_merge_with_close(void) static bool test_vma_merge_new_with_close(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { .mm = &mm, .vmi = &vmi, }; - struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); - struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, flags); + struct vm_area_struct *vma_prev = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); + struct vm_area_struct *vma_next = alloc_and_link_vma(&mm, 0x5000, 0x7000, 5, vm_flags); const struct vm_operations_struct vm_ops = { .close = dummy_close, }; @@ -958,7 +958,7 @@ static bool test_vma_merge_new_with_close(void) vma_prev->vm_ops = &vm_ops; vma_next->vm_ops = &vm_ops; - vmg_set_range(&vmg, 0x2000, 0x5000, 2, flags); + vmg_set_range(&vmg, 0x2000, 0x5000, 2, vm_flags); vma = merge_new(&vmg); ASSERT_NE(vma, NULL); ASSERT_EQ(vmg.state, VMA_MERGE_SUCCESS); @@ -975,7 +975,7 @@ static bool test_vma_merge_new_with_close(void) static bool test_merge_existing(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -998,11 +998,11 @@ static bool test_merge_existing(void) * 0123456789 * VNNNNNN */ - vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); vmg.middle = vma; vmg.prev = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -1032,10 +1032,10 @@ static bool test_merge_existing(void) * 0123456789 * NNNNNNN */ - vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, flags); + vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma); vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); ASSERT_EQ(merge_existing(&vmg), vma_next); @@ -1060,11 +1060,11 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPV */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -1094,10 +1094,10 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -1123,11 +1123,11 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, &dummy_anon_vma); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -1158,41 +1158,41 @@ static bool test_merge_existing(void) * PPPVVVVVNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, vm_flags); - vmg_set_range(&vmg, 0x4000, 0x5000, 4, flags); + vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); + vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x6000, 0x7000, 6, flags); + vmg_set_range(&vmg, 0x6000, 0x7000, 6, vm_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x4000, 0x7000, 4, flags); + vmg_set_range(&vmg, 0x4000, 0x7000, 4, vm_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x4000, 0x6000, 4, flags); + vmg_set_range(&vmg, 0x4000, 0x6000, 4, vm_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); ASSERT_EQ(vmg.state, VMA_MERGE_NOMERGE); - vmg_set_range(&vmg, 0x5000, 0x6000, 5, flags); + vmg_set_range(&vmg, 0x5000, 0x6000, 5, vm_flags); vmg.prev = vma; vmg.middle = vma; ASSERT_EQ(merge_existing(&vmg), NULL); @@ -1205,7 +1205,7 @@ static bool test_merge_existing(void) static bool test_anon_vma_non_mergeable(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -1229,9 +1229,9 @@ static bool test_anon_vma_non_mergeable(void) * 0123456789 * PPPPPPPNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); /* * Give both prev and next single anon_vma_chain fields, so they will @@ -1239,7 +1239,7 @@ static bool test_anon_vma_non_mergeable(void) * * However, when prev is compared to next, the merge should fail. */ - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); @@ -1267,10 +1267,10 @@ static bool test_anon_vma_non_mergeable(void) * 0123456789 * PPPPPPPNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); - vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, flags, NULL); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, NULL); vmg.prev = vma_prev; vma_set_dummy_anon_vma(vma_prev, &dummy_anon_vma_chain_1); __vma_set_dummy_anon_vma(vma_next, &dummy_anon_vma_chain_2, &dummy_anon_vma_2); @@ -1292,7 +1292,7 @@ static bool test_anon_vma_non_mergeable(void) static bool test_dup_anon_vma(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -1313,11 +1313,11 @@ static bool test_dup_anon_vma(void) * This covers new VMA merging, as these operations amount to a VMA * expand. */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); vma_next->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0, 0x5000, 0, flags); + vmg_set_range(&vmg, 0, 0x5000, 0, vm_flags); vmg.target = vma_prev; vmg.next = vma_next; @@ -1339,16 +1339,16 @@ static bool test_dup_anon_vma(void) * extend delete delete */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags); /* Initialise avc so mergeability check passes. */ INIT_LIST_HEAD(&vma_next->anon_vma_chain); list_add(&dummy_anon_vma_chain.same_vma, &vma_next->anon_vma_chain); vma_next->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1372,12 +1372,12 @@ static bool test_dup_anon_vma(void) * extend delete delete */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags); vmg.anon_vma = &dummy_anon_vma; vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1401,11 +1401,11 @@ static bool test_dup_anon_vma(void) * extend shrink/delete */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma_prev; vmg.middle = vma; @@ -1429,11 +1429,11 @@ static bool test_dup_anon_vma(void) * shrink/delete extend */ - vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, flags); - vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, flags); + vma = alloc_and_link_vma(&mm, 0, 0x5000, 0, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x5000, 0x8000, 5, vm_flags); vma_set_dummy_anon_vma(vma, &dummy_anon_vma_chain); - vmg_set_range(&vmg, 0x3000, 0x5000, 3, flags); + vmg_set_range(&vmg, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma; vmg.middle = vma; @@ -1452,7 +1452,7 @@ static bool test_dup_anon_vma(void) static bool test_vmi_prealloc_fail(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vma_merge_struct vmg = { @@ -1468,11 +1468,11 @@ static bool test_vmi_prealloc_fail(void) * the duplicated anon_vma is unlinked. */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); vma->anon_vma = &dummy_anon_vma; - vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, flags, &dummy_anon_vma); + vmg_set_range_anon_vma(&vmg, 0x3000, 0x5000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; vma_set_dummy_anon_vma(vma, &avc); @@ -1496,11 +1496,11 @@ static bool test_vmi_prealloc_fail(void) * performed in this case too. */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, flags); - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); vma->anon_vma = &dummy_anon_vma; - vmg_set_range(&vmg, 0, 0x5000, 3, flags); + vmg_set_range(&vmg, 0, 0x5000, 3, vm_flags); vmg.target = vma_prev; vmg.next = vma; @@ -1518,13 +1518,13 @@ static bool test_vmi_prealloc_fail(void) static bool test_merge_extend(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0x1000); struct vm_area_struct *vma; - vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, flags); - alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, flags); + vma = alloc_and_link_vma(&mm, 0, 0x1000, 0, vm_flags); + alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); /* * Extend a VMA into the gap between itself and the following VMA. @@ -1548,7 +1548,7 @@ static bool test_merge_extend(void) static bool test_copy_vma(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; bool need_locks = false; VMA_ITERATOR(vmi, &mm, 0); @@ -1556,7 +1556,7 @@ static bool test_copy_vma(void) /* Move backwards and do not merge. */ - vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); vma_new = copy_vma(&vma, 0, 0x2000, 0, &need_locks); ASSERT_NE(vma_new, vma); ASSERT_EQ(vma_new->vm_start, 0); @@ -1568,8 +1568,8 @@ static bool test_copy_vma(void) /* Move a VMA into position next to another and merge the two. */ - vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, flags); + vma = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x8000, 6, vm_flags); vma_new = copy_vma(&vma, 0x4000, 0x2000, 4, &need_locks); vma_assert_attached(vma_new); @@ -1581,11 +1581,11 @@ static bool test_copy_vma(void) static bool test_expand_only_mode(void) { - unsigned long flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma_prev, *vma; - VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, flags, 5); + VMG_STATE(vmg, &mm, &vmi, 0x5000, 0x9000, vm_flags, 5); /* * Place a VMA prior to the one we're expanding so we assert that we do @@ -1593,14 +1593,14 @@ static bool test_expand_only_mode(void) * have, through the use of the just_expand flag, indicated we do not * need to do so. */ - alloc_and_link_vma(&mm, 0, 0x2000, 0, flags); + alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); /* * We will be positioned at the prev VMA, but looking to expand to * 0x9000. */ vma_iter_set(&vmi, 0x3000); - vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, flags); + vma_prev = alloc_and_link_vma(&mm, 0x3000, 0x5000, 3, vm_flags); vmg.prev = vma_prev; vmg.just_expand = true; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 3b1b45256d56..f684649b1008 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1084,7 +1084,7 @@ static inline bool mpol_equal(struct mempolicy *, struct mempolicy *) } static inline void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { (void)vma; (void)vm_flags; @@ -1200,7 +1200,7 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); /* Update vma->vm_page_prot to reflect vma->vm_flags. */ static inline void vma_set_page_prot(struct vm_area_struct *vma) { - unsigned long vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; pgprot_t vm_page_prot; /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */ @@ -1280,12 +1280,12 @@ static inline bool capable(int cap) return true; } -static inline bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, +static inline bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; - if (!(flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) + if (!(vm_flags & VM_LOCKED) || capable(CAP_IPC_LOCK)) return true; locked_pages = bytes >> PAGE_SHIFT; -- cgit v1.2.3 From 1bc3587a88d291a37dab12d6c14aa7da53304251 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 16 Jun 2025 22:11:11 -0400 Subject: mm/page_alloc: add support for initializing pageblock as isolated MIGRATE_ISOLATE is a standalone bit, so a pageblock cannot be initialized to just MIGRATE_ISOLATE. Add init_pageblock_migratetype() to enable initialize a pageblock with a migratetype and isolated. Link: https://lkml.kernel.org/r/20250617021115.2331563-4-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Brendan Jackman Cc: Johannes Weiner Cc: Kirill A. Shuemov Cc: Mel Gorman Cc: Michal Hocko Cc: Oscar Salvador Cc: Richard Chang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 3 ++- include/linux/page-isolation.h | 3 +++ kernel/kexec_handover.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/internal.h | 3 ++- mm/memory_hotplug.c | 12 ++++++++---- mm/memremap.c | 2 +- mm/mm_init.c | 24 +++++++++++++++--------- mm/page_alloc.c | 26 ++++++++++++++++++++++++++ 9 files changed, 61 insertions(+), 20 deletions(-) (limited to 'mm/internal.h') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index eaac5ae8c05c..23f038a16231 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -314,7 +314,8 @@ extern int add_memory_driver_managed(int nid, u64 start, u64 size, mhp_t mhp_flags); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, int migratetype); + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock); extern void remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index fc021d3f95ca..14c6a5f691c2 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -41,6 +41,9 @@ static inline void set_pageblock_isolate(struct page *page) #define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 +void __meminit init_pageblock_migratetype(struct page *page, + enum migratetype migratetype, + bool isolate); void set_pageblock_migratetype(struct page *page, enum migratetype migratetype); bool move_freepages_block_isolate(struct zone *zone, struct page *page, diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 5a21dbe17950..49634cc3fb43 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -1100,8 +1100,8 @@ static void __init kho_release_scratch(void) ulong pfn; for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) - set_pageblock_migratetype(pfn_to_page(pfn), - MIGRATE_CMA); + init_pageblock_migratetype(pfn_to_page(pfn), + MIGRATE_CMA, false); } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c03896375749..11d5668ff6e7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3297,8 +3297,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else - set_pageblock_migratetype(folio_page(folio, i), - MIGRATE_MOVABLE); + init_pageblock_migratetype(folio_page(folio, i), + MIGRATE_MOVABLE, false); } } diff --git a/mm/internal.h b/mm/internal.h index fe83dfca3c72..22a95a2b7fa1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -820,7 +820,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align, int nid, bool exact_nid); void memmap_init_range(unsigned long, int, unsigned long, unsigned long, - unsigned long, enum meminit_context, struct vmem_altmap *, int); + unsigned long, enum meminit_context, struct vmem_altmap *, int, + bool); #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 403221982c2e..a3c2b0784070 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -747,7 +747,8 @@ static inline void section_taint_zone_device(unsigned long pfn) */ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -779,7 +780,8 @@ void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, - MEMINIT_HOTPLUG, altmap, migratetype); + MEMINIT_HOTPLUG, altmap, migratetype, + isolate_pageblock); set_zone_contiguous(zone); } @@ -1104,7 +1106,8 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, if (mhp_off_inaccessible) page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages); - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE, + false); for (i = 0; i < nr_pages; i++) { struct page *page = pfn_to_page(pfn + i); @@ -1175,7 +1178,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages, /* associate pfn range with the zone */ - move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE, + true); if (!node_state(nid, N_MEMORY)) { /* Adding memory to the node for the first time */ diff --git a/mm/memremap.c b/mm/memremap.c index f75078c14839..b0ce0d8254bd 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -228,7 +228,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(range->start), PHYS_PFN(range_len(range)), params->altmap, - MIGRATE_MOVABLE); + MIGRATE_MOVABLE, false); } mem_hotplug_done(); diff --git a/mm/mm_init.c b/mm/mm_init.c index 02f41e2bdf60..5c21b3af216b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -685,7 +685,8 @@ void __meminit __init_page_from_nid(unsigned long pfn, int nid) __init_single_page(pfn_to_page(pfn), pfn, zid, nid); if (pageblock_aligned(pfn)) - set_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE); + init_pageblock_migratetype(pfn_to_page(pfn), MIGRATE_MOVABLE, + false); } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -874,7 +875,8 @@ static void __init init_unavailable_range(unsigned long spfn, void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, - struct vmem_altmap *altmap, int migratetype) + struct vmem_altmap *altmap, int migratetype, + bool isolate_pageblock) { unsigned long pfn, end_pfn = start_pfn + size; struct page *page; @@ -931,7 +933,8 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * over the place during system boot. */ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, migratetype); + init_pageblock_migratetype(page, migratetype, + isolate_pageblock); cond_resched(); } pfn++; @@ -954,7 +957,8 @@ static void __init memmap_init_zone_range(struct zone *zone, return; memmap_init_range(end_pfn - start_pfn, nid, zone_id, start_pfn, - zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); + zone_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE, + false); if (*hole_pfn < start_pfn) init_unavailable_range(*hole_pfn, start_pfn, zone_id, nid); @@ -1035,7 +1039,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * because this is done early in section_activate() */ if (pageblock_aligned(pfn)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, false); cond_resched(); } @@ -1996,7 +2000,8 @@ static void __init deferred_free_pages(unsigned long pfn, /* Free a large naturally-aligned chunk if possible */ if (nr_pages == MAX_ORDER_NR_PAGES && IS_MAX_ORDER_ALIGNED(pfn)) { for (i = 0; i < nr_pages; i += pageblock_nr_pages) - set_pageblock_migratetype(page + i, MIGRATE_MOVABLE); + init_pageblock_migratetype(page + i, MIGRATE_MOVABLE, + false); __free_pages_core(page, MAX_PAGE_ORDER, MEMINIT_EARLY); return; } @@ -2006,7 +2011,8 @@ static void __init deferred_free_pages(unsigned long pfn, for (i = 0; i < nr_pages; i++, page++, pfn++) { if (pageblock_aligned(pfn)) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + init_pageblock_migratetype(page, MIGRATE_MOVABLE, + false); __free_pages_core(page, 0, MEMINIT_EARLY); } } @@ -2305,7 +2311,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_pageblock_migratetype(page, MIGRATE_CMA); + init_pageblock_migratetype(page, MIGRATE_CMA, false); set_page_refcounted(page); /* pages were reserved and not allocated */ clear_page_tag_ref(page); @@ -2319,7 +2325,7 @@ void __init init_cma_reserved_pageblock(struct page *page) */ void __init init_cma_pageblock(struct page *page) { - set_pageblock_migratetype(page, MIGRATE_CMA); + init_pageblock_migratetype(page, MIGRATE_CMA, false); adjust_managed_page_count(page, pageblock_nr_pages); page_zone(page)->cma_pages += pageblock_nr_pages; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 61dd34102c14..c7730264bf5f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -544,6 +544,32 @@ __always_inline void set_pageblock_migratetype(struct page *page, MIGRATETYPE_AND_ISO_MASK); } +void __meminit init_pageblock_migratetype(struct page *page, + enum migratetype migratetype, + bool isolate) +{ + unsigned long flags; + + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) + migratetype = MIGRATE_UNMOVABLE; + + flags = migratetype; + +#ifdef CONFIG_MEMORY_ISOLATION + if (migratetype == MIGRATE_ISOLATE) { + VM_WARN_ONCE( + 1, + "Set isolate=true to isolate pageblock with a migratetype"); + return; + } + if (isolate) + flags |= BIT(PB_migrate_isolate); +#endif + __set_pfnblock_flags_mask(page, page_to_pfn(page), flags, + MIGRATETYPE_AND_ISO_MASK); +} + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { -- cgit v1.2.3 From df25569d401e36327b339c3f5b3265d74eae90f2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 4 Jul 2025 12:25:20 +0200 Subject: mm: rename PAGE_MAPPING_* to FOLIO_MAPPING_* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the mapping flags are only used for folios, let's rename the defines. Link: https://lkml.kernel.org/r/20250704102524.326966-27-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Harry Yoo Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Brendan Jackman Cc: Byungchul Park Cc: Chengming Zhou Cc: Christian Brauner Cc: Christophe Leroy Cc: Eugenio Pé rez Cc: Greg Kroah-Hartman Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jason Wang Cc: Jerrin Shaji George Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Peter Xu Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Sergey Senozhatsky Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: xu xin Signed-off-by: Andrew Morton --- fs/proc/page.c | 4 ++-- include/linux/fs.h | 2 +- include/linux/mm_types.h | 1 - include/linux/page-flags.h | 20 ++++++++++---------- include/linux/pagemap.h | 2 +- mm/gup.c | 4 ++-- mm/internal.h | 2 +- mm/ksm.c | 4 ++-- mm/rmap.c | 16 ++++++++-------- mm/util.c | 6 +++--- 10 files changed, 30 insertions(+), 31 deletions(-) (limited to 'mm/internal.h') diff --git a/fs/proc/page.c b/fs/proc/page.c index 999af26c7298..0cdc78c0d23f 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -149,7 +149,7 @@ u64 stable_page_flags(const struct page *page) k = folio->flags; mapping = (unsigned long)folio->mapping; - is_anon = mapping & PAGE_MAPPING_ANON; + is_anon = mapping & FOLIO_MAPPING_ANON; /* * pseudo flags for the well known (anonymous) memory mapped pages @@ -158,7 +158,7 @@ u64 stable_page_flags(const struct page *page) u |= 1 << KPF_MMAP; if (is_anon) { u |= 1 << KPF_ANON; - if (mapping & PAGE_MAPPING_KSM) + if (mapping & FOLIO_MAPPING_KSM) u |= 1 << KPF_KSM; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e14e9d11ca0f..d3e7ad6941a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -526,7 +526,7 @@ struct address_space { /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit - * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. + * of struct folio's "mapping" pointer be used for FOLIO_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 804d269a4f5e..1ec273b06691 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -105,7 +105,6 @@ struct page { unsigned int order; }; }; - /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; union { pgoff_t __folio_index; /* Our offset within mapping. */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ae2b80fcea6a..8e4d6eda8a8d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -695,10 +695,10 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) /* * On an anonymous folio mapped into a user virtual memory area, * folio->mapping points to its anon_vma, not to a struct address_space; - * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. + * with the FOLIO_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled, - * the PAGE_MAPPING_ANON_KSM bit may be set along with the PAGE_MAPPING_ANON + * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged folio. See ksm.h. * @@ -713,21 +713,21 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) * false before calling the following functions (e.g., folio_test_anon). * See mm/slab.h. */ -#define PAGE_MAPPING_ANON 0x1 -#define PAGE_MAPPING_ANON_KSM 0x2 -#define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM) -#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_ANON_KSM) +#define FOLIO_MAPPING_ANON 0x1 +#define FOLIO_MAPPING_ANON_KSM 0x2 +#define FOLIO_MAPPING_KSM (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) +#define FOLIO_MAPPING_FLAGS (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) static __always_inline bool folio_test_anon(const struct folio *folio) { - return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; + return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; - return (flags & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON; + return (flags & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON; } static __always_inline bool PageAnon(const struct page *page) @@ -743,8 +743,8 @@ static __always_inline bool PageAnon(const struct page *page) */ static __always_inline bool folio_test_ksm(const struct folio *folio) { - return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == - PAGE_MAPPING_KSM; + return ((unsigned long)folio->mapping & FOLIO_MAPPING_FLAGS) == + FOLIO_MAPPING_KSM; } #else FOLIO_TEST_FLAG_FALSE(ksm) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e63fbfbd5b0f..10a222e68b85 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -502,7 +502,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping, static inline bool mapping_large_folio_support(struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ - VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, + VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, "Anonymous mapping always supports large folio"); return mapping_max_folio_order(mapping) > 0; diff --git a/mm/gup.c b/mm/gup.c index 30d320719fa2..adffe663594d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2804,9 +2804,9 @@ static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags) return false; /* Anonymous folios pose no problem. */ - mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS; + mapping_flags = (unsigned long)mapping & FOLIO_MAPPING_FLAGS; if (mapping_flags) - return mapping_flags & PAGE_MAPPING_ANON; + return mapping_flags & FOLIO_MAPPING_ANON; /* * At this point, we know the mapping is non-null and points to an diff --git a/mm/internal.h b/mm/internal.h index 22a95a2b7fa1..2e235740128a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -149,7 +149,7 @@ static inline void *folio_raw_mapping(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; - return (void *)(mapping & ~PAGE_MAPPING_FLAGS); + return (void *)(mapping & ~FOLIO_MAPPING_FLAGS); } /* diff --git a/mm/ksm.c b/mm/ksm.c index ef73b25fd65a..2b0210d41c55 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -893,7 +893,7 @@ static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node, unsigned long kpfn; expected_mapping = (void *)((unsigned long)stable_node | - PAGE_MAPPING_KSM); + FOLIO_MAPPING_KSM); again: kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */ folio = pfn_folio(kpfn); @@ -1070,7 +1070,7 @@ static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio); - folio->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + folio->mapping = (void *)((unsigned long)stable_node | FOLIO_MAPPING_KSM); } #ifdef CONFIG_SYSFS diff --git a/mm/rmap.c b/mm/rmap.c index bd83724d14b6..4b1a2a33e39f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -503,12 +503,12 @@ struct anon_vma *folio_get_anon_vma(const struct folio *folio) rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; - anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; @@ -550,12 +550,12 @@ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, retry: rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); - if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; - anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); + anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* @@ -1334,9 +1334,9 @@ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); - anon_vma += PAGE_MAPPING_ANON; + anon_vma += FOLIO_MAPPING_ANON; /* - * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ @@ -1367,10 +1367,10 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and - * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code + * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } diff --git a/mm/util.c b/mm/util.c index 0b270c43d7d1..20bbfe4ce1b8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -670,9 +670,9 @@ struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; - if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) + if ((mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) return NULL; - return (void *)(mapping - PAGE_MAPPING_ANON); + return (void *)(mapping - FOLIO_MAPPING_ANON); } /** @@ -699,7 +699,7 @@ struct address_space *folio_mapping(struct folio *folio) return swap_address_space(folio->swap); mapping = folio->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) + if ((unsigned long)mapping & FOLIO_MAPPING_FLAGS) return NULL; return mapping; -- cgit v1.2.3 From e66d7a4f55f44aca39cc74e8c7b4602faf26b4f7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jul 2025 12:49:23 +0200 Subject: mm: convert FPB_IGNORE_* into FPB_RESPECT_* Patch series "mm: folio_pte_batch() improvements", v2. Ever since we added folio_pte_batch() for fork() + munmap() purposes, a lot more users appeared (and more are being proposed), and more functionality was added. Most of the users only need basic functionality, and could benefit from a non-inlined version. So let's clean up folio_pte_batch() and split it into a basic folio_pte_batch() (no flags) and a more advanced folio_pte_batch_ext(). Using either variant will now look much cleaner. This series will likely conflict with some changes in some (old+new) folio_pte_batch() users, but conflicts should be trivial to resolve. This patch (of 4): Respecting these PTE bits is the exception, so let's invert the meaning. With this change, most callers don't have to pass any flags. This is a preparation for splitting folio_pte_batch() into a non-inlined variant that doesn't consume any flags. Long-term, we want folio_pte_batch() to probably ignore most common PTE bits (e.g., write/dirty/young/soft-dirty) that are not relevant for most page table walkers: uffd-wp and protnone might be bits to consider in the future. Only walkers that care about them can opt-in to respect them. No functional change intended. Link: https://lkml.kernel.org/r/20250702104926.212243-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lance Yang Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Reviewed-by: Dev Jain Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Jann Horn Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Michal Hocko Cc: Mike Rapoport Cc: Rakie Kim Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 16 ++++++++-------- mm/madvise.c | 3 +-- mm/memory.c | 11 +++++------ mm/mempolicy.c | 4 +--- mm/mlock.c | 3 +-- mm/mremap.c | 3 +-- mm/rmap.c | 3 +-- 7 files changed, 18 insertions(+), 25 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 2e235740128a..e530809ef7d2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -202,17 +202,17 @@ static inline void vma_close(struct vm_area_struct *vma) /* Flags for folio_pte_batch(). */ typedef int __bitwise fpb_t; -/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ -#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) +/* Compare PTEs respecting the dirty bit. */ +#define FPB_RESPECT_DIRTY ((__force fpb_t)BIT(0)) -/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ -#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) +/* Compare PTEs respecting the soft-dirty bit. */ +#define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { - if (flags & FPB_IGNORE_DIRTY) + if (!(flags & FPB_RESPECT_DIRTY)) pte = pte_mkclean(pte); - if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY))) pte = pte_clear_soft_dirty(pte); return pte_wrprotect(pte_mkold(pte)); } @@ -236,8 +236,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * pages of the same large folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and - * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set) + * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set). * * start_ptep must map any page of the folio. max_nr must be at least one and * must be limited by the caller so scanning cannot exceed a single page table. diff --git a/mm/madvise.c b/mm/madvise.c index a34c2c89a53b..e7f1d4caad81 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -346,10 +346,9 @@ static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, pte_t pte, bool *any_young, bool *any_dirty) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; int max_nr = (end - addr) / PAGE_SIZE; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, + return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, any_young, any_dirty); } diff --git a/mm/memory.c b/mm/memory.c index 9944380e947d..a03f1964db33 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -990,10 +990,10 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - if (src_vma->vm_flags & VM_SHARED) - flags |= FPB_IGNORE_DIRTY; - if (!vma_soft_dirty_enabled(src_vma)) - flags |= FPB_IGNORE_SOFT_DIRTY; + if (!(src_vma->vm_flags & VM_SHARED)) + flags |= FPB_RESPECT_DIRTY; + if (vma_soft_dirty_enabled(src_vma)) + flags |= FPB_RESPECT_SOFT_DIRTY; nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, &any_writable, NULL, NULL); @@ -1535,7 +1535,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, struct zap_details *details, int *rss, bool *force_flush, bool *force_break, bool *any_skipped) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct mm_struct *mm = tlb->mm; struct folio *folio; struct page *page; @@ -1565,7 +1564,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags, + nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, 0, NULL, NULL, NULL); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1ff7b2174eb7..2a25eedc3b1c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -675,7 +675,6 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct vm_area_struct *vma = walk->vma; struct folio *folio; struct queue_pages *qp = walk->private; @@ -713,8 +712,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, continue; if (folio_test_large(folio) && max_nr != 1) nr = folio_pte_batch(folio, addr, pte, ptent, - max_nr, fpb_flags, - NULL, NULL, NULL); + max_nr, 0, NULL, NULL, NULL); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. diff --git a/mm/mlock.c b/mm/mlock.c index 3cb72b579ffd..2238cdc5eb1c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -307,14 +307,13 @@ void munlock_folio(struct folio *folio) static inline unsigned int folio_mlock_step(struct folio *folio, pte_t *pte, unsigned long addr, unsigned long end) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; unsigned int count = (end - addr) >> PAGE_SHIFT; pte_t ptent = ptep_get(pte); if (!folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL, + return folio_pte_batch(folio, addr, pte, ptent, count, 0, NULL, NULL, NULL); } diff --git a/mm/mremap.c b/mm/mremap.c index 36585041c760..d4d3ffc93150 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -173,7 +173,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int max_nr) { - const fpb_t flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; struct folio *folio; if (max_nr == 1) @@ -183,7 +182,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (!folio || !folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, flags, NULL, + return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, NULL, NULL); } diff --git a/mm/rmap.c b/mm/rmap.c index 4b1a2a33e39f..366e66651c88 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1849,7 +1849,6 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio, struct page_vma_mapped_walk *pvmw, enum ttu_flags flags, pte_t pte) { - const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; unsigned long end_addr, addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; unsigned int max_nr; @@ -1869,7 +1868,7 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio, if (pte_unused(pte)) return 1; - return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, fpb_flags, + return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, 0, NULL, NULL, NULL); } -- cgit v1.2.3 From 233e28e2a76e6ffcbe33ee7813f98536fe0690b5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jul 2025 12:49:24 +0200 Subject: mm: smaller folio_pte_batch() improvements Let's clean up a bit: (1) No need for start_ptep vs. ptep anymore, we can simply use ptep. (2) Let's switch to "unsigned int" for everything. Negative values do not make sense. (3) We can simplify the code by leaving the pte unchanged after the pte_same() check. (4) Clarify that we should never exceed a single VMA; it indicates a problem in the caller. No functional change intended. Link: https://lkml.kernel.org/r/20250702104926.212243-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lance Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Oscar Salvador Reviewed-by: Dev Jain Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Jann Horn Cc: Joshua Hahn Cc: Liam Howlett Cc: Mathew Brost Cc: Michal Hocko Cc: Mike Rapoport Cc: Rakie Kim Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/internal.h | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index e530809ef7d2..40ee7200e510 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -221,7 +221,7 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * folio_pte_batch - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. * @addr: The user virtual address the first page is mapped at. - * @start_ptep: Page table pointer for the first entry. + * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. @@ -233,24 +233,24 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive - * pages of the same large folio. + * pages of the same large folio in a single VMA and a single page table. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, * the accessed bit, writable bit, dirty bit (unless FPB_RESPECT_DIRTY is set) * and soft-dirty bit (unless FPB_RESPECT_SOFT_DIRTY is set). * - * start_ptep must map any page of the folio. max_nr must be at least one and - * must be limited by the caller so scanning cannot exceed a single page table. + * @ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. * * Return: the number of table entries in the batch. */ -static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, +static inline unsigned int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { - pte_t expected_pte, *ptep; - bool writable, young, dirty; - int nr, cur_nr; + unsigned int nr, cur_nr; + pte_t expected_pte; if (any_writable) *any_writable = false; @@ -267,29 +267,22 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, max_nr = min_t(unsigned long, max_nr, folio_pfn(folio) + folio_nr_pages(folio) - pte_pfn(pte)); - nr = pte_batch_hint(start_ptep, pte); + nr = pte_batch_hint(ptep, pte); expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); - ptep = start_ptep + nr; + ptep = ptep + nr; while (nr < max_nr) { pte = ptep_get(ptep); - if (any_writable) - writable = !!pte_write(pte); - if (any_young) - young = !!pte_young(pte); - if (any_dirty) - dirty = !!pte_dirty(pte); - pte = __pte_batch_clear_ignored(pte, flags); - if (!pte_same(pte, expected_pte)) + if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte)) break; if (any_writable) - *any_writable |= writable; + *any_writable |= pte_write(pte); if (any_young) - *any_young |= young; + *any_young |= pte_young(pte); if (any_dirty) - *any_dirty |= dirty; + *any_dirty |= pte_dirty(pte); cur_nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, cur_nr); -- cgit v1.2.3 From dd80cfd4878bafc74f2a386c51b5398a12ffeb8c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jul 2025 12:49:25 +0200 Subject: mm: split folio_pte_batch() into folio_pte_batch() and folio_pte_batch_flags() Many users (including upcoming ones) don't really need the flags etc, and can live with the possible overhead of a function call. So let's provide a basic, non-inlined folio_pte_batch(), to avoid code bloat while still providing a variant that optimizes out all flag checks at runtime. folio_pte_batch_flags() will get inlined into folio_pte_batch(), optimizing out any conditionals that depend on input flags. folio_pte_batch() will behave like folio_pte_batch_flags() when no flags are specified. It's okay to add new users of folio_pte_batch_flags(), but using folio_pte_batch() if applicable is preferred. So, before this change, folio_pte_batch() was inlined into the C file optimized by propagating constants within the resulting object file. With this change, we now also have a folio_pte_batch() that is optimized by propagating all constants. But instead of having one instance per object file, we have a single shared one. In zap_present_ptes(), where we care about performance, the compiler already seem to generate a call to a common inlined folio_pte_batch() variant, shared with fork() code. So calling the new non-inlined variant should not make a difference. While at it, drop the "addr" parameter that is unused. Link: https://lkml.kernel.org/r/20250702104926.212243-4-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Andrew Morton Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/ Reviewed-by: Oscar Salvador Reviewed-by: Zi Yan Reviewed-by: Dev Jain Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Jann Horn Cc: Joshua Hahn Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Michal Hocko Cc: Mike Rapoport Cc: Rakie Kim Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 11 ++++++++--- mm/madvise.c | 4 ++-- mm/memory.c | 8 +++----- mm/mempolicy.c | 3 +-- mm/mlock.c | 3 +-- mm/mremap.c | 3 +-- mm/rmap.c | 3 +-- mm/util.c | 29 +++++++++++++++++++++++++++++ 8 files changed, 46 insertions(+), 18 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 40ee7200e510..c7d18f608c3f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -218,9 +218,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) } /** - * folio_pte_batch - detect a PTE batch for a large folio + * folio_pte_batch_flags - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. - * @addr: The user virtual address the first page is mapped at. * @ptep: Page table pointer for the first entry. * @pte: Page table entry for the first page. * @max_nr: The maximum number of table entries to consider. @@ -243,9 +242,12 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * must be limited by the caller so scanning cannot exceed a single VMA and * a single page table. * + * This function will be inlined to optimize based on the input parameters; + * consider using folio_pte_batch() instead if applicable. + * * Return: the number of table entries in the batch. */ -static inline unsigned int folio_pte_batch(struct folio *folio, unsigned long addr, +static inline unsigned int folio_pte_batch_flags(struct folio *folio, pte_t *ptep, pte_t pte, unsigned int max_nr, fpb_t flags, bool *any_writable, bool *any_young, bool *any_dirty) { @@ -293,6 +295,9 @@ static inline unsigned int folio_pte_batch(struct folio *folio, unsigned long ad return min(nr, max_nr); } +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr); + /** * pte_move_swp_offset - Move the swap entry offset field of a swap pte * forward or backward by delta diff --git a/mm/madvise.c b/mm/madvise.c index e7f1d4caad81..7c4958f694b4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -348,8 +348,8 @@ static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, { int max_nr = (end - addr) / PAGE_SIZE; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, - any_young, any_dirty); + return folio_pte_batch_flags(folio, ptep, pte, max_nr, 0, NULL, + any_young, any_dirty); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, diff --git a/mm/memory.c b/mm/memory.c index a03f1964db33..042088340b73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -995,8 +995,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (vma_soft_dirty_enabled(src_vma)) flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_pte, pte, max_nr, flags, + &any_writable, NULL, NULL); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1564,9 +1564,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb, * by keeping the batching logic separate. */ if (unlikely(folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, 0, - NULL, NULL, NULL); - + nr = folio_pte_batch(folio, pte, ptent, max_nr); zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr, addr, details, rss, force_flush, force_break, any_skipped); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2a25eedc3b1c..eb83cff7db8c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -711,8 +711,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (!folio || folio_is_zone_device(folio)) continue; if (folio_test_large(folio) && max_nr != 1) - nr = folio_pte_batch(folio, addr, pte, ptent, - max_nr, 0, NULL, NULL, NULL); + nr = folio_pte_batch(folio, pte, ptent, max_nr); /* * vm_normal_folio() filters out zero pages, but there might * still be reserved folios to skip, perhaps in a VDSO. diff --git a/mm/mlock.c b/mm/mlock.c index 2238cdc5eb1c..a1d93ad33c6d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -313,8 +313,7 @@ static inline unsigned int folio_mlock_step(struct folio *folio, if (!folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, pte, ptent, count, 0, NULL, - NULL, NULL); + return folio_pte_batch(folio, pte, ptent, count); } static inline bool allow_mlock_munlock(struct folio *folio, diff --git a/mm/mremap.c b/mm/mremap.c index d4d3ffc93150..1f5bebbb9c0c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -182,8 +182,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr if (!folio || !folio_test_large(folio)) return 1; - return folio_pte_batch(folio, addr, ptep, pte, max_nr, 0, NULL, - NULL, NULL); + return folio_pte_batch(folio, ptep, pte, max_nr); } static int move_ptes(struct pagetable_move_control *pmc, diff --git a/mm/rmap.c b/mm/rmap.c index 366e66651c88..4c833b43fef9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1868,8 +1868,7 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio, if (pte_unused(pte)) return 1; - return folio_pte_batch(folio, addr, pvmw->pte, pte, max_nr, 0, - NULL, NULL, NULL); + return folio_pte_batch(folio, pvmw->pte, pte, max_nr); } /* diff --git a/mm/util.c b/mm/util.c index 20bbfe4ce1b8..f134cefc9062 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1171,3 +1171,32 @@ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) return 0; } EXPORT_SYMBOL(compat_vma_mmap_prepare); + +#ifdef CONFIG_MMU +/** + * folio_pte_batch - detect a PTE batch for a large folio + * @folio: The large folio to detect a PTE batch for. + * @ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @max_nr: The maximum number of table entries to consider. + * + * This is a simplified variant of folio_pte_batch_flags(). + * + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same large folio in a single VMA and a single page table. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, writable bit, dirt-bit and soft-dirty bit. + * + * ptep must map any page of the folio. max_nr must be at least one and + * must be limited by the caller so scanning cannot exceed a single VMA and + * a single page table. + * + * Return: the number of table entries in the batch. + */ +unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, + unsigned int max_nr) +{ + return folio_pte_batch_flags(folio, ptep, pte, max_nr, 0, NULL, NULL, NULL); +} +#endif /* CONFIG_MMU */ -- cgit v1.2.3 From 7ae7e811f0a6817b6deeb4f68eb44be0ec3b8e07 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 2 Jul 2025 12:49:26 +0200 Subject: mm: remove boolean output parameters from folio_pte_batch_ext() Instead, let's just allow for specifying through flags whether we want to have bits merged into the original PTE. For the madvise() case, simplify by having only a single parameter for merging young+dirty. For madvise_cold_or_pageout_pte_range() merging the dirty bit is not required, but also not harmful. This code is not that performance critical after all to really force all micro-optimizations. As we now have two pte_t * parameters, use PageTable() to make sure we are actually given a pointer at a copy of the PTE, not a pointer into an actual page table. Link: https://lkml.kernel.org/r/20250702104926.212243-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dev Jain Reviewed-by: Oscar Salvador Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: "Huang, Ying" Cc: Jann Horn Cc: Joshua Hahn Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mathew Brost Cc: Michal Hocko Cc: Mike Rapoport Cc: Rakie Kim Cc: Rik van Riel Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/internal.h | 65 ++++++++++++++++++++++++++++++++++++++--------------------- mm/madvise.c | 26 +++++------------------- mm/memory.c | 8 ++------ mm/util.c | 2 +- 4 files changed, 50 insertions(+), 51 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index c7d18f608c3f..91773a0ef305 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -208,6 +208,18 @@ typedef int __bitwise fpb_t; /* Compare PTEs respecting the soft-dirty bit. */ #define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1)) +/* + * Merge PTE write bits: if any PTE in the batch is writable, modify the + * PTE at @ptentp to be writable. + */ +#define FPB_MERGE_WRITE ((__force fpb_t)BIT(2)) + +/* + * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty, + * modify the PTE at @ptentp to be young or dirty, respectively. + */ +#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(3)) + static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { if (!(flags & FPB_RESPECT_DIRTY)) @@ -220,16 +232,12 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) /** * folio_pte_batch_flags - detect a PTE batch for a large folio * @folio: The large folio to detect a PTE batch for. + * @vma: The VMA. Only relevant with FPB_MERGE_WRITE, otherwise can be NULL. * @ptep: Page table pointer for the first entry. - * @pte: Page table entry for the first page. + * @ptentp: Pointer to a COPY of the first page table entry whose flags this + * function updates based on @flags if appropriate. * @max_nr: The maximum number of table entries to consider. * @flags: Flags to modify the PTE batch semantics. - * @any_writable: Optional pointer to indicate whether any entry except the - * first one is writable. - * @any_young: Optional pointer to indicate whether any entry except the - * first one is young. - * @any_dirty: Optional pointer to indicate whether any entry except the - * first one is dirty. * * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same large folio in a single VMA and a single page table. @@ -242,28 +250,32 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * must be limited by the caller so scanning cannot exceed a single VMA and * a single page table. * + * Depending on the FPB_MERGE_* flags, the pte stored at @ptentp will + * be updated: it's crucial that a pointer to a COPY of the first + * page table entry, obtained through ptep_get(), is provided as @ptentp. + * * This function will be inlined to optimize based on the input parameters; * consider using folio_pte_batch() instead if applicable. * * Return: the number of table entries in the batch. */ static inline unsigned int folio_pte_batch_flags(struct folio *folio, - pte_t *ptep, pte_t pte, unsigned int max_nr, fpb_t flags, - bool *any_writable, bool *any_young, bool *any_dirty) + struct vm_area_struct *vma, pte_t *ptep, pte_t *ptentp, + unsigned int max_nr, fpb_t flags) { + bool any_writable = false, any_young = false, any_dirty = false; + pte_t expected_pte, pte = *ptentp; unsigned int nr, cur_nr; - pte_t expected_pte; - - if (any_writable) - *any_writable = false; - if (any_young) - *any_young = false; - if (any_dirty) - *any_dirty = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio); VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio); + /* + * Ensure this is a pointer to a copy not a pointer into a page table. + * If this is a stack value, it won't be a valid virtual address, but + * that's fine because it also cannot be pointing into the page table. + */ + VM_WARN_ON(virt_addr_valid(ptentp) && PageTable(virt_to_page(ptentp))); /* Limit max_nr to the actual remaining PFNs in the folio we could batch. */ max_nr = min_t(unsigned long, max_nr, @@ -279,12 +291,12 @@ static inline unsigned int folio_pte_batch_flags(struct folio *folio, if (!pte_same(__pte_batch_clear_ignored(pte, flags), expected_pte)) break; - if (any_writable) - *any_writable |= pte_write(pte); - if (any_young) - *any_young |= pte_young(pte); - if (any_dirty) - *any_dirty |= pte_dirty(pte); + if (flags & FPB_MERGE_WRITE) + any_writable |= pte_write(pte); + if (flags & FPB_MERGE_YOUNG_DIRTY) { + any_young |= pte_young(pte); + any_dirty |= pte_dirty(pte); + } cur_nr = pte_batch_hint(ptep, pte); expected_pte = pte_advance_pfn(expected_pte, cur_nr); @@ -292,6 +304,13 @@ static inline unsigned int folio_pte_batch_flags(struct folio *folio, nr += cur_nr; } + if (any_writable) + *ptentp = pte_mkwrite(*ptentp, vma); + if (any_young) + *ptentp = pte_mkyoung(*ptentp); + if (any_dirty) + *ptentp = pte_mkdirty(*ptentp); + return min(nr, max_nr); } diff --git a/mm/madvise.c b/mm/madvise.c index 7c4958f694b4..1c30031ab035 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -343,13 +343,12 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, struct folio *folio, pte_t *ptep, - pte_t pte, bool *any_young, - bool *any_dirty) + pte_t *ptentp) { int max_nr = (end - addr) / PAGE_SIZE; - return folio_pte_batch_flags(folio, ptep, pte, max_nr, 0, NULL, - any_young, any_dirty); + return folio_pte_batch_flags(folio, NULL, ptep, ptentp, max_nr, + FPB_MERGE_YOUNG_DIRTY); } static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, @@ -487,13 +486,7 @@ restart: * next pte in the range. */ if (folio_test_large(folio)) { - bool any_young; - - nr = madvise_folio_pte_batch(addr, end, folio, pte, - ptent, &any_young, NULL); - if (any_young) - ptent = pte_mkyoung(ptent); - + nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; @@ -723,11 +716,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * next pte in the range. */ if (folio_test_large(folio)) { - bool any_young, any_dirty; - - nr = madvise_folio_pte_batch(addr, end, folio, pte, - ptent, &any_young, &any_dirty); - + nr = madvise_folio_pte_batch(addr, end, folio, pte, &ptent); if (nr < folio_nr_pages(folio)) { int err; @@ -752,11 +741,6 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, nr = 0; continue; } - - if (any_young) - ptent = pte_mkyoung(ptent); - if (any_dirty) - ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { diff --git a/mm/memory.c b/mm/memory.c index 042088340b73..4619bf5874af 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -972,10 +972,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, int max_nr, int *rss, struct folio **prealloc) { + fpb_t flags = FPB_MERGE_WRITE; struct page *page; struct folio *folio; - bool any_writable; - fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -995,8 +994,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (vma_soft_dirty_enabled(src_vma)) flags |= FPB_RESPECT_SOFT_DIRTY; - nr = folio_pte_batch_flags(folio, src_pte, pte, max_nr, flags, - &any_writable, NULL, NULL); + nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1010,8 +1008,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_dup_file_rmap_ptes(folio, page, nr, dst_vma); rss[mm_counter_file(folio)] += nr; } - if (any_writable) - pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; diff --git a/mm/util.c b/mm/util.c index f134cefc9062..68ea833ba25f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1197,6 +1197,6 @@ EXPORT_SYMBOL(compat_vma_mmap_prepare); unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, unsigned int max_nr) { - return folio_pte_batch_flags(folio, ptep, pte, max_nr, 0, NULL, NULL, NULL); + return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } #endif /* CONFIG_MMU */ -- cgit v1.2.3 From 2b7226af730cc9a8818ff3b39aabcd76861913dd Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 23 Jun 2025 11:58:49 -0700 Subject: mm/memcg: make memory.reclaim interface generic This adds a general call for both parsing as well as the common reclaim semantics. memcg is still the only user and no change in semantics. [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Link: https://lkml.kernel.org/r/20250623185851.830632-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/internal.h | 10 ++++++ mm/memcontrol.c | 77 +++------------------------------------------ mm/vmscan.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 73 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 91773a0ef305..5b0f71e5434b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -533,6 +533,16 @@ extern unsigned long highest_memmap_pfn; bool folio_isolate_lru(struct folio *folio); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); +#ifdef CONFIG_NUMA +int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat); +#else +static inline int user_proactive_reclaim(char *buf, + struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + return 0; +} +#endif /* * in mm/rmap.c: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 70fdeda1120b..235c66d2161b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -4564,83 +4563,15 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } -enum { - MEMORY_RECLAIM_SWAPPINESS = 0, - MEMORY_RECLAIM_SWAPPINESS_MAX, - MEMORY_RECLAIM_NULL, -}; - -static const match_table_t tokens = { - { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, - { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, - { MEMORY_RECLAIM_NULL, NULL }, -}; - static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned int nr_retries = MAX_RECLAIM_RETRIES; - unsigned long nr_to_reclaim, nr_reclaimed = 0; - int swappiness = -1; - unsigned int reclaim_options; - char *old_buf, *start; - substring_t args[MAX_OPT_ARGS]; - - buf = strstrip(buf); - - old_buf = buf; - nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; - if (buf == old_buf) - return -EINVAL; - - buf = strstrip(buf); - - while ((start = strsep(&buf, " ")) != NULL) { - if (!strlen(start)) - continue; - switch (match_token(start, tokens, args)) { - case MEMORY_RECLAIM_SWAPPINESS: - if (match_int(&args[0], &swappiness)) - return -EINVAL; - if (swappiness < MIN_SWAPPINESS || swappiness > MAX_SWAPPINESS) - return -EINVAL; - break; - case MEMORY_RECLAIM_SWAPPINESS_MAX: - swappiness = SWAPPINESS_ANON_ONLY; - break; - default: - return -EINVAL; - } - } - - reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; - while (nr_reclaimed < nr_to_reclaim) { - /* Will converge on zero, but reclaim enforces a minimum */ - unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; - unsigned long reclaimed; - - if (signal_pending(current)) - return -EINTR; - - /* - * This is the final attempt, drain percpu lru caches in the - * hope of introducing more evictable pages for - * try_to_free_mem_cgroup_pages(). - */ - if (!nr_retries) - lru_add_drain_all(); - - reclaimed = try_to_free_mem_cgroup_pages(memcg, - batch_size, GFP_KERNEL, - reclaim_options, - swappiness == -1 ? NULL : &swappiness); - - if (!reclaimed && !nr_retries--) - return -EAGAIN; + int ret; - nr_reclaimed += reclaimed; - } + ret = user_proactive_reclaim(buf, memcg, NULL); + if (ret) + return ret; return nbytes; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 85ffff3b4d24..9702ee5aa65d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -6714,6 +6715,15 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, return nr_reclaimed; } +#else +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, + gfp_t gfp_mask, + unsigned int reclaim_options, + int *swappiness) +{ + return 0; +} #endif static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) @@ -7708,6 +7718,94 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) return ret; } + +enum { + MEMORY_RECLAIM_SWAPPINESS = 0, + MEMORY_RECLAIM_SWAPPINESS_MAX, + MEMORY_RECLAIM_NULL, +}; +static const match_table_t tokens = { + { MEMORY_RECLAIM_SWAPPINESS, "swappiness=%d"}, + { MEMORY_RECLAIM_SWAPPINESS_MAX, "swappiness=max"}, + { MEMORY_RECLAIM_NULL, NULL }, +}; + +int user_proactive_reclaim(char *buf, struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + unsigned int nr_retries = MAX_RECLAIM_RETRIES; + unsigned long nr_to_reclaim, nr_reclaimed = 0; + int swappiness = -1; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + + if (!buf || (!memcg && !pgdat)) + return -EINVAL; + + buf = strstrip(buf); + + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + switch (match_token(start, tokens, args)) { + case MEMORY_RECLAIM_SWAPPINESS: + if (match_int(&args[0], &swappiness)) + return -EINVAL; + if (swappiness < MIN_SWAPPINESS || + swappiness > MAX_SWAPPINESS) + return -EINVAL; + break; + case MEMORY_RECLAIM_SWAPPINESS_MAX: + swappiness = SWAPPINESS_ANON_ONLY; + break; + default: + return -EINVAL; + } + } + + while (nr_reclaimed < nr_to_reclaim) { + /* Will converge on zero, but reclaim enforces a minimum */ + unsigned long batch_size = (nr_to_reclaim - nr_reclaimed) / 4; + unsigned long reclaimed; + + if (signal_pending(current)) + return -EINTR; + + /* + * This is the final attempt, drain percpu lru caches in the + * hope of introducing more evictable pages. + */ + if (!nr_retries) + lru_add_drain_all(); + + if (memcg) { + unsigned int reclaim_options; + + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + reclaimed = try_to_free_mem_cgroup_pages(memcg, + batch_size, GFP_KERNEL, + reclaim_options, + swappiness == -1 ? NULL : &swappiness); + } else { + return -EINVAL; + } + + if (!reclaimed && !nr_retries--) + return -EAGAIN; + + nr_reclaimed += reclaimed; + } + + return 0; +} + #endif /** -- cgit v1.2.3 From 57fae936b40cba55f36bb8e3296f271696c2bb67 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 18 Jul 2025 14:32:41 +0530 Subject: mm: introduce FPB_RESPECT_WRITE for PTE batching infrastructure Patch 6 ("mm: Optimize mprotect() by PTE batching") optimizes mprotect() by batch clearing the ptes, masking in the new protections, and batch setting the ptes. Suppose that the first pte of the batch is writable - with the current implementation of folio_pte_batch(), it is not guaranteed that the other ptes in the batch are already writable too, so we may incorrectly end up setting the writable bit on all ptes via modify_prot_commit_ptes(). Therefore, introduce FPB_RESPECT_WRITE so that all ptes in the batch are writable or not. Link: https://lkml.kernel.org/r/20250718090244.21092-5-dev.jain@arm.com Signed-off-by: Dev Jain Reviewed-by: Lorenzo Stoakes Reviewed-by: Ryan Roberts Reviewed-by: Zi Yan Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: Christophe Leroy Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Joey Gouly Cc: Kevin Brodsky Cc: Lance Yang Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yicong Yang Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- mm/internal.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm/internal.h') diff --git a/mm/internal.h b/mm/internal.h index 5b0f71e5434b..28d2d5b051df 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -208,17 +208,20 @@ typedef int __bitwise fpb_t; /* Compare PTEs respecting the soft-dirty bit. */ #define FPB_RESPECT_SOFT_DIRTY ((__force fpb_t)BIT(1)) +/* Compare PTEs respecting the writable bit. */ +#define FPB_RESPECT_WRITE ((__force fpb_t)BIT(2)) + /* * Merge PTE write bits: if any PTE in the batch is writable, modify the * PTE at @ptentp to be writable. */ -#define FPB_MERGE_WRITE ((__force fpb_t)BIT(2)) +#define FPB_MERGE_WRITE ((__force fpb_t)BIT(3)) /* * Merge PTE young and dirty bits: if any PTE in the batch is young or dirty, * modify the PTE at @ptentp to be young or dirty, respectively. */ -#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(3)) +#define FPB_MERGE_YOUNG_DIRTY ((__force fpb_t)BIT(4)) static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) { @@ -226,7 +229,9 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) pte = pte_mkclean(pte); if (likely(!(flags & FPB_RESPECT_SOFT_DIRTY))) pte = pte_clear_soft_dirty(pte); - return pte_wrprotect(pte_mkold(pte)); + if (likely(!(flags & FPB_RESPECT_WRITE))) + pte = pte_wrprotect(pte); + return pte_mkold(pte); } /** -- cgit v1.2.3