From 93b3037a1482758349f3b0431406bcc457ca1cbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Nov 2020 14:04:46 +0100 Subject: mm: Update ptep_get_lockless()'s comment Improve the comment. Suggested-by: Matthew Wilcox Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.515572025%40infradead.org --- include/linux/pgtable.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a108b60a6962..c0b29000c3c0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -300,15 +300,12 @@ static inline pte_t ptep_get(pte_t *ptep) #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking any - * locks. For this we would like to load the pointers atomically, but sometimes - * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What - * we do have is the guarantee that a PTE will only either go from not present - * to present, or present to not present or both -- it will not switch to a - * completely different present page without a TLB flush in between; something - * that we are blocking by holding interrupts off. + * For walking the pagetables without holding any locks. Some architectures + * (eg x86-32 PAE) cannot load the entries atomically without using expensive + * instructions. We are guaranteed that a PTE will only either go from not + * present to present, or present to not present -- it will not switch to a + * completely different present page without a TLB flush inbetween; which we + * are blocking by holding interrupts off. 
* * Setting ptes from not present to present goes: * -- cgit v1.2.3 From 024d232ae4fcd7a7ce8ea239607d6c1246d7adc8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Nov 2020 17:16:22 +0100 Subject: mm: Fix pmd_read_atomic() AFAICT there's no reason to do anything different than what we do for PTEs. Make it so (also affects SH). Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.711181252%40infradead.org --- arch/x86/include/asm/pgtable-3level.h | 56 ----------------------------------- include/linux/pgtable.h | 47 ++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 66 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 28556d22feb8..94f50b0100a5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -34,62 +34,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) ptep->pte_low = pte.pte_low; } -#define pmd_read_atomic pmd_read_atomic -/* - * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with - * a "*pmdp" dereference done by GCC. Problem is, in certain places - * where pte_offset_map_lock() is called, concurrent page faults are - * allowed, if the mmap_lock is hold for reading. An example is mincore - * vs page faults vs MADV_DONTNEED. On the page fault side - * pmd_populate() rightfully does a set_64bit(), but if we're reading the - * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen - * because GCC will not read the 64-bit value of the pmd atomically. - * - * To fix this all places running pte_offset_map_lock() while holding the - * mmap_lock in read mode, shall read the pmdp pointer using this - * function to know if the pmd is null or not, and in turn to know if - * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd - * operations. 
- * - * Without THP if the mmap_lock is held for reading, the pmd can only - * transition from null to not null while pmd_read_atomic() runs. So - * we can always return atomic pmd values with this function. - * - * With THP if the mmap_lock is held for reading, the pmd can become - * trans_huge or none or point to a pte (and in turn become "stable") - * at any time under pmd_read_atomic(). We could read it truly - * atomically here with an atomic64_read() for the THP enabled case (and - * it would be a whole lot simpler), but to avoid using cmpxchg8b we - * only return an atomic pmdval if the low part of the pmdval is later - * found to be stable (i.e. pointing to a pte). We are also returning a - * 'none' (zero) pmdval if the low part of the pmd is zero. - * - * In some cases the high and low part of the pmdval returned may not be - * consistent if THP is enabled (the low part may point to previously - * mapped hugepage, while the high part may point to a more recently - * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only - * needs the low part of the pmd to be read atomically to decide if the - * pmd is unstable or not, with the only exception when the low part - * of the pmd is zero, in which case we return a 'none' pmd. - */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - pmdval_t ret; - u32 *tmp = (u32 *)pmdp; - - ret = (pmdval_t) (*tmp); - if (ret) { - /* - * If the low part is null, we must not read the high part - * or we can end up with a partial pmd. 
- */ - smp_rmb(); - ret |= ((pmdval_t)*(tmp + 1)) << 32; - } - - return (pmd_t) { .pmd = ret }; -} - static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index c0b29000c3c0..765fd4bf420f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -298,6 +298,13 @@ static inline pte_t ptep_get(pte_t *ptep) } #endif +#ifndef __HAVE_ARCH_PMDP_GET +static inline pmd_t pmdp_get(pmd_t *pmdp) +{ + return READ_ONCE(*pmdp); +} +#endif + #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures @@ -340,15 +347,42 @@ static inline pte_t ptep_get_lockless(pte_t *ptep) return pte; } -#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +#define ptep_get_lockless ptep_get_lockless + +#if CONFIG_PGTABLE_LEVELS > 2 +static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) +{ + pmd_t pmd; + + do { + pmd.pmd_low = pmdp->pmd_low; + smp_rmb(); + pmd.pmd_high = pmdp->pmd_high; + smp_rmb(); + } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); + + return pmd; +} +#define pmdp_get_lockless pmdp_get_lockless +#endif /* CONFIG_PGTABLE_LEVELS > 2 */ +#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ + /* * We require that the PTE can be read atomically. */ +#ifndef ptep_get_lockless static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } -#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +#endif + +#ifndef pmdp_get_lockless +static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) +{ + return pmdp_get(pmdp); +} +#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR @@ -1318,17 +1352,10 @@ static inline int pud_trans_unstable(pud_t *pud) #endif } -#ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { - /* - * Depend on compiler for an atomic pmd read. NOTE: this is - * only going to work, if the pmdval_t isn't larger than - * an unsigned long. 
- */ - return *pmdp; + return pmdp_get_lockless(pmdp); } -#endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) -- cgit v1.2.3 From 6ca297d4784625de7b041e8451780643cf5751a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Oct 2022 14:51:44 +0200 Subject: mm: Rename GUP_GET_PTE_LOW_HIGH Since it no longer applies to only PTEs, rename it to PXX. Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.776404066%40infradead.org --- arch/mips/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/x86/Kconfig | 2 +- include/linux/pgtable.h | 4 ++-- mm/Kconfig | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b26b77673c2c..15cb692b0a09 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -46,7 +46,7 @@ config MIPS select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL - select GUP_GET_PTE_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT + select GUP_GET_PXX_LOW_HIGH if CPU_MIPS32 && PHYS_ADDR_T_64BIT select HAVE_ARCH_COMPILER_H select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB if MIPS_FP_SUPPORT diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 5f220e903e5a..0665ac0add0b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -24,7 +24,7 @@ config SUPERH select GENERIC_PCI_IOMAP if PCI select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD - select GUP_GET_PTE_LOW_HIGH if X2TLB + select GUP_GET_PXX_LOW_HIGH if X2TLB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KGDB select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 67745ceab0db..bb1f326ca728 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -157,7 +157,7 @@ config X86 select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY select GENERIC_VDSO_TIME_NS - select GUP_GET_PTE_LOW_HIGH if X86_PAE + select GUP_GET_PXX_LOW_HIGH if X86_PAE 
select HARDIRQS_SW_RESEND select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 765fd4bf420f..7dd3df742543 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -305,7 +305,7 @@ static inline pmd_t pmdp_get(pmd_t *pmdp) } #endif -#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH +#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH /* * For walking the pagetables without holding any locks. Some architectures * (eg x86-32 PAE) cannot load the entries atomically without using expensive @@ -365,7 +365,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) } #define pmdp_get_lockless pmdp_get_lockless #endif /* CONFIG_PGTABLE_LEVELS > 2 */ -#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ +#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ /* * We require that the PTE can be read atomically. diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..0eabd0beb345 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1044,7 +1044,7 @@ config GUP_TEST comment "GUP_TEST needs to have DEBUG_FS enabled" depends on !GUP_TEST && !DEBUG_FS -config GUP_GET_PTE_LOW_HIGH +config GUP_GET_PXX_LOW_HIGH bool config ARCH_HAS_PTE_SPECIAL -- cgit v1.2.3 From dab6e717429e5ec795d558a0e9a5337a1ed33a3d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Nov 2020 17:20:28 +0100 Subject: mm: Rename pmd_read_atomic() There's no point in having the identical routines for PTE/PMD have different names. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20221022114424.841277397%40infradead.org --- include/linux/pgtable.h | 9 ++------- mm/hmm.c | 2 +- mm/khugepaged.c | 2 +- mm/mapping_dirty_helpers.c | 2 +- mm/mprotect.c | 2 +- mm/userfaultfd.c | 2 +- mm/vmscan.c | 4 ++-- 7 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7dd3df742543..23348528ff36 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1352,11 +1352,6 @@ static inline int pud_trans_unstable(pud_t *pud) #endif } -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - return pmdp_get_lockless(pmdp); -} - #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif @@ -1383,13 +1378,13 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) */ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); /* * The barrier will stabilize the pmdval in a register or on * the stack so that it will stop changing under the code. * * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, - * pmd_read_atomic is allowed to return a not atomic pmdval + * pmdp_get_lockless is allowed to return a not atomic pmdval * (for example pointing to an hugepage that has never been * mapped in the pmd). The below checks will only care about * the low part of the pmd with 32bit PAE x86 anyway, with the diff --git a/mm/hmm.c b/mm/hmm.c index 3850fb625dda..39cf50de76d7 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -361,7 +361,7 @@ again: * huge or device mapping one and compute corresponding pfn * values. 
*/ - pmd = pmd_read_atomic(pmdp); + pmd = pmdp_get_lockless(pmdp); barrier(); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4734315f7940..52237a777ebd 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -862,7 +862,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, if (!*pmd) return SCAN_PMD_NULL; - pmde = pmd_read_atomic(*pmd); + pmde = pmdp_get_lockless(*pmd); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 1b0ab8fcfd8b..175e424b9ab1 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -126,7 +126,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); if (!pmd_trans_unstable(&pmdval)) return 0; diff --git a/mm/mprotect.c b/mm/mprotect.c index 668bfaa6ed2a..f006bafe338f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -294,7 +294,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, */ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 3d0fef3980b3..89a1c9ca06f8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -630,7 +630,7 @@ retry: break; } - dst_pmdval = pmd_read_atomic(dst_pmd); + dst_pmdval = pmdp_get_lockless(dst_pmd); /* * If the dst_pmd is mapped as THP don't * override it and just be strict. 
diff --git a/mm/vmscan.c b/mm/vmscan.c index 04d8b88e5216..88ef873b2d83 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4039,9 +4039,9 @@ restart: /* walk_pte_range() may call get_next_vma() */ vma = args->vma; for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { - pmd_t val = pmd_read_atomic(pmd + i); + pmd_t val = pmdp_get_lockless(pmd + i); - /* for pmd_read_atomic() */ + /* for pmdp_get_lockless() */ barrier(); next = pmd_addr_end(addr, end); -- cgit v1.2.3 From 2dff2c359e829245bc3d80e42e296876d1f0cf8e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Nov 2022 12:53:18 +0100 Subject: mm: Convert __HAVE_ARCH_P..P_GET to the new style Since __HAVE_ARCH_* style guards have been deprecated in favour of defining the function name onto itself, convert pxxp_get(). Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/Y2EUEBlQXNgaJgoI@hirez.programming.kicks-ass.net --- arch/powerpc/include/asm/nohash/32/pgtable.h | 2 +- include/linux/pgtable.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/pgtable.h') diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 0d40b33184eb..cb1ac02ae8ee 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -263,7 +263,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p } #ifdef CONFIG_PPC_16K_PAGES -#define __HAVE_ARCH_PTEP_GET +#define ptep_get ptep_get static inline pte_t ptep_get(pte_t *ptep) { pte_basic_t val = READ_ONCE(ptep->pte); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 23348528ff36..70e2a7e06a76 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -291,14 +291,14 @@ static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, ptep_get_and_clear(mm, addr, ptep); } -#ifndef __HAVE_ARCH_PTEP_GET +#ifndef ptep_get static 
inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif -#ifndef __HAVE_ARCH_PMDP_GET +#ifndef pmdp_get static inline pmd_t pmdp_get(pmd_t *pmdp) { return READ_ONCE(*pmdp); -- cgit v1.2.3