From e275d2109cdaea8b4554b9eb8a828bdb8f8ba068 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 26 Oct 2020 10:08:47 +0200 Subject: bus: ti-sysc: Fix reset status check for modules with quirks Commit d46f9fbec719 ("bus: ti-sysc: Use optional clocks on for enable and wait for softreset bit") started showing a "OCP softreset timed out" warning on enable if the interconnect target module is not out of reset. This caused the warning to be often triggered for i2c and hdq while the devices are working properly. Turns out that some interconnect target modules seem to have an unusable reset status bits unless the module specific reset quirks are activated. Let's just skip the reset status check for those modules as we only want to activate the reset quirks when doing a reset, and not on enable. This way we don't see the bogus "OCP softreset timed out" warnings during boot. Fixes: d46f9fbec719 ("bus: ti-sysc: Use optional clocks on for enable and wait for softreset bit") Signed-off-by: Tony Lindgren --- include/linux/platform_data/ti-sysc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h index c59999ce044e..240dce553a0b 100644 --- a/include/linux/platform_data/ti-sysc.h +++ b/include/linux/platform_data/ti-sysc.h @@ -50,6 +50,7 @@ struct sysc_regbits { s8 emufree_shift; }; +#define SYSC_MODULE_QUIRK_ENA_RESETDONE BIT(25) #define SYSC_MODULE_QUIRK_PRUSS BIT(24) #define SYSC_MODULE_QUIRK_DSS_RESET BIT(23) #define SYSC_MODULE_QUIRK_RTC_UNLOCK BIT(22) -- cgit v1.2.3 From cef397038167ac15d085914493d6c86385773709 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 11 Nov 2020 17:52:58 +0100 Subject: arch: pgtable: define MAX_POSSIBLE_PHYSMEM_BITS where needed Stefan Agner reported a bug when using zsram on 32-bit Arm machines with RAM above the 4GB address boundary: Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = a27bd01c [00000000] *pgd=236a0003, *pmd=1ffa64003 Internal error: Oops: 207 [#1] SMP ARM Modules linked in: mdio_bcm_unimac(+) brcmfmac cfg80211 brcmutil raspberrypi_hwmon hci_uart crc32_arm_ce bcm2711_thermal phy_generic genet CPU: 0 PID: 123 Comm: mkfs.ext4 Not tainted 5.9.6 #1 Hardware name: BCM2711 PC is at zs_map_object+0x94/0x338 LR is at zram_bvec_rw.constprop.0+0x330/0xa64 pc : [] lr : [] psr: 60000013 sp : e376bbe0 ip : 00000000 fp : c1e2921c r10: 00000002 r9 : c1dda730 r8 : 00000000 r7 : e8ff7a00 r6 : 00000000 r5 : 02f9ffa0 r4 : e3710000 r3 : 000fdffe r2 : c1e0ce80 r1 : ebf979a0 r0 : 00000000 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user Control: 30c5383d Table: 235c2a80 DAC: fffffffd Process mkfs.ext4 (pid: 123, stack limit = 0x495a22e6) Stack: (0xe376bbe0 to 0xe376c000) As it turns out, zsram needs to know the maximum memory size, which is defined in MAX_PHYSMEM_BITS when CONFIG_SPARSEMEM is set, or in MAX_POSSIBLE_PHYSMEM_BITS on the x86 architecture. The same problem will be hit on all 32-bit architectures that have a physical address space larger than 4GB and happen to not enable sparsemem and include asm/sparsemem.h from asm/pgtable.h. After the initial discussion, I suggested just always defining MAX_POSSIBLE_PHYSMEM_BITS whenever CONFIG_PHYS_ADDR_T_64BIT is set, or provoking a build error otherwise. This addresses all configurations that can currently have this runtime bug, but leaves all other configurations unchanged. I looked up the possible number of bits in source code and datasheets, here is what I found: - on ARC, CONFIG_ARC_HAS_PAE40 controls whether 32 or 40 bits are used - on ARM, CONFIG_LPAE enables 40 bit addressing, without it we never support more than 32 bits, even though supersections in theory allow up to 40 bits as well. - on MIPS, some MIPS32r1 or later chips support 36 bits, and MIPS32r5 XPA supports up to 60 bits in theory, but 40 bits are more than anyone will ever ship - On PowerPC, there are three different implementations of 36 bit addressing, but 32-bit is used without CONFIG_PTE_64BIT - On RISC-V, the normal page table format can support 34 bit addressing. There is no highmem support on RISC-V, so anything above 2GB is unused, but it might be useful to eventually support CONFIG_ZRAM for high pages. Fixes: 61989a80fb3a ("staging: zsmalloc: zsmalloc memory allocation library") Fixes: 02390b87a945 ("mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS") Acked-by: Thomas Bogendoerfer Reviewed-by: Stefan Agner Tested-by: Stefan Agner Acked-by: Mike Rapoport Link: https://lore.kernel.org/linux-mm/bdfa44bf1c570b05d6c70898e2bbb0acf234ecdf.1604762181.git.stefan@agner.ch/ Signed-off-by: Arnd Bergmann --- arch/arc/include/asm/pgtable.h | 2 ++ arch/arm/include/asm/pgtable-2level.h | 2 ++ arch/arm/include/asm/pgtable-3level.h | 2 ++ arch/mips/include/asm/pgtable-32.h | 3 +++ arch/powerpc/include/asm/book3s/32/pgtable.h | 2 ++ arch/powerpc/include/asm/nohash/32/pgtable.h | 2 ++ arch/riscv/include/asm/pgtable-32.h | 2 ++ include/linux/pgtable.h | 13 +++++++++++++ 8 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index f1ed17edb085..163641726a2b 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -134,8 +134,10 @@ #ifdef CONFIG_ARC_HAS_PAE40 #define PTE_BITS_NON_RWX_IN_PD1 (0xff00000000 | PAGE_MASK | _PAGE_CACHEABLE) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 #else #define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK | _PAGE_CACHEABLE) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /************************************************************************** diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 3502c2f746ca..baf7d0204eb5 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -75,6 +75,8 @@ #define PTE_HWTABLE_OFF (PTE_HWTABLE_PTRS * sizeof(pte_t)) #define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u32)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 + /* * PMD_SHIFT determines the size of the area a second-level page table can map * PGDIR_SHIFT determines what a third-level page table entry can map diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index fbb6693c3352..2b85d175e999 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -25,6 +25,8 @@ #define PTE_HWTABLE_OFF (0) #define PTE_HWTABLE_SIZE (PTRS_PER_PTE * sizeof(u64)) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 + /* * PGDIR_SHIFT determines the size a top-level page table entry can map. */ diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index a950fc1ddb4d..6c0532d7b211 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -154,6 +154,7 @@ static inline void pmd_clear(pmd_t *pmdp) #if defined(CONFIG_XPA) +#define MAX_POSSIBLE_PHYSMEM_BITS 40 #define pte_pfn(x) (((unsigned long)((x).pte_high >> _PFN_SHIFT)) | (unsigned long)((x).pte_low << _PAGE_PRESENT_SHIFT)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) @@ -169,6 +170,7 @@ pfn_pte(unsigned long pfn, pgprot_t prot) #elif defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #define pte_pfn(x) ((unsigned long)((x).pte_high >> 6)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) @@ -183,6 +185,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #else +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #ifdef CONFIG_CPU_VR41XX #define pte_pfn(x) ((unsigned long)((x).pte >> (PAGE_SHIFT + 2))) #define pfn_pte(pfn, prot) __pte(((pfn) << (PAGE_SHIFT + 2)) | pgprot_val(prot)) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 36443cda8dcf..1376be95e975 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -36,8 +36,10 @@ static inline bool pte_user(pte_t pte) */ #ifdef CONFIG_PTE_64BIT #define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #else #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /* diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index ee2243ba96cf..96522f7f0618 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -153,8 +153,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); */ #if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) #define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 36 #else #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif /* diff --git a/arch/riscv/include/asm/pgtable-32.h b/arch/riscv/include/asm/pgtable-32.h index b0ab66e5fdb1..5b2e79e5bfa5 100644 --- a/arch/riscv/include/asm/pgtable-32.h +++ b/arch/riscv/include/asm/pgtable-32.h @@ -14,4 +14,6 @@ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) +#define MAX_POSSIBLE_PHYSMEM_BITS 34 + #endif /* _ASM_RISCV_PGTABLE_32_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 71125a4676c4..e237004d498d 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1427,6 +1427,19 @@ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ +#if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) +#ifdef CONFIG_PHYS_ADDR_T_64BIT +/* + * ZSMALLOC needs to know the highest PFN on 32-bit architectures + * with physical address space extension, but falls back to + * BITS_PER_LONG otherwise. + */ +#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition +#else +#define MAX_POSSIBLE_PHYSMEM_BITS 32 +#endif +#endif + #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 -- cgit v1.2.3 From e1cef2d4c379b2aab43a7dc9601f645048209090 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 19 Nov 2020 14:53:40 +0900 Subject: tools/bootconfig: Align the bootconfig applied initrd image size to 4 Align the bootconfig applied initrd image size to 4. To fill the gap, the bootconfig command uses null characters in between the bootconfig data and the footer. This will expands the footer size but don't change the checksum. Thus the block image of the initrd file with bootconfig is as follows. [initrd][bootconfig][(pad)][size][csum]["#BOOTCONFIG\n"] Link: https://lkml.kernel.org/r/160576522046.320071.8550680670010950634.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 3 ++ tools/bootconfig/main.c | 57 ++++++++++++++++++++++--------------- tools/bootconfig/test-bootconfig.sh | 6 +++- 3 files changed, 42 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index 9903088891fa..2696eb0fc149 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -12,6 +12,9 @@ #define BOOTCONFIG_MAGIC "#BOOTCONFIG\n" #define BOOTCONFIG_MAGIC_LEN 12 +#define BOOTCONFIG_ALIGN_SHIFT 2 +#define BOOTCONFIG_ALIGN (1 << BOOTCONFIG_ALIGN_SHIFT) +#define BOOTCONFIG_ALIGN_MASK (BOOTCONFIG_ALIGN - 1) /* XBC tree node */ struct xbc_node { diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index a0733cbb3c49..4a445b6304bb 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -337,12 +337,13 @@ static int delete_xbc(const char *path) static int apply_xbc(const char *path, const char *xbc_path) { + char *buf, *data, *p; + size_t total_size; struct stat stat; + const char *msg; u32 size, csum; - char *buf, *data; + int pos, pad; int ret, fd; - const char *msg; - int pos; ret = load_xbc_file(xbc_path, &buf); if (ret < 0) { @@ -352,13 +353,12 @@ static int apply_xbc(const char *path, const char *xbc_path) size = strlen(buf) + 1; csum = checksum((unsigned char *)buf, size); - /* Prepare xbc_path data */ - data = malloc(size + 8); + /* Backup the bootconfig data */ + data = calloc(size + BOOTCONFIG_ALIGN + + sizeof(u32) + sizeof(u32) + BOOTCONFIG_MAGIC_LEN, 1); if (!data) return -ENOMEM; - strcpy(data, buf); - *(u32 *)(data + size) = size; - *(u32 *)(data + size + 4) = csum; + memcpy(data, buf, size); /* Check the data format */ ret = xbc_init(buf, &msg, &pos); @@ -399,24 +399,35 @@ static int apply_xbc(const char *path, const char *xbc_path) pr_err("Failed to get the size of %s\n", path); goto out; } - ret = write(fd, data, size + 8); - if (ret < size + 8) { + + /* To align up the total size to BOOTCONFIG_ALIGN, get padding size */ + total_size = stat.st_size + size + sizeof(u32) * 2 + BOOTCONFIG_MAGIC_LEN; + pad = ((total_size + BOOTCONFIG_ALIGN - 1) & (~BOOTCONFIG_ALIGN_MASK)) - total_size; + size += pad; + + /* Add a footer */ + p = data + size; + *(u32 *)p = size; + p += sizeof(u32); + + *(u32 *)p = csum; + p += sizeof(u32); + + memcpy(p, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); + p += BOOTCONFIG_MAGIC_LEN; + + total_size = p - data; + + ret = write(fd, data, total_size); + if (ret < total_size) { if (ret < 0) ret = -errno; pr_err("Failed to apply a boot config: %d\n", ret); - if (ret < 0) - goto out; - goto out_rollback; - } - /* Write a magic word of the bootconfig */ - ret = write(fd, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN); - if (ret < BOOTCONFIG_MAGIC_LEN) { - if (ret < 0) - ret = -errno; - pr_err("Failed to apply a boot config magic: %d\n", ret); - goto out_rollback; - } - ret = 0; + if (ret >= 0) + goto out_rollback; + } else + ret = 0; + out: close(fd); free(data); diff --git a/tools/bootconfig/test-bootconfig.sh b/tools/bootconfig/test-bootconfig.sh index d295e406a756..baed891d0ba4 100755 --- a/tools/bootconfig/test-bootconfig.sh +++ b/tools/bootconfig/test-bootconfig.sh @@ -9,6 +9,7 @@ else TESTDIR=. fi BOOTCONF=${TESTDIR}/bootconfig +ALIGN=4 INITRD=`mktemp ${TESTDIR}/initrd-XXXX` TEMPCONF=`mktemp ${TESTDIR}/temp-XXXX.bconf` @@ -59,7 +60,10 @@ echo "Show command test" xpass $BOOTCONF $INITRD echo "File size check" -xpass test $new_size -eq $(expr $bconf_size + $initrd_size + 9 + 12) +total_size=$(expr $bconf_size + $initrd_size + 9 + 12 + $ALIGN - 1 ) +total_size=$(expr $total_size / $ALIGN) +total_size=$(expr $total_size \* $ALIGN) +xpass test $new_size -eq $total_size echo "Apply command repeat test" xpass $BOOTCONF -a $TEMPCONF $INITRD -- cgit v1.2.3 From d549699048b4b5c22dd710455bcdb76966e55aa3 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 21 Nov 2020 08:28:17 +0200 Subject: net/packet: fix packet receive on L3 devices without visible hard header In the patchset merged by commit b9fcf0a0d826 ("Merge branch 'support-AF_PACKET-for-layer-3-devices'") L3 devices which did not have header_ops were given one for the purpose of protocol parsing on af_packet transmit path. That change made af_packet receive path regard these devices as having a visible L3 header and therefore aligned incoming skb->data to point to the skb's mac_header. Some devices, such as ipip, xfrmi, and others, do not reset their mac_header prior to ingress and therefore their incoming packets became malformed. Ideally these devices would reset their mac headers, or af_packet would be able to rely on dev->hard_header_len being 0 for such cases, but it seems this is not the case. Fix by changing af_packet RX ll visibility criteria to include the existence of a '.create()' header operation, which is used when creating a device hard header - via dev_hard_header() - by upper layers, and does not exist in these L3 devices. As this predicate may be useful in other situations, add it as a common dev_has_header() helper in netdevice.h. Fixes: b9fcf0a0d826 ("Merge branch 'support-AF_PACKET-for-layer-3-devices'") Signed-off-by: Eyal Birger Acked-by: Jason A. Donenfeld Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20201121062817.3178900-1-eyal.birger@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 +++++ net/packet/af_packet.c | 18 +++++++++--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 964b494b0e8d..fa275a054f46 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3137,6 +3137,11 @@ static inline bool dev_validate_header(const struct net_device *dev, return false; } +static inline bool dev_has_header(const struct net_device *dev) +{ + return dev->header_ops && dev->header_ops->create; +} + typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr, int len, int size); int register_gifconf(unsigned int family, gifconf_func_t *gifconf); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index cefbd50c1090..7a18ffff8551 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -93,8 +93,8 @@ /* Assumptions: - - If the device has no dev->header_ops, there is no LL header visible - above the device. In this case, its hard_header_len should be 0. + - If the device has no dev->header_ops->create, there is no LL header + visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. @@ -108,26 +108,26 @@ On receive: ----------- -Incoming, dev->header_ops != NULL +Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data -Outgoing, dev->header_ops != NULL +Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header -Incoming, dev->header_ops == NULL +Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data -Outgoing, dev->header_ops == NULL +Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume - If dev->header_ops == NULL we are unable to restore the ll header, + If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. @@ -2069,7 +2069,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; - if (dev->header_ops) { + if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * @@ -2198,7 +2198,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; - if (dev->header_ops) { + if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { -- cgit v1.2.3 From acfdd18591eaac25446e976a0c0d190f8b3dbfb1 Mon Sep 17 00:00:00 2001 From: Amit Sunil Dhamne Date: Mon, 23 Nov 2020 21:52:41 -0800 Subject: firmware: xilinx: Use hash-table for api feature check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently array of fix length PM_API_MAX is used to cache the pm_api version (valid or invalid). However ATF based PM APIs values are much higher then PM_API_MAX. So to include ATF based PM APIs also, use hash-table to store the pm_api version status. Signed-off-by: Amit Sunil Dhamne Reported-by: Arnd Bergmann  Signed-off-by: Ravi Patel Signed-off-by: Rajan Vaja Reviewed-by: Arnd Bergmann Tested-by: Michal Simek Fixes: f3217d6f2f7a ("firmware: xilinx: fix out-of-bounds access") Cc: stable Link: https://lore.kernel.org/r/1606197161-25976-1-git-send-email-rajan.vaja@xilinx.com Signed-off-by: Michal Simek --- drivers/firmware/xilinx/zynqmp.c | 63 ++++++++++++++++++++++++++++-------- include/linux/firmware/xlnx-zynqmp.h | 4 --- 2 files changed, 49 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/xilinx/zynqmp.c b/drivers/firmware/xilinx/zynqmp.c index 5828302c7c5c..d08ac824c993 100644 --- a/drivers/firmware/xilinx/zynqmp.c +++ b/drivers/firmware/xilinx/zynqmp.c @@ -20,12 +20,28 @@ #include #include #include +#include #include #include "zynqmp-debug.h" +/* Max HashMap Order for PM API feature check (1<<7 = 128) */ +#define PM_API_FEATURE_CHECK_MAX_ORDER 7 + static bool feature_check_enabled; -static u32 zynqmp_pm_features[PM_API_MAX]; +DEFINE_HASHTABLE(pm_api_features_map, PM_API_FEATURE_CHECK_MAX_ORDER); + +/** + * struct pm_api_feature_data - PM API Feature data + * @pm_api_id: PM API Id, used as key to index into hashmap + * @feature_status: status of PM API feature: valid, invalid + * @hentry: hlist_node that hooks this entry into hashtable + */ +struct pm_api_feature_data { + u32 pm_api_id; + int feature_status; + struct hlist_node hentry; +}; static const struct mfd_cell firmware_devs[] = { { @@ -142,29 +158,37 @@ static int zynqmp_pm_feature(u32 api_id) int ret; u32 ret_payload[PAYLOAD_ARG_CNT]; u64 smc_arg[2]; + struct pm_api_feature_data *feature_data; if (!feature_check_enabled) return 0; - /* Return value if feature is already checked */ - if (api_id > ARRAY_SIZE(zynqmp_pm_features)) - return PM_FEATURE_INVALID; + /* Check for existing entry in hash table for given api */ + hash_for_each_possible(pm_api_features_map, feature_data, hentry, + api_id) { + if (feature_data->pm_api_id == api_id) + return feature_data->feature_status; + } - if (zynqmp_pm_features[api_id] != PM_FEATURE_UNCHECKED) - return zynqmp_pm_features[api_id]; + /* Add new entry if not present */ + feature_data = kmalloc(sizeof(*feature_data), GFP_KERNEL); + if (!feature_data) + return -ENOMEM; + feature_data->pm_api_id = api_id; smc_arg[0] = PM_SIP_SVC | PM_FEATURE_CHECK; smc_arg[1] = api_id; ret = do_fw_call(smc_arg[0], smc_arg[1], 0, ret_payload); - if (ret) { - zynqmp_pm_features[api_id] = PM_FEATURE_INVALID; - return PM_FEATURE_INVALID; - } + if (ret) + ret = -EOPNOTSUPP; + else + ret = ret_payload[1]; - zynqmp_pm_features[api_id] = ret_payload[1]; + feature_data->feature_status = ret; + hash_add(pm_api_features_map, &feature_data->hentry, api_id); - return zynqmp_pm_features[api_id]; + return ret; } /** @@ -200,9 +224,12 @@ int zynqmp_pm_invoke_fn(u32 pm_api_id, u32 arg0, u32 arg1, * Make sure to stay in x0 register */ u64 smc_arg[4]; + int ret; - if (zynqmp_pm_feature(pm_api_id) == PM_FEATURE_INVALID) - return -ENOTSUPP; + /* Check if feature is supported or not */ + ret = zynqmp_pm_feature(pm_api_id); + if (ret < 0) + return ret; smc_arg[0] = PM_SIP_SVC | pm_api_id; smc_arg[1] = ((u64)arg1 << 32) | arg0; @@ -1252,9 +1279,17 @@ static int zynqmp_firmware_probe(struct platform_device *pdev) static int zynqmp_firmware_remove(struct platform_device *pdev) { + struct pm_api_feature_data *feature_data; + int i; + mfd_remove_devices(&pdev->dev); zynqmp_pm_api_debugfs_exit(); + hash_for_each(pm_api_features_map, i, feature_data, hentry) { + hash_del(&feature_data->hentry); + kfree(feature_data); + } + return 0; } diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 5968df82b991..41a1bab98b7e 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -50,10 +50,6 @@ #define ZYNQMP_PM_CAPABILITY_WAKEUP 0x4U #define ZYNQMP_PM_CAPABILITY_UNUSABLE 0x8U -/* Feature check status */ -#define PM_FEATURE_INVALID -1 -#define PM_FEATURE_UNCHECKED 0 - /* * Firmware FPGA Manager flags * XILINX_ZYNQMP_PM_FPGA_FULL: FPGA full reconfiguration -- cgit v1.2.3 From 36ccdf85829a7dd6936dba5d02fa50138471f0d3 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 23 Nov 2020 18:56:00 +0100 Subject: net, xsk: Avoid taking multiple skbuff references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") addressed the problem that packets were discarded from the Tx AF_XDP ring, when the driver returned NETDEV_TX_BUSY. Part of the fix was bumping the skbuff reference count, so that the buffer would not be freed by dev_direct_xmit(). A reference count larger than one means that the skbuff is "shared", which is not the case. If the "shared" skbuff is sent to the generic XDP receive path, netif_receive_generic_xdp(), and pskb_expand_head() is entered the BUG_ON(skb_shared(skb)) will trigger. This patch adds a variant to dev_direct_xmit(), __dev_direct_xmit(), where a user can select the skbuff free policy. This allows AF_XDP to avoid bumping the reference count, but still keep the NETDEV_TX_BUSY behavior. Fixes: 642e450b6b59 ("xsk: Do not discard packet when NETDEV_TX_BUSY") Reported-by: Yonghong Song Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201123175600.146255-1-bjorn.topel@gmail.com --- include/linux/netdevice.h | 14 +++++++++++++- net/core/dev.c | 8 ++------ net/xdp/xsk.c | 8 +------- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 964b494b0e8d..76775abf259d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2813,9 +2813,21 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); + int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); + +static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +{ + int ret; + + ret = __dev_direct_xmit(skb, queue_id); + if (!dev_xmit_complete(ret)) + kfree_skb(skb); + return ret; +} + int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); diff --git a/net/core/dev.c b/net/core/dev.c index 82dc6b48e45f..8588ade790cb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4180,7 +4180,7 @@ int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) } EXPORT_SYMBOL(dev_queue_xmit_accel); -int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) +int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { struct net_device *dev = skb->dev; struct sk_buff *orig_skb = skb; @@ -4210,17 +4210,13 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) dev_xmit_recursion_dec(); local_bh_enable(); - - if (!dev_xmit_complete(ret)) - kfree_skb(skb); - return ret; drop: atomic_long_inc(&dev->tx_dropped); kfree_skb_list(skb); return NET_XMIT_DROP; } -EXPORT_SYMBOL(dev_direct_xmit); +EXPORT_SYMBOL(__dev_direct_xmit); /************************************************************************* * Receiver routines diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5a6cdf7b320d..b7b039bd9d03 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -411,11 +411,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr; skb->destructor = xsk_destruct_skb; - /* Hinder dev_direct_xmit from freeing the packet and - * therefore completing it in the destructor - */ - refcount_inc(&skb->users); - err = dev_direct_xmit(skb, xs->queue_id); + err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ skb->destructor = sock_wfree; @@ -429,12 +425,10 @@ static int xsk_generic_xmit(struct sock *sk) /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ - kfree_skb(skb); err = -EBUSY; goto out; } - consume_skb(skb); sent_frame = true; } -- cgit v1.2.3 From 4df910620bebb5cfe234af16ac8f6474b60215fd Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 25 Nov 2020 13:22:21 +0800 Subject: mm: memcg: relayout structure mem_cgroup to avoid cache interference 0day reported one -22.7% regression for will-it-scale page_fault2 case [1] on a 4 sockets 144 CPU platform, and bisected to it to be caused by Waiman's optimization (commit bd0b230fe1) of saving one 'struct page_counter' space for 'struct mem_cgroup'. Initially we thought it was due to the cache alignment change introduced by the patch, but further debug shows that it is due to some hot data members ('vmstats_local', 'vmstats_percpu', 'vmstats') sit in 2 adjacent cacheline (2N and 2N+1 cacheline), and when adjacent cache line prefetch is enabled, it triggers an "extended level" of cache false sharing for 2 adjacent cache lines. So exchange the 2 member blocks, while keeping mostly the original cache alignment, which can restore and even enhance the performance, and save 64 bytes of space for 'struct mem_cgroup' (from 2880 to 2816, with 0day's default RHEL-8.3 kernel config) [1]. https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/ Fixes: bd0b230fe145 ("mm/memcg: unify swap and memsw page counters") Reported-by: kernel test robot Signed-off-by: Feng Tang Acked-by: Waiman Long Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a80c59af2c60..922a7f600465 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -282,20 +282,6 @@ struct mem_cgroup { MEMCG_PADDING(_pad1_); - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; - - /* Legacy local VM stats and events */ - struct memcg_vmstats_percpu __percpu *vmstats_local; - - /* Subtree VM stats and events (batched updates) */ - struct memcg_vmstats_percpu __percpu *vmstats_percpu; - - MEMCG_PADDING(_pad2_); - atomic_long_t vmstats[MEMCG_NR_STAT]; atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; @@ -317,6 +303,20 @@ struct mem_cgroup { struct list_head objcg_list; /* list of inherited objcgs */ #endif + MEMCG_PADDING(_pad2_); + + /* + * set > 0 if pages under this cgroup are moving to other cgroup. + */ + atomic_t moving_account; + struct task_struct *move_lock_task; + + /* Legacy local VM stats and events */ + struct memcg_vmstats_percpu __percpu *vmstats_local; + + /* Subtree VM stats and events (batched updates) */ + struct memcg_vmstats_percpu __percpu *vmstats_percpu; + #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; -- cgit v1.2.3 From bb4c6910c8b41623104c2e64a30615682689a54d Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 26 Nov 2020 09:28:51 +0100 Subject: genirq/irqdomain: Add an irq_create_mapping_affinity() function There is currently no way to convey the affinity of an interrupt via irq_create_mapping(), which creates issues for devices that expect that affinity to be managed by the kernel. In order to sort this out, rename irq_create_mapping() to irq_create_mapping_affinity() with an additional affinity parameter that can be passed down to irq_domain_alloc_descs(). irq_create_mapping() is re-implemented as a wrapper around irq_create_mapping_affinity(). No functional change. Fixes: e75eafb9b039 ("genirq/msi: Switch to new irq spreading infrastructure") Signed-off-by: Laurent Vivier Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kurz Cc: Michael Ellerman Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201126082852.1178497-2-lvivier@redhat.com --- include/linux/irqdomain.h | 12 ++++++++++-- kernel/irq/irqdomain.c | 13 ++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 71535e87109f..ea5a337e0f8b 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -384,11 +384,19 @@ extern void irq_domain_associate_many(struct irq_domain *domain, extern void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq); -extern unsigned int irq_create_mapping(struct irq_domain *host, - irq_hw_number_t hwirq); +extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity); extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); extern void irq_dispose_mapping(unsigned int virq); +static inline unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + return irq_create_mapping_affinity(host, hwirq, NULL); +} + + /** * irq_linear_revmap() - Find a linux irq from a hw irq number. * @domain: domain owning this hardware interrupt diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..e4ca69608f3b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -624,17 +624,19 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); /** - * irq_create_mapping() - Map a hardware interrupt into linux irq space + * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space + * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. */ -unsigned int irq_create_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq) +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { struct device_node *of_node; int virq; @@ -660,7 +662,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), + affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -676,7 +679,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } -EXPORT_SYMBOL_GPL(irq_create_mapping); +EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); /** * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs -- cgit v1.2.3 From d421e466c2373095f165ddd25cbabd6c5b077928 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 2 Dec 2020 20:39:46 -0800 Subject: net/mlx5: DR, Proper handling of unsupported Connect-X6DX SW steering STEs format for Connect-X5 and Connect-X6DX different. Currently, on Connext-X6DX the SW steering would break at some point when building STEs w/o giving a proper error message. Fix this by checking the STE format of the current device when initializing domain: add mlx5_ifc definitions for Connect-X6DX SW steering, read FW capability to get the current format version, and check this version when domain is being created. Fixes: 26d688e33f88 ("net/mlx5: DR, Add Steering entry (STE) utilities") Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c | 5 +++++ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h | 1 + include/linux/mlx5/mlx5_ifc.h | 9 ++++++++- 4 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 6bd34b293007..51bbd88ff021 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -92,6 +92,7 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, caps->eswitch_manager = MLX5_CAP_GEN(mdev, eswitch_manager); caps->gvmi = MLX5_CAP_GEN(mdev, vhca_id); caps->flex_protocols = MLX5_CAP_GEN(mdev, flex_parser_protocols); + caps->sw_format_ver = MLX5_CAP_GEN(mdev, steering_format_version); if (mlx5dr_matcher_supp_flex_parser_icmp_v4(caps)) { caps->flex_parser_id_icmp_dw0 = MLX5_CAP_GEN(mdev, flex_parser_id_icmp_dw0); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c index 890767a2a7cb..aa2c2d6c44e6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_domain.c @@ -223,6 +223,11 @@ static int dr_domain_caps_init(struct mlx5_core_dev *mdev, if (ret) return ret; + if (dmn->info.caps.sw_format_ver != MLX5_STEERING_FORMAT_CONNECTX_5) { + mlx5dr_err(dmn, "SW steering is not supported on this device\n"); + return -EOPNOTSUPP; + } + ret = dr_domain_query_fdb_caps(mdev, dmn); if (ret) return ret; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h index f50f3b107aa3..cf62ea4f882e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_types.h @@ -625,6 +625,7 @@ struct mlx5dr_cmd_caps { u8 max_ft_level; u16 roce_min_src_udp; u8 num_esw_ports; + u8 sw_format_ver; bool eswitch_manager; bool rx_sw_owner; bool tx_sw_owner; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index a092346c7b2d..233352447b1a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1223,6 +1223,11 @@ enum mlx5_fc_bulk_alloc_bitmask { #define MLX5_FC_BULK_NUM_FCS(fc_enum) (MLX5_FC_BULK_SIZE_FACTOR * (fc_enum)) +enum { + MLX5_STEERING_FORMAT_CONNECTX_5 = 0, + MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_0[0x30]; u8 vhca_id[0x10]; @@ -1521,7 +1526,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x20]; + u8 reserved_at_440[0x4]; + u8 steering_format_version[0x4]; + u8 create_qp_start_hint[0x18]; u8 reserved_at_460[0x3]; u8 log_max_uctx[0x5]; -- cgit v1.2.3 From c8bcd9c5be24fb9e6132e97da5a35e55a83e36b9 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 3 Dec 2020 02:25:05 +0100 Subject: tty: Fix ->session locking Currently, locking of ->session is very inconsistent; most places protect it using the legacy tty mutex, but disassociate_ctty(), __do_SAK(), tiocspgrp() and tiocgsid() don't. Two of the writers hold the ctrl_lock (because they already need it for ->pgrp), but __proc_set_tty() doesn't do that yet. On a PREEMPT=y system, an unprivileged user can theoretically abuse this broken locking to read 4 bytes of freed memory via TIOCGSID if tiocgsid() is preempted long enough at the right point. (Other things might also go wrong, especially if root-only ioctls are involved; I'm not sure about that.) Change the locking on ->session such that: - tty_lock() is held by all writers: By making disassociate_ctty() hold it. This should be fine because the same lock can already be taken through the call to tty_vhangup_session(). The tricky part is that we need to shorten the area covered by siglock to be able to take tty_lock() without ugly retry logic; as far as I can tell, this should be fine, since nothing in the signal_struct is touched in the `if (tty)` branch. - ctrl_lock is held by all writers: By changing __proc_set_tty() to hold the lock a little longer. - All readers that aren't holding tty_lock() hold ctrl_lock: By adding locking to tiocgsid() and __do_SAK(), and expanding the area covered by ctrl_lock in tiocspgrp(). Cc: stable@kernel.org Signed-off-by: Jann Horn Reviewed-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 7 ++++++- drivers/tty/tty_jobctrl.c | 44 +++++++++++++++++++++++++++++++------------- include/linux/tty.h | 4 ++++ 3 files changed, 41 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 9f8b9a567b35..56ade99ef99f 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2897,10 +2897,14 @@ void __do_SAK(struct tty_struct *tty) struct task_struct *g, *p; struct pid *session; int i; + unsigned long flags; if (!tty) return; - session = tty->session; + + spin_lock_irqsave(&tty->ctrl_lock, flags); + session = get_pid(tty->session); + spin_unlock_irqrestore(&tty->ctrl_lock, flags); tty_ldisc_flush(tty); @@ -2932,6 +2936,7 @@ void __do_SAK(struct tty_struct *tty) task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); + put_pid(session); #endif } diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c index baadeea4a289..aa6d0537b379 100644 --- a/drivers/tty/tty_jobctrl.c +++ b/drivers/tty/tty_jobctrl.c @@ -103,8 +103,8 @@ static void __proc_set_tty(struct tty_struct *tty) put_pid(tty->session); put_pid(tty->pgrp); tty->pgrp = get_pid(task_pgrp(current)); - spin_unlock_irqrestore(&tty->ctrl_lock, flags); tty->session = get_pid(task_session(current)); + spin_unlock_irqrestore(&tty->ctrl_lock, flags); if (current->signal->tty) { tty_debug(tty, "current tty %s not NULL!!\n", current->signal->tty->name); @@ -293,20 +293,23 @@ void disassociate_ctty(int on_exit) spin_lock_irq(¤t->sighand->siglock); put_pid(current->signal->tty_old_pgrp); current->signal->tty_old_pgrp = NULL; - tty = tty_kref_get(current->signal->tty); + spin_unlock_irq(¤t->sighand->siglock); + if (tty) { unsigned long flags; + + tty_lock(tty); spin_lock_irqsave(&tty->ctrl_lock, flags); put_pid(tty->session); put_pid(tty->pgrp); tty->session = NULL; tty->pgrp = NULL; spin_unlock_irqrestore(&tty->ctrl_lock, flags); + tty_unlock(tty); tty_kref_put(tty); } - spin_unlock_irq(¤t->sighand->siglock); /* Now clear signal->tty under the lock */ read_lock(&tasklist_lock); session_clear_tty(task_session(current)); @@ -477,14 +480,19 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t return -ENOTTY; if (retval) return retval; - if (!current->signal->tty || - (current->signal->tty != real_tty) || - (real_tty->session != task_session(current))) - return -ENOTTY; + if (get_user(pgrp_nr, p)) return -EFAULT; if (pgrp_nr < 0) return -EINVAL; + + spin_lock_irq(&real_tty->ctrl_lock); + if (!current->signal->tty || + (current->signal->tty != real_tty) || + (real_tty->session != task_session(current))) { + retval = -ENOTTY; + goto out_unlock_ctrl; + } rcu_read_lock(); pgrp = find_vpid(pgrp_nr); retval = -ESRCH; @@ -494,12 +502,12 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t if (session_of_pgrp(pgrp) != task_session(current)) goto out_unlock; retval = 0; - spin_lock_irq(&real_tty->ctrl_lock); put_pid(real_tty->pgrp); real_tty->pgrp = get_pid(pgrp); - spin_unlock_irq(&real_tty->ctrl_lock); out_unlock: rcu_read_unlock(); +out_unlock_ctrl: + spin_unlock_irq(&real_tty->ctrl_lock); return retval; } @@ -511,20 +519,30 @@ out_unlock: * * Obtain the session id of the tty. If there is no session * return an error. - * - * Locking: none. Reference to current->signal->tty is safe. */ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { + unsigned long flags; + pid_t sid; + /* * (tty == real_tty) is a cheap way of * testing if the tty is NOT a master pty. */ if (tty == real_tty && current->signal->tty != real_tty) return -ENOTTY; + + spin_lock_irqsave(&real_tty->ctrl_lock, flags); if (!real_tty->session) - return -ENOTTY; - return put_user(pid_vnr(real_tty->session), p); + goto err; + sid = pid_vnr(real_tty->session); + spin_unlock_irqrestore(&real_tty->ctrl_lock, flags); + + return put_user(sid, p); + +err: + spin_unlock_irqrestore(&real_tty->ctrl_lock, flags); + return -ENOTTY; } /* diff --git a/include/linux/tty.h b/include/linux/tty.h index a99e9b8e4e31..eb33d948788c 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -306,6 +306,10 @@ struct tty_struct { struct termiox *termiox; /* May be NULL for unsupported */ char name[64]; struct pid *pgrp; /* Protected by ctrl lock */ + /* + * Writes protected by both ctrl lock and legacy mutex, readers must use + * at least one of them. + */ struct pid *session; unsigned long flags; int count; -- cgit v1.2.3 From 3ee16db390b42b8a21f2ad2ea2518f3469c6e532 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 30 Nov 2020 10:57:43 -0500 Subject: dm: fix IO splitting Commit 882ec4e609c1 ("dm table: stack 'chunk_sectors' limit to account for target-specific splitting") caused a couple regressions: 1) Using lcm_not_zero() when stacking chunk_sectors was a bug because chunk_sectors must reflect the most limited of all devices in the IO stack. 2) DM targets that set max_io_len but that do _not_ provide an .iterate_devices method no longer had there IO split properly. And commit 5091cdec56fa ("dm: change max_io_len() to use blk_max_size_offset()") also caused a regression where DM no longer supported varied (per target) IO splitting. The implication being the potential for severely reduced performance for IO stacks that use a DM target like dm-cache to hide performance limitations of a slower device (e.g. one that requires 4K IO splitting). Coming full circle: Fix all these issues by discontinuing stacking chunk_sectors up using ti->max_io_len in dm_calculate_queue_limits(), add optional chunk_sectors override argument to blk_max_size_offset() and update DM's max_io_len() to pass ti->max_io_len to its blk_max_size_offset() call. Passing in an optional chunk_sectors override to blk_max_size_offset() allows for code reuse of block's centralized calculation for max IO size based on provided offset and split boundary. Fixes: 882ec4e609c1 ("dm table: stack 'chunk_sectors' limit to account for target-specific splitting") Fixes: 5091cdec56fa ("dm: change max_io_len() to use blk_max_size_offset()") Cc: stable@vger.kernel.org Reported-by: John Dorminy Reported-by: Bruce Johnston Reported-by: Kirill Tkhai Reviewed-by: John Dorminy Signed-off-by: Mike Snitzer Reviewed-by: Jens Axboe --- block/blk-merge.c | 2 +- drivers/md/dm-table.c | 5 ----- drivers/md/dm.c | 19 +++++++++++-------- include/linux/blkdev.h | 11 ++++++----- 4 files changed, 18 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index bcf5e4580603..97b7c2821565 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -144,7 +144,7 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { - unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); unsigned max_sectors = sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2073ee8d18f4..7eeb7c4169c9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -1449,10 +1448,6 @@ int dm_calculate_queue_limits(struct dm_table *table, zone_sectors = ti_limits.chunk_sectors; } - /* Stack chunk_sectors if target-specific splitting is required */ - if (ti->max_io_len) - ti_limits.chunk_sectors = lcm_not_zero(ti->max_io_len, - ti_limits.chunk_sectors); /* Set I/O hints portion of queue limits */ if (ti->type->io_hints) ti->type->io_hints(ti, &ti_limits); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 98866e725f25..f7eb3d2964f3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1039,15 +1039,18 @@ static sector_t max_io_len(struct dm_target *ti, sector_t sector) sector_t max_len; /* - * Does the target need to split even further? - * - q->limits.chunk_sectors reflects ti->max_io_len so - * blk_max_size_offset() provides required splitting. - * - blk_max_size_offset() also respects q->limits.max_sectors + * Does the target need to split IO even further? + * - varied (per target) IO splitting is a tenet of DM; this + * explains why stacked chunk_sectors based splitting via + * blk_max_size_offset() isn't possible here. So pass in + * ti->max_io_len to override stacked chunk_sectors. */ - max_len = blk_max_size_offset(ti->table->md->queue, - target_offset); - if (len > max_len) - len = max_len; + if (ti->max_io_len) { + max_len = blk_max_size_offset(ti->table->md->queue, + target_offset, ti->max_io_len); + if (len > max_len) + len = max_len; + } return len; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 639cae2c158b..24ae504cf77d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1073,11 +1073,12 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, * file system requests. */ static inline unsigned int blk_max_size_offset(struct request_queue *q, - sector_t offset) + sector_t offset, + unsigned int chunk_sectors) { - unsigned int chunk_sectors = q->limits.chunk_sectors; - - if (!chunk_sectors) + if (!chunk_sectors && q->limits.chunk_sectors) + chunk_sectors = q->limits.chunk_sectors; + else return q->limits.max_sectors; if (likely(is_power_of_2(chunk_sectors))) @@ -1101,7 +1102,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, req_op(rq) == REQ_OP_SECURE_ERASE) return blk_queue_get_max_sectors(q, req_op(rq)); - return min(blk_max_size_offset(q, offset), + return min(blk_max_size_offset(q, offset, 0), blk_queue_get_max_sectors(q, req_op(rq))); } -- cgit v1.2.3 From 65f33b35722952fa076811d5686bfd8a611a80fa Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 4 Dec 2020 17:21:03 -0500 Subject: block: fix incorrect branching in blk_max_size_offset() If non-zero 'chunk_sectors' is passed in to blk_max_size_offset() that override will be incorrectly ignored. Old blk_max_size_offset() branching, prior to commit 3ee16db390b4, must be used only if passed 'chunk_sectors' override is zero. Fixes: 3ee16db390b4 ("dm: fix IO splitting") Cc: stable@vger.kernel.org # 5.9 Reported-by: John Dorminy Signed-off-by: Mike Snitzer --- include/linux/blkdev.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 24ae504cf77d..033eb5f73b65 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1076,10 +1076,12 @@ static inline unsigned int blk_max_size_offset(struct request_queue *q, sector_t offset, unsigned int chunk_sectors) { - if (!chunk_sectors && q->limits.chunk_sectors) - chunk_sectors = q->limits.chunk_sectors; - else - return q->limits.max_sectors; + if (!chunk_sectors) { + if (q->limits.chunk_sectors) + chunk_sectors = q->limits.chunk_sectors; + else + return q->limits.max_sectors; + } if (likely(is_power_of_2(chunk_sectors))) chunk_sectors -= offset & (chunk_sectors - 1); -- cgit v1.2.3 From e91d8d78237de8d7120c320b3645b7100848f24d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 5 Dec 2020 22:14:51 -0800 Subject: mm/zsmalloc.c: drop ZSMALLOC_PGTABLE_MAPPING While I was doing zram testing, I found sometimes decompression failed since the compression buffer was corrupted. With investigation, I found below commit calls cond_resched unconditionally so it could make a problem in atomic context if the task is reschedule. BUG: sleeping function called from invalid context at mm/vmalloc.c:108 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 946, name: memhog 3 locks held by memhog/946: #0: ffff9d01d4b193e8 (&mm->mmap_lock#2){++++}-{4:4}, at: __mm_populate+0x103/0x160 #1: ffffffffa3d53de0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0xa98/0x1160 #2: ffff9d01d56b8110 (&zspage->lock){.+.+}-{3:3}, at: zs_map_object+0x8e/0x1f0 CPU: 0 PID: 946 Comm: memhog Not tainted 5.9.3-00011-gc5bfc0287345-dirty #316 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 Call Trace: unmap_kernel_range_noflush+0x2eb/0x350 unmap_kernel_range+0x14/0x30 zs_unmap_object+0xd5/0xe0 zram_bvec_rw.isra.0+0x38c/0x8e0 zram_rw_page+0x90/0x101 bdev_write_page+0x92/0xe0 __swap_writepage+0x94/0x4a0 pageout+0xe3/0x3a0 shrink_page_list+0xb94/0xd60 shrink_inactive_list+0x158/0x460 We can fix this by removing the ZSMALLOC_PGTABLE_MAPPING feature (which contains the offending calling code) from zsmalloc. Even though this option showed some amount improvement(e.g., 30%) in some arm32 platforms, it has been headache to maintain since it have abused APIs[1](e.g., unmap_kernel_range in atomic context). Since we are approaching to deprecate 32bit machines and already made the config option available for only builtin build since v5.8, lastly it has been not default option in zsmalloc, it's time to drop the option for better maintenance. [1] http://lore.kernel.org/linux-mm/20201105170249.387069-1-minchan@kernel.org Fixes: e47110e90584 ("mm/vunmap: add cond_resched() in vunmap_pmd_range") Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Sergey Senozhatsky Cc: Tony Lindgren Cc: Christoph Hellwig Cc: Harish Sriram Cc: Uladzislau Rezki Cc: Link: https://lkml.kernel.org/r/20201117202916.GA3856507@google.com Signed-off-by: Linus Torvalds --- arch/arm/configs/omap2plus_defconfig | 1 - include/linux/zsmalloc.h | 1 - mm/Kconfig | 13 --------- mm/zsmalloc.c | 54 ------------------------------------ 4 files changed, 69 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 34793aabdb65..58df9fd79a76 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,6 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 0fdbf653b173..4807ca4d52e0 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,6 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..390165ffbb0f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -707,19 +707,6 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config ZSMALLOC_PGTABLE_MAPPING - bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC=y - help - By default, zsmalloc uses a copy-based object mapping method to - access allocations that span two pages. However, if a particular - architecture (ex, ARM) performs VM mapping faster than copying, - then you should select this. This causes zsmalloc to use page table - mapping rather than copying for object mapping. - - You can check speed with zsmalloc benchmark: - https://github.com/spartacus06/zsmapbench - config ZSMALLOC_STAT bool "Export zsmalloc statistics" depends on ZSMALLOC diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 918c7b019b3d..cdfaaadea8ff 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,11 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING - struct vm_struct *vm; /* vm area for mapping object that span pages */ -#else char *vm_buf; /* copy buffer for objects that span pages */ -#endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; @@ -1113,54 +1109,6 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm) - return 0; - area->vm = get_vm_area(PAGE_SIZE * 2, 0); - if (!area->vm) - return -ENOMEM; - - /* - * Populate ptes in advance to avoid pte allocation with GFP_KERNEL - * in non-preemtible context of zs_map_object. - */ - return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr, - PAGE_SIZE * 2, NULL, NULL); -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm) - free_vm_area(area->vm); - area->vm = NULL; -} - -static inline void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm->addr; - - BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); - area->vm_addr = area->vm->addr; - return area->vm_addr + off; -} - -static inline void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm_addr; - - unmap_kernel_range(addr, PAGE_SIZE * 2); -} - -#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1241,8 +1189,6 @@ out: pagefault_enable(); } -#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static int zs_cpu_prepare(unsigned int cpu) { struct mapping_area *area; -- cgit v1.2.3