From 875386b98857822b77ac7f95bdf367b70af5b78c Mon Sep 17 00:00:00 2001 From: Manish Rangankar Date: Mon, 21 Aug 2023 18:30:37 +0530 Subject: scsi: qla2xxx: Add Unsolicited LS Request and Response Support for NVMe Introduce infrastructure in the driver to support the processing of unsolicited LS (Link Service) requests. This will involve the utilization of a new pass-up of unsolicited FC-NVMe request IOCB interface. Unsolicited requests will be submitted to the NVMe transport layer through nvme_fc_rcv_ls_req(). Any received LS responses, which are sent using xmt_ls_rsp(), will be forwarded to the firmware through the existing Pass-Through IOCB interface, responsible for sending FC-NVMe Link Service requests and responses. Signed-off-by: Manish Rangankar Signed-off-by: Nilesh Javali Link: https://lore.kernel.org/r/20230821130045.34850-2-njavali@marvell.com Reviewed-by: Himanshu Madhani Signed-off-by: Martin K. Petersen --- include/linux/nvme-fc-driver.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 4109f1bd6128..f6ef8cf5d774 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -53,10 +53,10 @@ struct nvmefc_ls_req { void *rqstaddr; dma_addr_t rqstdma; - u32 rqstlen; + __le32 rqstlen; void *rspaddr; dma_addr_t rspdma; - u32 rsplen; + __le32 rsplen; u32 timeout; void *private; @@ -120,7 +120,7 @@ struct nvmefc_ls_req { struct nvmefc_ls_rsp { void *rspbuf; dma_addr_t rspdma; - u16 rsplen; + __le32 rsplen; void (*done)(struct nvmefc_ls_rsp *rsp); void *nvme_fc_private; /* LLDD is not to access !! */ -- cgit v1.2.3 From 9ffa7b92bc768609243d79fb0c0125b9704c7061 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Aug 2023 18:11:38 +0200 Subject: thermal: core: Clean up headers of thermal zone registration functions For consistency, add a missing thermal_zone_device_register_with_trips() stub for the CONFIG_THERMAL unset case, specify argument names in all of the thermal zone registration and unregistration function headers and make all of them use white space consistently. No intentional functional impact. Signed-off-by: Rafael J. Wysocki --- include/linux/thermal.h | 53 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index eb17495c8acc..a30643104f3f 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -300,16 +300,24 @@ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); #endif #ifdef CONFIG_THERMAL -struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, - void *, struct thermal_zone_device_ops *, - const struct thermal_zone_params *, int, int); - -void thermal_zone_device_unregister(struct thermal_zone_device *); - -struct thermal_zone_device * -thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int, int, - void *, struct thermal_zone_device_ops *, - const struct thermal_zone_params *, int, int); +struct thermal_zone_device *thermal_zone_device_register( + const char *type, + int num_trips, int mask, + void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay); + +struct thermal_zone_device *thermal_zone_device_register_with_trips( + const char *type, + struct thermal_trip *trips, + int num_trips, int mask, + void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay); + +void thermal_zone_device_unregister(struct thermal_zone_device *tz); void *thermal_zone_device_priv(struct thermal_zone_device *tzd); const char *thermal_zone_device_type(struct thermal_zone_device *tzd); @@ -351,14 +359,27 @@ int thermal_zone_device_disable(struct thermal_zone_device *tz); void thermal_zone_device_critical(struct thermal_zone_device *tz); #else static inline struct thermal_zone_device *thermal_zone_device_register( - const char *type, int trips, int mask, void *devdata, - struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, - int passive_delay, int polling_delay) + const char *type, + int num_trips, int mask, + void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay) +{ return ERR_PTR(-ENODEV); } + +static inline struct thermal_zone_device *thermal_zone_device_register_with_trips( + const char *type, + struct thermal_trip *trips, + int num_trips, int mask, + void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp, + int passive_delay, int polling_delay) { return ERR_PTR(-ENODEV); } -static inline void thermal_zone_device_unregister( - struct thermal_zone_device *tz) + +static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz) { } + static inline struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) -- cgit v1.2.3 From d332db8fc1a2dfb4738281b1d6d4ed20115dd9d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Aug 2023 18:13:35 +0200 Subject: thermal: core: Add function for registering tripless thermal zones Multiple callers of thermal_zone_device_register() don't pass any trips to it and they might use a shortened argument list for that, so add a special function with fewer arguments for this purpose. Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 11 +++++++++++ include/linux/thermal.h | 13 +++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 0bdde1ab5d8b..33ebec043800 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1400,6 +1400,17 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, int n } EXPORT_SYMBOL_GPL(thermal_zone_device_register); +struct thermal_zone_device *thermal_tripless_zone_device_register( + const char *type, + void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp) +{ + return thermal_zone_device_register_with_trips(type, NULL, 0, 0, devdata, + ops, tzp, 0, 0); +} +EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register); + void *thermal_zone_device_priv(struct thermal_zone_device *tzd) { return tzd->devdata; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index a30643104f3f..1a1433eb9b59 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -317,6 +317,12 @@ struct thermal_zone_device *thermal_zone_device_register_with_trips( const struct thermal_zone_params *tzp, int passive_delay, int polling_delay); +struct thermal_zone_device *thermal_tripless_zone_device_register( + const char *type, + void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp); + void thermal_zone_device_unregister(struct thermal_zone_device *tz); void *thermal_zone_device_priv(struct thermal_zone_device *tzd); @@ -377,6 +383,13 @@ static inline struct thermal_zone_device *thermal_zone_device_register_with_trip int passive_delay, int polling_delay) { return ERR_PTR(-ENODEV); } +static inline struct thermal_zone_device *thermal_tripless_zone_device_register( + const char *type, + void *devdata, + struct thermal_zone_device_ops *ops, + const struct thermal_zone_params *tzp) +{ return ERR_PTR(-ENODEV); } + static inline void thermal_zone_device_unregister(struct thermal_zone_device *tz) { } -- cgit v1.2.3 From edd220b33f479cf9dcda0bfefb2cb8c5902e9885 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 30 Aug 2023 18:16:29 +0200 Subject: thermal: core: Drop thermal_zone_device_register() There are no more users of thermal_zone_device_register(), so drop it from the core. Note that thermal_zone_device_register_with_trips() may be renamed to thermal_zone_device_register() in the future, but only after a grace period allowing all of the possible work in progress that may be using the latter to adjust. Signed-off-by: Rafael J. Wysocki --- drivers/thermal/thermal_core.c | 11 ----------- include/linux/thermal.h | 17 ----------------- 2 files changed, 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 33ebec043800..8717a3343512 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1389,17 +1389,6 @@ free_tz: } EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips); -struct thermal_zone_device *thermal_zone_device_register(const char *type, int ntrips, int mask, - void *devdata, struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, int passive_delay, - int polling_delay) -{ - return thermal_zone_device_register_with_trips(type, NULL, ntrips, mask, - devdata, ops, tzp, - passive_delay, polling_delay); -} -EXPORT_SYMBOL_GPL(thermal_zone_device_register); - struct thermal_zone_device *thermal_tripless_zone_device_register( const char *type, void *devdata, diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 1a1433eb9b59..c99440aac1a1 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -300,14 +300,6 @@ int thermal_acpi_critical_trip_temp(struct acpi_device *adev, int *ret_temp); #endif #ifdef CONFIG_THERMAL -struct thermal_zone_device *thermal_zone_device_register( - const char *type, - int num_trips, int mask, - void *devdata, - struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, - int passive_delay, int polling_delay); - struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, struct thermal_trip *trips, @@ -364,15 +356,6 @@ int thermal_zone_device_enable(struct thermal_zone_device *tz); int thermal_zone_device_disable(struct thermal_zone_device *tz); void thermal_zone_device_critical(struct thermal_zone_device *tz); #else -static inline struct thermal_zone_device *thermal_zone_device_register( - const char *type, - int num_trips, int mask, - void *devdata, - struct thermal_zone_device_ops *ops, - const struct thermal_zone_params *tzp, - int passive_delay, int polling_delay) -{ return ERR_PTR(-ENODEV); } - static inline struct thermal_zone_device *thermal_zone_device_register_with_trips( const char *type, struct thermal_trip *trips, -- cgit v1.2.3 From e7716c74e3882405f9eca16faa6cb1bf19995399 Mon Sep 17 00:00:00 2001 From: Philipp Stanner Date: Mon, 21 Aug 2023 10:21:29 +0200 Subject: xarray: Document necessary flag in alloc functions Adds a new line to the docstrings of functions wrapping __xa_alloc() and __xa_alloc_cyclic(), informing about the necessity of flag XA_FLAGS_ALLOC being set previously. The documentation so far says that functions wrapping __xa_alloc() and __xa_alloc_cyclic() are supposed to return either -ENOMEM or -EBUSY in case of an error. If the xarray has been initialized without the flag XA_FLAGS_ALLOC, however, they fail with a different, undocumented error code. As hinted at in Documentation/core-api/xarray.rst, wrappers around these functions should only be invoked when the flag has been set. The functions' documentation should reflect that as well. Signed-off-by: Philipp Stanner Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/xarray.h | 18 ++++++++++++++++++ lib/xarray.c | 6 ++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 741703b45f61..cb571dfcf4b1 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -856,6 +856,9 @@ static inline int __must_check xa_insert_irq(struct xarray *xa, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or @@ -886,6 +889,9 @@ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or @@ -916,6 +922,9 @@ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or @@ -949,6 +958,9 @@ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, * The search for an empty entry will start at @next and will wrap * around if necessary. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the @@ -983,6 +995,9 @@ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, * The search for an empty entry will start at @next and will wrap * around if necessary. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the @@ -1017,6 +1032,9 @@ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, * The search for an empty entry will start at @next and will wrap * around if necessary. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the diff --git a/lib/xarray.c b/lib/xarray.c index 142e36f9dfda..39f07bfc4dcc 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1802,6 +1802,9 @@ EXPORT_SYMBOL(xa_get_order); * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or @@ -1850,6 +1853,9 @@ EXPORT_SYMBOL(__xa_alloc); * The search for an empty entry will start at @next and will wrap * around if necessary. * + * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set + * in xa_init_flags(). + * * Context: Any context. Expects xa_lock to be held on entry. May * release and reacquire xa_lock if @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the -- cgit v1.2.3 From 8f3f06dfd6873135068ccf1a0b386308e8c4da38 Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: raid6: Add LoongArch SIMD syndrome calculation The algorithms work on 64 bytes at a time, which is the L1 cache line size of all current and future LoongArch cores (that we care about), as confirmed by Huacai. The code is based on the generic int.uc algorithm, unrolled 4 times for LSX and 2 times for LASX. Further unrolling does not meaningfully improve the performance according to experiments. Performance numbers measured during system boot on a 3A5000 @ 2.5GHz: > raid6: lasx gen() 12726 MB/s > raid6: lsx gen() 10001 MB/s > raid6: int64x8 gen() 2876 MB/s > raid6: int64x4 gen() 3867 MB/s > raid6: int64x2 gen() 2531 MB/s > raid6: int64x1 gen() 1945 MB/s Comparison of xor() speeds (from different boots but meaningful anyway): > lasx: 11226 MB/s > lsx: 6395 MB/s > int64x4: 2147 MB/s Performance as measured by raid6test: > raid6: lasx gen() 25109 MB/s > raid6: lsx gen() 13233 MB/s > raid6: int64x8 gen() 4164 MB/s > raid6: int64x4 gen() 6005 MB/s > raid6: int64x2 gen() 5781 MB/s > raid6: int64x1 gen() 4119 MB/s > raid6: using algorithm lasx gen() 25109 MB/s > raid6: .... xor() 14439 MB/s, rmw enabled Acked-by: Song Liu Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- include/linux/raid/pq.h | 2 + lib/raid6/Makefile | 1 + lib/raid6/algos.c | 8 + lib/raid6/loongarch.h | 38 ++++ lib/raid6/loongarch_simd.c | 422 +++++++++++++++++++++++++++++++++++++++++++++ lib/raid6/test/Makefile | 12 ++ 6 files changed, 483 insertions(+) create mode 100644 lib/raid6/loongarch.h create mode 100644 lib/raid6/loongarch_simd.c (limited to 'include/linux') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index f29aaaf2eb21..874447485848 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -108,6 +108,8 @@ extern const struct raid6_calls raid6_vpermxor1; extern const struct raid6_calls raid6_vpermxor2; extern const struct raid6_calls raid6_vpermxor4; extern const struct raid6_calls raid6_vpermxor8; +extern const struct raid6_calls raid6_lsx; +extern const struct raid6_calls raid6_lasx; struct raid6_recov_calls { void (*data2)(int, size_t, int, int, void **); diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 45e17619422b..2b9ebe105480 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -9,6 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o +raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o hostprogs += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index a22a05c9af8a..739c7ebcae1a 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -73,6 +73,14 @@ const struct raid6_calls * const raid6_algos[] = { &raid6_neonx2, &raid6_neonx1, #endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_lsx, +#endif +#endif #if defined(__ia64__) &raid6_intx32, &raid6_intx16, diff --git a/lib/raid6/loongarch.h b/lib/raid6/loongarch.h new file mode 100644 index 000000000000..acfc33ce7056 --- /dev/null +++ b/lib/raid6/loongarch.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2023 WANG Xuerui + * + * raid6/loongarch.h + * + * Definitions common to LoongArch RAID-6 code only + */ + +#ifndef _LIB_RAID6_LOONGARCH_H +#define _LIB_RAID6_LOONGARCH_H + +#ifdef __KERNEL__ + +#include +#include + +#else /* for user-space testing */ + +#include + +/* have to supply these defines for glibc 2.37- and musl */ +#ifndef HWCAP_LOONGARCH_LSX +#define HWCAP_LOONGARCH_LSX (1 << 4) +#endif +#ifndef HWCAP_LOONGARCH_LASX +#define HWCAP_LOONGARCH_LASX (1 << 5) +#endif + +#define kernel_fpu_begin() +#define kernel_fpu_end() + +#define cpu_has_lsx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LSX) +#define cpu_has_lasx (getauxval(AT_HWCAP) & HWCAP_LOONGARCH_LASX) + +#endif /* __KERNEL__ */ + +#endif /* _LIB_RAID6_LOONGARCH_H */ diff --git a/lib/raid6/loongarch_simd.c b/lib/raid6/loongarch_simd.c new file mode 100644 index 000000000000..aa5d9f924ca3 --- /dev/null +++ b/lib/raid6/loongarch_simd.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX) + * + * Copyright 2023 WANG Xuerui + * + * Based on the generic RAID-6 code (int.uc): + * + * Copyright 2002-2004 H. Peter Anvin + */ + +#include +#include "loongarch.h" + +/* + * The vector algorithms are currently priority 0, which means the generic + * scalar algorithms are not being disabled if vector support is present. + * This is like the similar LoongArch RAID5 XOR code, with the main reason + * repeated here: it cannot be ruled out at this point of time, that some + * future (maybe reduced) models could run the vector algorithms slower than + * the scalar ones, maybe for errata or micro-op reasons. It may be + * appropriate to revisit this after one or two more uarch generations. + */ + +#ifdef CONFIG_CPU_HAS_LSX +#define NSIZE 16 + +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1])); + asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2])); + asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1])); + asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2])); + asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3])); + } + + kernel_fpu_end(); +} + +static void raid6_lsx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $vr0, $vr1, $vr2, $vr3: wp + * $vr4, $vr5, $vr6, $vr7: wq + * $vr8, $vr9, $vr10, $vr11: wd + * $vr12, $vr13, $vr14, $vr15: w2 + * $vr16, $vr17, $vr18, $vr19: w1 + */ + for (d = 0; d < bytes; d += NSIZE*4) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE])); + asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE])); + asm volatile("vori.b $vr4, $vr0, 0"); + asm volatile("vori.b $vr5, $vr1, 0"); + asm volatile("vori.b $vr6, $vr2, 0"); + asm volatile("vori.b $vr7, $vr3, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE])); + asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE])); + asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("vxor.v $vr16, $vr16, $vr12"); + asm volatile("vxor.v $vr17, $vr17, $vr13"); + asm volatile("vxor.v $vr18, $vr18, $vr14"); + asm volatile("vxor.v $vr19, $vr19, $vr15"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr8"); + asm volatile("vxor.v $vr5, $vr17, $vr9"); + asm volatile("vxor.v $vr6, $vr18, $vr10"); + asm volatile("vxor.v $vr7, $vr19, $vr11"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("vslti.b $vr12, $vr4, 0"); + asm volatile("vslti.b $vr13, $vr5, 0"); + asm volatile("vslti.b $vr14, $vr6, 0"); + asm volatile("vslti.b $vr15, $vr7, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("vslli.b $vr16, $vr4, 1"); + asm volatile("vslli.b $vr17, $vr5, 1"); + asm volatile("vslli.b $vr18, $vr6, 1"); + asm volatile("vslli.b $vr19, $vr7, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("vandi.b $vr12, $vr12, 0x1d"); + asm volatile("vandi.b $vr13, $vr13, 0x1d"); + asm volatile("vandi.b $vr14, $vr14, 0x1d"); + asm volatile("vandi.b $vr15, $vr15, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("vxor.v $vr4, $vr16, $vr12"); + asm volatile("vxor.v $vr5, $vr17, $vr13"); + asm volatile("vxor.v $vr6, $vr18, $vr14"); + asm volatile("vxor.v $vr7, $vr19, $vr15"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "vld $vr20, %0\n\t" + "vld $vr21, %1\n\t" + "vld $vr22, %2\n\t" + "vld $vr23, %3\n\t" + "vld $vr24, %4\n\t" + "vld $vr25, %5\n\t" + "vld $vr26, %6\n\t" + "vld $vr27, %7\n\t" + "vxor.v $vr20, $vr20, $vr0\n\t" + "vxor.v $vr21, $vr21, $vr1\n\t" + "vxor.v $vr22, $vr22, $vr2\n\t" + "vxor.v $vr23, $vr23, $vr3\n\t" + "vxor.v $vr24, $vr24, $vr4\n\t" + "vxor.v $vr25, $vr25, $vr5\n\t" + "vxor.v $vr26, $vr26, $vr6\n\t" + "vxor.v $vr27, $vr27, $vr7\n\t" + "vst $vr20, %0\n\t" + "vst $vr21, %1\n\t" + "vst $vr22, %2\n\t" + "vst $vr23, %3\n\t" + "vst $vr24, %4\n\t" + "vst $vr25, %5\n\t" + "vst $vr26, %6\n\t" + "vst $vr27, %7\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]), + "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lsx = { + raid6_lsx_gen_syndrome, + raid6_lsx_xor_syndrome, + raid6_has_lsx, + "lsx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; + +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +#define NSIZE 32 + +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = disks - 3; /* Highest data disk */ + p = dptr[z0+1]; /* XOR parity */ + q = dptr[z0+2]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= 0; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + /* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */ + asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0])); + asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1])); + /* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */ + asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0])); + asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1])); + } + + kernel_fpu_end(); +} + +static void raid6_lasx_xor_syndrome(int disks, int start, int stop, + size_t bytes, void **ptrs) +{ + u8 **dptr = (u8 **)ptrs; + u8 *p, *q; + int d, z, z0; + + z0 = stop; /* P/Q right side optimization */ + p = dptr[disks-2]; /* XOR parity */ + q = dptr[disks-1]; /* RS syndrome */ + + kernel_fpu_begin(); + + /* + * $xr0, $xr1: wp + * $xr2, $xr3: wq + * $xr4, $xr5: wd + * $xr6, $xr7: w2 + * $xr8, $xr9: w1 + */ + for (d = 0; d < bytes; d += NSIZE*2) { + /* P/Q data pages */ + /* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */ + asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE])); + asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE])); + asm volatile("xvori.b $xr2, $xr0, 0"); + asm volatile("xvori.b $xr3, $xr1, 0"); + for (z = z0-1; z >= start; z--) { + /* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */ + asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE])); + asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE])); + /* wp$$ ^= wd$$; */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* w1$$ ^= w2$$; */ + asm volatile("xvxor.v $xr8, $xr8, $xr6"); + asm volatile("xvxor.v $xr9, $xr9, $xr7"); + /* wq$$ = w1$$ ^ wd$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr4"); + asm volatile("xvxor.v $xr3, $xr9, $xr5"); + } + + /* P/Q left side optimization */ + for (z = start-1; z >= 0; z--) { + /* w2$$ = MASK(wq$$); */ + asm volatile("xvslti.b $xr6, $xr2, 0"); + asm volatile("xvslti.b $xr7, $xr3, 0"); + /* w1$$ = SHLBYTE(wq$$); */ + asm volatile("xvslli.b $xr8, $xr2, 1"); + asm volatile("xvslli.b $xr9, $xr3, 1"); + /* w2$$ &= NBYTES(0x1d); */ + asm volatile("xvandi.b $xr6, $xr6, 0x1d"); + asm volatile("xvandi.b $xr7, $xr7, 0x1d"); + /* wq$$ = w1$$ ^ w2$$; */ + asm volatile("xvxor.v $xr2, $xr8, $xr6"); + asm volatile("xvxor.v $xr3, $xr9, $xr7"); + } + /* + * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$; + * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$; + */ + asm volatile( + "xvld $xr10, %0\n\t" + "xvld $xr11, %1\n\t" + "xvld $xr12, %2\n\t" + "xvld $xr13, %3\n\t" + "xvxor.v $xr10, $xr10, $xr0\n\t" + "xvxor.v $xr11, $xr11, $xr1\n\t" + "xvxor.v $xr12, $xr12, $xr2\n\t" + "xvxor.v $xr13, $xr13, $xr3\n\t" + "xvst $xr10, %0\n\t" + "xvst $xr11, %1\n\t" + "xvst $xr12, %2\n\t" + "xvst $xr13, %3\n\t" + : "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]), + "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]) + ); + } + + kernel_fpu_end(); +} + +const struct raid6_calls raid6_lasx = { + raid6_lasx_gen_syndrome, + raid6_lasx_xor_syndrome, + raid6_has_lasx, + "lasx", + .priority = 0 /* see the comment near the top of the file for reason */ +}; +#undef NSIZE +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 1f693ea3b980..7b244bce32b3 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -41,6 +41,16 @@ ifeq ($(findstring ppc,$(ARCH)),ppc) gcc -c -x c - >/dev/null && rm ./-.o && echo yes) endif +ifeq ($(ARCH),loongarch64) + CFLAGS += -I../../../arch/loongarch/include -DCONFIG_LOONGARCH=1 + CFLAGS += $(shell echo 'vld $$vr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LSX=1) + CFLAGS += $(shell echo 'xvld $$xr0, $$zero, 0' | \ + gcc -c -x assembler - >/dev/null 2>&1 && \ + rm ./-.o && echo -DCONFIG_CPU_HAS_LASX=1) +endif + ifeq ($(IS_X86),yes) OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o CFLAGS += -DCONFIG_X86 @@ -54,6 +64,8 @@ else ifeq ($(HAS_ALTIVEC),yes) CFLAGS += -DCONFIG_ALTIVEC OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o +else ifeq ($(ARCH),loongarch64) + OBJS += loongarch_simd.o endif .c.o: -- cgit v1.2.3 From f2091321044d9fbcadb93dfc1c9cf23e563ea40c Mon Sep 17 00:00:00 2001 From: WANG Xuerui Date: Wed, 6 Sep 2023 22:53:55 +0800 Subject: raid6: Add LoongArch SIMD recovery implementation Similar to the syndrome calculation, the recovery algorithms also work on 64 bytes at a time to align with the L1 cache line size of current and future LoongArch cores (that we care about). Which means unrolled-by-4 LSX and unrolled-by-2 LASX code. The assembly is originally based on the x86 SSSE3/AVX2 ports, but register allocation has been redone to take advantage of LSX/LASX's 32 vector registers, and instruction sequence has been optimized to suit (e.g. LoongArch can perform per-byte srl and andi on vectors, but x86 cannot). Performance numbers measured by instrumenting the raid6test code, on a 3A5000 system clocked at 2.5GHz: > lasx 2data: 354.987 MiB/s > lasx datap: 350.430 MiB/s > lsx 2data: 340.026 MiB/s > lsx datap: 337.318 MiB/s > intx1 2data: 164.280 MiB/s > intx1 datap: 187.966 MiB/s Because recovery algorithms are chosen solely based on priority and availability, lasx is marked as priority 2 and lsx priority 1. At least for the current generation of LoongArch micro-architectures, LASX should always be faster than LSX whenever supported, and have similar power consumption characteristics (because the only known LASX-capable uarch, the LA464, always compute the full 256-bit result for vector ops). Acked-by: Song Liu Signed-off-by: WANG Xuerui Signed-off-by: Huacai Chen --- include/linux/raid/pq.h | 2 + lib/raid6/Makefile | 2 +- lib/raid6/algos.c | 8 + lib/raid6/recov_loongarch_simd.c | 513 +++++++++++++++++++++++++++++++++++++++ lib/raid6/test/Makefile | 2 +- 5 files changed, 525 insertions(+), 2 deletions(-) create mode 100644 lib/raid6/recov_loongarch_simd.c (limited to 'include/linux') diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h index 874447485848..006e18decfad 100644 --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -125,6 +125,8 @@ extern const struct raid6_recov_calls raid6_recov_avx2; extern const struct raid6_recov_calls raid6_recov_avx512; extern const struct raid6_recov_calls raid6_recov_s390xc; extern const struct raid6_recov_calls raid6_recov_neon; +extern const struct raid6_recov_calls raid6_recov_lsx; +extern const struct raid6_recov_calls raid6_recov_lasx; extern const struct raid6_calls raid6_neonx1; extern const struct raid6_calls raid6_neonx2; diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile index 2b9ebe105480..035b0a4db476 100644 --- a/lib/raid6/Makefile +++ b/lib/raid6/Makefile @@ -9,7 +9,7 @@ raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o raid6_pq-$(CONFIG_S390) += s390vx8.o recov_s390xc.o -raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o +raid6_pq-$(CONFIG_LOONGARCH) += loongarch_simd.o recov_loongarch_simd.o hostprogs += mktables diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index 739c7ebcae1a..0ec534faf019 100644 --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -111,6 +111,14 @@ const struct raid6_recov_calls *const raid6_recov_algos[] = { #endif #if defined(CONFIG_KERNEL_MODE_NEON) &raid6_recov_neon, +#endif +#ifdef CONFIG_LOONGARCH +#ifdef CONFIG_CPU_HAS_LASX + &raid6_recov_lasx, +#endif +#ifdef CONFIG_CPU_HAS_LSX + &raid6_recov_lsx, +#endif #endif &raid6_recov_intx1, NULL diff --git a/lib/raid6/recov_loongarch_simd.c b/lib/raid6/recov_loongarch_simd.c new file mode 100644 index 000000000000..94aeac85e6f7 --- /dev/null +++ b/lib/raid6/recov_loongarch_simd.c @@ -0,0 +1,513 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * RAID6 recovery algorithms in LoongArch SIMD (LSX & LASX) + * + * Copyright (C) 2023 WANG Xuerui + * + * Originally based on recov_avx2.c and recov_ssse3.c: + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas + */ + +#include +#include "loongarch.h" + +/* + * Unlike with the syndrome calculation algorithms, there's no boot-time + * selection of recovery algorithms by benchmarking, so we have to specify + * the priorities and hope the future cores will all have decent vector + * support (i.e. no LASX slower than LSX, or even scalar code). + */ + +#ifdef CONFIG_CPU_HAS_LSX +static int raid6_has_lsx(void) +{ + return cpu_has_lsx; +} + +static void raid6_2data_recov_lsx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * vr20, vr21: qmul + * vr22, vr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + + while (bytes) { + /* vr4 - vr7: Q */ + asm volatile("vld $vr4, %0" : : "m" (q[0])); + asm volatile("vld $vr5, %0" : : "m" (q[16])); + asm volatile("vld $vr6, %0" : : "m" (q[32])); + asm volatile("vld $vr7, %0" : : "m" (q[48])); + /* vr4 - vr7: Q + Qxy */ + asm volatile("vld $vr8, %0" : : "m" (dq[0])); + asm volatile("vld $vr9, %0" : : "m" (dq[16])); + asm volatile("vld $vr10, %0" : : "m" (dq[32])); + asm volatile("vld $vr11, %0" : : "m" (dq[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + /* vr0 - vr3: P */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr0 - vr3: P + Pxy */ + asm volatile("vld $vr8, %0" : : "m" (dp[0])); + asm volatile("vld $vr9, %0" : : "m" (dp[16])); + asm volatile("vld $vr10, %0" : : "m" (dp[32])); + asm volatile("vld $vr11, %0" : : "m" (dp[48])); + asm volatile("vxor.v $vr0, $vr0, $vr8"); + asm volatile("vxor.v $vr1, $vr1, $vr9"); + asm volatile("vxor.v $vr2, $vr2, $vr10"); + asm volatile("vxor.v $vr3, $vr3, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr20, $vr20, $vr4"); + asm volatile("vshuf.b $vr5, $vr20, $vr20, $vr5"); + asm volatile("vshuf.b $vr6, $vr20, $vr20, $vr6"); + asm volatile("vshuf.b $vr7, $vr20, $vr20, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr21, $vr21, $vr8"); + asm volatile("vshuf.b $vr9, $vr21, $vr21, $vr9"); + asm volatile("vshuf.b $vr10, $vr21, $vr21, $vr10"); + asm volatile("vshuf.b $vr11, $vr21, $vr21, $vr11"); + /* vr16 - vr19: B(Q + Qxy) */ + asm volatile("vxor.v $vr16, $vr8, $vr4"); + asm volatile("vxor.v $vr17, $vr9, $vr5"); + asm volatile("vxor.v $vr18, $vr10, $vr6"); + asm volatile("vxor.v $vr19, $vr11, $vr7"); + + /* vr4 - vr7: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("vsrli.b $vr4, $vr0, 4"); + asm volatile("vsrli.b $vr5, $vr1, 4"); + asm volatile("vsrli.b $vr6, $vr2, 4"); + asm volatile("vsrli.b $vr7, $vr3, 4"); + /* vr12 - vr15: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("vandi.b $vr12, $vr0, 0x0f"); + asm volatile("vandi.b $vr13, $vr1, 0x0f"); + asm volatile("vandi.b $vr14, $vr2, 0x0f"); + asm volatile("vandi.b $vr15, $vr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("vshuf.b $vr12, $vr22, $vr22, $vr12"); + asm volatile("vshuf.b $vr13, $vr22, $vr22, $vr13"); + asm volatile("vshuf.b $vr14, $vr22, $vr22, $vr14"); + asm volatile("vshuf.b $vr15, $vr22, $vr22, $vr15"); + /* lookup from pbmul[16] */ + asm volatile("vshuf.b $vr4, $vr23, $vr23, $vr4"); + asm volatile("vshuf.b $vr5, $vr23, $vr23, $vr5"); + asm volatile("vshuf.b $vr6, $vr23, $vr23, $vr6"); + asm volatile("vshuf.b $vr7, $vr23, $vr23, $vr7"); + /* vr4 - vr7: A(P + Pxy) */ + asm volatile("vxor.v $vr4, $vr4, $vr12"); + asm volatile("vxor.v $vr5, $vr5, $vr13"); + asm volatile("vxor.v $vr6, $vr6, $vr14"); + asm volatile("vxor.v $vr7, $vr7, $vr15"); + + /* vr4 - vr7: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr16"); + asm volatile("vxor.v $vr5, $vr5, $vr17"); + asm volatile("vxor.v $vr6, $vr6, $vr18"); + asm volatile("vxor.v $vr7, $vr7, $vr19"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Pxy + Dx = Dy */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (dp[0])); + asm volatile("vst $vr1, %0" : "=m" (dp[16])); + asm volatile("vst $vr2, %0" : "=m" (dp[32])); + asm volatile("vst $vr3, %0" : "=m" (dp[48])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lsx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* vr22, vr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + + while (bytes) { + /* vr0 - vr3: P + Dx */ + asm volatile("vld $vr0, %0" : : "m" (p[0])); + asm volatile("vld $vr1, %0" : : "m" (p[16])); + asm volatile("vld $vr2, %0" : : "m" (p[32])); + asm volatile("vld $vr3, %0" : : "m" (p[48])); + /* vr4 - vr7: Qx */ + asm volatile("vld $vr4, %0" : : "m" (dq[0])); + asm volatile("vld $vr5, %0" : : "m" (dq[16])); + asm volatile("vld $vr6, %0" : : "m" (dq[32])); + asm volatile("vld $vr7, %0" : : "m" (dq[48])); + /* vr4 - vr7: Q + Qx */ + asm volatile("vld $vr8, %0" : : "m" (q[0])); + asm volatile("vld $vr9, %0" : : "m" (q[16])); + asm volatile("vld $vr10, %0" : : "m" (q[32])); + asm volatile("vld $vr11, %0" : : "m" (q[48])); + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + + /* vr8 - vr11: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("vsrli.b $vr8, $vr4, 4"); + asm volatile("vsrli.b $vr9, $vr5, 4"); + asm volatile("vsrli.b $vr10, $vr6, 4"); + asm volatile("vsrli.b $vr11, $vr7, 4"); + /* vr4 - vr7: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("vandi.b $vr4, $vr4, 0x0f"); + asm volatile("vandi.b $vr5, $vr5, 0x0f"); + asm volatile("vandi.b $vr6, $vr6, 0x0f"); + asm volatile("vandi.b $vr7, $vr7, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("vshuf.b $vr4, $vr22, $vr22, $vr4"); + asm volatile("vshuf.b $vr5, $vr22, $vr22, $vr5"); + asm volatile("vshuf.b $vr6, $vr22, $vr22, $vr6"); + asm volatile("vshuf.b $vr7, $vr22, $vr22, $vr7"); + /* lookup from qmul[16] */ + asm volatile("vshuf.b $vr8, $vr23, $vr23, $vr8"); + asm volatile("vshuf.b $vr9, $vr23, $vr23, $vr9"); + asm volatile("vshuf.b $vr10, $vr23, $vr23, $vr10"); + asm volatile("vshuf.b $vr11, $vr23, $vr23, $vr11"); + /* vr4 - vr7: qmul(Q + Qx) = Dx */ + asm volatile("vxor.v $vr4, $vr4, $vr8"); + asm volatile("vxor.v $vr5, $vr5, $vr9"); + asm volatile("vxor.v $vr6, $vr6, $vr10"); + asm volatile("vxor.v $vr7, $vr7, $vr11"); + asm volatile("vst $vr4, %0" : "=m" (dq[0])); + asm volatile("vst $vr5, %0" : "=m" (dq[16])); + asm volatile("vst $vr6, %0" : "=m" (dq[32])); + asm volatile("vst $vr7, %0" : "=m" (dq[48])); + + /* vr0 - vr3: P + Dx + Dx = P */ + asm volatile("vxor.v $vr0, $vr0, $vr4"); + asm volatile("vxor.v $vr1, $vr1, $vr5"); + asm volatile("vxor.v $vr2, $vr2, $vr6"); + asm volatile("vxor.v $vr3, $vr3, $vr7"); + asm volatile("vst $vr0, %0" : "=m" (p[0])); + asm volatile("vst $vr1, %0" : "=m" (p[16])); + asm volatile("vst $vr2, %0" : "=m" (p[32])); + asm volatile("vst $vr3, %0" : "=m" (p[48])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lsx = { + .data2 = raid6_2data_recov_lsx, + .datap = raid6_datap_recov_lsx, + .valid = raid6_has_lsx, + .name = "lsx", + .priority = 1, +}; +#endif /* CONFIG_CPU_HAS_LSX */ + +#ifdef CONFIG_CPU_HAS_LASX +static int raid6_has_lasx(void) +{ + return cpu_has_lasx; +} + +static void raid6_2data_recov_lasx(int disks, size_t bytes, int faila, + int failb, void **ptrs) +{ + u8 *p, *q, *dp, *dq; + const u8 *pbmul; /* P multiplier table for B data */ + const u8 *qmul; /* Q multiplier table (for both) */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data pages + * Use the dead data pages as temporary storage for + * delta p and delta q + */ + dp = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 2] = dp; + dq = (u8 *)ptrs[failb]; + ptrs[failb] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + ptrs[disks - 2] = p; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + pbmul = raid6_vgfmul[raid6_gfexi[failb - faila]]; + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^ raid6_gfexp[failb]]]; + + kernel_fpu_begin(); + + /* + * xr20, xr21: qmul + * xr22, xr23: pbmul + */ + asm volatile("vld $vr20, %0" : : "m" (qmul[0])); + asm volatile("vld $vr21, %0" : : "m" (qmul[16])); + asm volatile("vld $vr22, %0" : : "m" (pbmul[0])); + asm volatile("vld $vr23, %0" : : "m" (pbmul[16])); + asm volatile("xvreplve0.q $xr20, $xr20"); + asm volatile("xvreplve0.q $xr21, $xr21"); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: Q */ + asm volatile("xvld $xr0, %0" : : "m" (q[0])); + asm volatile("xvld $xr1, %0" : : "m" (q[32])); + /* xr0, xr1: Q + Qxy */ + asm volatile("xvld $xr4, %0" : : "m" (dq[0])); + asm volatile("xvld $xr5, %0" : : "m" (dq[32])); + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + /* xr2, xr3: P */ + asm volatile("xvld $xr2, %0" : : "m" (p[0])); + asm volatile("xvld $xr3, %0" : : "m" (p[32])); + /* xr2, xr3: P + Pxy */ + asm volatile("xvld $xr4, %0" : : "m" (dp[0])); + asm volatile("xvld $xr5, %0" : : "m" (dp[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvsrli.b $xr4, $xr0, 4"); + asm volatile("xvsrli.b $xr5, $xr1, 4"); + /* xr0, xr1: lower 4 bits of each byte of (Q + Qxy) */ + asm volatile("xvandi.b $xr0, $xr0, 0x0f"); + asm volatile("xvandi.b $xr1, $xr1, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr0, $xr20, $xr20, $xr0"); + asm volatile("xvshuf.b $xr1, $xr20, $xr20, $xr1"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr21, $xr21, $xr4"); + asm volatile("xvshuf.b $xr5, $xr21, $xr21, $xr5"); + /* xr6, xr7: B(Q + Qxy) */ + asm volatile("xvxor.v $xr6, $xr4, $xr0"); + asm volatile("xvxor.v $xr7, $xr5, $xr1"); + + /* xr4, xr5: higher 4 bits of each byte of (P + Pxy) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr0, xr1: lower 4 bits of each byte of (P + Pxy) */ + asm volatile("xvandi.b $xr0, $xr2, 0x0f"); + asm volatile("xvandi.b $xr1, $xr3, 0x0f"); + /* lookup from pbmul[0] */ + asm volatile("xvshuf.b $xr0, $xr22, $xr22, $xr0"); + asm volatile("xvshuf.b $xr1, $xr22, $xr22, $xr1"); + /* lookup from pbmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr0, xr1: A(P + Pxy) */ + asm volatile("xvxor.v $xr0, $xr0, $xr4"); + asm volatile("xvxor.v $xr1, $xr1, $xr5"); + + /* xr0, xr1: A(P + Pxy) + B(Q + Qxy) = Dx */ + asm volatile("xvxor.v $xr0, $xr0, $xr6"); + asm volatile("xvxor.v $xr1, $xr1, $xr7"); + + /* xr2, xr3: P + Pxy + Dx = Dy */ + asm volatile("xvxor.v $xr2, $xr2, $xr0"); + asm volatile("xvxor.v $xr3, $xr3, $xr1"); + + asm volatile("xvst $xr0, %0" : "=m" (dq[0])); + asm volatile("xvst $xr1, %0" : "=m" (dq[32])); + asm volatile("xvst $xr2, %0" : "=m" (dp[0])); + asm volatile("xvst $xr3, %0" : "=m" (dp[32])); + + bytes -= 64; + p += 64; + q += 64; + dp += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +static void raid6_datap_recov_lasx(int disks, size_t bytes, int faila, + void **ptrs) +{ + u8 *p, *q, *dq; + const u8 *qmul; /* Q multiplier table */ + + p = (u8 *)ptrs[disks - 2]; + q = (u8 *)ptrs[disks - 1]; + + /* + * Compute syndrome with zero for the missing data page + * Use the dead data page as temporary storage for delta q + */ + dq = (u8 *)ptrs[faila]; + ptrs[faila] = (void *)raid6_empty_zero_page; + ptrs[disks - 1] = dq; + + raid6_call.gen_syndrome(disks, bytes, ptrs); + + /* Restore pointer table */ + ptrs[faila] = dq; + ptrs[disks - 1] = q; + + /* Now, pick the proper data tables */ + qmul = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + kernel_fpu_begin(); + + /* xr22, xr23: qmul */ + asm volatile("vld $vr22, %0" : : "m" (qmul[0])); + asm volatile("xvreplve0.q $xr22, $xr22"); + asm volatile("vld $vr23, %0" : : "m" (qmul[16])); + asm volatile("xvreplve0.q $xr23, $xr23"); + + while (bytes) { + /* xr0, xr1: P + Dx */ + asm volatile("xvld $xr0, %0" : : "m" (p[0])); + asm volatile("xvld $xr1, %0" : : "m" (p[32])); + /* xr2, xr3: Qx */ + asm volatile("xvld $xr2, %0" : : "m" (dq[0])); + asm volatile("xvld $xr3, %0" : : "m" (dq[32])); + /* xr2, xr3: Q + Qx */ + asm volatile("xvld $xr4, %0" : : "m" (q[0])); + asm volatile("xvld $xr5, %0" : : "m" (q[32])); + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr4, xr5: higher 4 bits of each byte of (Q + Qx) */ + asm volatile("xvsrli.b $xr4, $xr2, 4"); + asm volatile("xvsrli.b $xr5, $xr3, 4"); + /* xr2, xr3: lower 4 bits of each byte of (Q + Qx) */ + asm volatile("xvandi.b $xr2, $xr2, 0x0f"); + asm volatile("xvandi.b $xr3, $xr3, 0x0f"); + /* lookup from qmul[0] */ + asm volatile("xvshuf.b $xr2, $xr22, $xr22, $xr2"); + asm volatile("xvshuf.b $xr3, $xr22, $xr22, $xr3"); + /* lookup from qmul[16] */ + asm volatile("xvshuf.b $xr4, $xr23, $xr23, $xr4"); + asm volatile("xvshuf.b $xr5, $xr23, $xr23, $xr5"); + /* xr2, xr3: qmul(Q + Qx) = Dx */ + asm volatile("xvxor.v $xr2, $xr2, $xr4"); + asm volatile("xvxor.v $xr3, $xr3, $xr5"); + + /* xr0, xr1: P + Dx + Dx = P */ + asm volatile("xvxor.v $xr0, $xr0, $xr2"); + asm volatile("xvxor.v $xr1, $xr1, $xr3"); + + asm volatile("xvst $xr2, %0" : "=m" (dq[0])); + asm volatile("xvst $xr3, %0" : "=m" (dq[32])); + asm volatile("xvst $xr0, %0" : "=m" (p[0])); + asm volatile("xvst $xr1, %0" : "=m" (p[32])); + + bytes -= 64; + p += 64; + q += 64; + dq += 64; + } + + kernel_fpu_end(); +} + +const struct raid6_recov_calls raid6_recov_lasx = { + .data2 = raid6_2data_recov_lasx, + .datap = raid6_datap_recov_lasx, + .valid = raid6_has_lasx, + .name = "lasx", + .priority = 2, +}; +#endif /* CONFIG_CPU_HAS_LASX */ diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile index 7b244bce32b3..2abe0076a636 100644 --- a/lib/raid6/test/Makefile +++ b/lib/raid6/test/Makefile @@ -65,7 +65,7 @@ else ifeq ($(HAS_ALTIVEC),yes) OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o else ifeq ($(ARCH),loongarch64) - OBJS += loongarch_simd.o + OBJS += loongarch_simd.o recov_loongarch_simd.o endif .c.o: -- cgit v1.2.3 From 9b04c764af18a1dab6d48ca0671f70cdcccf90a2 Mon Sep 17 00:00:00 2001 From: Qing Zhang Date: Wed, 6 Sep 2023 22:54:16 +0800 Subject: kasan: Add __HAVE_ARCH_SHADOW_MAP to support arch specific mapping MIPS, LoongArch and some other architectures have many holes between different segments and the valid address space (256T available) is insufficient to map all these segments to kasan shadow memory with the common formula provided by kasan core. So we need architecture specific mapping formulas to ensure different segments are mapped individually, and only limited space lengths of those specific segments are mapped to shadow. Therefore, when the incoming address is converted to a shadow, we need to add a condition to determine whether it is valid. Reviewed-by: Andrey Konovalov Signed-off-by: Qing Zhang Signed-off-by: Huacai Chen --- include/linux/kasan.h | 2 ++ mm/kasan/kasan.h | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 819b6bc8ac08..3df5499f7936 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -54,11 +54,13 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); +#ifndef __HAVE_ARCH_SHADOW_MAP static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } +#endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 2e973b36fe07..f70e3d7a602e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -291,16 +291,22 @@ struct kasan_stack_ring { #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +#ifndef __HAVE_ARCH_SHADOW_MAP static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT); } +#endif static __always_inline bool addr_has_metadata(const void *addr) { +#ifdef __HAVE_ARCH_SHADOW_MAP + return (kasan_mem_to_shadow((void *)addr) != NULL); +#else return (kasan_reset_tag(addr) >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +#endif } /** -- cgit v1.2.3 From 6fdac58c560e4d164eb8161987bee045147cabe4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 7 Sep 2023 22:19:12 -0400 Subject: tracing: Remove unused trace_event_file dir field Now that eventfs structure is used to create the events directory via the eventfs dynamically allocate code, the "dir" field of the trace_event_file structure is no longer used. Remove it. Link: https://lkml.kernel.org/r/20230908022001.580400115@goodmis.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Andrew Morton Cc: Ajay Kaher Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 - kernel/trace/trace_events.c | 13 ------------- 2 files changed, 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index eb5c3add939b..12f875e9e69a 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -650,7 +650,6 @@ struct trace_event_file { struct trace_event_call *event_call; struct event_filter __rcu *filter; struct eventfs_file *ef; - struct dentry *dir; struct trace_array *tr; struct trace_subsystem_dir *system; struct list_head triggers; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2af92177b765..065c63991858 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -992,19 +992,6 @@ static void remove_subsystem(struct trace_subsystem_dir *dir) static void remove_event_file_dir(struct trace_event_file *file) { - struct dentry *dir = file->dir; - struct dentry *child; - - if (dir) { - spin_lock(&dir->d_lock); /* probably unneeded */ - list_for_each_entry(child, &dir->d_subdirs, d_child) { - if (d_really_is_positive(child)) /* probably unneeded */ - d_inode(child)->i_private = NULL; - } - spin_unlock(&dir->d_lock); - - tracefs_remove(dir); - } eventfs_remove(file->ef); list_del(&file->list); remove_subsystem(file->system); -- cgit v1.2.3 From 5d153cd128251aaedc8e9657f0a949ec94952055 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 8 Sep 2023 16:34:59 -0500 Subject: spnego: add missing OID to oid registry Add missing OID to the registry. Some servers and clients (including Windows) now request "NEGOEX - SPNEGEO Extended Negotiation Security") See https://datatracker.ietf.org/doc/html/draft-zhu-negoex-02 Reviewed-by: Namjae Jeon Signed-off-by: Steve French --- include/linux/oid_registry.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h index 0f4a8903922a..f86a08ba0207 100644 --- a/include/linux/oid_registry.h +++ b/include/linux/oid_registry.h @@ -67,6 +67,7 @@ enum OID { OID_msOutlookExpress, /* 1.3.6.1.4.1.311.16.4 */ OID_ntlmssp, /* 1.3.6.1.4.1.311.2.2.10 */ + OID_negoex, /* 1.3.6.1.4.1.311.2.2.30 */ OID_spnego, /* 1.3.6.1.5.5.2 */ -- cgit v1.2.3 From fc52a64416b010c8324e2cb50070faae868521c1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 8 Sep 2023 16:39:29 -0400 Subject: tracing/synthetic: Fix order of struct trace_dynamic_info To make handling BIG and LITTLE endian better the offset/len of dynamic fields of the synthetic events was changed into a structure of: struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN u16 offset; u16 len; #else u16 len; u16 offset; #endif }; to replace the manual changes of: data_offset = offset & 0xffff; data_offest = len << 16; But if you look closely, the above is: << 16 | offset Which in little endian would be in memory: offset_lo offset_hi len_lo len_hi and in big endian: len_hi len_lo offset_hi offset_lo Which if broken into a structure would be: struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN u16 len; u16 offset; #else u16 offset; u16 len; #endif }; Which is the opposite of what was defined. Fix this and just to be safe also add "__packed". Link: https://lore.kernel.org/all/20230908154417.5172e343@gandalf.local.home/ Link: https://lore.kernel.org/linux-trace-kernel/20230908163929.2c25f3dc@gandalf.local.home Cc: stable@vger.kernel.org Cc: Mark Rutland Tested-by: Sven Schnelle Acked-by: Masami Hiramatsu (Google) Fixes: ddeea494a16f3 ("tracing/synthetic: Use union instead of casts") Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 12f875e9e69a..21ae37e49319 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -62,13 +62,13 @@ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...); /* Used to find the offset and length of dynamic fields in trace events */ struct trace_dynamic_info { #ifdef CONFIG_CPU_BIG_ENDIAN - u16 offset; u16 len; + u16 offset; #else - u16 len; u16 offset; + u16 len; #endif -}; +} __packed; /* * The trace entry - the most basic unit of tracing. This is what -- cgit v1.2.3 From 08700ec705043eb0cee01b35cf5b9d63f0230d12 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 6 Sep 2023 03:46:57 +0900 Subject: linux/export: fix reference to exported functions for parisc64 John David Anglin reported parisc has been broken since commit ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost"). Like ia64, parisc64 uses a function descriptor. The function references must be prefixed with P%. Also, symbols prefixed $$ from the library have the symbol type STT_LOPROC instead of STT_FUNC. They should be handled as functions too. Fixes: ddb5cdbafaaa ("kbuild: generate KSYMTAB entries by modpost") Reported-by: John David Anglin Tested-by: John David Anglin Tested-by: Helge Deller Closes: https://lore.kernel.org/linux-parisc/1901598a-e11d-f7dd-a5d9-9a69d06e6b6e@bell.net/T/#u Signed-off-by: Masahiro Yamada Signed-off-by: Helge Deller --- include/linux/export-internal.h | 2 ++ scripts/mod/modpost.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/export-internal.h b/include/linux/export-internal.h index 1c849db953a5..45fca09b2319 100644 --- a/include/linux/export-internal.h +++ b/include/linux/export-internal.h @@ -52,6 +52,8 @@ #ifdef CONFIG_IA64 #define KSYM_FUNC(name) @fptr(name) +#elif defined(CONFIG_PARISC) && defined(CONFIG_64BIT) +#define KSYM_FUNC(name) P%name #else #define KSYM_FUNC(name) name #endif diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index b29b29707f10..ba981f22908a 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1226,6 +1226,15 @@ static void check_export_symbol(struct module *mod, struct elf_info *elf, */ s->is_func = (ELF_ST_TYPE(sym->st_info) == STT_FUNC); + /* + * For parisc64, symbols prefixed $$ from the library have the symbol type + * STT_LOPROC. They should be handled as functions too. + */ + if (elf->hdr->e_ident[EI_CLASS] == ELFCLASS64 && + elf->hdr->e_machine == EM_PARISC && + ELF_ST_TYPE(sym->st_info) == STT_LOPROC) + s->is_func = true; + if (match(secname, PATTERNS(INIT_SECTIONS))) warn("%s: %s: EXPORT_SYMBOL used for init symbol. Remove __init or EXPORT_SYMBOL.\n", mod->name, name); -- cgit v1.2.3