mirror of
https://github.com/Dasharo/linux.git
synced 2026-03-06 15:25:10 -08:00
Merge tag 's390-6.9-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Pull s390 updates from Heiko Carstens:
- Various virtual vs physical address usage fixes
- Fix error handling in Processor Activity Instrumentation device
driver, and export number of counters with a sysfs file
- Allow for multiple events when Processor Activity Instrumentation
counters are monitored in system wide sampling
- Change multiplier and shift values of the Time-of-Day clock source to
improve steering precision
- Remove a couple of unneeded GFP_DMA flags from allocations
- Disable mmap alignment if randomize_va_space is also disabled, to
avoid a too small heap
- Various changes to allow s390 to be compiled with LLVM=1, since
ld.lld and llvm-objcopy will have proper s390 support with clang 19
- Add __uninitialized macro to Compiler Attributes. This is helpful
with s390's FPU code where some users have up to 520 byte stack
frames. Clearing such stack frames (if INIT_STACK_ALL_PATTERN or
INIT_STACK_ALL_ZERO is enabled) before they are used contradicts the
intention (performance improvement) of such code sections.
- Convert switch_to() to an out-of-line function, and use the generic
switch_to header file
- Replace the usage of s390's debug feature with pr_debug() calls
within the zcrypt device driver
- Improve hotplug support of the Adjunct Processor device driver
- Improve retry handling in the zcrypt device driver
- Various changes to the in-kernel FPU code:
- Make in-kernel FPU sections preemptible
- Convert various larger inline assemblies and assembler files to
C, mainly by using single instruction inline assemblies. This
increases readability, but also makes it easier to add
proper instrumentation hooks
- Cleanup of the header files
- Provide fast variants of csum_partial() and
csum_partial_copy_nocheck() based on vector instructions
- Introduce and use a lock to synchronize accesses to zpci device data
structures to avoid inconsistent states caused by concurrent accesses
- Compile the kernel without -fPIE. This addresses the following
problems if the kernel is compiled with -fPIE:
- It uses dynamic symbols (.dynsym), for which the linker refuses
to allow more than 64k sections. This can break features which
use '-ffunction-sections' and '-fdata-sections', including
kpatch-build and function granular KASLR
- It unnecessarily uses GOT relocations, adding an extra layer of
indirection for many memory accesses
- Fix shared_cpu_list for CPU private L2 caches, which incorrectly were
reported as globally shared
* tag 's390-6.9-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (117 commits)
s390/tools: handle rela R_390_GOTPCDBL/R_390_GOTOFF64
s390/cache: prevent rebuild of shared_cpu_list
s390/crypto: remove retry loop with sleep from PAES pkey invocation
s390/pkey: improve pkey retry behavior
s390/zcrypt: improve zcrypt retry behavior
s390/zcrypt: introduce retries on in-kernel send CPRB functions
s390/ap: introduce mutex to lock the AP bus scan
s390/ap: rework ap_scan_bus() to return true on config change
s390/ap: clarify AP scan bus related functions and variables
s390/ap: rearm APQNs bindings complete completion
s390/configs: increase number of LOCKDEP_BITS
s390/vfio-ap: handle hardware checkstop state on queue reset operation
s390/pai: change sampling event assignment for PMU device driver
s390/boot: fix minor comment style damages
s390/boot: do not check for zero-termination relocation entry
s390/boot: make type of __vmlinux_relocs_64_start|end consistent
s390/boot: sanitize kaslr_adjust_relocs() function prototype
s390/boot: simplify GOT handling
s390: vmlinux.lds.S: fix .got.plt assertion
s390/boot: workaround current 'llvm-objdump -t -j ...' behavior
...
This commit is contained in:
@@ -127,6 +127,7 @@ config S390
|
||||
select ARCH_WANT_DEFAULT_BPF_JIT
|
||||
select ARCH_WANT_IPC_PARSE_VERSION
|
||||
select ARCH_WANT_KERNEL_PMD_MKWRITE
|
||||
select ARCH_WANT_LD_ORPHAN_WARN
|
||||
select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
|
||||
select BUILDTIME_TABLE_SORT
|
||||
select CLONE_BACKWARDS2
|
||||
@@ -448,7 +449,7 @@ config COMPAT
|
||||
select COMPAT_OLD_SIGACTION
|
||||
select HAVE_UID16
|
||||
depends on MULTIUSER
|
||||
depends on !CC_IS_CLANG
|
||||
depends on !CC_IS_CLANG && !LD_IS_LLD
|
||||
help
|
||||
Select this option if you want to enable your system kernel to
|
||||
handle system-calls from ELF binaries for 31 bit ESA. This option
|
||||
@@ -582,14 +583,23 @@ config RELOCATABLE
|
||||
help
|
||||
This builds a kernel image that retains relocation information
|
||||
so it can be loaded at an arbitrary address.
|
||||
The kernel is linked as a position-independent executable (PIE)
|
||||
and contains dynamic relocations which are processed early in the
|
||||
bootup process.
|
||||
The relocations make the kernel image about 15% larger (compressed
|
||||
10%), but are discarded at runtime.
|
||||
Note: this option exists only for documentation purposes, please do
|
||||
not remove it.
|
||||
|
||||
config PIE_BUILD
|
||||
def_bool CC_IS_CLANG && !$(cc-option,-munaligned-symbols)
|
||||
help
|
||||
If the compiler is unable to generate code that can manage unaligned
|
||||
symbols, the kernel is linked as a position-independent executable
|
||||
(PIE) and includes dynamic relocations that are processed early
|
||||
during bootup.
|
||||
|
||||
For kpatch functionality, it is recommended to build the kernel
|
||||
without the PIE_BUILD option. PIE_BUILD is only enabled when the
|
||||
compiler lacks proper support for handling unaligned symbols.
|
||||
|
||||
config RANDOMIZE_BASE
|
||||
bool "Randomize the address of the kernel image (KASLR)"
|
||||
default y
|
||||
|
||||
@@ -14,8 +14,14 @@ KBUILD_AFLAGS_MODULE += -fPIC
|
||||
KBUILD_CFLAGS_MODULE += -fPIC
|
||||
KBUILD_AFLAGS += -m64
|
||||
KBUILD_CFLAGS += -m64
|
||||
ifdef CONFIG_PIE_BUILD
|
||||
KBUILD_CFLAGS += -fPIE
|
||||
LDFLAGS_vmlinux := -pie
|
||||
LDFLAGS_vmlinux := -pie -z notext
|
||||
else
|
||||
KBUILD_CFLAGS += $(call cc-option,-munaligned-symbols,)
|
||||
LDFLAGS_vmlinux := --emit-relocs --discard-none
|
||||
extra_tools := relocs
|
||||
endif
|
||||
aflags_dwarf := -Wa,-gdwarf-2
|
||||
KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__
|
||||
ifndef CONFIG_AS_IS_LLVM
|
||||
@@ -143,7 +149,7 @@ archheaders:
|
||||
|
||||
archprepare:
|
||||
$(Q)$(MAKE) $(build)=$(syscalls) kapi
|
||||
$(Q)$(MAKE) $(build)=$(tools) kapi
|
||||
$(Q)$(MAKE) $(build)=$(tools) kapi $(extra_tools)
|
||||
ifeq ($(KBUILD_EXTMOD),)
|
||||
# We need to generate vdso-offsets.h before compiling certain files in kernel/.
|
||||
# In order to do that, we should use the archprepare target, but we can't since
|
||||
|
||||
1
arch/s390/boot/.gitignore
vendored
1
arch/s390/boot/.gitignore
vendored
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: GPL-2.0-only
|
||||
image
|
||||
bzImage
|
||||
relocs.S
|
||||
section_cmp.*
|
||||
vmlinux
|
||||
vmlinux.lds
|
||||
|
||||
@@ -37,7 +37,8 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char
|
||||
|
||||
obj-y := head.o als.o startup.o physmem_info.o ipl_parm.o ipl_report.o vmem.o
|
||||
obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o
|
||||
obj-y += version.o pgm_check_info.o ctype.o ipl_data.o machine_kexec_reloc.o
|
||||
obj-y += version.o pgm_check_info.o ctype.o ipl_data.o
|
||||
obj-y += $(if $(CONFIG_PIE_BUILD),machine_kexec_reloc.o,relocs.o)
|
||||
obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o
|
||||
obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
|
||||
obj-y += $(if $(CONFIG_KERNEL_UNCOMPRESSED),,decompressor.o) info.o
|
||||
@@ -48,6 +49,9 @@ targets := bzImage section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y
|
||||
targets += vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
|
||||
targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
|
||||
targets += vmlinux.bin.zst info.bin syms.bin vmlinux.syms $(obj-all)
|
||||
ifndef CONFIG_PIE_BUILD
|
||||
targets += relocs.S
|
||||
endif
|
||||
|
||||
OBJECTS := $(addprefix $(obj)/,$(obj-y))
|
||||
OBJECTS_ALL := $(addprefix $(obj)/,$(obj-all))
|
||||
@@ -56,9 +60,9 @@ clean-files += vmlinux.map
|
||||
|
||||
quiet_cmd_section_cmp = SECTCMP $*
|
||||
define cmd_section_cmp
|
||||
s1=`$(OBJDUMP) -t -j "$*" "$<" | sort | \
|
||||
s1=`$(OBJDUMP) -t "$<" | grep "\s$*\s\+" | sort | \
|
||||
sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \
|
||||
s2=`$(OBJDUMP) -t -j "$*" "$(word 2,$^)" | sort | \
|
||||
s2=`$(OBJDUMP) -t "$(word 2,$^)" | grep "\s$*\s\+" | sort | \
|
||||
sed -n "/0000000000000000/! s/.*\s$*\s\+//p" | sha256sum`; \
|
||||
if [ "$$s1" != "$$s2" ]; then \
|
||||
echo "error: section $* differs between $< and $(word 2,$^)" >&2; \
|
||||
@@ -73,11 +77,12 @@ $(obj)/bzImage: $(obj)/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.b
|
||||
$(obj)/section_cmp%: vmlinux $(obj)/vmlinux FORCE
|
||||
$(call if_changed,section_cmp)
|
||||
|
||||
LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T
|
||||
LDFLAGS_vmlinux-$(CONFIG_LD_ORPHAN_WARN) := --orphan-handling=$(CONFIG_LD_ORPHAN_WARN_LEVEL)
|
||||
LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) --oformat $(LD_BFD) -e startup $(if $(CONFIG_VMLINUX_MAP),-Map=$(obj)/vmlinux.map) --build-id=sha1 -T
|
||||
$(obj)/vmlinux: $(obj)/vmlinux.lds $(OBJECTS_ALL) FORCE
|
||||
$(call if_changed,ld)
|
||||
|
||||
LDFLAGS_vmlinux.syms := --oformat $(LD_BFD) -e startup -T
|
||||
LDFLAGS_vmlinux.syms := $(LDFLAGS_vmlinux-y) --oformat $(LD_BFD) -e startup -T
|
||||
$(obj)/vmlinux.syms: $(obj)/vmlinux.lds $(OBJECTS) FORCE
|
||||
$(call if_changed,ld)
|
||||
|
||||
@@ -93,7 +98,7 @@ OBJCOPYFLAGS_syms.o := -I binary -O elf64-s390 -B s390:64-bit --rename-section .
|
||||
$(obj)/syms.o: $(obj)/syms.bin FORCE
|
||||
$(call if_changed,objcopy)
|
||||
|
||||
OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=load
|
||||
OBJCOPYFLAGS_info.bin := -O binary --only-section=.vmlinux.info --set-section-flags .vmlinux.info=alloc,load
|
||||
$(obj)/info.bin: vmlinux FORCE
|
||||
$(call if_changed,objcopy)
|
||||
|
||||
@@ -105,6 +110,14 @@ OBJCOPYFLAGS_vmlinux.bin := -O binary --remove-section=.comment --remove-section
|
||||
$(obj)/vmlinux.bin: vmlinux FORCE
|
||||
$(call if_changed,objcopy)
|
||||
|
||||
ifndef CONFIG_PIE_BUILD
|
||||
CMD_RELOCS=arch/s390/tools/relocs
|
||||
quiet_cmd_relocs = RELOCS $@
|
||||
cmd_relocs = $(CMD_RELOCS) $< > $@
|
||||
$(obj)/relocs.S: vmlinux FORCE
|
||||
$(call if_changed,relocs)
|
||||
endif
|
||||
|
||||
suffix-$(CONFIG_KERNEL_GZIP) := .gz
|
||||
suffix-$(CONFIG_KERNEL_BZIP2) := .bz2
|
||||
suffix-$(CONFIG_KERNEL_LZ4) := .lz4
|
||||
|
||||
@@ -25,9 +25,14 @@ struct vmlinux_info {
|
||||
unsigned long bootdata_size;
|
||||
unsigned long bootdata_preserved_off;
|
||||
unsigned long bootdata_preserved_size;
|
||||
#ifdef CONFIG_PIE_BUILD
|
||||
unsigned long dynsym_start;
|
||||
unsigned long rela_dyn_start;
|
||||
unsigned long rela_dyn_end;
|
||||
#else
|
||||
unsigned long got_start;
|
||||
unsigned long got_end;
|
||||
#endif
|
||||
unsigned long amode31_size;
|
||||
unsigned long init_mm_off;
|
||||
unsigned long swapper_pg_dir_off;
|
||||
@@ -83,6 +88,7 @@ extern unsigned long vmalloc_size;
|
||||
extern int vmalloc_size_set;
|
||||
extern char __boot_data_start[], __boot_data_end[];
|
||||
extern char __boot_data_preserved_start[], __boot_data_preserved_end[];
|
||||
extern char __vmlinux_relocs_64_start[], __vmlinux_relocs_64_end[];
|
||||
extern char _decompressor_syms_start[], _decompressor_syms_end[];
|
||||
extern char _stack_start[], _stack_end[];
|
||||
extern char _end[], _decompressor_end[];
|
||||
|
||||
@@ -141,7 +141,8 @@ static void copy_bootdata(void)
|
||||
memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size);
|
||||
}
|
||||
|
||||
static void handle_relocs(unsigned long offset)
|
||||
#ifdef CONFIG_PIE_BUILD
|
||||
static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
|
||||
{
|
||||
Elf64_Rela *rela_start, *rela_end, *rela;
|
||||
int r_type, r_sym, rc;
|
||||
@@ -172,6 +173,54 @@ static void handle_relocs(unsigned long offset)
|
||||
}
|
||||
}
|
||||
|
||||
static void kaslr_adjust_got(unsigned long offset) {}
|
||||
static void rescue_relocs(void) {}
|
||||
static void free_relocs(void) {}
|
||||
#else
|
||||
static int *vmlinux_relocs_64_start;
|
||||
static int *vmlinux_relocs_64_end;
|
||||
|
||||
static void rescue_relocs(void)
|
||||
{
|
||||
unsigned long size = __vmlinux_relocs_64_end - __vmlinux_relocs_64_start;
|
||||
|
||||
vmlinux_relocs_64_start = (void *)physmem_alloc_top_down(RR_RELOC, size, 0);
|
||||
vmlinux_relocs_64_end = (void *)vmlinux_relocs_64_start + size;
|
||||
memmove(vmlinux_relocs_64_start, __vmlinux_relocs_64_start, size);
|
||||
}
|
||||
|
||||
static void free_relocs(void)
|
||||
{
|
||||
physmem_free(RR_RELOC);
|
||||
}
|
||||
|
||||
static void kaslr_adjust_relocs(unsigned long min_addr, unsigned long max_addr, unsigned long offset)
|
||||
{
|
||||
int *reloc;
|
||||
long loc;
|
||||
|
||||
/* Adjust R_390_64 relocations */
|
||||
for (reloc = vmlinux_relocs_64_start; reloc < vmlinux_relocs_64_end; reloc++) {
|
||||
loc = (long)*reloc + offset;
|
||||
if (loc < min_addr || loc > max_addr)
|
||||
error("64-bit relocation outside of kernel!\n");
|
||||
*(u64 *)loc += offset;
|
||||
}
|
||||
}
|
||||
|
||||
static void kaslr_adjust_got(unsigned long offset)
|
||||
{
|
||||
u64 *entry;
|
||||
|
||||
/*
|
||||
* Even without -fPIE, Clang still uses a global offset table for some
|
||||
* reason. Adjust the GOT entries.
|
||||
*/
|
||||
for (entry = (u64 *)vmlinux.got_start; entry < (u64 *)vmlinux.got_end; entry++)
|
||||
*entry += offset;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Merge information from several sources into a single ident_map_size value.
|
||||
* "ident_map_size" represents the upper limit of physical memory we may ever
|
||||
@@ -299,14 +348,19 @@ static void setup_vmalloc_size(void)
|
||||
vmalloc_size = max(size, vmalloc_size);
|
||||
}
|
||||
|
||||
static void offset_vmlinux_info(unsigned long offset)
|
||||
static void kaslr_adjust_vmlinux_info(unsigned long offset)
|
||||
{
|
||||
*(unsigned long *)(&vmlinux.entry) += offset;
|
||||
vmlinux.bootdata_off += offset;
|
||||
vmlinux.bootdata_preserved_off += offset;
|
||||
#ifdef CONFIG_PIE_BUILD
|
||||
vmlinux.rela_dyn_start += offset;
|
||||
vmlinux.rela_dyn_end += offset;
|
||||
vmlinux.dynsym_start += offset;
|
||||
#else
|
||||
vmlinux.got_start += offset;
|
||||
vmlinux.got_end += offset;
|
||||
#endif
|
||||
vmlinux.init_mm_off += offset;
|
||||
vmlinux.swapper_pg_dir_off += offset;
|
||||
vmlinux.invalid_pg_dir_off += offset;
|
||||
@@ -361,6 +415,7 @@ void startup_kernel(void)
|
||||
detect_physmem_online_ranges(max_physmem_end);
|
||||
save_ipl_cert_comp_list();
|
||||
rescue_initrd(safe_addr, ident_map_size);
|
||||
rescue_relocs();
|
||||
|
||||
if (kaslr_enabled()) {
|
||||
vmlinux_lma = randomize_within_range(vmlinux.image_size + vmlinux.bss_size,
|
||||
@@ -368,7 +423,7 @@ void startup_kernel(void)
|
||||
ident_map_size);
|
||||
if (vmlinux_lma) {
|
||||
__kaslr_offset = vmlinux_lma - vmlinux.default_lma;
|
||||
offset_vmlinux_info(__kaslr_offset);
|
||||
kaslr_adjust_vmlinux_info(__kaslr_offset);
|
||||
}
|
||||
}
|
||||
vmlinux_lma = vmlinux_lma ?: vmlinux.default_lma;
|
||||
@@ -393,18 +448,20 @@ void startup_kernel(void)
|
||||
/*
|
||||
* The order of the following operations is important:
|
||||
*
|
||||
* - handle_relocs() must follow clear_bss_section() to establish static
|
||||
* memory references to data in .bss to be used by setup_vmem()
|
||||
* - kaslr_adjust_relocs() must follow clear_bss_section() to establish
|
||||
* static memory references to data in .bss to be used by setup_vmem()
|
||||
* (i.e init_mm.pgd)
|
||||
*
|
||||
* - setup_vmem() must follow handle_relocs() to be able using
|
||||
* - setup_vmem() must follow kaslr_adjust_relocs() to be able using
|
||||
* static memory references to data in .bss (i.e init_mm.pgd)
|
||||
*
|
||||
* - copy_bootdata() must follow setup_vmem() to propagate changes to
|
||||
* bootdata made by setup_vmem()
|
||||
* - copy_bootdata() must follow setup_vmem() to propagate changes
|
||||
* to bootdata made by setup_vmem()
|
||||
*/
|
||||
clear_bss_section(vmlinux_lma);
|
||||
handle_relocs(__kaslr_offset);
|
||||
kaslr_adjust_relocs(vmlinux_lma, vmlinux_lma + vmlinux.image_size, __kaslr_offset);
|
||||
kaslr_adjust_got(__kaslr_offset);
|
||||
free_relocs();
|
||||
setup_vmem(asce_limit);
|
||||
copy_bootdata();
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ SECTIONS
|
||||
_text = .; /* Text */
|
||||
*(.text)
|
||||
*(.text.*)
|
||||
INIT_TEXT
|
||||
_etext = . ;
|
||||
}
|
||||
.rodata : {
|
||||
@@ -39,6 +40,9 @@ SECTIONS
|
||||
*(.rodata.*)
|
||||
_erodata = . ;
|
||||
}
|
||||
.got : {
|
||||
*(.got)
|
||||
}
|
||||
NOTES
|
||||
.data : {
|
||||
_data = . ;
|
||||
@@ -106,6 +110,24 @@ SECTIONS
|
||||
_compressed_end = .;
|
||||
}
|
||||
|
||||
#ifndef CONFIG_PIE_BUILD
|
||||
/*
|
||||
* When the kernel is built with CONFIG_KERNEL_UNCOMPRESSED, the entire
|
||||
* uncompressed vmlinux.bin is positioned in the bzImage decompressor
|
||||
* image at the default kernel LMA of 0x100000, enabling it to be
|
||||
* executed in-place. However, the size of .vmlinux.relocs could be
|
||||
* large enough to cause an overlap with the uncompressed kernel at the
|
||||
* address 0x100000. To address this issue, .vmlinux.relocs is
|
||||
* positioned after the .rodata.compressed.
|
||||
*/
|
||||
. = ALIGN(4);
|
||||
.vmlinux.relocs : {
|
||||
__vmlinux_relocs_64_start = .;
|
||||
*(.vmlinux.relocs_64)
|
||||
__vmlinux_relocs_64_end = .;
|
||||
}
|
||||
#endif
|
||||
|
||||
#define SB_TRAILER_SIZE 32
|
||||
/* Trailer needed for Secure Boot */
|
||||
. += SB_TRAILER_SIZE; /* make sure .sb.trailer does not overwrite the previous section */
|
||||
@@ -118,8 +140,34 @@ SECTIONS
|
||||
}
|
||||
_end = .;
|
||||
|
||||
DWARF_DEBUG
|
||||
ELF_DETAILS
|
||||
|
||||
/*
|
||||
* Make sure that the .got.plt is either completely empty or it
|
||||
* contains only the three reserved double words.
|
||||
*/
|
||||
.got.plt : {
|
||||
*(.got.plt)
|
||||
}
|
||||
ASSERT(SIZEOF(.got.plt) == 0 || SIZEOF(.got.plt) == 0x18, "Unexpected GOT/PLT entries detected!")
|
||||
|
||||
/*
|
||||
* Sections that should stay zero sized, which is safer to
|
||||
* explicitly check instead of blindly discarding.
|
||||
*/
|
||||
.plt : {
|
||||
*(.plt) *(.plt.*) *(.iplt) *(.igot .igot.plt)
|
||||
}
|
||||
ASSERT(SIZEOF(.plt) == 0, "Unexpected run-time procedure linkages detected!")
|
||||
.rela.dyn : {
|
||||
*(.rela.*) *(.rela_*)
|
||||
}
|
||||
ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
|
||||
|
||||
/* Sections to be discarded */
|
||||
/DISCARD/ : {
|
||||
COMMON_DISCARDS
|
||||
*(.eh_frame)
|
||||
*(__ex_table)
|
||||
*(*__ksymtab*)
|
||||
|
||||
@@ -824,6 +824,8 @@ CONFIG_TEST_LOCKUP=m
|
||||
CONFIG_DEBUG_PREEMPT=y
|
||||
CONFIG_PROVE_LOCKING=y
|
||||
CONFIG_LOCK_STAT=y
|
||||
CONFIG_LOCKDEP_BITS=16
|
||||
CONFIG_LOCKDEP_CHAINS_BITS=17
|
||||
CONFIG_DEBUG_ATOMIC_SLEEP=y
|
||||
CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
|
||||
CONFIG_DEBUG_IRQFLAGS=y
|
||||
|
||||
@@ -15,14 +15,14 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sizes.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/fpu.h>
|
||||
#include "chacha-s390.h"
|
||||
|
||||
static void chacha20_crypt_s390(u32 *state, u8 *dst, const u8 *src,
|
||||
unsigned int nbytes, const u32 *key,
|
||||
u32 *counter)
|
||||
{
|
||||
struct kernel_fpu vxstate;
|
||||
DECLARE_KERNEL_FPU_ONSTACK32(vxstate);
|
||||
|
||||
kernel_fpu_begin(&vxstate, KERNEL_VXR);
|
||||
chacha20_vx(dst, src, nbytes, key, counter);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/nospec-insn.h>
|
||||
#include <asm/vx-insn.h>
|
||||
#include <asm/fpu-insn.h>
|
||||
|
||||
#define SP %r15
|
||||
#define FRAME (16 * 8 + 4 * 8)
|
||||
|
||||
@@ -13,8 +13,8 @@
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crc32.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <asm/fpu/api.h>
|
||||
|
||||
#include <asm/fpu.h>
|
||||
#include "crc32-vx.h"
|
||||
|
||||
#define CRC32_BLOCK_SIZE 1
|
||||
#define CRC32_DIGEST_SIZE 4
|
||||
@@ -31,11 +31,6 @@ struct crc_desc_ctx {
|
||||
u32 crc;
|
||||
};
|
||||
|
||||
/* Prototypes for functions in assembly files */
|
||||
u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
|
||||
u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
|
||||
u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
|
||||
|
||||
/*
|
||||
* DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
|
||||
*
|
||||
@@ -49,8 +44,8 @@ u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
|
||||
static u32 __pure ___fname(u32 crc, \
|
||||
unsigned char const *data, size_t datalen) \
|
||||
{ \
|
||||
struct kernel_fpu vxstate; \
|
||||
unsigned long prealign, aligned, remaining; \
|
||||
DECLARE_KERNEL_FPU_ONSTACK16(vxstate); \
|
||||
\
|
||||
if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \
|
||||
return ___crc32_sw(crc, data, datalen); \
|
||||
|
||||
12
arch/s390/crypto/crc32-vx.h
Normal file
12
arch/s390/crypto/crc32-vx.h
Normal file
@@ -0,0 +1,12 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
#ifndef _CRC32_VX_S390_H
|
||||
#define _CRC32_VX_S390_H
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
|
||||
u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
|
||||
u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
|
||||
|
||||
#endif /* _CRC32_VX_S390_H */
|
||||
@@ -12,20 +12,17 @@
|
||||
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/nospec-insn.h>
|
||||
#include <asm/vx-insn.h>
|
||||
#include <linux/types.h>
|
||||
#include <asm/fpu.h>
|
||||
#include "crc32-vx.h"
|
||||
|
||||
/* Vector register range containing CRC-32 constants */
|
||||
#define CONST_R1R2 %v9
|
||||
#define CONST_R3R4 %v10
|
||||
#define CONST_R5 %v11
|
||||
#define CONST_R6 %v12
|
||||
#define CONST_RU_POLY %v13
|
||||
#define CONST_CRC_POLY %v14
|
||||
|
||||
.data
|
||||
.balign 8
|
||||
#define CONST_R1R2 9
|
||||
#define CONST_R3R4 10
|
||||
#define CONST_R5 11
|
||||
#define CONST_R6 12
|
||||
#define CONST_RU_POLY 13
|
||||
#define CONST_CRC_POLY 14
|
||||
|
||||
/*
|
||||
* The CRC-32 constant block contains reduction constants to fold and
|
||||
@@ -58,105 +55,74 @@
|
||||
* P'(x) = 0xEDB88320
|
||||
*/
|
||||
|
||||
SYM_DATA_START_LOCAL(constants_CRC_32_BE)
|
||||
.quad 0x08833794c, 0x0e6228b11 # R1, R2
|
||||
.quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4
|
||||
.quad 0x0f200aa66, 1 << 32 # R5, x32
|
||||
.quad 0x0490d678d, 1 # R6, 1
|
||||
.quad 0x104d101df, 0 # u
|
||||
.quad 0x104C11DB7, 0 # P(x)
|
||||
SYM_DATA_END(constants_CRC_32_BE)
|
||||
static unsigned long constants_CRC_32_BE[] = {
|
||||
0x08833794c, 0x0e6228b11, /* R1, R2 */
|
||||
0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */
|
||||
0x0f200aa66, 1UL << 32, /* R5, x32 */
|
||||
0x0490d678d, 1, /* R6, 1 */
|
||||
0x104d101df, 0, /* u */
|
||||
0x104C11DB7, 0, /* P(x) */
|
||||
};
|
||||
|
||||
.previous
|
||||
|
||||
GEN_BR_THUNK %r14
|
||||
|
||||
.text
|
||||
/*
|
||||
* The CRC-32 function(s) use these calling conventions:
|
||||
*
|
||||
* Parameters:
|
||||
*
|
||||
* %r2: Initial CRC value, typically ~0; and final CRC (return) value.
|
||||
* %r3: Input buffer pointer, performance might be improved if the
|
||||
* buffer is on a doubleword boundary.
|
||||
* %r4: Length of the buffer, must be 64 bytes or greater.
|
||||
/**
|
||||
* crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers
|
||||
* @crc: Initial CRC value, typically ~0.
|
||||
* @buf: Input buffer pointer, performance might be improved if the
|
||||
* buffer is on a doubleword boundary.
|
||||
* @size: Size of the buffer, must be 64 bytes or greater.
|
||||
*
|
||||
* Register usage:
|
||||
*
|
||||
* %r5: CRC-32 constant pool base pointer.
|
||||
* V0: Initial CRC value and intermediate constants and results.
|
||||
* V1..V4: Data for CRC computation.
|
||||
* V5..V8: Next data chunks that are fetched from the input buffer.
|
||||
*
|
||||
* V9..V14: CRC-32 constants.
|
||||
*/
|
||||
SYM_FUNC_START(crc32_be_vgfm_16)
|
||||
u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
|
||||
{
|
||||
/* Load CRC-32 constants */
|
||||
larl %r5,constants_CRC_32_BE
|
||||
VLM CONST_R1R2,CONST_CRC_POLY,0,%r5
|
||||
fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE);
|
||||
fpu_vzero(0);
|
||||
|
||||
/* Load the initial CRC value into the leftmost word of V0. */
|
||||
VZERO %v0
|
||||
VLVGF %v0,%r2,0
|
||||
fpu_vlvgf(0, crc, 0);
|
||||
|
||||
/* Load a 64-byte data chunk and XOR with CRC */
|
||||
VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
|
||||
VX %v1,%v0,%v1 /* V1 ^= CRC */
|
||||
aghi %r3,64 /* BUF = BUF + 64 */
|
||||
aghi %r4,-64 /* LEN = LEN - 64 */
|
||||
fpu_vlm(1, 4, buf);
|
||||
fpu_vx(1, 0, 1);
|
||||
buf += 64;
|
||||
size -= 64;
|
||||
|
||||
/* Check remaining buffer size and jump to proper folding method */
|
||||
cghi %r4,64
|
||||
jl .Lless_than_64bytes
|
||||
while (size >= 64) {
|
||||
/* Load the next 64-byte data chunk into V5 to V8 */
|
||||
fpu_vlm(5, 8, buf);
|
||||
|
||||
.Lfold_64bytes_loop:
|
||||
/* Load the next 64-byte data chunk into V5 to V8 */
|
||||
VLM %v5,%v8,0,%r3
|
||||
/*
|
||||
* Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
* the reduction constants in V0. The intermediate result is
|
||||
* then folded (accumulated) with the next data chunk in V5 and
|
||||
* stored in V1. Repeat this step for the register contents
|
||||
* in V2, V3, and V4 respectively.
|
||||
*/
|
||||
fpu_vgfmag(1, CONST_R1R2, 1, 5);
|
||||
fpu_vgfmag(2, CONST_R1R2, 2, 6);
|
||||
fpu_vgfmag(3, CONST_R1R2, 3, 7);
|
||||
fpu_vgfmag(4, CONST_R1R2, 4, 8);
|
||||
buf += 64;
|
||||
size -= 64;
|
||||
}
|
||||
|
||||
/*
|
||||
* Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
* the reduction constants in V0. The intermediate result is
|
||||
* then folded (accumulated) with the next data chunk in V5 and
|
||||
* stored in V1. Repeat this step for the register contents
|
||||
* in V2, V3, and V4 respectively.
|
||||
*/
|
||||
VGFMAG %v1,CONST_R1R2,%v1,%v5
|
||||
VGFMAG %v2,CONST_R1R2,%v2,%v6
|
||||
VGFMAG %v3,CONST_R1R2,%v3,%v7
|
||||
VGFMAG %v4,CONST_R1R2,%v4,%v8
|
||||
|
||||
/* Adjust buffer pointer and length for next loop */
|
||||
aghi %r3,64 /* BUF = BUF + 64 */
|
||||
aghi %r4,-64 /* LEN = LEN - 64 */
|
||||
|
||||
cghi %r4,64
|
||||
jnl .Lfold_64bytes_loop
|
||||
|
||||
.Lless_than_64bytes:
|
||||
/* Fold V1 to V4 into a single 128-bit value in V1 */
|
||||
VGFMAG %v1,CONST_R3R4,%v1,%v2
|
||||
VGFMAG %v1,CONST_R3R4,%v1,%v3
|
||||
VGFMAG %v1,CONST_R3R4,%v1,%v4
|
||||
fpu_vgfmag(1, CONST_R3R4, 1, 2);
|
||||
fpu_vgfmag(1, CONST_R3R4, 1, 3);
|
||||
fpu_vgfmag(1, CONST_R3R4, 1, 4);
|
||||
|
||||
/* Check whether to continue with 64-bit folding */
|
||||
cghi %r4,16
|
||||
jl .Lfinal_fold
|
||||
while (size >= 16) {
|
||||
fpu_vl(2, buf);
|
||||
fpu_vgfmag(1, CONST_R3R4, 1, 2);
|
||||
buf += 16;
|
||||
size -= 16;
|
||||
}
|
||||
|
||||
.Lfold_16bytes_loop:
|
||||
|
||||
VL %v2,0,,%r3 /* Load next data chunk */
|
||||
VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */
|
||||
|
||||
/* Adjust buffer pointer and size for folding next data chunk */
|
||||
aghi %r3,16
|
||||
aghi %r4,-16
|
||||
|
||||
/* Process remaining data chunks */
|
||||
cghi %r4,16
|
||||
jnl .Lfold_16bytes_loop
|
||||
|
||||
.Lfinal_fold:
|
||||
/*
|
||||
* The R5 constant is used to fold a 128-bit value into an 96-bit value
|
||||
* that is XORed with the next 96-bit input data chunk. To use a single
|
||||
@@ -164,7 +130,7 @@ SYM_FUNC_START(crc32_be_vgfm_16)
|
||||
* form an intermediate 96-bit value (with appended zeros) which is then
|
||||
* XORed with the intermediate reduction result.
|
||||
*/
|
||||
VGFMG %v1,CONST_R5,%v1
|
||||
fpu_vgfmg(1, CONST_R5, 1);
|
||||
|
||||
/*
|
||||
* Further reduce the remaining 96-bit value to a 64-bit value using a
|
||||
@@ -173,7 +139,7 @@ SYM_FUNC_START(crc32_be_vgfm_16)
|
||||
* doubleword with R6. The result is a 64-bit value and is subject to
|
||||
* the Barret reduction.
|
||||
*/
|
||||
VGFMG %v1,CONST_R6,%v1
|
||||
fpu_vgfmg(1, CONST_R6, 1);
|
||||
|
||||
/*
|
||||
* The input values to the Barret reduction are the degree-63 polynomial
|
||||
@@ -194,20 +160,15 @@ SYM_FUNC_START(crc32_be_vgfm_16)
|
||||
*/
|
||||
|
||||
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
|
||||
VUPLLF %v2,%v1
|
||||
VGFMG %v2,CONST_RU_POLY,%v2
|
||||
fpu_vupllf(2, 1);
|
||||
fpu_vgfmg(2, CONST_RU_POLY, 2);
|
||||
|
||||
/*
|
||||
* Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
|
||||
* V2 and XOR the intermediate result, T2(x), with the value in V1.
|
||||
* The final result is in the rightmost word of V2.
|
||||
*/
|
||||
VUPLLF %v2,%v2
|
||||
VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
|
||||
|
||||
.Ldone:
|
||||
VLGVF %r2,%v2,3
|
||||
BR_EX %r14
|
||||
SYM_FUNC_END(crc32_be_vgfm_16)
|
||||
|
||||
.previous
|
||||
fpu_vupllf(2, 2);
|
||||
fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
|
||||
return fpu_vlgvf(2, 3);
|
||||
}
|
||||
@@ -13,20 +13,17 @@
|
||||
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
#include <asm/nospec-insn.h>
|
||||
#include <asm/vx-insn.h>
|
||||
#include <linux/types.h>
|
||||
#include <asm/fpu.h>
|
||||
#include "crc32-vx.h"
|
||||
|
||||
/* Vector register range containing CRC-32 constants */
|
||||
#define CONST_PERM_LE2BE %v9
|
||||
#define CONST_R2R1 %v10
|
||||
#define CONST_R4R3 %v11
|
||||
#define CONST_R5 %v12
|
||||
#define CONST_RU_POLY %v13
|
||||
#define CONST_CRC_POLY %v14
|
||||
|
||||
.data
|
||||
.balign 8
|
||||
#define CONST_PERM_LE2BE 9
|
||||
#define CONST_R2R1 10
|
||||
#define CONST_R4R3 11
|
||||
#define CONST_R5 12
|
||||
#define CONST_RU_POLY 13
|
||||
#define CONST_CRC_POLY 14
|
||||
|
||||
/*
|
||||
* The CRC-32 constant block contains reduction constants to fold and
|
||||
@@ -59,64 +56,43 @@
|
||||
* P'(x) = 0x82F63B78
|
||||
*/
|
||||
|
||||
SYM_DATA_START_LOCAL(constants_CRC_32_LE)
|
||||
.octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
|
||||
.quad 0x1c6e41596, 0x154442bd4 # R2, R1
|
||||
.quad 0x0ccaa009e, 0x1751997d0 # R4, R3
|
||||
.octa 0x163cd6124 # R5
|
||||
.octa 0x1F7011641 # u'
|
||||
.octa 0x1DB710641 # P'(x) << 1
|
||||
SYM_DATA_END(constants_CRC_32_LE)
|
||||
static unsigned long constants_CRC_32_LE[] = {
|
||||
0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
|
||||
0x1c6e41596, 0x154442bd4, /* R2, R1 */
|
||||
0x0ccaa009e, 0x1751997d0, /* R4, R3 */
|
||||
0x0, 0x163cd6124, /* R5 */
|
||||
0x0, 0x1f7011641, /* u' */
|
||||
0x0, 0x1db710641 /* P'(x) << 1 */
|
||||
};
|
||||
|
||||
SYM_DATA_START_LOCAL(constants_CRC_32C_LE)
|
||||
.octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
|
||||
.quad 0x09e4addf8, 0x740eef02 # R2, R1
|
||||
.quad 0x14cd00bd6, 0xf20c0dfe # R4, R3
|
||||
.octa 0x0dd45aab8 # R5
|
||||
.octa 0x0dea713f1 # u'
|
||||
.octa 0x105ec76f0 # P'(x) << 1
|
||||
SYM_DATA_END(constants_CRC_32C_LE)
|
||||
static unsigned long constants_CRC_32C_LE[] = {
|
||||
0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */
|
||||
0x09e4addf8, 0x740eef02, /* R2, R1 */
|
||||
0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */
|
||||
0x0, 0x0dd45aab8, /* R5 */
|
||||
0x0, 0x0dea713f1, /* u' */
|
||||
0x0, 0x105ec76f0 /* P'(x) << 1 */
|
||||
};
|
||||
|
||||
.previous
|
||||
|
||||
GEN_BR_THUNK %r14
|
||||
|
||||
.text
|
||||
|
||||
/*
|
||||
* The CRC-32 functions use these calling conventions:
|
||||
*
|
||||
* Parameters:
|
||||
*
|
||||
* %r2: Initial CRC value, typically ~0; and final CRC (return) value.
|
||||
* %r3: Input buffer pointer, performance might be improved if the
|
||||
* buffer is on a doubleword boundary.
|
||||
* %r4: Length of the buffer, must be 64 bytes or greater.
|
||||
/**
|
||||
* crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
|
||||
* @crc: Initial CRC value, typically ~0.
|
||||
* @buf: Input buffer pointer, performance might be improved if the
|
||||
* buffer is on a doubleword boundary.
|
||||
* @size: Size of the buffer, must be 64 bytes or greater.
|
||||
* @constants: CRC-32 constant pool base pointer.
|
||||
*
|
||||
* Register usage:
|
||||
*
|
||||
* %r5: CRC-32 constant pool base pointer.
|
||||
* V0: Initial CRC value and intermediate constants and results.
|
||||
* V1..V4: Data for CRC computation.
|
||||
* V5..V8: Next data chunks that are fetched from the input buffer.
|
||||
* V9: Constant for BE->LE conversion and shift operations
|
||||
*
|
||||
* V0: Initial CRC value and intermediate constants and results.
|
||||
* V1..V4: Data for CRC computation.
|
||||
* V5..V8: Next data chunks that are fetched from the input buffer.
|
||||
* V9: Constant for BE->LE conversion and shift operations
|
||||
* V10..V14: CRC-32 constants.
|
||||
*/
|
||||
|
||||
SYM_FUNC_START(crc32_le_vgfm_16)
|
||||
larl %r5,constants_CRC_32_LE
|
||||
j crc32_le_vgfm_generic
|
||||
SYM_FUNC_END(crc32_le_vgfm_16)
|
||||
|
||||
SYM_FUNC_START(crc32c_le_vgfm_16)
|
||||
larl %r5,constants_CRC_32C_LE
|
||||
j crc32_le_vgfm_generic
|
||||
SYM_FUNC_END(crc32c_le_vgfm_16)
|
||||
|
||||
SYM_FUNC_START(crc32_le_vgfm_generic)
|
||||
static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants)
|
||||
{
|
||||
/* Load CRC-32 constants */
|
||||
VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5
|
||||
fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
|
||||
|
||||
/*
|
||||
* Load the initial CRC value.
|
||||
@@ -125,90 +101,73 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
|
||||
* vector register and is later XORed with the LSB portion
|
||||
* of the loaded input data.
|
||||
*/
|
||||
VZERO %v0 /* Clear V0 */
|
||||
VLVGF %v0,%r2,3 /* Load CRC into rightmost word */
|
||||
fpu_vzero(0); /* Clear V0 */
|
||||
fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */
|
||||
|
||||
/* Load a 64-byte data chunk and XOR with CRC */
|
||||
VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
|
||||
VPERM %v1,%v1,%v1,CONST_PERM_LE2BE
|
||||
VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
|
||||
VPERM %v3,%v3,%v3,CONST_PERM_LE2BE
|
||||
VPERM %v4,%v4,%v4,CONST_PERM_LE2BE
|
||||
fpu_vlm(1, 4, buf);
|
||||
fpu_vperm(1, 1, 1, CONST_PERM_LE2BE);
|
||||
fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
|
||||
fpu_vperm(3, 3, 3, CONST_PERM_LE2BE);
|
||||
fpu_vperm(4, 4, 4, CONST_PERM_LE2BE);
|
||||
|
||||
VX %v1,%v0,%v1 /* V1 ^= CRC */
|
||||
aghi %r3,64 /* BUF = BUF + 64 */
|
||||
aghi %r4,-64 /* LEN = LEN - 64 */
|
||||
fpu_vx(1, 0, 1); /* V1 ^= CRC */
|
||||
buf += 64;
|
||||
size -= 64;
|
||||
|
||||
cghi %r4,64
|
||||
jl .Lless_than_64bytes
|
||||
while (size >= 64) {
|
||||
fpu_vlm(5, 8, buf);
|
||||
fpu_vperm(5, 5, 5, CONST_PERM_LE2BE);
|
||||
fpu_vperm(6, 6, 6, CONST_PERM_LE2BE);
|
||||
fpu_vperm(7, 7, 7, CONST_PERM_LE2BE);
|
||||
fpu_vperm(8, 8, 8, CONST_PERM_LE2BE);
|
||||
/*
|
||||
* Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
* the R1 and R2 reduction constants in V0. The intermediate
|
||||
* result is then folded (accumulated) with the next data chunk
|
||||
* in V5 and stored in V1. Repeat this step for the register
|
||||
* contents in V2, V3, and V4 respectively.
|
||||
*/
|
||||
fpu_vgfmag(1, CONST_R2R1, 1, 5);
|
||||
fpu_vgfmag(2, CONST_R2R1, 2, 6);
|
||||
fpu_vgfmag(3, CONST_R2R1, 3, 7);
|
||||
fpu_vgfmag(4, CONST_R2R1, 4, 8);
|
||||
buf += 64;
|
||||
size -= 64;
|
||||
}
|
||||
|
||||
.Lfold_64bytes_loop:
|
||||
/* Load the next 64-byte data chunk into V5 to V8 */
|
||||
VLM %v5,%v8,0,%r3
|
||||
VPERM %v5,%v5,%v5,CONST_PERM_LE2BE
|
||||
VPERM %v6,%v6,%v6,CONST_PERM_LE2BE
|
||||
VPERM %v7,%v7,%v7,CONST_PERM_LE2BE
|
||||
VPERM %v8,%v8,%v8,CONST_PERM_LE2BE
|
||||
|
||||
/*
|
||||
* Perform a GF(2) multiplication of the doublewords in V1 with
|
||||
* the R1 and R2 reduction constants in V0. The intermediate result
|
||||
* is then folded (accumulated) with the next data chunk in V5 and
|
||||
* stored in V1. Repeat this step for the register contents
|
||||
* in V2, V3, and V4 respectively.
|
||||
*/
|
||||
VGFMAG %v1,CONST_R2R1,%v1,%v5
|
||||
VGFMAG %v2,CONST_R2R1,%v2,%v6
|
||||
VGFMAG %v3,CONST_R2R1,%v3,%v7
|
||||
VGFMAG %v4,CONST_R2R1,%v4,%v8
|
||||
|
||||
aghi %r3,64 /* BUF = BUF + 64 */
|
||||
aghi %r4,-64 /* LEN = LEN - 64 */
|
||||
|
||||
cghi %r4,64
|
||||
jnl .Lfold_64bytes_loop
|
||||
|
||||
.Lless_than_64bytes:
|
||||
/*
|
||||
* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
|
||||
* and R4 and accumulating the next 128-bit chunk until a single 128-bit
|
||||
* value remains.
|
||||
*/
|
||||
VGFMAG %v1,CONST_R4R3,%v1,%v2
|
||||
VGFMAG %v1,CONST_R4R3,%v1,%v3
|
||||
VGFMAG %v1,CONST_R4R3,%v1,%v4
|
||||
fpu_vgfmag(1, CONST_R4R3, 1, 2);
|
||||
fpu_vgfmag(1, CONST_R4R3, 1, 3);
|
||||
fpu_vgfmag(1, CONST_R4R3, 1, 4);
|
||||
|
||||
cghi %r4,16
|
||||
jl .Lfinal_fold
|
||||
while (size >= 16) {
|
||||
fpu_vl(2, buf);
|
||||
fpu_vperm(2, 2, 2, CONST_PERM_LE2BE);
|
||||
fpu_vgfmag(1, CONST_R4R3, 1, 2);
|
||||
buf += 16;
|
||||
size -= 16;
|
||||
}
|
||||
|
||||
.Lfold_16bytes_loop:
|
||||
|
||||
VL %v2,0,,%r3 /* Load next data chunk */
|
||||
VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
|
||||
VGFMAG %v1,CONST_R4R3,%v1,%v2 /* Fold next data chunk */
|
||||
|
||||
aghi %r3,16
|
||||
aghi %r4,-16
|
||||
|
||||
cghi %r4,16
|
||||
jnl .Lfold_16bytes_loop
|
||||
|
||||
.Lfinal_fold:
|
||||
/*
|
||||
* Set up a vector register for byte shifts. The shift value must
|
||||
* be loaded in bits 1-4 in byte element 7 of a vector register.
|
||||
* Shift by 8 bytes: 0x40
|
||||
* Shift by 4 bytes: 0x20
|
||||
*/
|
||||
VLEIB %v9,0x40,7
|
||||
fpu_vleib(9, 0x40, 7);
|
||||
|
||||
/*
|
||||
* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
|
||||
* to move R4 into the rightmost doubleword and set the leftmost
|
||||
* doubleword to 0x1.
|
||||
*/
|
||||
VSRLB %v0,CONST_R4R3,%v9
|
||||
VLEIG %v0,1,0
|
||||
fpu_vsrlb(0, CONST_R4R3, 9);
|
||||
fpu_vleig(0, 1, 0);
|
||||
|
||||
/*
|
||||
* Compute GF(2) product of V1 and V0. The rightmost doubleword
|
||||
@@ -216,7 +175,7 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
|
||||
* multiplied by 0x1 and is then XORed with rightmost product.
|
||||
* Implicitly, the intermediate leftmost product becomes padded
|
||||
*/
|
||||
VGFMG %v1,%v0,%v1
|
||||
fpu_vgfmg(1, 0, 1);
|
||||
|
||||
/*
|
||||
* Now do the final 32-bit fold by multiplying the rightmost word
|
||||
@@ -231,10 +190,10 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
|
||||
* rightmost doubleword and the leftmost doubleword is zero to ignore
|
||||
* the leftmost product of V1.
|
||||
*/
|
||||
VLEIB %v9,0x20,7 /* Shift by words */
|
||||
VSRLB %v2,%v1,%v9 /* Store remaining bits in V2 */
|
||||
VUPLLF %v1,%v1 /* Split rightmost doubleword */
|
||||
VGFMAG %v1,CONST_R5,%v1,%v2 /* V1 = (V1 * R5) XOR V2 */
|
||||
fpu_vleib(9, 0x20, 7); /* Shift by words */
|
||||
fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */
|
||||
fpu_vupllf(1, 1); /* Split rightmost doubleword */
|
||||
fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */
|
||||
|
||||
/*
|
||||
* Apply a Barret reduction to compute the final 32-bit CRC value.
|
||||
@@ -256,20 +215,26 @@ SYM_FUNC_START(crc32_le_vgfm_generic)
|
||||
*/
|
||||
|
||||
/* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
|
||||
VUPLLF %v2,%v1
|
||||
VGFMG %v2,CONST_RU_POLY,%v2
|
||||
fpu_vupllf(2, 1);
|
||||
fpu_vgfmg(2, CONST_RU_POLY, 2);
|
||||
|
||||
/*
|
||||
* Compute the GF(2) product of the CRC polynomial with T1(x) in
|
||||
* V2 and XOR the intermediate result, T2(x), with the value in V1.
|
||||
* The final result is stored in word element 2 of V2.
|
||||
*/
|
||||
VUPLLF %v2,%v2
|
||||
VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
|
||||
fpu_vupllf(2, 2);
|
||||
fpu_vgfmag(2, CONST_CRC_POLY, 2, 1);
|
||||
|
||||
.Ldone:
|
||||
VLGVF %r2,%v2,2
|
||||
BR_EX %r14
|
||||
SYM_FUNC_END(crc32_le_vgfm_generic)
|
||||
return fpu_vlgvf(2, 2);
|
||||
}
|
||||
|
||||
.previous
|
||||
u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
|
||||
{
|
||||
return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]);
|
||||
}
|
||||
|
||||
u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
|
||||
{
|
||||
return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]);
|
||||
}
|
||||
@@ -125,20 +125,8 @@ struct s390_pxts_ctx {
|
||||
static inline int __paes_keyblob2pkey(struct key_blob *kb,
|
||||
struct pkey_protkey *pk)
|
||||
{
|
||||
int i, ret;
|
||||
|
||||
/* try three times in case of failure */
|
||||
for (i = 0; i < 3; i++) {
|
||||
if (i > 0 && ret == -EAGAIN && in_task())
|
||||
if (msleep_interruptible(1000))
|
||||
return -EINTR;
|
||||
ret = pkey_keyblob2pkey(kb->key, kb->keylen,
|
||||
pk->protkey, &pk->len, &pk->type);
|
||||
if (ret == 0)
|
||||
break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
return pkey_keyblob2pkey(kb->key, kb->keylen,
|
||||
pk->protkey, &pk->len, &pk->type);
|
||||
}
|
||||
|
||||
static inline int __paes_convert_key(struct s390_paes_ctx *ctx)
|
||||
|
||||
@@ -20,8 +20,7 @@
|
||||
*/
|
||||
static void diag0c_fn(void *data)
|
||||
{
|
||||
diag_stat_inc(DIAG_STAT_X00C);
|
||||
diag_amode31_ops.diag0c(((void **)data)[smp_processor_id()]);
|
||||
diag0c(((void **)data)[smp_processor_id()]);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
|
||||
static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd)
|
||||
{
|
||||
union register_pair r1 = { .even = (unsigned long)data, };
|
||||
union register_pair r1 = { .even = virt_to_phys(data), };
|
||||
|
||||
asm volatile("diag %[r1],%[r3],0x304\n"
|
||||
: [r1] "+&d" (r1.pair)
|
||||
@@ -74,7 +74,7 @@ static int __hypfs_sprp_ioctl(void __user *user_area)
|
||||
int rc;
|
||||
|
||||
rc = -ENOMEM;
|
||||
data = (void *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
|
||||
data = (void *)get_zeroed_page(GFP_KERNEL);
|
||||
diag304 = kzalloc(sizeof(*diag304), GFP_KERNEL);
|
||||
if (!data || !diag304)
|
||||
goto out;
|
||||
|
||||
38
arch/s390/include/asm/access-regs.h
Normal file
38
arch/s390/include/asm/access-regs.h
Normal file
@@ -0,0 +1,38 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/*
|
||||
* Copyright IBM Corp. 1999, 2024
|
||||
*/
|
||||
|
||||
#ifndef __ASM_S390_ACCESS_REGS_H
|
||||
#define __ASM_S390_ACCESS_REGS_H
|
||||
|
||||
#include <linux/instrumented.h>
|
||||
#include <asm/sigcontext.h>
|
||||
|
||||
struct access_regs {
|
||||
unsigned int regs[NUM_ACRS];
|
||||
};
|
||||
|
||||
static inline void save_access_regs(unsigned int *acrs)
|
||||
{
|
||||
struct access_regs *regs = (struct access_regs *)acrs;
|
||||
|
||||
instrument_write(regs, sizeof(*regs));
|
||||
asm volatile("stamy 0,15,%[regs]"
|
||||
: [regs] "=QS" (*regs)
|
||||
:
|
||||
: "memory");
|
||||
}
|
||||
|
||||
static inline void restore_access_regs(unsigned int *acrs)
|
||||
{
|
||||
struct access_regs *regs = (struct access_regs *)acrs;
|
||||
|
||||
instrument_read(regs, sizeof(*regs));
|
||||
asm volatile("lamy 0,15,%[regs]"
|
||||
:
|
||||
: [regs] "QS" (*regs)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
#endif /* __ASM_S390_ACCESS_REGS_H */
|
||||
@@ -54,13 +54,13 @@ static inline int appldata_asm(struct appldata_parameter_list *parm_list,
|
||||
parm_list->function = fn;
|
||||
parm_list->parlist_length = sizeof(*parm_list);
|
||||
parm_list->buffer_length = length;
|
||||
parm_list->product_id_addr = (unsigned long) id;
|
||||
parm_list->product_id_addr = virt_to_phys(id);
|
||||
parm_list->buffer_addr = virt_to_phys(buffer);
|
||||
diag_stat_inc(DIAG_STAT_X0DC);
|
||||
asm volatile(
|
||||
" diag %1,%0,0xdc"
|
||||
: "=d" (ry)
|
||||
: "d" (parm_list), "m" (*parm_list), "m" (*id)
|
||||
: "d" (virt_to_phys(parm_list)), "m" (*parm_list), "m" (*id)
|
||||
: "cc");
|
||||
return ry;
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/ftrace.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm/fpu.h>
|
||||
#include <asm-generic/asm-prototypes.h>
|
||||
|
||||
__int128_t __ashlti3(__int128_t a, int b);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user