diff --git a/Documentation/ABI/testing/sysfs-device-mali b/Documentation/ABI/testing/sysfs-device-mali index b7be50338f6e..cd011dae9b50 100644 --- a/Documentation/ABI/testing/sysfs-device-mali +++ b/Documentation/ABI/testing/sysfs-device-mali @@ -236,6 +236,7 @@ Description: device-driver that supports a CSF GPU. The duration value unit is in milliseconds and is used for configuring csf scheduling tick duration. + What: /sys/class/misc/mali%u/device/reset_timeout Description: This attribute is used to set the number of milliseconds to diff --git a/Documentation/devicetree/bindings/arm/mali-bifrost.txt b/Documentation/devicetree/bindings/arm/mali-bifrost.txt index 04e1bd1a5a39..2b3b1d028ccd 100644 --- a/Documentation/devicetree/bindings/arm/mali-bifrost.txt +++ b/Documentation/devicetree/bindings/arm/mali-bifrost.txt @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2013-2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2013-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -129,7 +129,7 @@ for details. set and the setting coresponding to the SYSC_ALLOC register. -Example for a Mali GPU with 1 clock and no regulators: +Example for a Mali GPU with 1 clock and 1 regulator: gpu@0xfc010000 { compatible = "arm,malit602", "arm,malit60x", "arm,malit6xx", "arm,mali-midgard"; diff --git a/drivers/base/arm/Kbuild b/drivers/base/arm/Kbuild index 01de13bef37c..e5ded4cf7395 100644 --- a/drivers/base/arm/Kbuild +++ b/drivers/base/arm/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -28,7 +28,6 @@ subdir-ccflags-y += $(ccflags-y) # # Kernel modules # -obj-$(CONFIG_DMA_BUF_LOCK) += dma_buf_lock/src/ obj-$(CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER) += dma_buf_test_exporter/ obj-$(CONFIG_MALI_MEMORY_GROUP_MANAGER) += memory_group_manager/ obj-$(CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR) += protected_memory_allocator/ diff --git a/drivers/base/arm/Kconfig b/drivers/base/arm/Kconfig index e5fca3a39ccb..7f9f1d4c418a 100644 --- a/drivers/base/arm/Kconfig +++ b/drivers/base/arm/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -26,16 +26,6 @@ menuconfig MALI_BASE_MODULES Those modules provide extra features or debug interfaces and, are optional for the use of the Mali GPU modules. -config DMA_BUF_LOCK - bool "Build dma-buf lock module" - depends on MALI_BASE_MODULES && MALI_DMA_FENCE - default y - help - This option will build the dma_buf_lock module. 
- - Modules: - - dma_buf_lock.ko - config DMA_SHARED_BUFFER_TEST_EXPORTER bool "Build dma-buf framework test exporter module" depends on MALI_BASE_MODULES && DMA_SHARED_BUFFER diff --git a/drivers/base/arm/Makefile b/drivers/base/arm/Makefile index ed5c118a5ef4..c1a61a1106d0 100644 --- a/drivers/base/arm/Makefile +++ b/drivers/base/arm/Makefile @@ -38,11 +38,9 @@ ifeq ($(CONFIG_MALI_BASE_MODULES),y) CONFIG_MALI_CSF_SUPPORT ?= n ifneq ($(CONFIG_DMA_SHARED_BUFFER),n) - CONFIG_DMA_BUF_LOCK ?= y CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER ?= y else # Prevent misuse when CONFIG_DMA_SHARED_BUFFER=n - CONFIG_DMA_BUF_LOCK = n CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER = n endif @@ -54,7 +52,6 @@ ifeq ($(CONFIG_MALI_BASE_MODULES),y) else # Prevent misuse when CONFIG_MALI_BASE_MODULES=n - CONFIG_DMA_BUF_LOCK = n CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER = n CONFIG_MALI_MEMORY_GROUP_MANAGER = n CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR = n @@ -64,10 +61,9 @@ endif CONFIGS := \ CONFIG_MALI_BASE_MODULES \ CONFIG_MALI_CSF_SUPPORT \ - CONFIG_DMA_BUF_LOCK \ CONFIG_DMA_SHARED_BUFFER_TEST_EXPORTER \ CONFIG_MALI_MEMORY_GROUP_MANAGER \ - CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR + CONFIG_MALI_PROTECTED_MEMORY_ALLOCATOR \ # @@ -92,26 +88,47 @@ EXTRA_CFLAGS := $(foreach config,$(CONFIGS), \ $(if $(filter y m,$(value $(value config))), \ -D$(value config)=1)) -# The following were added to align with W=1 in scripts/Makefile.extrawarn -# from the Linux source tree KBUILD_CFLAGS += -Wall -Werror + +# The following were added to align with W=1 in scripts/Makefile.extrawarn +# from the Linux source tree (v5.18.14) KBUILD_CFLAGS += -Wextra -Wunused -Wno-unused-parameter KBUILD_CFLAGS += -Wmissing-declarations KBUILD_CFLAGS += -Wmissing-format-attribute KBUILD_CFLAGS += -Wmissing-prototypes KBUILD_CFLAGS += -Wold-style-definition -KBUILD_CFLAGS += -Wmissing-include-dirs +# The -Wmissing-include-dirs flag cannot be enabled as the paths to some of the +# included directories change depending on whether it is an in-tree or +# out-of-tree build. KBUILD_CFLAGS += $(call cc-option, -Wunused-but-set-variable) KBUILD_CFLAGS += $(call cc-option, -Wunused-const-variable) KBUILD_CFLAGS += $(call cc-option, -Wpacked-not-aligned) KBUILD_CFLAGS += $(call cc-option, -Wstringop-truncation) # The following turn off the warnings enabled by -Wextra -KBUILD_CFLAGS += -Wno-missing-field-initializers KBUILD_CFLAGS += -Wno-sign-compare -KBUILD_CFLAGS += -Wno-type-limits +KBUILD_CFLAGS += -Wno-shift-negative-value +# This flag is needed to avoid build errors on older kernels +KBUILD_CFLAGS += $(call cc-option, -Wno-cast-function-type) KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN1 +# The following were added to align with W=2 in scripts/Makefile.extrawarn +# from the Linux source tree (v5.18.14) +KBUILD_CFLAGS += -Wdisabled-optimization +# The -Wshadow flag cannot be enabled unless upstream kernels are +# patched to fix redefinitions of certain built-in functions and +# global variables.
+KBUILD_CFLAGS += $(call cc-option, -Wlogical-op) +KBUILD_CFLAGS += -Wmissing-field-initializers +KBUILD_CFLAGS += -Wtype-limits +KBUILD_CFLAGS += $(call cc-option, -Wmaybe-uninitialized) +KBUILD_CFLAGS += $(call cc-option, -Wunused-macros) + +KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN2 + +# This warning is disabled to avoid build failures in some kernel versions +KBUILD_CFLAGS += -Wno-ignored-qualifiers + all: $(MAKE) -C $(KDIR) M=$(CURDIR) $(MAKE_ARGS) EXTRA_CFLAGS="$(EXTRA_CFLAGS)" KBUILD_EXTRA_SYMBOLS="$(EXTRA_SYMBOLS)" modules diff --git a/drivers/base/arm/Mconfig b/drivers/base/arm/Mconfig index a48df6d8b090..f7787f0ccd34 100644 --- a/drivers/base/arm/Mconfig +++ b/drivers/base/arm/Mconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -26,16 +26,6 @@ menuconfig MALI_BASE_MODULES Those modules provide extra features or debug interfaces and, are optional for the use of the Mali GPU modules. -config DMA_BUF_LOCK - bool "Build dma-buf lock module" - depends on MALI_BASE_MODULES && MALI_DMA_FENCE - default y - help - This option will build the dma_buf_lock module. - - Modules: - - dma_buf_lock.ko - config DMA_SHARED_BUFFER_TEST_EXPORTER bool "Build dma-buf framework test exporter module" depends on MALI_BASE_MODULES @@ -45,7 +35,7 @@ config DMA_SHARED_BUFFER_TEST_EXPORTER Usable to help test importers. Modules: - - dma-buf-test-exporter.ko + - dma-buf-test-exporter.ko config MALI_MEMORY_GROUP_MANAGER bool "Build Mali Memory Group Manager module" @@ -57,7 +47,7 @@ config MALI_MEMORY_GROUP_MANAGER for memory pools managed by Mali GPU device drivers. Modules: - - memory_group_manager.ko + - memory_group_manager.ko config MALI_PROTECTED_MEMORY_ALLOCATOR bool "Build Mali Protected Memory Allocator module" @@ -70,5 +60,5 @@ config MALI_PROTECTED_MEMORY_ALLOCATOR of Mali GPU device drivers. Modules: - - protected_memory_allocator.ko + - protected_memory_allocator.ko diff --git a/drivers/base/arm/dma_buf_lock/src/dma_buf_lock.c b/drivers/base/arm/dma_buf_lock/src/dma_buf_lock.c deleted file mode 100644 index 43333ca8e5e2..000000000000 --- a/drivers/base/arm/dma_buf_lock/src/dma_buf_lock.c +++ /dev/null @@ -1,908 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * - * (C) COPYRIGHT 2012-2014, 2017-2018, 2020-2022 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. 
- * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) -#include -#else -#include -#endif -#include -#include -#include -#include -#include -#include - -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - -#include - -#define dma_fence_context_alloc(a) fence_context_alloc(a) -#define dma_fence_init(a, b, c, d, e) fence_init(a, b, c, d, e) -#define dma_fence_get(a) fence_get(a) -#define dma_fence_put(a) fence_put(a) -#define dma_fence_signal(a) fence_signal(a) -#define dma_fence_is_signaled(a) fence_is_signaled(a) -#define dma_fence_add_callback(a, b, c) fence_add_callback(a, b, c) -#define dma_fence_remove_callback(a, b) fence_remove_callback(a, b) - -#if (KERNEL_VERSION(4, 9, 68) > LINUX_VERSION_CODE) -#define dma_fence_get_status(a) (fence_is_signaled(a) ? (a)->status ?: 1 : 0) -#else -#define dma_fence_get_status(a) (fence_is_signaled(a) ? (a)->error ?: 1 : 0) -#endif - -#else - -#include - -#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) -#define dma_fence_get_status(a) (dma_fence_is_signaled(a) ? \ - (a)->status ?: 1 \ - : 0) -#endif - -#endif /* < 4.10.0 */ - -#include "dma_buf_lock.h" - -/* Maximum number of buffers that a single handle can address */ -#define DMA_BUF_LOCK_BUF_MAX 32 - -#define DMA_BUF_LOCK_DEBUG 1 - -#define DMA_BUF_LOCK_INIT_BIAS 0xFF - -static dev_t dma_buf_lock_dev; -static struct cdev dma_buf_lock_cdev; -static struct class *dma_buf_lock_class; -static const char dma_buf_lock_dev_name[] = "dma_buf_lock"; - -#if defined(HAVE_UNLOCKED_IOCTL) || defined(HAVE_COMPAT_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE)) -static long dma_buf_lock_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -#else -static int dma_buf_lock_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); -#endif - -static const struct file_operations dma_buf_lock_fops = { - .owner = THIS_MODULE, -#if defined(HAVE_UNLOCKED_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE)) - .unlocked_ioctl = dma_buf_lock_ioctl, -#endif -#if defined(HAVE_COMPAT_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE)) - .compat_ioctl = dma_buf_lock_ioctl, -#endif -}; - -struct dma_buf_lock_resource { -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence fence; -#else - struct dma_fence fence; -#endif - int *list_of_dma_buf_fds; /* List of buffers copied from userspace */ - atomic_t locked; /* Status of lock */ - struct dma_buf **dma_bufs; - unsigned long exclusive; /* Exclusive access bitmap */ - atomic_t fence_dep_count; /* Number of dma-fence dependencies */ - struct list_head dma_fence_callbacks; /* list of all callbacks set up to wait on other fences */ - wait_queue_head_t wait; - struct kref refcount; - struct list_head link; - struct work_struct work; - int count; -}; - -/** - * struct dma_buf_lock_fence_cb - Callback data struct for dma-fence - * @fence_cb: Callback function - * @fence: Pointer to the fence object on which this callback is waiting - * @res: Pointer to dma_buf_lock_resource that is waiting on this callback - * @node: List head for linking this callback to the lock resource - */ -struct dma_buf_lock_fence_cb { -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence_cb fence_cb; - struct fence *fence; -#else - struct dma_fence_cb fence_cb; - struct dma_fence *fence; -#endif - struct dma_buf_lock_resource *res; - struct list_head node; -}; - -static LIST_HEAD(dma_buf_lock_resource_list); 
-static DEFINE_MUTEX(dma_buf_lock_mutex); - -static inline int is_dma_buf_lock_file(struct file *); -static void dma_buf_lock_dounlock(struct kref *ref); - - -/*** dma_buf_lock fence part ***/ - -/* Spin lock protecting all Mali fences as fence->lock. */ -static DEFINE_SPINLOCK(dma_buf_lock_fence_lock); - -static const char * -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -dma_buf_lock_fence_get_driver_name(struct fence *fence) -#else -dma_buf_lock_fence_get_driver_name(struct dma_fence *fence) -#endif -{ - return "dma_buf_lock"; -} - -static const char * -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -dma_buf_lock_fence_get_timeline_name(struct fence *fence) -#else -dma_buf_lock_fence_get_timeline_name(struct dma_fence *fence) -#endif -{ - return "dma_buf_lock.timeline"; -} - -static bool -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -dma_buf_lock_fence_enable_signaling(struct fence *fence) -#else -dma_buf_lock_fence_enable_signaling(struct dma_fence *fence) -#endif -{ - return true; -} - -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -const struct fence_ops dma_buf_lock_fence_ops = { - .wait = fence_default_wait, -#else -const struct dma_fence_ops dma_buf_lock_fence_ops = { - .wait = dma_fence_default_wait, -#endif - .get_driver_name = dma_buf_lock_fence_get_driver_name, - .get_timeline_name = dma_buf_lock_fence_get_timeline_name, - .enable_signaling = dma_buf_lock_fence_enable_signaling, -}; - -static void -dma_buf_lock_fence_init(struct dma_buf_lock_resource *resource) -{ - dma_fence_init(&resource->fence, - &dma_buf_lock_fence_ops, - &dma_buf_lock_fence_lock, - 0, - 0); -} - -static void -dma_buf_lock_fence_free_callbacks(struct dma_buf_lock_resource *resource) -{ - struct dma_buf_lock_fence_cb *cb, *tmp; - - /* Clean up and free callbacks. */ - list_for_each_entry_safe(cb, tmp, &resource->dma_fence_callbacks, node) { - /* Cancel callbacks that hasn't been called yet and release the - * reference taken in dma_buf_lock_fence_add_callback(). - */ - dma_fence_remove_callback(cb->fence, &cb->fence_cb); - dma_fence_put(cb->fence); - list_del(&cb->node); - kfree(cb); - } -} - -static void -dma_buf_lock_fence_work(struct work_struct *pwork) -{ - struct dma_buf_lock_resource *resource = - container_of(pwork, struct dma_buf_lock_resource, work); - - WARN_ON(atomic_read(&resource->fence_dep_count)); - WARN_ON(!atomic_read(&resource->locked)); - WARN_ON(!resource->exclusive); - - mutex_lock(&dma_buf_lock_mutex); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); -} - -static void -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -dma_buf_lock_fence_callback(struct fence *fence, struct fence_cb *cb) -#else -dma_buf_lock_fence_callback(struct dma_fence *fence, struct dma_fence_cb *cb) -#endif -{ - struct dma_buf_lock_fence_cb *dma_buf_lock_cb = container_of(cb, - struct dma_buf_lock_fence_cb, - fence_cb); - struct dma_buf_lock_resource *resource = dma_buf_lock_cb->res; - -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s\n", __func__); -#endif - - /* Callback function will be invoked in atomic context. 
*/ - - if (atomic_dec_and_test(&resource->fence_dep_count)) { - atomic_set(&resource->locked, 1); - wake_up(&resource->wait); - - if (resource->exclusive) - /* Warn if the work was already queued */ - WARN_ON(!schedule_work(&resource->work)); - } -} - -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -static int -dma_buf_lock_fence_add_callback(struct dma_buf_lock_resource *resource, - struct fence *fence, - fence_func_t callback) -#else -static int -dma_buf_lock_fence_add_callback(struct dma_buf_lock_resource *resource, - struct dma_fence *fence, - dma_fence_func_t callback) -#endif -{ - int err = 0; - struct dma_buf_lock_fence_cb *fence_cb; - - if (!fence) - return -EINVAL; - - fence_cb = kmalloc(sizeof(*fence_cb), GFP_KERNEL); - if (!fence_cb) - return -ENOMEM; - - fence_cb->fence = fence; - fence_cb->res = resource; - INIT_LIST_HEAD(&fence_cb->node); - - err = dma_fence_add_callback(fence, &fence_cb->fence_cb, - callback); - - if (err == -ENOENT) { - /* Fence signaled, get the completion result */ - err = dma_fence_get_status(fence); - - /* remap success completion to err code */ - if (err == 1) - err = 0; - - kfree(fence_cb); - } else if (err) { - kfree(fence_cb); - } else { - /* - * Get reference to fence that will be kept until callback gets - * cleaned up in dma_buf_lock_fence_free_callbacks(). - */ - dma_fence_get(fence); - atomic_inc(&resource->fence_dep_count); - /* Add callback to resource's list of callbacks */ - list_add(&fence_cb->node, &resource->dma_fence_callbacks); - } - - return err; -} - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) -static int -dma_buf_lock_add_fence_reservation_callback(struct dma_buf_lock_resource *resource, - struct reservation_object *resv, - bool exclusive) -#else -static int -dma_buf_lock_add_fence_reservation_callback(struct dma_buf_lock_resource *resource, - struct dma_resv *resv, - bool exclusive) -#endif -{ -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence *excl_fence = NULL; - struct fence **shared_fences = NULL; -#else - struct dma_fence *excl_fence = NULL; - struct dma_fence **shared_fences = NULL; -#endif - unsigned int shared_count = 0; - int err, i; - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - err = reservation_object_get_fences_rcu( -#elif (KERNEL_VERSION(5, 14, 0) > LINUX_VERSION_CODE) - err = dma_resv_get_fences_rcu( -#else - err = dma_resv_get_fences( -#endif - resv, - &excl_fence, - &shared_count, - &shared_fences); - if (err) - return err; - - if (excl_fence) { - err = dma_buf_lock_fence_add_callback(resource, - excl_fence, - dma_buf_lock_fence_callback); - - /* Release our reference, taken by reservation_object_get_fences_rcu(), - * to the fence. We have set up our callback (if that was possible), - * and it's the fence's owner is responsible for singling the fence - * before allowing it to disappear. - */ - dma_fence_put(excl_fence); - - if (err) - goto out; - } - - if (exclusive) { - for (i = 0; i < shared_count; i++) { - err = dma_buf_lock_fence_add_callback(resource, - shared_fences[i], - dma_buf_lock_fence_callback); - if (err) - goto out; - } - } - - /* Release all our references to the shared fences, taken by - * reservation_object_get_fences_rcu(). We have set up our callback (if - * that was possible), and it's the fence's owner is responsible for - * signaling the fence before allowing it to disappear. 
- */ -out: - for (i = 0; i < shared_count; i++) - dma_fence_put(shared_fences[i]); - kfree(shared_fences); - - return err; -} - -static void -dma_buf_lock_release_fence_reservation(struct dma_buf_lock_resource *resource, - struct ww_acquire_ctx *ctx) -{ - unsigned int r; - - for (r = 0; r < resource->count; r++) - ww_mutex_unlock(&resource->dma_bufs[r]->resv->lock); - ww_acquire_fini(ctx); -} - -static int -dma_buf_lock_acquire_fence_reservation(struct dma_buf_lock_resource *resource, - struct ww_acquire_ctx *ctx) -{ -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - struct reservation_object *content_resv = NULL; -#else - struct dma_resv *content_resv = NULL; -#endif - unsigned int content_resv_idx = 0; - unsigned int r; - int err = 0; - - ww_acquire_init(ctx, &reservation_ww_class); - -retry: - for (r = 0; r < resource->count; r++) { - if (resource->dma_bufs[r]->resv == content_resv) { - content_resv = NULL; - continue; - } - - err = ww_mutex_lock(&resource->dma_bufs[r]->resv->lock, ctx); - if (err) - goto error; - } - - ww_acquire_done(ctx); - return err; - -error: - content_resv_idx = r; - - /* Unlock the locked one ones */ - while (r--) - ww_mutex_unlock(&resource->dma_bufs[r]->resv->lock); - - if (content_resv) - ww_mutex_unlock(&content_resv->lock); - - /* If we deadlock try with lock_slow and retry */ - if (err == -EDEADLK) { -#if DMA_BUF_LOCK_DEBUG - pr_debug("deadlock at dma_buf fd %i\n", - resource->list_of_dma_buf_fds[content_resv_idx]); -#endif - content_resv = resource->dma_bufs[content_resv_idx]->resv; - ww_mutex_lock_slow(&content_resv->lock, ctx); - goto retry; - } - - /* If we are here the function failed */ - ww_acquire_fini(ctx); - return err; -} - -static int dma_buf_lock_handle_release(struct inode *inode, struct file *file) -{ - struct dma_buf_lock_resource *resource; - - if (!is_dma_buf_lock_file(file)) - return -EINVAL; - - resource = file->private_data; -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s\n", __func__); -#endif - mutex_lock(&dma_buf_lock_mutex); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - - return 0; -} - -static __poll_t dma_buf_lock_handle_poll(struct file *file, poll_table *wait) -{ - struct dma_buf_lock_resource *resource; - unsigned int ret = 0; - - if (!is_dma_buf_lock_file(file)) { -#if (KERNEL_VERSION(4, 19, 0) > LINUX_VERSION_CODE) - return POLLERR; -#else - return EPOLLERR; -#endif - } - - resource = file->private_data; -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s\n", __func__); -#endif - if (atomic_read(&resource->locked) == 1) { - /* Resources have been locked */ -#if (KERNEL_VERSION(4, 19, 0) > LINUX_VERSION_CODE) - ret = POLLIN | POLLRDNORM; - if (resource->exclusive) - ret |= POLLOUT | POLLWRNORM; -#else - ret = EPOLLIN | EPOLLRDNORM; - if (resource->exclusive) - ret |= EPOLLOUT | EPOLLWRNORM; -#endif - } else { - if (!poll_does_not_wait(wait)) - poll_wait(file, &resource->wait, wait); - } -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s : return %i\n", __func__, ret); -#endif - return ret; -} - -static const struct file_operations dma_buf_lock_handle_fops = { - .owner = THIS_MODULE, - .release = dma_buf_lock_handle_release, - .poll = dma_buf_lock_handle_poll, -}; - -/* - * is_dma_buf_lock_file - Check if struct file* is associated with dma_buf_lock - */ -static inline int is_dma_buf_lock_file(struct file *file) -{ - return file->f_op == &dma_buf_lock_handle_fops; -} - -/* - * Start requested lock. 
- * - * Allocates required memory, copies dma_buf_fd list from userspace, - * acquires related reservation objects, and starts the lock. - */ -static int dma_buf_lock_dolock(struct dma_buf_lock_k_request *request) -{ - struct dma_buf_lock_resource *resource; - struct ww_acquire_ctx ww_ctx; - struct file *file; - int size; - int fd; - int i; - int ret; - int error; - - if (request->list_of_dma_buf_fds == NULL) - return -EINVAL; - if (request->count <= 0) - return -EINVAL; - if (request->count > DMA_BUF_LOCK_BUF_MAX) - return -EINVAL; - if (request->exclusive != DMA_BUF_LOCK_NONEXCLUSIVE && - request->exclusive != DMA_BUF_LOCK_EXCLUSIVE) - return -EINVAL; - - resource = kzalloc(sizeof(*resource), GFP_KERNEL); - if (resource == NULL) - return -ENOMEM; - - atomic_set(&resource->locked, 0); - kref_init(&resource->refcount); - INIT_LIST_HEAD(&resource->link); - INIT_WORK(&resource->work, dma_buf_lock_fence_work); - resource->count = request->count; - - /* Allocate space to store dma_buf_fds received from user space */ - size = request->count * sizeof(int); - resource->list_of_dma_buf_fds = kmalloc(size, GFP_KERNEL); - - if (resource->list_of_dma_buf_fds == NULL) { - kfree(resource); - return -ENOMEM; - } - - /* Allocate space to store dma_buf pointers associated with dma_buf_fds */ - size = sizeof(struct dma_buf *) * request->count; - resource->dma_bufs = kmalloc(size, GFP_KERNEL); - - if (resource->dma_bufs == NULL) { - kfree(resource->list_of_dma_buf_fds); - kfree(resource); - return -ENOMEM; - } - - /* Copy requested list of dma_buf_fds from user space */ - size = request->count * sizeof(int); - if (copy_from_user(resource->list_of_dma_buf_fds, - (void __user *)request->list_of_dma_buf_fds, - size) != 0) { - kfree(resource->list_of_dma_buf_fds); - kfree(resource->dma_bufs); - kfree(resource); - return -ENOMEM; - } -#if DMA_BUF_LOCK_DEBUG - for (i = 0; i < request->count; i++) - pr_debug("dma_buf %i = %X\n", i, resource->list_of_dma_buf_fds[i]); -#endif - - /* Initialize the fence associated with dma_buf_lock resource */ - dma_buf_lock_fence_init(resource); - - INIT_LIST_HEAD(&resource->dma_fence_callbacks); - - atomic_set(&resource->fence_dep_count, DMA_BUF_LOCK_INIT_BIAS); - - /* Add resource to global list */ - mutex_lock(&dma_buf_lock_mutex); - - list_add(&resource->link, &dma_buf_lock_resource_list); - - mutex_unlock(&dma_buf_lock_mutex); - - for (i = 0; i < request->count; i++) { - /* Convert fd into dma_buf structure */ - resource->dma_bufs[i] = dma_buf_get(resource->list_of_dma_buf_fds[i]); - - if (IS_ERR_VALUE(PTR_ERR(resource->dma_bufs[i]))) { - mutex_lock(&dma_buf_lock_mutex); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - return -EINVAL; - } - - /*Check the reservation object associated with dma_buf */ - if (resource->dma_bufs[i]->resv == NULL) { - mutex_lock(&dma_buf_lock_mutex); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - return -EINVAL; - } -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s : dma_buf_fd %i dma_buf %p dma_fence reservation %p\n", - __func__, resource->list_of_dma_buf_fds[i], resource->dma_bufs[i], resource->dma_bufs[i]->resv); -#endif - } - - init_waitqueue_head(&resource->wait); - - kref_get(&resource->refcount); - - error = get_unused_fd_flags(0); - if (error < 0) - return error; - - fd = error; - - file = anon_inode_getfile("dma_buf_lock", &dma_buf_lock_handle_fops, (void *)resource, 0); - - if (IS_ERR(file)) { - put_unused_fd(fd); - mutex_lock(&dma_buf_lock_mutex); - 
kref_put(&resource->refcount, dma_buf_lock_dounlock); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - return PTR_ERR(file); - } - - resource->exclusive = request->exclusive; - - /* Start locking process */ - ret = dma_buf_lock_acquire_fence_reservation(resource, &ww_ctx); - if (ret) { -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s : Error %d locking reservations.\n", __func__, ret); -#endif - put_unused_fd(fd); - mutex_lock(&dma_buf_lock_mutex); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - return ret; - } - - /* Take an extra reference for exclusive access, which will be dropped - * once the pre-existing fences attached to dma-buf resources, for which - * we have commited for exclusive access, are signaled. - * At a given time there can be only one exclusive fence attached to a - * reservation object, so the new exclusive fence replaces the original - * fence and the future sync is done against the new fence which is - * supposed to be signaled only after the original fence was signaled. - * If the new exclusive fence is signaled prematurely then the resources - * would become available for new access while they are already being - * written to by the original owner. - */ - if (resource->exclusive) - kref_get(&resource->refcount); - - for (i = 0; i < request->count; i++) { -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - struct reservation_object *resv = resource->dma_bufs[i]->resv; -#else - struct dma_resv *resv = resource->dma_bufs[i]->resv; -#endif - if (!test_bit(i, &resource->exclusive)) { - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - ret = reservation_object_reserve_shared(resv); -#else - ret = dma_resv_reserve_shared(resv, 0); -#endif - if (ret) { -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s : Error %d reserving space for shared fence.\n", __func__, ret); -#endif - break; - } - - ret = dma_buf_lock_add_fence_reservation_callback(resource, - resv, - false); - if (ret) { -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s : Error %d adding reservation to callback.\n", __func__, ret); -#endif - break; - } - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - reservation_object_add_shared_fence(resv, &resource->fence); -#else - dma_resv_add_shared_fence(resv, &resource->fence); -#endif - } else { - ret = dma_buf_lock_add_fence_reservation_callback(resource, resv, true); - if (ret) { -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s : Error %d adding reservation to callback.\n", __func__, ret); -#endif - break; - } - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - reservation_object_add_excl_fence(resv, &resource->fence); -#else - dma_resv_add_excl_fence(resv, &resource->fence); -#endif - } - } - - dma_buf_lock_release_fence_reservation(resource, &ww_ctx); - - /* Test if the callbacks were already triggered */ - if (!atomic_sub_return(DMA_BUF_LOCK_INIT_BIAS, &resource->fence_dep_count)) { - atomic_set(&resource->locked, 1); - - /* Drop the extra reference taken for exclusive access */ - if (resource->exclusive) - dma_buf_lock_fence_work(&resource->work); - } - - if (IS_ERR_VALUE((unsigned long)ret)) { - put_unused_fd(fd); - - mutex_lock(&dma_buf_lock_mutex); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - - return ret; - } - -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s : complete\n", __func__); -#endif - mutex_lock(&dma_buf_lock_mutex); - 
kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - - /* Installing the fd is deferred to the very last operation before return - * to avoid allowing userspace to close it during the setup. - */ - fd_install(fd, file); - return fd; -} - -static void dma_buf_lock_dounlock(struct kref *ref) -{ - int i; - struct dma_buf_lock_resource *resource = container_of(ref, struct dma_buf_lock_resource, refcount); - - atomic_set(&resource->locked, 0); - - /* Signal the resource's fence. */ - dma_fence_signal(&resource->fence); - - dma_buf_lock_fence_free_callbacks(resource); - - list_del(&resource->link); - - for (i = 0; i < resource->count; i++) { - if (resource->dma_bufs[i]) - dma_buf_put(resource->dma_bufs[i]); - } - - kfree(resource->dma_bufs); - kfree(resource->list_of_dma_buf_fds); - dma_fence_put(&resource->fence); -} - -static int __init dma_buf_lock_init(void) -{ - int err; -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s\n", __func__); -#endif - err = alloc_chrdev_region(&dma_buf_lock_dev, 0, 1, dma_buf_lock_dev_name); - - if (err == 0) { - cdev_init(&dma_buf_lock_cdev, &dma_buf_lock_fops); - - err = cdev_add(&dma_buf_lock_cdev, dma_buf_lock_dev, 1); - - if (err == 0) { - dma_buf_lock_class = class_create(THIS_MODULE, dma_buf_lock_dev_name); - if (IS_ERR(dma_buf_lock_class)) - err = PTR_ERR(dma_buf_lock_class); - else { - struct device *mdev = device_create( - dma_buf_lock_class, NULL, dma_buf_lock_dev, - NULL, "%s", dma_buf_lock_dev_name); - if (!IS_ERR(mdev)) - return 0; - - err = PTR_ERR(mdev); - class_destroy(dma_buf_lock_class); - } - cdev_del(&dma_buf_lock_cdev); - } - - unregister_chrdev_region(dma_buf_lock_dev, 1); - } -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s failed\n", __func__); -#endif - return err; -} - -static void __exit dma_buf_lock_exit(void) -{ -#if DMA_BUF_LOCK_DEBUG - pr_debug("%s\n", __func__); -#endif - - /* Unlock all outstanding references */ - while (1) { - struct dma_buf_lock_resource *resource; - - mutex_lock(&dma_buf_lock_mutex); - if (list_empty(&dma_buf_lock_resource_list)) { - mutex_unlock(&dma_buf_lock_mutex); - break; - } - - resource = list_entry(dma_buf_lock_resource_list.next, - struct dma_buf_lock_resource, link); - - kref_put(&resource->refcount, dma_buf_lock_dounlock); - mutex_unlock(&dma_buf_lock_mutex); - } - - device_destroy(dma_buf_lock_class, dma_buf_lock_dev); - - class_destroy(dma_buf_lock_class); - - cdev_del(&dma_buf_lock_cdev); - - unregister_chrdev_region(dma_buf_lock_dev, 1); -} - -#if defined(HAVE_UNLOCKED_IOCTL) || defined(HAVE_COMPAT_IOCTL) || ((KERNEL_VERSION(5, 9, 0) <= LINUX_VERSION_CODE)) -static long dma_buf_lock_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -#else -static int dma_buf_lock_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) -#endif -{ - struct dma_buf_lock_k_request request; - int size = _IOC_SIZE(cmd); - - if (_IOC_TYPE(cmd) != DMA_BUF_LOCK_IOC_MAGIC) - return -ENOTTY; - if ((_IOC_NR(cmd) < DMA_BUF_LOCK_IOC_MINNR) || (_IOC_NR(cmd) > DMA_BUF_LOCK_IOC_MAXNR)) - return -ENOTTY; - - switch (cmd) { - case DMA_BUF_LOCK_FUNC_LOCK_ASYNC: - if (size != sizeof(request)) - return -ENOTTY; - if (copy_from_user(&request, (void __user *)arg, size)) - return -EFAULT; -#if DMA_BUF_LOCK_DEBUG - pr_debug("DMA_BUF_LOCK_FUNC_LOCK_ASYNC - %i\n", request.count); -#endif - return dma_buf_lock_dolock(&request); - } - - return -ENOTTY; -} - -module_init(dma_buf_lock_init); -module_exit(dma_buf_lock_exit); - -MODULE_LICENSE("GPL"); -MODULE_INFO(import_ns, "DMA_BUF"); 
diff --git a/drivers/base/arm/dma_buf_test_exporter/build.bp b/drivers/base/arm/dma_buf_test_exporter/build.bp index a49fb81d6665..aabd32aa5103 100644 --- a/drivers/base/arm/dma_buf_test_exporter/build.bp +++ b/drivers/base/arm/dma_buf_test_exporter/build.bp @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2017, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2017, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,7 +22,7 @@ bob_kernel_module { name: "dma-buf-test-exporter", defaults: [ - "kernel_defaults" + "kernel_defaults", ], srcs: [ "Kbuild", diff --git a/drivers/base/arm/dma_buf_test_exporter/dma-buf-test-exporter.c b/drivers/base/arm/dma_buf_test_exporter/dma-buf-test-exporter.c index 6b9a4d70483a..5f033a60026c 100644 --- a/drivers/base/arm/dma_buf_test_exporter/dma-buf-test-exporter.c +++ b/drivers/base/arm/dma_buf_test_exporter/dma-buf-test-exporter.c @@ -19,7 +19,7 @@ * */ -#include +#include #include #include #include @@ -32,6 +32,9 @@ #include #include +#define DMA_BUF_TE_VER_MAJOR 1 +#define DMA_BUF_TE_VER_MINOR 0 + /* Maximum size allowed in a single DMA_BUF_TE_ALLOC call */ #define DMA_BUF_TE_ALLOC_MAX_SIZE ((8ull << 30) >> PAGE_SHIFT) /* 8 GB */ diff --git a/drivers/base/arm/memory_group_manager/build.bp b/drivers/base/arm/memory_group_manager/build.bp index 23db183e4f1b..f4b809e774d2 100644 --- a/drivers/base/arm/memory_group_manager/build.bp +++ b/drivers/base/arm/memory_group_manager/build.bp @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,7 +22,7 @@ bob_kernel_module { name: "memory_group_manager", defaults: [ - "kernel_defaults" + "kernel_defaults", ], srcs: [ "Kbuild", diff --git a/drivers/base/arm/memory_group_manager/memory_group_manager.c b/drivers/base/arm/memory_group_manager/memory_group_manager.c index 7729492e0c80..825893e3cf8e 100644 --- a/drivers/base/arm/memory_group_manager/memory_group_manager.c +++ b/drivers/base/arm/memory_group_manager/memory_group_manager.c @@ -265,8 +265,8 @@ static struct page *example_mgm_alloc_page( struct mgm_groups *const data = mgm_dev->data; struct page *p; - dev_dbg(data->dev, "%s(mgm_dev=%p, group_id=%d gfp_mask=0x%x order=%u\n", - __func__, (void *)mgm_dev, group_id, gfp_mask, order); + dev_dbg(data->dev, "%s(mgm_dev=%pK, group_id=%d gfp_mask=0x%x order=%u\n", __func__, + (void *)mgm_dev, group_id, gfp_mask, order); if (WARN_ON(group_id < 0) || WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) @@ -291,8 +291,8 @@ static void example_mgm_free_page( { struct mgm_groups *const data = mgm_dev->data; - dev_dbg(data->dev, "%s(mgm_dev=%p, group_id=%d page=%p order=%u\n", - __func__, (void *)mgm_dev, group_id, (void *)page, order); + dev_dbg(data->dev, "%s(mgm_dev=%pK, group_id=%d page=%pK order=%u\n", __func__, + (void *)mgm_dev, group_id, (void *)page, order); if (WARN_ON(group_id < 0) || WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) @@ -309,9 +309,8 @@ static int example_mgm_get_import_memory_id( { struct mgm_groups *const data = mgm_dev->data; - dev_dbg(data->dev, "%s(mgm_dev=%p, import_data=%p (type=%d)\n", - __func__, (void *)mgm_dev, (void *)import_data, - (int)import_data->type); + dev_dbg(data->dev, "%s(mgm_dev=%pK, import_data=%pK (type=%d)\n", __func__, (void *)mgm_dev, + (void *)import_data, (int)import_data->type); if (!WARN_ON(!import_data)) { WARN_ON(!import_data->u.dma_buf); @@ -329,9 +328,8 @@ static u64 example_mgm_update_gpu_pte( { struct mgm_groups *const data = mgm_dev->data; - dev_dbg(data->dev, - "%s(mgm_dev=%p, group_id=%d, mmu_level=%d, pte=0x%llx)\n", - __func__, (void *)mgm_dev, group_id, mmu_level, pte); + dev_dbg(data->dev, "%s(mgm_dev=%pK, group_id=%d, mmu_level=%d, pte=0x%llx)\n", __func__, + (void *)mgm_dev, group_id, mmu_level, pte); if (WARN_ON(group_id < 0) || WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) @@ -367,9 +365,9 @@ static vm_fault_t example_mgm_vmf_insert_pfn_prot( vm_fault_t fault; dev_dbg(data->dev, - "%s(mgm_dev=%p, group_id=%d, vma=%p, addr=0x%lx, pfn=0x%lx, prot=0x%llx)\n", + "%s(mgm_dev=%pK, group_id=%d, vma=%pK, addr=0x%lx, pfn=0x%lx, prot=0x%llx)\n", __func__, (void *)mgm_dev, group_id, (void *)vma, addr, pfn, - (unsigned long long) pgprot_val(prot)); + (unsigned long long)pgprot_val(prot)); if (WARN_ON(group_id < 0) || WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) diff --git a/drivers/base/arm/protected_memory_allocator/build.bp b/drivers/base/arm/protected_memory_allocator/build.bp index 4c56154061e8..aef5344da31c 100644 --- a/drivers/base/arm/protected_memory_allocator/build.bp +++ b/drivers/base/arm/protected_memory_allocator/build.bp @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,7 +22,7 @@ bob_kernel_module { name: "protected_memory_allocator", defaults: [ - "kernel_defaults" + "kernel_defaults", ], srcs: [ "Kbuild", diff --git a/drivers/gpu/arm/bifrost/Kbuild b/drivers/gpu/arm/bifrost/Kbuild index a7f0ba0da1e8..70f3997b2bd3 100644 --- a/drivers/gpu/arm/bifrost/Kbuild +++ b/drivers/gpu/arm/bifrost/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2012-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -59,10 +59,8 @@ ifeq ($(CONFIG_MALI_PRFCNT_SET_SELECT_VIA_DEBUG_FS), y) endif ifeq ($(CONFIG_MALI_BIFROST_FENCE_DEBUG), y) - ifneq ($(CONFIG_SYNC), y) - ifneq ($(CONFIG_SYNC_FILE), y) - $(error CONFIG_MALI_BIFROST_FENCE_DEBUG depends on CONFIG_SYNC || CONFIG_SYNC_FILE to be set in Kernel configuration) - endif + ifneq ($(CONFIG_SYNC_FILE), y) + $(error CONFIG_MALI_BIFROST_FENCE_DEBUG depends on CONFIG_SYNC_FILE to be set in Kernel configuration) endif endif @@ -71,7 +69,7 @@ endif # # Driver version string which is returned to userspace via an ioctl -MALI_RELEASE_NAME ?= '"g13p0-01eac0"' +MALI_RELEASE_NAME ?= '"g15p0-01eac0"' # Set up defaults if not defined by build system ifeq ($(CONFIG_MALI_BIFROST_DEBUG), y) MALI_UNIT_TEST = 1 @@ -151,6 +149,7 @@ bifrost_kbase-y := \ mali_kbase_cache_policy.o \ mali_kbase_ccswe.o \ mali_kbase_mem.o \ + mali_kbase_mem_migrate.o \ mali_kbase_mem_pool_group.o \ mali_kbase_native_mgm.o \ mali_kbase_ctx_sched.o \ @@ -159,12 +158,6 @@ bifrost_kbase-y := \ mali_kbase_config.o \ mali_kbase_kinstr_prfcnt.o \ mali_kbase_vinstr.o \ - mali_kbase_hwcnt.o \ - mali_kbase_hwcnt_gpu.o \ - mali_kbase_hwcnt_gpu_narrow.o \ - mali_kbase_hwcnt_types.o \ - mali_kbase_hwcnt_virtualizer.o \ - mali_kbase_hwcnt_watchdog_if_timer.o \ mali_kbase_softjobs.o \ mali_kbase_hw.o \ mali_kbase_debug.o \ @@ -175,6 +168,7 @@ bifrost_kbase-y := \ mali_kbase_disjoint_events.o \ mali_kbase_debug_mem_view.o \ mali_kbase_debug_mem_zones.o \ + mali_kbase_debug_mem_allocs.o \ mali_kbase_smc.o \ mali_kbase_mem_pool.o \ mali_kbase_mem_pool_debugfs.o \ @@ -191,24 +185,14 @@ bifrost_kbase-$(CONFIG_DEBUG_FS) += mali_kbase_pbha_debugfs.o bifrost_kbase-$(CONFIG_MALI_CINSTR_GWT) += mali_kbase_gwt.o -bifrost_kbase-$(CONFIG_SYNC) += \ - mali_kbase_sync_android.o \ - mali_kbase_sync_common.o - bifrost_kbase-$(CONFIG_SYNC_FILE) += \ mali_kbase_fence_ops.o \ mali_kbase_sync_file.o \ mali_kbase_sync_common.o -ifeq ($(CONFIG_MALI_CSF_SUPPORT),y) - bifrost_kbase-y += \ - mali_kbase_hwcnt_backend_csf.o \ - mali_kbase_hwcnt_backend_csf_if_fw.o -else +ifneq ($(CONFIG_MALI_CSF_SUPPORT),y) bifrost_kbase-y += \ mali_kbase_jm.o \ - mali_kbase_hwcnt_backend_jm.o \ - mali_kbase_hwcnt_backend_jm_watchdog.o \ mali_kbase_dummy_job_wa.o \ mali_kbase_debug_job_fault.o \ mali_kbase_event.o \ @@ -218,11 +202,6 @@ else mali_kbase_js_ctx_attr.o \ mali_kbase_kinstr_jm.o - bifrost_kbase-$(CONFIG_MALI_BIFROST_DMA_FENCE) += \ - mali_kbase_fence_ops.o \ - mali_kbase_dma_fence.o \ - mali_kbase_fence.o - bifrost_kbase-$(CONFIG_SYNC_FILE) += \ mali_kbase_fence_ops.o \ mali_kbase_fence.o @@ -236,6 +215,7 @@ INCLUDE_SUBDIR = \ $(src)/backend/gpu/Kbuild \ $(src)/mmu/Kbuild \ $(src)/tl/Kbuild \ + 
$(src)/hwcnt/Kbuild \ $(src)/gpu/Kbuild \ $(src)/thirdparty/Kbuild \ $(src)/platform/$(MALI_PLATFORM_DIR)/Kbuild diff --git a/drivers/gpu/arm/bifrost/Kconfig b/drivers/gpu/arm/bifrost/Kconfig index 54f083dbad27..1bfb59ca14e2 100644 --- a/drivers/gpu/arm/bifrost/Kconfig +++ b/drivers/gpu/arm/bifrost/Kconfig @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2012-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -91,16 +91,6 @@ config MALI_BIFROST_ENABLE_TRACE Enables tracing in kbase. Trace log available through the "mali_trace" debugfs file, when the CONFIG_DEBUG_FS is enabled -config MALI_BIFROST_DMA_FENCE - bool "Enable DMA_BUF fence support for Mali" - depends on MALI_BIFROST - default n - help - Support DMA_BUF fences for Mali. - - This option should only be enabled if the Linux Kernel has built in - support for DMA_BUF fences. - config MALI_ARBITER_SUPPORT bool "Enable arbiter support for Mali" depends on MALI_BIFROST && !MALI_CSF_SUPPORT @@ -117,7 +107,7 @@ config MALI_DMA_BUF_MAP_ON_DEMAND depends on MALI_BIFROST default n help - This option caused kbase to set up the GPU mapping of imported + This option will cause kbase to set up the GPU mapping of imported dma-buf when needed to run atoms. This is the legacy behavior. This is intended for testing and the option will get removed in the @@ -237,7 +227,7 @@ config MALI_BIFROST_DEBUG config MALI_BIFROST_FENCE_DEBUG bool "Enable debug sync fence usage" - depends on MALI_BIFROST && MALI_BIFROST_EXPERT && (SYNC || SYNC_FILE) + depends on MALI_BIFROST && MALI_BIFROST_EXPERT && SYNC_FILE default y if MALI_BIFROST_DEBUG help Select this option to enable additional checking and reporting on the @@ -385,9 +375,6 @@ config MALI_ARBITRATION virtualization setup for Mali If unsure, say N. 
-if MALI_ARBITRATION -source "drivers/gpu/arm/bifrost/arbitration/Kconfig" -endif # source "drivers/gpu/arm/bifrost/tests/Kconfig" diff --git a/drivers/gpu/arm/bifrost/Makefile b/drivers/gpu/arm/bifrost/Makefile index 623177ed26fb..3fb736d7950e 100644 --- a/drivers/gpu/arm/bifrost/Makefile +++ b/drivers/gpu/arm/bifrost/Makefile @@ -65,7 +65,7 @@ ifeq ($(CONFIG_MALI_BIFROST),m) endif ifeq ($(CONFIG_XEN),y) - ifneq ($(CONFIG_MALI_ARBITRATION), n) + ifneq ($(CONFIG_MALI_ARBITER_SUPPORT), n) CONFIG_MALI_XEN ?= m endif endif @@ -91,14 +91,10 @@ ifeq ($(CONFIG_MALI_BIFROST),m) CONFIG_MALI_BIFROST_ENABLE_TRACE ?= y CONFIG_MALI_BIFROST_SYSTEM_TRACE ?= y - ifeq ($(CONFIG_SYNC), y) + ifeq ($(CONFIG_SYNC_FILE), y) CONFIG_MALI_BIFROST_FENCE_DEBUG ?= y else - ifeq ($(CONFIG_SYNC_FILE), y) - CONFIG_MALI_BIFROST_FENCE_DEBUG ?= y - else - CONFIG_MALI_BIFROST_FENCE_DEBUG = n - endif + CONFIG_MALI_BIFROST_FENCE_DEBUG = n endif else # Prevent misuse when CONFIG_MALI_BIFROST_DEBUG=n @@ -160,7 +156,6 @@ CONFIGS := \ CONFIG_MALI_BIFROST \ CONFIG_MALI_CSF_SUPPORT \ CONFIG_MALI_BIFROST_GATOR_SUPPORT \ - CONFIG_MALI_BIFROST_DMA_FENCE \ CONFIG_MALI_ARBITER_SUPPORT \ CONFIG_MALI_ARBITRATION \ CONFIG_MALI_ARBITER_MODULES \ @@ -227,26 +222,47 @@ EXTRA_CFLAGS += -DCONFIG_MALI_PLATFORM_NAME=$(CONFIG_MALI_PLATFORM_NAME) # KBUILD_EXTRA_SYMBOLS to prevent warnings about unknown functions # -# The following were added to align with W=1 in scripts/Makefile.extrawarn -# from the Linux source tree KBUILD_CFLAGS += -Wall -Werror + +# The following were added to align with W=1 in scripts/Makefile.extrawarn +# from the Linux source tree (v5.18.14) KBUILD_CFLAGS += -Wextra -Wunused -Wno-unused-parameter KBUILD_CFLAGS += -Wmissing-declarations KBUILD_CFLAGS += -Wmissing-format-attribute KBUILD_CFLAGS += -Wmissing-prototypes KBUILD_CFLAGS += -Wold-style-definition -KBUILD_CFLAGS += -Wmissing-include-dirs +# The -Wmissing-include-dirs flag cannot be enabled as the paths to some of the +# included directories change depending on whether it is an in-tree or +# out-of-tree build. KBUILD_CFLAGS += $(call cc-option, -Wunused-but-set-variable) KBUILD_CFLAGS += $(call cc-option, -Wunused-const-variable) KBUILD_CFLAGS += $(call cc-option, -Wpacked-not-aligned) KBUILD_CFLAGS += $(call cc-option, -Wstringop-truncation) # The following turn off the warnings enabled by -Wextra -KBUILD_CFLAGS += -Wno-missing-field-initializers KBUILD_CFLAGS += -Wno-sign-compare -KBUILD_CFLAGS += -Wno-type-limits +KBUILD_CFLAGS += -Wno-shift-negative-value +# This flag is needed to avoid build errors on older kernels +KBUILD_CFLAGS += $(call cc-option, -Wno-cast-function-type) KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN1 +# The following were added to align with W=2 in scripts/Makefile.extrawarn +# from the Linux source tree (v5.18.14) +KBUILD_CFLAGS += -Wdisabled-optimization +# The -Wshadow flag cannot be enabled unless upstream kernels are +# patched to fix redefinitions of certain built-in functions and +# global variables.
+KBUILD_CFLAGS += $(call cc-option, -Wlogical-op) +KBUILD_CFLAGS += -Wmissing-field-initializers +KBUILD_CFLAGS += -Wtype-limits +KBUILD_CFLAGS += $(call cc-option, -Wmaybe-uninitialized) +KBUILD_CFLAGS += $(call cc-option, -Wunused-macros) + +KBUILD_CPPFLAGS += -DKBUILD_EXTRA_WARN2 + +# This warning is disabled to avoid build failures in some kernel versions +KBUILD_CFLAGS += -Wno-ignored-qualifiers + all: $(MAKE) -C $(KDIR) M=$(CURDIR) $(MAKE_ARGS) EXTRA_CFLAGS="$(EXTRA_CFLAGS)" KBUILD_EXTRA_SYMBOLS="$(EXTRA_SYMBOLS)" modules diff --git a/drivers/gpu/arm/bifrost/Mconfig b/drivers/gpu/arm/bifrost/Mconfig index fd81ac44af3d..f812bcad639c 100644 --- a/drivers/gpu/arm/bifrost/Mconfig +++ b/drivers/gpu/arm/bifrost/Mconfig @@ -97,16 +97,6 @@ config MALI_BIFROST_ENABLE_TRACE Enables tracing in kbase. Trace log available through the "mali_trace" debugfs file, when the CONFIG_DEBUG_FS is enabled -config MALI_BIFROST_DMA_FENCE - bool "Enable DMA_BUF fence support for Mali" - depends on MALI_BIFROST - default n - help - Support DMA_BUF fences for Mali. - - This option should only be enabled if the Linux Kernel has built in - support for DMA_BUF fences. - config MALI_ARBITER_SUPPORT bool "Enable arbiter support for Mali" depends on MALI_BIFROST && !MALI_CSF_SUPPORT @@ -129,7 +119,7 @@ config MALI_DMA_BUF_MAP_ON_DEMAND default n default y if !DMA_BUF_SYNC_IOCTL_SUPPORTED help - This option caused kbase to set up the GPU mapping of imported + This option will cause kbase to set up the GPU mapping of imported dma-buf when needed to run atoms. This is the legacy behavior. This is intended for testing and the option will get removed in the @@ -157,17 +147,6 @@ menuconfig MALI_BIFROST_EXPERT Enabling this option and modifying the default settings may produce a driver with performance or other limitations. -config MALI_2MB_ALLOC - bool "Attempt to allocate 2MB pages" - depends on MALI_BIFROST && MALI_BIFROST_EXPERT - default n - help - Rather than allocating all GPU memory page-by-page, attempt to - allocate 2MB pages from the kernel. This reduces TLB pressure and - helps to prevent memory fragmentation. - - If in doubt, say N - config MALI_MEMORY_FULLY_BACKED bool "Enable memory fully physically-backed" depends on MALI_BIFROST && MALI_BIFROST_EXPERT @@ -200,10 +179,10 @@ config MALI_FW_CORE_DUMP Example: * To explicitly request core dump: - echo 1 >/sys/kernel/debug/mali0/fw_core_dump + echo 1 >/sys/kernel/debug/mali0/fw_core_dump * To output current core dump (after explicitly requesting a core dump, - or kernel driver reported an internal firmware error): - cat /sys/kernel/debug/mali0/fw_core_dump + or kernel driver reported an internal firmware error): + cat /sys/kernel/debug/mali0/fw_core_dump choice prompt "Error injection level" @@ -343,5 +322,5 @@ config MALI_HW_ERRATA_1485982_USE_CLOCK_ALTERNATIVE slowest clock will be selected. -source "kernel/drivers/gpu/arm/midgard/arbitration/Mconfig" +source "kernel/drivers/gpu/arm/arbitration/Mconfig" source "kernel/drivers/gpu/arm/midgard/tests/Mconfig" diff --git a/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbif.c b/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbif.c index 64e11ce53625..b5d3cd685ba5 100644 --- a/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbif.c +++ b/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbif.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -28,12 +28,12 @@ #include #include #include -#include "mali_kbase_arbiter_interface.h" +#include "linux/mali_arbiter_interface.h" /* Arbiter interface version against which was implemented this module */ #define MALI_REQUIRED_KBASE_ARBITER_INTERFACE_VERSION 5 #if MALI_REQUIRED_KBASE_ARBITER_INTERFACE_VERSION != \ - MALI_KBASE_ARBITER_INTERFACE_VERSION + MALI_ARBITER_INTERFACE_VERSION #error "Unsupported Mali Arbiter interface version." #endif @@ -205,6 +205,7 @@ int kbase_arbif_init(struct kbase_device *kbdev) if (!pdev->dev.driver || !try_module_get(pdev->dev.driver->owner)) { dev_err(kbdev->dev, "arbiter_if driver not available\n"); + put_device(&pdev->dev); return -EPROBE_DEFER; } kbdev->arb.arb_dev = &pdev->dev; @@ -212,6 +213,7 @@ int kbase_arbif_init(struct kbase_device *kbdev) if (!arb_if) { dev_err(kbdev->dev, "arbiter_if driver not ready\n"); module_put(pdev->dev.driver->owner); + put_device(&pdev->dev); return -EPROBE_DEFER; } @@ -233,6 +235,7 @@ int kbase_arbif_init(struct kbase_device *kbdev) if (err) { dev_err(&pdev->dev, "Failed to register with arbiter\n"); module_put(pdev->dev.driver->owner); + put_device(&pdev->dev); if (err != -EPROBE_DEFER) err = -EFAULT; return err; @@ -262,8 +265,10 @@ void kbase_arbif_destroy(struct kbase_device *kbdev) arb_if->vm_ops.vm_arb_unregister_dev(kbdev->arb.arb_if); } kbdev->arb.arb_if = NULL; - if (kbdev->arb.arb_dev) + if (kbdev->arb.arb_dev) { module_put(kbdev->arb.arb_dev->driver->owner); + put_device(kbdev->arb.arb_dev); + } kbdev->arb.arb_dev = NULL; } diff --git a/drivers/gpu/arm/bifrost/arbitration/Kconfig b/drivers/gpu/arm/bifrost/arbitration/Kconfig deleted file mode 100644 index e991653e8d81..000000000000 --- a/drivers/gpu/arm/bifrost/arbitration/Kconfig +++ /dev/null @@ -1,49 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT -# -# (C) COPYRIGHT 2012-2021 ARM Limited. All rights reserved. -# -# This program is free software and is provided to you under the terms of the -# GNU General Public License version 2 as published by the Free Software -# Foundation, and any use by you of this program is subject to the terms -# of such GNU license. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# - -config MALI_XEN - tristate "Enable Xen Interface reference code" - depends on MALI_ARBITRATION && XEN - default n - help - Enables the build of xen interface modules used in the reference - virtualization setup for Mali - If unsure, say N. 
- -config MALI_ARBITER_MODULES - tristate "Enable mali arbiter modules" - depends on MALI_ARBITRATION - default y - help - Enables the build of the arbiter modules used in the reference - virtualization setup for Mali - If unsure, say N - -config MALI_GPU_POWER_MODULES - tristate "Enable gpu power modules" - depends on MALI_ARBITRATION - default y - help - Enables the build of the gpu power modules used in the reference - virtualization setup for Mali - If unsure, say N - - -source "drivers/gpu/arm/bifrost/arbitration/ptm/Kconfig" diff --git a/drivers/gpu/arm/bifrost/arbitration/ptm/Kconfig b/drivers/gpu/arm/bifrost/arbitration/ptm/Kconfig deleted file mode 100644 index 074ebd50daa5..000000000000 --- a/drivers/gpu/arm/bifrost/arbitration/ptm/Kconfig +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note OR MIT -# -# (C) COPYRIGHT 2021 ARM Limited. All rights reserved. -# -# This program is free software and is provided to you under the terms of the -# GNU General Public License version 2 as published by the Free Software -# Foundation, and any use by you of this program is subject to the terms -# of such GNU license. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# -# - -config MALI_PARTITION_MANAGER - tristate "Enable compilation of partition manager modules" - depends on MALI_ARBITRATION - default n - help - This option enables the compilation of the partition manager - modules used to configure the Mali-G78AE GPU. - diff --git a/drivers/gpu/arm/bifrost/backend/gpu/Kbuild b/drivers/gpu/arm/bifrost/backend/gpu/Kbuild index 65f07e23412d..7eec91ff6631 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/Kbuild +++ b/drivers/gpu/arm/bifrost/backend/gpu/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2014-2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2014-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.c index 9587c704ff8a..7c0abbaf860f 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2014-2016, 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2016, 2018, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,12 +22,32 @@ #include "backend/gpu/mali_kbase_cache_policy_backend.h" #include +/** + * kbasep_amba_register_present() - Check AMBA_<> register is present + * in the GPU. + * @kbdev: Device pointer + * + * Note: Only for arch version 12.x.1 onwards. + * + * Return: true if AMBA_FEATURES/ENABLE registers are present. 
+ */ +static bool kbasep_amba_register_present(struct kbase_device *kbdev) +{ + return (ARCH_MAJOR_REV_REG(kbdev->gpu_props.props.raw_props.gpu_id) >= + GPU_ID2_ARCH_MAJOR_REV_MAKE(12, 1)); +} void kbase_cache_set_coherency_mode(struct kbase_device *kbdev, u32 mode) { kbdev->current_gpu_coherency_mode = mode; + if (kbasep_amba_register_present(kbdev)) { + u32 val = kbase_reg_read(kbdev, AMBA_ENABLE); + + val = AMBA_ENABLE_COHERENCY_PROTOCOL_SET(val, mode); + kbase_reg_write(kbdev, AMBA_ENABLE, val); + } else kbase_reg_write(kbdev, COHERENCY_ENABLE, mode); } @@ -35,9 +55,38 @@ u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev) { u32 coherency_features; + if (kbasep_amba_register_present(kbdev)) + coherency_features = + kbase_reg_read(kbdev, GPU_CONTROL_REG(AMBA_FEATURES)); + else coherency_features = kbase_reg_read( kbdev, GPU_CONTROL_REG(COHERENCY_FEATURES)); return coherency_features; } +void kbase_amba_set_memory_cache_support(struct kbase_device *kbdev, + bool enable) +{ + if (kbasep_amba_register_present(kbdev)) { + u32 val = kbase_reg_read(kbdev, AMBA_ENABLE); + + val = AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SET(val, enable); + kbase_reg_write(kbdev, AMBA_ENABLE, val); + + } else { + WARN(1, "memory_cache_support not supported"); + } +} + +void kbase_amba_set_invalidate_hint(struct kbase_device *kbdev, bool enable) +{ + if (kbasep_amba_register_present(kbdev)) { + u32 val = kbase_reg_read(kbdev, AMBA_ENABLE); + + val = AMBA_ENABLE_INVALIDATE_HINT_SET(val, enable); + kbase_reg_write(kbdev, AMBA_ENABLE, val); + } else { + WARN(1, "invalidate_hint not supported"); + } +} diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.h b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.h index 795dbea40318..758e3be08c16 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.h +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_cache_policy_backend.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2014-2016, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2016, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -43,4 +43,23 @@ void kbase_cache_set_coherency_mode(struct kbase_device *kbdev, */ u32 kbase_cache_get_coherency_features(struct kbase_device *kbdev); +/** + * kbase_amba_set_memory_cache_support() - Sets AMBA memory cache support + * in the GPU. + * @kbdev: Device pointer + * @enable: true for enable. + * + * Note: Only for arch version 12.x.1 onwards. + */ +void kbase_amba_set_memory_cache_support(struct kbase_device *kbdev, + bool enable); +/** + * kbase_amba_set_invalidate_hint() - Sets AMBA invalidate hint + * in the GPU. + * @kbdev: Device pointer + * @enable: true for enable. + * + * Note: Only for arch version 12.x.1 onwards. 
+ */ +void kbase_amba_set_invalidate_hint(struct kbase_device *kbdev, bool enable); #endif /* _KBASE_CACHE_POLICY_BACKEND_H_ */ diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_instr_defs.h b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_instr_defs.h index 7190f42c2104..bd2eb8a12047 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_instr_defs.h +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_instr_defs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2014, 2016, 2018-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014, 2016, 2018-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -26,7 +26,7 @@ #ifndef _KBASE_INSTR_DEFS_H_ #define _KBASE_INSTR_DEFS_H_ -#include +#include /* * Instrumentation State Machine States diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_irq_linux.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_irq_linux.c index 72def5e7aabb..15999cbc9126 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_irq_linux.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_irq_linux.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2014-2016, 2018-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014-2016, 2018-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -163,7 +163,6 @@ static irq_handler_t kbase_handler_table[] = { #ifdef CONFIG_MALI_BIFROST_DEBUG #define JOB_IRQ_HANDLER JOB_IRQ_TAG -#define MMU_IRQ_HANDLER MMU_IRQ_TAG #define GPU_IRQ_HANDLER GPU_IRQ_TAG /** diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_hw.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_hw.c index 08de02495a4a..e17014e45f6b 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_hw.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_hw.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include @@ -1440,6 +1440,11 @@ bool kbase_reset_gpu_is_active(struct kbase_device *kbdev) return true; } +bool kbase_reset_gpu_is_not_pending(struct kbase_device *kbdev) +{ + return atomic_read(&kbdev->hwaccess.backend.reset_gpu) == KBASE_RESET_GPU_NOT_PENDING; +} + int kbase_reset_gpu_wait(struct kbase_device *kbdev) { wait_event(kbdev->hwaccess.backend.reset_wait, diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_rb.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_rb.c index 9960beb2e9b4..e5af4ca8fc43 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_rb.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_jm_rb.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_dummy.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_dummy.c index e4f4b2455925..1a0209f702ac 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_dummy.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_dummy.c @@ -80,31 +80,360 @@ static bool ipa_control_timer_enabled; #endif #define LO_MASK(M) ((M) & 0xFFFFFFFF) +#if !MALI_USE_CSF #define HI_MASK(M) ((M) & 0xFFFFFFFF00000000) +#endif -static u32 get_implementation_register(u32 reg) -{ - switch (reg) { - case 
GPU_CONTROL_REG(SHADER_PRESENT_LO): - return LO_MASK(DUMMY_IMPLEMENTATION_SHADER_PRESENT); - case GPU_CONTROL_REG(TILER_PRESENT_LO): - return LO_MASK(DUMMY_IMPLEMENTATION_TILER_PRESENT); - case GPU_CONTROL_REG(L2_PRESENT_LO): - return LO_MASK(DUMMY_IMPLEMENTATION_L2_PRESENT); - case GPU_CONTROL_REG(STACK_PRESENT_LO): - return LO_MASK(DUMMY_IMPLEMENTATION_STACK_PRESENT); +/* Construct a value for the THREAD_FEATURES register, *except* the two most + * significant bits, which are set to IMPLEMENTATION_MODEL in + * midgard_model_read_reg(). + */ +#if MALI_USE_CSF +#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \ + ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 24)) +#else +#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \ + ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 16) | ((MAX_TG_SPLIT) << 24)) +#endif - case GPU_CONTROL_REG(SHADER_PRESENT_HI): - case GPU_CONTROL_REG(TILER_PRESENT_HI): - case GPU_CONTROL_REG(L2_PRESENT_HI): - case GPU_CONTROL_REG(STACK_PRESENT_HI): - /* *** FALLTHROUGH *** */ - default: - return 0; - } -} +struct error_status_t hw_error_status; -struct { +/** + * struct control_reg_values_t - control register values specific to the GPU being 'emulated' + * @name: GPU name + * @gpu_id: GPU ID to report + * @as_present: Bitmap of address spaces present + * @thread_max_threads: Maximum number of threads per core + * @thread_max_workgroup_size: Maximum number of threads per workgroup + * @thread_max_barrier_size: Maximum number of threads per barrier + * @thread_features: Thread features, NOT INCLUDING the 2 + * most-significant bits, which are always set to + * IMPLEMENTATION_MODEL. + * @core_features: Core features + * @tiler_features: Tiler features + * @mmu_features: MMU features + * @gpu_features_lo: GPU features (low) + * @gpu_features_hi: GPU features (high) + * @shader_present: Available shader bitmap + * @stack_present: Core stack present bitmap + * + */ +struct control_reg_values_t { + const char *name; + u32 gpu_id; + u32 as_present; + u32 thread_max_threads; + u32 thread_max_workgroup_size; + u32 thread_max_barrier_size; + u32 thread_features; + u32 core_features; + u32 tiler_features; + u32 mmu_features; + u32 gpu_features_lo; + u32 gpu_features_hi; + u32 shader_present; + u32 stack_present; +}; + +struct job_slot { + int job_active; + int job_queued; + int job_complete_irq_asserted; + int job_irq_mask; + int job_disabled; +}; + +struct dummy_model_t { + int reset_completed; + int reset_completed_mask; +#if !MALI_USE_CSF + int prfcnt_sample_completed; +#endif /* !MALI_USE_CSF */ + int power_changed_mask; /* 2bits: _ALL,_SINGLE */ + int power_changed; /* 1bit */ + bool clean_caches_completed; + bool clean_caches_completed_irq_enabled; +#if MALI_USE_CSF + bool flush_pa_range_completed; + bool flush_pa_range_completed_irq_enabled; +#endif + int power_on; /* 6bits: SHADER[4],TILER,L2 */ + u32 stack_power_on_lo; + u32 coherency_enable; + unsigned int job_irq_js_state; + struct job_slot slots[NUM_SLOTS]; + const struct control_reg_values_t *control_reg_values; + u32 l2_config; + void *data; +}; + +/* Array associating GPU names with control register values. The first + * one is used in the case of no match. 
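/*
 * Worked example of the THREAD_FEATURES_PARTIAL() macros above, using the
 * (0x6000, 4, 10) tuple that most of the Midgard/Bifrost table entries pass
 * in. The EXAMPLE_* macros below only restate the two variants from this hunk
 * so the arithmetic can be checked standalone; the real register value also
 * gets its two most-significant bits set to IMPLEMENTATION_MODEL later, in
 * midgard_model_read_reg().
 */
#define EXAMPLE_THREAD_FEATURES_JM(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \
        ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 16) | ((MAX_TG_SPLIT) << 24))
#define EXAMPLE_THREAD_FEATURES_CSF(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \
        ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 24))

/* Job Manager GPUs: 0x6000 | (4 << 16) | (10 << 24) */
_Static_assert(EXAMPLE_THREAD_FEATURES_JM(0x6000, 4, 10) == 0x0a046000,
               "JM THREAD_FEATURES example");
/* CSF GPUs: MAX_TG_SPLIT is not encoded, so 0x6000 | (4 << 24) */
_Static_assert(EXAMPLE_THREAD_FEATURES_CSF(0x6000, 4, 10) == 0x04006000,
               "CSF THREAD_FEATURES example");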
+ */ +static const struct control_reg_values_t all_control_reg_values[] = { + { + .name = "tMIx", + .gpu_id = GPU_ID2_MAKE(6, 0, 10, 0, 0, 1, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tHEx", + .gpu_id = GPU_ID2_MAKE(6, 2, 0, 1, 0, 3, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tSIx", + .gpu_id = GPU_ID2_MAKE(7, 0, 0, 0, 1, 1, 0), + .as_present = 0xFF, + .thread_max_threads = 0x300, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), + .tiler_features = 0x209, + .mmu_features = 0x2821, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tDVx", + .gpu_id = GPU_ID2_MAKE(7, 0, 0, 3, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x300, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), + .tiler_features = 0x209, + .mmu_features = 0x2821, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tNOx", + .gpu_id = GPU_ID2_MAKE(7, 2, 1, 1, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tGOx_r0p0", + .gpu_id = GPU_ID2_MAKE(7, 2, 2, 2, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tGOx_r1p0", + .gpu_id = GPU_ID2_MAKE(7, 4, 0, 2, 1, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), + .core_features = 0x2, + .tiler_features = 0x209, + .mmu_features = 0x2823, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tTRx", + .gpu_id = GPU_ID2_MAKE(9, 0, 8, 0, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 
0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tNAx", + .gpu_id = GPU_ID2_MAKE(9, 0, 8, 1, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tBEx", + .gpu_id = GPU_ID2_MAKE(9, 2, 0, 2, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tBAx", + .gpu_id = GPU_ID2_MAKE(9, 14, 4, 5, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tDUx", + .gpu_id = GPU_ID2_MAKE(10, 2, 0, 1, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tODx", + .gpu_id = GPU_ID2_MAKE(10, 8, 0, 2, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tGRx", + .gpu_id = GPU_ID2_MAKE(10, 10, 0, 3, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .core_features = 0x0, /* core_1e16fma2tex */ + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tVAx", + .gpu_id = GPU_ID2_MAKE(10, 12, 0, 4, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x180, + .thread_max_workgroup_size = 0x180, + .thread_max_barrier_size = 0x180, + .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), + .core_features = 0x0, /* 
core_1e16fma2tex */ + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0, + .gpu_features_hi = 0, + .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, + .stack_present = DUMMY_IMPLEMENTATION_STACK_PRESENT, + }, + { + .name = "tTUx", + .gpu_id = GPU_ID2_MAKE(11, 8, 5, 2, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x800, + .thread_max_workgroup_size = 0x400, + .thread_max_barrier_size = 0x400, + .thread_features = THREAD_FEATURES_PARTIAL(0x10000, 4, 0), + .core_features = 0x0, /* core_1e32fma2tex */ + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0xf, + .gpu_features_hi = 0, + .shader_present = 0xFF, + .stack_present = 0xF, + }, + { + .name = "tTIx", + .gpu_id = GPU_ID2_MAKE(12, 8, 1, 0, 0, 0, 0), + .as_present = 0xFF, + .thread_max_threads = 0x800, + .thread_max_workgroup_size = 0x400, + .thread_max_barrier_size = 0x400, + .thread_features = THREAD_FEATURES_PARTIAL(0x10000, 16, 0), + .core_features = 0x1, /* core_1e64fma4tex */ + .tiler_features = 0x809, + .mmu_features = 0x2830, + .gpu_features_lo = 0xf, + .gpu_features_hi = 0, + .shader_present = 0xFF, + .stack_present = 0xF, + }, +}; + +static struct { spinlock_t access_lock; #if !MALI_USE_CSF unsigned long prfcnt_base; @@ -125,74 +454,33 @@ struct { #endif /* !MALI_USE_CSF */ u64 tiler_counters[KBASE_DUMMY_MODEL_COUNTER_PER_CORE]; u64 l2_counters[KBASE_DUMMY_MODEL_MAX_MEMSYS_BLOCKS * - KBASE_DUMMY_MODEL_COUNTER_PER_CORE]; + KBASE_DUMMY_MODEL_COUNTER_PER_CORE]; u64 shader_counters[KBASE_DUMMY_MODEL_MAX_SHADER_CORES * - KBASE_DUMMY_MODEL_COUNTER_PER_CORE]; + KBASE_DUMMY_MODEL_COUNTER_PER_CORE]; +} performance_counters; -} performance_counters = { - .l2_present = DUMMY_IMPLEMENTATION_L2_PRESENT, - .shader_present = DUMMY_IMPLEMENTATION_SHADER_PRESENT, -}; +static u32 get_implementation_register(u32 reg, + const struct control_reg_values_t *const control_reg_values) +{ + switch (reg) { + case GPU_CONTROL_REG(SHADER_PRESENT_LO): + return LO_MASK(control_reg_values->shader_present); + case GPU_CONTROL_REG(TILER_PRESENT_LO): + return LO_MASK(DUMMY_IMPLEMENTATION_TILER_PRESENT); + case GPU_CONTROL_REG(L2_PRESENT_LO): + return LO_MASK(DUMMY_IMPLEMENTATION_L2_PRESENT); + case GPU_CONTROL_REG(STACK_PRESENT_LO): + return LO_MASK(control_reg_values->stack_present); -struct job_slot { - int job_active; - int job_queued; - int job_complete_irq_asserted; - int job_irq_mask; - int job_disabled; -}; - -/** - * struct control_reg_values_t - control register values specific to the GPU being 'emulated' - * @name: GPU name - * @gpu_id: GPU ID to report - * @as_present: Bitmap of address spaces present - * @thread_max_threads: Maximum number of threads per core - * @thread_max_workgroup_size: Maximum number of threads per workgroup - * @thread_max_barrier_size: Maximum number of threads per barrier - * @thread_features: Thread features, NOT INCLUDING the 2 - * most-significant bits, which are always set to - * IMPLEMENTATION_MODEL. 
- * @core_features: Core features - * @tiler_features: Tiler features - * @mmu_features: MMU features - * @gpu_features_lo: GPU features (low) - * @gpu_features_hi: GPU features (high) - */ -struct control_reg_values_t { - const char *name; - u32 gpu_id; - u32 as_present; - u32 thread_max_threads; - u32 thread_max_workgroup_size; - u32 thread_max_barrier_size; - u32 thread_features; - u32 core_features; - u32 tiler_features; - u32 mmu_features; - u32 gpu_features_lo; - u32 gpu_features_hi; -}; - -struct dummy_model_t { - int reset_completed; - int reset_completed_mask; -#if !MALI_USE_CSF - int prfcnt_sample_completed; -#endif /* !MALI_USE_CSF */ - int power_changed_mask; /* 2bits: _ALL,_SINGLE */ - int power_changed; /* 1bit */ - bool clean_caches_completed; - bool clean_caches_completed_irq_enabled; - int power_on; /* 6bits: SHADER[4],TILER,L2 */ - u32 stack_power_on_lo; - u32 coherency_enable; - unsigned int job_irq_js_state; - struct job_slot slots[NUM_SLOTS]; - const struct control_reg_values_t *control_reg_values; - u32 l2_config; - void *data; -}; + case GPU_CONTROL_REG(SHADER_PRESENT_HI): + case GPU_CONTROL_REG(TILER_PRESENT_HI): + case GPU_CONTROL_REG(L2_PRESENT_HI): + case GPU_CONTROL_REG(STACK_PRESENT_HI): + /* *** FALLTHROUGH *** */ + default: + return 0; + } +} void gpu_device_set_data(void *model, void *data) { @@ -221,238 +509,6 @@ static char *no_mali_gpu = CONFIG_MALI_NO_MALI_DEFAULT_GPU; module_param(no_mali_gpu, charp, 0000); MODULE_PARM_DESC(no_mali_gpu, "GPU to identify as"); -/* Construct a value for the THREAD_FEATURES register, *except* the two most - * significant bits, which are set to IMPLEMENTATION_MODEL in - * midgard_model_read_reg(). - */ -#if MALI_USE_CSF -#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \ - ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 24)) -#else -#define THREAD_FEATURES_PARTIAL(MAX_REGISTERS, MAX_TASK_QUEUE, MAX_TG_SPLIT) \ - ((MAX_REGISTERS) | ((MAX_TASK_QUEUE) << 16) | ((MAX_TG_SPLIT) << 24)) -#endif - -/* Array associating GPU names with control register values. The first - * one is used in the case of no match. 
- */ -static const struct control_reg_values_t all_control_reg_values[] = { - { - .name = "tMIx", - .gpu_id = GPU_ID2_MAKE(6, 0, 10, 0, 0, 1, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tHEx", - .gpu_id = GPU_ID2_MAKE(6, 2, 0, 1, 0, 3, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tSIx", - .gpu_id = GPU_ID2_MAKE(7, 0, 0, 0, 1, 1, 0), - .as_present = 0xFF, - .thread_max_threads = 0x300, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), - .tiler_features = 0x209, - .mmu_features = 0x2821, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tDVx", - .gpu_id = GPU_ID2_MAKE(7, 0, 0, 3, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x300, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), - .tiler_features = 0x209, - .mmu_features = 0x2821, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tNOx", - .gpu_id = GPU_ID2_MAKE(7, 2, 1, 1, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tGOx_r0p0", - .gpu_id = GPU_ID2_MAKE(7, 2, 2, 2, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tGOx_r1p0", - .gpu_id = GPU_ID2_MAKE(7, 4, 0, 2, 1, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 10), - .core_features = 0x2, - .tiler_features = 0x209, - .mmu_features = 0x2823, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tTRx", - .gpu_id = GPU_ID2_MAKE(9, 0, 8, 0, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tNAx", - .gpu_id = GPU_ID2_MAKE(9, 0, 8, 1, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tBEx", - .gpu_id = GPU_ID2_MAKE(9, 2, 0, 2, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - 
.thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tBAx", - .gpu_id = GPU_ID2_MAKE(9, 14, 4, 5, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tDUx", - .gpu_id = GPU_ID2_MAKE(10, 2, 0, 1, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tODx", - .gpu_id = GPU_ID2_MAKE(10, 8, 0, 2, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tGRx", - .gpu_id = GPU_ID2_MAKE(10, 10, 0, 3, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .core_features = 0x0, /* core_1e16fma2tex */ - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tVAx", - .gpu_id = GPU_ID2_MAKE(10, 12, 0, 4, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x180, - .thread_max_workgroup_size = 0x180, - .thread_max_barrier_size = 0x180, - .thread_features = THREAD_FEATURES_PARTIAL(0x6000, 4, 0), - .core_features = 0x0, /* core_1e16fma2tex */ - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0, - .gpu_features_hi = 0, - }, - { - .name = "tTUx", - .gpu_id = GPU_ID2_MAKE(11, 8, 5, 2, 0, 0, 0), - .as_present = 0xFF, - .thread_max_threads = 0x800, - .thread_max_workgroup_size = 0x400, - .thread_max_barrier_size = 0x400, - .thread_features = THREAD_FEATURES_PARTIAL(0x10000, 4, 0), - .core_features = 0x0, /* core_1e32fma2tex */ - .tiler_features = 0x809, - .mmu_features = 0x2830, - .gpu_features_lo = 0xf, - .gpu_features_hi = 0, - }, -}; - -struct error_status_t hw_error_status; - #if MALI_USE_CSF static u32 gpu_model_get_prfcnt_value(enum kbase_ipa_core_type core_type, u32 cnt_idx, bool is_low_word) @@ -1011,6 +1067,21 @@ static const struct control_reg_values_t *find_control_reg_values(const char *gp size_t i; const struct control_reg_values_t *ret = NULL; + /* Edge case for tGOx, as it has 2 entries in the table for its R0 and R1 + * revisions respectively. As none of them are named "tGOx" the name comparison + * needs to be fixed in these cases. CONFIG_GPU_HWVER should be one of "r0p0" + * or "r1p0" and is derived from the DDK's build configuration. In cases + * where it is unavailable, it defaults to tGOx r1p0. 
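/*
 * Sketch of the lookup performed by find_control_reg_values() on the table
 * above: an exact name match wins, otherwise the first entry is used, as the
 * table's "no match" comment states. example_lookup() is a hypothetical,
 * simplified helper that reuses struct control_reg_values_t from this file.
 */
#include <linux/string.h>
#include <linux/types.h>

static const struct control_reg_values_t *
example_lookup(const char *gpu, const struct control_reg_values_t *table, size_t n)
{
        size_t i;

        for (i = 0; i < n; ++i) {
                if (!strcmp(gpu, table[i].name))
                        return &table[i];
        }

        /* No match: fall back to the first entry (tMIx in this table). */
        return &table[0];
}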
+ */ + if (!strcmp(gpu, "tGOx")) { +#ifdef CONFIG_GPU_HWVER + if (!strcmp(CONFIG_GPU_HWVER, "r0p0")) + gpu = "tGOx_r0p0"; + else if (!strcmp(CONFIG_GPU_HWVER, "r1p0")) +#endif /* CONFIG_GPU_HWVER defined */ + gpu = "tGOx_r1p0"; + } + for (i = 0; i < ARRAY_SIZE(all_control_reg_values); ++i) { const struct control_reg_values_t * const fcrv = &all_control_reg_values[i]; @@ -1043,6 +1114,10 @@ void *midgard_model_create(const void *config) dummy->job_irq_js_state = 0; init_register_statuses(dummy); dummy->control_reg_values = find_control_reg_values(no_mali_gpu); + performance_counters.l2_present = get_implementation_register( + GPU_CONTROL_REG(L2_PRESENT_LO), dummy->control_reg_values); + performance_counters.shader_present = get_implementation_register( + GPU_CONTROL_REG(SHADER_PRESENT_LO), dummy->control_reg_values); } return dummy; } @@ -1066,6 +1141,8 @@ static void midgard_model_get_outputs(void *h) hw_error_status.gpu_error_irq || #if !MALI_USE_CSF dummy->prfcnt_sample_completed || +#else + (dummy->flush_pa_range_completed && dummy->flush_pa_range_completed_irq_enabled) || #endif (dummy->clean_caches_completed && dummy->clean_caches_completed_irq_enabled)) gpu_device_raise_irq(dummy, GPU_DUMMY_GPU_IRQ); @@ -1235,6 +1312,9 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) dummy->reset_completed_mask = (value >> 8) & 0x01; dummy->power_changed_mask = (value >> 9) & 0x03; dummy->clean_caches_completed_irq_enabled = (value & (1u << 17)) != 0u; +#if MALI_USE_CSF + dummy->flush_pa_range_completed_irq_enabled = (value & (1u << 20)) != 0u; +#endif } else if (addr == GPU_CONTROL_REG(COHERENCY_ENABLE)) { dummy->coherency_enable = value; } else if (addr == GPU_CONTROL_REG(GPU_IRQ_CLEAR)) { @@ -1247,10 +1327,17 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) if (value & (1 << 17)) dummy->clean_caches_completed = false; -#if !MALI_USE_CSF - if (value & PRFCNT_SAMPLE_COMPLETED) + +#if MALI_USE_CSF + if (value & (1u << 20)) + dummy->flush_pa_range_completed = false; +#endif /* MALI_USE_CSF */ + +#if !MALI_USE_CSF + if (value & PRFCNT_SAMPLE_COMPLETED) /* (1 << 16) */ dummy->prfcnt_sample_completed = 0; #endif /* !MALI_USE_CSF */ + /*update error status */ hw_error_status.gpu_error_irq &= ~(value); } else if (addr == GPU_CONTROL_REG(GPU_COMMAND)) { @@ -1274,7 +1361,15 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) pr_debug("clean caches requested"); dummy->clean_caches_completed = true; break; -#if !MALI_USE_CSF +#if MALI_USE_CSF + case GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2: + case GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2_LSC: + case GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_FULL: + pr_debug("pa range flush requested"); + dummy->flush_pa_range_completed = true; + break; +#endif /* MALI_USE_CSF */ +#if !MALI_USE_CSF case GPU_COMMAND_PRFCNT_SAMPLE: midgard_model_dump_prfcnt(); dummy->prfcnt_sample_completed = 1; @@ -1282,6 +1377,11 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) default: break; } +#if MALI_USE_CSF + } else if (addr >= GPU_CONTROL_REG(GPU_COMMAND_ARG0_LO) && + addr <= GPU_CONTROL_REG(GPU_COMMAND_ARG1_HI)) { + /* Writes ignored */ +#endif } else if (addr == GPU_CONTROL_REG(L2_CONFIG)) { dummy->l2_config = value; } @@ -1291,6 +1391,12 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) (CSF_NUM_DOORBELL * CSF_HW_DOORBELL_PAGE_SIZE))) { if (addr == GPU_CONTROL_REG(CSF_HW_DOORBELL_PAGE_OFFSET)) hw_error_status.job_irq_status = JOB_IRQ_GLOBAL_IF; + } else if ((addr >= GPU_CONTROL_REG(SYSC_ALLOC0)) && + (addr < 
GPU_CONTROL_REG(SYSC_ALLOC(SYSC_ALLOC_COUNT)))) { + /* Do nothing */ + } else if ((addr >= GPU_CONTROL_REG(ASN_HASH_0)) && + (addr < GPU_CONTROL_REG(ASN_HASH(ASN_HASH_COUNT)))) { + /* Do nothing */ } else if (addr == IPA_CONTROL_REG(COMMAND)) { pr_debug("Received IPA_CONTROL command"); } else if (addr == IPA_CONTROL_REG(TIMER)) { @@ -1315,8 +1421,7 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) hw_error_status.mmu_irq_mask = value; } else if (addr == MMU_REG(MMU_IRQ_CLEAR)) { hw_error_status.mmu_irq_rawstat &= (~value); - } else if ((addr >= MMU_AS_REG(0, AS_TRANSTAB_LO)) && - (addr <= MMU_AS_REG(15, AS_STATUS))) { + } else if ((addr >= MMU_AS_REG(0, AS_TRANSTAB_LO)) && (addr <= MMU_AS_REG(15, AS_STATUS))) { int mem_addr_space = (addr - MMU_AS_REG(0, AS_TRANSTAB_LO)) >> 6; @@ -1443,7 +1548,8 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) dummy->power_changed = 1; break; case SHADER_PWRON_LO: - dummy->power_on |= (value & 0xF) << 2; + dummy->power_on |= + (value & dummy->control_reg_values->shader_present) << 2; dummy->power_changed = 1; break; case L2_PWRON_LO: @@ -1459,7 +1565,8 @@ u8 midgard_model_write_reg(void *h, u32 addr, u32 value) dummy->power_changed = 1; break; case SHADER_PWROFF_LO: - dummy->power_on &= ~((value & 0xF) << 2); + dummy->power_on &= + ~((value & dummy->control_reg_values->shader_present) << 2); dummy->power_changed = 1; break; case L2_PWROFF_LO: @@ -1546,6 +1653,9 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value) else if (addr == GPU_CONTROL_REG(GPU_IRQ_MASK)) { *value = (dummy->reset_completed_mask << 8) | ((dummy->clean_caches_completed_irq_enabled ? 1u : 0u) << 17) | +#if MALI_USE_CSF + ((dummy->flush_pa_range_completed_irq_enabled ? 1u : 0u) << 20) | +#endif (dummy->power_changed_mask << 9) | (1 << 7) | 1; pr_debug("GPU_IRQ_MASK read %x", *value); } else if (addr == GPU_CONTROL_REG(GPU_IRQ_RAWSTAT)) { @@ -1555,6 +1665,9 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value) (dummy->prfcnt_sample_completed ? PRFCNT_SAMPLE_COMPLETED : 0) | #endif /* !MALI_USE_CSF */ ((dummy->clean_caches_completed ? 1u : 0u) << 17) | +#if MALI_USE_CSF + ((dummy->flush_pa_range_completed ? 1u : 0u) << 20) | +#endif hw_error_status.gpu_error_irq; pr_debug("GPU_IRQ_RAWSTAT read %x", *value); } else if (addr == GPU_CONTROL_REG(GPU_IRQ_STATUS)) { @@ -1569,6 +1682,13 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value) 1u : 0u) << 17) | +#if MALI_USE_CSF + (((dummy->flush_pa_range_completed && + dummy->flush_pa_range_completed_irq_enabled) ? 
+ 1u : + 0u) + << 20) | +#endif hw_error_status.gpu_error_irq; pr_debug("GPU_IRQ_STAT read %x", *value); } else if (addr == GPU_CONTROL_REG(GPU_STATUS)) { @@ -1581,8 +1701,18 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value) *value = hw_error_status.gpu_fault_status; } else if (addr == GPU_CONTROL_REG(L2_CONFIG)) { *value = dummy->l2_config; - } else if ((addr >= GPU_CONTROL_REG(SHADER_PRESENT_LO)) && - (addr <= GPU_CONTROL_REG(L2_MMU_CONFIG))) { + } +#if MALI_USE_CSF + else if ((addr >= GPU_CONTROL_REG(SYSC_ALLOC0)) && + (addr < GPU_CONTROL_REG(SYSC_ALLOC(SYSC_ALLOC_COUNT)))) { + *value = 0; + } else if ((addr >= GPU_CONTROL_REG(ASN_HASH_0)) && + (addr < GPU_CONTROL_REG(ASN_HASH(ASN_HASH_COUNT)))) { + *value = 0; + } +#endif + else if ((addr >= GPU_CONTROL_REG(SHADER_PRESENT_LO)) && + (addr <= GPU_CONTROL_REG(L2_MMU_CONFIG))) { switch (addr) { case GPU_CONTROL_REG(SHADER_PRESENT_LO): case GPU_CONTROL_REG(SHADER_PRESENT_HI): @@ -1592,27 +1722,27 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value) case GPU_CONTROL_REG(L2_PRESENT_HI): case GPU_CONTROL_REG(STACK_PRESENT_LO): case GPU_CONTROL_REG(STACK_PRESENT_HI): - *value = get_implementation_register(addr); + *value = get_implementation_register(addr, dummy->control_reg_values); break; case GPU_CONTROL_REG(SHADER_READY_LO): *value = (dummy->power_on >> 0x02) & - get_implementation_register( - GPU_CONTROL_REG(SHADER_PRESENT_LO)); + get_implementation_register(GPU_CONTROL_REG(SHADER_PRESENT_LO), + dummy->control_reg_values); break; case GPU_CONTROL_REG(TILER_READY_LO): *value = (dummy->power_on >> 0x01) & - get_implementation_register( - GPU_CONTROL_REG(TILER_PRESENT_LO)); + get_implementation_register(GPU_CONTROL_REG(TILER_PRESENT_LO), + dummy->control_reg_values); break; case GPU_CONTROL_REG(L2_READY_LO): *value = dummy->power_on & - get_implementation_register( - GPU_CONTROL_REG(L2_PRESENT_LO)); + get_implementation_register(GPU_CONTROL_REG(L2_PRESENT_LO), + dummy->control_reg_values); break; case GPU_CONTROL_REG(STACK_READY_LO): *value = dummy->stack_power_on_lo & - get_implementation_register( - GPU_CONTROL_REG(STACK_PRESENT_LO)); + get_implementation_register(GPU_CONTROL_REG(STACK_PRESENT_LO), + dummy->control_reg_values); break; case GPU_CONTROL_REG(SHADER_READY_HI): @@ -1904,6 +2034,8 @@ u8 midgard_model_read_reg(void *h, u32 addr, u32 * const value) *value = gpu_model_get_prfcnt_value(KBASE_IPA_CORE_TYPE_SHADER, counter_index, is_low_word); + } else if (addr == USER_REG(LATEST_FLUSH)) { + *value = 0; } #endif else if (addr == GPU_CONTROL_REG(GPU_FEATURES_LO)) { diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_error_generator.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_error_generator.c index 344046089d6e..972d1c87fb1a 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_error_generator.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_model_error_generator.c @@ -23,13 +23,6 @@ #include #include "backend/gpu/mali_kbase_model_dummy.h" -/* all the error conditions supported by the model */ -#define TOTAL_FAULTS 27 -/* maximum number of levels in the MMU translation table tree */ -#define MAX_MMU_TABLE_LEVEL 4 -/* worst case scenario is <1 MMU fault + 1 job fault + 2 GPU faults> */ -#define MAX_CONCURRENT_FAULTS 3 - static struct kbase_error_atom *error_track_list; unsigned int rand_seed; @@ -40,6 +33,14 @@ unsigned int error_probability = 50; /* to be set between 0 and 100 */ unsigned int multiple_error_probability = 50; #ifdef 
CONFIG_MALI_ERROR_INJECT_RANDOM + +/* all the error conditions supported by the model */ +#define TOTAL_FAULTS 27 +/* maximum number of levels in the MMU translation table tree */ +#define MAX_MMU_TABLE_LEVEL 4 +/* worst case scenario is <1 MMU fault + 1 job fault + 2 GPU faults> */ +#define MAX_CONCURRENT_FAULTS 3 + /** * gpu_generate_error - Generate GPU error */ diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_backend.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_backend.c index df735d95de9f..5c71fdf154b9 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_backend.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_backend.c @@ -36,7 +36,7 @@ #include #include #endif /* !MALI_USE_CSF */ -#include +#include #include #include #include diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_ca.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_ca.c index d9e3dfcc6994..9e38b904b459 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_ca.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_ca.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2013-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2013-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -92,29 +92,10 @@ void kbase_devfreq_set_core_mask(struct kbase_device *kbdev, u64 core_mask) * for those cores to get powered down */ if ((core_mask & old_core_mask) != old_core_mask) { - bool can_wait; - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - can_wait = kbdev->pm.backend.gpu_ready && kbase_pm_is_mcu_desired(kbdev); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - /* This check is ideally not required, the wait function can - * deal with the GPU power down. But it has been added to - * address the scenario where down-scaling request comes from - * the platform specific code soon after the GPU power down - * and at the time same time application thread tries to - * power up the GPU (on the flush of GPU queue). - * The platform specific @ref callback_power_on that gets - * invoked on power up does not return until down-scaling - * request is complete. The check mitigates the race caused by - * the problem in platform specific code. 
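/*
 * The kbase_devfreq_set_core_mask() hunk around this point only waits (now
 * via kbase_pm_wait_for_cores_down_scale()) when the new mask removes cores
 * that were previously available, i.e. when (core_mask & old_core_mask) is no
 * longer equal to old_core_mask. Two worked cases of that test, with
 * hypothetical 4-core masks:
 */
/* Going from cores 0-3 (0xF) down to cores 0-1 (0x3): cores are dropped. */
_Static_assert((0x3u & 0xFu) != 0xFu, "down-scale detected, wait is required");
/* Going from cores 0-1 (0x3) up to cores 0-3 (0xF): nothing is dropped. */
_Static_assert((0xFu & 0x3u) == 0x3u, "pure up-scale, no wait is needed");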
- */ - if (likely(can_wait)) { - if (kbase_pm_wait_for_desired_state(kbdev)) { - dev_warn(kbdev->dev, - "Wait for update of core_mask from %llx to %llx failed", - old_core_mask, core_mask); - } + if (kbase_pm_wait_for_cores_down_scale(kbdev)) { + dev_warn(kbdev->dev, + "Wait for update of core_mask from %llx to %llx failed", + old_core_mask, core_mask); } } #endif diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_driver.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_driver.c index 94b87ce7166b..8173cf6ba7d7 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_driver.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_driver.c @@ -39,7 +39,7 @@ #include #include -#include +#include #include #include #include @@ -538,6 +538,14 @@ static void kbase_pm_l2_config_override(struct kbase_device *kbdev) if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_L2_CONFIG)) return; +#if MALI_USE_CSF + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PBHA_HWU)) { + val = kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_CONFIG)); + kbase_reg_write(kbdev, GPU_CONTROL_REG(L2_CONFIG), + L2_CONFIG_PBHA_HWU_SET(val, kbdev->pbha_propagate_bits)); + } +#endif /* MALI_USE_CSF */ + /* * Skip if size and hash are not given explicitly, * which means default values are used. @@ -599,6 +607,21 @@ static const char *kbase_mcu_state_to_string(enum kbase_mcu_state state) return strings[state]; } +static +void kbase_ktrace_log_mcu_state(struct kbase_device *kbdev, enum kbase_mcu_state state) +{ +#if KBASE_KTRACE_ENABLE + switch (state) { +#define KBASEP_MCU_STATE(n) \ + case KBASE_MCU_ ## n: \ + KBASE_KTRACE_ADD(kbdev, PM_MCU_ ## n, NULL, state); \ + break; +#include "mali_kbase_pm_mcu_states.h" +#undef KBASEP_MCU_STATE + } +#endif +} + static inline bool kbase_pm_handle_mcu_core_attr_update(struct kbase_device *kbdev) { struct kbase_pm_backend_data *backend = &kbdev->pm.backend; @@ -689,7 +712,6 @@ static void wait_mcu_as_inactive(struct kbase_device *kbdev) } #endif - /** * kbasep_pm_toggle_power_interrupt - Toggles the IRQ mask for power interrupts * from the firmware @@ -697,10 +719,10 @@ static void wait_mcu_as_inactive(struct kbase_device *kbdev) * @kbdev: Pointer to the device * @enable: boolean indicating to enable interrupts or not * - * The POWER_CHANGED_ALL and POWER_CHANGED_SINGLE interrupts can be disabled - * after L2 has been turned on when FW is controlling the power for the shader - * cores. Correspondingly, the interrupts can be re-enabled after the MCU has - * been disabled before the power down of L2. + * The POWER_CHANGED_ALL interrupt can be disabled after L2 has been turned on + * when FW is controlling the power for the shader cores. Correspondingly, the + * interrupts can be re-enabled after the MCU has been disabled before the + * power down of L2. 
*/ static void kbasep_pm_toggle_power_interrupt(struct kbase_device *kbdev, bool enable) { @@ -710,10 +732,12 @@ static void kbasep_pm_toggle_power_interrupt(struct kbase_device *kbdev, bool en irq_mask = kbase_reg_read(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK)); - if (enable) - irq_mask |= POWER_CHANGED_ALL | POWER_CHANGED_SINGLE; - else - irq_mask &= ~(POWER_CHANGED_ALL | POWER_CHANGED_SINGLE); + if (enable) { + irq_mask |= POWER_CHANGED_ALL; + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), POWER_CHANGED_ALL); + } else { + irq_mask &= ~POWER_CHANGED_ALL; + } kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_MASK), irq_mask); } @@ -1028,10 +1052,12 @@ static int kbase_pm_mcu_update_state(struct kbase_device *kbdev) backend->mcu_state); } - if (backend->mcu_state != prev_state) + if (backend->mcu_state != prev_state) { dev_dbg(kbdev->dev, "MCU state transition: %s to %s\n", kbase_mcu_state_to_string(prev_state), kbase_mcu_state_to_string(backend->mcu_state)); + kbase_ktrace_log_mcu_state(kbdev, backend->mcu_state); + } } while (backend->mcu_state != prev_state); @@ -1079,6 +1105,21 @@ static const char *kbase_l2_core_state_to_string(enum kbase_l2_core_state state) return strings[state]; } +static +void kbase_ktrace_log_l2_core_state(struct kbase_device *kbdev, enum kbase_l2_core_state state) +{ +#if KBASE_KTRACE_ENABLE + switch (state) { +#define KBASEP_L2_STATE(n) \ + case KBASE_L2_ ## n: \ + KBASE_KTRACE_ADD(kbdev, PM_L2_ ## n, NULL, state); \ + break; +#include "mali_kbase_pm_l2_states.h" +#undef KBASEP_L2_STATE + } +#endif +} + #if !MALI_USE_CSF /* On powering on the L2, the tracked kctx becomes stale and can be cleared. * This enables the backend to spare the START_FLUSH.INV_SHADER_OTHER @@ -1136,18 +1177,13 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) KBASE_PM_CORE_L2); u64 l2_ready = kbase_pm_get_ready_cores(kbdev, KBASE_PM_CORE_L2); -#ifdef CONFIG_MALI_ARBITER_SUPPORT - u64 tiler_trans = kbase_pm_get_trans_cores( - kbdev, KBASE_PM_CORE_TILER); - u64 tiler_ready = kbase_pm_get_ready_cores( - kbdev, KBASE_PM_CORE_TILER); +#ifdef CONFIG_MALI_ARBITER_SUPPORT /* * kbase_pm_get_ready_cores and kbase_pm_get_trans_cores * are vulnerable to corruption if gpu is lost */ - if (kbase_is_gpu_removed(kbdev) - || kbase_pm_is_gpu_lost(kbdev)) { + if (kbase_is_gpu_removed(kbdev) || kbase_pm_is_gpu_lost(kbdev)) { backend->shaders_state = KBASE_SHADERS_OFF_CORESTACK_OFF; backend->hwcnt_desired = false; @@ -1161,16 +1197,19 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) */ backend->l2_state = KBASE_L2_ON_HWCNT_DISABLE; + KBASE_KTRACE_ADD(kbdev, PM_L2_ON_HWCNT_DISABLE, NULL, + backend->l2_state); kbase_pm_trigger_hwcnt_disable(kbdev); } if (backend->hwcnt_disabled) { backend->l2_state = KBASE_L2_OFF; + KBASE_KTRACE_ADD(kbdev, PM_L2_OFF, NULL, backend->l2_state); dev_dbg(kbdev->dev, "GPU lost has occurred - L2 off\n"); } break; } -#endif /* CONFIG_MALI_ARBITER_SUPPORT */ +#endif /* mask off ready from trans in case transitions finished * between the register reads @@ -1182,6 +1221,12 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) switch (backend->l2_state) { case KBASE_L2_OFF: if (kbase_pm_is_l2_desired(kbdev)) { +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + /* Enable HW timer of IPA control before + * L2 cache is powered-up. 
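/*
 * kbase_ktrace_log_mcu_state() and kbase_ktrace_log_l2_core_state() above use
 * the "X-macro" pattern: mali_kbase_pm_mcu_states.h / mali_kbase_pm_l2_states.h
 * expand KBASEP_MCU_STATE(n) / KBASEP_L2_STATE(n) once per state, so a single
 * state list can be reused wherever a per-state expansion is needed, such as
 * these ktrace switches. Self-contained sketch of the same idea, using a list
 * macro instead of an included header; all EXAMPLE_* names are hypothetical:
 */
#define EXAMPLE_STATE_LIST(X) \
        X(OFF)                \
        X(PEND_ON)            \
        X(ON)

#define EXAMPLE_ENUM_ENTRY(n) EXAMPLE_##n,
enum example_state { EXAMPLE_STATE_LIST(EXAMPLE_ENUM_ENTRY) };
#undef EXAMPLE_ENUM_ENTRY

static const char *example_state_name(enum example_state state)
{
        switch (state) {
#define EXAMPLE_NAME_CASE(n) case EXAMPLE_##n: return #n;
        EXAMPLE_STATE_LIST(EXAMPLE_NAME_CASE)
#undef EXAMPLE_NAME_CASE
        }
        return "unknown";
}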
+ */ + kbase_ipa_control_handle_gpu_sleep_exit(kbdev); +#endif /* * Set the desired config for L2 before * powering it on @@ -1221,14 +1266,12 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) l2_power_up_done = false; if (!l2_trans && l2_ready == l2_present) { if (need_tiler_control(kbdev)) { -#ifndef CONFIG_MALI_ARBITER_SUPPORT u64 tiler_trans = kbase_pm_get_trans_cores( kbdev, KBASE_PM_CORE_TILER); u64 tiler_ready = kbase_pm_get_ready_cores( kbdev, KBASE_PM_CORE_TILER); -#endif - tiler_trans &= ~tiler_ready; + if (!tiler_trans && tiler_ready == tiler_present) { KBASE_KTRACE_ADD(kbdev, PM_CORES_CHANGE_AVAILABLE_TILER, @@ -1437,12 +1480,26 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) /* We only need to check the L2 here - if the L2 * is off then the tiler is definitely also off. */ - if (!l2_trans && !l2_ready) + if (!l2_trans && !l2_ready) { +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + /* Allow clock gating within the GPU and prevent it + * from being seen as active during sleep. + */ + kbase_ipa_control_handle_gpu_sleep_enter(kbdev); +#endif /* L2 is now powered off */ backend->l2_state = KBASE_L2_OFF; + } } else { - if (!kbdev->cache_clean_in_progress) + if (!kbdev->cache_clean_in_progress) { +#if MALI_USE_CSF && defined(KBASE_PM_RUNTIME) + /* Allow clock gating within the GPU and prevent it + * from being seen as active during sleep. + */ + kbase_ipa_control_handle_gpu_sleep_enter(kbdev); +#endif backend->l2_state = KBASE_L2_OFF; + } } break; @@ -1457,11 +1514,13 @@ static int kbase_pm_l2_update_state(struct kbase_device *kbdev) backend->l2_state); } - if (backend->l2_state != prev_state) + if (backend->l2_state != prev_state) { dev_dbg(kbdev->dev, "L2 state transition: %s to %s\n", kbase_l2_core_state_to_string(prev_state), kbase_l2_core_state_to_string( backend->l2_state)); + kbase_ktrace_log_l2_core_state(kbdev, backend->l2_state); + } } while (backend->l2_state != prev_state); @@ -1925,7 +1984,7 @@ static bool kbase_pm_is_in_desired_state_nolock(struct kbase_device *kbdev) kbdev->pm.backend.shaders_state != KBASE_SHADERS_OFF_CORESTACK_OFF) in_desired_state = false; #else - in_desired_state = kbase_pm_mcu_is_in_desired_state(kbdev); + in_desired_state &= kbase_pm_mcu_is_in_desired_state(kbdev); #endif return in_desired_state; @@ -2122,6 +2181,7 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev) backend->in_reset = true; backend->l2_state = KBASE_L2_RESET_WAIT; + KBASE_KTRACE_ADD(kbdev, PM_L2_RESET_WAIT, NULL, backend->l2_state); #if !MALI_USE_CSF backend->shaders_state = KBASE_SHADERS_RESET_WAIT; #else @@ -2130,6 +2190,7 @@ void kbase_pm_reset_start_locked(struct kbase_device *kbdev) */ if (likely(kbdev->csf.firmware_inited)) { backend->mcu_state = KBASE_MCU_RESET_WAIT; + KBASE_KTRACE_ADD(kbdev, PM_MCU_RESET_WAIT, NULL, backend->mcu_state); #ifdef KBASE_PM_RUNTIME backend->exit_gpu_sleep_mode = true; #endif @@ -2328,6 +2389,66 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev) } KBASE_EXPORT_TEST_API(kbase_pm_wait_for_desired_state); +#if MALI_USE_CSF +/** + * core_mask_update_done - Check if downscaling of shader cores is done + * + * @kbdev: The kbase device structure for the device. + * + * This function checks if the downscaling of cores is effectively complete. + * + * Return: true if the downscale is done. 
+ */ +static bool core_mask_update_done(struct kbase_device *kbdev) +{ + bool update_done = false; + unsigned long flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + /* If MCU is in stable ON state then it implies that the downscale + * request had completed. + * If MCU is not active then it implies all cores are off, so can + * consider the downscale request as complete. + */ + if ((kbdev->pm.backend.mcu_state == KBASE_MCU_ON) || + kbase_pm_is_mcu_inactive(kbdev, kbdev->pm.backend.mcu_state)) + update_done = true; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return update_done; +} + +int kbase_pm_wait_for_cores_down_scale(struct kbase_device *kbdev) +{ + long timeout = kbase_csf_timeout_in_jiffies(kbase_get_timeout_ms(kbdev, CSF_PM_TIMEOUT)); + long remaining; + int err = 0; + + /* Wait for core mask update to complete */ +#if KERNEL_VERSION(4, 13, 1) <= LINUX_VERSION_CODE + remaining = wait_event_killable_timeout( + kbdev->pm.backend.gpu_in_desired_state_wait, + core_mask_update_done(kbdev), timeout); +#else + remaining = wait_event_timeout( + kbdev->pm.backend.gpu_in_desired_state_wait, + core_mask_update_done(kbdev), timeout); +#endif + + if (!remaining) { + kbase_pm_timed_out(kbdev); + err = -ETIMEDOUT; + } else if (remaining < 0) { + dev_info( + kbdev->dev, + "Wait for cores down scaling got interrupted"); + err = (int)remaining; + } + + return err; +} +#endif + void kbase_pm_enable_interrupts(struct kbase_device *kbdev) { unsigned long flags; @@ -2391,19 +2512,25 @@ static void update_user_reg_page_mapping(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->pm.lock); mutex_lock(&kbdev->csf.reg_lock); - if (kbdev->csf.mali_file_inode) { - /* This would zap the pte corresponding to the mapping of User - * register page for all the Kbase contexts. - */ - unmap_mapping_range(kbdev->csf.mali_file_inode->i_mapping, - BASEP_MEM_CSF_USER_REG_PAGE_HANDLE, - PAGE_SIZE, 1); + + /* Only if the mappings for USER page exist, update all PTEs associated to it */ + if (kbdev->csf.nr_user_page_mapped > 0) { + if (likely(kbdev->csf.mali_file_inode)) { + /* This would zap the pte corresponding to the mapping of User + * register page for all the Kbase contexts. + */ + unmap_mapping_range(kbdev->csf.mali_file_inode->i_mapping, + BASEP_MEM_CSF_USER_REG_PAGE_HANDLE, PAGE_SIZE, 1); + } else { + dev_err(kbdev->dev, + "Device file inode not exist even if USER page previously mapped"); + } } + mutex_unlock(&kbdev->csf.reg_lock); } #endif - /* * pmu layout: * 0x0000: PMU TAG (RO) (0xCAFECAFE) @@ -2541,7 +2668,6 @@ void kbase_pm_clock_on(struct kbase_device *kbdev, bool is_resume) backend->gpu_idled = false; } #endif - } KBASE_EXPORT_TEST_API(kbase_pm_clock_on); diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_internal.h b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_internal.h index dddc10550306..115cd3c34d90 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_internal.h +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_internal.h @@ -269,6 +269,37 @@ int kbase_pm_wait_for_desired_state(struct kbase_device *kbdev); */ int kbase_pm_wait_for_l2_powered(struct kbase_device *kbdev); +#if MALI_USE_CSF +/** + * kbase_pm_wait_for_cores_down_scale - Wait for the downscaling of shader cores + * + * @kbdev: The kbase device structure for the device (must be a valid pointer) + * + * This function can be called to ensure that the downscaling of cores is + * effectively complete and it would be safe to lower the voltage. 
+ * The function assumes that caller had exercised the MCU state machine for the + * downscale request through the kbase_pm_update_state() function. + * + * This function needs to be used by the caller to safely wait for the completion + * of downscale request, instead of kbase_pm_wait_for_desired_state(). + * The downscale request would trigger a state change in MCU state machine + * and so when MCU reaches the stable ON state, it can be inferred that + * downscaling is complete. But it has been observed that the wake up of the + * waiting thread can get delayed by few milli seconds and by the time the + * thread wakes up the power down transition could have started (after the + * completion of downscale request). + * On the completion of power down transition another wake up signal would be + * sent, but again by the time thread wakes up the power up transition can begin. + * And the power up transition could then get blocked inside the platform specific + * callback_power_on() function due to the thread that called into Kbase (from the + * platform specific code) to perform the downscaling and then ended up waiting + * for the completion of downscale request. + * + * Return: 0 on success, error code on error or remaining jiffies on timeout. + */ +int kbase_pm_wait_for_cores_down_scale(struct kbase_device *kbdev); +#endif + /** * kbase_pm_update_dynamic_cores_onoff - Update the L2 and shader power state * machines after changing shader core diff --git a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_metrics.c b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_metrics.c index 4cc2d50db586..29e945d00fbe 100644 --- a/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_metrics.c +++ b/drivers/gpu/arm/bifrost/backend/gpu/mali_kbase_pm_metrics.c @@ -38,11 +38,13 @@ #include #include +#if defined(CONFIG_MALI_BIFROST_DEVFREQ) || defined(CONFIG_MALI_BIFROST_DVFS) || !MALI_USE_CSF /* Shift used for kbasep_pm_metrics_data.time_busy/idle - units of (1 << 8) ns * This gives a maximum period between samples of 2^(32+8)/100 ns = slightly * under 11s. Exceeding this will cause overflow */ #define KBASE_PM_TIME_SHIFT 8 +#endif #if MALI_USE_CSF /* To get the GPU_ACTIVE value in nano seconds unit */ diff --git a/drivers/gpu/arm/bifrost/build.bp b/drivers/gpu/arm/bifrost/build.bp index 977d13961786..a17ff432398c 100644 --- a/drivers/gpu/arm/bifrost/build.bp +++ b/drivers/gpu/arm/bifrost/build.bp @@ -32,6 +32,7 @@ bob_defaults { kbuild_options: [ "CONFIG_MALI_BIFROST_NO_MALI=y", "CONFIG_MALI_NO_MALI_DEFAULT_GPU={{.gpu}}", + "CONFIG_GPU_HWVER={{.hwver}}", ], }, mali_platform_dt_pin_rst: { @@ -52,9 +53,6 @@ bob_defaults { mali_midgard_enable_trace: { kbuild_options: ["CONFIG_MALI_BIFROST_ENABLE_TRACE=y"], }, - mali_dma_fence: { - kbuild_options: ["CONFIG_MALI_BIFROST_DMA_FENCE=y"], - }, mali_arbiter_support: { kbuild_options: ["CONFIG_MALI_ARBITER_SUPPORT=y"], }, @@ -64,7 +62,7 @@ bob_defaults { mali_dma_buf_legacy_compat: { kbuild_options: ["CONFIG_MALI_DMA_BUF_LEGACY_COMPAT=y"], }, - mali_2mb_alloc: { + large_page_alloc: { kbuild_options: ["CONFIG_MALI_2MB_ALLOC=y"], }, mali_memory_fully_backed: { @@ -89,7 +87,7 @@ bob_defaults { kbuild_options: ["CONFIG_MALI_BIFROST_ERROR_INJECT=y"], }, mali_gem5_build: { - kbuild_options: ["CONFIG_MALI_GEM5_BUILD=y"], + kbuild_options: ["CONFIG_MALI_GEM5_BUILD=y"], }, mali_debug: { kbuild_options: [ @@ -163,9 +161,7 @@ bob_defaults { // (catch-all for experimental CS code without separating it into // different features). 
"MALI_INCREMENTAL_RENDERING_JM={{.incremental_rendering_jm}}", - "MALI_GPU_TIMESTAMP_CORRECTION={{.gpu_timestamp_correction}}", "MALI_BASE_CSF_PERFORMANCE_TESTS={{.base_csf_performance_tests}}", - "MALI_GPU_TIMESTAMP_INTERPOLATION={{.gpu_timestamp_interpolation}}", ], } @@ -184,6 +180,10 @@ bob_kernel_module { "context/*.c", "context/*.h", "context/Kbuild", + "hwcnt/*.c", + "hwcnt/*.h", + "hwcnt/backend/*.h", + "hwcnt/Kbuild", "ipa/*.c", "ipa/*.h", "ipa/Kbuild", @@ -217,6 +217,10 @@ bob_kernel_module { "device/backend/*_jm.c", "gpu/backend/*_jm.c", "gpu/backend/*_jm.h", + "hwcnt/backend/*_jm.c", + "hwcnt/backend/*_jm.h", + "hwcnt/backend/*_jm_*.c", + "hwcnt/backend/*_jm_*.h", "jm/*.h", "tl/backend/*_jm.c", "mmu/backend/*_jm.c", @@ -238,6 +242,10 @@ bob_kernel_module { "device/backend/*_csf.c", "gpu/backend/*_csf.c", "gpu/backend/*_csf.h", + "hwcnt/backend/*_csf.c", + "hwcnt/backend/*_csf.h", + "hwcnt/backend/*_csf_*.c", + "hwcnt/backend/*_csf_*.h", "tl/backend/*_csf.c", "mmu/backend/*_csf.c", "ipa/backend/*_csf.c", diff --git a/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_csf.c b/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_csf.c index 32bf82526aa3..3abc7a2a66f4 100644 --- a/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_csf.c +++ b/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_csf.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -39,12 +38,14 @@ #include #include #include +#include #include void kbase_context_debugfs_init(struct kbase_context *const kctx) { kbase_debug_mem_view_init(kctx); kbase_debug_mem_zones_init(kctx); + kbase_debug_mem_allocs_init(kctx); kbase_mem_pool_debugfs_init(kctx->kctx_dentry, kctx); kbase_jit_debugfs_init(kctx); kbase_csf_queue_group_debugfs_init(kctx); diff --git a/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_jm.c b/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_jm.c index 97cd46e0e5b5..995a08e36f43 100644 --- a/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_jm.c +++ b/drivers/gpu/arm/bifrost/context/backend/mali_kbase_context_jm.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -37,12 +36,14 @@ #if IS_ENABLED(CONFIG_DEBUG_FS) #include #include +#include #include void kbase_context_debugfs_init(struct kbase_context *const kctx) { kbase_debug_mem_view_init(kctx); kbase_debug_mem_zones_init(kctx); + kbase_debug_mem_allocs_init(kctx); kbase_mem_pool_debugfs_init(kctx->kctx_dentry, kctx); kbase_jit_debugfs_init(kctx); kbasep_jd_debugfs_ctx_init(kctx); @@ -128,8 +129,6 @@ static const struct kbase_context_init context_init[] = { { NULL, kbase_context_free, NULL }, { kbase_context_common_init, kbase_context_common_term, "Common context initialization failed" }, - { kbase_dma_fence_init, kbase_dma_fence_term, - "DMA fence initialization failed" }, { kbase_context_mem_pool_group_init, kbase_context_mem_pool_group_term, "Memory pool group initialization failed" }, { kbase_mem_evictable_init, kbase_mem_evictable_deinit, diff --git a/drivers/gpu/arm/bifrost/context/mali_kbase_context.c b/drivers/gpu/arm/bifrost/context/mali_kbase_context.c index f84e01edee93..b6abfc44d212 100644 --- a/drivers/gpu/arm/bifrost/context/mali_kbase_context.c +++ b/drivers/gpu/arm/bifrost/context/mali_kbase_context.c @@ -165,7 +165,9 @@ int kbase_context_common_init(struct kbase_context *kctx) atomic64_set(&kctx->num_fixed_allocs, 0); #endif + kbase_gpu_vm_lock(kctx); bitmap_copy(kctx->cookies, &cookies_mask, BITS_PER_LONG); + 
kbase_gpu_vm_unlock(kctx); kctx->id = atomic_add_return(1, &(kctx->kbdev->ctx_num)) - 1; @@ -274,10 +276,8 @@ void kbase_context_common_term(struct kbase_context *kctx) int kbase_context_mem_pool_group_init(struct kbase_context *kctx) { - return kbase_mem_pool_group_init(&kctx->mem_pools, - kctx->kbdev, - &kctx->kbdev->mem_pool_defaults, - &kctx->kbdev->mem_pools); + return kbase_mem_pool_group_init(&kctx->mem_pools, kctx->kbdev, + &kctx->kbdev->mem_pool_defaults, &kctx->kbdev->mem_pools); } void kbase_context_mem_pool_group_term(struct kbase_context *kctx) diff --git a/drivers/gpu/arm/bifrost/csf/Kbuild b/drivers/gpu/arm/bifrost/csf/Kbuild index c5d9154a2e35..1474bdaacb0d 100644 --- a/drivers/gpu/arm/bifrost/csf/Kbuild +++ b/drivers/gpu/arm/bifrost/csf/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2018-2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2018-2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -34,12 +34,16 @@ bifrost_kbase-y += \ csf/mali_kbase_csf_protected_memory.o \ csf/mali_kbase_csf_tiler_heap_debugfs.o \ csf/mali_kbase_csf_cpu_queue_debugfs.o \ - csf/mali_kbase_csf_event.o + csf/mali_kbase_csf_event.o \ + csf/mali_kbase_csf_firmware_log.o \ + csf/mali_kbase_csf_tiler_heap_reclaim.o bifrost_kbase-$(CONFIG_MALI_REAL_HW) += csf/mali_kbase_csf_firmware.o bifrost_kbase-$(CONFIG_MALI_BIFROST_NO_MALI) += csf/mali_kbase_csf_firmware_no_mali.o +bifrost_kbase-$(CONFIG_DEBUG_FS) += csf/mali_kbase_debug_csf_fault.o + ifeq ($(KBUILD_EXTMOD),) # in-tree diff --git a/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.c b/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.c index e503b08d13b9..c81d0a5a7236 100644 --- a/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.c +++ b/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.c @@ -28,8 +28,6 @@ * Status flags from the STATUS register of the IPA Control interface. */ #define STATUS_COMMAND_ACTIVE ((u32)1 << 0) -#define STATUS_TIMER_ACTIVE ((u32)1 << 1) -#define STATUS_AUTO_ACTIVE ((u32)1 << 2) #define STATUS_PROTECTED_MODE ((u32)1 << 8) #define STATUS_RESET ((u32)1 << 9) #define STATUS_TIMER_ENABLED ((u32)1 << 31) @@ -37,9 +35,7 @@ /* * Commands for the COMMAND register of the IPA Control interface. */ -#define COMMAND_NOP ((u32)0) #define COMMAND_APPLY ((u32)1) -#define COMMAND_CLEAR ((u32)2) #define COMMAND_SAMPLE ((u32)3) #define COMMAND_PROTECTED_ACK ((u32)4) #define COMMAND_RESET_ACK ((u32)5) @@ -965,6 +961,43 @@ void kbase_ipa_control_handle_gpu_reset_post(struct kbase_device *kbdev) } KBASE_EXPORT_TEST_API(kbase_ipa_control_handle_gpu_reset_post); +#ifdef KBASE_PM_RUNTIME +void kbase_ipa_control_handle_gpu_sleep_enter(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (kbdev->pm.backend.mcu_state == KBASE_MCU_IN_SLEEP) { + /* GPU Sleep is treated as a power down */ + kbase_ipa_control_handle_gpu_power_off(kbdev); + + /* SELECT_CSHW register needs to be cleared to prevent any + * IPA control message to be sent to the top level GPU HWCNT. 
+ */ + kbase_reg_write(kbdev, IPA_CONTROL_REG(SELECT_CSHW_LO), 0); + kbase_reg_write(kbdev, IPA_CONTROL_REG(SELECT_CSHW_HI), 0); + + /* No need to issue the APPLY command here */ + } +} +KBASE_EXPORT_TEST_API(kbase_ipa_control_handle_gpu_sleep_enter); + +void kbase_ipa_control_handle_gpu_sleep_exit(struct kbase_device *kbdev) +{ + lockdep_assert_held(&kbdev->hwaccess_lock); + + if (kbdev->pm.backend.mcu_state == KBASE_MCU_IN_SLEEP) { + /* To keep things simple, currently exit from + * GPU Sleep is treated as a power on event where + * all 4 SELECT registers are reconfigured. + * On exit from sleep, reconfiguration is needed + * only for the SELECT_CSHW register. + */ + kbase_ipa_control_handle_gpu_power_on(kbdev); + } +} +KBASE_EXPORT_TEST_API(kbase_ipa_control_handle_gpu_sleep_exit); +#endif + #if MALI_UNIT_TEST void kbase_ipa_control_rate_change_notify_test(struct kbase_device *kbdev, u32 clk_index, u32 clk_rate_hz) diff --git a/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.h b/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.h index 0469c482dfff..69ff8973bac4 100644 --- a/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.h +++ b/drivers/gpu/arm/bifrost/csf/ipa_control/mali_kbase_csf_ipa_control.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -198,6 +198,33 @@ void kbase_ipa_control_handle_gpu_reset_pre(struct kbase_device *kbdev); */ void kbase_ipa_control_handle_gpu_reset_post(struct kbase_device *kbdev); +#ifdef KBASE_PM_RUNTIME +/** + * kbase_ipa_control_handle_gpu_sleep_enter - Handle the pre GPU Sleep event + * + * @kbdev: Pointer to kbase device. + * + * This function is called after MCU has been put to sleep state & L2 cache has + * been powered down. The top level part of GPU is still powered up when this + * function is called. + */ +void kbase_ipa_control_handle_gpu_sleep_enter(struct kbase_device *kbdev); + +/** + * kbase_ipa_control_handle_gpu_sleep_exit - Handle the post GPU Sleep event + * + * @kbdev: Pointer to kbase device. + * + * This function is called when L2 needs to be powered up and MCU can exit the + * sleep state. The top level part of GPU is powered up when this function is + * called. + * + * This function must be called only if kbase_ipa_control_handle_gpu_sleep_enter() + * was called previously. 
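The sleep-enter path above clears SELECT_CSHW by writing the LO/HI register pair separately. A hypothetical wrapper for that pattern (not a driver API, shown only to make the 64-bit split explicit):

/* Program a 64-bit SELECT_CSHW value as two 32-bit writes; passing 0
 * reproduces the clear done on GPU sleep entry.
 */
static void ipa_control_write_select_cshw(struct kbase_device *kbdev, u64 value)
{
	kbase_reg_write(kbdev, IPA_CONTROL_REG(SELECT_CSHW_LO), (u32)(value & U32_MAX));
	kbase_reg_write(kbdev, IPA_CONTROL_REG(SELECT_CSHW_HI), (u32)(value >> 32));
}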
+ */ +void kbase_ipa_control_handle_gpu_sleep_exit(struct kbase_device *kbdev); +#endif + #if MALI_UNIT_TEST /** * kbase_ipa_control_rate_change_notify_test - Notify GPU rate change diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf.c index 80e37a36ca76..b77007300c5c 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf.c @@ -348,9 +348,8 @@ int kbase_csf_alloc_command_stream_user_pages(struct kbase_context *kctx, if (!reg) return -ENOMEM; - ret = kbase_mem_pool_alloc_pages( - &kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_IO], - num_pages, queue->phys, false); + ret = kbase_mem_pool_alloc_pages(&kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_IO], num_pages, + queue->phys, false); if (ret != num_pages) goto phys_alloc_failed; @@ -374,8 +373,11 @@ int kbase_csf_alloc_command_stream_user_pages(struct kbase_context *kctx, queue->db_file_offset = kbdev->csf.db_file_offsets; kbdev->csf.db_file_offsets += BASEP_QUEUE_NR_MMAP_USER_PAGES; - +#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) WARN(atomic_read(&queue->refcount) != 1, "Incorrect refcounting for queue object\n"); +#else + WARN(refcount_read(&queue->refcount) != 1, "Incorrect refcounting for queue object\n"); +#endif /* This is the second reference taken on the queue object and * would be dropped only when the IO mapping is removed either * explicitly by userspace or implicitly by kernel on process exit. @@ -444,25 +446,34 @@ static struct kbase_queue *find_queue(struct kbase_context *kctx, u64 base_addr) static void get_queue(struct kbase_queue *queue) { +#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) WARN_ON(!atomic_inc_not_zero(&queue->refcount)); +#else + WARN_ON(!refcount_inc_not_zero(&queue->refcount)); +#endif } static void release_queue(struct kbase_queue *queue) { lockdep_assert_held(&queue->kctx->csf.lock); - - WARN_ON(atomic_read(&queue->refcount) <= 0); - +#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) if (atomic_dec_and_test(&queue->refcount)) { +#else + if (refcount_dec_and_test(&queue->refcount)) { +#endif /* The queue can't still be on the per context list. 
*/ WARN_ON(!list_empty(&queue->link)); WARN_ON(queue->group); + dev_dbg(queue->kctx->kbdev->dev, + "Remove any pending command queue fatal from ctx %d_%d", + queue->kctx->tgid, queue->kctx->id); + kbase_csf_event_remove_error(queue->kctx, &queue->error); kfree(queue); } } static void oom_event_worker(struct work_struct *data); -static void fatal_event_worker(struct work_struct *data); +static void cs_error_worker(struct work_struct *data); /* Between reg and reg_ex, one and only one must be null */ static int csf_queue_register_internal(struct kbase_context *kctx, @@ -565,7 +576,11 @@ static int csf_queue_register_internal(struct kbase_context *kctx, queue->enabled = false; queue->priority = reg->priority; +#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) atomic_set(&queue->refcount, 1); +#else + refcount_set(&queue->refcount, 1); +#endif queue->group = NULL; queue->bind_state = KBASE_CSF_QUEUE_UNBOUND; @@ -588,7 +603,7 @@ static int csf_queue_register_internal(struct kbase_context *kctx, INIT_LIST_HEAD(&queue->link); INIT_LIST_HEAD(&queue->error.link); INIT_WORK(&queue->oom_event_work, oom_event_worker); - INIT_WORK(&queue->fatal_event_work, fatal_event_worker); + INIT_WORK(&queue->cs_error_work, cs_error_worker); list_add(&queue->link, &kctx->csf.queue_list); queue->extract_ofs = 0; @@ -699,11 +714,6 @@ void kbase_csf_queue_terminate(struct kbase_context *kctx, } kbase_gpu_vm_unlock(kctx); - dev_dbg(kctx->kbdev->dev, - "Remove any pending command queue fatal from context %pK\n", - (void *)kctx); - kbase_csf_event_remove_error(kctx, &queue->error); - release_queue(queue); } @@ -784,6 +794,11 @@ static struct kbase_queue_group *get_bound_queue_group( return group; } +static void enqueue_gpu_submission_work(struct kbase_context *const kctx) +{ + queue_work(system_highpri_wq, &kctx->csf.pending_submission_work); +} + /** * pending_submission_worker() - Work item to process pending kicked GPU command queues. 
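The queue lifetime hunks above repeat the same KERNEL_VERSION(4, 11, 0) check around every reference-count operation, since refcount_t was only introduced in 4.11. A sketch of hypothetical compatibility wrappers that would keep the check in one place (kbase_refcount_t and the kbase_refcount_* names are invented for illustration):

#include <linux/version.h>
#if KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE
#include <linux/atomic.h>
typedef atomic_t kbase_refcount_t;
#define kbase_refcount_set(r, v)	atomic_set(r, v)
#define kbase_refcount_read(r)		atomic_read(r)
#define kbase_refcount_inc_not_zero(r)	atomic_inc_not_zero(r)
#define kbase_refcount_dec_and_test(r)	atomic_dec_and_test(r)
#else
#include <linux/refcount.h>
typedef refcount_t kbase_refcount_t;
#define kbase_refcount_set(r, v)	refcount_set(r, v)
#define kbase_refcount_read(r)		refcount_read(r)
#define kbase_refcount_inc_not_zero(r)	refcount_inc_not_zero(r)
#define kbase_refcount_dec_and_test(r)	refcount_dec_and_test(r)
#endif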
* @@ -813,11 +828,21 @@ static void pending_submission_worker(struct work_struct *work) list_for_each_entry(queue, &kctx->csf.queue_list, link) { if (atomic_cmpxchg(&queue->pending, 1, 0) == 1) { struct kbase_queue_group *group = get_bound_queue_group(queue); + int ret; - if (!group || queue->bind_state != KBASE_CSF_QUEUE_BOUND) + if (!group || queue->bind_state != KBASE_CSF_QUEUE_BOUND) { dev_dbg(kbdev->dev, "queue is not bound to a group"); - else - WARN_ON(kbase_csf_scheduler_queue_start(queue)); + continue; + } + + ret = kbase_csf_scheduler_queue_start(queue); + if (unlikely(ret)) { + dev_dbg(kbdev->dev, "Failed to start queue"); + if (ret == -EBUSY) { + atomic_cmpxchg(&queue->pending, 0, 1); + enqueue_gpu_submission_work(kctx); + } + } } } @@ -831,6 +856,8 @@ void kbase_csf_ring_csg_doorbell(struct kbase_device *kbdev, int slot) if (WARN_ON(slot < 0)) return; + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + kbase_csf_ring_csg_slots_doorbell(kbdev, (u32) (1 << slot)); } @@ -843,6 +870,8 @@ void kbase_csf_ring_csg_slots_doorbell(struct kbase_device *kbdev, (u32) ((1U << kbdev->csf.global_iface.group_num) - 1); u32 value; + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + if (WARN_ON(slot_bitmap > allowed_bitmap)) return; @@ -872,6 +901,8 @@ void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev, struct kbase_csf_cmd_stream_group_info *ginfo; u32 value; + kbase_csf_scheduler_spin_lock_assert_held(kbdev); + if (WARN_ON(csg_nr < 0) || WARN_ON(csg_nr >= kbdev->csf.global_iface.group_num)) return; @@ -891,11 +922,6 @@ void kbase_csf_ring_cs_kernel_doorbell(struct kbase_device *kbdev, kbase_csf_ring_csg_doorbell(kbdev, csg_nr); } -static void enqueue_gpu_submission_work(struct kbase_context *const kctx) -{ - queue_work(system_highpri_wq, &kctx->csf.pending_submission_work); -} - int kbase_csf_queue_kick(struct kbase_context *kctx, struct kbase_ioctl_cs_queue_kick *kick) { @@ -1129,9 +1155,8 @@ static int create_normal_suspend_buffer(struct kbase_context *const kctx, } /* Get physical page for a normal suspend buffer */ - err = kbase_mem_pool_alloc_pages( - &kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], - nr_pages, &s_buf->phy[0], false); + err = kbase_mem_pool_alloc_pages(&kctx->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], nr_pages, + &s_buf->phy[0], false); if (err < 0) goto phy_pages_alloc_failed; @@ -1362,6 +1387,11 @@ static int create_queue_group(struct kbase_context *const kctx, group->cs_unrecoverable = false; group->reevaluate_idle_status = false; + group->dvs_buf = create->in.dvs_buf; + +#if IS_ENABLED(CONFIG_DEBUG_FS) + group->deschedule_deferred_cnt = 0; +#endif group->group_uid = generate_group_uid(); create->out.group_uid = group->group_uid; @@ -1377,6 +1407,9 @@ static int create_queue_group(struct kbase_context *const kctx, MAX_SUPPORTED_STREAMS_PER_GROUP); group->run_state = KBASE_CSF_GROUP_INACTIVE; + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_INACTIVE, group, + group->run_state); + err = create_suspend_buffers(kctx, group); if (err < 0) { @@ -1396,6 +1429,17 @@ static int create_queue_group(struct kbase_context *const kctx, return group_handle; } +static bool dvs_supported(u32 csf_version) +{ + if (GLB_VERSION_MAJOR_GET(csf_version) < 3) + return false; + + if (GLB_VERSION_MAJOR_GET(csf_version) == 3) + if (GLB_VERSION_MINOR_GET(csf_version) < 2) + return false; + + return true; +} int kbase_csf_queue_group_create(struct kbase_context *const kctx, union kbase_ioctl_cs_queue_group_create *const create) @@ -1434,8 +1478,17 @@ int 
kbase_csf_queue_group_create(struct kbase_context *const kctx, dev_warn(kctx->kbdev->dev, "Unknown exception handler flags set: %u", create->in.csi_handlers & ~BASE_CSF_EXCEPTION_HANDLER_FLAGS_MASK); err = -EINVAL; - } else if (create->in.reserved) { - dev_warn(kctx->kbdev->dev, "Reserved field was set to non-0"); + } else if (!dvs_supported(kctx->kbdev->csf.global_iface.version) && + create->in.dvs_buf) { + dev_warn( + kctx->kbdev->dev, + "GPU does not support DVS but userspace is trying to use it"); + err = -EINVAL; + } else if (dvs_supported(kctx->kbdev->csf.global_iface.version) && + !CSG_DVS_BUF_BUFFER_POINTER_GET(create->in.dvs_buf) && + CSG_DVS_BUF_BUFFER_SIZE_GET(create->in.dvs_buf)) { + dev_warn(kctx->kbdev->dev, + "DVS buffer pointer is null but size is not 0"); err = -EINVAL; } else { /* For the CSG which satisfies the condition for having @@ -1555,6 +1608,7 @@ void kbase_csf_term_descheduled_queue_group(struct kbase_queue_group *group) &group->protected_suspend_buf); group->run_state = KBASE_CSF_GROUP_TERMINATED; + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_TERMINATED, group, group->run_state); } /** @@ -1585,6 +1639,34 @@ static void term_queue_group(struct kbase_queue_group *group) kbase_csf_term_descheduled_queue_group(group); } +/** + * wait_group_deferred_deschedule_completion - Wait for refcount of the group to + * become 0 that was taken when the group deschedule had to be deferred. + * + * @group: Pointer to GPU command queue group that is being deleted. + * + * This function is called when Userspace deletes the group and after the group + * has been descheduled. The function synchronizes with the other threads that were + * also trying to deschedule the group whilst the dumping was going on for a fault. + * Please refer the documentation of wait_for_dump_complete_on_group_deschedule() + * for more details. + */ +static void wait_group_deferred_deschedule_completion(struct kbase_queue_group *group) +{ +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct kbase_context *kctx = group->kctx; + + lockdep_assert_held(&kctx->csf.lock); + + if (likely(!group->deschedule_deferred_cnt)) + return; + + mutex_unlock(&kctx->csf.lock); + wait_event(kctx->kbdev->csf.event_wait, !group->deschedule_deferred_cnt); + mutex_lock(&kctx->csf.lock); +#endif +} + static void cancel_queue_group_events(struct kbase_queue_group *group) { cancel_work_sync(&group->timer_event_work); @@ -1626,24 +1708,39 @@ void kbase_csf_queue_group_terminate(struct kbase_context *kctx, group = find_queue_group(kctx, group_handle); if (group) { - remove_pending_group_fatal_error(group); - term_queue_group(group); kctx->csf.queue_groups[group_handle] = NULL; + /* Stop the running of the given group */ + term_queue_group(group); + mutex_unlock(&kctx->csf.lock); + + if (reset_prevented) { + /* Allow GPU reset before cancelling the group specific + * work item to avoid potential deadlock. + * Reset prevention isn't needed after group termination. + */ + kbase_reset_gpu_allow(kbdev); + reset_prevented = false; + } + + /* Cancel any pending event callbacks. If one is in progress + * then this thread waits synchronously for it to complete (which + * is why we must unlock the context first). We already ensured + * that no more callbacks can be enqueued by terminating the group. 
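The dvs_supported() helper introduced earlier in this hunk rejects anything below GLB interface version 3.2; an equivalent single-expression form (illustrative only) makes that reading explicit:

/* Equivalent to dvs_supported(): DVS needs GLB version >= 3.2. */
static bool dvs_supported_equiv(u32 csf_version)
{
	return (GLB_VERSION_MAJOR_GET(csf_version) > 3) ||
	       ((GLB_VERSION_MAJOR_GET(csf_version) == 3) &&
		(GLB_VERSION_MINOR_GET(csf_version) >= 2));
}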
+ */ + cancel_queue_group_events(group); + + mutex_lock(&kctx->csf.lock); + + /* Clean up after the termination */ + remove_pending_group_fatal_error(group); + + wait_group_deferred_deschedule_completion(group); } mutex_unlock(&kctx->csf.lock); if (reset_prevented) kbase_reset_gpu_allow(kbdev); - if (!group) - return; - - /* Cancel any pending event callbacks. If one is in progress - * then this thread waits synchronously for it to complete (which - * is why we must unlock the context first). We already ensured - * that no more callbacks can be enqueued by terminating the group. - */ - cancel_queue_group_events(group); kfree(group); } @@ -1738,7 +1835,6 @@ void kbase_csf_active_queue_groups_reset(struct kbase_device *kbdev, int kbase_csf_ctx_init(struct kbase_context *kctx) { - struct kbase_device *kbdev = kctx->kbdev; int err = -ENOMEM; INIT_LIST_HEAD(&kctx->csf.queue_list); @@ -1747,19 +1843,6 @@ int kbase_csf_ctx_init(struct kbase_context *kctx) kbase_csf_event_init(kctx); kctx->csf.user_reg_vma = NULL; - mutex_lock(&kbdev->pm.lock); - /* The inode information for /dev/malixx file is not available at the - * time of device probe as the inode is created when the device node - * is created by udevd (through mknod). - */ - if (kctx->filp) { - if (!kbdev->csf.mali_file_inode) - kbdev->csf.mali_file_inode = kctx->filp->f_inode; - - /* inode is unique for a file */ - WARN_ON(kbdev->csf.mali_file_inode != kctx->filp->f_inode); - } - mutex_unlock(&kbdev->pm.lock); /* Mark all the cookies as 'free' */ bitmap_fill(kctx->csf.cookies, KBASE_CSF_NUM_USER_IO_PAGES_HANDLE); @@ -1874,8 +1957,6 @@ void kbase_csf_ctx_term(struct kbase_context *kctx) else reset_prevented = true; - cancel_work_sync(&kctx->csf.pending_submission_work); - mutex_lock(&kctx->csf.lock); /* Iterate through the queue groups that were not terminated by @@ -1894,6 +1975,8 @@ void kbase_csf_ctx_term(struct kbase_context *kctx) if (reset_prevented) kbase_reset_gpu_allow(kbdev); + cancel_work_sync(&kctx->csf.pending_submission_work); + /* Now that all queue groups have been terminated, there can be no * more OoM or timer event interrupts but there can be inflight work * items. Destroying the wq will implicitly flush those work items. @@ -1938,7 +2021,11 @@ void kbase_csf_ctx_term(struct kbase_context *kctx) * only one reference left that was taken when queue was * registered. */ +#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) if (atomic_read(&queue->refcount) != 1) +#else + if (refcount_read(&queue->refcount) != 1) +#endif dev_warn(kctx->kbdev->dev, "Releasing queue with incorrect refcounting!\n"); list_del_init(&queue->link); @@ -2059,6 +2146,36 @@ static void report_tiler_oom_error(struct kbase_queue_group *group) kbase_event_wakeup(group->kctx); } +static void flush_gpu_cache_on_fatal_error(struct kbase_device *kbdev) +{ + int err; + const unsigned int cache_flush_wait_timeout_ms = 2000; + + kbase_pm_lock(kbdev); + /* With the advent of partial cache flush, dirty cache lines could + * be left in the GPU L2 caches by terminating the queue group here + * without waiting for proper cache maintenance. A full cache flush + * here will prevent these dirty cache lines from being arbitrarily + * evicted later and possible causing memory corruption. 
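The reworked termination path above drops the context lock (and re-allows GPU reset) before cancelling the group's work items, then re-takes the lock for the final cleanup. A generic sketch of the rule it follows, with placeholder comments instead of driver specifics:

/* cancel_work_sync() may sleep waiting for a running handler, so it must
 * not be called while holding a lock the handler itself acquires.
 */
static void teardown_without_deadlock(struct mutex *lock, struct work_struct *work)
{
	mutex_lock(lock);
	/* detach the object here so the handler can no longer requeue it */
	mutex_unlock(lock);

	cancel_work_sync(work);	/* may block until the handler completes */

	mutex_lock(lock);
	/* remaining cleanup that still needs the lock */
	mutex_unlock(lock);
}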
+ */ + if (kbdev->pm.backend.gpu_powered) { + kbase_gpu_start_cache_clean(kbdev, GPU_COMMAND_CACHE_CLN_INV_L2_LSC); + err = kbase_gpu_wait_cache_clean_timeout(kbdev, cache_flush_wait_timeout_ms); + + if (err) { + dev_warn( + kbdev->dev, + "[%llu] Timeout waiting for cache clean to complete after fatal error", + kbase_backend_get_cycle_cnt(kbdev)); + + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) + kbase_reset_gpu(kbdev); + } + } + + kbase_pm_unlock(kbdev); +} + /** * kbase_queue_oom_event - Handle tiler out-of-memory for a GPU command queue. * @@ -2071,8 +2188,8 @@ static void report_tiler_oom_error(struct kbase_queue_group *group) * notification to allow the firmware to report out-of-memory again in future. * If the out-of-memory condition was successfully handled then this function * rings the relevant doorbell to notify the firmware; otherwise, it terminates - * the GPU command queue group to which the queue is bound. See - * term_queue_group() for details. + * the GPU command queue group to which the queue is bound and notify a waiting + * user space client of the failure. */ static void kbase_queue_oom_event(struct kbase_queue *const queue) { @@ -2084,6 +2201,7 @@ static void kbase_queue_oom_event(struct kbase_queue *const queue) struct kbase_csf_cmd_stream_info const *stream; int csi_index = queue->csi_index; u32 cs_oom_ack, cs_oom_req; + unsigned long flags; lockdep_assert_held(&kctx->csf.lock); @@ -2129,20 +2247,23 @@ static void kbase_queue_oom_event(struct kbase_queue *const queue) err = handle_oom_event(group, stream); + kbase_csf_scheduler_spin_lock(kbdev, &flags); kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_oom_ack, CS_REQ_TILER_OOM_MASK); + kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, slot_num, true); + kbase_csf_scheduler_spin_unlock(kbdev, flags); - if (err) { + if (unlikely(err)) { dev_warn( kbdev->dev, "Queue group to be terminated, couldn't handle the OoM event\n"); + kbase_debug_csf_fault_notify(kbdev, kctx, DF_TILER_OOM); kbase_csf_scheduler_unlock(kbdev); term_queue_group(group); + flush_gpu_cache_on_fatal_error(kbdev); report_tiler_oom_error(group); return; } - - kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, slot_num, true); unlock: kbase_csf_scheduler_unlock(kbdev); } @@ -2164,6 +2285,7 @@ static void oom_event_worker(struct work_struct *data) struct kbase_device *const kbdev = kctx->kbdev; int err = kbase_reset_gpu_try_prevent(kbdev); + /* Regardless of whether reset failed or is currently happening, exit * early */ @@ -2216,12 +2338,13 @@ static void timer_event_worker(struct work_struct *data) struct kbase_queue_group *const group = container_of(data, struct kbase_queue_group, timer_event_work); struct kbase_context *const kctx = group->kctx; + struct kbase_device *const kbdev = kctx->kbdev; bool reset_prevented = false; - int err = kbase_reset_gpu_prevent_and_wait(kctx->kbdev); + int err = kbase_reset_gpu_prevent_and_wait(kbdev); if (err) dev_warn( - kctx->kbdev->dev, + kbdev->dev, "Unsuccessful GPU reset detected when terminating group %d on progress timeout, attempting to terminate regardless", group->handle); else @@ -2230,11 +2353,12 @@ static void timer_event_worker(struct work_struct *data) mutex_lock(&kctx->csf.lock); term_queue_group(group); + flush_gpu_cache_on_fatal_error(kbdev); report_group_timeout_error(group); mutex_unlock(&kctx->csf.lock); if (reset_prevented) - kbase_reset_gpu_allow(kctx->kbdev); + kbase_reset_gpu_allow(kbdev); } /** @@ -2242,11 +2366,15 @@ static void timer_event_worker(struct 
work_struct *data) * * @group: Pointer to GPU queue group for which the timeout event is received. * + * Notify a waiting user space client of the timeout. * Enqueue a work item to terminate the group and notify the event notification * thread of progress timeout fault for the GPU command queue group. */ static void handle_progress_timer_event(struct kbase_queue_group *const group) { + kbase_debug_csf_fault_notify(group->kctx->kbdev, group->kctx, + DF_PROGRESS_TIMER_TIMEOUT); + queue_work(group->kctx->csf.wq, &group->timer_event_work); } @@ -2274,16 +2402,20 @@ static void protm_event_worker(struct work_struct *data) * handle_fault_event - Handler for CS fault. * * @queue: Pointer to queue for which fault event was received. - * @stream: Pointer to the structure containing info provided by the - * firmware about the CSI. - * - * Prints meaningful CS fault information. + * @cs_ack: Value of the CS_ACK register in the CS kernel input page used for + * the queue. * + * Print required information about the CS fault and notify the user space client + * about the fault. */ static void -handle_fault_event(struct kbase_queue *const queue, - struct kbase_csf_cmd_stream_info const *const stream) +handle_fault_event(struct kbase_queue *const queue, const u32 cs_ack) { + struct kbase_device *const kbdev = queue->kctx->kbdev; + struct kbase_csf_cmd_stream_group_info const *ginfo = + &kbdev->csf.global_iface.groups[queue->group->csg_nr]; + struct kbase_csf_cmd_stream_info const *stream = + &ginfo->streams[queue->csi_index]; const u32 cs_fault = kbase_csf_firmware_cs_output(stream, CS_FAULT); const u64 cs_fault_info = kbase_csf_firmware_cs_output(stream, CS_FAULT_INFO_LO) | @@ -2295,7 +2427,6 @@ handle_fault_event(struct kbase_queue *const queue, CS_FAULT_EXCEPTION_DATA_GET(cs_fault); const u64 cs_fault_info_exception_data = CS_FAULT_INFO_EXCEPTION_DATA_GET(cs_fault_info); - struct kbase_device *const kbdev = queue->kctx->kbdev; kbase_csf_scheduler_spin_lock_assert_held(kbdev); @@ -2310,6 +2441,36 @@ handle_fault_event(struct kbase_queue *const queue, kbase_gpu_exception_name(cs_fault_exception_type), cs_fault_exception_data, cs_fault_info_exception_data); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + /* CS_RESOURCE_TERMINATED type fault event can be ignored from the + * standpoint of dump on error. It is used to report fault for the CSIs + * that are associated with the same CSG as the CSI for which the actual + * fault was reported by the Iterator. + * Dumping would be triggered when the actual fault is reported. + * + * CS_INHERIT_FAULT can also be ignored. It could happen due to the error + * in other types of queues (cpu/kcpu). If a fault had occurred in some + * other GPU queue then the dump would have been performed anyways when + * that fault was reported. 
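Read as a predicate, the exception-type filter applied in the lines that follow could be written as the helper below (illustrative only; the two ignored types are secondary reports, as the comment above explains):

/* Secondary fault reports never trigger dump-on-fault by themselves. */
static bool cs_fault_triggers_dump(u32 cs_fault_exception_type)
{
	return (cs_fault_exception_type != CS_FAULT_EXCEPTION_TYPE_CS_INHERIT_FAULT) &&
	       (cs_fault_exception_type != CS_FAULT_EXCEPTION_TYPE_CS_RESOURCE_TERMINATED);
}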
+ */ + if ((cs_fault_exception_type != CS_FAULT_EXCEPTION_TYPE_CS_INHERIT_FAULT) && + (cs_fault_exception_type != CS_FAULT_EXCEPTION_TYPE_CS_RESOURCE_TERMINATED)) { + if (unlikely(kbase_debug_csf_fault_notify(kbdev, queue->kctx, DF_CS_FAULT))) { + get_queue(queue); + queue->cs_error = cs_fault; + queue->cs_error_info = cs_fault_info; + queue->cs_error_fatal = false; + if (!queue_work(queue->kctx->csf.wq, &queue->cs_error_work)) + release_queue(queue); + return; + } + } +#endif + + kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack, + CS_REQ_FAULT_MASK); + kbase_csf_ring_cs_kernel_doorbell(kbdev, queue->csi_index, queue->group->csg_nr, true); } static void report_queue_fatal_error(struct kbase_queue *const queue, @@ -2341,16 +2502,16 @@ static void report_queue_fatal_error(struct kbase_queue *const queue, } /** - * fatal_event_worker - Handle the fatal error for the GPU queue + * fatal_event_worker - Handle the CS_FATAL/CS_FAULT error for the GPU queue * * @data: Pointer to a work_struct embedded in GPU command queue. * * Terminate the CSG and report the error to userspace. */ -static void fatal_event_worker(struct work_struct *const data) +static void cs_error_worker(struct work_struct *const data) { struct kbase_queue *const queue = - container_of(data, struct kbase_queue, fatal_event_work); + container_of(data, struct kbase_queue, cs_error_work); struct kbase_context *const kctx = queue->kctx; struct kbase_device *const kbdev = kctx->kbdev; struct kbase_queue_group *group; @@ -2365,6 +2526,7 @@ static void fatal_event_worker(struct work_struct *const data) else reset_prevented = true; + kbase_debug_csf_fault_wait_completion(kbdev); mutex_lock(&kctx->csf.lock); group = get_bound_queue_group(queue); @@ -2373,9 +2535,35 @@ static void fatal_event_worker(struct work_struct *const data) goto unlock; } +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (!queue->cs_error_fatal) { + unsigned long flags; + int slot_num; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + slot_num = kbase_csf_scheduler_group_get_slot_locked(group); + if (slot_num >= 0) { + struct kbase_csf_cmd_stream_group_info const *ginfo = + &kbdev->csf.global_iface.groups[slot_num]; + struct kbase_csf_cmd_stream_info const *stream = + &ginfo->streams[queue->csi_index]; + u32 const cs_ack = + kbase_csf_firmware_cs_output(stream, CS_ACK); + + kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack, + CS_REQ_FAULT_MASK); + kbase_csf_ring_cs_kernel_doorbell(kbdev, queue->csi_index, + slot_num, true); + } + kbase_csf_scheduler_spin_unlock(kbdev, flags); + goto unlock; + } +#endif + group_handle = group->handle; term_queue_group(group); - report_queue_fatal_error(queue, queue->cs_fatal, queue->cs_fatal_info, + flush_gpu_cache_on_fatal_error(kbdev); + report_queue_fatal_error(queue, queue->cs_error, queue->cs_error_info, group_handle); unlock: @@ -2391,14 +2579,18 @@ unlock: * @queue: Pointer to queue for which fatal event was received. * @stream: Pointer to the structure containing info provided by the * firmware about the CSI. + * @cs_ack: Value of the CS_ACK register in the CS kernel input page used for + * the queue. * - * Prints meaningful CS fatal information. + * Notify a waiting user space client of the CS fatal and prints meaningful + * information. * Enqueue a work item to terminate the group and report the fatal error * to user space. 
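Throughout these handlers an event is treated as pending while its REQ and ACK bits differ under the event's mask, and acknowledged by copying the ACK value back into REQ under the same mask. A compact statement of that convention (illustrative helper, not driver API):

/* True while the masked REQ and ACK bits disagree, i.e. the event has
 * been raised by one side but not yet acknowledged by the other.
 */
static bool csf_event_pending(u32 req, u32 ack, u32 mask)
{
	return ((req ^ ack) & mask) != 0;
}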
*/ static void handle_fatal_event(struct kbase_queue *const queue, - struct kbase_csf_cmd_stream_info const *const stream) + struct kbase_csf_cmd_stream_info const *const stream, + u32 cs_ack) { const u32 cs_fatal = kbase_csf_firmware_cs_output(stream, CS_FATAL); const u64 cs_fatal_info = @@ -2428,57 +2620,26 @@ handle_fatal_event(struct kbase_queue *const queue, if (cs_fatal_exception_type == CS_FATAL_EXCEPTION_TYPE_FIRMWARE_INTERNAL_ERROR) { + kbase_debug_csf_fault_notify(kbdev, queue->kctx, DF_FW_INTERNAL_ERROR); queue_work(system_wq, &kbdev->csf.fw_error_work); } else { + kbase_debug_csf_fault_notify(kbdev, queue->kctx, DF_CS_FATAL); if (cs_fatal_exception_type == CS_FATAL_EXCEPTION_TYPE_CS_UNRECOVERABLE) { queue->group->cs_unrecoverable = true; if (kbase_prepare_to_reset_gpu(queue->kctx->kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(queue->kctx->kbdev); } get_queue(queue); - queue->cs_fatal = cs_fatal; - queue->cs_fatal_info = cs_fatal_info; - if (!queue_work(queue->kctx->csf.wq, &queue->fatal_event_work)) + queue->cs_error = cs_fatal; + queue->cs_error_info = cs_fatal_info; + queue->cs_error_fatal = true; + if (!queue_work(queue->kctx->csf.wq, &queue->cs_error_work)) release_queue(queue); } -} + kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack, + CS_REQ_FATAL_MASK); -/** - * handle_queue_exception_event - Handler for CS fatal/fault exception events. - * - * @queue: Pointer to queue for which fatal/fault event was received. - * @cs_req: Value of the CS_REQ register from the CS's input page. - * @cs_ack: Value of the CS_ACK register from the CS's output page. - */ -static void handle_queue_exception_event(struct kbase_queue *const queue, - const u32 cs_req, const u32 cs_ack) -{ - struct kbase_csf_cmd_stream_group_info const *ginfo; - struct kbase_csf_cmd_stream_info const *stream; - struct kbase_context *const kctx = queue->kctx; - struct kbase_device *const kbdev = kctx->kbdev; - struct kbase_queue_group *group = queue->group; - int csi_index = queue->csi_index; - int slot_num = group->csg_nr; - - kbase_csf_scheduler_spin_lock_assert_held(kbdev); - - ginfo = &kbdev->csf.global_iface.groups[slot_num]; - stream = &ginfo->streams[csi_index]; - - if ((cs_ack & CS_ACK_FATAL_MASK) != (cs_req & CS_REQ_FATAL_MASK)) { - handle_fatal_event(queue, stream); - kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack, - CS_REQ_FATAL_MASK); - } - - if ((cs_ack & CS_ACK_FAULT_MASK) != (cs_req & CS_REQ_FAULT_MASK)) { - handle_fault_event(queue, stream); - kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack, - CS_REQ_FAULT_MASK); - kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, slot_num, true); - } } /** @@ -2531,11 +2692,16 @@ static void process_cs_interrupts(struct kbase_queue_group *const group, kbase_csf_firmware_cs_output(stream, CS_ACK); struct workqueue_struct *wq = group->kctx->csf.wq; - if ((cs_req & CS_REQ_EXCEPTION_MASK) ^ - (cs_ack & CS_ACK_EXCEPTION_MASK)) { + if ((cs_ack & CS_ACK_FATAL_MASK) != (cs_req & CS_REQ_FATAL_MASK)) { KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, CSI_INTERRUPT_FAULT, group, queue, cs_req ^ cs_ack); - handle_queue_exception_event(queue, cs_req, cs_ack); + handle_fatal_event(queue, stream, cs_ack); + } + + if ((cs_ack & CS_ACK_FAULT_MASK) != (cs_req & CS_REQ_FAULT_MASK)) { + KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, CSI_INTERRUPT_FAULT, + group, queue, cs_req ^ cs_ack); + handle_fault_event(queue, cs_ack); } /* PROTM_PEND and TILER_OOM can be safely ignored @@ -2597,6 +2763,8 @@ static void process_cs_interrupts(struct kbase_queue_group *const group, if 
(test_bit(group->csg_nr, scheduler->csg_slots_idle_mask)) { clear_bit(group->csg_nr, scheduler->csg_slots_idle_mask); + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_SLOT_IDLE_CLEAR, group, + scheduler->csg_slots_idle_mask[0]); dev_dbg(kbdev->dev, "Group-%d on slot %d de-idled by protm request", group->handle, group->csg_nr); @@ -2698,7 +2866,12 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, int const c /* If there are non-idle CSGs waiting for a slot, fire * a tock for a replacement. */ - mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0); + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_NON_IDLE_GROUPS, + group, req ^ ack); + kbase_csf_scheduler_invoke_tock(kbdev); + } else { + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_NO_NON_IDLE_GROUPS, + group, req ^ ack); } if (group->scan_seq_num < track->idle_seq) { @@ -2709,14 +2882,15 @@ static void process_csg_interrupts(struct kbase_device *const kbdev, int const c if ((req ^ ack) & CSG_REQ_PROGRESS_TIMER_EVENT_MASK) { kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, ack, - CSG_REQ_PROGRESS_TIMER_EVENT_MASK); + CSG_REQ_PROGRESS_TIMER_EVENT_MASK); - KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_PROGRESS_TIMER_EVENT, - group, req ^ ack); - dev_info(kbdev->dev, + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_INTERRUPT_PROGRESS_TIMER_EVENT, group, + req ^ ack); + dev_info( + kbdev->dev, "[%llu] Iterator PROGRESS_TIMER timeout notification received for group %u of ctx %d_%d on slot %d\n", - kbase_backend_get_cycle_cnt(kbdev), - group->handle, group->kctx->tgid, group->kctx->id, csg_nr); + kbase_backend_get_cycle_cnt(kbdev), group->handle, group->kctx->tgid, + group->kctx->id, csg_nr); handle_progress_timer_event(group); } @@ -2904,7 +3078,7 @@ static inline void process_tracked_info_for_protm(struct kbase_device *kbdev, * for the scheduler to re-examine the case. */ dev_dbg(kbdev->dev, "Attempt pending protm from idle slot %d\n", track->idle_slot); - mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0); + kbase_csf_scheduler_invoke_tock(kbdev); } else if (group) { u32 i, num_groups = kbdev->csf.global_iface.group_num; struct kbase_queue_group *grp; @@ -2927,7 +3101,7 @@ static inline void process_tracked_info_for_protm(struct kbase_device *kbdev, tock_triggered = true; dev_dbg(kbdev->dev, "Attempt new protm from tick/tock idle slot %d\n", i); - mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0); + kbase_csf_scheduler_invoke_tock(kbdev); break; } } @@ -2940,77 +3114,133 @@ static inline void process_tracked_info_for_protm(struct kbase_device *kbdev, } } +static void order_job_irq_clear_with_iface_mem_read(void) +{ + /* Ensure that write to the JOB_IRQ_CLEAR is ordered with regards to the + * read from interface memory. The ordering is needed considering the way + * FW & Kbase writes to the JOB_IRQ_RAWSTAT and JOB_IRQ_CLEAR registers + * without any synchronization. Without the barrier there is no guarantee + * about the ordering, the write to IRQ_CLEAR can take effect after the read + * from interface memory and that could cause a problem for the scenario where + * FW sends back to back notifications for the same CSG for events like + * SYNC_UPDATE and IDLE, but Kbase gets a single IRQ and observes only the + * first event. Similar thing can happen with glb events like CFG_ALLOC_EN + * acknowledgment and GPU idle notification. 
+ * + * MCU CPU + * --------------- ---------------- + * Update interface memory Write to IRQ_CLEAR to clear current IRQ + * + * Write to IRQ_RAWSTAT to raise new IRQ Read interface memory + */ + + /* CPU and GPU would be in the same Outer shareable domain */ + dmb(osh); +} + void kbase_csf_interrupt(struct kbase_device *kbdev, u32 val) { - unsigned long flags; - u32 csg_interrupts = val & ~JOB_IRQ_GLOBAL_IF; - struct irq_idle_and_protm_track track = { .protm_grp = NULL, .idle_seq = U32_MAX }; + bool deferred_handling_glb_idle_irq = false; lockdep_assert_held(&kbdev->hwaccess_lock); KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_START, NULL, val); - kbase_reg_write(kbdev, JOB_CONTROL_REG(JOB_IRQ_CLEAR), val); - if (csg_interrupts != 0) { - kbase_csf_scheduler_spin_lock(kbdev, &flags); - /* Looping through and track the highest idle and protm groups */ - while (csg_interrupts != 0) { - int const csg_nr = ffs(csg_interrupts) - 1; + do { + unsigned long flags; + u32 csg_interrupts = val & ~JOB_IRQ_GLOBAL_IF; + struct irq_idle_and_protm_track track = { .protm_grp = NULL, .idle_seq = U32_MAX }; + bool glb_idle_irq_received = false; - process_csg_interrupts(kbdev, csg_nr, &track); - csg_interrupts &= ~(1 << csg_nr); + kbase_reg_write(kbdev, JOB_CONTROL_REG(JOB_IRQ_CLEAR), val); + order_job_irq_clear_with_iface_mem_read(); + + if (csg_interrupts != 0) { + kbase_csf_scheduler_spin_lock(kbdev, &flags); + /* Looping through and track the highest idle and protm groups */ + while (csg_interrupts != 0) { + int const csg_nr = ffs(csg_interrupts) - 1; + + process_csg_interrupts(kbdev, csg_nr, &track); + csg_interrupts &= ~(1 << csg_nr); + } + + /* Handle protm from the tracked information */ + process_tracked_info_for_protm(kbdev, &track); + kbase_csf_scheduler_spin_unlock(kbdev, flags); } - /* Handle protm from the tracked information */ - process_tracked_info_for_protm(kbdev, &track); - kbase_csf_scheduler_spin_unlock(kbdev, flags); - } + if (val & JOB_IRQ_GLOBAL_IF) { + const struct kbase_csf_global_iface *const global_iface = + &kbdev->csf.global_iface; - if (val & JOB_IRQ_GLOBAL_IF) { - const struct kbase_csf_global_iface *const global_iface = - &kbdev->csf.global_iface; + kbdev->csf.interrupt_received = true; - kbdev->csf.interrupt_received = true; + if (!kbdev->csf.firmware_reloaded) + kbase_csf_firmware_reload_completed(kbdev); + else if (global_iface->output) { + u32 glb_req, glb_ack; - if (!kbdev->csf.firmware_reloaded) - kbase_csf_firmware_reload_completed(kbdev); - else if (global_iface->output) { - u32 glb_req, glb_ack; + kbase_csf_scheduler_spin_lock(kbdev, &flags); + glb_req = + kbase_csf_firmware_global_input_read(global_iface, GLB_REQ); + glb_ack = kbase_csf_firmware_global_output(global_iface, GLB_ACK); + KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_GLB_REQ_ACK, NULL, + glb_req ^ glb_ack); - kbase_csf_scheduler_spin_lock(kbdev, &flags); - glb_req = kbase_csf_firmware_global_input_read( - global_iface, GLB_REQ); - glb_ack = kbase_csf_firmware_global_output( - global_iface, GLB_ACK); - KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_GLB_REQ_ACK, NULL, glb_req ^ glb_ack); + check_protm_enter_req_complete(kbdev, glb_req, glb_ack); - check_protm_enter_req_complete(kbdev, glb_req, glb_ack); + if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK) + process_protm_exit(kbdev, glb_ack); - if ((glb_req ^ glb_ack) & GLB_REQ_PROTM_EXIT_MASK) - process_protm_exit(kbdev, glb_ack); - - /* Handle IDLE Hysteresis notification event */ - if ((glb_req ^ glb_ack) & GLB_REQ_IDLE_EVENT_MASK) { - dev_dbg(kbdev->dev, "Idle-hysteresis event 
flagged"); - kbase_csf_firmware_global_input_mask( + /* Handle IDLE Hysteresis notification event */ + if ((glb_req ^ glb_ack) & GLB_REQ_IDLE_EVENT_MASK) { + dev_dbg(kbdev->dev, "Idle-hysteresis event flagged"); + kbase_csf_firmware_global_input_mask( global_iface, GLB_REQ, glb_ack, GLB_REQ_IDLE_EVENT_MASK); - kbase_csf_scheduler_process_gpu_idle_event(kbdev); + glb_idle_irq_received = true; + /* Defer handling this IRQ to account for a race condition + * where the idle worker could be executed before we have + * finished handling all pending IRQs (including CSG IDLE + * IRQs). + */ + deferred_handling_glb_idle_irq = true; + } + + process_prfcnt_interrupts(kbdev, glb_req, glb_ack); + + kbase_csf_scheduler_spin_unlock(kbdev, flags); + + /* Invoke the MCU state machine as a state transition + * might have completed. + */ + kbase_pm_update_state(kbdev); } - - process_prfcnt_interrupts(kbdev, glb_req, glb_ack); - - kbase_csf_scheduler_spin_unlock(kbdev, flags); - - /* Invoke the MCU state machine as a state transition - * might have completed. - */ - kbase_pm_update_state(kbdev); } + + if (!glb_idle_irq_received) + break; + /* Attempt to serve potential IRQs that might have occurred + * whilst handling the previous IRQ. In case we have observed + * the GLB IDLE IRQ without all CSGs having been marked as + * idle, the GPU would be treated as no longer idle and left + * powered on. + */ + val = kbase_reg_read(kbdev, JOB_CONTROL_REG(JOB_IRQ_STATUS)); + } while (val); + + if (deferred_handling_glb_idle_irq) { + unsigned long flags; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbase_csf_scheduler_process_gpu_idle_event(kbdev); + kbase_csf_scheduler_spin_unlock(kbdev, flags); } wake_up_all(&kbdev->csf.event_wait); + KBASE_KTRACE_ADD(kbdev, CSF_INTERRUPT_END, NULL, val); } @@ -3037,9 +3267,8 @@ int kbase_csf_doorbell_mapping_init(struct kbase_device *kbdev) if (IS_ERR(filp)) return PTR_ERR(filp); - ret = kbase_mem_pool_alloc_pages( - &kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], - 1, &phys, false); + ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], 1, &phys, + false); if (ret <= 0) { fput(filp); @@ -3073,9 +3302,8 @@ int kbase_csf_setup_dummy_user_reg_page(struct kbase_device *kbdev) kbdev->csf.dummy_user_reg_page = as_tagged(0); - ret = kbase_mem_pool_alloc_pages( - &kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], 1, &phys, - false); + ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], 1, &phys, + false); if (ret <= 0) return ret; diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_csg_debugfs.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_csg_debugfs.c index 92a511d79a05..3afbe6d4005e 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_csg_debugfs.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_csg_debugfs.c @@ -23,12 +23,135 @@ #include #include #include -#include #include #if IS_ENABLED(CONFIG_DEBUG_FS) #include "mali_kbase_csf_tl_reader.h" +/* Wait time to be used cumulatively for all the CSG slots. + * Since scheduler lock is held when STATUS_UPDATE request is sent, there won't be + * any other Host request pending on the FW side and usually FW would be responsive + * to the Doorbell IRQs as it won't do any polling for a long time and also it won't + * have to wait for any HW state transition to complete for publishing the status. + * So it is reasonable to expect that handling of STATUS_UPDATE request would be + * relatively very quick. 
+ */ +#define STATUS_UPDATE_WAIT_TIMEOUT 500 + +/* The bitmask of CSG slots for which the STATUS_UPDATE request completed. + * The access to it is serialized with scheduler lock, so at a time it would + * get used either for "active_groups" or per context "groups" debugfs file. + */ +static DECLARE_BITMAP(csg_slots_status_updated, MAX_SUPPORTED_CSGS); + +static +bool csg_slot_status_update_finish(struct kbase_device *kbdev, u32 csg_nr) +{ + struct kbase_csf_cmd_stream_group_info const *const ginfo = + &kbdev->csf.global_iface.groups[csg_nr]; + + return !((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^ + kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) & + CSG_REQ_STATUS_UPDATE_MASK); +} + +static +bool csg_slots_status_update_finish(struct kbase_device *kbdev, + const unsigned long *slots_mask) +{ + const u32 max_csg_slots = kbdev->csf.global_iface.group_num; + bool changed = false; + u32 csg_nr; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + for_each_set_bit(csg_nr, slots_mask, max_csg_slots) { + if (csg_slot_status_update_finish(kbdev, csg_nr)) { + set_bit(csg_nr, csg_slots_status_updated); + changed = true; + } + } + + return changed; +} + +static void wait_csg_slots_status_update_finish(struct kbase_device *kbdev, + unsigned long *slots_mask) +{ + const u32 max_csg_slots = kbdev->csf.global_iface.group_num; + long remaining = kbase_csf_timeout_in_jiffies(STATUS_UPDATE_WAIT_TIMEOUT); + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + bitmap_zero(csg_slots_status_updated, max_csg_slots); + + while (!bitmap_empty(slots_mask, max_csg_slots) && remaining) { + remaining = wait_event_timeout(kbdev->csf.event_wait, + csg_slots_status_update_finish(kbdev, slots_mask), + remaining); + if (likely(remaining)) { + bitmap_andnot(slots_mask, slots_mask, + csg_slots_status_updated, max_csg_slots); + } else { + dev_warn(kbdev->dev, + "STATUS_UPDATE request timed out for slots 0x%lx", + slots_mask[0]); + } + } +} + +static void update_active_groups_status(struct kbase_device *kbdev, struct seq_file *file) +{ + u32 max_csg_slots = kbdev->csf.global_iface.group_num; + DECLARE_BITMAP(used_csgs, MAX_SUPPORTED_CSGS) = { 0 }; + u32 csg_nr; + unsigned long flags; + + lockdep_assert_held(&kbdev->csf.scheduler.lock); + + /* Global doorbell ring for CSG STATUS_UPDATE request or User doorbell + * ring for Extract offset update, shall not be made when MCU has been + * put to sleep otherwise it will undesirably make MCU exit the sleep + * state. Also it isn't really needed as FW will implicitly update the + * status of all on-slot groups when MCU sleep request is sent to it. 
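The status-update request in the next hunk is raised by writing the complement of CSG_ACK into CSG_REQ under CSG_REQ_STATUS_UPDATE_MASK, which guarantees the masked bits differ until the firmware acknowledges. A small illustrative helper naming that idiom (an assumption for clarity, not driver API):

/* Value to feed the *_input_mask() helpers so that REQ toggles relative
 * to ACK under @mask, marking a fresh request for the firmware.
 */
static u32 csf_event_request_val(u32 ack, u32 mask)
{
	return ~ack & mask;
}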
+ */ + if (kbdev->csf.scheduler.state == SCHED_SLEEPING) { + bitmap_copy(csg_slots_status_updated, + kbdev->csf.scheduler.csg_inuse_bitmap, max_csg_slots); + return; + } + + for (csg_nr = 0; csg_nr < max_csg_slots; csg_nr++) { + struct kbase_queue_group *const group = + kbdev->csf.scheduler.csg_slots[csg_nr].resident_group; + if (!group) + continue; + /* Ring the User doorbell for FW to update the Extract offset */ + kbase_csf_ring_doorbell(kbdev, group->doorbell_nr); + set_bit(csg_nr, used_csgs); + } + + /* Return early if there are no on-slot groups */ + if (bitmap_empty(used_csgs, max_csg_slots)) + return; + + kbase_csf_scheduler_spin_lock(kbdev, &flags); + for_each_set_bit(csg_nr, used_csgs, max_csg_slots) { + struct kbase_csf_cmd_stream_group_info const *const ginfo = + &kbdev->csf.global_iface.groups[csg_nr]; + kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, + ~kbase_csf_firmware_csg_output(ginfo, CSG_ACK), + CSG_REQ_STATUS_UPDATE_MASK); + } + + BUILD_BUG_ON(MAX_SUPPORTED_CSGS > (sizeof(used_csgs[0]) * BITS_PER_BYTE)); + kbase_csf_ring_csg_slots_doorbell(kbdev, used_csgs[0]); + kbase_csf_scheduler_spin_unlock(kbdev, flags); + wait_csg_slots_status_update_finish(kbdev, used_csgs); + /* Wait for the User doobell ring to take effect */ + msleep(100); +} + #define MAX_SCHED_STATE_STRING_LEN (16) static const char *scheduler_state_to_string(struct kbase_device *kbdev, enum kbase_csf_scheduler_state sched_state) @@ -77,16 +200,32 @@ static const char *blocked_reason_to_string(u32 reason_id) return cs_blocked_reason[reason_id]; } +static bool sb_source_supported(u32 glb_version) +{ + bool supported = false; + + if (((GLB_VERSION_MAJOR_GET(glb_version) == 3) && + (GLB_VERSION_MINOR_GET(glb_version) >= 5)) || + ((GLB_VERSION_MAJOR_GET(glb_version) == 2) && + (GLB_VERSION_MINOR_GET(glb_version) >= 6)) || + ((GLB_VERSION_MAJOR_GET(glb_version) == 1) && + (GLB_VERSION_MINOR_GET(glb_version) >= 3))) + supported = true; + + return supported; +} + static void kbasep_csf_scheduler_dump_active_queue_cs_status_wait( - struct seq_file *file, u32 wait_status, u32 wait_sync_value, - u64 wait_sync_live_value, u64 wait_sync_pointer, u32 sb_status, - u32 blocked_reason) + struct seq_file *file, u32 glb_version, u32 wait_status, u32 wait_sync_value, + u64 wait_sync_live_value, u64 wait_sync_pointer, u32 sb_status, u32 blocked_reason) { #define WAITING "Waiting" #define NOT_WAITING "Not waiting" seq_printf(file, "SB_MASK: %d\n", CS_STATUS_WAIT_SB_MASK_GET(wait_status)); + if (sb_source_supported(glb_version)) + seq_printf(file, "SB_SOURCE: %d\n", CS_STATUS_WAIT_SB_SOURCE_GET(wait_status)); seq_printf(file, "PROGRESS_WAIT: %s\n", CS_STATUS_WAIT_PROGRESS_WAIT_GET(wait_status) ? 
WAITING : NOT_WAITING); @@ -156,10 +295,13 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, struct kbase_vmap_struct *mapping; u64 *evt; u64 wait_sync_live_value; + u32 glb_version; if (!queue) return; + glb_version = queue->kctx->kbdev->csf.global_iface.version; + if (WARN_ON(queue->csi_index == KBASEP_IF_NR_INVALID || !queue->group)) return; @@ -200,9 +342,8 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, } kbasep_csf_scheduler_dump_active_queue_cs_status_wait( - file, wait_status, wait_sync_value, - wait_sync_live_value, wait_sync_pointer, - sb_status, blocked_reason); + file, glb_version, wait_status, wait_sync_value, + wait_sync_live_value, wait_sync_pointer, sb_status, blocked_reason); } } else { struct kbase_device const *const kbdev = @@ -257,9 +398,8 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, } kbasep_csf_scheduler_dump_active_queue_cs_status_wait( - file, wait_status, wait_sync_value, - wait_sync_live_value, wait_sync_pointer, sb_status, - blocked_reason); + file, glb_version, wait_status, wait_sync_value, wait_sync_live_value, + wait_sync_pointer, sb_status, blocked_reason); /* Dealing with cs_trace */ if (kbase_csf_scheduler_queue_has_trace(queue)) kbasep_csf_scheduler_dump_active_cs_trace(file, stream); @@ -270,54 +410,6 @@ static void kbasep_csf_scheduler_dump_active_queue(struct seq_file *file, seq_puts(file, "\n"); } -static void update_active_group_status(struct seq_file *file, - struct kbase_queue_group *const group) -{ - struct kbase_device *const kbdev = group->kctx->kbdev; - struct kbase_csf_cmd_stream_group_info const *const ginfo = - &kbdev->csf.global_iface.groups[group->csg_nr]; - long remaining = kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms); - unsigned long flags; - - /* Global doorbell ring for CSG STATUS_UPDATE request or User doorbell - * ring for Extract offset update, shall not be made when MCU has been - * put to sleep otherwise it will undesirably make MCU exit the sleep - * state. Also it isn't really needed as FW will implicitly update the - * status of all on-slot groups when MCU sleep request is sent to it. - */ - if (kbdev->csf.scheduler.state == SCHED_SLEEPING) - return; - - /* Ring the User doobell shared between the queues bound to this - * group, to have FW update the CS_EXTRACT for all the queues - * bound to the group. Ring early so that FW gets adequate time - * for the handling. 
- */ - kbase_csf_ring_doorbell(kbdev, group->doorbell_nr); - - kbase_csf_scheduler_spin_lock(kbdev, &flags); - kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, - ~kbase_csf_firmware_csg_output(ginfo, CSG_ACK), - CSG_REQ_STATUS_UPDATE_MASK); - kbase_csf_scheduler_spin_unlock(kbdev, flags); - kbase_csf_ring_csg_doorbell(kbdev, group->csg_nr); - - remaining = wait_event_timeout(kbdev->csf.event_wait, - !((kbase_csf_firmware_csg_input_read(ginfo, CSG_REQ) ^ - kbase_csf_firmware_csg_output(ginfo, CSG_ACK)) & - CSG_REQ_STATUS_UPDATE_MASK), remaining); - - if (!remaining) { - dev_err(kbdev->dev, - "Timed out for STATUS_UPDATE on group %d on slot %d", - group->handle, group->csg_nr); - - seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n", - group->csg_nr); - seq_puts(file, "*** The following group-record is likely stale\n"); - } -} - static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, struct kbase_queue_group *const group) { @@ -331,8 +423,6 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, u8 slot_priority = kbdev->csf.scheduler.csg_slots[group->csg_nr].priority; - update_active_group_status(file, group); - ep_c = kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_EP_CURRENT); ep_r = kbase_csf_firmware_csg_output(ginfo, CSG_STATUS_EP_REQ); @@ -348,6 +438,12 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, CSG_STATUS_STATE_IDLE_MASK) idle = 'Y'; + if (!test_bit(group->csg_nr, csg_slots_status_updated)) { + seq_printf(file, "*** Warn: Timed out for STATUS_UPDATE on slot %d\n", + group->csg_nr); + seq_puts(file, "*** The following group-record is likely stale\n"); + } + seq_puts(file, "GroupID, CSG NR, CSG Prio, Run State, Priority, C_EP(Alloc/Req), F_EP(Alloc/Req), T_EP(Alloc/Req), Exclusive, Idle\n"); seq_printf(file, "%7d, %6d, %8d, %9d, %8d, %11d/%3d, %11d/%3d, %11d/%3d, %9c, %4c\n", group->handle, @@ -363,10 +459,6 @@ static void kbasep_csf_scheduler_dump_active_group(struct seq_file *file, CSG_STATUS_EP_REQ_TILER_EP_GET(ep_r), exclusive, idle); - - /* Wait for the User doobell ring to take effect */ - if (kbdev->csf.scheduler.state != SCHED_SLEEPING) - msleep(100); } else { seq_puts(file, "GroupID, CSG NR, Run State, Priority\n"); seq_printf(file, "%7d, %6d, %9d, %8d\n", @@ -416,10 +508,11 @@ static int kbasep_csf_queue_group_debugfs_show(struct seq_file *file, kbase_csf_scheduler_lock(kbdev); if (kbdev->csf.scheduler.state == SCHED_SLEEPING) { /* Wait for the MCU sleep request to complete. Please refer the - * update_active_group_status() function for the explanation. + * update_active_groups_status() function for the explanation. */ kbase_pm_wait_for_desired_state(kbdev); } + update_active_groups_status(kbdev, file); for (gr = 0; gr < MAX_QUEUE_GROUP_NUM; gr++) { struct kbase_queue_group *const group = kctx->csf.queue_groups[gr]; @@ -455,10 +548,11 @@ static int kbasep_csf_scheduler_dump_active_groups(struct seq_file *file, kbase_csf_scheduler_lock(kbdev); if (kbdev->csf.scheduler.state == SCHED_SLEEPING) { /* Wait for the MCU sleep request to complete. Please refer the - * update_active_group_status() function for the explanation. + * update_active_groups_status() function for the explanation. 
*/ kbase_pm_wait_for_desired_state(kbdev); } + update_active_groups_status(kbdev, file); for (csg_nr = 0; csg_nr < num_groups; csg_nr++) { struct kbase_queue_group *const group = kbdev->csf.scheduler.csg_slots[csg_nr].resident_group; @@ -664,7 +758,6 @@ void kbase_csf_debugfs_init(struct kbase_device *kbdev) &kbasep_csf_debugfs_scheduler_state_fops); kbase_csf_tl_reader_debugfs_init(kbdev); - kbase_csf_firmware_trace_buffer_debugfs_init(kbdev); } #else diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_defs.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_defs.h index 27aa53de110d..32a1c557e387 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_defs.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_defs.h @@ -31,6 +31,7 @@ #include "mali_kbase_csf_firmware.h" #include "mali_kbase_csf_event.h" +#include /* Maximum number of KCPU command queues to be created per GPU address space. */ @@ -355,14 +356,19 @@ struct kbase_csf_notification { * @trace_buffer_size: CS trace buffer size for the queue. * @trace_cfg: CS trace configuration parameters. * @error: GPU command queue fatal information to pass to user space. - * @fatal_event_work: Work item to handle the CS fatal event reported for this - * queue. - * @cs_fatal_info: Records additional information about the CS fatal event. - * @cs_fatal: Records information about the CS fatal event. + * @cs_error_work: Work item to handle the CS fatal event reported for this + * queue or the CS fault event if dump on fault is enabled + * and acknowledgment for CS fault event needs to be done + * after dumping is complete. + * @cs_error_info: Records additional information about the CS fatal event or + * about CS fault event if dump on fault is enabled. + * @cs_error: Records information about the CS fatal event or + * about CS fault event if dump on fault is enabled. + * @cs_error_fatal: Flag to track if the CS fault or CS fatal event occurred. * @pending: Indicating whether the queue has new submitted work. - * @extract_ofs: The current EXTRACT offset, this is updated during certain - * events such as GPU idle IRQ in order to help detect a - * queue's true idle status. + * @extract_ofs: The current EXTRACT offset, this is only updated when handling + * the GLB IDLE IRQ if the idle timeout value is non-0 in order + * to help detect a queue's true idle status. * @saved_cmd_ptr: The command pointer value for the GPU queue, saved when the * group to which queue is bound is suspended. * This can be useful in certain cases to know that till which @@ -377,7 +383,11 @@ struct kbase_queue { int doorbell_nr; unsigned long db_file_offset; struct list_head link; +#if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) atomic_t refcount; +#else + refcount_t refcount; +#endif struct kbase_queue_group *group; struct kbase_va_region *queue_reg; struct work_struct oom_event_work; @@ -397,14 +407,15 @@ struct kbase_queue { u32 trace_buffer_size; u32 trace_cfg; struct kbase_csf_notification error; - struct work_struct fatal_event_work; - u64 cs_fatal_info; - u32 cs_fatal; + struct work_struct cs_error_work; + u64 cs_error_info; + u32 cs_error; + bool cs_error_fatal; atomic_t pending; u64 extract_ofs; #if IS_ENABLED(CONFIG_DEBUG_FS) u64 saved_cmd_ptr; -#endif +#endif /* CONFIG_DEBUG_FS */ }; /** @@ -498,6 +509,9 @@ struct kbase_protected_suspend_buffer { * to be returned to userspace if such an error has occurred. * @timer_event_work: Work item to handle the progress timeout fatal event * for the group. 
+ * @deschedule_deferred_cnt: Counter keeping track of the number of threads + * that tried to deschedule the group and had to defer + * the descheduling due to the dump on fault. */ struct kbase_queue_group { struct kbase_context *kctx; @@ -539,6 +553,15 @@ struct kbase_queue_group { struct work_struct timer_event_work; + /** + * @dvs_buf: Address and size of scratch memory. + * + * Used to store intermediate DVS data by the GPU. + */ + u64 dvs_buf; +#if IS_ENABLED(CONFIG_DEBUG_FS) + u32 deschedule_deferred_cnt; +#endif }; /** @@ -548,10 +571,10 @@ struct kbase_queue_group { * @lock: Lock preventing concurrent access to @array and the @in_use bitmap. * @array: Array of pointers to kernel CPU command queues. * @in_use: Bitmap which indicates which kernel CPU command queues are in use. - * @wq: Dedicated workqueue for processing kernel CPU command queues. - * @num_cmds: The number of commands that have been enqueued across - * all the KCPU command queues. This could be used as a - * timestamp to determine the command's enqueueing time. + * @cmd_seq_num: The sequence number assigned to an enqueued command, + * in incrementing order (older commands shall have a + * smaller number). + * @jit_lock: Lock to serialise JIT operations. * @jit_cmds_head: A list of the just-in-time memory commands, both * allocate & free, in submission order, protected * by kbase_csf_kcpu_queue_context.lock. @@ -564,9 +587,9 @@ struct kbase_csf_kcpu_queue_context { struct mutex lock; struct kbase_kcpu_command_queue *array[KBASEP_MAX_KCPU_QUEUES]; DECLARE_BITMAP(in_use, KBASEP_MAX_KCPU_QUEUES); - struct workqueue_struct *wq; - u64 num_cmds; + atomic64_t cmd_seq_num; + struct mutex jit_lock; struct list_head jit_cmds_head; struct list_head jit_blocked_queues; }; @@ -636,6 +659,28 @@ struct kbase_csf_tiler_heap_context { u64 nr_of_heaps; }; +/** + * struct kbase_csf_ctx_heap_reclaim_info - Object representing the data section of + * a kctx for tiler heap reclaim manager + * @mgr_link: Link for hooking up to the heap reclaim manager's kctx lists + * @nr_freed_pages: Number of freed pages from the kctx, after its attachment + * to the reclaim manager. This is used for tracking reclaim's + * free operation progress. + * @nr_est_unused_pages: Estimated number of pages that could be freed for the kctx + * when all its CSGs are off-slot, on attaching to the reclaim + * manager. + * @on_slot_grps: Number of on-slot groups from this kctx. In principle, if a + * kctx has groups on-slot, the scheduler will detach it from + * the tiler heap reclaim manager, i.e. no tiler heap memory + * reclaiming operations on the kctx. + */ +struct kbase_csf_ctx_heap_reclaim_info { + struct list_head mgr_link; + u32 nr_freed_pages; + u32 nr_est_unused_pages; + u8 on_slot_grps; +}; + /** * struct kbase_csf_scheduler_context - Object representing the scheduler's * context for a GPU address space. @@ -657,6 +702,10 @@ struct kbase_csf_tiler_heap_context { * streams bound to groups of @idle_wait_groups list. * @ngrp_to_schedule: Number of groups added for the context to the * 'groups_to_schedule' list of scheduler instance. + * @heap_info: Heap reclaim information data of the kctx. As the + * reclaim action needs to be coordinated with the scheduler + * operations, any manipulation of the data needs to be done + * while holding the scheduler's mutex lock.
*/ struct kbase_csf_scheduler_context { struct list_head runnable_groups[KBASE_QUEUE_GROUP_PRIORITY_COUNT]; @@ -666,6 +715,7 @@ struct kbase_csf_scheduler_context { struct workqueue_struct *sync_update_wq; struct work_struct sync_update_work; u32 ngrp_to_schedule; + struct kbase_csf_ctx_heap_reclaim_info heap_info; }; /** @@ -808,6 +858,22 @@ struct kbase_csf_csg_slot { u8 priority; }; +/** + * struct kbase_csf_sched_heap_reclaim_mgr - Object for managing tiler heap reclaim + * kctx lists inside the CSF device's scheduler. + * + * @heap_reclaim: Tiler heap reclaim shrinker object. + * @ctx_lists: Array of kctx lists, size matching CSG defined priorities. The + * lists track the kctxs attached to the reclaim manager. + * @unused_pages: Estimated number of unused pages from the @ctxlist array. The + * number is indicative for use with reclaim shrinker's count method. + */ +struct kbase_csf_sched_heap_reclaim_mgr { + struct shrinker heap_reclaim; + struct list_head ctx_lists[KBASE_QUEUE_GROUP_PRIORITY_COUNT]; + atomic_t unused_pages; +}; + /** * struct kbase_csf_scheduler - Object representing the scheduler used for * CSF for an instance of GPU platform device. @@ -880,6 +946,8 @@ struct kbase_csf_csg_slot { * operation to implement timeslice-based scheduling. * @tock_work: Work item that would perform the schedule on tock * operation to implement the asynchronous scheduling. + * @pending_tock_work: Indicates that the tock work item should re-execute + * once it's finished instead of going back to sleep. * @ping_work: Work item that would ping the firmware at regular * intervals, only if there is a single active CSG * slot, to check if firmware is alive and would @@ -889,8 +957,6 @@ struct kbase_csf_csg_slot { * @top_grp. * @top_grp: Pointer to queue group inside @groups_to_schedule * list that was assigned the highest slot priority. - * @tock_pending_request: A "tock" request is pending: a group that is not - * currently on the GPU demands to be scheduled. * @active_protm_grp: Indicates if firmware has been permitted to let GPU * enter protected mode with the given group. On exit * from protected mode the pointer is reset to NULL. @@ -903,6 +969,13 @@ struct kbase_csf_csg_slot { * handler. * @gpu_idle_work: Work item for facilitating the scheduler to bring * the GPU to a low-power mode on becoming idle. + * @fast_gpu_idle_handling: Indicates whether to relax many of the checks + * normally done in the GPU idle worker. This is + * set to true when handling the GLB IDLE IRQ if the + * idle hysteresis timeout is 0, since it makes it + * possible to receive this IRQ before the extract + * offset is published (which would cause more + * extensive GPU idle checks to fail). * @gpu_no_longer_idle: Effective only when the GPU idle worker has been * queued for execution, this indicates whether the * GPU has become non-idle since the last time the @@ -934,6 +1007,7 @@ struct kbase_csf_csg_slot { * groups. It is updated on every tick/tock. * @interrupt_lock is used to serialize the access. * @protm_enter_time: GPU protected mode enter time. + * @reclaim_mgr: CSGs tiler heap manager object. 
*/ struct kbase_csf_scheduler { struct mutex lock; @@ -960,13 +1034,14 @@ struct kbase_csf_scheduler { struct hrtimer tick_timer; struct work_struct tick_work; struct delayed_work tock_work; + atomic_t pending_tock_work; struct delayed_work ping_work; struct kbase_context *top_ctx; struct kbase_queue_group *top_grp; - bool tock_pending_request; struct kbase_queue_group *active_protm_grp; struct workqueue_struct *idle_wq; struct work_struct gpu_idle_work; + bool fast_gpu_idle_handling; atomic_t gpu_no_longer_idle; atomic_t non_idle_offslot_grps; u32 non_idle_scanout_grps; @@ -975,6 +1050,7 @@ struct kbase_csf_scheduler { bool tick_timer_active; u32 tick_protm_pending_seq; ktime_t protm_enter_time; + struct kbase_csf_sched_heap_reclaim_mgr reclaim_mgr; }; /* @@ -1161,6 +1237,7 @@ struct kbase_ipa_control { * @flags: bitmask of CSF_FIRMWARE_ENTRY_* conveying the interface attributes * @data_start: Offset into firmware image at which the interface data starts * @data_end: Offset into firmware image at which the interface data ends + * @virtual_exe_start: Starting GPU execution virtual address of this interface * @kernel_map: A kernel mapping of the memory or NULL if not required to be * mapped in the kernel * @pma: Array of pointers to protected memory allocations. @@ -1177,6 +1254,7 @@ struct kbase_csf_firmware_interface { u32 flags; u32 data_start; u32 data_end; + u32 virtual_exe_start; void *kernel_map; struct protected_memory_allocation **pma; }; @@ -1208,6 +1286,74 @@ struct kbase_csf_mcu_fw { u8 *data; }; +/* + * Firmware log polling period. + */ +#define KBASE_CSF_FIRMWARE_LOG_POLL_PERIOD_MS 25 + +/** + * enum kbase_csf_firmware_log_mode - Firmware log operating mode + * + * @KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL: Manual mode, firmware log can be read + * manually by the userspace (and it will also be dumped automatically into + * dmesg on GPU reset). + * + * @KBASE_CSF_FIRMWARE_LOG_MODE_AUTO_PRINT: Automatic printing mode, firmware log + * will be periodically emptied into dmesg, manual reading through debugfs is + * disabled. + */ +enum kbase_csf_firmware_log_mode { + KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL, + KBASE_CSF_FIRMWARE_LOG_MODE_AUTO_PRINT +}; + +/** + * struct kbase_csf_firmware_log - Object containing members for handling firmware log. + * + * @mode: Firmware log operating mode. + * @busy: Indicating whether a firmware log operation is in progress. + * @poll_work: Work item that would poll firmware log buffer + * at regular intervals to perform any periodic + * activities required by current log mode. + * @dump_buf: Buffer used for dumping the log. + * @func_call_list_va_start: Virtual address of the start of the call list of FW log functions. + * @func_call_list_va_end: Virtual address of the end of the call list of FW log functions. + */ +struct kbase_csf_firmware_log { + enum kbase_csf_firmware_log_mode mode; + atomic_t busy; + struct delayed_work poll_work; + u8 *dump_buf; + u32 func_call_list_va_start; + u32 func_call_list_va_end; +}; + +#if IS_ENABLED(CONFIG_DEBUG_FS) +/** + * struct kbase_csf_dump_on_fault - Faulty information to deliver to the daemon + * + * @error_code: Error code. + * @kctx_tgid: tgid value of the Kbase context for which the fault happened. + * @kctx_id: id of the Kbase context for which the fault happened. + * @enabled: Flag to indicate that 'csf_fault' debugfs has been opened + * so dump on fault is enabled. + * @fault_wait_wq: Waitqueue on which user space client is blocked till kbase + * reports a fault. 
+ * @dump_wait_wq: Waitqueue on which kbase threads are blocked till user space client + * completes the dump on fault. + * @lock: Lock to protect this struct members from concurrent access. + */ +struct kbase_csf_dump_on_fault { + enum dumpfault_error_type error_code; + u32 kctx_tgid; + u32 kctx_id; + atomic_t enabled; + wait_queue_head_t fault_wait_wq; + wait_queue_head_t dump_wait_wq; + spinlock_t lock; +}; +#endif /* CONFIG_DEBUG_FS*/ + /** * struct kbase_csf_device - Object representing CSF for an instance of GPU * platform device. @@ -1251,11 +1397,14 @@ struct kbase_csf_mcu_fw { * in the address space of every process, that created * a Base context, to enable the access to LATEST_FLUSH * register from userspace. + * @nr_user_page_mapped: The number of clients using the mapping of USER page. + * This is used to maintain backward compatibility. + * It's protected by @reg_lock. * @mali_file_inode: Pointer to the inode corresponding to mali device * file. This is needed in order to switch to the * @dummy_user_reg_page on GPU power down. * All instances of the mali device file will point to - * the same inode. + * the same inode. It's protected by @reg_lock. * @reg_lock: Lock to serialize the MCU firmware related actions * that affect all contexts such as allocation of * regions from shared interface area, assignment of @@ -1320,6 +1469,8 @@ struct kbase_csf_mcu_fw { * @hwcnt: Contain members required for handling the dump of * HW counters. * @fw: Copy of the loaded MCU firmware image. + * @fw_log: Contain members required for handling firmware log. + * @dof: Structure for dump on fault. */ struct kbase_csf_device { struct kbase_mmu_table mcu_mmu; @@ -1334,6 +1485,7 @@ struct kbase_csf_device { u32 db_file_offsets; struct tagged_addr dummy_db_page; struct tagged_addr dummy_user_reg_page; + u32 nr_user_page_mapped; struct inode *mali_file_inode; struct mutex reg_lock; wait_queue_head_t event_wait; @@ -1360,6 +1512,10 @@ struct kbase_csf_device { unsigned int fw_timeout_ms; struct kbase_csf_hwcnt hwcnt; struct kbase_csf_mcu_fw fw; + struct kbase_csf_firmware_log fw_log; +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct kbase_csf_dump_on_fault dof; +#endif /* CONFIG_DEBUG_FS */ }; /** diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_event.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_event.c index 170b7ec51af7..49e52938499f 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_event.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_event.c @@ -169,7 +169,8 @@ void kbase_csf_event_term(struct kbase_context *kctx) kfree(event_cb); } - WARN_ON(!list_empty(&kctx->csf.event.error_list)); + WARN(!list_empty(&kctx->csf.event.error_list), + "Error list not empty for ctx %d_%d\n", kctx->tgid, kctx->id); spin_unlock_irqrestore(&kctx->csf.event.lock, flags); } @@ -244,6 +245,14 @@ bool kbase_csf_event_error_pending(struct kbase_context *kctx) bool error_pending = false; unsigned long flags; + /* Withhold the error event if the dump on fault is ongoing. + * This would prevent the Userspace from taking error recovery actions + * (which can potentially affect the state that is being dumped). + * Event handling thread would eventually notice the error event. 
+ */ + if (unlikely(!kbase_debug_csf_fault_dump_complete(kctx->kbdev))) + return false; + spin_lock_irqsave(&kctx->csf.event.lock, flags); error_pending = !list_empty(&kctx->csf.event.error_list); diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.c index 0fb56e0094c5..1f4a4d9b6876 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.c @@ -21,6 +21,7 @@ #include "mali_kbase.h" #include "mali_kbase_csf_firmware_cfg.h" +#include "mali_kbase_csf_firmware_log.h" #include "mali_kbase_csf_trace_buffer.h" #include "mali_kbase_csf_timeout.h" #include "mali_kbase_mem.h" @@ -77,9 +78,11 @@ MODULE_PARM_DESC(fw_debug, "Enables effective use of a debugger for debugging firmware code."); #endif -#define FIRMWARE_HEADER_MAGIC (0xC3F13A6Eul) -#define FIRMWARE_HEADER_VERSION (0ul) -#define FIRMWARE_HEADER_LENGTH (0x14ul) + +#define FIRMWARE_HEADER_MAGIC (0xC3F13A6Eul) +#define FIRMWARE_HEADER_VERSION_MAJOR (0ul) +#define FIRMWARE_HEADER_VERSION_MINOR (2ul) +#define FIRMWARE_HEADER_LENGTH (0x14ul) #define CSF_FIRMWARE_ENTRY_SUPPORTED_FLAGS \ (CSF_FIRMWARE_ENTRY_READ | \ @@ -92,10 +95,10 @@ MODULE_PARM_DESC(fw_debug, #define CSF_FIRMWARE_ENTRY_TYPE_INTERFACE (0) #define CSF_FIRMWARE_ENTRY_TYPE_CONFIGURATION (1) -#define CSF_FIRMWARE_ENTRY_TYPE_FUTF_TEST (2) #define CSF_FIRMWARE_ENTRY_TYPE_TRACE_BUFFER (3) #define CSF_FIRMWARE_ENTRY_TYPE_TIMELINE_METADATA (4) #define CSF_FIRMWARE_ENTRY_TYPE_BUILD_INFO_METADATA (6) +#define CSF_FIRMWARE_ENTRY_TYPE_FUNC_CALL_LIST (7) #define CSF_FIRMWARE_CACHE_MODE_NONE (0ul << 3) #define CSF_FIRMWARE_CACHE_MODE_CACHED (1ul << 3) @@ -431,8 +434,8 @@ static void load_fw_image_section(struct kbase_device *kbdev, const u8 *data, memset(p + copy_len, 0, zi_len); } - kbase_sync_single_for_device(kbdev, kbase_dma_addr(page), - PAGE_SIZE, DMA_TO_DEVICE); + kbase_sync_single_for_device(kbdev, kbase_dma_addr_from_tagged(phys[page_num]), + PAGE_SIZE, DMA_TO_DEVICE); kunmap_atomic(p); } } @@ -525,6 +528,58 @@ static inline bool entry_find_large_page_to_reuse( *pma = NULL; + /* If the section starts at 2MB aligned boundary, + * then use 2MB page(s) for it. + */ + if (!(virtual_start & (SZ_2M - 1))) { + *num_pages_aligned = + round_up(*num_pages_aligned, NUM_4K_PAGES_IN_2MB_PAGE); + *is_small_page = false; + goto out; + } + + /* If the section doesn't lie within the same 2MB aligned boundary, + * then use 4KB pages as it would be complicated to use a 2MB page + * for such section. + */ + if ((virtual_start & ~(SZ_2M - 1)) != (virtual_end & ~(SZ_2M - 1))) + goto out; + + /* Find the nearest 2MB aligned section which comes before the current + * section. 
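+ * If such a section is found and its aligned allocation already covers the page + * range at this section's offset, its physical pages (or protected memory + * allocations) are reused instead of allocating new ones.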
+ */ + list_for_each_entry(interface, &kbdev->csf.firmware_interfaces, node) { + const u32 virtual_diff = virtual_start - interface->virtual; + + if (interface->virtual > virtual_end) + continue; + + if (interface->virtual & (SZ_2M - 1)) + continue; + + if (virtual_diff < virtual_diff_min) { + target_interface = interface; + virtual_diff_min = virtual_diff; + } + } + + if (target_interface) { + const u32 page_index = virtual_diff_min >> PAGE_SHIFT; + + if (page_index >= target_interface->num_pages_aligned) + goto out; + + if (target_interface->phys) + *phys = &target_interface->phys[page_index]; + + if (target_interface->pma) + *pma = &target_interface->pma[page_index / NUM_4K_PAGES_IN_2MB_PAGE]; + + *is_small_page = false; + reuse_large_page = true; + } + +out: return reuse_large_page; } @@ -555,6 +610,8 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev, u32 num_pages; u32 num_pages_aligned; char *name; + void *name_entry; + unsigned int name_len; struct tagged_addr *phys = NULL; struct kbase_csf_firmware_interface *interface = NULL; bool allocated_pages = false, protected_mode = false; @@ -625,8 +682,8 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev, } else { if (!reuse_pages) { ret = kbase_mem_pool_alloc_pages( - kbase_mem_pool_group_select( - kbdev, KBASE_MEM_GROUP_CSF_FW, is_small_page), + kbase_mem_pool_group_select(kbdev, KBASE_MEM_GROUP_CSF_FW, + is_small_page), num_pages_aligned, phys, false); } } @@ -643,21 +700,24 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev, data_start, data_end); /* Allocate enough memory for the struct kbase_csf_firmware_interface and - * the name of the interface. An extra byte is allocated to place a - * NUL-terminator in. This should already be included according to the - * specification but here we add it anyway to be robust against a - * corrupt firmware image. + * the name of the interface. */ - interface = kmalloc(sizeof(*interface) + - size - INTERFACE_ENTRY_NAME_OFFSET + 1, GFP_KERNEL); + name_entry = (void *)entry + INTERFACE_ENTRY_NAME_OFFSET; + name_len = strnlen(name_entry, size - INTERFACE_ENTRY_NAME_OFFSET); + if (size < (INTERFACE_ENTRY_NAME_OFFSET + name_len + 1 + sizeof(u32))) { + dev_err(kbdev->dev, "Memory setup entry too short to contain virtual_exe_start"); + ret = -EINVAL; + goto out; + } + + interface = kmalloc(sizeof(*interface) + name_len + 1, GFP_KERNEL); if (!interface) { ret = -ENOMEM; goto out; } name = (void *)(interface + 1); - memcpy(name, entry + (INTERFACE_ENTRY_NAME_OFFSET / sizeof(*entry)), - size - INTERFACE_ENTRY_NAME_OFFSET); - name[size - INTERFACE_ENTRY_NAME_OFFSET] = 0; + memcpy(name, name_entry, name_len); + name[name_len] = 0; interface->name = name; interface->phys = phys; @@ -672,6 +732,11 @@ static int parse_memory_setup_entry(struct kbase_device *kbdev, interface->data_end = data_end; interface->pma = pma; + /* Discover the virtual execution address field after the end of the name + * field taking into account the NULL-termination character. 
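+ * The field is a u32 holding the GPU virtual address at which the section is + * expected to execute (its final execution location), which may differ from the + * address it is loaded at.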
+ */ + interface->virtual_exe_start = *((u32 *)(name_entry + name_len + 1)); + mem_flags = convert_mem_flags(kbdev, flags, &cache_mode); if (flags & CSF_FIRMWARE_ENTRY_SHARED) { @@ -956,6 +1021,15 @@ static int load_firmware_entry(struct kbase_device *kbdev, const struct kbase_cs return -EINVAL; } return parse_build_info_metadata_entry(kbdev, fw, entry, size); + case CSF_FIRMWARE_ENTRY_TYPE_FUNC_CALL_LIST: + /* Function call list section */ + if (size < 2 * sizeof(*entry)) { + dev_err(kbdev->dev, "Function call list entry too short (size=%u)\n", + size); + return -EINVAL; + } + kbase_csf_firmware_log_parse_logging_call_list_entry(kbdev, entry); + break; } if (!optional) { @@ -1179,40 +1253,80 @@ static int parse_capabilities(struct kbase_device *kbdev) return 0; } +static inline void access_firmware_memory_common(struct kbase_device *kbdev, + struct kbase_csf_firmware_interface *interface, u32 offset_bytes, + u32 *value, const bool read) +{ + u32 page_num = offset_bytes >> PAGE_SHIFT; + u32 offset_in_page = offset_bytes & ~PAGE_MASK; + struct page *target_page = as_page(interface->phys[page_num]); + uintptr_t cpu_addr = (uintptr_t)kmap_atomic(target_page); + u32 *addr = (u32 *)(cpu_addr + offset_in_page); + + if (read) { + kbase_sync_single_for_device(kbdev, + kbase_dma_addr_from_tagged(interface->phys[page_num]) + offset_in_page, + sizeof(u32), DMA_BIDIRECTIONAL); + *value = *addr; + } else { + *addr = *value; + kbase_sync_single_for_device(kbdev, + kbase_dma_addr_from_tagged(interface->phys[page_num]) + offset_in_page, + sizeof(u32), DMA_BIDIRECTIONAL); + } + + kunmap_atomic((u32 *)cpu_addr); +} + static inline void access_firmware_memory(struct kbase_device *kbdev, u32 gpu_addr, u32 *value, const bool read) { - struct kbase_csf_firmware_interface *interface; + struct kbase_csf_firmware_interface *interface, *access_interface = NULL; + u32 offset_bytes = 0; list_for_each_entry(interface, &kbdev->csf.firmware_interfaces, node) { if ((gpu_addr >= interface->virtual) && (gpu_addr < interface->virtual + (interface->num_pages << PAGE_SHIFT))) { - u32 offset_bytes = gpu_addr - interface->virtual; - u32 page_num = offset_bytes >> PAGE_SHIFT; - u32 offset_in_page = offset_bytes & ~PAGE_MASK; - struct page *target_page = as_page( - interface->phys[page_num]); - u32 *cpu_addr = kmap_atomic(target_page); - - if (read) { - kbase_sync_single_for_device(kbdev, - kbase_dma_addr(target_page) + offset_in_page, - sizeof(u32), DMA_BIDIRECTIONAL); - - *value = cpu_addr[offset_in_page >> 2]; - } else { - cpu_addr[offset_in_page >> 2] = *value; - - kbase_sync_single_for_device(kbdev, - kbase_dma_addr(target_page) + offset_in_page, - sizeof(u32), DMA_BIDIRECTIONAL); - } - - kunmap_atomic(cpu_addr); - return; + offset_bytes = gpu_addr - interface->virtual; + access_interface = interface; + break; } } - dev_warn(kbdev->dev, "Invalid GPU VA %x passed\n", gpu_addr); + + if (access_interface) + access_firmware_memory_common(kbdev, access_interface, offset_bytes, value, read); + else + dev_warn(kbdev->dev, "Invalid GPU VA %x passed", gpu_addr); +} + +static inline void access_firmware_memory_exe(struct kbase_device *kbdev, + u32 gpu_addr, u32 *value, const bool read) +{ + struct kbase_csf_firmware_interface *interface, *access_interface = NULL; + u32 offset_bytes = 0; + + list_for_each_entry(interface, &kbdev->csf.firmware_interfaces, node) { + if ((gpu_addr >= interface->virtual_exe_start) && + (gpu_addr < interface->virtual_exe_start + + (interface->num_pages << PAGE_SHIFT))) { + offset_bytes = gpu_addr - 
interface->virtual_exe_start; + access_interface = interface; + + /* If there's an overlap in execution address range between a moved and a + * non-moved areas, always prefer the moved one. The idea is that FW may + * move sections around during init time, but after the layout is settled, + * any moved sections are going to override non-moved areas at the same + * location. + */ + if (interface->virtual_exe_start != interface->virtual) + break; + } + } + + if (access_interface) + access_firmware_memory_common(kbdev, access_interface, offset_bytes, value, read); + else + dev_warn(kbdev->dev, "Invalid GPU VA %x passed", gpu_addr); } void kbase_csf_read_firmware_memory(struct kbase_device *kbdev, @@ -1227,6 +1341,18 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev, access_firmware_memory(kbdev, gpu_addr, &value, false); } +void kbase_csf_read_firmware_memory_exe(struct kbase_device *kbdev, + u32 gpu_addr, u32 *value) +{ + access_firmware_memory_exe(kbdev, gpu_addr, value, true); +} + +void kbase_csf_update_firmware_memory_exe(struct kbase_device *kbdev, + u32 gpu_addr, u32 value) +{ + access_firmware_memory_exe(kbdev, gpu_addr, &value, false); +} + void kbase_csf_firmware_cs_input( const struct kbase_csf_cmd_stream_info *const info, const u32 offset, const u32 value) @@ -1462,11 +1588,10 @@ static bool global_request_complete(struct kbase_device *const kbdev, return complete; } -static int wait_for_global_request(struct kbase_device *const kbdev, - u32 const req_mask) +static int wait_for_global_request_with_timeout(struct kbase_device *const kbdev, + u32 const req_mask, unsigned int timeout_ms) { - const long wait_timeout = - kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms); + const long wait_timeout = kbase_csf_timeout_in_jiffies(timeout_ms); long remaining; int err = 0; @@ -1475,10 +1600,9 @@ static int wait_for_global_request(struct kbase_device *const kbdev, wait_timeout); if (!remaining) { - dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for global request %x to complete", - kbase_backend_get_cycle_cnt(kbdev), - kbdev->csf.fw_timeout_ms, - req_mask); + dev_warn(kbdev->dev, + "[%llu] Timeout (%d ms) waiting for global request %x to complete", + kbase_backend_get_cycle_cnt(kbdev), timeout_ms, req_mask); err = -ETIMEDOUT; } @@ -1486,6 +1610,11 @@ static int wait_for_global_request(struct kbase_device *const kbdev, return err; } +static int wait_for_global_request(struct kbase_device *const kbdev, u32 const req_mask) +{ + return wait_for_global_request_with_timeout(kbdev, req_mask, kbdev->csf.fw_timeout_ms); +} + static void set_global_request( const struct kbase_csf_global_iface *const global_iface, u32 const req_mask) @@ -1559,6 +1688,25 @@ static void enable_gpu_idle_timer(struct kbase_device *const kbdev) } +/** + * kbasep_enable_rtu - Enable Ray Tracing Unit on powering up shader core + * + * @kbdev: The kbase device structure of the device + * + * This function needs to be called to enable the Ray Tracing Unit + * by writing SHADER_PWRFEATURES only when host controls shader cores power. 
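+ * The register write is skipped for GPUs with a product ID below + * GPU_ID2_PRODUCT_MAKE(12, 8, 3, 0).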
+ */ +static void kbasep_enable_rtu(struct kbase_device *kbdev) +{ + const u32 gpu_id = kbdev->gpu_props.props.raw_props.gpu_id; + + if (gpu_id < GPU_ID2_PRODUCT_MAKE(12, 8, 3, 0)) + return; + + if (kbdev->csf.firmware_hctl_core_pwr) + kbase_reg_write(kbdev, GPU_CONTROL_REG(SHADER_PWRFEATURES), 1); +} + static void global_init(struct kbase_device *const kbdev, u64 core_mask) { u32 const ack_irq_mask = @@ -1574,6 +1722,8 @@ static void global_init(struct kbase_device *const kbdev, u64 core_mask) kbase_csf_scheduler_spin_lock(kbdev, &flags); + kbasep_enable_rtu(kbdev); + /* Update shader core allocation enable mask */ enable_endpoints_global(global_iface, core_mask); enable_shader_poweroff_timer(kbdev, global_iface); @@ -1854,7 +2004,6 @@ end: static u32 convert_dur_to_core_pwroff_count(struct kbase_device *kbdev, const u32 dur_us) { -#define PWROFF_VAL_UNIT_SHIFT (10) /* Get the cntfreq_el0 value, which drives the SYSTEM_TIMESTAMP */ u64 freq = arch_timer_get_cntfrq(); u64 dur_val = dur_us; @@ -1991,16 +2140,6 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) kbdev->csf.fw_timeout_ms = kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT); - kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; -#ifdef KBASE_PM_RUNTIME - if (kbase_pm_gpu_sleep_allowed(kbdev)) - kbdev->csf.gpu_idle_hysteresis_ms /= - FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; -#endif - WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); - kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count( - kbdev, kbdev->csf.gpu_idle_hysteresis_ms); - kbdev->csf.mcu_core_pwroff_dur_us = DEFAULT_GLB_PWROFF_TIMEOUT_US; kbdev->csf.mcu_core_pwroff_dur_count = convert_dur_to_core_pwroff_count( kbdev, DEFAULT_GLB_PWROFF_TIMEOUT_US); @@ -2020,7 +2159,26 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) return 0; } -int kbase_csf_firmware_init(struct kbase_device *kbdev) +void kbase_csf_firmware_early_term(struct kbase_device *kbdev) +{ + mutex_destroy(&kbdev->csf.reg_lock); +} + +int kbase_csf_firmware_late_init(struct kbase_device *kbdev) +{ + kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; +#ifdef KBASE_PM_RUNTIME + if (kbase_pm_gpu_sleep_allowed(kbdev)) + kbdev->csf.gpu_idle_hysteresis_ms /= FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; +#endif + WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); + kbdev->csf.gpu_idle_dur_count = + convert_dur_to_idle_count(kbdev, kbdev->csf.gpu_idle_hysteresis_ms); + + return 0; +} + +int kbase_csf_firmware_load_init(struct kbase_device *kbdev) { const struct firmware *firmware = NULL; struct kbase_csf_mcu_fw *const mcu_fw = &kbdev->csf.fw; @@ -2093,7 +2251,8 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) version_minor = mcu_fw->data[4]; version_major = mcu_fw->data[5]; - if (version_major != FIRMWARE_HEADER_VERSION) { + if (version_major != FIRMWARE_HEADER_VERSION_MAJOR || + version_minor != FIRMWARE_HEADER_VERSION_MINOR) { dev_err(kbdev->dev, "Firmware header version %d.%d not understood\n", version_major, version_minor); @@ -2188,6 +2347,12 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) if (ret != 0) goto err_out; + ret = kbase_csf_firmware_log_init(kbdev); + if (ret != 0) { + dev_err(kbdev->dev, "Failed to initialize FW trace (err %d)", ret); + goto err_out; + } + /* Firmware loaded successfully, ret = 0 */ KBASE_KTRACE_ADD(kbdev, CSF_FIRMWARE_BOOT, NULL, (((u64)version_hash) << 32) | @@ -2195,11 +2360,11 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) return 0; err_out: - kbase_csf_firmware_term(kbdev); + 
kbase_csf_firmware_unload_term(kbdev); return ret; } -void kbase_csf_firmware_term(struct kbase_device *kbdev) +void kbase_csf_firmware_unload_term(struct kbase_device *kbdev) { unsigned long flags; int ret = 0; @@ -2210,6 +2375,8 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev) WARN(ret, "failed to wait for GPU reset"); + kbase_csf_firmware_log_term(kbdev); + kbase_csf_firmware_cfg_term(kbdev); kbase_csf_timeout_term(kbdev); @@ -2297,8 +2464,6 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev) */ kbase_mcu_shared_interface_region_tracker_term(kbdev); - mutex_destroy(&kbdev->csf.reg_lock); - kbase_mmu_term(kbdev, &kbdev->csf.mcu_mmu); /* Release the address space */ @@ -2350,10 +2515,11 @@ void kbase_csf_firmware_ping(struct kbase_device *const kbdev) kbase_csf_scheduler_spin_unlock(kbdev, flags); } -int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev) +int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev, unsigned int wait_timeout_ms) { kbase_csf_firmware_ping(kbdev); - return wait_for_global_request(kbdev, GLB_REQ_PING_MASK); + + return wait_for_global_request_with_timeout(kbdev, GLB_REQ_PING_MASK, wait_timeout_ms); } int kbase_csf_firmware_set_timeout(struct kbase_device *const kbdev, @@ -2392,7 +2558,7 @@ void kbase_csf_enter_protected_mode(struct kbase_device *kbdev) kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); } -void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) +int kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) { int err; @@ -2432,12 +2598,14 @@ void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) } } - if (err) { + if (unlikely(err)) { if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) kbase_reset_gpu(kbdev); } KBASE_TLSTREAM_AUX_PROTECTED_ENTER_END(kbdev, kbdev); + + return err; } void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) @@ -2651,9 +2819,8 @@ int kbase_csf_firmware_mcu_shared_mapping_init( if (!page_list) goto page_list_alloc_error; - ret = kbase_mem_pool_alloc_pages( - &kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], - num_pages, phys, false); + ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages, + phys, false); if (ret <= 0) goto phys_mem_pool_alloc_error; diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.h index 85caaa7b2ab4..7560a298ac9c 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware.h @@ -364,7 +364,45 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev, u32 gpu_addr, u32 value); /** - * kbase_csf_firmware_early_init() - Early initializatin for the firmware. + * kbase_csf_read_firmware_memory_exe - Read a value in a GPU address in the + * region of its final execution location. + * + * @kbdev: Device pointer + * @gpu_addr: GPU address to read + * @value: Output pointer to which the read value will be written + * + * This function read a value in a GPU address that belongs to a private loaded + * firmware memory region based on its final execution location. The function + * assumes that the location is not permanently mapped on the CPU address space, + * therefore it maps it and then unmaps it to access it independently. 
This function + * needs to be used when accessing firmware memory regions which will be moved to + * their final execution location during firmware boot using an address based on the + * final execution location. + */ +void kbase_csf_read_firmware_memory_exe(struct kbase_device *kbdev, + u32 gpu_addr, u32 *value); + +/** + * kbase_csf_update_firmware_memory_exe - Write a value in a GPU address in the + * region of its final execution location. + * + * @kbdev: Device pointer + * @gpu_addr: GPU address to write + * @value: Value to write + * + * This function writes a value in a GPU address that belongs to a private loaded + * firmware memory region based on its final execution location. The function + * assumes that the location is not permanently mapped on the CPU address space, + * therefore it maps it and then unmaps it to access it independently. This function + * needs to be used when accessing firmware memory regions which will be moved to + * their final execution location during firmware boot using an address based on the + * final execution location. + */ +void kbase_csf_update_firmware_memory_exe(struct kbase_device *kbdev, + u32 gpu_addr, u32 value); + +/** + * kbase_csf_firmware_early_init() - Early initialization for the firmware. * @kbdev: Kbase device * * Initialize resources related to the firmware. Must be called at kbase probe. @@ -374,22 +412,43 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev, int kbase_csf_firmware_early_init(struct kbase_device *kbdev); /** - * kbase_csf_firmware_init() - Load the firmware for the CSF MCU + * kbase_csf_firmware_early_term() - Terminate resources related to the firmware + * after the firmware unload has been done. + * + * @kbdev: Device pointer + * + * This should be called only when kbase probe fails or gets rmmoded. + */ +void kbase_csf_firmware_early_term(struct kbase_device *kbdev); + +/** + * kbase_csf_firmware_late_init() - Late initialization for the firmware. + * @kbdev: Kbase device + * + * Initialize resources related to the firmware. But must be called after + * backend late init is done. Must be used at probe time only. + * + * Return: 0 if successful, negative error code on failure + */ +int kbase_csf_firmware_late_init(struct kbase_device *kbdev); + +/** + * kbase_csf_firmware_load_init() - Load the firmware for the CSF MCU * @kbdev: Kbase device * * Request the firmware from user space and load it into memory. * * Return: 0 if successful, negative error code on failure */ -int kbase_csf_firmware_init(struct kbase_device *kbdev); +int kbase_csf_firmware_load_init(struct kbase_device *kbdev); /** - * kbase_csf_firmware_term() - Unload the firmware + * kbase_csf_firmware_unload_term() - Unload the firmware * @kbdev: Kbase device * - * Frees the memory allocated by kbase_csf_firmware_init() + * Frees the memory allocated by kbase_csf_firmware_load_init() */ -void kbase_csf_firmware_term(struct kbase_device *kbdev); +void kbase_csf_firmware_unload_term(struct kbase_device *kbdev); /** * kbase_csf_firmware_ping - Send the ping request to firmware. @@ -404,13 +463,14 @@ void kbase_csf_firmware_ping(struct kbase_device *kbdev); * kbase_csf_firmware_ping_wait - Send the ping request to firmware and waits. * * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @wait_timeout_ms: Timeout to get the acknowledgment for PING request from FW. * * The function sends the ping request to firmware and waits to confirm it is * alive. * * Return: 0 on success, or negative on failure. 
*/ -int kbase_csf_firmware_ping_wait(struct kbase_device *kbdev); +int kbase_csf_firmware_ping_wait(struct kbase_device *kbdev, unsigned int wait_timeout_ms); /** * kbase_csf_firmware_set_timeout - Set a hardware endpoint progress timeout. @@ -447,8 +507,10 @@ void kbase_csf_enter_protected_mode(struct kbase_device *kbdev); * This function needs to be called after kbase_csf_enter_protected_mode() to * wait for the GPU to actually enter protected mode. GPU reset is triggered if * the wait is unsuccessful. + * + * Return: 0 on success, or negative on failure. */ -void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev); +int kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev); static inline bool kbase_csf_firmware_mcu_halted(struct kbase_device *kbdev) { diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_cfg.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_cfg.c index ad4ae74c7569..c895b080143a 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_cfg.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_cfg.c @@ -20,13 +20,17 @@ */ #include -#include "mali_kbase_csf_firmware_cfg.h" #include #include +#include "mali_kbase_csf_firmware_cfg.h" +#include "mali_kbase_csf_firmware_log.h" + #if CONFIG_SYSFS #define CSF_FIRMWARE_CFG_SYSFS_DIR_NAME "firmware_config" +#define CSF_FIRMWARE_CFG_LOG_VERBOSITY_ENTRY_NAME "Log verbosity" + /** * struct firmware_config - Configuration item within the MCU firmware * @@ -125,7 +129,7 @@ static ssize_t store_fw_cfg(struct kobject *kobj, if (attr == &fw_cfg_attr_cur) { unsigned long flags; - u32 val; + u32 val, cur_val; int ret = kstrtouint(buf, 0, &val); if (ret) { @@ -140,7 +144,9 @@ static ssize_t store_fw_cfg(struct kobject *kobj, return -EINVAL; spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - if (config->cur_val == val) { + + cur_val = config->cur_val; + if (cur_val == val) { spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); return count; } @@ -177,6 +183,20 @@ static ssize_t store_fw_cfg(struct kobject *kobj, spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + /* Enable FW logging only if Log verbosity is non-zero */ + if (!strcmp(config->name, CSF_FIRMWARE_CFG_LOG_VERBOSITY_ENTRY_NAME) && + (!cur_val || !val)) { + ret = kbase_csf_firmware_log_toggle_logging_calls(kbdev, val); + if (ret) { + /* Undo FW configuration changes */ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + config->cur_val = cur_val; + kbase_csf_update_firmware_memory(kbdev, config->address, cur_val); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + return ret; + } + } + /* If we can update the config without firmware reset then * we need to just trigger FIRMWARE_CONFIG_UPDATE. */ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.c new file mode 100644 index 000000000000..20d8c0d4fdb1 --- /dev/null +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.c @@ -0,0 +1,451 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#include +#include "backend/gpu/mali_kbase_pm_internal.h" +#include +#include +#include +#include +#include + +/* + * ARMv7 instruction: Branch with Link calls a subroutine at a PC-relative address. + */ +#define ARMV7_T1_BL_IMM_INSTR 0xd800f000 + +/* + * ARMv7 instruction: Branch with Link calls a subroutine at a PC-relative address, maximum + * negative jump offset. + */ +#define ARMV7_T1_BL_IMM_RANGE_MIN -16777216 + +/* + * ARMv7 instruction: Branch with Link calls a subroutine at a PC-relative address, maximum + * positive jump offset. + */ +#define ARMV7_T1_BL_IMM_RANGE_MAX 16777214 + +/* + * ARMv7 instruction: Double NOP instructions. + */ +#define ARMV7_DOUBLE_NOP_INSTR 0xbf00bf00 + +#if defined(CONFIG_DEBUG_FS) + +static int kbase_csf_firmware_log_enable_mask_read(void *data, u64 *val) +{ + struct kbase_device *kbdev = (struct kbase_device *)data; + struct firmware_trace_buffer *tb = + kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME); + + if (tb == NULL) { + dev_err(kbdev->dev, "Couldn't get the firmware trace buffer"); + return -EIO; + } + /* The enabled traces are limited to 64 bits here, which is sufficient in practice */ + *val = kbase_csf_firmware_trace_buffer_get_active_mask64(tb); + return 0; +} + +static int kbase_csf_firmware_log_enable_mask_write(void *data, u64 val) +{ + struct kbase_device *kbdev = (struct kbase_device *)data; + struct firmware_trace_buffer *tb = + kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME); + u64 new_mask; + unsigned int enable_bits_count; + + if (tb == NULL) { + dev_err(kbdev->dev, "Couldn't get the firmware trace buffer"); + return -EIO; + } + + /* Ignore unsupported types */ + enable_bits_count = kbase_csf_firmware_trace_buffer_get_trace_enable_bits_count(tb); + if (enable_bits_count > 64) { + dev_dbg(kbdev->dev, "Limit enabled bits count from %u to 64", enable_bits_count); + enable_bits_count = 64; + } + new_mask = (enable_bits_count < 64) ? val & (((u64)1 << enable_bits_count) - 1) : val; + + if (new_mask != kbase_csf_firmware_trace_buffer_get_active_mask64(tb)) + return kbase_csf_firmware_trace_buffer_set_active_mask64(tb, new_mask); + else + return 0; +} + +static int kbasep_csf_firmware_log_debugfs_open(struct inode *in, struct file *file) +{ + struct kbase_device *kbdev = in->i_private; + + file->private_data = kbdev; + dev_dbg(kbdev->dev, "Opened firmware trace buffer dump debugfs file"); + + return 0; +} + +static ssize_t kbasep_csf_firmware_log_debugfs_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct kbase_device *kbdev = file->private_data; + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + unsigned int n_read; + unsigned long not_copied; + /* Limit reads to the kernel dump buffer size */ + size_t mem = MIN(size, FIRMWARE_LOG_DUMP_BUF_SIZE); + int ret; + + struct firmware_trace_buffer *tb = + kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME); + + if (tb == NULL) { + dev_err(kbdev->dev, "Couldn't get the firmware trace buffer"); + return -EIO; + } + + if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0) + return -EBUSY; + + /* Reading from userspace is only
allowed in manual mode */ + if (fw_log->mode != KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL) { + ret = -EINVAL; + goto out; + } + + n_read = kbase_csf_firmware_trace_buffer_read_data(tb, fw_log->dump_buf, mem); + + /* Do the copy, if we have obtained some trace data */ + not_copied = (n_read) ? copy_to_user(buf, fw_log->dump_buf, n_read) : 0; + + if (not_copied) { + dev_err(kbdev->dev, "Couldn't copy trace buffer data to user space buffer"); + ret = -EFAULT; + goto out; + } + + *ppos += n_read; + ret = n_read; + +out: + atomic_set(&fw_log->busy, 0); + return ret; +} + +static int kbase_csf_firmware_log_mode_read(void *data, u64 *val) +{ + struct kbase_device *kbdev = (struct kbase_device *)data; + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + + *val = fw_log->mode; + return 0; +} + +static int kbase_csf_firmware_log_mode_write(void *data, u64 val) +{ + struct kbase_device *kbdev = (struct kbase_device *)data; + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + int ret = 0; + + if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0) + return -EBUSY; + + if (val == fw_log->mode) + goto out; + + switch (val) { + case KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL: + cancel_delayed_work_sync(&fw_log->poll_work); + break; + case KBASE_CSF_FIRMWARE_LOG_MODE_AUTO_PRINT: + schedule_delayed_work(&fw_log->poll_work, + msecs_to_jiffies(KBASE_CSF_FIRMWARE_LOG_POLL_PERIOD_MS)); + break; + default: + ret = -EINVAL; + goto out; + } + + fw_log->mode = val; + +out: + atomic_set(&fw_log->busy, 0); + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_firmware_log_enable_mask_fops, + kbase_csf_firmware_log_enable_mask_read, + kbase_csf_firmware_log_enable_mask_write, "%llx\n"); + +static const struct file_operations kbasep_csf_firmware_log_debugfs_fops = { + .owner = THIS_MODULE, + .open = kbasep_csf_firmware_log_debugfs_open, + .read = kbasep_csf_firmware_log_debugfs_read, + .llseek = no_llseek, +}; + +DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_firmware_log_mode_fops, kbase_csf_firmware_log_mode_read, + kbase_csf_firmware_log_mode_write, "%llu\n"); + +#endif /* CONFIG_DEBUG_FS */ + +static void kbase_csf_firmware_log_poll(struct work_struct *work) +{ + struct kbase_device *kbdev = + container_of(work, struct kbase_device, csf.fw_log.poll_work.work); + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + + schedule_delayed_work(&fw_log->poll_work, + msecs_to_jiffies(KBASE_CSF_FIRMWARE_LOG_POLL_PERIOD_MS)); + + kbase_csf_firmware_log_dump_buffer(kbdev); +} + +int kbase_csf_firmware_log_init(struct kbase_device *kbdev) +{ + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + + /* Add one byte for null-termination */ + fw_log->dump_buf = kmalloc(FIRMWARE_LOG_DUMP_BUF_SIZE + 1, GFP_KERNEL); + if (fw_log->dump_buf == NULL) + return -ENOMEM; + + /* Ensure null-termination for all strings */ + fw_log->dump_buf[FIRMWARE_LOG_DUMP_BUF_SIZE] = 0; + + fw_log->mode = KBASE_CSF_FIRMWARE_LOG_MODE_MANUAL; + + atomic_set(&fw_log->busy, 0); + INIT_DEFERRABLE_WORK(&fw_log->poll_work, kbase_csf_firmware_log_poll); + +#if defined(CONFIG_DEBUG_FS) + debugfs_create_file("fw_trace_enable_mask", 0644, kbdev->mali_debugfs_directory, kbdev, + &kbase_csf_firmware_log_enable_mask_fops); + debugfs_create_file("fw_traces", 0444, kbdev->mali_debugfs_directory, kbdev, + &kbasep_csf_firmware_log_debugfs_fops); + debugfs_create_file("fw_trace_mode", 0644, kbdev->mali_debugfs_directory, kbdev, + &kbase_csf_firmware_log_mode_fops); +#endif /* CONFIG_DEBUG_FS */ + + return 0; +} + +void kbase_csf_firmware_log_term(struct kbase_device 
*kbdev) +{ + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + + if (fw_log->dump_buf) { + cancel_delayed_work_sync(&fw_log->poll_work); + kfree(fw_log->dump_buf); + fw_log->dump_buf = NULL; + } +} + +void kbase_csf_firmware_log_dump_buffer(struct kbase_device *kbdev) +{ + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + u8 *buf = fw_log->dump_buf, *p, *pnewline, *pend, *pendbuf; + unsigned int read_size, remaining_size; + struct firmware_trace_buffer *tb = + kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME); + + if (tb == NULL) { + dev_dbg(kbdev->dev, "Can't get the trace buffer, firmware trace dump skipped"); + return; + } + + if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0) + return; + + /* FW should only print complete messages, so there's no need to handle + * partial messages over multiple invocations of this function + */ + + p = buf; + pendbuf = &buf[FIRMWARE_LOG_DUMP_BUF_SIZE]; + + while ((read_size = kbase_csf_firmware_trace_buffer_read_data(tb, p, pendbuf - p))) { + pend = p + read_size; + p = buf; + + while (p < pend && (pnewline = memchr(p, '\n', pend - p))) { + /* Null-terminate the string */ + *pnewline = 0; + + dev_err(kbdev->dev, "FW> %s", p); + + p = pnewline + 1; + } + + remaining_size = pend - p; + + if (!remaining_size) { + p = buf; + } else if (remaining_size < FIRMWARE_LOG_DUMP_BUF_SIZE) { + /* Copy unfinished string to the start of the buffer */ + memmove(buf, p, remaining_size); + p = &buf[remaining_size]; + } else { + /* Print abnormally long string without newlines */ + dev_err(kbdev->dev, "FW> %s", buf); + p = buf; + } + } + + if (p != buf) { + /* Null-terminate and print last unfinished string */ + *p = 0; + dev_err(kbdev->dev, "FW> %s", buf); + } + + atomic_set(&fw_log->busy, 0); +} + +void kbase_csf_firmware_log_parse_logging_call_list_entry(struct kbase_device *kbdev, + const uint32_t *entry) +{ + kbdev->csf.fw_log.func_call_list_va_start = entry[0]; + kbdev->csf.fw_log.func_call_list_va_end = entry[1]; +} + +/** + * toggle_logging_calls_in_loaded_image - Toggles FW log func calls in loaded FW image. + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @enable: Whether to enable or disable the function calls. 
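+ * + * When enabling, each call site recorded in the firmware's function call list is + * patched with a Thumb-2 BL instruction to its logging callee; when disabling, each + * call site is overwritten with two NOP instructions.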
+ */ +static void toggle_logging_calls_in_loaded_image(struct kbase_device *kbdev, bool enable) +{ + uint32_t bl_instruction, diff; + uint32_t imm11, imm10, i1, i2, j1, j2, sign; + uint32_t calling_address = 0, callee_address = 0; + uint32_t list_entry = kbdev->csf.fw_log.func_call_list_va_start; + const uint32_t list_va_end = kbdev->csf.fw_log.func_call_list_va_end; + + if (list_entry == 0 || list_va_end == 0) + return; + + if (enable) { + for (; list_entry < list_va_end; list_entry += 2 * sizeof(uint32_t)) { + /* Read calling address */ + kbase_csf_read_firmware_memory(kbdev, list_entry, &calling_address); + /* Read callee address */ + kbase_csf_read_firmware_memory(kbdev, list_entry + sizeof(uint32_t), + &callee_address); + + diff = callee_address - calling_address - 4; + sign = !!(diff & 0x80000000); + if (ARMV7_T1_BL_IMM_RANGE_MIN > (int32_t)diff || + ARMV7_T1_BL_IMM_RANGE_MAX < (int32_t)diff) { + dev_warn(kbdev->dev, "FW log patch 0x%x out of range, skipping", + calling_address); + continue; + } + + /* Split the branch offset into the S, J1, J2, imm10 and imm11 fields + * of the Thumb-2 BL (T1) encoding. + */ + i1 = (diff & 0x00800000) >> 23; + j1 = !i1 ^ sign; + i2 = (diff & 0x00400000) >> 22; + j2 = !i2 ^ sign; + imm11 = (diff & 0xffe) >> 1; + imm10 = (diff & 0x3ff000) >> 12; + + /* Compose BL instruction */ + bl_instruction = ARMV7_T1_BL_IMM_INSTR; + bl_instruction |= j1 << 29; + bl_instruction |= j2 << 27; + bl_instruction |= imm11 << 16; + bl_instruction |= sign << 10; + bl_instruction |= imm10; + + /* Patch logging func calls in their load location */ + dev_dbg(kbdev->dev, "FW log patch 0x%x: 0x%x\n", calling_address, + bl_instruction); + kbase_csf_update_firmware_memory_exe(kbdev, calling_address, + bl_instruction); + } + } else { + for (; list_entry < list_va_end; list_entry += 2 * sizeof(uint32_t)) { + /* Read calling address */ + kbase_csf_read_firmware_memory(kbdev, list_entry, &calling_address); + + /* Overwrite logging func calls with 2 NOP instructions */ + kbase_csf_update_firmware_memory_exe(kbdev, calling_address, + ARMV7_DOUBLE_NOP_INSTR); + } + } +} + +int kbase_csf_firmware_log_toggle_logging_calls(struct kbase_device *kbdev, u32 val) +{ + unsigned long flags; + struct kbase_csf_firmware_log *fw_log = &kbdev->csf.fw_log; + bool mcu_inactive; + bool resume_needed = false; + int ret = 0; + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + + if (atomic_cmpxchg(&fw_log->busy, 0, 1) != 0) + return -EBUSY; + + /* Suspend all the active CS groups */ + dev_dbg(kbdev->dev, "Suspend all the active CS groups"); + + kbase_csf_scheduler_lock(kbdev); + while (scheduler->state != SCHED_SUSPENDED) { + kbase_csf_scheduler_unlock(kbdev); + kbase_csf_scheduler_pm_suspend(kbdev); + kbase_csf_scheduler_lock(kbdev); + resume_needed = true; + } + + /* Wait for the MCU to get disabled */ + dev_info(kbdev->dev, "Wait for the MCU to get disabled"); + ret = kbase_pm_wait_for_desired_state(kbdev); + if (ret) { + dev_err(kbdev->dev, + "wait for PM state failed when toggling FW logging calls"); + ret = -EAGAIN; + goto out; + } + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + mcu_inactive = + kbase_pm_is_mcu_inactive(kbdev, kbdev->pm.backend.mcu_state); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + if (!mcu_inactive) { + dev_err(kbdev->dev, + "MCU not inactive after PM state wait when toggling FW logging calls"); + ret = -EAGAIN; + goto out; + } + + /* Toggle FW logging calls in the loaded FW image */ + toggle_logging_calls_in_loaded_image(kbdev, val); + dev_dbg(kbdev->dev, "FW logging: %s", val ?
"enabled" : "disabled"); + +out: + kbase_csf_scheduler_unlock(kbdev); + if (resume_needed) + /* Resume queue groups and start mcu */ + kbase_csf_scheduler_pm_resume(kbdev); + atomic_set(&fw_log->busy, 0); + return ret; +} diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.h new file mode 100644 index 000000000000..8d7a2210a457 --- /dev/null +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_log.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _KBASE_CSF_FIRMWARE_LOG_H_ +#define _KBASE_CSF_FIRMWARE_LOG_H_ + +#include + +/* + * Firmware log dumping buffer size. + */ +#define FIRMWARE_LOG_DUMP_BUF_SIZE PAGE_SIZE + +/** + * kbase_csf_firmware_log_init - Initialize firmware log handling. + * + * @kbdev: Pointer to the Kbase device + * + * Return: The initialization error code. + */ +int kbase_csf_firmware_log_init(struct kbase_device *kbdev); + +/** + * kbase_csf_firmware_log_term - Terminate firmware log handling. + * + * @kbdev: Pointer to the Kbase device + */ +void kbase_csf_firmware_log_term(struct kbase_device *kbdev); + +/** + * kbase_csf_firmware_log_dump_buffer - Read remaining data in the firmware log + * buffer and print it to dmesg. + * + * @kbdev: Pointer to the Kbase device + */ +void kbase_csf_firmware_log_dump_buffer(struct kbase_device *kbdev); + +/** + * kbase_csf_firmware_log_parse_logging_call_list_entry - Parse FW logging function call list entry. + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @entry: Pointer to section. + */ +void kbase_csf_firmware_log_parse_logging_call_list_entry(struct kbase_device *kbdev, + const uint32_t *entry); +/** + * kbase_csf_firmware_log_toggle_logging_calls - Enables/Disables FW logging function calls. + * + * @kbdev: Instance of a GPU platform device that implements a CSF interface. + * @val: Configuration option value. 
+ * + * Return: 0 if successful, negative error code on failure + */ +int kbase_csf_firmware_log_toggle_logging_calls(struct kbase_device *kbdev, u32 val); + +#endif /* _KBASE_CSF_FIRMWARE_LOG_H_ */ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_no_mali.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_no_mali.c index 54f1f6b9c199..f414d8894306 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_no_mali.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_firmware_no_mali.c @@ -273,6 +273,18 @@ void kbase_csf_update_firmware_memory(struct kbase_device *kbdev, /* NO_MALI: Nothing to do here */ } +void kbase_csf_read_firmware_memory_exe(struct kbase_device *kbdev, + u32 gpu_addr, u32 *value) +{ + /* NO_MALI: Nothing to do here */ +} + +void kbase_csf_update_firmware_memory_exe(struct kbase_device *kbdev, + u32 gpu_addr, u32 value) +{ + /* NO_MALI: Nothing to do here */ +} + void kbase_csf_firmware_cs_input( const struct kbase_csf_cmd_stream_info *const info, const u32 offset, const u32 value) @@ -971,7 +983,6 @@ end: static u32 convert_dur_to_core_pwroff_count(struct kbase_device *kbdev, const u32 dur_us) { -#define PWROFF_VAL_UNIT_SHIFT (10) /* Get the cntfreq_el0 value, which drives the SYSTEM_TIMESTAMP */ u64 freq = arch_timer_get_cntfrq(); u64 dur_val = dur_us; @@ -1046,16 +1057,6 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) kbdev->csf.fw_timeout_ms = kbase_get_timeout_ms(kbdev, CSF_FIRMWARE_TIMEOUT); - kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; -#ifdef KBASE_PM_RUNTIME - if (kbase_pm_gpu_sleep_allowed(kbdev)) - kbdev->csf.gpu_idle_hysteresis_ms /= - FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; -#endif - WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); - kbdev->csf.gpu_idle_dur_count = convert_dur_to_idle_count( - kbdev, kbdev->csf.gpu_idle_hysteresis_ms); - INIT_LIST_HEAD(&kbdev->csf.firmware_interfaces); INIT_LIST_HEAD(&kbdev->csf.firmware_config); INIT_LIST_HEAD(&kbdev->csf.firmware_trace_buffers.list); @@ -1068,7 +1069,26 @@ int kbase_csf_firmware_early_init(struct kbase_device *kbdev) return 0; } -int kbase_csf_firmware_init(struct kbase_device *kbdev) +void kbase_csf_firmware_early_term(struct kbase_device *kbdev) +{ + mutex_destroy(&kbdev->csf.reg_lock); +} + +int kbase_csf_firmware_late_init(struct kbase_device *kbdev) +{ + kbdev->csf.gpu_idle_hysteresis_ms = FIRMWARE_IDLE_HYSTERESIS_TIME_MS; +#ifdef KBASE_PM_RUNTIME + if (kbase_pm_gpu_sleep_allowed(kbdev)) + kbdev->csf.gpu_idle_hysteresis_ms /= FIRMWARE_IDLE_HYSTERESIS_GPU_SLEEP_SCALER; +#endif + WARN_ON(!kbdev->csf.gpu_idle_hysteresis_ms); + kbdev->csf.gpu_idle_dur_count = + convert_dur_to_idle_count(kbdev, kbdev->csf.gpu_idle_hysteresis_ms); + + return 0; +} + +int kbase_csf_firmware_load_init(struct kbase_device *kbdev) { int ret; @@ -1134,11 +1154,11 @@ int kbase_csf_firmware_init(struct kbase_device *kbdev) return 0; error: - kbase_csf_firmware_term(kbdev); + kbase_csf_firmware_unload_term(kbdev); return ret; } -void kbase_csf_firmware_term(struct kbase_device *kbdev) +void kbase_csf_firmware_unload_term(struct kbase_device *kbdev) { cancel_work_sync(&kbdev->csf.fw_error_work); @@ -1173,8 +1193,6 @@ void kbase_csf_firmware_term(struct kbase_device *kbdev) /* NO_MALI: No trace buffers to terminate */ - mutex_destroy(&kbdev->csf.reg_lock); - /* This will also free up the region allocated for the shared interface * entry parsed from the firmware image. 
*/ @@ -1227,8 +1245,9 @@ void kbase_csf_firmware_ping(struct kbase_device *const kbdev) kbase_csf_scheduler_spin_unlock(kbdev, flags); } -int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev) +int kbase_csf_firmware_ping_wait(struct kbase_device *const kbdev, unsigned int wait_timeout_ms) { + CSTD_UNUSED(wait_timeout_ms); kbase_csf_firmware_ping(kbdev); return wait_for_global_request(kbdev, GLB_REQ_PING_MASK); } @@ -1267,7 +1286,7 @@ void kbase_csf_enter_protected_mode(struct kbase_device *kbdev) kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); } -void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) +int kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) { int err = wait_for_global_request(kbdev, GLB_REQ_PROTM_ENTER_MASK); @@ -1275,6 +1294,8 @@ void kbase_csf_wait_protected_mode_enter(struct kbase_device *kbdev) if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); } + + return err; } void kbase_csf_firmware_trigger_mcu_halt(struct kbase_device *kbdev) @@ -1483,9 +1504,8 @@ int kbase_csf_firmware_mcu_shared_mapping_init( if (!page_list) goto page_list_alloc_error; - ret = kbase_mem_pool_alloc_pages( - &kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], - num_pages, phys, false); + ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages, + phys, false); if (ret <= 0) goto phys_mem_pool_alloc_error; diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_heap_context_alloc.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_heap_context_alloc.c index 4b3931f6ccf7..1876d505dd5b 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_heap_context_alloc.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_heap_context_alloc.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -154,8 +154,8 @@ u64 kbase_csf_heap_context_allocator_alloc( struct kbase_csf_heap_context_allocator *const ctx_alloc) { struct kbase_context *const kctx = ctx_alloc->kctx; - u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | - BASE_MEM_PROT_CPU_WR | BASEP_MEM_NO_USER_FREE; + u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_CPU_WR | + BASEP_MEM_NO_USER_FREE | BASE_MEM_PROT_CPU_RD; u64 nr_pages = PFN_UP(HEAP_CTX_REGION_SIZE); u64 heap_gpu_va = 0; @@ -164,10 +164,6 @@ u64 kbase_csf_heap_context_allocator_alloc( */ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; -#ifdef CONFIG_MALI_VECTOR_DUMP - flags |= BASE_MEM_PROT_CPU_RD; -#endif - mutex_lock(&ctx_alloc->lock); /* If the pool of heap contexts wasn't already allocated then diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.c index 542f04579898..0b3f1334a9e6 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.c @@ -55,7 +55,7 @@ static int kbase_kcpu_map_import_prepare( long i; int ret = 0; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); /* Take the processes mmap lock */ down_read(kbase_mem_get_process_mmap_lock()); @@ -114,7 +114,7 @@ static int kbase_kcpu_unmap_import_prepare_internal( struct kbase_va_region *reg; int ret = 0; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); kbase_gpu_vm_lock(kctx); @@ -182,7 +182,8 @@ static void kbase_jit_add_to_pending_alloc_list( &kctx->csf.kcpu_queues.jit_blocked_queues; struct kbase_kcpu_command_queue *blocked_queue; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); + lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock); list_for_each_entry(blocked_queue, &kctx->csf.kcpu_queues.jit_blocked_queues, @@ -227,25 +228,28 @@ static int kbase_kcpu_jit_allocate_process( u32 i; int ret; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); - - if (alloc_info->blocked) { - list_del(&queue->jit_blocked); - alloc_info->blocked = false; - } + lockdep_assert_held(&queue->lock); if (WARN_ON(!info)) return -EINVAL; + mutex_lock(&kctx->csf.kcpu_queues.jit_lock); + /* Check if all JIT IDs are not in use */ for (i = 0; i < count; i++, info++) { /* The JIT ID is still in use so fail the allocation */ if (kctx->jit_alloc[info->id]) { dev_dbg(kctx->kbdev->dev, "JIT ID still in use"); - return -EINVAL; + ret = -EINVAL; + goto fail; } } + if (alloc_info->blocked) { + list_del(&queue->jit_blocked); + alloc_info->blocked = false; + } + /* Now start the allocation loop */ for (i = 0, info = alloc_info->info; i < count; i++, info++) { /* Create a JIT allocation */ @@ -280,7 +284,7 @@ static int kbase_kcpu_jit_allocate_process( */ dev_warn_ratelimited(kctx->kbdev->dev, "JIT alloc command failed: %pK\n", cmd); ret = -ENOMEM; - goto fail; + goto fail_rollback; } /* There are pending frees for an active allocation @@ -298,7 +302,8 @@ static int kbase_kcpu_jit_allocate_process( kctx->jit_alloc[info->id] = NULL; } - return -EAGAIN; + ret = -EAGAIN; + goto fail; } /* Bind it to the user provided ID. 
*/ @@ -314,7 +319,7 @@ static int kbase_kcpu_jit_allocate_process( KBASE_REG_CPU_WR, &mapping); if (!ptr) { ret = -ENOMEM; - goto fail; + goto fail_rollback; } reg = kctx->jit_alloc[info->id]; @@ -323,9 +328,11 @@ static int kbase_kcpu_jit_allocate_process( kbase_vunmap(kctx, &mapping); } + mutex_unlock(&kctx->csf.kcpu_queues.jit_lock); + return 0; -fail: +fail_rollback: /* Roll back completely */ for (i = 0, info = alloc_info->info; i < count; i++, info++) { /* Free the allocations that were successful. @@ -338,6 +345,8 @@ fail: kctx->jit_alloc[info->id] = KBASE_RESERVED_REG_JIT_ALLOC; } +fail: + mutex_unlock(&kctx->csf.kcpu_queues.jit_lock); return ret; } @@ -354,7 +363,7 @@ static int kbase_kcpu_jit_allocate_prepare( int ret = 0; u32 i; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (!data || count > kcpu_queue->kctx->jit_max_allocations || count > ARRAY_SIZE(kctx->jit_alloc)) { @@ -392,11 +401,13 @@ static int kbase_kcpu_jit_allocate_prepare( } current_command->type = BASE_KCPU_COMMAND_TYPE_JIT_ALLOC; - list_add_tail(¤t_command->info.jit_alloc.node, - &kctx->csf.kcpu_queues.jit_cmds_head); current_command->info.jit_alloc.info = info; current_command->info.jit_alloc.count = count; current_command->info.jit_alloc.blocked = false; + mutex_lock(&kctx->csf.kcpu_queues.jit_lock); + list_add_tail(¤t_command->info.jit_alloc.node, + &kctx->csf.kcpu_queues.jit_cmds_head); + mutex_unlock(&kctx->csf.kcpu_queues.jit_lock); return 0; out_free: @@ -415,7 +426,9 @@ static void kbase_kcpu_jit_allocate_finish( struct kbase_kcpu_command_queue *queue, struct kbase_kcpu_command *cmd) { - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); + + mutex_lock(&queue->kctx->csf.kcpu_queues.jit_lock); /* Remove this command from the jit_cmds_head list */ list_del(&cmd->info.jit_alloc.node); @@ -429,6 +442,8 @@ static void kbase_kcpu_jit_allocate_finish( cmd->info.jit_alloc.blocked = false; } + mutex_unlock(&queue->kctx->csf.kcpu_queues.jit_lock); + kfree(cmd->info.jit_alloc.info); } @@ -441,18 +456,17 @@ static void kbase_kcpu_jit_retry_pending_allocs(struct kbase_context *kctx) { struct kbase_kcpu_command_queue *blocked_queue; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock); /* * Reschedule all queues blocked by JIT_ALLOC commands. * NOTE: This code traverses the list of blocked queues directly. It * only works as long as the queued works are not executed at the same * time. This precondition is true since we're holding the - * kbase_csf_kcpu_queue_context.lock . + * kbase_csf_kcpu_queue_context.jit_lock . 
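kbase_kcpu_jit_allocate_process() above now distinguishes two exit paths: fail_rollback undoes allocations that already succeeded before dropping jit_lock, while fail only releases the lock taken at the start. A minimal sketch of that two-stage goto cleanup idiom, with purely illustrative names and stand-in work:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t toy_jit_lock = PTHREAD_MUTEX_INITIALIZER;

/* 'fail_late' forces the path that must undo already-completed work. */
static int toy_jit_allocate(int fail_late)
{
	int *alloc = NULL;
	int ret = 0;

	pthread_mutex_lock(&toy_jit_lock);

	alloc = malloc(sizeof(*alloc));
	if (!alloc) {
		ret = -1;
		goto fail;            /* nothing to undo yet */
	}

	if (fail_late) {
		ret = -1;
		goto fail_rollback;   /* roll back the successful allocation */
	}

	/* Success: in the real code the allocation stays bound to its ID;
	 * the toy frees it only to avoid leaking.
	 */
	pthread_mutex_unlock(&toy_jit_lock);
	free(alloc);
	return 0;

fail_rollback:
	free(alloc);
fail:
	pthread_mutex_unlock(&toy_jit_lock);
	return ret;
}

int main(void)
{
	(void)toy_jit_allocate(1);
	return toy_jit_allocate(0);
}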
*/ - list_for_each_entry(blocked_queue, - &kctx->csf.kcpu_queues.jit_blocked_queues, jit_blocked) - queue_work(kctx->csf.kcpu_queues.wq, &blocked_queue->work); + list_for_each_entry(blocked_queue, &kctx->csf.kcpu_queues.jit_blocked_queues, jit_blocked) + queue_work(blocked_queue->wq, &blocked_queue->work); } static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue, @@ -469,7 +483,8 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue, if (WARN_ON(!ids)) return -EINVAL; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); + mutex_lock(&kctx->csf.kcpu_queues.jit_lock); KBASE_TLSTREAM_TL_KBASE_ARRAY_BEGIN_KCPUQUEUE_EXECUTE_JIT_FREE_END(queue->kctx->kbdev, queue); @@ -501,9 +516,6 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue, queue->kctx->kbdev, queue, item_err, pages_used); } - /* Free the list of ids */ - kfree(ids); - /* * Remove this command from the jit_cmds_head list and retry pending * allocations. @@ -511,6 +523,11 @@ static int kbase_kcpu_jit_free_process(struct kbase_kcpu_command_queue *queue, list_del(&cmd->info.jit_free.node); kbase_kcpu_jit_retry_pending_allocs(kctx); + mutex_unlock(&kctx->csf.kcpu_queues.jit_lock); + + /* Free the list of ids */ + kfree(ids); + return rc; } @@ -526,7 +543,7 @@ static int kbase_kcpu_jit_free_prepare( int ret; u32 i; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); /* Sanity checks */ if (!count || count > ARRAY_SIZE(kctx->jit_alloc)) { @@ -572,10 +589,12 @@ static int kbase_kcpu_jit_free_prepare( } current_command->type = BASE_KCPU_COMMAND_TYPE_JIT_FREE; - list_add_tail(¤t_command->info.jit_free.node, - &kctx->csf.kcpu_queues.jit_cmds_head); current_command->info.jit_free.ids = ids; current_command->info.jit_free.count = count; + mutex_lock(&kctx->csf.kcpu_queues.jit_lock); + list_add_tail(¤t_command->info.jit_free.node, + &kctx->csf.kcpu_queues.jit_cmds_head); + mutex_unlock(&kctx->csf.kcpu_queues.jit_lock); return 0; out_free: @@ -601,7 +620,7 @@ static int kbase_csf_queue_group_suspend_prepare( int pinned_pages = 0, ret = 0; struct kbase_va_region *reg; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (suspend_buf->size < csg_suspend_buf_size) return -EINVAL; @@ -652,9 +671,12 @@ static int kbase_csf_queue_group_suspend_prepare( u64 start, end, i; if (((reg->flags & KBASE_REG_ZONE_MASK) != KBASE_REG_ZONE_SAME_VA) || - reg->nr_pages < nr_pages || - kbase_reg_current_backed_size(reg) != - reg->nr_pages) { + (kbase_reg_current_backed_size(reg) < nr_pages) || + !(reg->flags & KBASE_REG_CPU_WR) || + (reg->gpu_alloc->type != KBASE_MEM_TYPE_NATIVE) || + (reg->flags & KBASE_REG_DONT_NEED) || + (reg->flags & KBASE_REG_ACTIVE_JIT_ALLOC) || + (reg->flags & KBASE_REG_NO_USER_FREE)) { ret = -EINVAL; goto out_clean_pages; } @@ -703,9 +725,8 @@ static enum kbase_csf_event_callback_action event_cqs_callback(void *param) { struct kbase_kcpu_command_queue *kcpu_queue = (struct kbase_kcpu_command_queue *)param; - struct kbase_context *const kctx = kcpu_queue->kctx; - queue_work(kctx->csf.kcpu_queues.wq, &kcpu_queue->work); + queue_work(kcpu_queue->wq, &kcpu_queue->work); return KBASE_CSF_EVENT_CALLBACK_KEEP; } @@ -735,7 +756,7 @@ static int kbase_kcpu_cqs_wait_process(struct kbase_device *kbdev, { u32 i; - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); if (WARN_ON(!cqs_wait->objs)) return -EINVAL; @@ -803,7 
+824,7 @@ static int kbase_kcpu_cqs_wait_prepare(struct kbase_kcpu_command_queue *queue, struct base_cqs_wait_info *objs; unsigned int nr_objs = cqs_wait_info->nr_objs; - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS) return -EINVAL; @@ -857,7 +878,7 @@ static void kbase_kcpu_cqs_set_process(struct kbase_device *kbdev, { unsigned int i; - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); if (WARN_ON(!cqs_set->objs)) return; @@ -898,11 +919,10 @@ static int kbase_kcpu_cqs_set_prepare( struct base_kcpu_command_cqs_set_info *cqs_set_info, struct kbase_kcpu_command *current_command) { - struct kbase_context *const kctx = kcpu_queue->kctx; struct base_cqs_set *objs; unsigned int nr_objs = cqs_set_info->nr_objs; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS) return -EINVAL; @@ -952,7 +972,7 @@ static int kbase_kcpu_cqs_wait_operation_process(struct kbase_device *kbdev, { u32 i; - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); if (WARN_ON(!cqs_wait_operation->objs)) return -EINVAL; @@ -1039,7 +1059,7 @@ static int kbase_kcpu_cqs_wait_operation_prepare(struct kbase_kcpu_command_queue struct base_cqs_wait_operation_info *objs; unsigned int nr_objs = cqs_wait_operation_info->nr_objs; - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS) return -EINVAL; @@ -1094,7 +1114,7 @@ static void kbase_kcpu_cqs_set_operation_process( { unsigned int i; - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); if (WARN_ON(!cqs_set_operation->objs)) return; @@ -1161,11 +1181,10 @@ static int kbase_kcpu_cqs_set_operation_prepare( struct base_kcpu_command_cqs_set_operation_info *cqs_set_operation_info, struct kbase_kcpu_command *current_command) { - struct kbase_context *const kctx = kcpu_queue->kctx; struct base_cqs_set_operation_info *objs; unsigned int nr_objs = cqs_set_operation_info->nr_objs; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (nr_objs > BASEP_KCPU_CQS_MAX_NUM_OBJS) return -EINVAL; @@ -1212,7 +1231,7 @@ static void kbase_csf_fence_wait_callback(struct dma_fence *fence, fence->context, fence->seqno); /* Resume kcpu command queue processing. 
*/ - queue_work(kctx->csf.kcpu_queues.wq, &kcpu_queue->work); + queue_work(kcpu_queue->wq, &kcpu_queue->work); } static void kbase_kcpu_fence_wait_cancel( @@ -1221,7 +1240,7 @@ static void kbase_kcpu_fence_wait_cancel( { struct kbase_context *const kctx = kcpu_queue->kctx; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (WARN_ON(!fence_info->fence)) return; @@ -1293,7 +1312,7 @@ static void fence_timeout_callback(struct timer_list *timer) kbase_sync_fence_info_get(fence, &info); if (info.status == 1) { - queue_work(kctx->csf.kcpu_queues.wq, &kcpu_queue->work); + queue_work(kcpu_queue->wq, &kcpu_queue->work); } else if (info.status == 0) { dev_warn(kctx->kbdev->dev, "fence has not yet signalled in %ums", FENCE_WAIT_TIMEOUT_MS); @@ -1345,7 +1364,7 @@ static int kbase_kcpu_fence_wait_process( #endif struct kbase_context *const kctx = kcpu_queue->kctx; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (WARN_ON(!fence_info->fence)) return -EINVAL; @@ -1401,7 +1420,6 @@ static int kbase_kcpu_fence_wait_prepare( struct base_kcpu_command_fence_info *fence_info, struct kbase_kcpu_command *current_command) { - struct kbase_context *const kctx = kcpu_queue->kctx; #if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) struct fence *fence_in; #else @@ -1409,7 +1427,7 @@ static int kbase_kcpu_fence_wait_prepare( #endif struct base_fence fence; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (copy_from_user(&fence, u64_to_user_ptr(fence_info->fence), sizeof(fence))) @@ -1460,7 +1478,6 @@ static int kbase_kcpu_fence_signal_prepare( struct base_kcpu_command_fence_info *fence_info, struct kbase_kcpu_command *current_command) { - struct kbase_context *const kctx = kcpu_queue->kctx; #if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) struct fence *fence_out; #else @@ -1471,7 +1488,7 @@ static int kbase_kcpu_fence_signal_prepare( int ret = 0; int fd; - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&kcpu_queue->lock); if (copy_from_user(&fence, u64_to_user_ptr(fence_info->fence), sizeof(fence))) @@ -1549,11 +1566,9 @@ static void kcpu_queue_process_worker(struct work_struct *data) struct kbase_kcpu_command_queue *queue = container_of(data, struct kbase_kcpu_command_queue, work); - mutex_lock(&queue->kctx->csf.kcpu_queues.lock); - + mutex_lock(&queue->lock); kcpu_queue_process(queue, false); - - mutex_unlock(&queue->kctx->csf.kcpu_queues.lock); + mutex_unlock(&queue->lock); } static int delete_queue(struct kbase_context *kctx, u32 id) @@ -1569,6 +1584,17 @@ static int delete_queue(struct kbase_context *kctx, u32 id) KBASE_KTRACE_ADD_CSF_KCPU(kctx->kbdev, KCPU_QUEUE_DELETE, queue, queue->num_pending_cmds, queue->cqs_wait_count); + /* Disassociate the queue from the system to prevent further + * submissions. Draining pending commands would be acceptable + * even if a new queue is created using the same ID. + */ + kctx->csf.kcpu_queues.array[id] = NULL; + bitmap_clear(kctx->csf.kcpu_queues.in_use, id, 1); + + mutex_unlock(&kctx->csf.kcpu_queues.lock); + + mutex_lock(&queue->lock); + /* Drain the remaining work for this queue first and go past * all the waits. 
*/ @@ -1580,17 +1606,17 @@ static int delete_queue(struct kbase_context *kctx, u32 id) /* All CQS wait commands should have been cleaned up */ WARN_ON(queue->cqs_wait_count); - kctx->csf.kcpu_queues.array[id] = NULL; - bitmap_clear(kctx->csf.kcpu_queues.in_use, id, 1); - /* Fire the tracepoint with the mutex held to enforce correct * ordering with the summary stream. */ KBASE_TLSTREAM_TL_KBASE_DEL_KCPUQUEUE(kctx->kbdev, queue); - mutex_unlock(&kctx->csf.kcpu_queues.lock); + mutex_unlock(&queue->lock); cancel_work_sync(&queue->work); + destroy_workqueue(queue->wq); + + mutex_destroy(&queue->lock); kfree(queue); } else { @@ -1657,7 +1683,7 @@ static void kcpu_queue_process(struct kbase_kcpu_command_queue *queue, bool process_next = true; size_t i; - lockdep_assert_held(&queue->kctx->csf.kcpu_queues.lock); + lockdep_assert_held(&queue->lock); for (i = 0; i != queue->num_pending_cmds; ++i) { struct kbase_kcpu_command *cmd = @@ -2058,9 +2084,11 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, /* The offset to the first command that is being processed or yet to * be processed is of u8 type, so the number of commands inside the - * queue cannot be more than 256. + * queue cannot be more than 256. The current implementation expects + * exactly 256, any other size will require the addition of wrapping + * logic. */ - BUILD_BUG_ON(KBASEP_KCPU_QUEUE_SIZE > 256); + BUILD_BUG_ON(KBASEP_KCPU_QUEUE_SIZE != 256); /* Whilst the backend interface allows enqueueing multiple commands in * a single operation, the Base interface does not expose any mechanism @@ -2076,13 +2104,13 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, } mutex_lock(&kctx->csf.kcpu_queues.lock); - - if (!kctx->csf.kcpu_queues.array[enq->id]) { - ret = -EINVAL; - goto out; - } - queue = kctx->csf.kcpu_queues.array[enq->id]; + mutex_unlock(&kctx->csf.kcpu_queues.lock); + + if (queue == NULL) + return -EINVAL; + + mutex_lock(&queue->lock); if (kcpu_queue_get_space(queue) < enq->nr_commands) { ret = -EBUSY; @@ -2097,7 +2125,7 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, * for the possibility to roll back. 
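delete_queue() above now unpublishes the queue from kcpu_queues.array and the in_use bitmap while holding only the context-wide lock, drops that lock, and only then drains and tears the queue down under its own lock, so per-queue work never has to nest inside the context lock. A small pthread sketch of the same detach-then-drain ordering, with hypothetical names:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_QUEUES 4

struct toy_queue {
	pthread_mutex_t lock; /* protects the queue's own state */
	int pending;          /* commands still to be processed */
};

static struct toy_queue *registry[MAX_QUEUES];
static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

static struct toy_queue *toy_queue_new(int id)
{
	struct toy_queue *q = calloc(1, sizeof(*q));

	if (!q)
		return NULL;
	pthread_mutex_init(&q->lock, NULL);
	q->pending = 3;
	pthread_mutex_lock(&registry_lock);
	registry[id] = q;
	pthread_mutex_unlock(&registry_lock);
	return q;
}

static void toy_queue_delete(int id)
{
	struct toy_queue *q;

	/* 1) Detach so no new submission can find the queue. */
	pthread_mutex_lock(&registry_lock);
	q = registry[id];
	registry[id] = NULL;
	pthread_mutex_unlock(&registry_lock);

	if (!q)
		return;

	/* 2) Drain under the queue's own lock, outside registry_lock. */
	pthread_mutex_lock(&q->lock);
	while (q->pending)
		q->pending--; /* stand-in for processing remaining commands */
	pthread_mutex_unlock(&q->lock);

	/* 3) Only now tear the queue down. */
	pthread_mutex_destroy(&q->lock);
	free(q);
}

int main(void)
{
	toy_queue_new(0);
	toy_queue_delete(0);
	printf("deleted\n");
	return 0;
}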
*/ - for (i = 0; (i != enq->nr_commands) && !ret; ++i, ++kctx->csf.kcpu_queues.num_cmds) { + for (i = 0; (i != enq->nr_commands) && !ret; ++i) { struct kbase_kcpu_command *kcpu_cmd = &queue->commands[(u8)(queue->start_offset + queue->num_pending_cmds + i)]; struct base_kcpu_command command; @@ -2120,7 +2148,7 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, } } - kcpu_cmd->enqueue_ts = kctx->csf.kcpu_queues.num_cmds; + kcpu_cmd->enqueue_ts = atomic64_inc_return(&kctx->csf.kcpu_queues.cmd_seq_num); switch (command.type) { case BASE_KCPU_COMMAND_TYPE_FENCE_WAIT: #if IS_ENABLED(CONFIG_SYNC_FILE) @@ -2208,13 +2236,10 @@ int kbase_csf_kcpu_queue_enqueue(struct kbase_context *kctx, queue->num_pending_cmds += enq->nr_commands; kcpu_queue_process(queue, false); - } else { - /* Roll back the number of enqueued commands */ - kctx->csf.kcpu_queues.num_cmds -= i; } out: - mutex_unlock(&kctx->csf.kcpu_queues.lock); + mutex_unlock(&queue->lock); return ret; } @@ -2228,14 +2253,9 @@ int kbase_csf_kcpu_queue_context_init(struct kbase_context *kctx) for (idx = 0; idx < KBASEP_MAX_KCPU_QUEUES; ++idx) kctx->csf.kcpu_queues.array[idx] = NULL; - kctx->csf.kcpu_queues.wq = alloc_workqueue("mali_kbase_csf_kcpu", - WQ_UNBOUND | WQ_HIGHPRI, 0); - if (!kctx->csf.kcpu_queues.wq) - return -ENOMEM; - mutex_init(&kctx->csf.kcpu_queues.lock); - kctx->csf.kcpu_queues.num_cmds = 0; + atomic64_set(&kctx->csf.kcpu_queues.cmd_seq_num, 0); return 0; } @@ -2253,7 +2273,6 @@ void kbase_csf_kcpu_queue_context_term(struct kbase_context *kctx) (void)delete_queue(kctx, id); } - destroy_workqueue(kctx->csf.kcpu_queues.wq); mutex_destroy(&kctx->csf.kcpu_queues.lock); } @@ -2297,8 +2316,17 @@ int kbase_csf_kcpu_queue_new(struct kbase_context *kctx, goto out; } + queue->wq = alloc_workqueue("mali_kbase_csf_kcpu_wq_%i", WQ_UNBOUND | WQ_HIGHPRI, 0, idx); + if (queue->wq == NULL) { + kfree(queue); + ret = -ENOMEM; + + goto out; + } + bitmap_set(kctx->csf.kcpu_queues.in_use, idx, 1); kctx->csf.kcpu_queues.array[idx] = queue; + mutex_init(&queue->lock); queue->kctx = kctx; queue->start_offset = 0; queue->num_pending_cmds = 0; diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.h index a4db86984721..5f9b8e0684bc 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_kcpu.h @@ -236,9 +236,11 @@ struct kbase_kcpu_command { /** * struct kbase_kcpu_command_queue - a command queue executed by the kernel * + * @lock: Lock to protect accesses to this queue. * @kctx: The context to which this command queue belongs. * @commands: Array of commands which have been successfully * enqueued to this command queue. + * @wq: Dedicated workqueue for processing commands. * @work: struct work_struct which contains a pointer to * the function which handles processing of kcpu * commands enqueued into a kcpu command queue; @@ -274,8 +276,10 @@ struct kbase_kcpu_command { * @fence_timeout: Timer used to detect the fence wait timeout. 
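Because each KCPU queue is now serialised by its own lock, the global num_cmds counter that was protected by kcpu_queues.lock is replaced above by an atomic64 sequence (cmd_seq_num) sampled with atomic64_inc_return(). A tiny C11 analogue of that change, a process-wide monotonic timestamp shared by independently locked queues (names illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* One counter shared by all queues; no lock needed for the timestamp. */
static _Atomic uint64_t cmd_seq_num;

static uint64_t next_enqueue_ts(void)
{
	/* atomic_fetch_add returns the old value, so +1 mirrors inc_return. */
	return atomic_fetch_add(&cmd_seq_num, 1) + 1;
}

int main(void)
{
	printf("%llu %llu %llu\n",
	       (unsigned long long)next_enqueue_ts(),
	       (unsigned long long)next_enqueue_ts(),
	       (unsigned long long)next_enqueue_ts());
	return 0;
}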
*/ struct kbase_kcpu_command_queue { + struct mutex lock; struct kbase_context *kctx; struct kbase_kcpu_command commands[KBASEP_KCPU_QUEUE_SIZE]; + struct workqueue_struct *wq; struct work_struct work; u8 start_offset; u8 id; diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_registers.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_registers.h index 177569bfb427..6dde56cb161a 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_registers.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_registers.h @@ -163,6 +163,8 @@ #define CSG_PROTM_SUSPEND_BUF_HI 0x004C /* () Protected mode suspend buffer, high word */ #define CSG_CONFIG 0x0050 /* () CSG configuration options */ #define CSG_ITER_TRACE_CONFIG 0x0054 /* () CSG trace configuration */ +#define CSG_DVS_BUF_LO 0x0060 /* () Normal mode deferred vertex shading work buffer, low word */ +#define CSG_DVS_BUF_HI 0x0064 /* () Normal mode deferred vertex shading work buffer, high word */ /* CSG_OUTPUT_BLOCK register offsets */ #define CSG_ACK 0x0000 /* () CSG acknowledge flags */ @@ -547,6 +549,13 @@ #define CS_STATUS_WAIT_SB_MASK_SET(reg_val, value) \ (((reg_val) & ~CS_STATUS_WAIT_SB_MASK_MASK) | \ (((value) << CS_STATUS_WAIT_SB_MASK_SHIFT) & CS_STATUS_WAIT_SB_MASK_MASK)) +#define CS_STATUS_WAIT_SB_SOURCE_SHIFT 16 +#define CS_STATUS_WAIT_SB_SOURCE_MASK (0xF << CS_STATUS_WAIT_SB_SOURCE_SHIFT) +#define CS_STATUS_WAIT_SB_SOURCE_GET(reg_val) \ + (((reg_val)&CS_STATUS_WAIT_SB_SOURCE_MASK) >> CS_STATUS_WAIT_SB_SOURCE_SHIFT) +#define CS_STATUS_WAIT_SB_SOURCE_SET(reg_val, value) \ + (((reg_val) & ~CS_STATUS_WAIT_SB_SOURCE_MASK) | \ + (((value) << CS_STATUS_WAIT_SB_SOURCE_SHIFT) & CS_STATUS_WAIT_SB_SOURCE_MASK)) #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_SHIFT 24 #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_MASK (0xF << CS_STATUS_WAIT_SYNC_WAIT_CONDITION_SHIFT) #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GET(reg_val) \ @@ -557,6 +566,7 @@ /* CS_STATUS_WAIT_SYNC_WAIT_CONDITION values */ #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_LE 0x0 #define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GT 0x1 +#define CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GE 0x5 /* End of CS_STATUS_WAIT_SYNC_WAIT_CONDITION values */ #define CS_STATUS_WAIT_PROGRESS_WAIT_SHIFT 28 #define CS_STATUS_WAIT_PROGRESS_WAIT_MASK (0x1 << CS_STATUS_WAIT_PROGRESS_WAIT_SHIFT) @@ -835,11 +845,6 @@ #define CSG_REQ_IDLE_GET(reg_val) (((reg_val)&CSG_REQ_IDLE_MASK) >> CSG_REQ_IDLE_SHIFT) #define CSG_REQ_IDLE_SET(reg_val, value) \ (((reg_val) & ~CSG_REQ_IDLE_MASK) | (((value) << CSG_REQ_IDLE_SHIFT) & CSG_REQ_IDLE_MASK)) -#define CSG_REQ_DOORBELL_SHIFT 30 -#define CSG_REQ_DOORBELL_MASK (0x1 << CSG_REQ_DOORBELL_SHIFT) -#define CSG_REQ_DOORBELL_GET(reg_val) (((reg_val)&CSG_REQ_DOORBELL_MASK) >> CSG_REQ_DOORBELL_SHIFT) -#define CSG_REQ_DOORBELL_SET(reg_val, value) \ - (((reg_val) & ~CSG_REQ_DOORBELL_MASK) | (((value) << CSG_REQ_DOORBELL_SHIFT) & CSG_REQ_DOORBELL_MASK)) #define CSG_REQ_PROGRESS_TIMER_EVENT_SHIFT 31 #define CSG_REQ_PROGRESS_TIMER_EVENT_MASK (0x1 << CSG_REQ_PROGRESS_TIMER_EVENT_SHIFT) #define CSG_REQ_PROGRESS_TIMER_EVENT_GET(reg_val) \ @@ -956,6 +961,21 @@ (((reg_val) & ~CSG_PROTM_SUSPEND_BUF_POINTER_MASK) | \ (((value) << CSG_PROTM_SUSPEND_BUF_POINTER_SHIFT) & CSG_PROTM_SUSPEND_BUF_POINTER_MASK)) +/* CSG_DVS_BUF_BUFFER register */ +#define CSG_DVS_BUF_BUFFER_SIZE_SHIFT GPU_U(0) +#define CSG_DVS_BUF_BUFFER_SIZE_MASK (GPU_U(0xFFF) << CSG_DVS_BUF_BUFFER_SIZE_SHIFT) +#define CSG_DVS_BUF_BUFFER_SIZE_GET(reg_val) (((reg_val)&CSG_DVS_BUF_BUFFER_SIZE_MASK) >> CSG_DVS_BUF_BUFFER_SIZE_SHIFT) +#define 
CSG_DVS_BUF_BUFFER_SIZE_SET(reg_val, value) \ + (((reg_val) & ~CSG_DVS_BUF_BUFFER_SIZE_MASK) | \ + (((value) << CSG_DVS_BUF_BUFFER_SIZE_SHIFT) & CSG_DVS_BUF_BUFFER_SIZE_MASK)) +#define CSG_DVS_BUF_BUFFER_POINTER_SHIFT GPU_U(12) +#define CSG_DVS_BUF_BUFFER_POINTER_MASK \ + (GPU_ULL(0xFFFFFFFFFFFFF) << CSG_DVS_BUF_BUFFER_POINTER_SHIFT) +#define CSG_DVS_BUF_BUFFER_POINTER_GET(reg_val) \ + (((reg_val)&CSG_DVS_BUF_BUFFER_POINTER_MASK) >> CSG_DVS_BUF_BUFFER_POINTER_SHIFT) +#define CSG_DVS_BUF_BUFFER_POINTER_SET(reg_val, value) \ + (((reg_val) & ~CSG_DVS_BUF_BUFFER_POINTER_MASK) | \ + (((value) << CSG_DVS_BUF_BUFFER_POINTER_SHIFT) & CSG_DVS_BUF_BUFFER_POINTER_MASK)) /* End of CSG_INPUT_BLOCK register set definitions */ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c index 10de93faee27..fe3b91a4845d 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_reset_gpu.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #include @@ -29,7 +29,7 @@ #include #include #include -#include +#include enum kbasep_soft_reset_status { RESET_SUCCESS = 0, @@ -257,68 +257,6 @@ static void kbase_csf_debug_dump_registers(struct kbase_device *kbdev) kbase_reg_read(kbdev, GPU_CONTROL_REG(TILER_CONFIG))); } -static void kbase_csf_dump_firmware_trace_buffer(struct kbase_device *kbdev) -{ - u8 *buf, *p, *pnewline, *pend, *pendbuf; - unsigned int read_size, remaining_size; - struct firmware_trace_buffer *tb = - kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME); - - if (tb == NULL) { - dev_dbg(kbdev->dev, "Can't get the trace buffer, firmware trace dump skipped"); - return; - } - - buf = kmalloc(PAGE_SIZE + 1, GFP_KERNEL); - if (buf == NULL) { - dev_err(kbdev->dev, "Short of memory, firmware trace dump skipped"); - return; - } - - buf[PAGE_SIZE] = 0; - - p = buf; - pendbuf = &buf[PAGE_SIZE]; - - dev_err(kbdev->dev, "Firmware trace buffer dump:"); - while ((read_size = kbase_csf_firmware_trace_buffer_read_data(tb, p, - pendbuf - p))) { - pend = p + read_size; - p = buf; - - while (p < pend && (pnewline = memchr(p, '\n', pend - p))) { - /* Null-terminate the string */ - *pnewline = 0; - - dev_err(kbdev->dev, "FW> %s", p); - - p = pnewline + 1; - } - - remaining_size = pend - p; - - if (!remaining_size) { - p = buf; - } else if (remaining_size < PAGE_SIZE) { - /* Copy unfinished string to the start of the buffer */ - memmove(buf, p, remaining_size); - p = &buf[remaining_size]; - } else { - /* Print abnormal page-long string without newlines */ - dev_err(kbdev->dev, "FW> %s", buf); - p = buf; - } - } - - if (p != buf) { - /* Null-terminate and print last unfinished string */ - *p = 0; - dev_err(kbdev->dev, "FW> %s", buf); - } - - kfree(buf); -} - /** * kbase_csf_hwcnt_on_reset_error() - Sets HWCNT to appropriate state in the * event of an error during GPU reset. 
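The new CSG_DVS_BUF accessors above carve a 64-bit value into a 12-bit SIZE field in bits [11:0] and a 52-bit POINTER field in bits [63:12], and program_csg_slot() later splits that value across CSG_DVS_BUF_LO/HI. The sketch below mirrors the same masks in standalone C; the helper names are illustrative and no meaning beyond the field widths is asserted here.

#include <stdint.h>
#include <stdio.h>

#define DVS_SIZE_SHIFT    0
#define DVS_SIZE_MASK     (0xFFFull << DVS_SIZE_SHIFT)
#define DVS_POINTER_SHIFT 12
#define DVS_POINTER_MASK  (0xFFFFFFFFFFFFFull << DVS_POINTER_SHIFT)

static uint64_t dvs_set_size(uint64_t reg, uint64_t size)
{
	return (reg & ~DVS_SIZE_MASK) | ((size << DVS_SIZE_SHIFT) & DVS_SIZE_MASK);
}

static uint64_t dvs_set_pointer(uint64_t reg, uint64_t ptr)
{
	return (reg & ~DVS_POINTER_MASK) |
	       ((ptr << DVS_POINTER_SHIFT) & DVS_POINTER_MASK);
}

int main(void)
{
	/* Round-trip a pointer field and a size field through one u64,
	 * then show the LO/HI split used when programming the CSG input block.
	 */
	uint64_t reg = dvs_set_pointer(dvs_set_size(0, 0x40), 0xABCDE);

	printf("size=0x%llx pointer=0x%llx lo=0x%08x hi=0x%08x\n",
	       (unsigned long long)((reg & DVS_SIZE_MASK) >> DVS_SIZE_SHIFT),
	       (unsigned long long)((reg & DVS_POINTER_MASK) >> DVS_POINTER_SHIFT),
	       (unsigned int)(reg & 0xFFFFFFFFu), (unsigned int)(reg >> 32));
	return 0;
}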
@@ -378,7 +316,6 @@ static enum kbasep_soft_reset_status kbase_csf_reset_gpu_once(struct kbase_devic "The flush has completed so reset the active indicator\n"); kbdev->irq_reset_flush = false; - mutex_lock(&kbdev->pm.lock); if (!silent) dev_err(kbdev->dev, "Resetting GPU (allowing up to %d ms)", RESET_TIMEOUT); @@ -389,7 +326,7 @@ static enum kbasep_soft_reset_status kbase_csf_reset_gpu_once(struct kbase_devic if (!silent) { kbase_csf_debug_dump_registers(kbdev); if (likely(firmware_inited)) - kbase_csf_dump_firmware_trace_buffer(kbdev); + kbase_csf_firmware_log_dump_buffer(kbdev); } spin_lock_irqsave(&kbdev->hwaccess_lock, flags); @@ -403,6 +340,7 @@ static enum kbasep_soft_reset_status kbase_csf_reset_gpu_once(struct kbase_devic */ kbase_hwcnt_backend_csf_on_before_reset(&kbdev->hwcnt_gpu_iface); + mutex_lock(&kbdev->pm.lock); /* Reset the GPU */ err = kbase_pm_init_hw(kbdev, 0); @@ -633,6 +571,11 @@ bool kbase_reset_gpu_is_active(struct kbase_device *kbdev) return kbase_csf_reset_state_is_active(reset_state); } +bool kbase_reset_gpu_is_not_pending(struct kbase_device *kbdev) +{ + return atomic_read(&kbdev->csf.reset.state) == KBASE_CSF_RESET_GPU_NOT_PENDING; +} + int kbase_reset_gpu_wait(struct kbase_device *kbdev) { const long wait_timeout = diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.c index af3b6912845d..b3cdef7dae52 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.c @@ -31,6 +31,7 @@ #include #include #include +#include "mali_kbase_csf_tiler_heap_reclaim.h" /* Value to indicate that a queue group is not groups_to_schedule list */ #define KBASEP_GROUP_PREPARED_SEQ_NUM_INVALID (U32_MAX) @@ -50,36 +51,12 @@ /* CSF scheduler time slice value */ #define CSF_SCHEDULER_TIME_TICK_MS (100) /* 100 milliseconds */ -/* - * CSF scheduler time threshold for converting "tock" requests into "tick" if - * they come too close to the end of a tick interval. This avoids scheduling - * twice in a row. - */ -#define CSF_SCHEDULER_TIME_TICK_THRESHOLD_MS \ - CSF_SCHEDULER_TIME_TICK_MS - -#define CSF_SCHEDULER_TIME_TICK_THRESHOLD_JIFFIES \ - msecs_to_jiffies(CSF_SCHEDULER_TIME_TICK_THRESHOLD_MS) - -/* Nanoseconds per millisecond */ -#define NS_PER_MS ((u64)1000 * 1000) - -/* - * CSF minimum time to reschedule for a new "tock" request. Bursts of "tock" - * requests are not serviced immediately, but shall wait for a minimum time in - * order to reduce load on the CSF scheduler thread. - */ -#define CSF_SCHEDULER_TIME_TOCK_JIFFIES 1 /* 1 jiffies-time */ - -/* CS suspended and is idle (empty ring buffer) */ -#define CS_IDLE_FLAG (1 << 0) - -/* CS suspended and is wait for a CQS condition */ -#define CS_WAIT_SYNC_FLAG (1 << 1) - /* A GPU address space slot is reserved for MCU. 
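The trace-buffer dump loop removed from the reset path above (its role is taken over by kbase_csf_firmware_log_dump_buffer()) prints the firmware log one '\n'-terminated line at a time and carries any unfinished line over to the next read. The standalone sketch below reproduces that carry-over technique against an arbitrary chunk-reading callback instead of the firmware trace buffer; all names are illustrative.

#include <stdio.h>
#include <string.h>

#define DUMP_BUF_SIZE 64

/* Reads up to 'max' bytes of log data, returns 0 when drained. Stands in
 * for kbase_csf_firmware_trace_buffer_read_data().
 */
typedef unsigned int (*read_chunk_fn)(char *dst, unsigned int max);

static void dump_log_lines(read_chunk_fn read_chunk)
{
	char buf[DUMP_BUF_SIZE + 1];
	char *p = buf, *pend, *nl;
	unsigned int read_size, remaining;

	buf[DUMP_BUF_SIZE] = '\0';

	while ((read_size = read_chunk(p, (unsigned int)(&buf[DUMP_BUF_SIZE] - p)))) {
		pend = p + read_size;
		p = buf;

		/* Print every complete line in the chunk. */
		while (p < pend && (nl = memchr(p, '\n', (size_t)(pend - p)))) {
			*nl = '\0';
			printf("FW> %s\n", p);
			p = nl + 1;
		}

		remaining = (unsigned int)(pend - p);
		if (!remaining) {
			p = buf;
		} else if (remaining < DUMP_BUF_SIZE) {
			/* Carry the partial line to the front of the buffer. */
			memmove(buf, p, remaining);
			p = &buf[remaining];
		} else {
			/* A buffer-long run with no newline: flush it as-is. */
			printf("FW> %s\n", buf);
			p = buf;
		}
	}

	if (p != buf) {
		*p = '\0';
		printf("FW> %s\n", buf);
	}
}

static const char *src = "first line\nsecond line\npartial";
static unsigned int src_pos;

static unsigned int read_from_string(char *dst, unsigned int max)
{
	unsigned int left = (unsigned int)strlen(src) - src_pos;
	unsigned int n = left < max ? left : max;

	memcpy(dst, src + src_pos, n);
	src_pos += n;
	return n;
}

int main(void)
{
	dump_log_lines(read_from_string);
	return 0;
}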
*/ #define NUM_RESERVED_AS_SLOTS (1) +/* Time to wait for completion of PING req before considering MCU as hung */ +#define FW_PING_AFTER_ERROR_TIMEOUT_MS (10) + static int scheduler_group_schedule(struct kbase_queue_group *group); static void remove_group_from_idle_wait(struct kbase_queue_group *const group); static @@ -97,9 +74,105 @@ static int suspend_active_queue_groups(struct kbase_device *kbdev, static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, bool system_suspend); static void schedule_in_cycle(struct kbase_queue_group *group, bool force); +static bool queue_group_scheduled_locked(struct kbase_queue_group *group); #define kctx_as_enabled(kctx) (!kbase_ctx_flag(kctx, KCTX_AS_DISABLED_ON_FAULT)) +/** + * wait_for_dump_complete_on_group_deschedule() - Wait for dump on fault and + * scheduling tick/tock to complete before the group deschedule. + * + * @group: Pointer to the group that is being descheduled. + * + * This function blocks the descheduling of the group until the dump on fault is + * completed and scheduling tick/tock has completed. + * To deschedule an on slot group CSG termination request would be sent and that + * might time out if the fault had occurred and also potentially affect the state + * being dumped. Moreover the scheduler lock would be held, so the access to debugfs + * files would get blocked. + * Scheduler lock and 'kctx->csf.lock' are released before this function starts + * to wait. When a request sent by the Scheduler to the FW times out, Scheduler + * would also wait for the dumping to complete and release the Scheduler lock + * before the wait. Meanwhile Userspace can try to delete the group, this function + * would ensure that the group doesn't exit the Scheduler until scheduling + * tick/tock has completed. Though very unlikely, group deschedule can be triggered + * from multiple threads around the same time and after the wait Userspace thread + * can win the race and get the group descheduled and free the memory for group + * pointer before the other threads wake up and notice that group has already been + * descheduled. To avoid the freeing in such a case, a sort of refcount is used + * for the group which is incremented & decremented across the wait. + */ +static +void wait_for_dump_complete_on_group_deschedule(struct kbase_queue_group *group) +{ +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct kbase_device *kbdev = group->kctx->kbdev; + struct kbase_context *kctx = group->kctx; + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + + lockdep_assert_held(&kctx->csf.lock); + lockdep_assert_held(&scheduler->lock); + + if (likely(!kbase_debug_csf_fault_dump_enabled(kbdev))) + return; + + while ((!kbase_debug_csf_fault_dump_complete(kbdev) || + (scheduler->state == SCHED_BUSY)) && + queue_group_scheduled_locked(group)) { + group->deschedule_deferred_cnt++; + mutex_unlock(&scheduler->lock); + mutex_unlock(&kctx->csf.lock); + kbase_debug_csf_fault_wait_completion(kbdev); + mutex_lock(&kctx->csf.lock); + mutex_lock(&scheduler->lock); + group->deschedule_deferred_cnt--; + } +#endif +} + +/** + * schedule_actions_trigger_df() - Notify the client about the fault and + * wait for the dumping to complete. + * + * @kbdev: Pointer to the device + * @kctx: Pointer to the context associated with the CSG slot for which + * the timeout was seen. + * @error: Error code indicating the type of timeout that occurred. + * + * This function notifies the Userspace client waiting for the faults and wait + * for the Client to complete the dumping. 
+ * The function is called only from Scheduling tick/tock when a request sent by + * the Scheduler to FW times out or from the protm event work item of the group + * when the protected mode entry request times out. + * In the latter case there is no wait done as scheduler lock would be released + * immediately. In the former case the function waits and releases the scheduler + * lock before the wait. It has been ensured that the Scheduler view of the groups + * won't change meanwhile, so no group can enter/exit the Scheduler, become + * runnable or go off slot. + */ +static void schedule_actions_trigger_df(struct kbase_device *kbdev, + struct kbase_context *kctx, enum dumpfault_error_type error) +{ +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + + lockdep_assert_held(&scheduler->lock); + + if (!kbase_debug_csf_fault_notify(kbdev, kctx, error)) + return; + + if (unlikely(scheduler->state != SCHED_BUSY)) { + WARN_ON(error != DF_PROTECTED_MODE_ENTRY_FAILURE); + return; + } + + mutex_unlock(&scheduler->lock); + kbase_debug_csf_fault_wait_completion(kbdev); + mutex_lock(&scheduler->lock); + WARN_ON(scheduler->state != SCHED_BUSY); +#endif +} + #ifdef KBASE_PM_RUNTIME /** * wait_for_scheduler_to_exit_sleep() - Wait for Scheduler to exit the @@ -207,6 +280,7 @@ static int force_scheduler_to_exit_sleep(struct kbase_device *kbdev) } scheduler->state = SCHED_SUSPENDED; + KBASE_KTRACE_ADD(kbdev, SCHED_SUSPENDED, NULL, scheduler->state); return 0; @@ -472,6 +546,7 @@ void kbase_csf_scheduler_process_gpu_idle_event(struct kbase_device *kbdev) int non_idle_offslot_grps; bool can_suspend_on_idle; + lockdep_assert_held(&kbdev->hwaccess_lock); lockdep_assert_held(&scheduler->interrupt_lock); non_idle_offslot_grps = atomic_read(&scheduler->non_idle_offslot_grps); @@ -481,12 +556,23 @@ void kbase_csf_scheduler_process_gpu_idle_event(struct kbase_device *kbdev) if (!non_idle_offslot_grps) { if (can_suspend_on_idle) { + /* fast_gpu_idle_handling is protected by the + * interrupt_lock, which would prevent this from being + * updated whilst gpu_idle_worker() is executing. + */ + scheduler->fast_gpu_idle_handling = + (kbdev->csf.gpu_idle_hysteresis_ms == 0) || + !kbase_csf_scheduler_all_csgs_idle(kbdev); + /* The GPU idle worker relies on update_on_slot_queues_offsets() to have * finished. It's queued before to reduce the time it takes till execution * but it'll eventually be blocked by the scheduler->interrupt_lock. */ enqueue_gpu_idle_work(scheduler); - update_on_slot_queues_offsets(kbdev); + + /* The extract offsets are unused in fast GPU idle handling */ + if (!scheduler->fast_gpu_idle_handling) + update_on_slot_queues_offsets(kbdev); } } else { /* Advance the scheduling tick to get the non-idle suspended groups loaded soon */ @@ -604,10 +690,14 @@ static bool scheduler_protm_wait_quit(struct kbase_device *kbdev) remaining = wait_event_timeout(kbdev->csf.event_wait, !kbase_csf_scheduler_protected_mode_in_use(kbdev), wt); - if (!remaining) { + if (unlikely(!remaining)) { + struct kbase_queue_group *group = kbdev->csf.scheduler.active_protm_grp; + struct kbase_context *kctx = group ? 
group->kctx : NULL; + dev_warn(kbdev->dev, "[%llu] Timeout (%d ms), protm_quit wait skipped", kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms); + schedule_actions_trigger_df(kbdev, kctx, DF_PROTECTED_MODE_EXIT_TIMEOUT); success = false; } @@ -728,7 +818,8 @@ static int scheduler_pm_active_handle_suspend(struct kbase_device *kbdev, * Scheduler * * @kbdev: Pointer to the device - * @flags: flags containing previous interrupt state + * @flags: Pointer to the flags variable containing the interrupt state + * when hwaccess lock was acquired. * * This function is called when Scheduler needs to be activated from the * sleeping state. @@ -736,14 +827,14 @@ static int scheduler_pm_active_handle_suspend(struct kbase_device *kbdev, * MCU is initiated. It resets the flag that indicates to the MCU state * machine that MCU needs to be put in sleep state. * - * Note: This function shall be called with hwaccess lock held and it will - * release that lock. + * Note: This function shall be called with hwaccess lock held and it may + * release that lock and reacquire it. * * Return: zero when the PM reference was taken and non-zero when the * system is being suspending/suspended. */ static int scheduler_pm_active_after_sleep(struct kbase_device *kbdev, - unsigned long flags) + unsigned long *flags) { u32 prev_count; int ret = 0; @@ -754,20 +845,20 @@ static int scheduler_pm_active_after_sleep(struct kbase_device *kbdev, prev_count = kbdev->csf.scheduler.pm_active_count; if (!WARN_ON(prev_count == U32_MAX)) kbdev->csf.scheduler.pm_active_count++; - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); /* On 0 => 1, make a pm_ctx_active request */ if (!prev_count) { + spin_unlock_irqrestore(&kbdev->hwaccess_lock, *flags); + ret = kbase_pm_context_active_handle_suspend(kbdev, KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE); - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + spin_lock_irqsave(&kbdev->hwaccess_lock, *flags); if (ret) kbdev->csf.scheduler.pm_active_count--; else kbdev->pm.backend.gpu_sleep_mode_active = false; kbase_pm_update_state(kbdev); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); } return ret; @@ -871,8 +962,8 @@ static void scheduler_wakeup(struct kbase_device *kbdev, bool kick) "Re-activating the Scheduler out of sleep"); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - ret = scheduler_pm_active_after_sleep(kbdev, flags); - /* hwaccess_lock is released in the previous function call. 
*/ + ret = scheduler_pm_active_after_sleep(kbdev, &flags); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); #endif } @@ -886,6 +977,7 @@ static void scheduler_wakeup(struct kbase_device *kbdev, bool kick) } scheduler->state = SCHED_INACTIVE; + KBASE_KTRACE_ADD(kbdev, SCHED_INACTIVE, NULL, scheduler->state); if (kick) scheduler_enable_tick_timer_nolock(kbdev); @@ -901,6 +993,7 @@ static void scheduler_suspend(struct kbase_device *kbdev) dev_dbg(kbdev->dev, "Suspending the Scheduler"); scheduler_pm_idle(kbdev); scheduler->state = SCHED_SUSPENDED; + KBASE_KTRACE_ADD(kbdev, SCHED_SUSPENDED, NULL, scheduler->state); } } @@ -931,6 +1024,8 @@ static void update_idle_suspended_group_state(struct kbase_queue_group *group) KBASE_CSF_GROUP_SUSPENDED); } else if (group->run_state == KBASE_CSF_GROUP_SUSPENDED_ON_IDLE) { group->run_state = KBASE_CSF_GROUP_SUSPENDED; + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_SUSPENDED, group, + group->run_state); /* If scheduler is not suspended and the given group's * static priority (reflected by the scan_seq_num) is inside @@ -1055,6 +1150,7 @@ static int halt_stream_sync(struct kbase_queue *queue) struct kbase_csf_cmd_stream_info *stream; int csi_index = queue->csi_index; long remaining = kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms); + unsigned long flags; if (WARN_ON(!group) || WARN_ON(!kbasep_csf_scheduler_group_is_on_slot_locked(group))) @@ -1086,12 +1182,15 @@ static int halt_stream_sync(struct kbase_queue *queue) kbase_csf_timeout_in_jiffies(kbdev->csf.fw_timeout_ms); } + spin_lock_irqsave(&kbdev->csf.scheduler.interrupt_lock, flags); /* Set state to STOP */ kbase_csf_firmware_cs_input_mask(stream, CS_REQ, CS_REQ_STATE_STOP, CS_REQ_STATE_MASK); - KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, CSI_STOP_REQ, group, queue, 0u); kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, group->csg_nr, true); + spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, flags); + + KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, CSI_STOP_REQ, group, queue, 0u); /* Timed wait */ remaining = wait_event_timeout(kbdev->csf.event_wait, @@ -1362,6 +1461,7 @@ int kbase_csf_scheduler_queue_stop(struct kbase_queue *queue) } mutex_unlock(&kbdev->csf.scheduler.lock); + KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, QUEUE_STOP, group, queue, group->run_state); return err; } @@ -1439,6 +1539,7 @@ static void program_cs(struct kbase_device *kbdev, struct kbase_csf_cmd_stream_group_info *ginfo; struct kbase_csf_cmd_stream_info *stream; int csi_index = queue->csi_index; + unsigned long flags; u64 user_input; u64 user_output; @@ -1495,6 +1596,20 @@ static void program_cs(struct kbase_device *kbdev, /* Enable all interrupts for now */ kbase_csf_firmware_cs_input(stream, CS_ACK_IRQ_MASK, ~((u32)0)); + spin_lock_irqsave(&kbdev->csf.scheduler.interrupt_lock, flags); + + /* The fault bit could be misaligned between CS_REQ and CS_ACK if the + * acknowledgment was deferred due to dump on fault and the group was + * removed from the CSG slot before the fault could be acknowledged. + */ + if (queue->enabled) { + u32 const cs_ack = + kbase_csf_firmware_cs_output(stream, CS_ACK); + + kbase_csf_firmware_cs_input_mask(stream, CS_REQ, cs_ack, + CS_REQ_FAULT_MASK); + } + /* * Enable the CSG idle notification once the CS's ringbuffer * becomes empty or the CS becomes sync_idle, waiting sync update @@ -1508,11 +1623,12 @@ static void program_cs(struct kbase_device *kbdev, kbase_csf_firmware_cs_input_mask(stream, CS_REQ, queue->enabled ? 
CS_REQ_STATE_START : CS_REQ_STATE_STOP, CS_REQ_STATE_MASK); + kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, group->csg_nr, + ring_csg_doorbell); + spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, flags); KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, CSI_START, group, queue, queue->enabled); - kbase_csf_ring_cs_kernel_doorbell(kbdev, csi_index, group->csg_nr, - ring_csg_doorbell); update_hw_active(queue, true); } @@ -1532,6 +1648,13 @@ int kbase_csf_scheduler_queue_start(struct kbase_queue *queue) mutex_lock(&kbdev->csf.scheduler.lock); +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(kbdev->csf.scheduler.state == SCHED_BUSY)) { + mutex_unlock(&kbdev->csf.scheduler.lock); + return -EBUSY; + } +#endif + KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, QUEUE_START, group, queue, group->run_state); KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, QUEUE_SYNC_UPDATE_WAIT_STATUS, queue->group, queue, @@ -1716,6 +1839,7 @@ static void halt_csg_slot(struct kbase_queue_group *group, bool suspend) /* Set state to SUSPEND/TERMINATE */ kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, halt_cmd, CSG_REQ_STATE_MASK); + kbase_csf_ring_csg_doorbell(kbdev, slot); spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, flags); atomic_set(&csg_slot[slot].state, CSG_SLOT_DOWN2STOP); @@ -1724,7 +1848,6 @@ static void halt_csg_slot(struct kbase_queue_group *group, bool suspend) KBASE_TLSTREAM_TL_KBASE_DEVICE_HALT_CSG( kbdev, kbdev->gpu_props.props.raw_props.gpu_id, slot); - kbase_csf_ring_csg_doorbell(kbdev, slot); } } @@ -1738,6 +1861,31 @@ static void suspend_csg_slot(struct kbase_queue_group *group) halt_csg_slot(group, true); } +static bool csf_wait_ge_condition_supported(struct kbase_device *kbdev) +{ + const uint32_t glb_major = GLB_VERSION_MAJOR_GET(kbdev->csf.global_iface.version); + const uint32_t glb_minor = GLB_VERSION_MINOR_GET(kbdev->csf.global_iface.version); + + switch (glb_major) { + case 0: + break; + case 1: + if (glb_minor >= 4) + return true; + break; + case 2: + if (glb_minor >= 6) + return true; + break; + case 3: + if (glb_minor >= 6) + return true; + break; + default: + return true; + } + return false; +} /** * evaluate_sync_update() - Evaluate the sync wait condition the GPU command * queue has been blocked on. 
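evaluate_sync_update() below accepts a third wait condition, GE, but only when csf_wait_ge_condition_supported() above reports a new enough global interface version. A small sketch of the comparison itself, using the CS_STATUS_WAIT_SYNC_WAIT_CONDITION values defined earlier in this patch (LE=0x0, GT=0x1, GE=0x5); the version gating and any CQS semantics beyond a plain 32-bit compare are not modelled here.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Condition encodings as in CS_STATUS_WAIT_SYNC_WAIT_CONDITION_* above. */
#define SYNC_WAIT_CONDITION_LE 0x0
#define SYNC_WAIT_CONDITION_GT 0x1
#define SYNC_WAIT_CONDITION_GE 0x5

static bool sync_wait_satisfied(uint32_t cond, uint32_t current_val,
				uint32_t wait_val, bool ge_supported)
{
	switch (cond) {
	case SYNC_WAIT_CONDITION_GT:
		return current_val > wait_val;
	case SYNC_WAIT_CONDITION_LE:
		return current_val <= wait_val;
	case SYNC_WAIT_CONDITION_GE:
		return ge_supported && current_val >= wait_val;
	default:
		return false; /* unknown condition: keep waiting */
	}
}

int main(void)
{
	printf("%d %d %d\n",
	       sync_wait_satisfied(SYNC_WAIT_CONDITION_GT, 5, 5, true),
	       sync_wait_satisfied(SYNC_WAIT_CONDITION_GE, 5, 5, true),
	       sync_wait_satisfied(SYNC_WAIT_CONDITION_GE, 5, 5, false));
	return 0;
}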
@@ -1754,11 +1902,13 @@ static bool evaluate_sync_update(struct kbase_queue *queue) u32 sync_wait_cond; u32 sync_current_val; struct kbase_device *kbdev; + bool sync_wait_cond_valid = false; if (WARN_ON(!queue)) return false; kbdev = queue->kctx->kbdev; + lockdep_assert_held(&kbdev->csf.scheduler.lock); sync_ptr = kbase_phy_alloc_mapping_get(queue->kctx, queue->sync_ptr, @@ -1777,9 +1927,12 @@ static bool evaluate_sync_update(struct kbase_queue *queue) sync_wait_cond = CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GET(queue->status_wait); + sync_wait_cond_valid = (sync_wait_cond == CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GT) || + (sync_wait_cond == CS_STATUS_WAIT_SYNC_WAIT_CONDITION_LE) || + ((sync_wait_cond == CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GE) && + csf_wait_ge_condition_supported(kbdev)); - WARN_ON((sync_wait_cond != CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GT) && - (sync_wait_cond != CS_STATUS_WAIT_SYNC_WAIT_CONDITION_LE)); + WARN_ON(!sync_wait_cond_valid); sync_current_val = READ_ONCE(*sync_ptr); KBASE_KTRACE_ADD_CSF_GRP_Q(kbdev, QUEUE_SYNC_UPDATE_CUR_VAL, queue->group, queue, @@ -1790,6 +1943,8 @@ static bool evaluate_sync_update(struct kbase_queue *queue) if (((sync_wait_cond == CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GT) && (sync_current_val > queue->sync_value)) || + ((sync_wait_cond == CS_STATUS_WAIT_SYNC_WAIT_CONDITION_GE) && + (sync_current_val >= queue->sync_value) && csf_wait_ge_condition_supported(kbdev)) || ((sync_wait_cond == CS_STATUS_WAIT_SYNC_WAIT_CONDITION_LE) && (sync_current_val <= queue->sync_value))) { /* The sync wait condition is satisfied so the group to which @@ -1892,12 +2047,48 @@ static void schedule_in_cycle(struct kbase_queue_group *group, bool force) * of work needs to be enforced in situation such as entering into * protected mode). */ - if ((likely(scheduler_timer_is_enabled_nolock(kbdev)) || force) && - !scheduler->tock_pending_request) { - scheduler->tock_pending_request = true; + if (likely(scheduler_timer_is_enabled_nolock(kbdev)) || force) { dev_dbg(kbdev->dev, "Kicking async for group %d\n", group->handle); - mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0); + kbase_csf_scheduler_invoke_tock(kbdev); + } +} + +static void ktrace_log_group_state(struct kbase_queue_group *const group) +{ + switch (group->run_state) { + case KBASE_CSF_GROUP_INACTIVE: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_INACTIVE, group, + group->run_state); + break; + case KBASE_CSF_GROUP_RUNNABLE: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_RUNNABLE, group, + group->run_state); + break; + case KBASE_CSF_GROUP_IDLE: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_IDLE, group, + group->run_state); + break; + case KBASE_CSF_GROUP_SUSPENDED: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_SUSPENDED, group, + group->run_state); + break; + case KBASE_CSF_GROUP_SUSPENDED_ON_IDLE: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_SUSPENDED_ON_IDLE, group, + group->run_state); + break; + case KBASE_CSF_GROUP_SUSPENDED_ON_WAIT_SYNC: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_SUSPENDED_ON_WAIT_SYNC, + group, group->run_state); + break; + case KBASE_CSF_GROUP_FAULT_EVICTED: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_FAULT_EVICTED, group, + group->run_state); + break; + case KBASE_CSF_GROUP_TERMINATED: + KBASE_KTRACE_ADD_CSF_GRP(group->kctx->kbdev, CSF_GROUP_TERMINATED, group, + group->run_state); + break; } } @@ -1918,6 +2109,8 @@ void insert_group_to_runnable(struct kbase_csf_scheduler *const scheduler, group->run_state = 
run_state; + ktrace_log_group_state(group); + if (run_state == KBASE_CSF_GROUP_RUNNABLE) group->prepared_seq_num = KBASEP_GROUP_PREPARED_SEQ_NUM_INVALID; @@ -1969,6 +2162,9 @@ void remove_group_from_runnable(struct kbase_csf_scheduler *const scheduler, WARN_ON(!queue_group_scheduled_locked(group)); group->run_state = run_state; + + ktrace_log_group_state(group); + list_del_init(&group->link); spin_lock_irqsave(&scheduler->interrupt_lock, flags); @@ -2067,6 +2263,8 @@ static void insert_group_to_idle_wait(struct kbase_queue_group *const group) KBASE_KTRACE_ADD_CSF_GRP(kctx->kbdev, GROUP_IDLE_WAIT_INSERT, group, kctx->csf.sched.num_idle_wait_grps); group->run_state = KBASE_CSF_GROUP_SUSPENDED_ON_WAIT_SYNC; + KBASE_KTRACE_ADD_CSF_GRP(kctx->kbdev, CSF_GROUP_SUSPENDED_ON_WAIT_SYNC, group, + group->run_state); dev_dbg(kctx->kbdev->dev, "Group-%d suspended on sync_wait, total wait_groups: %u\n", group->handle, kctx->csf.sched.num_idle_wait_grps); @@ -2092,6 +2290,7 @@ static void remove_group_from_idle_wait(struct kbase_queue_group *const group) NULL; KBASE_KTRACE_ADD_CSF_GRP(kctx->kbdev, GROUP_IDLE_WAIT_HEAD, new_head_grp, 0u); group->run_state = KBASE_CSF_GROUP_INACTIVE; + KBASE_KTRACE_ADD_CSF_GRP(kctx->kbdev, CSF_GROUP_INACTIVE, group, group->run_state); } static void deschedule_idle_wait_group(struct kbase_csf_scheduler *scheduler, @@ -2270,14 +2469,19 @@ static void save_csg_slot(struct kbase_queue_group *group) else { group->run_state = KBASE_CSF_GROUP_SUSPENDED_ON_IDLE; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_SUSPENDED_ON_IDLE, group, + group->run_state); dev_dbg(kbdev->dev, "Group-%d suspended: idle", group->handle); } } else { group->run_state = KBASE_CSF_GROUP_SUSPENDED; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_SUSPENDED, group, + group->run_state); } update_offslot_non_idle_cnt_on_grp_suspend(group); + kbase_csf_tiler_heap_reclaim_sched_notify_grp_suspend(group); } } @@ -2400,6 +2604,7 @@ static void update_csg_slot_priority(struct kbase_queue_group *group, u8 prio) csg_req ^= CSG_REQ_EP_CFG_MASK; kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, csg_req, CSG_REQ_EP_CFG_MASK); + kbase_csf_ring_csg_doorbell(kbdev, slot); spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, flags); csg_slot->priority = prio; @@ -2410,7 +2615,6 @@ static void update_csg_slot_priority(struct kbase_queue_group *group, u8 prio) KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_SLOT_PRIO_UPDATE, group, prev_prio); - kbase_csf_ring_csg_doorbell(kbdev, slot); set_bit(slot, kbdev->csf.scheduler.csg_slots_prio_update); } @@ -2522,6 +2726,12 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot, protm_suspend_buf >> 32); } + if (group->dvs_buf) { + kbase_csf_firmware_csg_input(ginfo, CSG_DVS_BUF_LO, + group->dvs_buf & U32_MAX); + kbase_csf_firmware_csg_input(ginfo, CSG_DVS_BUF_HI, + group->dvs_buf >> 32); + } /* Enable all interrupts for now */ kbase_csf_firmware_csg_input(ginfo, CSG_ACK_IRQ_MASK, ~((u32)0)); @@ -2542,6 +2752,7 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot, kbase_csf_firmware_csg_input_mask(ginfo, CSG_REQ, state, CSG_REQ_STATE_MASK); + kbase_csf_ring_csg_doorbell(kbdev, slot); spin_unlock_irqrestore(&kbdev->csf.scheduler.interrupt_lock, flags); /* Update status before rings the door-bell, marking ready => run */ @@ -2561,7 +2772,8 @@ static void program_csg_slot(struct kbase_queue_group *group, s8 slot, (((u64)ep_cfg) << 32) | ((((u32)kctx->as_nr) & 0xF) << 16) | (state & (CSG_REQ_STATE_MASK >> CS_REQ_STATE_SHIFT))); - 
kbase_csf_ring_csg_doorbell(kbdev, slot); + /* Update the heap reclaim manager */ + kbase_csf_tiler_heap_reclaim_sched_notify_grp_active(group); /* Programming a slot consumes a group from scanout */ update_offslot_non_idle_cnt_for_onslot_grp(group); @@ -2623,8 +2835,11 @@ static void sched_evict_group(struct kbase_queue_group *group, bool fault, WARN_ON(group->run_state != KBASE_CSF_GROUP_INACTIVE); - if (fault) + if (fault) { group->run_state = KBASE_CSF_GROUP_FAULT_EVICTED; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_FAULT_EVICTED, group, + scheduler->total_runnable_grps); + } KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_EVICT, group, (((u64)scheduler->total_runnable_grps) << 32) | @@ -2634,6 +2849,8 @@ static void sched_evict_group(struct kbase_queue_group *group, bool fault, /* Notify a group has been evicted */ wake_up_all(&kbdev->csf.event_wait); } + + kbase_csf_tiler_heap_reclaim_sched_notify_grp_evict(group); } static int term_group_sync(struct kbase_queue_group *group) @@ -2648,11 +2865,16 @@ static int term_group_sync(struct kbase_queue_group *group) group->cs_unrecoverable || csg_slot_stopped_locked(kbdev, group->csg_nr), remaining); - if (!remaining) { + if (unlikely(!remaining)) { + enum dumpfault_error_type error_type = DF_CSG_TERMINATE_TIMEOUT; + dev_warn(kbdev->dev, "[%llu] term request timeout (%d ms) for group %d of context %d_%d on slot %d", kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, group->handle, group->kctx->tgid, group->kctx->id, group->csg_nr); + if (kbase_csf_firmware_ping_wait(kbdev, FW_PING_AFTER_ERROR_TIMEOUT_MS)) + error_type = DF_PING_REQUEST_TIMEOUT; + kbase_debug_csf_fault_notify(kbdev, group->kctx, error_type); if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); @@ -2667,6 +2889,7 @@ void kbase_csf_scheduler_group_deschedule(struct kbase_queue_group *group) { struct kbase_device *kbdev = group->kctx->kbdev; struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + bool wait_for_termination = true; bool on_slot; kbase_reset_gpu_assert_failed_or_prevented(kbdev); @@ -2674,6 +2897,7 @@ void kbase_csf_scheduler_group_deschedule(struct kbase_queue_group *group) mutex_lock(&scheduler->lock); KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_DESCHEDULE, group, group->run_state); + wait_for_dump_complete_on_group_deschedule(group); if (!queue_group_scheduled_locked(group)) goto unlock; @@ -2681,39 +2905,28 @@ void kbase_csf_scheduler_group_deschedule(struct kbase_queue_group *group) #ifdef KBASE_PM_RUNTIME /* If the queue group is on slot and Scheduler is in SLEEPING state, - * then we need to wait here for Scheduler to exit the sleep state - * (i.e. wait for the runtime suspend or power down of GPU). This would - * be better than aborting the power down. The group will be suspended - * anyways on power down, so won't have to send the CSG termination - * request to FW. + * then we need to wake up the Scheduler to exit the sleep state rather + * than waiting for the runtime suspend or power down of GPU. + * The group termination is usually triggered in the context of Application + * thread and it has been seen that certain Apps can destroy groups at + * random points and not necessarily when the App is exiting. 
*/ if (on_slot && (scheduler->state == SCHED_SLEEPING)) { - if (wait_for_scheduler_to_exit_sleep(kbdev)) { + scheduler_wakeup(kbdev, true); + + /* Wait for MCU firmware to start running */ + if (kbase_csf_scheduler_wait_mcu_active(kbdev)) { dev_warn( kbdev->dev, - "Wait for scheduler to exit sleep state timed out when terminating group %d of context %d_%d on slot %d", + "[%llu] Wait for MCU active failed when terminating group %d of context %d_%d on slot %d", + kbase_backend_get_cycle_cnt(kbdev), group->handle, group->kctx->tgid, group->kctx->id, group->csg_nr); - - scheduler_wakeup(kbdev, true); - - /* Wait for MCU firmware to start running */ - if (kbase_csf_scheduler_wait_mcu_active(kbdev)) - dev_warn( - kbdev->dev, - "[%llu] Wait for MCU active failed when terminating group %d of context %d_%d on slot %d", - kbase_backend_get_cycle_cnt(kbdev), - group->handle, group->kctx->tgid, - group->kctx->id, group->csg_nr); + /* No point in waiting for CSG termination if MCU didn't + * become active. + */ + wait_for_termination = false; } - - /* Check the group state again as scheduler lock would have been - * released when waiting for the exit from SLEEPING state. - */ - if (!queue_group_scheduled_locked(group)) - goto unlock; - - on_slot = kbasep_csf_scheduler_group_is_on_slot_locked(group); } #endif if (!on_slot) { @@ -2721,7 +2934,11 @@ void kbase_csf_scheduler_group_deschedule(struct kbase_queue_group *group) } else { bool as_faulty; - term_group_sync(group); + if (likely(wait_for_termination)) + term_group_sync(group); + else + term_csg_slot(group); + + /* Treat the CSG as having been terminated */ as_faulty = cleanup_csg_slot(group); /* remove from the scheduler list */ @@ -2770,6 +2987,8 @@ static int scheduler_group_schedule(struct kbase_queue_group *group) group)); group->run_state = KBASE_CSF_GROUP_RUNNABLE; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_RUNNABLE, group, + group->run_state); /* A normal mode CSG could be idle onslot during * protected mode. In this case clear the @@ -3124,7 +3343,7 @@ static void program_suspending_csg_slots(struct kbase_device *kbdev) csg_slot_stopped_raw), remaining); - if (remaining) { + if (likely(remaining)) { u32 i; for_each_set_bit(i, changed, num_groups) { @@ -3164,6 +3383,7 @@ static void program_suspending_csg_slots(struct kbase_device *kbdev) for_each_set_bit(i, slot_mask, num_groups) { struct kbase_queue_group *const group = scheduler->csg_slots[i].resident_group; + enum dumpfault_error_type error_type = DF_CSG_SUSPEND_TIMEOUT; struct base_gpu_queue_group_error const err_payload = { .error_type = @@ -3177,10 +3397,6 @@ static void program_suspending_csg_slots(struct kbase_device *kbdev) if (unlikely(group == NULL)) continue; - kbase_csf_add_group_fatal_error(group, - &err_payload); - kbase_event_wakeup(group->kctx); - /* TODO GPUCORE-25328: The CSG can't be * terminated, the GPU will be reset as a * work-around. */ dev_warn(kbdev->dev, "[%llu] Group %d of context %d_%d on slot %u failed to suspend (timeout %d ms)", kbase_backend_get_cycle_cnt(kbdev), group->handle, group->kctx->tgid, group->kctx->id, i, kbdev->csf.fw_timeout_ms); + if (kbase_csf_firmware_ping_wait(kbdev, + FW_PING_AFTER_ERROR_TIMEOUT_MS)) + error_type = DF_PING_REQUEST_TIMEOUT; + schedule_actions_trigger_df(kbdev, group->kctx, error_type); + + kbase_csf_add_group_fatal_error(group, &err_payload); + kbase_event_wakeup(group->kctx); /* The group has failed suspension, stop * further examination. 
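
The timeout paths touched in this patch (CSG termination, suspend, slot start, EP_CFG update, and the LRU idle-group eviction added further below) share one recovery shape: on timeout, ping the firmware for FW_PING_AFTER_ERROR_TIMEOUT_MS via kbase_csf_firmware_ping_wait() to decide whether to report the CSG-specific dump-on-fault code or DF_PING_REQUEST_TIMEOUT, notify the dump-on-fault machinery (schedule_actions_trigger_df() or kbase_debug_csf_fault_notify()), and then escalate to a GPU reset where appropriate. The stand-alone C sketch below is only an illustration of that decision flow; fw_ping_wait(), trigger_dump_on_fault() and request_gpu_reset() are hypothetical stubs standing in for the kbase helpers, not real APIs, and the sketch is not driver code.

/*
 * Minimal, self-contained sketch of the "classify timeout via firmware ping,
 * then escalate" pattern used by the scheduler timeout paths in this patch.
 * All helpers here are hypothetical stubs.
 */
#include <stdio.h>

enum df_error_type {
	DF_CSG_SUSPEND_TIMEOUT,  /* the timeout that was actually hit */
	DF_PING_REQUEST_TIMEOUT, /* firmware itself is unresponsive */
};

/* Stub for the firmware ping: 0 on ping ack, non-zero on timeout. */
static int fw_ping_wait(unsigned int timeout_ms)
{
	(void)timeout_ms;
	return -1; /* simulate an unresponsive firmware */
}

/* Stub for handing the classified error to the dump-on-fault machinery. */
static void trigger_dump_on_fault(enum df_error_type type)
{
	printf("dump-on-fault: %s\n",
	       type == DF_PING_REQUEST_TIMEOUT ? "PING_REQUEST_TIMEOUT" :
						 "CSG_SUSPEND_TIMEOUT");
}

/* Stub for requesting a GPU reset. */
static void request_gpu_reset(void)
{
	printf("GPU reset requested\n");
}

static void handle_csg_timeout(void)
{
	enum df_error_type error_type = DF_CSG_SUSPEND_TIMEOUT;

	/* If even a plain ping times out, report firmware unresponsiveness
	 * rather than the CSG-specific timeout that was hit first.
	 */
	if (fw_ping_wait(10 /* stand-in for FW_PING_AFTER_ERROR_TIMEOUT_MS */))
		error_type = DF_PING_REQUEST_TIMEOUT;

	trigger_dump_on_fault(error_type);
	request_gpu_reset();
}

int main(void)
{
	handle_csg_timeout();
	return 0;
}

Classifying the failure this way keeps the fault report from blaming an individual group when the MCU as a whole has stopped responding, which is the case the subsequent GPU reset is there to recover.
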
@@ -3279,7 +3502,7 @@ static void wait_csg_slots_start(struct kbase_device *kbdev) slots_state_changed(kbdev, changed, csg_slot_running), remaining); - if (remaining) { + if (likely(remaining)) { for_each_set_bit(i, changed, num_groups) { struct kbase_queue_group *group = scheduler->csg_slots[i].resident_group; @@ -3287,12 +3510,22 @@ static void wait_csg_slots_start(struct kbase_device *kbdev) /* The on slot csg is now running */ clear_bit(i, slot_mask); group->run_state = KBASE_CSF_GROUP_RUNNABLE; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_RUNNABLE, group, + group->run_state); } } else { - dev_warn(kbdev->dev, "[%llu] Timeout (%d ms) waiting for CSG slots to start, slots: 0x%*pb\n", - kbase_backend_get_cycle_cnt(kbdev), - kbdev->csf.fw_timeout_ms, - num_groups, slot_mask); + const int csg_nr = ffs(slot_mask[0]) - 1; + struct kbase_queue_group *group = + scheduler->csg_slots[csg_nr].resident_group; + enum dumpfault_error_type error_type = DF_CSG_START_TIMEOUT; + + dev_err(kbdev->dev, + "[%llu] Timeout (%d ms) waiting for CSG slots to start, slots: 0x%*pb\n", + kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, + num_groups, slot_mask); + if (kbase_csf_firmware_ping_wait(kbdev, FW_PING_AFTER_ERROR_TIMEOUT_MS)) + error_type = DF_PING_REQUEST_TIMEOUT; + schedule_actions_trigger_df(kbdev, group->kctx, error_type); if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); @@ -3409,11 +3642,10 @@ static int wait_csg_slots_handshake_ack(struct kbase_device *kbdev, slot_mask, dones), remaining); - if (remaining) + if (likely(remaining)) bitmap_andnot(slot_mask, slot_mask, dones, num_groups); else { - /* Timed-out on the wait */ return -ETIMEDOUT; } @@ -3432,17 +3664,25 @@ static void wait_csg_slots_finish_prio_update(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->csf.scheduler.lock); - if (ret != 0) { - /* The update timeout is not regarded as a serious - * issue, no major consequences are expected as a - * result, so just warn the case. - */ + if (unlikely(ret != 0)) { + const int csg_nr = ffs(slot_mask[0]) - 1; + struct kbase_queue_group *group = + kbdev->csf.scheduler.csg_slots[csg_nr].resident_group; + enum dumpfault_error_type error_type = DF_CSG_EP_CFG_TIMEOUT; + dev_warn( kbdev->dev, "[%llu] Timeout (%d ms) on CSG_REQ:EP_CFG, skipping the update wait: slot mask=0x%lx", kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, slot_mask[0]); + if (kbase_csf_firmware_ping_wait(kbdev, FW_PING_AFTER_ERROR_TIMEOUT_MS)) + error_type = DF_PING_REQUEST_TIMEOUT; + schedule_actions_trigger_df(kbdev, group->kctx, error_type); + + /* Timeout could indicate firmware is unresponsive so trigger a GPU reset. 
*/ + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) + kbase_reset_gpu(kbdev); } } @@ -3508,6 +3748,7 @@ void kbase_csf_scheduler_evict_ctx_slots(struct kbase_device *kbdev, kbase_event_wakeup(kctx); mutex_unlock(&scheduler->lock); + KBASE_KTRACE_ADD(kbdev, SCHEDULER_EVICT_CTX_SLOTS_END, kctx, num_groups); } /** @@ -3690,6 +3931,8 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev, CSG_SLOT_RUNNING) { if (kctx_as_enabled(input_grp->kctx) && scheduler_slot_protm_ack(kbdev, input_grp, slot)) { + int err; + /* Option of acknowledging to multiple * CSGs from the same kctx is dropped, * after consulting with the @@ -3708,9 +3951,13 @@ static void scheduler_group_check_protm_enter(struct kbase_device *const kbdev, spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); - kbase_csf_wait_protected_mode_enter(kbdev); + err = kbase_csf_wait_protected_mode_enter(kbdev); mutex_unlock(&kbdev->mmu_hw_mutex); + if (err) + schedule_actions_trigger_df(kbdev, input_grp->kctx, + DF_PROTECTED_MODE_ENTRY_FAILURE); + scheduler->protm_enter_time = ktime_get_raw(); return; @@ -4093,8 +4340,6 @@ static void scheduler_update_idle_slots_status(struct kbase_device *kbdev, } } - spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); - /* The groups are aggregated into a single kernel doorbell request */ if (!bitmap_empty(csg_bitmap, num_groups)) { @@ -4103,15 +4348,22 @@ static void scheduler_update_idle_slots_status(struct kbase_device *kbdev, u32 db_slots = (u32)csg_bitmap[0]; kbase_csf_ring_csg_slots_doorbell(kbdev, db_slots); + spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); if (wait_csg_slots_handshake_ack(kbdev, CSG_REQ_STATUS_UPDATE_MASK, csg_bitmap, wt)) { + const int csg_nr = ffs(csg_bitmap[0]) - 1; + struct kbase_queue_group *group = + scheduler->csg_slots[csg_nr].resident_group; + dev_warn( kbdev->dev, "[%llu] Timeout (%d ms) on CSG_REQ:STATUS_UPDATE, treat groups as not idle: slot mask=0x%lx", kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, csg_bitmap[0]); + schedule_actions_trigger_df(kbdev, group->kctx, + DF_CSG_STATUS_UPDATE_TIMEOUT); /* Store the bitmap of timed out slots */ bitmap_copy(failed_csg_bitmap, csg_bitmap, num_groups); @@ -4131,6 +4383,8 @@ static void scheduler_update_idle_slots_status(struct kbase_device *kbdev, KBASE_KTRACE_ADD(kbdev, SCHEDULER_UPDATE_IDLE_SLOTS_ACK, NULL, db_slots); csg_bitmap[0] = db_slots; } + } else { + spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); } } @@ -4185,17 +4439,21 @@ static void scheduler_handle_idle_slots(struct kbase_device *kbdev) if (group_on_slot_is_idle(kbdev, i)) { group->run_state = KBASE_CSF_GROUP_IDLE; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_IDLE, group, group->run_state); set_bit(i, scheduler->csg_slots_idle_mask); KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_SLOT_IDLE_SET, group, scheduler->csg_slots_idle_mask[0]); - } else + } else { group->run_state = KBASE_CSF_GROUP_RUNNABLE; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_RUNNABLE, group, + group->run_state); + } } bitmap_or(scheduler->csg_slots_idle_mask, scheduler->csg_slots_idle_mask, failed_csg_bitmap, num_groups); - KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSG_SLOT_IDLE_SET, NULL, + KBASE_KTRACE_ADD_CSF_GRP(kbdev, SCHEDULER_HANDLE_IDLE_SLOTS, NULL, scheduler->csg_slots_idle_mask[0]); spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); } @@ -4281,7 +4539,12 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, int ret = suspend_active_queue_groups(kbdev, slot_mask); - if 
(ret) { + if (unlikely(ret)) { + const int csg_nr = ffs(slot_mask[0]) - 1; + struct kbase_queue_group *group = + scheduler->csg_slots[csg_nr].resident_group; + enum dumpfault_error_type error_type = DF_CSG_SUSPEND_TIMEOUT; + /* The suspend of CSGs failed, * trigger the GPU reset to be in a deterministic state. */ @@ -4289,6 +4552,9 @@ static int suspend_active_groups_on_powerdown(struct kbase_device *kbdev, kbase_backend_get_cycle_cnt(kbdev), kbdev->csf.fw_timeout_ms, kbdev->csf.global_iface.group_num, slot_mask); + if (kbase_csf_firmware_ping_wait(kbdev, FW_PING_AFTER_ERROR_TIMEOUT_MS)) + error_type = DF_PING_REQUEST_TIMEOUT; + schedule_actions_trigger_df(kbdev, group->kctx, error_type); if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) kbase_reset_gpu(kbdev); @@ -4372,6 +4638,21 @@ static bool scheduler_idle_suspendable(struct kbase_device *kbdev) spin_lock_irqsave(&kbdev->hwaccess_lock, flags); spin_lock(&scheduler->interrupt_lock); + + if (scheduler->fast_gpu_idle_handling) { + scheduler->fast_gpu_idle_handling = false; + + if (scheduler->total_runnable_grps) { + suspend = !atomic_read(&scheduler->non_idle_offslot_grps) && + kbase_pm_idle_groups_sched_suspendable(kbdev); + } else + suspend = kbase_pm_no_runnables_sched_suspendable(kbdev); + spin_unlock(&scheduler->interrupt_lock); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return suspend; + } + if (scheduler->total_runnable_grps) { /* Check both on-slots and off-slots groups idle status */ @@ -4418,6 +4699,7 @@ static void scheduler_sleep_on_idle(struct kbase_device *kbdev) cancel_tick_timer(kbdev); scheduler_pm_idle_before_sleep(kbdev); scheduler->state = SCHED_SLEEPING; + KBASE_KTRACE_ADD(kbdev, SCHED_SLEEPING, NULL, scheduler->state); } #endif @@ -4471,8 +4753,17 @@ static void gpu_idle_worker(struct work_struct *work) __ENCODE_KTRACE_INFO(true, false, false)); return; } + kbase_debug_csf_fault_wait_completion(kbdev); mutex_lock(&scheduler->lock); +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(scheduler->state == SCHED_BUSY)) { + mutex_unlock(&scheduler->lock); + kbase_reset_gpu_allow(kbdev); + return; + } +#endif + scheduler_is_idle_suspendable = scheduler_idle_suspendable(kbdev); if (scheduler_is_idle_suspendable) { KBASE_KTRACE_ADD(kbdev, SCHEDULER_GPU_IDLE_WORKER_HANDLING_START, NULL, @@ -4484,6 +4775,8 @@ static void gpu_idle_worker(struct work_struct *work) else #endif all_groups_suspended = scheduler_suspend_on_idle(kbdev); + + KBASE_KTRACE_ADD(kbdev, SCHEDULER_GPU_IDLE_WORKER_HANDLING_END, NULL, 0u); } mutex_unlock(&scheduler->lock); @@ -4641,14 +4934,162 @@ static int prepare_fast_local_tock(struct kbase_device *kbdev) struct kbase_csf_csg_slot *csg_slot = &scheduler->csg_slots[i]; struct kbase_queue_group *group = csg_slot->resident_group; - if (!queue_group_idle_locked(group)) + if (!queue_group_idle_locked(group)) { group->run_state = KBASE_CSF_GROUP_IDLE; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_IDLE, group, group->run_state); + } } /* Return the number of idle slots for potential replacement */ return bitmap_weight(csg_bitmap, num_groups); } +static int wait_csg_slots_suspend(struct kbase_device *kbdev, unsigned long *slot_mask, + unsigned int timeout_ms) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + long remaining = kbase_csf_timeout_in_jiffies(timeout_ms); + u32 num_groups = kbdev->csf.global_iface.group_num; + int err = 0; + DECLARE_BITMAP(slot_mask_local, MAX_SUPPORTED_CSGS); + + lockdep_assert_held(&scheduler->lock); + + bitmap_copy(slot_mask_local, 
slot_mask, MAX_SUPPORTED_CSGS); + + while (!bitmap_empty(slot_mask_local, MAX_SUPPORTED_CSGS) && remaining) { + DECLARE_BITMAP(changed, MAX_SUPPORTED_CSGS); + + bitmap_copy(changed, slot_mask_local, MAX_SUPPORTED_CSGS); + + remaining = wait_event_timeout( + kbdev->csf.event_wait, + slots_state_changed(kbdev, changed, csg_slot_stopped_locked), remaining); + + if (likely(remaining)) { + u32 i; + + for_each_set_bit(i, changed, num_groups) { + struct kbase_queue_group *group; + + if (WARN_ON(!csg_slot_stopped_locked(kbdev, (s8)i))) + continue; + + /* The on slot csg is now stopped */ + clear_bit(i, slot_mask_local); + + group = scheduler->csg_slots[i].resident_group; + if (likely(group)) { + /* Only do save/cleanup if the + * group is not terminated during + * the sleep. + */ + save_csg_slot(group); + if (cleanup_csg_slot(group)) + sched_evict_group(group, true, true); + } + } + } else { + dev_warn( + kbdev->dev, + "[%llu] Suspend request sent on CSG slots 0x%lx timed out for slots 0x%lx", + kbase_backend_get_cycle_cnt(kbdev), slot_mask[0], + slot_mask_local[0]); + /* Return the bitmask of the timed out slots to the caller */ + bitmap_copy(slot_mask, slot_mask_local, MAX_SUPPORTED_CSGS); + + err = -ETIMEDOUT; + } + } + + return err; +} + +/** + * evict_lru_or_blocked_csg() - Evict the least-recently-used idle or blocked CSG + * + * @kbdev: Pointer to the device + * + * Used to allow for speedier starting/resumption of another CSG. The worst-case + * scenario of the evicted CSG being scheduled next is expected to be rare. + * Also, the eviction will not be applied if the GPU is running in protected mode. + * Otherwise the eviction attempt would force the MCU to quit the execution of + * the protected mode, and likely re-request to enter it again. + */ +static void evict_lru_or_blocked_csg(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + size_t i; + struct kbase_queue_group *lru_idle_group = NULL; + const u32 total_csg_slots = kbdev->csf.global_iface.group_num; + const bool all_addr_spaces_used = (scheduler->num_active_address_spaces >= + (kbdev->nr_hw_address_spaces - NUM_RESERVED_AS_SLOTS)); + u8 as_usage[BASE_MAX_NR_AS] = { 0 }; + + lockdep_assert_held(&scheduler->lock); + if (kbase_csf_scheduler_protected_mode_in_use(kbdev)) + return; + + BUILD_BUG_ON(MAX_SUPPORTED_CSGS > (sizeof(int) * BITS_PER_BYTE)); + if (fls(scheduler->csg_inuse_bitmap[0]) != total_csg_slots) + return; /* Some CSG slots remain unused */ + + if (all_addr_spaces_used) { + for (i = 0; i != total_csg_slots; ++i) { + if (scheduler->csg_slots[i].resident_group != NULL) + as_usage[scheduler->csg_slots[i].resident_group->kctx->as_nr]++; + } + } + + for (i = 0; i != total_csg_slots; ++i) { + struct kbase_queue_group *const group = scheduler->csg_slots[i].resident_group; + + /* We expect that by this point all groups would normally be + * assigned a physical CSG slot, but if circumstances have + * changed then bail out of this optimisation. + */ + if (group == NULL) + return; + + /* Real-time priority CSGs must be kept on-slot even when + * idle. + */ + if ((group->run_state == KBASE_CSF_GROUP_IDLE) && + (group->priority != BASE_QUEUE_GROUP_PRIORITY_REALTIME) && + ((lru_idle_group == NULL) || + (lru_idle_group->prepared_seq_num < group->prepared_seq_num))) { + /* If all address spaces are used, we need to ensure the group does not + * share the AS with other active CSGs. Otherwise the CSG would be freed without an AS + * and this optimization would not work. 
+ */ + if ((!all_addr_spaces_used) || (as_usage[group->kctx->as_nr] == 1)) + lru_idle_group = group; + } + } + + if (lru_idle_group != NULL) { + unsigned long slot_mask = 1 << lru_idle_group->csg_nr; + + dev_dbg(kbdev->dev, "Suspending LRU idle group %d of context %d_%d on slot %d", + lru_idle_group->handle, lru_idle_group->kctx->tgid, + lru_idle_group->kctx->id, lru_idle_group->csg_nr); + suspend_queue_group(lru_idle_group); + if (wait_csg_slots_suspend(kbdev, &slot_mask, kbdev->csf.fw_timeout_ms)) { + enum dumpfault_error_type error_type = DF_CSG_SUSPEND_TIMEOUT; + + dev_warn( + kbdev->dev, + "[%llu] LRU idle group %d of context %d_%d failed to suspend on slot %d (timeout %d ms)", + kbase_backend_get_cycle_cnt(kbdev), lru_idle_group->handle, + lru_idle_group->kctx->tgid, lru_idle_group->kctx->id, + lru_idle_group->csg_nr, kbdev->csf.fw_timeout_ms); + if (kbase_csf_firmware_ping_wait(kbdev, FW_PING_AFTER_ERROR_TIMEOUT_MS)) + error_type = DF_PING_REQUEST_TIMEOUT; + schedule_actions_trigger_df(kbdev, lru_idle_group->kctx, error_type); + } + } +} + static void schedule_actions(struct kbase_device *kbdev, bool is_tick) { struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; @@ -4796,6 +5237,8 @@ redo_local_tock: } else { spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); } + + evict_lru_or_blocked_csg(kbdev); } /** @@ -4817,6 +5260,9 @@ static bool can_skip_scheduling(struct kbase_device *kbdev) lockdep_assert_held(&scheduler->lock); + if (unlikely(!kbase_reset_gpu_is_not_pending(kbdev))) + return true; + if (scheduler->state == SCHED_SUSPENDED) return true; @@ -4826,12 +5272,12 @@ static bool can_skip_scheduling(struct kbase_device *kbdev) spin_lock_irqsave(&kbdev->hwaccess_lock, flags); if (kbdev->pm.backend.exit_gpu_sleep_mode) { - int ret = scheduler_pm_active_after_sleep(kbdev, flags); - /* hwaccess_lock is released in the previous function - * call. 
- */ + int ret = scheduler_pm_active_after_sleep(kbdev, &flags); + + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); if (!ret) { scheduler->state = SCHED_INACTIVE; + KBASE_KTRACE_ADD(kbdev, SCHED_INACTIVE, NULL, scheduler->state); return false; } @@ -4849,14 +5295,11 @@ static bool can_skip_scheduling(struct kbase_device *kbdev) static void schedule_on_tock(struct work_struct *work) { - struct kbase_device *kbdev = container_of(work, struct kbase_device, - csf.scheduler.tock_work.work); + struct kbase_device *kbdev = + container_of(work, struct kbase_device, csf.scheduler.tock_work.work); struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; int err; - /* Tock work item is serviced */ - scheduler->tock_pending_request = false; - err = kbase_reset_gpu_try_prevent(kbdev); /* Regardless of whether reset failed or is currently happening, exit * early @@ -4864,21 +5307,28 @@ static void schedule_on_tock(struct work_struct *work) if (err) return; + kbase_debug_csf_fault_wait_completion(kbdev); mutex_lock(&scheduler->lock); if (can_skip_scheduling(kbdev)) + { + atomic_set(&scheduler->pending_tock_work, false); goto exit_no_schedule_unlock; + } WARN_ON(!(scheduler->state == SCHED_INACTIVE)); scheduler->state = SCHED_BUSY; + KBASE_KTRACE_ADD(kbdev, SCHED_BUSY, NULL, scheduler->state); /* Undertaking schedule action steps */ KBASE_KTRACE_ADD(kbdev, SCHEDULER_TOCK_START, NULL, 0u); - schedule_actions(kbdev, false); + while (atomic_cmpxchg(&scheduler->pending_tock_work, true, false) == true) + schedule_actions(kbdev, false); /* Record time information on a non-skipped tock */ scheduler->last_schedule = jiffies; scheduler->state = SCHED_INACTIVE; + KBASE_KTRACE_ADD(kbdev, SCHED_INACTIVE, NULL, scheduler->state); if (!scheduler->total_runnable_grps) enqueue_gpu_idle_work(scheduler); mutex_unlock(&scheduler->lock); @@ -4897,8 +5347,8 @@ exit_no_schedule_unlock: static void schedule_on_tick(struct work_struct *work) { - struct kbase_device *kbdev = container_of(work, struct kbase_device, - csf.scheduler.tick_work); + struct kbase_device *kbdev = + container_of(work, struct kbase_device, csf.scheduler.tick_work); struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; int err = kbase_reset_gpu_try_prevent(kbdev); @@ -4908,6 +5358,7 @@ static void schedule_on_tick(struct work_struct *work) if (err) return; + kbase_debug_csf_fault_wait_completion(kbdev); mutex_lock(&scheduler->lock); WARN_ON(scheduler->tick_timer_active); @@ -4915,6 +5366,7 @@ static void schedule_on_tick(struct work_struct *work) goto exit_no_schedule_unlock; scheduler->state = SCHED_BUSY; + KBASE_KTRACE_ADD(kbdev, SCHED_BUSY, NULL, scheduler->state); /* Undertaking schedule action steps */ KBASE_KTRACE_ADD(kbdev, SCHEDULER_TICK_START, NULL, scheduler->total_runnable_grps); @@ -4936,6 +5388,7 @@ static void schedule_on_tick(struct work_struct *work) scheduler->state = SCHED_INACTIVE; mutex_unlock(&scheduler->lock); + KBASE_KTRACE_ADD(kbdev, SCHED_INACTIVE, NULL, scheduler->state); kbase_reset_gpu_allow(kbdev); dev_dbg(kbdev->dev, "Waking up for event after schedule-on-tick completes."); @@ -4949,67 +5402,6 @@ exit_no_schedule_unlock: kbase_reset_gpu_allow(kbdev); } -static int wait_csg_slots_suspend(struct kbase_device *kbdev, - const unsigned long *slot_mask, - unsigned int timeout_ms) -{ - struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; - long remaining = kbase_csf_timeout_in_jiffies(timeout_ms); - u32 num_groups = kbdev->csf.global_iface.group_num; - int err = 0; - 
DECLARE_BITMAP(slot_mask_local, MAX_SUPPORTED_CSGS); - - lockdep_assert_held(&scheduler->lock); - - bitmap_copy(slot_mask_local, slot_mask, MAX_SUPPORTED_CSGS); - - while (!bitmap_empty(slot_mask_local, MAX_SUPPORTED_CSGS) - && remaining) { - DECLARE_BITMAP(changed, MAX_SUPPORTED_CSGS); - - bitmap_copy(changed, slot_mask_local, MAX_SUPPORTED_CSGS); - - remaining = wait_event_timeout(kbdev->csf.event_wait, - slots_state_changed(kbdev, changed, - csg_slot_stopped_locked), - remaining); - - if (remaining) { - u32 i; - - for_each_set_bit(i, changed, num_groups) { - struct kbase_queue_group *group; - - if (WARN_ON(!csg_slot_stopped_locked(kbdev, (s8)i))) - continue; - - /* The on slot csg is now stopped */ - clear_bit(i, slot_mask_local); - - group = scheduler->csg_slots[i].resident_group; - if (likely(group)) { - /* Only do save/cleanup if the - * group is not terminated during - * the sleep. - */ - save_csg_slot(group); - if (cleanup_csg_slot(group)) - sched_evict_group(group, true, true); - } - } - } else { - dev_warn(kbdev->dev, "[%llu] Timeout waiting for CSG slots to suspend, slot_mask: 0x%*pb\n", - kbase_backend_get_cycle_cnt(kbdev), - num_groups, slot_mask_local); - - - err = -ETIMEDOUT; - } - } - - return err; -} - static int suspend_active_queue_groups(struct kbase_device *kbdev, unsigned long *slot_mask) { @@ -5172,6 +5564,7 @@ static bool scheduler_handle_reset_in_protected_mode(struct kbase_device *kbdev) cleanup_csg_slot(group); group->run_state = KBASE_CSF_GROUP_SUSPENDED; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_SUSPENDED, group, group->run_state); /* Simply treat the normal mode groups as non-idle. The tick * scheduled after the reset will re-initialize the counter @@ -5186,10 +5579,15 @@ unlock: return suspend_on_slot_groups; } +static void cancel_tick_work(struct kbase_csf_scheduler *const scheduler) +{ + cancel_work_sync(&scheduler->tick_work); +} + static void cancel_tock_work(struct kbase_csf_scheduler *const scheduler) { + atomic_set(&scheduler->pending_tock_work, false); cancel_delayed_work_sync(&scheduler->tock_work); - scheduler->tock_pending_request = false; } static void scheduler_inner_reset(struct kbase_device *kbdev) @@ -5203,7 +5601,7 @@ static void scheduler_inner_reset(struct kbase_device *kbdev) /* Cancel any potential queued delayed work(s) */ cancel_work_sync(&kbdev->csf.scheduler.gpu_idle_work); cancel_tick_timer(kbdev); - cancel_work_sync(&scheduler->tick_work); + cancel_tick_work(scheduler); cancel_tock_work(scheduler); cancel_delayed_work_sync(&scheduler->ping_work); @@ -5238,6 +5636,8 @@ void kbase_csf_scheduler_reset(struct kbase_device *kbdev) KBASE_KTRACE_ADD(kbdev, SCHEDULER_RESET_START, NULL, 0u); + kbase_debug_csf_fault_wait_completion(kbdev); + if (scheduler_handle_reset_in_protected_mode(kbdev) && !suspend_active_queue_groups_on_reset(kbdev)) { /* As all groups have been successfully evicted from the CSG @@ -5274,6 +5674,8 @@ void kbase_csf_scheduler_reset(struct kbase_device *kbdev) mutex_unlock(&kbdev->kctx_list_lock); + KBASE_KTRACE_ADD(kbdev, SCHEDULER_RESET_END, NULL, 0u); + /* After queue groups reset, the scheduler data fields clear out */ scheduler_inner_reset(kbdev); } @@ -5328,7 +5730,7 @@ static void firmware_aliveness_monitor(struct work_struct *work) kbase_csf_scheduler_wait_mcu_active(kbdev); - err = kbase_csf_firmware_ping_wait(kbdev); + err = kbase_csf_firmware_ping_wait(kbdev, kbdev->csf.fw_timeout_ms); if (err) { /* It is acceptable to enqueue a reset whilst we've prevented @@ -5687,6 +6089,8 @@ static bool 
check_sync_update_for_on_slot_group( */ group->reevaluate_idle_status = true; group->run_state = KBASE_CSF_GROUP_RUNNABLE; + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_RUNNABLE, group, + group->run_state); } KBASE_KTRACE_ADD_CSF_GRP(kbdev, GROUP_SYNC_UPDATE_DONE, group, 0u); @@ -5796,6 +6200,15 @@ static void check_group_sync_update_worker(struct work_struct *work) mutex_lock(&scheduler->lock); +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(scheduler->state == SCHED_BUSY)) { + queue_work(kctx->csf.sched.sync_update_wq, + &kctx->csf.sched.sync_update_work); + mutex_unlock(&scheduler->lock); + return; + } +#endif + KBASE_KTRACE_ADD(kbdev, SCHEDULER_GROUP_SYNC_UPDATE_WORKER_START, kctx, 0u); if (kctx->csf.sched.num_idle_wait_grps != 0) { struct kbase_queue_group *group, *temp; @@ -5871,6 +6284,8 @@ int kbase_csf_scheduler_context_init(struct kbase_context *kctx) INIT_WORK(&kctx->csf.sched.sync_update_work, check_group_sync_update_worker); + kbase_csf_tiler_heap_reclaim_ctx_init(kctx); + err = kbase_csf_event_wait_add(kctx, check_group_sync_update_cb, kctx); if (err) { @@ -5930,6 +6345,7 @@ int kbase_csf_scheduler_early_init(struct kbase_device *kbdev) INIT_WORK(&scheduler->tick_work, schedule_on_tick); INIT_DEFERRABLE_WORK(&scheduler->tock_work, schedule_on_tock); + atomic_set(&scheduler->pending_tock_work, false); INIT_DEFERRABLE_WORK(&scheduler->ping_work, firmware_aliveness_monitor); @@ -5945,18 +6361,19 @@ int kbase_csf_scheduler_early_init(struct kbase_device *kbdev) (sizeof(scheduler->csgs_events_enable_mask) * BITS_PER_BYTE)); bitmap_fill(scheduler->csgs_events_enable_mask, MAX_SUPPORTED_CSGS); scheduler->state = SCHED_SUSPENDED; + KBASE_KTRACE_ADD(kbdev, SCHED_SUSPENDED, NULL, scheduler->state); scheduler->pm_active_count = 0; scheduler->ngrp_to_schedule = 0; scheduler->total_runnable_grps = 0; scheduler->top_ctx = NULL; scheduler->top_grp = NULL; scheduler->last_schedule = 0; - scheduler->tock_pending_request = false; scheduler->active_protm_grp = NULL; scheduler->csg_scheduling_period_ms = CSF_SCHEDULER_TIME_TICK_MS; scheduler_doorbell_init(kbdev); INIT_WORK(&scheduler->gpu_idle_work, gpu_idle_worker); + scheduler->fast_gpu_idle_handling = false; atomic_set(&scheduler->gpu_no_longer_idle, false); atomic_set(&scheduler->non_idle_offslot_grps, 0); @@ -5964,6 +6381,8 @@ int kbase_csf_scheduler_early_init(struct kbase_device *kbdev) scheduler->tick_timer.function = tick_timer_callback; scheduler->tick_timer_active = false; + kbase_csf_tiler_heap_reclaim_mgr_init(kbdev); + return 0; } @@ -5981,22 +6400,26 @@ void kbase_csf_scheduler_term(struct kbase_device *kbdev) mutex_lock(&kbdev->csf.scheduler.lock); if (kbdev->csf.scheduler.state != SCHED_SUSPENDED) { + unsigned long flags; /* The power policy could prevent the Scheduler from * getting suspended when GPU becomes idle. 
*/ + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); WARN_ON(kbase_pm_idle_groups_sched_suspendable(kbdev)); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); scheduler_suspend(kbdev); } mutex_unlock(&kbdev->csf.scheduler.lock); cancel_delayed_work_sync(&kbdev->csf.scheduler.ping_work); cancel_tick_timer(kbdev); - cancel_work_sync(&kbdev->csf.scheduler.tick_work); + cancel_tick_work(&kbdev->csf.scheduler); cancel_tock_work(&kbdev->csf.scheduler); - mutex_destroy(&kbdev->csf.scheduler.lock); kfree(kbdev->csf.scheduler.csg_slots); kbdev->csf.scheduler.csg_slots = NULL; } + KBASE_KTRACE_ADD_CSF_GRP(kbdev, CSF_GROUP_TERMINATED, NULL, + kbase_csf_scheduler_get_nr_active_csgs(kbdev)); } void kbase_csf_scheduler_early_term(struct kbase_device *kbdev) @@ -6005,6 +6428,9 @@ void kbase_csf_scheduler_early_term(struct kbase_device *kbdev) destroy_workqueue(kbdev->csf.scheduler.idle_wq); if (kbdev->csf.scheduler.wq) destroy_workqueue(kbdev->csf.scheduler.wq); + + kbase_csf_tiler_heap_reclaim_mgr_term(kbdev); + mutex_destroy(&kbdev->csf.scheduler.lock); } /** @@ -6069,13 +6495,12 @@ void kbase_csf_scheduler_timer_set_enabled(struct kbase_device *kbdev, if (currently_enabled && !enable) { scheduler->timer_enabled = false; cancel_tick_timer(kbdev); - cancel_delayed_work(&scheduler->tock_work); - scheduler->tock_pending_request = false; mutex_unlock(&scheduler->lock); /* The non-sync version to cancel the normal work item is not * available, so need to drop the lock before cancellation. */ - cancel_work_sync(&scheduler->tick_work); + cancel_tick_work(scheduler); + cancel_tock_work(scheduler); return; } @@ -6112,6 +6537,12 @@ int kbase_csf_scheduler_pm_suspend_no_lock(struct kbase_device *kbdev) int result = 0; lockdep_assert_held(&scheduler->lock); + +#if IS_ENABLED(CONFIG_DEBUG_FS) + if (unlikely(scheduler->state == SCHED_BUSY)) + return -EBUSY; +#endif + #ifdef KBASE_PM_RUNTIME /* If scheduler is in sleeping state, then MCU needs to be activated * to suspend CSGs. @@ -6147,7 +6578,7 @@ int kbase_csf_scheduler_pm_suspend(struct kbase_device *kbdev) struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; /* Cancel any potential queued delayed work(s) */ - cancel_work_sync(&scheduler->tick_work); + cancel_tick_work(scheduler); cancel_tock_work(scheduler); result = kbase_reset_gpu_prevent_and_wait(kbdev); @@ -6271,6 +6702,7 @@ int kbase_csf_scheduler_handle_runtime_suspend(struct kbase_device *kbdev) } scheduler->state = SCHED_SUSPENDED; + KBASE_KTRACE_ADD(kbdev, SCHED_SUSPENDED, NULL, scheduler->state); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); kbdev->pm.backend.gpu_sleep_mode_active = false; spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.h index 12df5054e573..d22d7c8b9dce 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_scheduler.h @@ -36,7 +36,9 @@ * If the CSG is already scheduled and resident, the CSI will be started * right away, otherwise once the group is made resident. * - * Return: 0 on success, or negative on failure. + * Return: 0 on success, or negative on failure. -EBUSY is returned to + * indicate to the caller that queue could not be enabled due to Scheduler + * state and the caller can try to enable the queue after sometime. 
*/ int kbase_csf_scheduler_queue_start(struct kbase_queue *queue); @@ -530,12 +532,30 @@ static inline void kbase_csf_scheduler_invoke_tick(struct kbase_device *kbdev) struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; unsigned long flags; + KBASE_KTRACE_ADD(kbdev, SCHEDULER_TICK_INVOKE, NULL, 0u); spin_lock_irqsave(&scheduler->interrupt_lock, flags); if (!scheduler->tick_timer_active) queue_work(scheduler->wq, &scheduler->tick_work); spin_unlock_irqrestore(&scheduler->interrupt_lock, flags); } +/** + * kbase_csf_scheduler_invoke_tock() - Invoke the scheduling tock + * + * @kbdev: Pointer to the device + * + * This function will queue the scheduling tock work item for immediate + * execution. + */ +static inline void kbase_csf_scheduler_invoke_tock(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + KBASE_KTRACE_ADD(kbdev, SCHEDULER_TOCK_INVOKE, NULL, 0u); + if (atomic_cmpxchg(&scheduler->pending_tock_work, false, true) == false) + mod_delayed_work(scheduler->wq, &scheduler->tock_work, 0); +} + /** * kbase_csf_scheduler_queue_has_trace() - report whether the queue has been * configured to operate with the diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.c index 769369150687..909362da0047 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -25,6 +25,26 @@ #include "mali_kbase_csf_tiler_heap_def.h" #include "mali_kbase_csf_heap_context_alloc.h" +/* Tiler heap shrink stop limit for maintaining a minimum number of chunks */ +#define HEAP_SHRINK_STOP_LIMIT (1) + +/** + * struct kbase_csf_gpu_buffer_heap - A gpu buffer object specific to tiler heap + * + * @cdsbp_0: Descriptor_type and buffer_type + * @size: The size of the current heap chunk + * @pointer: Pointer to the current heap chunk + * @low_pointer: Pointer to low end of current heap chunk + * @high_pointer: Pointer to high end of current heap chunk + */ +struct kbase_csf_gpu_buffer_heap { + u32 cdsbp_0; + u32 size; + u64 pointer; + u64 low_pointer; + u64 high_pointer; +} __packed; + /** * encode_chunk_ptr - Encode the address and size of a chunk as an integer. * @@ -73,6 +93,35 @@ static struct kbase_csf_tiler_heap_chunk *get_last_chunk( struct kbase_csf_tiler_heap_chunk, link); } +/** + * remove_external_chunk_mappings - Remove external mappings from a chunk that + * is being transitioned to the tiler heap + * memory system. + * + * @kctx: kbase context the chunk belongs to. + * @chunk: The chunk whose external mappings are going to be removed. + * + * This function marks the region as DONT NEED. Along with KBASE_REG_NO_USER_FREE, this indicates + * that the VA region is owned by the tiler heap and could potentially be shrunk at any time. Other + * parts of kbase outside of tiler heap management should not take references on its physical + * pages, and should not modify them. 
+ */ +static void remove_external_chunk_mappings(struct kbase_context *const kctx, + struct kbase_csf_tiler_heap_chunk *chunk) +{ + lockdep_assert_held(&kctx->reg_lock); + + if (chunk->region->cpu_alloc != NULL) { + kbase_mem_shrink_cpu_mapping(kctx, chunk->region, 0, + chunk->region->cpu_alloc->nents); + } +#if !defined(CONFIG_MALI_VECTOR_DUMP) + chunk->region->flags |= KBASE_REG_DONT_NEED; +#endif + + dev_dbg(kctx->kbdev->dev, "Removed external mappings from chunk 0x%llX", chunk->gpu_va); +} + /** * link_chunk - Link a chunk into a tiler heap * @@ -93,19 +142,12 @@ static int link_chunk(struct kbase_csf_tiler_heap *const heap, if (prev) { struct kbase_context *const kctx = heap->kctx; - struct kbase_vmap_struct map; - u64 *const prev_hdr = kbase_vmap_prot(kctx, prev->gpu_va, - sizeof(*prev_hdr), KBASE_REG_CPU_WR, &map); + u64 *prev_hdr = prev->map.addr; - if (unlikely(!prev_hdr)) { - dev_err(kctx->kbdev->dev, - "Failed to map tiler heap chunk 0x%llX\n", - prev->gpu_va); - return -ENOMEM; - } + WARN((prev->region->flags & KBASE_REG_CPU_CACHED), + "Cannot support CPU cached chunks without sync operations"); *prev_hdr = encode_chunk_ptr(heap->chunk_size, chunk->gpu_va); - kbase_vunmap(kctx, &map); dev_dbg(kctx->kbdev->dev, "Linked tiler heap chunks, 0x%llX -> 0x%llX\n", @@ -132,152 +174,264 @@ static int link_chunk(struct kbase_csf_tiler_heap *const heap, static int init_chunk(struct kbase_csf_tiler_heap *const heap, struct kbase_csf_tiler_heap_chunk *const chunk, bool link_with_prev) { - struct kbase_vmap_struct map; - struct u64 *chunk_hdr = NULL; + int err = 0; + u64 *chunk_hdr; struct kbase_context *const kctx = heap->kctx; + lockdep_assert_held(&kctx->csf.tiler_heaps.lock); + if (unlikely(chunk->gpu_va & ~CHUNK_ADDR_MASK)) { dev_err(kctx->kbdev->dev, "Tiler heap chunk address is unusable\n"); return -EINVAL; } - chunk_hdr = kbase_vmap_prot(kctx, - chunk->gpu_va, CHUNK_HDR_SIZE, KBASE_REG_CPU_WR, &map); - - if (unlikely(!chunk_hdr)) { - dev_err(kctx->kbdev->dev, - "Failed to map a tiler heap chunk header\n"); - return -ENOMEM; + WARN((chunk->region->flags & KBASE_REG_CPU_CACHED), + "Cannot support CPU cached chunks without sync operations"); + chunk_hdr = chunk->map.addr; + if (WARN(chunk->map.size < CHUNK_HDR_SIZE, + "Tiler chunk kernel mapping was not large enough for zero-init")) { + return -EINVAL; } memset(chunk_hdr, 0, CHUNK_HDR_SIZE); - kbase_vunmap(kctx, &map); + INIT_LIST_HEAD(&chunk->link); if (link_with_prev) - return link_chunk(heap, chunk); - else - return 0; + err = link_chunk(heap, chunk); + + if (unlikely(err)) { + dev_err(kctx->kbdev->dev, "Failed to link a chunk to a tiler heap\n"); + return -EINVAL; + } + + list_add_tail(&chunk->link, &heap->chunks_list); + heap->chunk_count++; + + return err; +} + +/** + * remove_unlinked_chunk - Remove a chunk that is not currently linked into a + * heap. + * + * @kctx: Kbase context that was used to allocate the memory. + * @chunk: Chunk that has been allocated, but not linked into a heap. 
+ */ +static void remove_unlinked_chunk(struct kbase_context *kctx, + struct kbase_csf_tiler_heap_chunk *chunk) +{ + if (WARN_ON(!list_empty(&chunk->link))) + return; + + kbase_gpu_vm_lock(kctx); + kbase_vunmap(kctx, &chunk->map); + /* KBASE_REG_DONT_NEED regions will be confused with ephemeral regions (inc freed JIT + * regions), and so we must clear that flag too before freeing + */ +#if !defined(CONFIG_MALI_VECTOR_DUMP) + chunk->region->flags &= ~(KBASE_REG_NO_USER_FREE | KBASE_REG_DONT_NEED); +#else + chunk->region->flags &= ~KBASE_REG_NO_USER_FREE; +#endif + kbase_mem_free_region(kctx, chunk->region); + kbase_gpu_vm_unlock(kctx); + + kfree(chunk); +} + +/** + * alloc_new_chunk - Allocate new chunk metadata for the tiler heap, reserve a fully backed VA + * region for the chunk, and provide a kernel mapping. + * @kctx: kbase context with which the chunk will be linked + * @chunk_size: the size of the chunk from the corresponding heap + * + * Allocate the chunk tracking metadata and a corresponding fully backed VA region for the + * chunk. The kernel may need to invoke the reclaim path while trying to fulfill the allocation, so + * we cannot hold any lock that would be held in the shrinker paths (JIT evict lock or tiler heap + * lock). + * + * Since the chunk may have its physical backing removed, to prevent use-after-free scenarios we + * ensure that it is protected from being mapped by other parts of kbase. + * + * The chunk's GPU memory can be accessed via its 'map' member, but should only be done so by the + * shrinker path, as it may be otherwise shrunk at any time. + * + * Return: pointer to kbase_csf_tiler_heap_chunk on success or a NULL pointer + * on failure + */ +static struct kbase_csf_tiler_heap_chunk *alloc_new_chunk(struct kbase_context *kctx, + u64 chunk_size) +{ + u64 nr_pages = PFN_UP(chunk_size); + u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | BASE_MEM_PROT_CPU_WR | + BASEP_MEM_NO_USER_FREE | BASE_MEM_COHERENT_LOCAL | BASE_MEM_PROT_CPU_RD; + struct kbase_csf_tiler_heap_chunk *chunk = NULL; + /* The chunk kernel mapping needs to be large enough to: + * - initially zero the CHUNK_HDR_SIZE area + * - on shrinking, access the NEXT_CHUNK_ADDR_SIZE area + */ + const size_t chunk_kernel_map_size = max(CHUNK_HDR_SIZE, NEXT_CHUNK_ADDR_SIZE); + + /* Calls to this function are inherently synchronous, with respect to + * MMU operations. + */ + const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; + flags |= kbase_mem_group_id_set(kctx->jit_group_id); + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (unlikely(!chunk)) { + dev_err(kctx->kbdev->dev, + "No kernel memory for a new tiler heap chunk\n"); + return NULL; + } + + /* Allocate GPU memory for the new chunk. 
*/ + chunk->region = + kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, &chunk->gpu_va, mmu_sync_info); + + if (unlikely(!chunk->region)) { + dev_err(kctx->kbdev->dev, "Failed to allocate a tiler heap chunk!\n"); + goto unroll_chunk; + } + + kbase_gpu_vm_lock(kctx); + + /* Some checks done here as KBASE_REG_NO_USER_FREE still allows such things to be made + * whilst we had dropped the region lock + */ + if (unlikely(atomic_read(&chunk->region->gpu_alloc->kernel_mappings) > 0)) { + dev_err(kctx->kbdev->dev, "Chunk region has active kernel mappings!\n"); + goto unroll_region; + } + + /* Whilst we can be sure of a number of other restrictions due to BASEP_MEM_NO_USER_FREE + * being requested, it's useful to document in code what those restrictions are, and ensure + * they remain in place in future. + */ + if (WARN(!chunk->region->gpu_alloc, + "KBASE_REG_NO_USER_FREE chunks should not have had their alloc freed")) { + goto unroll_region; + } + + if (WARN(chunk->region->gpu_alloc->type != KBASE_MEM_TYPE_NATIVE, + "KBASE_REG_NO_USER_FREE chunks should not have been freed and then reallocated as imported/non-native regions")) { + goto unroll_region; + } + + if (WARN((chunk->region->flags & KBASE_REG_ACTIVE_JIT_ALLOC), + "KBASE_REG_NO_USER_FREE chunks should not have been freed and then reallocated as JIT regions")) { + goto unroll_region; + } + + if (WARN((chunk->region->flags & KBASE_REG_DONT_NEED), + "KBASE_REG_NO_USER_FREE chunks should not have been made ephemeral")) { + goto unroll_region; + } + + if (WARN(atomic_read(&chunk->region->cpu_alloc->gpu_mappings) > 1, + "KBASE_REG_NO_USER_FREE chunks should not have been aliased")) { + goto unroll_region; + } + + if (unlikely(!kbase_vmap_reg(kctx, chunk->region, chunk->gpu_va, chunk_kernel_map_size, + (KBASE_REG_CPU_RD | KBASE_REG_CPU_WR), &chunk->map, + KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING))) { + dev_err(kctx->kbdev->dev, "Failed to map chunk header for shrinking!\n"); + goto unroll_region; + } + + remove_external_chunk_mappings(kctx, chunk); + kbase_gpu_vm_unlock(kctx); + + return chunk; + +unroll_region: + /* KBASE_REG_DONT_NEED regions will be confused with ephemeral regions (inc freed JIT + * regions), and so we must clear that flag too before freeing. + */ +#if !defined(CONFIG_MALI_VECTOR_DUMP) + chunk->region->flags &= ~(KBASE_REG_NO_USER_FREE | KBASE_REG_DONT_NEED); +#else + chunk->region->flags &= ~KBASE_REG_NO_USER_FREE; +#endif + kbase_mem_free_region(kctx, chunk->region); + kbase_gpu_vm_unlock(kctx); +unroll_chunk: + kfree(chunk); + return NULL; } /** * create_chunk - Create a tiler heap chunk * * @heap: Pointer to the tiler heap for which to allocate memory. - * @link_with_prev: Flag to indicate if the chunk to be allocated needs to be - * linked with the previously allocated chunk. * - * This function allocates a chunk of memory for a tiler heap and adds it to - * the end of the list of chunks associated with that heap. The size of the - * chunk is not a parameter because it is configured per-heap not per-chunk. + * This function allocates a chunk of memory for a tiler heap, adds it to the + * the list of chunks associated with that heap both on the host side and in GPU + * memory. * * Return: 0 if successful or a negative error code on failure. 
*/ -static int create_chunk(struct kbase_csf_tiler_heap *const heap, - bool link_with_prev) +static int create_chunk(struct kbase_csf_tiler_heap *const heap) { int err = 0; - struct kbase_context *const kctx = heap->kctx; - u64 nr_pages = PFN_UP(heap->chunk_size); - u64 flags = BASE_MEM_PROT_GPU_RD | BASE_MEM_PROT_GPU_WR | - BASE_MEM_PROT_CPU_WR | BASEP_MEM_NO_USER_FREE | - BASE_MEM_COHERENT_LOCAL; struct kbase_csf_tiler_heap_chunk *chunk = NULL; - /* Calls to this function are inherently synchronous, with respect to - * MMU operations. - */ - const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; - - flags |= kbase_mem_group_id_set(kctx->jit_group_id); - -#if defined(CONFIG_MALI_BIFROST_DEBUG) || defined(CONFIG_MALI_VECTOR_DUMP) - flags |= BASE_MEM_PROT_CPU_RD; -#endif - - chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + chunk = alloc_new_chunk(heap->kctx, heap->chunk_size); if (unlikely(!chunk)) { - dev_err(kctx->kbdev->dev, - "No kernel memory for a new tiler heap chunk\n"); - return -ENOMEM; - } - - /* Allocate GPU memory for the new chunk. */ - INIT_LIST_HEAD(&chunk->link); - chunk->region = - kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, &chunk->gpu_va, mmu_sync_info); - - if (unlikely(!chunk->region)) { - dev_err(kctx->kbdev->dev, - "Failed to allocate a tiler heap chunk\n"); err = -ENOMEM; - } else { - err = init_chunk(heap, chunk, link_with_prev); - if (unlikely(err)) { - kbase_gpu_vm_lock(kctx); - chunk->region->flags &= ~KBASE_REG_NO_USER_FREE; - kbase_mem_free_region(kctx, chunk->region); - kbase_gpu_vm_unlock(kctx); - } + goto allocation_failure; } - if (unlikely(err)) { - kfree(chunk); - } else { - list_add_tail(&chunk->link, &heap->chunks_list); - heap->chunk_count++; + mutex_lock(&heap->kctx->csf.tiler_heaps.lock); + err = init_chunk(heap, chunk, true); + mutex_unlock(&heap->kctx->csf.tiler_heaps.lock); - dev_dbg(kctx->kbdev->dev, "Created tiler heap chunk 0x%llX\n", - chunk->gpu_va); - } + if (unlikely(err)) + goto initialization_failure; + dev_dbg(heap->kctx->kbdev->dev, "Created tiler heap chunk 0x%llX\n", chunk->gpu_va); + + return 0; +initialization_failure: + remove_unlinked_chunk(heap->kctx, chunk); +allocation_failure: return err; } /** - * delete_chunk - Delete a tiler heap chunk - * - * @heap: Pointer to the tiler heap for which @chunk was allocated. - * @chunk: Pointer to a chunk to be deleted. - * - * This function frees a tiler heap chunk previously allocated by @create_chunk - * and removes it from the list of chunks associated with the heap. - * - * WARNING: The deleted chunk is not unlinked from the list of chunks used by - * the GPU, therefore it is only safe to use this function when - * deleting a heap. - */ -static void delete_chunk(struct kbase_csf_tiler_heap *const heap, - struct kbase_csf_tiler_heap_chunk *const chunk) -{ - struct kbase_context *const kctx = heap->kctx; - - kbase_gpu_vm_lock(kctx); - chunk->region->flags &= ~KBASE_REG_NO_USER_FREE; - kbase_mem_free_region(kctx, chunk->region); - kbase_gpu_vm_unlock(kctx); - list_del(&chunk->link); - heap->chunk_count--; - kfree(chunk); -} - -/** - * delete_all_chunks - Delete all chunks belonging to a tiler heap + * delete_all_chunks - Delete all chunks belonging to an unlinked tiler heap * * @heap: Pointer to a tiler heap. * - * This function empties the list of chunks associated with a tiler heap by - * freeing all chunks previously allocated by @create_chunk. 
+ * This function empties the list of chunks associated with a tiler heap by freeing all chunks + * previously allocated by @create_chunk. + * + * The heap must not be reachable from a &struct kbase_context.csf.tiler_heaps.list, as the + * tiler_heaps lock cannot be held whilst deleting its chunks due to also needing the &struct + * kbase_context.region_lock. + * + * WARNING: Whilst the deleted chunks are unlinked from host memory, they are not unlinked from the + * list of chunks used by the GPU, therefore it is only safe to use this function when + * deleting a heap. */ static void delete_all_chunks(struct kbase_csf_tiler_heap *heap) { + struct kbase_context *const kctx = heap->kctx; struct list_head *entry = NULL, *tmp = NULL; + WARN(!list_empty(&heap->link), + "Deleting a heap's chunks when that heap is still linked requires the tiler_heaps lock, which cannot be held by the caller"); + list_for_each_safe(entry, tmp, &heap->chunks_list) { struct kbase_csf_tiler_heap_chunk *chunk = list_entry( entry, struct kbase_csf_tiler_heap_chunk, link); - delete_chunk(heap, chunk); + list_del_init(&chunk->link); + heap->chunk_count--; + + remove_unlinked_chunk(kctx, chunk); } } @@ -299,7 +453,7 @@ static int create_initial_chunks(struct kbase_csf_tiler_heap *const heap, u32 i; for (i = 0; (i < nchunks) && likely(!err); i++) - err = create_chunk(heap, true); + err = create_chunk(heap); if (unlikely(err)) delete_all_chunks(heap); @@ -308,14 +462,17 @@ static int create_initial_chunks(struct kbase_csf_tiler_heap *const heap, } /** - * delete_heap - Delete a tiler heap + * delete_heap - Delete an unlinked tiler heap * * @heap: Pointer to a tiler heap to be deleted. * * This function frees any chunks allocated for a tiler heap previously - * initialized by @kbase_csf_tiler_heap_init and removes it from the list of - * heaps associated with the kbase context. The heap context structure used by + * initialized by @kbase_csf_tiler_heap_init. The heap context structure used by * the firmware is also freed. + * + * The heap must not be reachable from a &struct kbase_context.csf.tiler_heaps.list, as the + * tiler_heaps lock cannot be held whilst deleting it due to also needing the &struct + * kbase_context.region_lock. */ static void delete_heap(struct kbase_csf_tiler_heap *heap) { @@ -323,23 +480,41 @@ static void delete_heap(struct kbase_csf_tiler_heap *heap) dev_dbg(kctx->kbdev->dev, "Deleting tiler heap 0x%llX\n", heap->gpu_va); - lockdep_assert_held(&kctx->csf.tiler_heaps.lock); + WARN(!list_empty(&heap->link), + "Deleting a heap that is still linked requires the tiler_heaps lock, which cannot be held by the caller"); + /* Make sure that all of the VA regions corresponding to the chunks are + * freed at this time and that the work queue is not trying to access freed + * memory. + * + * Note: since the heap is unlinked, and that no references are made to chunks other + * than from their heap, there is no need to separately move the chunks out of the + * heap->chunks_list to delete them. + */ delete_all_chunks(heap); + kbase_vunmap(kctx, &heap->gpu_va_map); /* We could optimize context destruction by not freeing leaked heap - * contexts but it doesn't seem worth the extra complexity. + * contexts but it doesn't seem worth the extra complexity. After this + * point, the suballocation is returned to the heap context allocator and + * may be overwritten with new data, meaning heap->gpu_va should not + * be used past this point. 
*/ kbase_csf_heap_context_allocator_free(&kctx->csf.tiler_heaps.ctx_alloc, heap->gpu_va); - list_del(&heap->link); - WARN_ON(heap->chunk_count); KBASE_TLSTREAM_AUX_TILER_HEAP_STATS(kctx->kbdev, kctx->id, heap->heap_id, 0, 0, heap->max_chunks, heap->chunk_size, 0, heap->target_in_flight, 0); + if (heap->buf_desc_reg) { + kbase_vunmap(kctx, &heap->buf_desc_map); + kbase_gpu_vm_lock(kctx); + heap->buf_desc_reg->flags &= ~KBASE_REG_NO_USER_FREE; + kbase_gpu_vm_unlock(kctx); + } + kfree(heap); } @@ -375,6 +550,23 @@ static struct kbase_csf_tiler_heap *find_tiler_heap( return NULL; } +static struct kbase_csf_tiler_heap_chunk *find_chunk(struct kbase_csf_tiler_heap *heap, + u64 const chunk_gpu_va) +{ + struct kbase_csf_tiler_heap_chunk *chunk = NULL; + + lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); + + list_for_each_entry(chunk, &heap->chunks_list, link) { + if (chunk->gpu_va == chunk_gpu_va) + return chunk; + } + + dev_dbg(heap->kctx->kbdev->dev, "Tiler heap chunk 0x%llX was not found\n", chunk_gpu_va); + + return NULL; +} + int kbase_csf_tiler_heap_context_init(struct kbase_context *const kctx) { int err = kbase_csf_heap_context_allocator_init( @@ -393,37 +585,88 @@ int kbase_csf_tiler_heap_context_init(struct kbase_context *const kctx) void kbase_csf_tiler_heap_context_term(struct kbase_context *const kctx) { + LIST_HEAD(local_heaps_list); struct list_head *entry = NULL, *tmp = NULL; dev_dbg(kctx->kbdev->dev, "Terminating a context for tiler heaps\n"); mutex_lock(&kctx->csf.tiler_heaps.lock); + list_splice_init(&kctx->csf.tiler_heaps.list, &local_heaps_list); + mutex_unlock(&kctx->csf.tiler_heaps.lock); - list_for_each_safe(entry, tmp, &kctx->csf.tiler_heaps.list) { + list_for_each_safe(entry, tmp, &local_heaps_list) { struct kbase_csf_tiler_heap *heap = list_entry( entry, struct kbase_csf_tiler_heap, link); + + list_del_init(&heap->link); delete_heap(heap); } - mutex_unlock(&kctx->csf.tiler_heaps.lock); mutex_destroy(&kctx->csf.tiler_heaps.lock); kbase_csf_heap_context_allocator_term(&kctx->csf.tiler_heaps.ctx_alloc); } -int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, - u32 const chunk_size, u32 const initial_chunks, u32 const max_chunks, - u16 const target_in_flight, u64 *const heap_gpu_va, - u64 *const first_chunk_va) +/** + * kbasep_is_buffer_descriptor_region_suitable - Check if a VA region chosen to house + * the tiler heap buffer descriptor + * is suitable for the purpose. + * @kctx: kbase context of the tiler heap + * @reg: VA region being checked for suitability + * + * The tiler heap buffer descriptor memory does not admit page faults according + * to its design, so it must have the entirety of the backing upon allocation, + * and it has to remain alive as long as the tiler heap is alive, meaning it + * cannot be allocated from JIT/Ephemeral, or user freeable memory. + * + * Return: true on suitability, false otherwise. 
+ */ +static bool kbasep_is_buffer_descriptor_region_suitable(struct kbase_context *const kctx, + struct kbase_va_region *const reg) +{ + if (kbase_is_region_invalid_or_free(reg)) { + dev_err(kctx->kbdev->dev, "Region is either invalid or free!\n"); + return false; + } + + if (!(reg->flags & KBASE_REG_CPU_RD) || (reg->flags & KBASE_REG_DONT_NEED) || + (reg->flags & KBASE_REG_PF_GROW) || (reg->flags & KBASE_REG_ACTIVE_JIT_ALLOC)) { + dev_err(kctx->kbdev->dev, "Region has invalid flags: 0x%lX!\n", reg->flags); + return false; + } + + if (reg->gpu_alloc->type != KBASE_MEM_TYPE_NATIVE) { + dev_err(kctx->kbdev->dev, "Region has invalid type!\n"); + return false; + } + + if ((reg->nr_pages != kbase_reg_current_backed_size(reg)) || + (reg->nr_pages < PFN_UP(sizeof(struct kbase_csf_gpu_buffer_heap)))) { + dev_err(kctx->kbdev->dev, "Region has invalid backing!\n"); + return false; + } + + return true; +} + +#define TILER_BUF_DESC_SIZE (sizeof(struct kbase_csf_gpu_buffer_heap)) + +int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, u32 const chunk_size, + u32 const initial_chunks, u32 const max_chunks, + u16 const target_in_flight, u64 const buf_desc_va, + u64 *const heap_gpu_va, u64 *const first_chunk_va) { int err = 0; struct kbase_csf_tiler_heap *heap = NULL; struct kbase_csf_heap_context_allocator *const ctx_alloc = &kctx->csf.tiler_heaps.ctx_alloc; + struct kbase_csf_tiler_heap_chunk *chunk = NULL; + struct kbase_va_region *gpu_va_reg = NULL; + void *vmap_ptr = NULL; dev_dbg(kctx->kbdev->dev, - "Creating a tiler heap with %u chunks (limit: %u) of size %u\n", - initial_chunks, max_chunks, chunk_size); + "Creating a tiler heap with %u chunks (limit: %u) of size %u, buf_desc_va: 0x%llx\n", + initial_chunks, max_chunks, chunk_size, buf_desc_va); if (!kbase_mem_allow_alloc(kctx)) return -EINVAL; @@ -445,8 +688,7 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, heap = kzalloc(sizeof(*heap), GFP_KERNEL); if (unlikely(!heap)) { - dev_err(kctx->kbdev->dev, - "No kernel memory for a new tiler heap\n"); + dev_err(kctx->kbdev->dev, "No kernel memory for a new tiler heap"); return -ENOMEM; } @@ -454,57 +696,126 @@ int kbase_csf_tiler_heap_init(struct kbase_context *const kctx, heap->chunk_size = chunk_size; heap->max_chunks = max_chunks; heap->target_in_flight = target_in_flight; + heap->buf_desc_checked = false; INIT_LIST_HEAD(&heap->chunks_list); + INIT_LIST_HEAD(&heap->link); + + /* Check on the buffer descriptor virtual Address */ + if (buf_desc_va) { + struct kbase_va_region *buf_desc_reg; + + kbase_gpu_vm_lock(kctx); + buf_desc_reg = + kbase_region_tracker_find_region_enclosing_address(kctx, buf_desc_va); + + if (!kbasep_is_buffer_descriptor_region_suitable(kctx, buf_desc_reg)) { + kbase_gpu_vm_unlock(kctx); + dev_err(kctx->kbdev->dev, + "Could not find a suitable VA region for the tiler heap buf desc!\n"); + err = -EINVAL; + goto buf_desc_not_suitable; + } + + /* If we don't prevent userspace from unmapping this, we may run into + * use-after-free, as we don't check for the existence of the region throughout. 
+ */ + buf_desc_reg->flags |= KBASE_REG_NO_USER_FREE; + + heap->buf_desc_va = buf_desc_va; + heap->buf_desc_reg = buf_desc_reg; + + vmap_ptr = kbase_vmap_reg(kctx, buf_desc_reg, buf_desc_va, TILER_BUF_DESC_SIZE, + KBASE_REG_CPU_RD, &heap->buf_desc_map, + KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING); + kbase_gpu_vm_unlock(kctx); + + if (unlikely(!vmap_ptr)) { + dev_err(kctx->kbdev->dev, + "Could not vmap buffer descriptor into kernel memory (err %d)\n", + err); + err = -ENOMEM; + goto buf_desc_vmap_failed; + } + } heap->gpu_va = kbase_csf_heap_context_allocator_alloc(ctx_alloc); - if (unlikely(!heap->gpu_va)) { - dev_dbg(kctx->kbdev->dev, - "Failed to allocate a tiler heap context"); + dev_dbg(kctx->kbdev->dev, "Failed to allocate a tiler heap context\n"); err = -ENOMEM; - } else { - err = create_initial_chunks(heap, initial_chunks); - if (unlikely(err)) - kbase_csf_heap_context_allocator_free(ctx_alloc, heap->gpu_va); + goto heap_context_alloc_failed; } + gpu_va_reg = ctx_alloc->region; + + kbase_gpu_vm_lock(kctx); + /* gpu_va_reg was created with BASEP_MEM_NO_USER_FREE, the code to unset this only happens + * on kctx termination (after all syscalls on kctx have finished), and so it is safe to + * assume that gpu_va_reg is still present. + */ + vmap_ptr = kbase_vmap_reg(kctx, gpu_va_reg, heap->gpu_va, NEXT_CHUNK_ADDR_SIZE, + (KBASE_REG_CPU_RD | KBASE_REG_CPU_WR), &heap->gpu_va_map, + KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING); + kbase_gpu_vm_unlock(kctx); + if (unlikely(!vmap_ptr)) { + dev_dbg(kctx->kbdev->dev, "Failed to vmap the correct heap GPU VA address\n"); + err = -ENOMEM; + goto heap_context_vmap_failed; + } + + err = create_initial_chunks(heap, initial_chunks); if (unlikely(err)) { - kfree(heap); - } else { - struct kbase_csf_tiler_heap_chunk const *chunk = list_first_entry( - &heap->chunks_list, struct kbase_csf_tiler_heap_chunk, link); + dev_dbg(kctx->kbdev->dev, "Failed to create the initial tiler heap chunks\n"); + goto create_chunks_failed; + } + chunk = list_first_entry(&heap->chunks_list, struct kbase_csf_tiler_heap_chunk, link); - *heap_gpu_va = heap->gpu_va; - *first_chunk_va = chunk->gpu_va; + *heap_gpu_va = heap->gpu_va; + *first_chunk_va = chunk->gpu_va; - mutex_lock(&kctx->csf.tiler_heaps.lock); - kctx->csf.tiler_heaps.nr_of_heaps++; - heap->heap_id = kctx->csf.tiler_heaps.nr_of_heaps; - list_add(&heap->link, &kctx->csf.tiler_heaps.list); + mutex_lock(&kctx->csf.tiler_heaps.lock); + kctx->csf.tiler_heaps.nr_of_heaps++; + heap->heap_id = kctx->csf.tiler_heaps.nr_of_heaps; + list_add(&heap->link, &kctx->csf.tiler_heaps.list); - KBASE_TLSTREAM_AUX_TILER_HEAP_STATS( - kctx->kbdev, kctx->id, heap->heap_id, - PFN_UP(heap->chunk_size * heap->max_chunks), - PFN_UP(heap->chunk_size * heap->chunk_count), heap->max_chunks, - heap->chunk_size, heap->chunk_count, heap->target_in_flight, 0); + KBASE_TLSTREAM_AUX_TILER_HEAP_STATS(kctx->kbdev, kctx->id, heap->heap_id, + PFN_UP(heap->chunk_size * heap->max_chunks), + PFN_UP(heap->chunk_size * heap->chunk_count), + heap->max_chunks, heap->chunk_size, heap->chunk_count, + heap->target_in_flight, 0); #if defined(CONFIG_MALI_VECTOR_DUMP) - list_for_each_entry(chunk, &heap->chunks_list, link) { - KBASE_TLSTREAM_JD_TILER_HEAP_CHUNK_ALLOC( - kctx->kbdev, kctx->id, heap->heap_id, chunk->gpu_va); - } -#endif - - dev_dbg(kctx->kbdev->dev, "Created tiler heap 0x%llX\n", heap->gpu_va); - mutex_unlock(&kctx->csf.tiler_heaps.lock); - kctx->running_total_tiler_heap_nr_chunks += heap->chunk_count; - kctx->running_total_tiler_heap_memory += - 
heap->chunk_size * heap->chunk_count; - if (kctx->running_total_tiler_heap_memory > - kctx->peak_total_tiler_heap_memory) - kctx->peak_total_tiler_heap_memory = - kctx->running_total_tiler_heap_memory; + list_for_each_entry(chunk, &heap->chunks_list, link) { + KBASE_TLSTREAM_JD_TILER_HEAP_CHUNK_ALLOC(kctx->kbdev, kctx->id, heap->heap_id, + chunk->gpu_va); } +#endif + kctx->running_total_tiler_heap_nr_chunks += heap->chunk_count; + kctx->running_total_tiler_heap_memory += (u64)heap->chunk_size * heap->chunk_count; + if (kctx->running_total_tiler_heap_memory > kctx->peak_total_tiler_heap_memory) + kctx->peak_total_tiler_heap_memory = kctx->running_total_tiler_heap_memory; + + dev_dbg(kctx->kbdev->dev, + "Created tiler heap 0x%llX, buffer descriptor 0x%llX, ctx_%d_%d\n", heap->gpu_va, + buf_desc_va, kctx->tgid, kctx->id); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + + return 0; + +create_chunks_failed: + kbase_vunmap(kctx, &heap->gpu_va_map); +heap_context_vmap_failed: + kbase_csf_heap_context_allocator_free(ctx_alloc, heap->gpu_va); +heap_context_alloc_failed: + if (heap->buf_desc_reg) + kbase_vunmap(kctx, &heap->buf_desc_map); +buf_desc_vmap_failed: + if (heap->buf_desc_reg) { + kbase_gpu_vm_lock(kctx); + heap->buf_desc_reg->flags &= ~KBASE_REG_NO_USER_FREE; + kbase_gpu_vm_unlock(kctx); + } +buf_desc_not_suitable: + kfree(heap); return err; } @@ -517,16 +828,19 @@ int kbase_csf_tiler_heap_term(struct kbase_context *const kctx, u64 heap_size = 0; mutex_lock(&kctx->csf.tiler_heaps.lock); - heap = find_tiler_heap(kctx, heap_gpu_va); if (likely(heap)) { chunk_count = heap->chunk_count; heap_size = heap->chunk_size * chunk_count; - delete_heap(heap); - } else - err = -EINVAL; - mutex_unlock(&kctx->csf.tiler_heaps.lock); + list_del_init(&heap->link); + } else { + err = -EINVAL; + } + + /* Update stats whilst still holding the lock so they are in sync with the tiler_heaps.list + * at all times + */ if (likely(kctx->running_total_tiler_heap_memory >= heap_size)) kctx->running_total_tiler_heap_memory -= heap_size; else @@ -537,36 +851,27 @@ int kbase_csf_tiler_heap_term(struct kbase_context *const kctx, else dev_warn(kctx->kbdev->dev, "Running total tiler chunk count lower than expected!"); + if (!err) + dev_dbg(kctx->kbdev->dev, + "Terminated tiler heap 0x%llX, buffer descriptor 0x%llX, ctx_%d_%d\n", + heap->gpu_va, heap->buf_desc_va, kctx->tgid, kctx->id); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + + /* Deletion requires the kctx->reg_lock, so must only operate on it whilst unlinked from + * the kctx's csf.tiler_heaps.list, and without holding the csf.tiler_heaps.lock + */ + if (likely(heap)) + delete_heap(heap); + return err; } -/** - * alloc_new_chunk - Allocate a new chunk for the tiler heap. - * - * @heap: Pointer to the tiler heap. - * @nr_in_flight: Number of render passes that are in-flight, must not be zero. - * @pending_frag_count: Number of render passes in-flight with completed vertex/tiler stage. - * The minimum value is zero but it must be less or equal to - * the total number of render passes in flight - * @new_chunk_ptr: Where to store the GPU virtual address & size of the new - * chunk allocated for the heap. - * - * This function will allocate a new chunk for the chunked tiler heap depending - * on the settings provided by userspace when the heap was created and the - * heap's statistics (like number of render passes in-flight). - * - * Return: 0 if a new chunk was allocated otherwise an appropriate negative - * error code. 
- */ -static int alloc_new_chunk(struct kbase_csf_tiler_heap *heap, - u32 nr_in_flight, u32 pending_frag_count, u64 *new_chunk_ptr) +static int validate_allocation_request(struct kbase_csf_tiler_heap *heap, u32 nr_in_flight, + u32 pending_frag_count) { - int err = -ENOMEM; - lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); - if (WARN_ON(!nr_in_flight) || - WARN_ON(pending_frag_count > nr_in_flight)) + if (WARN_ON(!nr_in_flight) || WARN_ON(pending_frag_count > nr_in_flight)) return -EINVAL; if (nr_in_flight <= heap->target_in_flight) { @@ -574,66 +879,446 @@ static int alloc_new_chunk(struct kbase_csf_tiler_heap *heap, /* Not exceeded the target number of render passes yet so be * generous with memory. */ - err = create_chunk(heap, false); - - if (likely(!err)) { - struct kbase_csf_tiler_heap_chunk *new_chunk = - get_last_chunk(heap); - if (!WARN_ON(!new_chunk)) { - *new_chunk_ptr = - encode_chunk_ptr(heap->chunk_size, - new_chunk->gpu_va); - return 0; - } - } + return 0; } else if (pending_frag_count > 0) { - err = -EBUSY; + return -EBUSY; } else { - err = -ENOMEM; + return -ENOMEM; } } else { /* Reached target number of render passes in flight. * Wait for some of them to finish */ - err = -EBUSY; + return -EBUSY; } - - return err; + return -ENOMEM; } int kbase_csf_tiler_heap_alloc_new_chunk(struct kbase_context *kctx, u64 gpu_heap_va, u32 nr_in_flight, u32 pending_frag_count, u64 *new_chunk_ptr) { struct kbase_csf_tiler_heap *heap; + struct kbase_csf_tiler_heap_chunk *chunk; int err = -EINVAL; + u64 chunk_size = 0; + u64 heap_id = 0; + + /* To avoid potential locking issues during allocation, this is handled + * in three phases: + * 1. Take the lock, find the corresponding heap, and find its chunk size + * (this is always 2 MB, but may change down the line). + * 2. Allocate memory for the chunk and its region. + * 3. If the heap still exists, link it to the end of the list. If it + * doesn't, roll back the allocation. + */ mutex_lock(&kctx->csf.tiler_heaps.lock); + heap = find_tiler_heap(kctx, gpu_heap_va); + if (likely(heap)) { + chunk_size = heap->chunk_size; + heap_id = heap->heap_id; + } else { + dev_err(kctx->kbdev->dev, "Heap 0x%llX does not exist", gpu_heap_va); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + goto prelink_failure; + } + err = validate_allocation_request(heap, nr_in_flight, pending_frag_count); + if (unlikely(err)) { + dev_err(kctx->kbdev->dev, + "Not allocating new chunk for heap 0x%llX due to current heap state (err %d)", + gpu_heap_va, err); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + goto prelink_failure; + } + mutex_unlock(&kctx->csf.tiler_heaps.lock); + /* this heap must not be used whilst we have dropped the lock */ + heap = NULL; + + chunk = alloc_new_chunk(kctx, chunk_size); + if (unlikely(!chunk)) { + dev_err(kctx->kbdev->dev, "Could not allocate chunk of size %lld for ctx %d_%d", + chunk_size, kctx->tgid, kctx->id); + goto prelink_failure; + } + + /* After this point, the heap that we were targeting could already have had the needed + * chunks allocated, if we were handling multiple OoM events on multiple threads, so + * we need to revalidate the need for the allocation. 
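As a rough standalone illustration of this three-phase, drop-the-lock-then-revalidate pattern (hypothetical names, plain pthreads rather than the kbase mutex API):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct demo_heap {
	bool alive;          // cleared when the heap is torn down
	size_t chunk_size;
	void *last_chunk;
};

static pthread_mutex_t demo_heaps_lock = PTHREAD_MUTEX_INITIALIZER;

static int demo_add_chunk(struct demo_heap *h)
{
	size_t chunk_size;
	void *chunk;

	// Phase 1: read what is needed for the allocation under the lock.
	pthread_mutex_lock(&demo_heaps_lock);
	if (!h->alive) {
		pthread_mutex_unlock(&demo_heaps_lock);
		return -1;
	}
	chunk_size = h->chunk_size;
	pthread_mutex_unlock(&demo_heaps_lock);

	// Phase 2: the potentially slow allocation, done with the lock dropped.
	chunk = malloc(chunk_size);
	if (!chunk)
		return -1;

	// Phase 3: retake the lock and revalidate before linking the result.
	pthread_mutex_lock(&demo_heaps_lock);
	if (!h->alive) {
		pthread_mutex_unlock(&demo_heaps_lock);
		free(chunk);    // roll back: the heap disappeared meanwhile
		return -1;
	}
	h->last_chunk = chunk;
	pthread_mutex_unlock(&demo_heaps_lock);

	return 0;
}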
+ */ + mutex_lock(&kctx->csf.tiler_heaps.lock); heap = find_tiler_heap(kctx, gpu_heap_va); - if (likely(heap)) { - err = alloc_new_chunk(heap, nr_in_flight, pending_frag_count, - new_chunk_ptr); - if (likely(!err)) { - /* update total and peak tiler heap memory record */ - kctx->running_total_tiler_heap_nr_chunks++; - kctx->running_total_tiler_heap_memory += heap->chunk_size; - - if (kctx->running_total_tiler_heap_memory > - kctx->peak_total_tiler_heap_memory) - kctx->peak_total_tiler_heap_memory = - kctx->running_total_tiler_heap_memory; - } - - KBASE_TLSTREAM_AUX_TILER_HEAP_STATS( - kctx->kbdev, kctx->id, heap->heap_id, - PFN_UP(heap->chunk_size * heap->max_chunks), - PFN_UP(heap->chunk_size * heap->chunk_count), - heap->max_chunks, heap->chunk_size, heap->chunk_count, - heap->target_in_flight, nr_in_flight); + if (unlikely(!heap)) { + dev_err(kctx->kbdev->dev, "Tiler heap 0x%llX no longer exists!\n", gpu_heap_va); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + goto unroll_chunk; } + if (heap_id != heap->heap_id) { + dev_err(kctx->kbdev->dev, + "Tiler heap 0x%llX was removed from ctx %d_%d while allocating chunk of size %lld!", + gpu_heap_va, kctx->tgid, kctx->id, chunk_size); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + goto unroll_chunk; + } + + if (WARN_ON(chunk_size != heap->chunk_size)) { + mutex_unlock(&kctx->csf.tiler_heaps.lock); + goto unroll_chunk; + } + + err = validate_allocation_request(heap, nr_in_flight, pending_frag_count); + if (unlikely(err)) { + dev_warn( + kctx->kbdev->dev, + "Aborting linking chunk to heap 0x%llX: heap state changed during allocation (err %d)", + gpu_heap_va, err); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + goto unroll_chunk; + } + + err = init_chunk(heap, chunk, false); + + /* On error, the chunk would not be linked, so we can still treat it as an unlinked + * chunk for error handling. 
+ */ + if (unlikely(err)) { + dev_err(kctx->kbdev->dev, + "Could not link chunk(0x%llX) with tiler heap 0%llX in ctx %d_%d due to error %d", + chunk->gpu_va, gpu_heap_va, kctx->tgid, kctx->id, err); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + goto unroll_chunk; + } + + *new_chunk_ptr = encode_chunk_ptr(heap->chunk_size, chunk->gpu_va); + + /* update total and peak tiler heap memory record */ + kctx->running_total_tiler_heap_nr_chunks++; + kctx->running_total_tiler_heap_memory += heap->chunk_size; + + if (kctx->running_total_tiler_heap_memory > kctx->peak_total_tiler_heap_memory) + kctx->peak_total_tiler_heap_memory = kctx->running_total_tiler_heap_memory; + + KBASE_TLSTREAM_AUX_TILER_HEAP_STATS(kctx->kbdev, kctx->id, heap->heap_id, + PFN_UP(heap->chunk_size * heap->max_chunks), + PFN_UP(heap->chunk_size * heap->chunk_count), + heap->max_chunks, heap->chunk_size, heap->chunk_count, + heap->target_in_flight, nr_in_flight); + mutex_unlock(&kctx->csf.tiler_heaps.lock); + return err; +unroll_chunk: + remove_unlinked_chunk(kctx, chunk); +prelink_failure: return err; } + +static bool delete_chunk_physical_pages(struct kbase_csf_tiler_heap *heap, u64 chunk_gpu_va, + u64 *hdr_val) +{ + int err; + u64 *chunk_hdr; + struct kbase_context *kctx = heap->kctx; + struct kbase_csf_tiler_heap_chunk *chunk = NULL; + + lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); + + chunk = find_chunk(heap, chunk_gpu_va); + if (unlikely(!chunk)) { + dev_warn(kctx->kbdev->dev, + "Failed to find tiler heap(0x%llX) chunk(0x%llX) for reclaim-delete\n", + heap->gpu_va, chunk_gpu_va); + return false; + } + + WARN((chunk->region->flags & KBASE_REG_CPU_CACHED), + "Cannot support CPU cached chunks without sync operations"); + chunk_hdr = chunk->map.addr; + *hdr_val = *chunk_hdr; + + dev_dbg(kctx->kbdev->dev, + "Reclaim: delete chunk(0x%llx) in heap(0x%llx), header value(0x%llX)\n", + chunk_gpu_va, heap->gpu_va, *hdr_val); + + err = kbase_mem_shrink_gpu_mapping(kctx, chunk->region, 0, chunk->region->gpu_alloc->nents); + if (unlikely(err)) { + dev_warn( + kctx->kbdev->dev, + "Reclaim: shrinking GPU mapping failed on chunk(0x%llx) in heap(0x%llx) (err %d)\n", + chunk_gpu_va, heap->gpu_va, err); + + /* Cannot free the pages whilst references on the GPU remain, so keep the chunk on + * the heap's chunk list and try a different heap. + */ + + return false; + } + /* Destroy the mapping before the physical pages which are mapped are destroyed. */ + kbase_vunmap(kctx, &chunk->map); + + err = kbase_free_phy_pages_helper(chunk->region->gpu_alloc, + chunk->region->gpu_alloc->nents); + if (unlikely(err)) { + dev_warn( + kctx->kbdev->dev, + "Reclaim: remove physical backing failed on chunk(0x%llx) in heap(0x%llx) (err %d), continuing with deferred removal\n", + chunk_gpu_va, heap->gpu_va, err); + + /* kbase_free_phy_pages_helper() should only fail on invalid input, and WARNs + * anyway, so continue instead of returning early. + * + * Indeed, we don't want to leave the chunk on the heap's chunk list whilst it has + * its mapping removed, as that could lead to problems. It's safest to instead + * continue with deferred destruction of the chunk. 
+ */ + } + + dev_dbg(kctx->kbdev->dev, + "Reclaim: delete chunk(0x%llx) in heap(0x%llx), header value(0x%llX)\n", + chunk_gpu_va, heap->gpu_va, *hdr_val); + + mutex_lock(&heap->kctx->jit_evict_lock); + list_move(&chunk->region->jit_node, &kctx->jit_destroy_head); + mutex_unlock(&heap->kctx->jit_evict_lock); + + list_del(&chunk->link); + heap->chunk_count--; + kfree(chunk); + + return true; +} + +static void sanity_check_gpu_buffer_heap(struct kbase_csf_tiler_heap *heap, + struct kbase_csf_gpu_buffer_heap *desc) +{ + u64 first_hoarded_chunk_gpu_va = desc->pointer & CHUNK_ADDR_MASK; + + lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); + + if (first_hoarded_chunk_gpu_va) { + struct kbase_csf_tiler_heap_chunk *chunk = + find_chunk(heap, first_hoarded_chunk_gpu_va); + + if (likely(chunk)) { + dev_dbg(heap->kctx->kbdev->dev, + "Buffer descriptor 0x%llX sanity check ok, HW reclaim allowed\n", + heap->buf_desc_va); + + heap->buf_desc_checked = true; + return; + } + } + /* If there is no match, defer the check to next time */ + dev_dbg(heap->kctx->kbdev->dev, "Buffer descriptor 0x%llX runtime sanity check deferred\n", + heap->buf_desc_va); +} + +static bool can_read_hw_gpu_buffer_heap(struct kbase_csf_tiler_heap *heap, u64 *chunk_gpu_va_ptr) +{ + struct kbase_context *kctx = heap->kctx; + + lockdep_assert_held(&kctx->csf.tiler_heaps.lock); + + /* Initialize the descriptor pointer value to 0 */ + *chunk_gpu_va_ptr = 0; + + /* The BufferDescriptor on heap is a hint on creation, do a sanity check at runtime */ + if (heap->buf_desc_reg && !heap->buf_desc_checked) { + struct kbase_csf_gpu_buffer_heap *desc = heap->buf_desc_map.addr; + + /* BufferDescriptor is supplied by userspace, so could be CPU-cached */ + if (heap->buf_desc_map.flags & KBASE_VMAP_FLAG_SYNC_NEEDED) + kbase_sync_mem_regions(kctx, &heap->buf_desc_map, KBASE_SYNC_TO_CPU); + + sanity_check_gpu_buffer_heap(heap, desc); + if (heap->buf_desc_checked) + *chunk_gpu_va_ptr = desc->pointer & CHUNK_ADDR_MASK; + } + + return heap->buf_desc_checked; +} + +static u32 delete_hoarded_chunks(struct kbase_csf_tiler_heap *heap) +{ + u32 freed = 0; + u64 chunk_gpu_va = 0; + struct kbase_context *kctx = heap->kctx; + struct kbase_csf_tiler_heap_chunk *chunk = NULL; + + lockdep_assert_held(&kctx->csf.tiler_heaps.lock); + + if (can_read_hw_gpu_buffer_heap(heap, &chunk_gpu_va)) { + u64 chunk_hdr_val; + u64 *hw_hdr; + + if (!chunk_gpu_va) { + struct kbase_csf_gpu_buffer_heap *desc = heap->buf_desc_map.addr; + + /* BufferDescriptor is supplied by userspace, so could be CPU-cached */ + if (heap->buf_desc_map.flags & KBASE_VMAP_FLAG_SYNC_NEEDED) + kbase_sync_mem_regions(kctx, &heap->buf_desc_map, + KBASE_SYNC_TO_CPU); + chunk_gpu_va = desc->pointer & CHUNK_ADDR_MASK; + + if (!chunk_gpu_va) { + dev_dbg(kctx->kbdev->dev, + "Buffer descriptor 0x%llX has no chunks (NULL) for reclaim scan\n", + heap->buf_desc_va); + goto out; + } + } + + chunk = find_chunk(heap, chunk_gpu_va); + if (unlikely(!chunk)) + goto out; + + WARN((chunk->region->flags & KBASE_REG_CPU_CACHED), + "Cannot support CPU cached chunks without sync operations"); + hw_hdr = chunk->map.addr; + + /* Move onto the next chunk relevant information */ + chunk_hdr_val = *hw_hdr; + chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK; + + while (chunk_gpu_va && heap->chunk_count > HEAP_SHRINK_STOP_LIMIT) { + bool success = + delete_chunk_physical_pages(heap, chunk_gpu_va, &chunk_hdr_val); + + if (!success) + break; + + freed++; + /* On success, chunk_hdr_val is updated, extract the next chunk address */ + 
chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK; + } + + /* Update the existing hardware chunk header, after reclaim deletion of chunks */ + *hw_hdr = chunk_hdr_val; + + dev_dbg(heap->kctx->kbdev->dev, + "HW reclaim scan freed chunks: %u, set hw_hdr[0]: 0x%llX\n", freed, + chunk_hdr_val); + } else { + dev_dbg(kctx->kbdev->dev, + "Skip HW reclaim scan, (disabled: buffer descriptor 0x%llX)\n", + heap->buf_desc_va); + } +out: + return freed; +} + +static u64 delete_unused_chunk_pages(struct kbase_csf_tiler_heap *heap) +{ + u32 freed_chunks = 0; + u64 freed_pages = 0; + u64 chunk_gpu_va; + u64 chunk_hdr_val; + struct kbase_context *kctx = heap->kctx; + u64 *ctx_ptr; + + lockdep_assert_held(&kctx->csf.tiler_heaps.lock); + + WARN(heap->gpu_va_map.flags & KBASE_VMAP_FLAG_SYNC_NEEDED, + "Cannot support CPU cached heap context without sync operations"); + + ctx_ptr = heap->gpu_va_map.addr; + + /* Extract the first chunk address from the context's free_list_head */ + chunk_hdr_val = *ctx_ptr; + chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK; + + while (chunk_gpu_va) { + u64 hdr_val; + bool success = delete_chunk_physical_pages(heap, chunk_gpu_va, &hdr_val); + + if (!success) + break; + + freed_chunks++; + chunk_hdr_val = hdr_val; + /* extract the next chunk address */ + chunk_gpu_va = chunk_hdr_val & CHUNK_ADDR_MASK; + } + + /* Update the post-scan deletion to context header */ + *ctx_ptr = chunk_hdr_val; + + /* Try to scan the HW hoarded list of unused chunks */ + freed_chunks += delete_hoarded_chunks(heap); + freed_pages = freed_chunks * PFN_UP(heap->chunk_size); + dev_dbg(heap->kctx->kbdev->dev, + "Scan reclaim freed chunks/pages %u/%llu, set heap-ctx_u64[0]: 0x%llX\n", + freed_chunks, freed_pages, chunk_hdr_val); + + /* Update context tiler heaps memory usage */ + kctx->running_total_tiler_heap_memory -= freed_pages << PAGE_SHIFT; + kctx->running_total_tiler_heap_nr_chunks -= freed_chunks; + return freed_pages; +} + +u32 kbase_csf_tiler_heap_scan_kctx_unused_pages(struct kbase_context *kctx, u32 to_free) +{ + u64 freed = 0; + struct kbase_csf_tiler_heap *heap; + + mutex_lock(&kctx->csf.tiler_heaps.lock); + + list_for_each_entry(heap, &kctx->csf.tiler_heaps.list, link) { + freed += delete_unused_chunk_pages(heap); + + /* If freed enough, then stop here */ + if (freed >= to_free) + break; + } + + mutex_unlock(&kctx->csf.tiler_heaps.lock); + /* The scan is surely not more than 4-G pages, but for logic flow limit it */ + if (WARN_ON(unlikely(freed > U32_MAX))) + return U32_MAX; + else + return (u32)freed; +} + +static u64 count_unused_heap_pages(struct kbase_csf_tiler_heap *heap) +{ + u32 chunk_cnt = 0; + u64 page_cnt = 0; + + lockdep_assert_held(&heap->kctx->csf.tiler_heaps.lock); + + /* Here the count is basically an informed estimate, avoiding the costly mapping/unmaping + * in the chunk list walk. The downside is that the number is a less reliable guide for + * later on scan (free) calls on this heap for what actually is freeable. 
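For intuition, a self-contained sketch of the arithmetic behind this estimate; the real HEAP_SHRINK_STOP_LIMIT is defined elsewhere in the driver, so the limit and page size below are assumptions for illustration only:

#include <stdio.h>

#define EXAMPLE_STOP_LIMIT 1u      // stand-in for HEAP_SHRINK_STOP_LIMIT
#define EXAMPLE_PAGE_SIZE 4096u    // 4 KiB pages assumed

static unsigned long estimate_reclaimable_pages(unsigned int chunk_count,
						unsigned int chunk_size)
{
	unsigned int spare_chunks =
		chunk_count > EXAMPLE_STOP_LIMIT ? chunk_count - EXAMPLE_STOP_LIMIT : 0;
	// PFN_UP() equivalent: round the chunk size up to whole pages
	unsigned long pages_per_chunk =
		(chunk_size + EXAMPLE_PAGE_SIZE - 1) / EXAMPLE_PAGE_SIZE;

	return (unsigned long)spare_chunks * pages_per_chunk;
}

int main(void)
{
	// 5 chunks of 2 MiB: (5 - 1) * 512 = 2048 pages estimated as reclaimable
	printf("%lu\n", estimate_reclaimable_pages(5, 2u << 20));
	return 0;
}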
+ */ + if (heap->chunk_count > HEAP_SHRINK_STOP_LIMIT) { + chunk_cnt = heap->chunk_count - HEAP_SHRINK_STOP_LIMIT; + page_cnt = chunk_cnt * PFN_UP(heap->chunk_size); + } + + dev_dbg(heap->kctx->kbdev->dev, + "Reclaim count chunks/pages %u/%llu (estimated), heap_va: 0x%llX\n", chunk_cnt, + page_cnt, heap->gpu_va); + + return page_cnt; +} + +u32 kbase_csf_tiler_heap_count_kctx_unused_pages(struct kbase_context *kctx) +{ + u64 page_cnt = 0; + struct kbase_csf_tiler_heap *heap; + + mutex_lock(&kctx->csf.tiler_heaps.lock); + + list_for_each_entry(heap, &kctx->csf.tiler_heaps.list, link) + page_cnt += count_unused_heap_pages(heap); + + mutex_unlock(&kctx->csf.tiler_heaps.lock); + + /* The count is surely not more than 4-G pages, but for logic flow limit it */ + if (WARN_ON(unlikely(page_cnt > U32_MAX))) + return U32_MAX; + else + return (u32)page_cnt; +} diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.h index f4b80da68fe5..1b5cb560894f 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap.h @@ -23,7 +23,6 @@ #define _KBASE_CSF_TILER_HEAP_H_ #include - /** * kbase_csf_tiler_heap_context_init - Initialize the tiler heaps context for a * GPU address space @@ -58,6 +57,12 @@ void kbase_csf_tiler_heap_context_term(struct kbase_context *kctx); * @target_in_flight: Number of render-passes that the driver should attempt to * keep in flight for which allocation of new chunks is * allowed. Must not be zero. + * @buf_desc_va: Buffer descriptor GPU virtual address. This is a hint for + * indicating that the caller is intending to perform tiler heap + * chunks reclaim for those that are hoarded with hardware while + * the associated shader activites are suspended and the CSGs are + * off slots. If the referred reclaiming is not desired, can + * set it to 0. * @gpu_heap_va: Where to store the GPU virtual address of the context that was * set up for the tiler heap. * @first_chunk_va: Where to store the GPU virtual address of the first chunk @@ -66,10 +71,9 @@ void kbase_csf_tiler_heap_context_term(struct kbase_context *kctx); * * Return: 0 if successful or a negative error code on failure. */ -int kbase_csf_tiler_heap_init(struct kbase_context *kctx, - u32 chunk_size, u32 initial_chunks, u32 max_chunks, - u16 target_in_flight, u64 *gpu_heap_va, - u64 *first_chunk_va); +int kbase_csf_tiler_heap_init(struct kbase_context *kctx, u32 chunk_size, u32 initial_chunks, + u32 max_chunks, u16 target_in_flight, u64 const buf_desc_va, + u64 *gpu_heap_va, u64 *first_chunk_va); /** * kbase_csf_tiler_heap_term - Terminate a chunked tiler memory heap. @@ -112,4 +116,27 @@ int kbase_csf_tiler_heap_term(struct kbase_context *kctx, u64 gpu_heap_va); */ int kbase_csf_tiler_heap_alloc_new_chunk(struct kbase_context *kctx, u64 gpu_heap_va, u32 nr_in_flight, u32 pending_frag_count, u64 *new_chunk_ptr); + +/** + * kbase_csf_tiler_heap_scan_kctx_unused_pages - Performs the tiler heap shrinker calim's scan + * functionality. + * + * @kctx: Pointer to the kbase context for which the tiler heap recalim is to be + * operated with. + * @to_free: Number of pages suggested for the reclaim scan (free) method to reach. + * + * Return: the actual number of pages the scan method has freed from the call. 
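A hedged usage sketch pairing the count and scan helpers declared here; this is a kernel-context fragment rather than a standalone program, and the 256-page cap is an arbitrary example value:

u32 avail = kbase_csf_tiler_heap_count_kctx_unused_pages(kctx);

if (avail) {
	u32 freed = kbase_csf_tiler_heap_scan_kctx_unused_pages(kctx, MIN(avail, 256u));

	dev_dbg(kctx->kbdev->dev, "Reclaimed %u of an estimated %u unused tiler heap pages",
		freed, avail);
}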
+ */ +u32 kbase_csf_tiler_heap_scan_kctx_unused_pages(struct kbase_context *kctx, u32 to_free); + +/** + * kbase_csf_tiler_heap_count_kctx_unused_pages - Performs the tiler heap shrinker calim's count + * functionality. + * + * @kctx: Pointer to the kbase context for which the tiler heap recalim is to be + * operated with. + * + * Return: a number of pages that could likely be freed on the subsequent scan method call. + */ +u32 kbase_csf_tiler_heap_count_kctx_unused_pages(struct kbase_context *kctx); #endif diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_def.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_def.h index 2c006d9dc9e4..96f2b03d2d31 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_def.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_def.h @@ -56,12 +56,20 @@ ((CHUNK_HDR_NEXT_ADDR_MASK >> CHUNK_HDR_NEXT_ADDR_POS) << \ CHUNK_HDR_NEXT_ADDR_ENCODE_SHIFT) +/* The size of the area needed to be vmapped prior to handing the tiler heap + * over to the tiler, so that the shrinker could be invoked. + */ +#define NEXT_CHUNK_ADDR_SIZE (sizeof(u64)) + /** * struct kbase_csf_tiler_heap_chunk - A tiler heap chunk managed by the kernel * * @link: Link to this chunk in a list of chunks belonging to a * @kbase_csf_tiler_heap. * @region: Pointer to the GPU memory region allocated for the chunk. + * @map: Kernel VA mapping so that we would not need to use vmap in the + * shrinker callback, which can allocate. This maps only the header + * of the chunk, so it could be traversed. * @gpu_va: GPU virtual address of the start of the memory region. * This points to the header of the chunk and not to the low address * of free memory within it. @@ -75,9 +83,12 @@ struct kbase_csf_tiler_heap_chunk { struct list_head link; struct kbase_va_region *region; + struct kbase_vmap_struct map; u64 gpu_va; }; +#define HEAP_BUF_DESCRIPTOR_CHECKED (1 << 0) + /** * struct kbase_csf_tiler_heap - A tiler heap managed by the kernel * @@ -85,6 +96,20 @@ struct kbase_csf_tiler_heap_chunk { * associated. * @link: Link to this heap in a list of tiler heaps belonging to * the @kbase_csf_tiler_heap_context. + * @chunks_list: Linked list of allocated chunks. + * @gpu_va: The GPU virtual address of the heap context structure that + * was allocated for the firmware. This is also used to + * uniquely identify the heap. + * @heap_id: Unique id representing the heap, assigned during heap + * initialization. + * @buf_desc_va: Buffer descriptor GPU VA. Can be 0 for backward compatible + * to earlier version base interfaces. + * @buf_desc_reg: Pointer to the VA region that covers the provided buffer + * descriptor memory object pointed to by buf_desc_va. + * @gpu_va_map: Kernel VA mapping of the GPU VA region. + * @buf_desc_map: Kernel VA mapping of the buffer descriptor, read from + * during the tiler heap shrinker. Sync operations may need + * to be done before each read. * @chunk_size: Size of each chunk, in bytes. Must be page-aligned. * @chunk_count: The number of chunks currently allocated. Must not be * zero or greater than @max_chunks. @@ -93,22 +118,23 @@ struct kbase_csf_tiler_heap_chunk { * @target_in_flight: Number of render-passes that the driver should attempt * to keep in flight for which allocation of new chunks is * allowed. Must not be zero. - * @gpu_va: The GPU virtual address of the heap context structure that - * was allocated for the firmware. This is also used to - * uniquely identify the heap. 
- * @heap_id: Unique id representing the heap, assigned during heap - * initialization. - * @chunks_list: Linked list of allocated chunks. + * @buf_desc_checked: Indicates if runtime check on buffer descriptor has been done. */ struct kbase_csf_tiler_heap { struct kbase_context *kctx; struct list_head link; + struct list_head chunks_list; + u64 gpu_va; + u64 heap_id; + u64 buf_desc_va; + struct kbase_va_region *buf_desc_reg; + struct kbase_vmap_struct buf_desc_map; + struct kbase_vmap_struct gpu_va_map; u32 chunk_size; u32 chunk_count; u32 max_chunks; u16 target_in_flight; - u64 gpu_va; - u64 heap_id; - struct list_head chunks_list; + bool buf_desc_checked; }; + #endif /* !_KBASE_CSF_TILER_HEAP_DEF_H_ */ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.c new file mode 100644 index 000000000000..bcab31d27945 --- /dev/null +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#include +#include "mali_kbase_csf.h" +#include "mali_kbase_csf_tiler_heap.h" +#include "mali_kbase_csf_tiler_heap_reclaim.h" + +/* Tiler heap shrinker seek value, needs to be higher than jit and memory pools */ +#define HEAP_SHRINKER_SEEKS (DEFAULT_SEEKS + 2) + +/* Tiler heap shrinker batch value */ +#define HEAP_SHRINKER_BATCH (512) + +/* Tiler heap reclaim scan (free) method size for limiting a scan run length */ +#define HEAP_RECLAIM_SCAN_BATCH_SIZE (HEAP_SHRINKER_BATCH << 7) + +static u8 get_kctx_highest_csg_priority(struct kbase_context *kctx) +{ + u8 prio; + + for (prio = KBASE_QUEUE_GROUP_PRIORITY_REALTIME; prio < KBASE_QUEUE_GROUP_PRIORITY_LOW; + prio++) + if (!list_empty(&kctx->csf.sched.runnable_groups[prio])) + break; + + if (prio != KBASE_QUEUE_GROUP_PRIORITY_REALTIME && kctx->csf.sched.num_idle_wait_grps) { + struct kbase_queue_group *group; + + list_for_each_entry(group, &kctx->csf.sched.idle_wait_groups, link) { + if (group->priority < prio) + prio = group->priority; + } + } + + return prio; +} + +static void detach_ctx_from_heap_reclaim_mgr(struct kbase_context *kctx) +{ + struct kbase_csf_scheduler *const scheduler = &kctx->kbdev->csf.scheduler; + struct kbase_csf_ctx_heap_reclaim_info *info = &kctx->csf.sched.heap_info; + + lockdep_assert_held(&scheduler->lock); + + if (!list_empty(&info->mgr_link)) { + u32 remaining = (info->nr_est_unused_pages > info->nr_freed_pages) ? 
+ info->nr_est_unused_pages - info->nr_freed_pages : + 0; + + list_del_init(&info->mgr_link); + if (remaining) + WARN_ON(atomic_sub_return(remaining, &scheduler->reclaim_mgr.unused_pages) < + 0); + + dev_dbg(kctx->kbdev->dev, + "Reclaim_mgr_detach: ctx_%d_%d, est_pages=0%u, freed_pages=%u", kctx->tgid, + kctx->id, info->nr_est_unused_pages, info->nr_freed_pages); + } +} + +static void attach_ctx_to_heap_reclaim_mgr(struct kbase_context *kctx) +{ + struct kbase_csf_ctx_heap_reclaim_info *const info = &kctx->csf.sched.heap_info; + struct kbase_csf_scheduler *const scheduler = &kctx->kbdev->csf.scheduler; + u8 const prio = get_kctx_highest_csg_priority(kctx); + + lockdep_assert_held(&scheduler->lock); + + if (WARN_ON(!list_empty(&info->mgr_link))) + list_del_init(&info->mgr_link); + + /* Count the pages that could be freed */ + info->nr_est_unused_pages = kbase_csf_tiler_heap_count_kctx_unused_pages(kctx); + /* Initialize the scan operation tracking pages */ + info->nr_freed_pages = 0; + + list_add_tail(&info->mgr_link, &scheduler->reclaim_mgr.ctx_lists[prio]); + /* Accumulate the estimated pages to the manager total field */ + atomic_add(info->nr_est_unused_pages, &scheduler->reclaim_mgr.unused_pages); + + dev_dbg(kctx->kbdev->dev, "Reclaim_mgr_attach: ctx_%d_%d, est_count_pages=%u", kctx->tgid, + kctx->id, info->nr_est_unused_pages); +} + +void kbase_csf_tiler_heap_reclaim_sched_notify_grp_active(struct kbase_queue_group *group) +{ + struct kbase_context *kctx = group->kctx; + struct kbase_csf_ctx_heap_reclaim_info *info = &kctx->csf.sched.heap_info; + + lockdep_assert_held(&kctx->kbdev->csf.scheduler.lock); + + info->on_slot_grps++; + /* If the kctx has an on-slot change from 0 => 1, detach it from reclaim_mgr */ + if (info->on_slot_grps == 1) { + dev_dbg(kctx->kbdev->dev, "CSG_%d_%d_%d on-slot, remove kctx from reclaim manager", + group->kctx->tgid, group->kctx->id, group->handle); + + detach_ctx_from_heap_reclaim_mgr(kctx); + } +} + +void kbase_csf_tiler_heap_reclaim_sched_notify_grp_evict(struct kbase_queue_group *group) +{ + struct kbase_context *kctx = group->kctx; + struct kbase_csf_ctx_heap_reclaim_info *const info = &kctx->csf.sched.heap_info; + struct kbase_csf_scheduler *const scheduler = &kctx->kbdev->csf.scheduler; + const u32 num_groups = kctx->kbdev->csf.global_iface.group_num; + u32 on_slot_grps = 0; + u32 i; + + lockdep_assert_held(&scheduler->lock); + + /* Group eviction from the scheduler is a bit more complex, but fairly less + * frequent in operations. Taking the opportunity to actually count the + * on-slot CSGs from the given kctx, for robustness and clearer code logic. 
+ */
+	for_each_set_bit(i, scheduler->csg_inuse_bitmap, num_groups) {
+		struct kbase_csf_csg_slot *csg_slot = &scheduler->csg_slots[i];
+		struct kbase_queue_group *grp = csg_slot->resident_group;
+
+		if (unlikely(!grp))
+			continue;
+
+		if (grp->kctx == kctx)
+			on_slot_grps++;
+	}
+
+	info->on_slot_grps = on_slot_grps;
+
+	/* If the kctx has no other CSGs on-slot, handle the heap reclaim related actions */
+	if (!info->on_slot_grps) {
+		if (kctx->csf.sched.num_runnable_grps || kctx->csf.sched.num_idle_wait_grps) {
+			/* The kctx has other operational CSGs, attach it if not yet done */
+			if (list_empty(&info->mgr_link)) {
+				dev_dbg(kctx->kbdev->dev,
+					"CSG_%d_%d_%d evict, add kctx to reclaim manager",
+					group->kctx->tgid, group->kctx->id, group->handle);
+
+				attach_ctx_to_heap_reclaim_mgr(kctx);
+			}
+		} else {
+			/* The kctx is a zombie after the group eviction, drop it out */
+			dev_dbg(kctx->kbdev->dev,
+				"CSG_%d_%d_%d evict leading to zombie kctx, detach from reclaim manager",
+				group->kctx->tgid, group->kctx->id, group->handle);
+
+			detach_ctx_from_heap_reclaim_mgr(kctx);
+		}
+	}
+}
+
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_suspend(struct kbase_queue_group *group)
+{
+	struct kbase_context *kctx = group->kctx;
+	struct kbase_csf_ctx_heap_reclaim_info *info = &kctx->csf.sched.heap_info;
+
+	lockdep_assert_held(&kctx->kbdev->csf.scheduler.lock);
+
+	if (!WARN_ON(info->on_slot_grps == 0))
+		info->on_slot_grps--;
+	/* If the kctx has no CSGs on-slot, attach it to scheduler's reclaim manager */
+	if (info->on_slot_grps == 0) {
+		dev_dbg(kctx->kbdev->dev, "CSG_%d_%d_%d off-slot, add kctx to reclaim manager",
+			group->kctx->tgid, group->kctx->id, group->handle);
+
+		attach_ctx_to_heap_reclaim_mgr(kctx);
+	}
+}
+
+static unsigned long reclaim_unused_heap_pages(struct kbase_device *kbdev)
+{
+	struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler;
+	struct kbase_csf_sched_heap_reclaim_mgr *const mgr = &scheduler->reclaim_mgr;
+	unsigned long total_freed_pages = 0;
+	int prio;
+
+	lockdep_assert_held(&kbdev->csf.scheduler.lock);
+
+	for (prio = KBASE_QUEUE_GROUP_PRIORITY_LOW;
+	     total_freed_pages < HEAP_RECLAIM_SCAN_BATCH_SIZE &&
+	     prio >= KBASE_QUEUE_GROUP_PRIORITY_REALTIME;
+	     prio--) {
+		struct kbase_csf_ctx_heap_reclaim_info *info, *tmp;
+		u32 cnt_ctxs = 0;
+
+		list_for_each_entry_safe(info, tmp, &scheduler->reclaim_mgr.ctx_lists[prio],
+					 mgr_link) {
+			struct kbase_context *kctx =
+				container_of(info, struct kbase_context, csf.sched.heap_info);
+			u32 freed_pages = kbase_csf_tiler_heap_scan_kctx_unused_pages(
+				kctx, info->nr_est_unused_pages);
+
+			if (freed_pages) {
+				/* Remove the freed pages from the manager retained estimate. The
+				 * accumulated removals from the kctx should not exceed the kctx
+				 * initially notified contribution amount:
+				 * info->nr_est_unused_pages.
+				 */
+				u32 rm_cnt = MIN(info->nr_est_unused_pages - info->nr_freed_pages,
+						 freed_pages);
+
+				WARN_ON(atomic_sub_return(rm_cnt, &mgr->unused_pages) < 0);
+
+				/* tracking the freed pages, before a potential detach call */
+				info->nr_freed_pages += freed_pages;
+				total_freed_pages += freed_pages;
+
+				schedule_work(&kctx->jit_work);
+			}
+
+			/* If the kctx can't offer any more, drop it from the reclaim manager,
+			 * otherwise leave it in. If the kctx changes its state (i.e.
+			 * some CSGs becoming on-slot), the scheduler will pull it out.
+ */ + if (info->nr_freed_pages >= info->nr_est_unused_pages || freed_pages == 0) + detach_ctx_from_heap_reclaim_mgr(kctx); + + cnt_ctxs++; + + /* Enough has been freed, break to avoid holding the lock too long */ + if (total_freed_pages >= HEAP_RECLAIM_SCAN_BATCH_SIZE) + break; + } + + dev_dbg(kbdev->dev, "Reclaim free heap pages: %lu (cnt_ctxs: %u, prio: %d)", + total_freed_pages, cnt_ctxs, prio); + } + + dev_dbg(kbdev->dev, "Reclaim free total heap pages: %lu (across all CSG priority)", + total_freed_pages); + + return total_freed_pages; +} + +static unsigned long kbase_csf_tiler_heap_reclaim_count_free_pages(struct kbase_device *kbdev, + struct shrink_control *sc) +{ + struct kbase_csf_sched_heap_reclaim_mgr *mgr = &kbdev->csf.scheduler.reclaim_mgr; + unsigned long page_cnt = atomic_read(&mgr->unused_pages); + + dev_dbg(kbdev->dev, "Reclaim count unused pages (estimate): %lu", page_cnt); + + return page_cnt; +} + +static unsigned long kbase_csf_tiler_heap_reclaim_scan_free_pages(struct kbase_device *kbdev, + struct shrink_control *sc) +{ + struct kbase_csf_sched_heap_reclaim_mgr *mgr = &kbdev->csf.scheduler.reclaim_mgr; + unsigned long freed = 0; + unsigned long avail = 0; + + /* If Scheduler is busy in action, return 0 */ + if (!mutex_trylock(&kbdev->csf.scheduler.lock)) { + struct kbase_csf_scheduler *const scheduler = &kbdev->csf.scheduler; + + /* Wait for roughly 2-ms */ + wait_event_timeout(kbdev->csf.event_wait, (scheduler->state != SCHED_BUSY), + msecs_to_jiffies(2)); + if (!mutex_trylock(&kbdev->csf.scheduler.lock)) { + dev_dbg(kbdev->dev, "Tiler heap reclaim scan see device busy (freed: 0)"); + return 0; + } + } + + avail = atomic_read(&mgr->unused_pages); + if (avail) + freed = reclaim_unused_heap_pages(kbdev); + + mutex_unlock(&kbdev->csf.scheduler.lock); + +#if (KERNEL_VERSION(4, 14, 0) <= LINUX_VERSION_CODE) + if (freed > sc->nr_to_scan) + sc->nr_scanned = freed; +#endif /* (KERNEL_VERSION(4, 14, 0) <= LINUX_VERSION_CODE) */ + + dev_info(kbdev->dev, "Tiler heap reclaim scan freed pages: %lu (unused: %lu)", freed, + avail); + + /* On estimate suggesting available, yet actual free failed, return STOP */ + if (avail && !freed) + return SHRINK_STOP; + else + return freed; +} + +static unsigned long kbase_csf_tiler_heap_reclaim_count_objects(struct shrinker *s, + struct shrink_control *sc) +{ + struct kbase_device *kbdev = + container_of(s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim); + + return kbase_csf_tiler_heap_reclaim_count_free_pages(kbdev, sc); +} + +static unsigned long kbase_csf_tiler_heap_reclaim_scan_objects(struct shrinker *s, + struct shrink_control *sc) +{ + struct kbase_device *kbdev = + container_of(s, struct kbase_device, csf.scheduler.reclaim_mgr.heap_reclaim); + + return kbase_csf_tiler_heap_reclaim_scan_free_pages(kbdev, sc); +} + +void kbase_csf_tiler_heap_reclaim_ctx_init(struct kbase_context *kctx) +{ + /* Per-kctx heap_info object initialization */ + memset(&kctx->csf.sched.heap_info, 0, sizeof(struct kbase_csf_ctx_heap_reclaim_info)); + INIT_LIST_HEAD(&kctx->csf.sched.heap_info.mgr_link); +} + +void kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev) +{ + struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler; + struct shrinker *reclaim = &scheduler->reclaim_mgr.heap_reclaim; + u8 prio; + + for (prio = KBASE_QUEUE_GROUP_PRIORITY_REALTIME; prio < KBASE_QUEUE_GROUP_PRIORITY_COUNT; + prio++) + INIT_LIST_HEAD(&scheduler->reclaim_mgr.ctx_lists[prio]); + + atomic_set(&scheduler->reclaim_mgr.unused_pages, 0); + + 
reclaim->count_objects = kbase_csf_tiler_heap_reclaim_count_objects;
+	reclaim->scan_objects = kbase_csf_tiler_heap_reclaim_scan_objects;
+	reclaim->seeks = HEAP_SHRINKER_SEEKS;
+	reclaim->batch = HEAP_SHRINKER_BATCH;
+
+#if !defined(CONFIG_MALI_VECTOR_DUMP)
+	register_shrinker(reclaim);
+#endif
+}
+
+void kbase_csf_tiler_heap_reclaim_mgr_term(struct kbase_device *kbdev)
+{
+	struct kbase_csf_scheduler *scheduler = &kbdev->csf.scheduler;
+	u8 prio;
+
+#if !defined(CONFIG_MALI_VECTOR_DUMP)
+	unregister_shrinker(&scheduler->reclaim_mgr.heap_reclaim);
+#endif
+
+	for (prio = KBASE_QUEUE_GROUP_PRIORITY_REALTIME; prio < KBASE_QUEUE_GROUP_PRIORITY_COUNT;
+	     prio++)
+		WARN_ON(!list_empty(&scheduler->reclaim_mgr.ctx_lists[prio]));
+
+	WARN_ON(atomic_read(&scheduler->reclaim_mgr.unused_pages));
+}
diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.h
new file mode 100644
index 000000000000..b6e580e48df6
--- /dev/null
+++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tiler_heap_reclaim.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+#ifndef _KBASE_CSF_TILER_HEAP_RECLAIM_H_
+#define _KBASE_CSF_TILER_HEAP_RECLAIM_H_
+
+#include
+
+/**
+ * kbase_csf_tiler_heap_reclaim_sched_notify_grp_active - Notifier function for the scheduler
+ * to use when a group is put on-slot.
+ *
+ * @group: Pointer to the group object that has been placed on-slot for running.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_active(struct kbase_queue_group *group);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_sched_notify_grp_evict - Notifier function for the scheduler
+ * to use when a group is evicted out of the scheduler's scope, i.e. no run of
+ * the group is possible afterwards.
+ *
+ * @group: Pointer to the group object that has been evicted.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_evict(struct kbase_queue_group *group);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_sched_notify_grp_suspend - Notifier function for the scheduler
+ * to use when a group is suspended from running, but could resume in future.
+ *
+ * @group: Pointer to the group object that is in suspended state.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_sched_notify_grp_suspend(struct kbase_queue_group *group);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_ctx_init - Initializer for the per-context data fields used
+ * with the tiler heap reclaim manager.
+ *
+ * @kctx: Pointer to the kbase_context.
+ *
+ */
+void kbase_csf_tiler_heap_reclaim_ctx_init(struct kbase_context *kctx);
+
+/**
+ * kbase_csf_tiler_heap_reclaim_mgr_init - Initializer for the tiler heap reclaim manager.
+ *
+ * @kbdev: Pointer to the device.
+ * + */ +void kbase_csf_tiler_heap_reclaim_mgr_init(struct kbase_device *kbdev); + +/** + * kbase_csf_tiler_heap_reclaim_mgr_term - Termination call for the tiler heap reclaim manger. + * + * @kbdev: Pointer to the device. + * + */ +void kbase_csf_tiler_heap_reclaim_mgr_term(struct kbase_device *kbdev); + +#endif diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tl_reader.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tl_reader.c index 6ae1029ab123..71ec91e3de03 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tl_reader.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_tl_reader.c @@ -88,13 +88,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_tl_poll_interval_fops, kbase_csf_tl_debugfs_poll_interval_read, kbase_csf_tl_debugfs_poll_interval_write, "%llu\n"); - void kbase_csf_tl_reader_debugfs_init(struct kbase_device *kbdev) { debugfs_create_file("csf_tl_poll_interval_in_ms", 0644, kbdev->debugfs_instr_directory, kbdev, &kbase_csf_tl_poll_interval_fops); - } #endif @@ -166,11 +164,10 @@ static int kbase_ts_converter_init( * * Return: The CPU timestamp. */ -static void __maybe_unused -kbase_ts_converter_convert(const struct kbase_ts_converter *self, u64 *gpu_ts) +static u64 __maybe_unused +kbase_ts_converter_convert(const struct kbase_ts_converter *self, u64 gpu_ts) { - u64 old_gpu_ts = *gpu_ts; - *gpu_ts = div64_u64(old_gpu_ts * self->multiplier, self->divisor) + + return div64_u64(gpu_ts * self->multiplier, self->divisor) + self->offset; } @@ -250,7 +247,6 @@ static void tl_reader_reset(struct kbase_csf_tl_reader *self) self->tl_header.btc = 0; } - int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) { int ret = 0; @@ -275,7 +271,6 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) return -EBUSY; } - /* Copying the whole buffer in a single shot. We assume * that the buffer will not contain partially written messages. */ @@ -326,8 +321,8 @@ int kbase_csf_tl_reader_flush_buffer(struct kbase_csf_tl_reader *self) { struct kbase_csffw_tl_message *msg = (struct kbase_csffw_tl_message *) csffw_data_it; - kbase_ts_converter_convert(&self->ts_converter, - &msg->timestamp); + msg->timestamp = kbase_ts_converter_convert(&self->ts_converter, + msg->timestamp); } /* Copy the message out to the tl_stream. 
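For intuition, the GPU-to-CPU timestamp conversion above is a linear rescale plus an offset; a standalone sketch with made-up calibration values:

#include <stdint.h>
#include <stdio.h>

// Same shape as the div64_u64() expression used by kbase_ts_converter_convert()
static uint64_t demo_convert(uint64_t gpu_ts, uint64_t multiplier, uint64_t divisor,
			     uint64_t offset)
{
	return (gpu_ts * multiplier) / divisor + offset;
}

int main(void)
{
	// e.g. a 50 MHz GPU counter mapped onto a nanosecond CPU timeline: x20, no offset
	printf("%llu\n", (unsigned long long)demo_convert(1000000, 20, 1, 0));
	return 0;
}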
*/ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.c index 23202c87a404..46872f937dbf 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.c +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.c @@ -119,7 +119,7 @@ static const struct firmware_trace_buffer_data trace_buffer_data[] = { #if MALI_UNIT_TEST { "fwutf", { 0 }, 1 }, #endif - { FW_TRACE_BUF_NAME, { 0 }, 4 }, + { FIRMWARE_LOG_BUF_NAME, { 0 }, 4 }, { "benchmark", { 0 }, 2 }, { "timeline", { 0 }, KBASE_CSF_TL_BUFFER_NR_PAGES }, }; @@ -506,10 +506,16 @@ unsigned int kbase_csf_firmware_trace_buffer_read_data( } EXPORT_SYMBOL(kbase_csf_firmware_trace_buffer_read_data); -#if IS_ENABLED(CONFIG_DEBUG_FS) +static void update_trace_buffer_active_mask64(struct firmware_trace_buffer *tb, u64 mask) +{ + unsigned int i; + + for (i = 0; i < tb->trace_enable_entry_count; i++) + kbasep_csf_firmware_trace_buffer_update_trace_enable_bit(tb, i, (mask >> i) & 1); +} #define U32_BITS 32 -static u64 get_trace_buffer_active_mask64(struct firmware_trace_buffer *tb) +u64 kbase_csf_firmware_trace_buffer_get_active_mask64(struct firmware_trace_buffer *tb) { u64 active_mask = tb->trace_enable_init_mask[0]; @@ -519,18 +525,7 @@ static u64 get_trace_buffer_active_mask64(struct firmware_trace_buffer *tb) return active_mask; } -static void update_trace_buffer_active_mask64(struct firmware_trace_buffer *tb, - u64 mask) -{ - unsigned int i; - - for (i = 0; i < tb->trace_enable_entry_count; i++) - kbasep_csf_firmware_trace_buffer_update_trace_enable_bit( - tb, i, (mask >> i) & 1); -} - -static int set_trace_buffer_active_mask64(struct firmware_trace_buffer *tb, - u64 mask) +int kbase_csf_firmware_trace_buffer_set_active_mask64(struct firmware_trace_buffer *tb, u64 mask) { struct kbase_device *kbdev = tb->kbdev; unsigned long flags; @@ -558,123 +553,3 @@ static int set_trace_buffer_active_mask64(struct firmware_trace_buffer *tb, return err; } - -static int kbase_csf_firmware_trace_enable_mask_read(void *data, u64 *val) -{ - struct kbase_device *kbdev = (struct kbase_device *)data; - struct firmware_trace_buffer *tb = - kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME); - - if (tb == NULL) { - dev_err(kbdev->dev, "Couldn't get the firmware trace buffer"); - return -EIO; - } - /* The enabled traces limited to u64 here, regarded practical */ - *val = get_trace_buffer_active_mask64(tb); - return 0; -} - -static int kbase_csf_firmware_trace_enable_mask_write(void *data, u64 val) -{ - struct kbase_device *kbdev = (struct kbase_device *)data; - struct firmware_trace_buffer *tb = - kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME); - u64 new_mask; - unsigned int enable_bits_count; - - if (tb == NULL) { - dev_err(kbdev->dev, "Couldn't get the firmware trace buffer"); - return -EIO; - } - - /* Ignore unsupported types */ - enable_bits_count = - kbase_csf_firmware_trace_buffer_get_trace_enable_bits_count(tb); - if (enable_bits_count > 64) { - dev_dbg(kbdev->dev, "Limit enabled bits count from %u to 64", - enable_bits_count); - enable_bits_count = 64; - } - new_mask = val & ((1 << enable_bits_count) - 1); - - if (new_mask != get_trace_buffer_active_mask64(tb)) - return set_trace_buffer_active_mask64(tb, new_mask); - else - return 0; -} - -static int kbasep_csf_firmware_trace_debugfs_open(struct inode *in, - struct file *file) -{ - struct kbase_device *kbdev = in->i_private; - - file->private_data = kbdev; - dev_dbg(kbdev->dev, "Opened firmware trace 
buffer dump debugfs file"); - - return 0; -} - -static ssize_t kbasep_csf_firmware_trace_debugfs_read(struct file *file, - char __user *buf, size_t size, loff_t *ppos) -{ - struct kbase_device *kbdev = file->private_data; - u8 *pbyte; - unsigned int n_read; - unsigned long not_copied; - /* Limit the kernel buffer to no more than two pages */ - size_t mem = MIN(size, 2 * PAGE_SIZE); - unsigned long flags; - - struct firmware_trace_buffer *tb = - kbase_csf_firmware_get_trace_buffer(kbdev, FW_TRACE_BUF_NAME); - - if (tb == NULL) { - dev_err(kbdev->dev, "Couldn't get the firmware trace buffer"); - return -EIO; - } - - pbyte = kmalloc(mem, GFP_KERNEL); - if (pbyte == NULL) { - dev_err(kbdev->dev, "Couldn't allocate memory for trace buffer dump"); - return -ENOMEM; - } - - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - n_read = kbase_csf_firmware_trace_buffer_read_data(tb, pbyte, mem); - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - /* Do the copy, if we have obtained some trace data */ - not_copied = (n_read) ? copy_to_user(buf, pbyte, n_read) : 0; - kfree(pbyte); - - if (!not_copied) { - *ppos += n_read; - return n_read; - } - - dev_err(kbdev->dev, "Couldn't copy trace buffer data to user space buffer"); - return -EFAULT; -} - -DEFINE_DEBUGFS_ATTRIBUTE(kbase_csf_firmware_trace_enable_mask_fops, - kbase_csf_firmware_trace_enable_mask_read, - kbase_csf_firmware_trace_enable_mask_write, "%llx\n"); - -static const struct file_operations kbasep_csf_firmware_trace_debugfs_fops = { - .owner = THIS_MODULE, - .open = kbasep_csf_firmware_trace_debugfs_open, - .read = kbasep_csf_firmware_trace_debugfs_read, - .llseek = no_llseek, -}; - -void kbase_csf_firmware_trace_buffer_debugfs_init(struct kbase_device *kbdev) -{ - debugfs_create_file("fw_trace_enable_mask", 0644, - kbdev->mali_debugfs_directory, kbdev, - &kbase_csf_firmware_trace_enable_mask_fops); - - debugfs_create_file("fw_traces", 0444, - kbdev->mali_debugfs_directory, kbdev, - &kbasep_csf_firmware_trace_debugfs_fops); -} -#endif /* CONFIG_DEBUG_FS */ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.h index a28d0f057700..0389d093a904 100644 --- a/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.h +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_csf_trace_buffer.h @@ -25,7 +25,7 @@ #include #define CSF_FIRMWARE_TRACE_ENABLE_INIT_MASK_MAX (4) -#define FW_TRACE_BUF_NAME "fwlog" +#define FIRMWARE_LOG_BUF_NAME "fwlog" /* Forward declarations */ struct firmware_trace_buffer; @@ -165,14 +165,23 @@ bool kbase_csf_firmware_trace_buffer_is_empty( unsigned int kbase_csf_firmware_trace_buffer_read_data( struct firmware_trace_buffer *trace_buffer, u8 *data, unsigned int num_bytes); -#if IS_ENABLED(CONFIG_DEBUG_FS) /** - * kbase_csf_firmware_trace_buffer_debugfs_init() - Add debugfs entries for - * setting enable mask and dumping the binary firmware trace buffer + * kbase_csf_firmware_trace_buffer_get_active_mask64 - Get trace buffer active mask * - * @kbdev: Pointer to the device + * @tb: Trace buffer handle + * + * Return: Trace buffer active mask. */ -void kbase_csf_firmware_trace_buffer_debugfs_init(struct kbase_device *kbdev); -#endif /* CONFIG_DEBUG_FS */ +u64 kbase_csf_firmware_trace_buffer_get_active_mask64(struct firmware_trace_buffer *tb); + +/** + * kbase_csf_firmware_trace_buffer_set_active_mask64 - Set trace buffer active mask + * + * @tb: Trace buffer handle + * @mask: New active mask + * + * Return: 0 if successful, negative error code on failure. 
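A brief, hedged usage sketch for the two newly exported helpers (kernel-context fragment; OR-ing in bit 0 is just an example):

struct firmware_trace_buffer *tb =
	kbase_csf_firmware_get_trace_buffer(kbdev, FIRMWARE_LOG_BUF_NAME);

if (tb) {
	u64 mask = kbase_csf_firmware_trace_buffer_get_active_mask64(tb);

	if (kbase_csf_firmware_trace_buffer_set_active_mask64(tb, mask | 0x1))
		dev_warn(kbdev->dev, "Failed to update the firmware trace enable mask");
}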
+ */ +int kbase_csf_firmware_trace_buffer_set_active_mask64(struct firmware_trace_buffer *tb, u64 mask); #endif /* _KBASE_CSF_TRACE_BUFFER_H_ */ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.c b/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.c new file mode 100644 index 000000000000..185779c16815 --- /dev/null +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#include + +#if IS_ENABLED(CONFIG_DEBUG_FS) + +/** + * kbasep_fault_occurred - Check if fault occurred. + * + * @kbdev: Device pointer + * + * Return: true if a fault occurred. + */ +static bool kbasep_fault_occurred(struct kbase_device *kbdev) +{ + unsigned long flags; + bool ret; + + spin_lock_irqsave(&kbdev->csf.dof.lock, flags); + ret = (kbdev->csf.dof.error_code != DF_NO_ERROR); + spin_unlock_irqrestore(&kbdev->csf.dof.lock, flags); + + return ret; +} + +void kbase_debug_csf_fault_wait_completion(struct kbase_device *kbdev) +{ + if (likely(!kbase_debug_csf_fault_dump_enabled(kbdev))) { + dev_dbg(kbdev->dev, "No userspace client for dumping exists"); + return; + } + + wait_event(kbdev->csf.dof.dump_wait_wq, kbase_debug_csf_fault_dump_complete(kbdev)); +} +KBASE_EXPORT_TEST_API(kbase_debug_csf_fault_wait_completion); + +/** + * kbase_debug_csf_fault_wakeup - Wake up a waiting user space client. + * + * @kbdev: Kbase device + */ +static void kbase_debug_csf_fault_wakeup(struct kbase_device *kbdev) +{ + wake_up_interruptible(&kbdev->csf.dof.fault_wait_wq); +} + +bool kbase_debug_csf_fault_notify(struct kbase_device *kbdev, + struct kbase_context *kctx, enum dumpfault_error_type error) +{ + unsigned long flags; + + if (likely(!kbase_debug_csf_fault_dump_enabled(kbdev))) + return false; + + if (WARN_ON(error == DF_NO_ERROR)) + return false; + + if (kctx && kbase_ctx_flag(kctx, KCTX_DYING)) { + dev_info(kbdev->dev, "kctx %d_%d is dying when error %d is reported", + kctx->tgid, kctx->id, error); + kctx = NULL; + } + + spin_lock_irqsave(&kbdev->csf.dof.lock, flags); + + /* Only one fault at a time can be processed */ + if (kbdev->csf.dof.error_code) { + dev_info(kbdev->dev, "skip this fault as there's a pending fault"); + goto unlock; + } + + kbdev->csf.dof.kctx_tgid = kctx ? kctx->tgid : 0; + kbdev->csf.dof.kctx_id = kctx ? 
kctx->id : 0; + kbdev->csf.dof.error_code = error; + kbase_debug_csf_fault_wakeup(kbdev); + +unlock: + spin_unlock_irqrestore(&kbdev->csf.dof.lock, flags); + return true; +} + +static ssize_t debug_csf_fault_read(struct file *file, char __user *buffer, size_t size, + loff_t *f_pos) +{ +#define BUF_SIZE 64 + struct kbase_device *kbdev; + unsigned long flags; + int count; + char buf[BUF_SIZE]; + u32 tgid, ctx_id; + enum dumpfault_error_type error_code; + + if (unlikely(!file)) { + pr_warn("%s: file is NULL", __func__); + return -EINVAL; + } + + kbdev = file->private_data; + if (unlikely(!buffer)) { + dev_warn(kbdev->dev, "%s: buffer is NULL", __func__); + return -EINVAL; + } + + if (unlikely(*f_pos < 0)) { + dev_warn(kbdev->dev, "%s: f_pos is negative", __func__); + return -EINVAL; + } + + if (size < sizeof(buf)) { + dev_warn(kbdev->dev, "%s: buffer is too small", __func__); + return -EINVAL; + } + + if (wait_event_interruptible(kbdev->csf.dof.fault_wait_wq, kbasep_fault_occurred(kbdev))) + return -ERESTARTSYS; + + spin_lock_irqsave(&kbdev->csf.dof.lock, flags); + tgid = kbdev->csf.dof.kctx_tgid; + ctx_id = kbdev->csf.dof.kctx_id; + error_code = kbdev->csf.dof.error_code; + BUILD_BUG_ON(sizeof(buf) < (sizeof(tgid) + sizeof(ctx_id) + sizeof(error_code))); + count = scnprintf(buf, sizeof(buf), "%u_%u_%u\n", tgid, ctx_id, error_code); + spin_unlock_irqrestore(&kbdev->csf.dof.lock, flags); + + dev_info(kbdev->dev, "debug csf fault info read"); + return simple_read_from_buffer(buffer, size, f_pos, buf, count); +} + +static int debug_csf_fault_open(struct inode *in, struct file *file) +{ + struct kbase_device *kbdev; + + if (unlikely(!in)) { + pr_warn("%s: inode is NULL", __func__); + return -EINVAL; + } + + kbdev = in->i_private; + if (unlikely(!file)) { + dev_warn(kbdev->dev, "%s: file is NULL", __func__); + return -EINVAL; + } + + if (atomic_cmpxchg(&kbdev->csf.dof.enabled, 0, 1) == 1) { + dev_warn(kbdev->dev, "Only one client is allowed for dump on fault"); + return -EBUSY; + } + + dev_info(kbdev->dev, "debug csf fault file open"); + + return simple_open(in, file); +} + +static ssize_t debug_csf_fault_write(struct file *file, const char __user *ubuf, size_t count, + loff_t *ppos) +{ + struct kbase_device *kbdev; + unsigned long flags; + + if (unlikely(!file)) { + pr_warn("%s: file is NULL", __func__); + return -EINVAL; + } + + kbdev = file->private_data; + spin_lock_irqsave(&kbdev->csf.dof.lock, flags); + kbdev->csf.dof.error_code = DF_NO_ERROR; + kbdev->csf.dof.kctx_tgid = 0; + kbdev->csf.dof.kctx_id = 0; + dev_info(kbdev->dev, "debug csf fault dump complete"); + spin_unlock_irqrestore(&kbdev->csf.dof.lock, flags); + + /* User space finished the dump. + * Wake up blocked kernel threads to proceed. + */ + wake_up(&kbdev->csf.dof.dump_wait_wq); + + return count; +} + +static int debug_csf_fault_release(struct inode *in, struct file *file) +{ + struct kbase_device *kbdev; + unsigned long flags; + + if (unlikely(!in)) { + pr_warn("%s: inode is NULL", __func__); + return -EINVAL; + } + + kbdev = in->i_private; + spin_lock_irqsave(&kbdev->csf.dof.lock, flags); + kbdev->csf.dof.kctx_tgid = 0; + kbdev->csf.dof.kctx_id = 0; + kbdev->csf.dof.error_code = DF_NO_ERROR; + spin_unlock_irqrestore(&kbdev->csf.dof.lock, flags); + + atomic_set(&kbdev->csf.dof.enabled, 0); + dev_info(kbdev->dev, "debug csf fault file close"); + + /* User space closed the debugfs file. + * Wake up blocked kernel threads to resume. 
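Taken together, the open/read/write/release handlers in this new file define a single-client handshake: read() blocks until a fault is recorded and returns a "tgid_ctxid_errorcode" line, and a later write() of any data acknowledges that the dump has been collected, clearing the fault and waking blocked kernel threads. A rough user-space sketch of that flow follows; the debugfs path is an assumption and is not taken from this patch.

/* Illustrative user-space client for the csf_fault handshake described above.
 * The debugfs path below is assumed; the read buffer must be at least 64
 * bytes because debug_csf_fault_read() rejects shorter reads with -EINVAL.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char info[128];
	ssize_t n;
	int fd = open("/sys/kernel/debug/mali0/csf_fault", O_RDWR);

	if (fd < 0)
		return 1;

	n = read(fd, info, sizeof(info) - 1);	/* blocks until a fault is reported */
	if (n > 0) {
		info[n] = '\0';
		printf("fault: %s", info);	/* "tgid_ctxid_errorcode\n" */

		/* ...collect whatever dump state is needed here... */

		write(fd, "done", 4);	/* acknowledge: clears the fault */
	}

	close(fd);
	return 0;
}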
+ */ + wake_up(&kbdev->csf.dof.dump_wait_wq); + + return 0; +} + +static const struct file_operations kbasep_debug_csf_fault_fops = { + .owner = THIS_MODULE, + .open = debug_csf_fault_open, + .read = debug_csf_fault_read, + .write = debug_csf_fault_write, + .llseek = default_llseek, + .release = debug_csf_fault_release, +}; + +void kbase_debug_csf_fault_debugfs_init(struct kbase_device *kbdev) +{ + const char *fname = "csf_fault"; + + if (unlikely(!kbdev)) { + pr_warn("%s: kbdev is NULL", __func__); + return; + } + + debugfs_create_file(fname, 0600, kbdev->mali_debugfs_directory, kbdev, + &kbasep_debug_csf_fault_fops); +} + +int kbase_debug_csf_fault_init(struct kbase_device *kbdev) +{ + if (unlikely(!kbdev)) { + pr_warn("%s: kbdev is NULL", __func__); + return -EINVAL; + } + + init_waitqueue_head(&(kbdev->csf.dof.fault_wait_wq)); + init_waitqueue_head(&(kbdev->csf.dof.dump_wait_wq)); + spin_lock_init(&kbdev->csf.dof.lock); + kbdev->csf.dof.kctx_tgid = 0; + kbdev->csf.dof.kctx_id = 0; + kbdev->csf.dof.error_code = DF_NO_ERROR; + atomic_set(&kbdev->csf.dof.enabled, 0); + + return 0; +} + +void kbase_debug_csf_fault_term(struct kbase_device *kbdev) +{ +} +#endif /* CONFIG_DEBUG_FS */ diff --git a/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.h b/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.h new file mode 100644 index 000000000000..6e9b1a9d51de --- /dev/null +++ b/drivers/gpu/arm/bifrost/csf/mali_kbase_debug_csf_fault.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _KBASE_DEBUG_CSF_FAULT_H +#define _KBASE_DEBUG_CSF_FAULT_H + +#if IS_ENABLED(CONFIG_DEBUG_FS) +/** + * kbase_debug_csf_fault_debugfs_init - Initialize CSF fault debugfs + * @kbdev: Device pointer + */ +void kbase_debug_csf_fault_debugfs_init(struct kbase_device *kbdev); + +/** + * kbase_debug_csf_fault_init - Create the fault event wait queue per device + * and initialize the required resources. + * @kbdev: Device pointer + * + * Return: Zero on success or a negative error code. + */ +int kbase_debug_csf_fault_init(struct kbase_device *kbdev); + +/** + * kbase_debug_csf_fault_term - Clean up resources created by + * @kbase_debug_csf_fault_init. + * @kbdev: Device pointer + */ +void kbase_debug_csf_fault_term(struct kbase_device *kbdev); + +/** + * kbase_debug_csf_fault_wait_completion - Wait for the client to complete. + * + * @kbdev: Device Pointer + * + * Wait for the user space client to finish reading the fault information. + * This function must be called in thread context. + */ +void kbase_debug_csf_fault_wait_completion(struct kbase_device *kbdev); + +/** + * kbase_debug_csf_fault_notify - Notify client of a fault. 
+ * + * @kbdev: Device pointer + * @kctx: Faulty context (can be NULL) + * @error: Error code. + * + * Store fault information and wake up the user space client. + * + * Return: true if a dump on fault was initiated or is already in progress and + * so the caller can opt to wait for the dumping to complete. + */ +bool kbase_debug_csf_fault_notify(struct kbase_device *kbdev, + struct kbase_context *kctx, enum dumpfault_error_type error); + +/** + * kbase_debug_csf_fault_dump_enabled - Check if dump on fault is enabled. + * + * @kbdev: Device pointer + * + * Return: true if debugfs file is opened so dump on fault is enabled. + */ +static inline bool kbase_debug_csf_fault_dump_enabled(struct kbase_device *kbdev) +{ + return atomic_read(&kbdev->csf.dof.enabled); +} + +/** + * kbase_debug_csf_fault_dump_complete - Check if dump on fault is completed. + * + * @kbdev: Device pointer + * + * Return: true if dump on fault completes or file is closed. + */ +static inline bool kbase_debug_csf_fault_dump_complete(struct kbase_device *kbdev) +{ + unsigned long flags; + bool ret; + + if (likely(!kbase_debug_csf_fault_dump_enabled(kbdev))) + return true; + + spin_lock_irqsave(&kbdev->csf.dof.lock, flags); + ret = (kbdev->csf.dof.error_code == DF_NO_ERROR); + spin_unlock_irqrestore(&kbdev->csf.dof.lock, flags); + + return ret; +} +#else /* CONFIG_DEBUG_FS */ +static inline int kbase_debug_csf_fault_init(struct kbase_device *kbdev) +{ + return 0; +} + +static inline void kbase_debug_csf_fault_term(struct kbase_device *kbdev) +{ +} + +static inline void kbase_debug_csf_fault_wait_completion(struct kbase_device *kbdev) +{ +} + +static inline bool kbase_debug_csf_fault_notify(struct kbase_device *kbdev, + struct kbase_context *kctx, enum dumpfault_error_type error) +{ + return false; +} + +static inline bool kbase_debug_csf_fault_dump_enabled(struct kbase_device *kbdev) +{ + return false; +} + +static inline bool kbase_debug_csf_fault_dump_complete(struct kbase_device *kbdev) +{ + return true; +} +#endif /* CONFIG_DEBUG_FS */ + +#endif /*_KBASE_DEBUG_CSF_FAULT_H*/ diff --git a/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_ktrace_codes_csf.h b/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_ktrace_codes_csf.h index 9e4da9f11787..41b2b00f18c8 100644 --- a/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_ktrace_codes_csf.h +++ b/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_ktrace_codes_csf.h @@ -42,19 +42,25 @@ int dummy_array[] = { /* * Generic CSF events */ + /* info_val = 0 */ KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_EVICT_CTX_SLOTS_START), + /* info_val == number of CSGs supported */ + KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_EVICT_CTX_SLOTS_END), /* info_val[0:7] == fw version_minor * info_val[15:8] == fw version_major * info_val[63:32] == fw version_hash */ KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_BOOT), KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_REBOOT), + KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TOCK_INVOKE), + KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TICK_INVOKE), KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TOCK_START), KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TOCK_END), /* info_val == total number of runnable groups across all kctxs */ KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TICK_START), KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_TICK_END), KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_RESET_START), + KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_RESET_END), /* info_val = timeout in ms */ KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_PROTM_WAIT_QUIT_START), /* info_val = remaining ms timeout, or 0 if timedout */ @@ -101,6 +107,8 @@
int dummy_array[] = { * purpose. */ KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_GPU_IDLE_WORKER_HANDLING_START), + KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_GPU_IDLE_WORKER_HANDLING_END), + KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_MCU_HALTED), KBASE_KTRACE_CODE_MAKE_CODE(CSF_FIRMWARE_MCU_SLEEP), @@ -126,6 +134,8 @@ int dummy_array[] = { * group->csg_nr indicates which bit was set */ KBASE_KTRACE_CODE_MAKE_CODE(CSG_SLOT_IDLE_SET), + KBASE_KTRACE_CODE_MAKE_CODE(CSG_INTERRUPT_NO_NON_IDLE_GROUPS), + KBASE_KTRACE_CODE_MAKE_CODE(CSG_INTERRUPT_NON_IDLE_GROUPS), /* info_val = scheduler's new csg_slots_idle_mask[0] * group->csg_nr indicates which bit was cleared * @@ -190,10 +200,37 @@ int dummy_array[] = { KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_NONIDLE_OFFSLOT_GRP_INC), /* info_val == new count of off-slot non-idle groups */ KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_NONIDLE_OFFSLOT_GRP_DEC), + /* info_val = scheduler's new csg_slots_idle_mask[0] + * group->csg_nr indicates which bit was set + */ + KBASE_KTRACE_CODE_MAKE_CODE(SCHEDULER_HANDLE_IDLE_SLOTS), KBASE_KTRACE_CODE_MAKE_CODE(PROTM_EVENT_WORKER_START), KBASE_KTRACE_CODE_MAKE_CODE(PROTM_EVENT_WORKER_END), + /* info_val = scheduler state */ + KBASE_KTRACE_CODE_MAKE_CODE(SCHED_BUSY), + KBASE_KTRACE_CODE_MAKE_CODE(SCHED_INACTIVE), + KBASE_KTRACE_CODE_MAKE_CODE(SCHED_SUSPENDED), + KBASE_KTRACE_CODE_MAKE_CODE(SCHED_SLEEPING), + + /* info_val = mcu state */ +#define KBASEP_MCU_STATE(n) KBASE_KTRACE_CODE_MAKE_CODE(PM_MCU_ ## n), +#include "backend/gpu/mali_kbase_pm_mcu_states.h" +#undef KBASEP_MCU_STATE + + /* info_val = number of runnable groups */ + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_INACTIVE), + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_RUNNABLE), + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_IDLE), + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_SUSPENDED), + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_SUSPENDED_ON_IDLE), + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_SUSPENDED_ON_WAIT_SYNC), + /* info_val = new run state of the evicted group */ + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_FAULT_EVICTED), + /* info_val = get the number of active CSGs */ + KBASE_KTRACE_CODE_MAKE_CODE(CSF_GROUP_TERMINATED), + /* * Group + Queue events */ diff --git a/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_linux_ktrace_csf.h b/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_linux_ktrace_csf.h index 86e81e510b47..ddcac906c492 100644 --- a/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_linux_ktrace_csf.h +++ b/drivers/gpu/arm/bifrost/debug/backend/mali_kbase_debug_linux_ktrace_csf.h @@ -31,13 +31,17 @@ * Generic CSF events - using the common DEFINE_MALI_ADD_EVENT */ DEFINE_MALI_ADD_EVENT(SCHEDULER_EVICT_CTX_SLOTS_START); +DEFINE_MALI_ADD_EVENT(SCHEDULER_EVICT_CTX_SLOTS_END); DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_BOOT); DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_REBOOT); +DEFINE_MALI_ADD_EVENT(SCHEDULER_TOCK_INVOKE); +DEFINE_MALI_ADD_EVENT(SCHEDULER_TICK_INVOKE); DEFINE_MALI_ADD_EVENT(SCHEDULER_TOCK_START); DEFINE_MALI_ADD_EVENT(SCHEDULER_TOCK_END); DEFINE_MALI_ADD_EVENT(SCHEDULER_TICK_START); DEFINE_MALI_ADD_EVENT(SCHEDULER_TICK_END); DEFINE_MALI_ADD_EVENT(SCHEDULER_RESET_START); +DEFINE_MALI_ADD_EVENT(SCHEDULER_RESET_END); DEFINE_MALI_ADD_EVENT(SCHEDULER_PROTM_WAIT_QUIT_START); DEFINE_MALI_ADD_EVENT(SCHEDULER_PROTM_WAIT_QUIT_END); DEFINE_MALI_ADD_EVENT(SCHEDULER_GROUP_SYNC_UPDATE_EVENT); @@ -58,8 +62,16 @@ DEFINE_MALI_ADD_EVENT(SCHEDULER_GROUP_SYNC_UPDATE_WORKER_START); DEFINE_MALI_ADD_EVENT(SCHEDULER_GROUP_SYNC_UPDATE_WORKER_END); DEFINE_MALI_ADD_EVENT(SCHEDULER_UPDATE_IDLE_SLOTS_ACK); 
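The PM_MCU_* trace codes added in the hunk above are generated with an X-macro: the list of states lives in a separate header, and each site defines KBASEP_MCU_STATE to a different expansion before including it (the next hunk expands the very same list again with DEFINE_MALI_ADD_EVENT). A minimal self-contained sketch of the same pattern is shown below; the state names are invented for illustration and are not the contents of mali_kbase_pm_mcu_states.h.

/* Self-contained illustration of the X-macro pattern used above.
 * The list would normally live in its own header, one STATE(x) per line.
 */
#define EXAMPLE_STATE_LIST \
	EXAMPLE_STATE(OFF)  \
	EXAMPLE_STATE(ON)   \
	EXAMPLE_STATE(SLEEP)

/* First expansion: build an enum of states. */
#define EXAMPLE_STATE(n) EXAMPLE_STATE_##n,
enum example_state { EXAMPLE_STATE_LIST };
#undef EXAMPLE_STATE

/* Second expansion: build a matching name table from the same list. */
#define EXAMPLE_STATE(n) #n,
static const char *const example_state_names[] = { EXAMPLE_STATE_LIST };
#undef EXAMPLE_STATE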
DEFINE_MALI_ADD_EVENT(SCHEDULER_GPU_IDLE_WORKER_HANDLING_START); +DEFINE_MALI_ADD_EVENT(SCHEDULER_GPU_IDLE_WORKER_HANDLING_END); DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_MCU_HALTED); DEFINE_MALI_ADD_EVENT(CSF_FIRMWARE_MCU_SLEEP); +DEFINE_MALI_ADD_EVENT(SCHED_BUSY); +DEFINE_MALI_ADD_EVENT(SCHED_INACTIVE); +DEFINE_MALI_ADD_EVENT(SCHED_SUSPENDED); +DEFINE_MALI_ADD_EVENT(SCHED_SLEEPING); +#define KBASEP_MCU_STATE(n) DEFINE_MALI_ADD_EVENT(PM_MCU_ ## n); +#include "backend/gpu/mali_kbase_pm_mcu_states.h" +#undef KBASEP_MCU_STATE DECLARE_EVENT_CLASS(mali_csf_grp_q_template, TP_PROTO(struct kbase_device *kbdev, struct kbase_queue_group *group, @@ -136,6 +148,8 @@ DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_STOPPED); DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_CLEANED); DEFINE_MALI_CSF_GRP_EVENT(CSG_UPDATE_IDLE_SLOT_REQ); DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_IDLE_SET); +DEFINE_MALI_CSF_GRP_EVENT(CSG_INTERRUPT_NO_NON_IDLE_GROUPS); +DEFINE_MALI_CSF_GRP_EVENT(CSG_INTERRUPT_NON_IDLE_GROUPS); DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_IDLE_CLEAR); DEFINE_MALI_CSF_GRP_EVENT(CSG_SLOT_PRIO_UPDATE); DEFINE_MALI_CSF_GRP_EVENT(CSG_INTERRUPT_SYNC_UPDATE); @@ -160,8 +174,17 @@ DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_PROTM_EXIT); DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_TOP_GRP); DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_NONIDLE_OFFSLOT_GRP_INC); DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_NONIDLE_OFFSLOT_GRP_DEC); +DEFINE_MALI_CSF_GRP_EVENT(SCHEDULER_HANDLE_IDLE_SLOTS); DEFINE_MALI_CSF_GRP_EVENT(PROTM_EVENT_WORKER_START); DEFINE_MALI_CSF_GRP_EVENT(PROTM_EVENT_WORKER_END); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_INACTIVE); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_RUNNABLE); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_IDLE); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_SUSPENDED); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_SUSPENDED_ON_IDLE); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_SUSPENDED_ON_WAIT_SYNC); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_FAULT_EVICTED); +DEFINE_MALI_CSF_GRP_EVENT(CSF_GROUP_TERMINATED); #undef DEFINE_MALI_CSF_GRP_EVENT diff --git a/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_ktrace_codes.h b/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_ktrace_codes.h index 1c6b4cd26fe0..6103c3ee04a8 100644 --- a/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_ktrace_codes.h +++ b/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_ktrace_codes.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2011-2015, 2018-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2011-2015, 2018-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -142,6 +142,11 @@ int dummy_array[] = { KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_SUSPEND_CALLBACK), KBASE_KTRACE_CODE_MAKE_CODE(PM_RUNTIME_RESUME_CALLBACK), + /* info_val = l2 state */ +#define KBASEP_L2_STATE(n) KBASE_KTRACE_CODE_MAKE_CODE(PM_L2_ ## n), +#include "backend/gpu/mali_kbase_pm_l2_states.h" +#undef KBASEP_L2_STATE + /* * Context Scheduler events */ diff --git a/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_linux_ktrace.h b/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_linux_ktrace.h index 5fac763d1916..6d96647161b4 100644 --- a/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_linux_ktrace.h +++ b/drivers/gpu/arm/bifrost/debug/mali_kbase_debug_linux_ktrace.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2014, 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2014, 2018, 2020-2022 ARM Limited. 
All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -98,6 +98,9 @@ DEFINE_MALI_ADD_EVENT(PM_WAKE_WAITERS); DEFINE_MALI_ADD_EVENT(PM_POWEROFF_WAIT_WQ); DEFINE_MALI_ADD_EVENT(PM_RUNTIME_SUSPEND_CALLBACK); DEFINE_MALI_ADD_EVENT(PM_RUNTIME_RESUME_CALLBACK); +#define KBASEP_L2_STATE(n) DEFINE_MALI_ADD_EVENT(PM_L2_ ## n); +#include "backend/gpu/mali_kbase_pm_l2_states.h" +#undef KBASEP_L2_STATE DEFINE_MALI_ADD_EVENT(SCHED_RETAIN_CTX_NOLOCK); DEFINE_MALI_ADD_EVENT(SCHED_RELEASE_CTX); #ifdef CONFIG_MALI_ARBITER_SUPPORT diff --git a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_csf.c b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_csf.c index 1e84f6b2644d..277569381292 100644 --- a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_csf.c +++ b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_csf.c @@ -23,8 +23,8 @@ #include #include -#include -#include +#include +#include #include #include #include @@ -40,9 +40,10 @@ #include #include #include -#include +#include #include #include +#include /** * kbase_device_firmware_hwcnt_term - Terminate CSF firmware and HWC @@ -60,7 +61,7 @@ static void kbase_device_firmware_hwcnt_term(struct kbase_device *kbdev) kbase_vinstr_term(kbdev->vinstr_ctx); kbase_hwcnt_virtualizer_term(kbdev->hwcnt_gpu_virt); kbase_hwcnt_backend_csf_metadata_term(&kbdev->hwcnt_gpu_iface); - kbase_csf_firmware_term(kbdev); + kbase_csf_firmware_unload_term(kbdev); } } @@ -197,6 +198,20 @@ static int kbase_csf_early_init(struct kbase_device *kbdev) static void kbase_csf_early_term(struct kbase_device *kbdev) { kbase_csf_scheduler_early_term(kbdev); + kbase_csf_firmware_early_term(kbdev); +} + +/** + * kbase_csf_late_init - late initialization for firmware. + * @kbdev: Device pointer + * + * Return: 0 on success, error code otherwise. 
+ */ +static int kbase_csf_late_init(struct kbase_device *kbdev) +{ + int err = kbase_csf_firmware_late_init(kbdev); + + return err; } /** @@ -269,59 +284,48 @@ static void kbase_device_hwcnt_backend_csf_term(struct kbase_device *kbdev) static const struct kbase_device_init dev_init[] = { #if IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) - { kbase_gpu_device_create, kbase_gpu_device_destroy, - "Dummy model initialization failed" }, + { kbase_gpu_device_create, kbase_gpu_device_destroy, "Dummy model initialization failed" }, #else { assign_irqs, NULL, "IRQ search failed" }, { registers_map, registers_unmap, "Register map failed" }, #endif - { power_control_init, power_control_term, - "Power control initialization failed" }, + { power_control_init, power_control_term, "Power control initialization failed" }, { kbase_device_io_history_init, kbase_device_io_history_term, "Register access history initialization failed" }, - { kbase_device_early_init, kbase_device_early_term, - "Early device initialization failed" }, - { kbase_device_populate_max_freq, NULL, - "Populating max frequency failed" }, - { kbase_pm_lowest_gpu_freq_init, NULL, - "Lowest freq initialization failed" }, + { kbase_device_early_init, kbase_device_early_term, "Early device initialization failed" }, + { kbase_device_populate_max_freq, NULL, "Populating max frequency failed" }, + { kbase_pm_lowest_gpu_freq_init, NULL, "Lowest freq initialization failed" }, { kbase_device_misc_init, kbase_device_misc_term, "Miscellaneous device initialization failed" }, { kbase_device_pcm_dev_init, kbase_device_pcm_dev_term, "Priority control manager initialization failed" }, - { kbase_ctx_sched_init, kbase_ctx_sched_term, - "Context scheduler initialization failed" }, - { kbase_mem_init, kbase_mem_term, - "Memory subsystem initialization failed" }, + { kbase_ctx_sched_init, kbase_ctx_sched_term, "Context scheduler initialization failed" }, + { kbase_mem_init, kbase_mem_term, "Memory subsystem initialization failed" }, { kbase_csf_protected_memory_init, kbase_csf_protected_memory_term, "Protected memory allocator initialization failed" }, { kbase_device_coherency_init, NULL, "Device coherency init failed" }, { kbase_protected_mode_init, kbase_protected_mode_term, "Protected mode subsystem initialization failed" }, - { kbase_device_list_init, kbase_device_list_term, - "Device list setup failed" }, + { kbase_device_list_init, kbase_device_list_term, "Device list setup failed" }, { kbase_device_timeline_init, kbase_device_timeline_term, "Timeline stream initialization failed" }, { kbase_clk_rate_trace_manager_init, kbase_clk_rate_trace_manager_term, "Clock rate trace manager initialization failed" }, - { kbase_device_hwcnt_watchdog_if_init, - kbase_device_hwcnt_watchdog_if_term, + { kbase_device_hwcnt_watchdog_if_init, kbase_device_hwcnt_watchdog_if_term, "GPU hwcnt backend watchdog interface creation failed" }, - { kbase_device_hwcnt_backend_csf_if_init, - kbase_device_hwcnt_backend_csf_if_term, + { kbase_device_hwcnt_backend_csf_if_init, kbase_device_hwcnt_backend_csf_if_term, "GPU hwcnt backend CSF interface creation failed" }, - { kbase_device_hwcnt_backend_csf_init, - kbase_device_hwcnt_backend_csf_term, + { kbase_device_hwcnt_backend_csf_init, kbase_device_hwcnt_backend_csf_term, "GPU hwcnt backend creation failed" }, { kbase_device_hwcnt_context_init, kbase_device_hwcnt_context_term, "GPU hwcnt context initialization failed" }, - { kbase_csf_early_init, kbase_csf_early_term, - "Early CSF initialization failed" }, - { kbase_backend_late_init, 
kbase_backend_late_term, - "Late backend initialization failed" }, + { kbase_csf_early_init, kbase_csf_early_term, "Early CSF initialization failed" }, + { kbase_backend_late_init, kbase_backend_late_term, "Late backend initialization failed" }, + { kbase_csf_late_init, NULL, "Late CSF initialization failed" }, { NULL, kbase_device_firmware_hwcnt_term, NULL }, - { kbase_device_debugfs_init, kbase_device_debugfs_term, - "DebugFS initialization failed" }, + { kbase_debug_csf_fault_init, kbase_debug_csf_fault_term, + "CSF fault debug initialization failed" }, + { kbase_device_debugfs_init, kbase_device_debugfs_term, "DebugFS initialization failed" }, /* Sysfs init needs to happen before registering the device with * misc_register(), otherwise it causes a race condition between * registering the device and a uevent event being generated for @@ -339,8 +343,7 @@ static const struct kbase_device_init dev_init[] = { "Misc device registration failed" }, { kbase_gpuprops_populate_user_buffer, kbase_gpuprops_free_user_buffer, "GPU property population failed" }, - { kbase_device_late_init, kbase_device_late_term, - "Late device initialization failed" }, + { kbase_device_late_init, kbase_device_late_term, "Late device initialization failed" }, }; static void kbase_device_term_partial(struct kbase_device *kbdev, @@ -468,7 +471,7 @@ static int kbase_csf_firmware_deferred_init(struct kbase_device *kbdev) lockdep_assert_held(&kbdev->fw_load_lock); - err = kbase_csf_firmware_init(kbdev); + err = kbase_csf_firmware_load_init(kbdev); if (!err) { unsigned long flags; @@ -498,11 +501,12 @@ int kbase_device_firmware_init_once(struct kbase_device *kbdev) ret = kbase_device_hwcnt_csf_deferred_init(kbdev); if (ret) { - kbase_csf_firmware_term(kbdev); + kbase_csf_firmware_unload_term(kbdev); goto out; } kbase_csf_debugfs_init(kbdev); + kbase_timeline_io_debugfs_init(kbdev); out: kbase_pm_context_idle(kbdev); } diff --git a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_hw_csf.c b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_hw_csf.c index 7939bfd8e74c..3b792968a7d7 100644 --- a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_hw_csf.c +++ b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_hw_csf.c @@ -115,6 +115,9 @@ void kbase_gpu_interrupt(struct kbase_device *kbdev, u32 val) GPU_EXCEPTION_TYPE_SW_FAULT_0, } } }; + kbase_debug_csf_fault_notify(kbdev, scheduler->active_protm_grp->kctx, + DF_GPU_PROTECTED_FAULT); + scheduler->active_protm_grp->faulted = true; kbase_csf_add_group_fatal_error( scheduler->active_protm_grp, &err_payload); @@ -201,8 +204,11 @@ static bool kbase_is_register_accessible(u32 offset) void kbase_reg_write(struct kbase_device *kbdev, u32 offset, u32 value) { - KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); - KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + if (WARN_ON(!kbdev->pm.backend.gpu_powered)) + return; + + if (WARN_ON(kbdev->dev == NULL)) + return; if (!kbase_is_register_accessible(offset)) return; @@ -222,8 +228,11 @@ u32 kbase_reg_read(struct kbase_device *kbdev, u32 offset) { u32 val; - KBASE_DEBUG_ASSERT(kbdev->pm.backend.gpu_powered); - KBASE_DEBUG_ASSERT(kbdev->dev != NULL); + if (WARN_ON(!kbdev->pm.backend.gpu_powered)) + return 0; + + if (WARN_ON(kbdev->dev == NULL)) + return 0; if (!kbase_is_register_accessible(offset)) return 0; diff --git a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_jm.c b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_jm.c index ed7512ef2e39..129b4e430c52 100644 --- 
a/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_jm.c +++ b/drivers/gpu/arm/bifrost/device/backend/mali_kbase_device_jm.c @@ -27,9 +27,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include #if IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) #include diff --git a/drivers/gpu/arm/bifrost/device/mali_kbase_device.c b/drivers/gpu/arm/bifrost/device/mali_kbase_device.c index 7004e347fa1b..fa3669a409e2 100644 --- a/drivers/gpu/arm/bifrost/device/mali_kbase_device.c +++ b/drivers/gpu/arm/bifrost/device/mali_kbase_device.c @@ -42,8 +42,8 @@ #include #include "mali_kbase_kinstr_prfcnt.h" #include "mali_kbase_vinstr.h" -#include "mali_kbase_hwcnt_context.h" -#include "mali_kbase_hwcnt_virtualizer.h" +#include "hwcnt/mali_kbase_hwcnt_context.h" +#include "hwcnt/mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_device.h" #include "mali_kbase_device_internal.h" @@ -56,17 +56,15 @@ #include "arbiter/mali_kbase_arbiter_pm.h" #endif /* CONFIG_MALI_ARBITER_SUPPORT */ -/* NOTE: Magic - 0x45435254 (TRCE in ASCII). - * Supports tracing feature provided in the base module. - * Please keep it in sync with the value of base module. - */ -#define TRACE_BUFFER_HEADER_SPECIAL 0x45435254 +#if defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) /* Number of register accesses for the buffer that we allocate during * initialization time. The buffer size can be changed later via debugfs. */ #define KBASEP_DEFAULT_REGISTER_HISTORY_SIZE ((u16)512) +#endif /* defined(CONFIG_DEBUG_FS) && !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) */ + static DEFINE_MUTEX(kbase_dev_list_lock); static LIST_HEAD(kbase_dev_list); static int kbase_dev_nr; diff --git a/drivers/gpu/arm/bifrost/device/mali_kbase_device.h b/drivers/gpu/arm/bifrost/device/mali_kbase_device.h index 6706a61d5baa..f025011009d5 100644 --- a/drivers/gpu/arm/bifrost/device/mali_kbase_device.h +++ b/drivers/gpu/arm/bifrost/device/mali_kbase_device.h @@ -130,7 +130,11 @@ bool kbase_is_gpu_removed(struct kbase_device *kbdev); * * Return: 0 if successful or a negative error code on failure. */ -#define kbase_gpu_cache_flush_pa_range_and_busy_wait(kbdev, phys, nr_bytes, flush_op) (0) +#if MALI_USE_CSF +int kbase_gpu_cache_flush_pa_range_and_busy_wait(struct kbase_device *kbdev, phys_addr_t phys, + size_t nr_bytes, u32 flush_op); +#endif /* MALI_USE_CSF */ + /** * kbase_gpu_cache_flush_and_busy_wait - Start a cache flush and busy wait * @kbdev: Kbase device diff --git a/drivers/gpu/arm/bifrost/device/mali_kbase_device_hw.c b/drivers/gpu/arm/bifrost/device/mali_kbase_device_hw.c index 4bd545a82299..d55495045892 100644 --- a/drivers/gpu/arm/bifrost/device/mali_kbase_device_hw.c +++ b/drivers/gpu/arm/bifrost/device/mali_kbase_device_hw.c @@ -27,9 +27,6 @@ #include #include -#define U64_LO_MASK ((1ULL << 32) - 1) -#define U64_HI_MASK (~U64_LO_MASK) - #if !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) bool kbase_is_gpu_removed(struct kbase_device *kbdev) { @@ -86,7 +83,38 @@ static int busy_wait_on_irq(struct kbase_device *kbdev, u32 irq_bit) return 0; } -#define kbase_gpu_cache_flush_pa_range_and_busy_wait(kbdev, phys, nr_bytes, flush_op) (0) +#if MALI_USE_CSF +#define U64_LO_MASK ((1ULL << 32) - 1) +#define U64_HI_MASK (~U64_LO_MASK) + +int kbase_gpu_cache_flush_pa_range_and_busy_wait(struct kbase_device *kbdev, phys_addr_t phys, + size_t nr_bytes, u32 flush_op) +{ + u64 start_pa, end_pa; + int ret = 0; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + /* 1. Clear the interrupt FLUSH_PA_RANGE_COMPLETED bit. 
*/ + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_IRQ_CLEAR), FLUSH_PA_RANGE_COMPLETED); + + /* 2. Issue GPU_CONTROL.COMMAND.FLUSH_PA_RANGE operation. */ + start_pa = phys; + end_pa = start_pa + nr_bytes - 1; + + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG0_LO), start_pa & U64_LO_MASK); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG0_HI), + (start_pa & U64_HI_MASK) >> 32); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG1_LO), end_pa & U64_LO_MASK); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND_ARG1_HI), (end_pa & U64_HI_MASK) >> 32); + kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), flush_op); + + /* 3. Busy-wait irq status to be enabled. */ + ret = busy_wait_on_irq(kbdev, (u32)FLUSH_PA_RANGE_COMPLETED); + + return ret; +} +#endif /* MALI_USE_CSF */ int kbase_gpu_cache_flush_and_busy_wait(struct kbase_device *kbdev, u32 flush_op) diff --git a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_fault_jm.c b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_fault_jm.c index 37015ccacd7c..7f3743ca6432 100644 --- a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_fault_jm.c +++ b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_fault_jm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -170,7 +170,7 @@ const char *kbase_gpu_exception_name(u32 const exception_code) default: e = "UNKNOWN"; break; - }; + } return e; } diff --git a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h index 06c725c0e757..e7457ddb5534 100644 --- a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h +++ b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h @@ -35,10 +35,7 @@ #define MCU_SUBSYSTEM_BASE 0x20000 /* IPA control registers */ -#define IPA_CONTROL_BASE 0x40000 -#define IPA_CONTROL_REG(r) (IPA_CONTROL_BASE+(r)) #define COMMAND 0x000 /* (WO) Command register */ -#define STATUS 0x004 /* (RO) Status register */ #define TIMER 0x008 /* (RW) Timer control register */ #define SELECT_CSHW_LO 0x010 /* (RW) Counter select for CS hardware, low word */ @@ -127,8 +124,16 @@ #define MCU_STATUS_HALTED (1 << 1) +#define L2_CONFIG_PBHA_HWU_SHIFT GPU_U(12) +#define L2_CONFIG_PBHA_HWU_MASK (GPU_U(0xF) << L2_CONFIG_PBHA_HWU_SHIFT) +#define L2_CONFIG_PBHA_HWU_GET(reg_val) \ + (((reg_val)&L2_CONFIG_PBHA_HWU_MASK) >> L2_CONFIG_PBHA_HWU_SHIFT) +#define L2_CONFIG_PBHA_HWU_SET(reg_val, value) \ + (((reg_val) & ~L2_CONFIG_PBHA_HWU_MASK) | \ + (((value) << L2_CONFIG_PBHA_HWU_SHIFT) & L2_CONFIG_PBHA_HWU_MASK)) + /* JOB IRQ flags */ -#define JOB_IRQ_GLOBAL_IF (1 << 31) /* Global interface interrupt received */ +#define JOB_IRQ_GLOBAL_IF (1u << 31) /* Global interface interrupt received */ /* GPU_COMMAND codes */ #define GPU_COMMAND_CODE_NOP 0x00 /* No operation, nothing happens */ diff --git a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h index c349f4b058cd..380ec30d607f 100644 --- a/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h +++ b/drivers/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h @@ -127,29 +127,12 @@ #define JOB_SLOT_REG(n, r) (JOB_CONTROL_REG(JOB_SLOT0 + ((n) << 7)) + (r)) 
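Stepping back to the kbase_gpu_cache_flush_pa_range_and_busy_wait() implementation added just above (before the register map changes), a caller is expected to hold hwaccess_lock, per the lockdep assertion, and to pass one of the FLUSH_PA_RANGE GPU command encodings. A hedged sketch of such a caller follows; the helper name is invented and flush_op is left as a parameter because the command encodings sit outside this hunk.

/* Illustration only, CSF GPUs (MALI_USE_CSF): flush one page of physical
 * memory and busy-wait for completion. flush_op must be a FLUSH_PA_RANGE
 * GPU command value, which is defined elsewhere in the driver.
 */
static int example_flush_one_page(struct kbase_device *kbdev, phys_addr_t pa, u32 flush_op)
{
	unsigned long flags;
	int err;

	spin_lock_irqsave(&kbdev->hwaccess_lock, flags);
	err = kbase_gpu_cache_flush_pa_range_and_busy_wait(kbdev, pa, PAGE_SIZE, flush_op);
	spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags);

	return err;
}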
-#define JS_HEAD_LO 0x00 /* (RO) Job queue head pointer for job slot n, low word */ -#define JS_HEAD_HI 0x04 /* (RO) Job queue head pointer for job slot n, high word */ -#define JS_TAIL_LO 0x08 /* (RO) Job queue tail pointer for job slot n, low word */ -#define JS_TAIL_HI 0x0C /* (RO) Job queue tail pointer for job slot n, high word */ -#define JS_AFFINITY_LO 0x10 /* (RO) Core affinity mask for job slot n, low word */ -#define JS_AFFINITY_HI 0x14 /* (RO) Core affinity mask for job slot n, high word */ -#define JS_CONFIG 0x18 /* (RO) Configuration settings for job slot n */ -/* (RO) Extended affinity mask for job slot n*/ -#define JS_XAFFINITY 0x1C +#define JS_XAFFINITY 0x1C /* (RO) Extended affinity mask for job slot n*/ #define JS_COMMAND 0x20 /* (WO) Command register for job slot n */ #define JS_STATUS 0x24 /* (RO) Status register for job slot n */ -#define JS_HEAD_NEXT_LO 0x40 /* (RW) Next job queue head pointer for job slot n, low word */ -#define JS_HEAD_NEXT_HI 0x44 /* (RW) Next job queue head pointer for job slot n, high word */ - -#define JS_AFFINITY_NEXT_LO 0x50 /* (RW) Next core affinity mask for job slot n, low word */ -#define JS_AFFINITY_NEXT_HI 0x54 /* (RW) Next core affinity mask for job slot n, high word */ -#define JS_CONFIG_NEXT 0x58 /* (RW) Next configuration settings for job slot n */ -/* (RW) Next extended affinity mask for job slot n */ -#define JS_XAFFINITY_NEXT 0x5C - -#define JS_COMMAND_NEXT 0x60 /* (RW) Next command register for job slot n */ +#define JS_XAFFINITY_NEXT 0x5C /* (RW) Next extended affinity mask for job slot n */ #define JS_FLUSH_ID_NEXT 0x70 /* (RW) Next job slot n cache flush ID */ diff --git a/drivers/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h b/drivers/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h index 396ebd5e21c9..282f566c0746 100644 --- a/drivers/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h +++ b/drivers/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h @@ -45,9 +45,6 @@ /* Begin Register Offsets */ /* GPU control registers */ -#define GPU_CONTROL_BASE 0x0000 -#define GPU_CONTROL_REG(r) (GPU_CONTROL_BASE + (r)) -#define GPU_ID 0x000 /* (RO) GPU and revision identifier */ #define L2_FEATURES 0x004 /* (RO) Level 2 cache features */ #define TILER_FEATURES 0x00C /* (RO) Tiler Features */ #define MEM_FEATURES 0x010 /* (RO) Memory system features */ @@ -100,6 +97,10 @@ #define TEXTURE_FEATURES_REG(n) GPU_CONTROL_REG(TEXTURE_FEATURES_0 + ((n) << 2)) +#define GPU_COMMAND_ARG0_LO 0x0D0 /* (RW) Additional parameter 0 for GPU commands, low word */ +#define GPU_COMMAND_ARG0_HI 0x0D4 /* (RW) Additional parameter 0 for GPU commands, high word */ +#define GPU_COMMAND_ARG1_LO 0x0D8 /* (RW) Additional parameter 1 for GPU commands, low word */ +#define GPU_COMMAND_ARG1_HI 0x0DC /* (RW) Additional parameter 1 for GPU commands, high word */ #define SHADER_PRESENT_LO 0x100 /* (RO) Shader core present bitmap, low word */ #define SHADER_PRESENT_HI 0x104 /* (RO) Shader core present bitmap, high word */ @@ -113,26 +114,10 @@ #define STACK_PRESENT_LO 0xE00 /* (RO) Core stack present bitmap, low word */ #define STACK_PRESENT_HI 0xE04 /* (RO) Core stack present bitmap, high word */ -#define SHADER_READY_LO 0x140 /* (RO) Shader core ready bitmap, low word */ -#define SHADER_READY_HI 0x144 /* (RO) Shader core ready bitmap, high word */ - -#define TILER_READY_LO 0x150 /* (RO) Tiler core ready bitmap, low word */ -#define TILER_READY_HI 0x154 /* (RO) Tiler core ready bitmap, high word */ - -#define L2_READY_LO 0x160 /* (RO) Level 2 cache ready bitmap, low word */ -#define 
L2_READY_HI 0x164 /* (RO) Level 2 cache ready bitmap, high word */ - #define STACK_READY_LO 0xE10 /* (RO) Core stack ready bitmap, low word */ #define STACK_READY_HI 0xE14 /* (RO) Core stack ready bitmap, high word */ -#define SHADER_PWRON_LO 0x180 /* (WO) Shader core power on bitmap, low word */ -#define SHADER_PWRON_HI 0x184 /* (WO) Shader core power on bitmap, high word */ - -#define TILER_PWRON_LO 0x190 /* (WO) Tiler core power on bitmap, low word */ -#define TILER_PWRON_HI 0x194 /* (WO) Tiler core power on bitmap, high word */ - -#define L2_PWRON_LO 0x1A0 /* (WO) Level 2 cache power on bitmap, low word */ -#define L2_PWRON_HI 0x1A4 /* (WO) Level 2 cache power on bitmap, high word */ +#define SHADER_PWRFEATURES 0x188 /* (RW) Shader core power features */ #define STACK_PWRON_LO 0xE20 /* (RO) Core stack power on bitmap, low word */ #define STACK_PWRON_HI 0xE24 /* (RO) Core stack power on bitmap, high word */ @@ -181,6 +166,8 @@ #define COHERENCY_FEATURES 0x300 /* (RO) Coherency features present */ #define COHERENCY_ENABLE 0x304 /* (RW) Coherency enable */ +#define AMBA_FEATURES 0x300 /* (RO) AMBA bus supported features */ +#define AMBA_ENABLE 0x304 /* (RW) AMBA features enable */ #define SHADER_CONFIG 0xF04 /* (RW) Shader core configuration (implementation-specific) */ #define TILER_CONFIG 0xF08 /* (RW) Tiler core configuration (implementation-specific) */ @@ -188,13 +175,7 @@ /* Job control registers */ -#define JOB_CONTROL_BASE 0x1000 - -#define JOB_CONTROL_REG(r) (JOB_CONTROL_BASE + (r)) - #define JOB_IRQ_RAWSTAT 0x000 /* Raw interrupt status register */ -#define JOB_IRQ_CLEAR 0x004 /* Interrupt clear register */ -#define JOB_IRQ_MASK 0x008 /* Interrupt mask register */ #define JOB_IRQ_STATUS 0x00C /* Interrupt status register */ /* MMU control registers */ @@ -203,7 +184,6 @@ #define MMU_IRQ_MASK 0x008 /* (RW) Interrupt mask register */ #define MMU_IRQ_STATUS 0x00C /* (RO) Interrupt status register */ -#define MMU_AS0 0x400 /* Configuration registers for address space 0 */ #define MMU_AS1 0x440 /* Configuration registers for address space 1 */ #define MMU_AS2 0x480 /* Configuration registers for address space 2 */ #define MMU_AS3 0x4C0 /* Configuration registers for address space 3 */ @@ -221,25 +201,13 @@ #define MMU_AS15 0x7C0 /* Configuration registers for address space 15 */ /* MMU address space control registers */ - -#define MMU_AS_REG(n, r) (MMU_REG(MMU_AS0 + ((n) << 6)) + (r)) - -#define AS_TRANSTAB_LO 0x00 /* (RW) Translation Table Base Address for address space n, low word */ -#define AS_TRANSTAB_HI 0x04 /* (RW) Translation Table Base Address for address space n, high word */ -#define AS_MEMATTR_LO 0x08 /* (RW) Memory attributes for address space n, low word. */ -#define AS_MEMATTR_HI 0x0C /* (RW) Memory attributes for address space n, high word. 
*/ #define AS_LOCKADDR_LO 0x10 /* (RW) Lock region address for address space n, low word */ #define AS_LOCKADDR_HI 0x14 /* (RW) Lock region address for address space n, high word */ -#define AS_COMMAND 0x18 /* (WO) MMU command register for address space n */ #define AS_FAULTSTATUS 0x1C /* (RO) MMU fault status register for address space n */ #define AS_FAULTADDRESS_LO 0x20 /* (RO) Fault Address for address space n, low word */ #define AS_FAULTADDRESS_HI 0x24 /* (RO) Fault Address for address space n, high word */ #define AS_STATUS 0x28 /* (RO) Status flags for address space n */ -/* (RW) Translation table configuration for address space n, low word */ -#define AS_TRANSCFG_LO 0x30 -/* (RW) Translation table configuration for address space n, high word */ -#define AS_TRANSCFG_HI 0x34 /* (RO) Secondary fault address for address space n, low word */ #define AS_FAULTEXTRA_LO 0x38 /* (RO) Secondary fault address for address space n, high word */ @@ -464,6 +432,80 @@ #define L2_CONFIG_ASN_HASH_ENABLE_MASK (1ul << L2_CONFIG_ASN_HASH_ENABLE_SHIFT) /* End L2_CONFIG register */ +/* AMBA_FEATURES register */ +#define AMBA_FEATURES_ACE_LITE_SHIFT GPU_U(0) +#define AMBA_FEATURES_ACE_LITE_MASK (GPU_U(0x1) << AMBA_FEATURES_ACE_LITE_SHIFT) +#define AMBA_FEATURES_ACE_LITE_GET(reg_val) \ + (((reg_val)&AMBA_FEATURES_ACE_LITE_MASK) >> \ + AMBA_FEATURES_ACE_LITE_SHIFT) +#define AMBA_FEATURES_ACE_LITE_SET(reg_val, value) \ + (((reg_val) & ~AMBA_FEATURES_ACE_LITE_MASK) | \ + (((value) << AMBA_FEATURES_ACE_LITE_SHIFT) & \ + AMBA_FEATURES_ACE_LITE_MASK)) +#define AMBA_FEATURES_ACE_SHIFT GPU_U(1) +#define AMBA_FEATURES_ACE_MASK (GPU_U(0x1) << AMBA_FEATURES_ACE_SHIFT) +#define AMBA_FEATURES_ACE_GET(reg_val) \ + (((reg_val)&AMBA_FEATURES_ACE_MASK) >> AMBA_FEATURES_ACE_SHIFT) +#define AMBA_FEATURES_ACE_SET(reg_val, value) \ + (((reg_val) & ~AMBA_FEATURES_ACE_MASK) | \ + (((value) << AMBA_FEATURES_ACE_SHIFT) & AMBA_FEATURES_ACE_MASK)) +#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT GPU_U(5) +#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK \ + (GPU_U(0x1) << AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT) +#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_GET(reg_val) \ + (((reg_val)&AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK) >> \ + AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT) +#define AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SET(reg_val, value) \ + (((reg_val) & ~AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK) | \ + (((value) << AMBA_FEATURES_MEMORY_CACHE_SUPPORT_SHIFT) & \ + AMBA_FEATURES_MEMORY_CACHE_SUPPORT_MASK)) +#define AMBA_FEATURES_INVALIDATE_HINT_SHIFT GPU_U(6) +#define AMBA_FEATURES_INVALIDATE_HINT_MASK \ + (GPU_U(0x1) << AMBA_FEATURES_INVALIDATE_HINT_SHIFT) +#define AMBA_FEATURES_INVALIDATE_HINT_GET(reg_val) \ + (((reg_val)&AMBA_FEATURES_INVALIDATE_HINT_MASK) >> \ + AMBA_FEATURES_INVALIDATE_HINT_SHIFT) +#define AMBA_FEATURES_INVALIDATE_HINT_SET(reg_val, value) \ + (((reg_val) & ~AMBA_FEATURES_INVALIDATE_HINT_MASK) | \ + (((value) << AMBA_FEATURES_INVALIDATE_HINT_SHIFT) & \ + AMBA_FEATURES_INVALIDATE_HINT_MASK)) + +/* AMBA_ENABLE register */ +#define AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT GPU_U(0) +#define AMBA_ENABLE_COHERENCY_PROTOCOL_MASK \ + (GPU_U(0x1F) << AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT) +#define AMBA_ENABLE_COHERENCY_PROTOCOL_GET(reg_val) \ + (((reg_val)&AMBA_ENABLE_COHERENCY_PROTOCOL_MASK) >> \ + AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT) +#define AMBA_ENABLE_COHERENCY_PROTOCOL_SET(reg_val, value) \ + (((reg_val) & ~AMBA_ENABLE_COHERENCY_PROTOCOL_MASK) | \ + (((value) << AMBA_ENABLE_COHERENCY_PROTOCOL_SHIFT) & \ + 
AMBA_ENABLE_COHERENCY_PROTOCOL_MASK)) +/* AMBA_ENABLE_coherency_protocol values */ +#define AMBA_ENABLE_COHERENCY_PROTOCOL_ACE_LITE 0x0 +#define AMBA_ENABLE_COHERENCY_PROTOCOL_ACE 0x1 +#define AMBA_ENABLE_COHERENCY_PROTOCOL_NO_COHERENCY 0x1F +/* End of AMBA_ENABLE_coherency_protocol values */ +#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT GPU_U(5) +#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK \ + (GPU_U(0x1) << AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT) +#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_GET(reg_val) \ + (((reg_val)&AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK) >> \ + AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT) +#define AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SET(reg_val, value) \ + (((reg_val) & ~AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK) | \ + (((value) << AMBA_ENABLE_MEMORY_CACHE_SUPPORT_SHIFT) & \ + AMBA_ENABLE_MEMORY_CACHE_SUPPORT_MASK)) +#define AMBA_ENABLE_INVALIDATE_HINT_SHIFT GPU_U(6) +#define AMBA_ENABLE_INVALIDATE_HINT_MASK \ + (GPU_U(0x1) << AMBA_ENABLE_INVALIDATE_HINT_SHIFT) +#define AMBA_ENABLE_INVALIDATE_HINT_GET(reg_val) \ + (((reg_val)&AMBA_ENABLE_INVALIDATE_HINT_MASK) >> \ + AMBA_ENABLE_INVALIDATE_HINT_SHIFT) +#define AMBA_ENABLE_INVALIDATE_HINT_SET(reg_val, value) \ + (((reg_val) & ~AMBA_ENABLE_INVALIDATE_HINT_MASK) | \ + (((value) << AMBA_ENABLE_INVALIDATE_HINT_SHIFT) & \ + AMBA_ENABLE_INVALIDATE_HINT_MASK)) /* IDVS_GROUP register */ #define IDVS_GROUP_SIZE_SHIFT (16) diff --git a/drivers/base/arm/dma_buf_lock/src/Kbuild b/drivers/gpu/arm/bifrost/hwcnt/Kbuild similarity index 53% rename from drivers/base/arm/dma_buf_lock/src/Kbuild rename to drivers/gpu/arm/bifrost/hwcnt/Kbuild index b6b741b39119..c1a381b24593 100644 --- a/drivers/base/arm/dma_buf_lock/src/Kbuild +++ b/drivers/gpu/arm/bifrost/hwcnt/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2012, 2020-2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software @@ -18,6 +18,20 @@ # # -ifeq ($(CONFIG_DMA_BUF_LOCK), y) -obj-m := dma_buf_lock.o +bifrost_kbase-y += \ + hwcnt/mali_kbase_hwcnt.o \ + hwcnt/mali_kbase_hwcnt_gpu.o \ + hwcnt/mali_kbase_hwcnt_gpu_narrow.o \ + hwcnt/mali_kbase_hwcnt_types.o \ + hwcnt/mali_kbase_hwcnt_virtualizer.o \ + hwcnt/mali_kbase_hwcnt_watchdog_if_timer.o + +ifeq ($(CONFIG_MALI_CSF_SUPPORT),y) + bifrost_kbase-y += \ + hwcnt/backend/mali_kbase_hwcnt_backend_csf.o \ + hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.o +else + bifrost_kbase-y += \ + hwcnt/backend/mali_kbase_hwcnt_backend_jm.o \ + hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.o endif diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend.h b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend.h similarity index 85% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend.h rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend.h index b069fc12be69..6cfa6f5ee6f4 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend.h +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -56,8 +56,8 @@ struct kbase_hwcnt_backend; * * Return: Non-NULL pointer to immutable hardware counter metadata. */ -typedef const struct kbase_hwcnt_metadata *kbase_hwcnt_backend_metadata_fn( - const struct kbase_hwcnt_backend_info *info); +typedef const struct kbase_hwcnt_metadata * +kbase_hwcnt_backend_metadata_fn(const struct kbase_hwcnt_backend_info *info); /** * typedef kbase_hwcnt_backend_init_fn - Initialise a counter backend. @@ -69,9 +69,8 @@ typedef const struct kbase_hwcnt_metadata *kbase_hwcnt_backend_metadata_fn( * * Return: 0 on success, else error code. */ -typedef int kbase_hwcnt_backend_init_fn( - const struct kbase_hwcnt_backend_info *info, - struct kbase_hwcnt_backend **out_backend); +typedef int kbase_hwcnt_backend_init_fn(const struct kbase_hwcnt_backend_info *info, + struct kbase_hwcnt_backend **out_backend); /** * typedef kbase_hwcnt_backend_term_fn - Terminate a counter backend. @@ -86,8 +85,7 @@ typedef void kbase_hwcnt_backend_term_fn(struct kbase_hwcnt_backend *backend); * * Return: Backend timestamp in nanoseconds. */ -typedef u64 kbase_hwcnt_backend_timestamp_ns_fn( - struct kbase_hwcnt_backend *backend); +typedef u64 kbase_hwcnt_backend_timestamp_ns_fn(struct kbase_hwcnt_backend *backend); /** * typedef kbase_hwcnt_backend_dump_enable_fn - Start counter dumping with the @@ -102,9 +100,8 @@ typedef u64 kbase_hwcnt_backend_timestamp_ns_fn( * * Return: 0 on success, else error code. */ -typedef int kbase_hwcnt_backend_dump_enable_fn( - struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map); +typedef int kbase_hwcnt_backend_dump_enable_fn(struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map); /** * typedef kbase_hwcnt_backend_dump_enable_nolock_fn - Start counter dumping @@ -118,9 +115,9 @@ typedef int kbase_hwcnt_backend_dump_enable_fn( * * Return: 0 on success, else error code. */ -typedef int kbase_hwcnt_backend_dump_enable_nolock_fn( - struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map); +typedef int +kbase_hwcnt_backend_dump_enable_nolock_fn(struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map); /** * typedef kbase_hwcnt_backend_dump_disable_fn - Disable counter dumping with @@ -130,8 +127,7 @@ typedef int kbase_hwcnt_backend_dump_enable_nolock_fn( * If the backend is already disabled, does nothing. * Any undumped counter values since the last dump get will be lost. */ -typedef void kbase_hwcnt_backend_dump_disable_fn( - struct kbase_hwcnt_backend *backend); +typedef void kbase_hwcnt_backend_dump_disable_fn(struct kbase_hwcnt_backend *backend); /** * typedef kbase_hwcnt_backend_dump_clear_fn - Reset all the current undumped @@ -142,8 +138,7 @@ typedef void kbase_hwcnt_backend_dump_disable_fn( * * Return: 0 on success, else error code. */ -typedef int kbase_hwcnt_backend_dump_clear_fn( - struct kbase_hwcnt_backend *backend); +typedef int kbase_hwcnt_backend_dump_clear_fn(struct kbase_hwcnt_backend *backend); /** * typedef kbase_hwcnt_backend_dump_request_fn - Request an asynchronous counter @@ -157,9 +152,8 @@ typedef int kbase_hwcnt_backend_dump_clear_fn( * * Return: 0 on success, else error code. 
*/ -typedef int kbase_hwcnt_backend_dump_request_fn( - struct kbase_hwcnt_backend *backend, - u64 *dump_time_ns); +typedef int kbase_hwcnt_backend_dump_request_fn(struct kbase_hwcnt_backend *backend, + u64 *dump_time_ns); /** * typedef kbase_hwcnt_backend_dump_wait_fn - Wait until the last requested @@ -170,8 +164,7 @@ typedef int kbase_hwcnt_backend_dump_request_fn( * * Return: 0 on success, else error code. */ -typedef int kbase_hwcnt_backend_dump_wait_fn( - struct kbase_hwcnt_backend *backend); +typedef int kbase_hwcnt_backend_dump_wait_fn(struct kbase_hwcnt_backend *backend); /** * typedef kbase_hwcnt_backend_dump_get_fn - Copy or accumulate enable the @@ -189,11 +182,10 @@ typedef int kbase_hwcnt_backend_dump_wait_fn( * * Return: 0 on success, else error code. */ -typedef int kbase_hwcnt_backend_dump_get_fn( - struct kbase_hwcnt_backend *backend, - struct kbase_hwcnt_dump_buffer *dump_buffer, - const struct kbase_hwcnt_enable_map *enable_map, - bool accumulate); +typedef int kbase_hwcnt_backend_dump_get_fn(struct kbase_hwcnt_backend *backend, + struct kbase_hwcnt_dump_buffer *dump_buffer, + const struct kbase_hwcnt_enable_map *enable_map, + bool accumulate); /** * struct kbase_hwcnt_backend_interface - Hardware counter backend virtual diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf.c b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c similarity index 76% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf.c rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c index 8afc990662da..10d40bedc0f8 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf.c +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.c @@ -19,9 +19,9 @@ * */ -#include "mali_kbase_hwcnt_backend_csf.h" -#include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf.h" +#include "hwcnt/mali_kbase_hwcnt_gpu.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include #include @@ -267,8 +267,7 @@ struct kbase_hwcnt_backend_csf { struct work_struct hwc_threshold_work; }; -static bool kbasep_hwcnt_backend_csf_backend_exists( - struct kbase_hwcnt_backend_csf_info *csf_info) +static bool kbasep_hwcnt_backend_csf_backend_exists(struct kbase_hwcnt_backend_csf_info *csf_info) { WARN_ON(!csf_info); csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); @@ -282,19 +281,20 @@ static bool kbasep_hwcnt_backend_csf_backend_exists( * @backend_csf: Non-NULL pointer to backend. * @enable_map: Non-NULL pointer to enable map specifying enabled counters. */ -static void kbasep_hwcnt_backend_csf_cc_initial_sample( - struct kbase_hwcnt_backend_csf *backend_csf, - const struct kbase_hwcnt_enable_map *enable_map) +static void +kbasep_hwcnt_backend_csf_cc_initial_sample(struct kbase_hwcnt_backend_csf *backend_csf, + const struct kbase_hwcnt_enable_map *enable_map) { u64 clk_enable_map = enable_map->clk_enable_map; u64 cycle_counts[BASE_MAX_NR_CLOCKS_REGULATORS]; size_t clk; /* Read cycle count from CSF interface for both clock domains. 
*/ - backend_csf->info->csf_if->get_gpu_cycle_count( - backend_csf->info->csf_if->ctx, cycle_counts, clk_enable_map); + backend_csf->info->csf_if->get_gpu_cycle_count(backend_csf->info->csf_if->ctx, cycle_counts, + clk_enable_map); - kbase_hwcnt_metadata_for_each_clock(enable_map->metadata, clk) { + kbase_hwcnt_metadata_for_each_clock(enable_map->metadata, clk) + { if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, clk)) backend_csf->prev_cycle_count[clk] = cycle_counts[clk]; } @@ -303,42 +303,35 @@ static void kbasep_hwcnt_backend_csf_cc_initial_sample( backend_csf->clk_enable_map = clk_enable_map; } -static void -kbasep_hwcnt_backend_csf_cc_update(struct kbase_hwcnt_backend_csf *backend_csf) +static void kbasep_hwcnt_backend_csf_cc_update(struct kbase_hwcnt_backend_csf *backend_csf) { u64 cycle_counts[BASE_MAX_NR_CLOCKS_REGULATORS]; size_t clk; - backend_csf->info->csf_if->assert_lock_held( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx); - backend_csf->info->csf_if->get_gpu_cycle_count( - backend_csf->info->csf_if->ctx, cycle_counts, - backend_csf->clk_enable_map); + backend_csf->info->csf_if->get_gpu_cycle_count(backend_csf->info->csf_if->ctx, cycle_counts, + backend_csf->clk_enable_map); - kbase_hwcnt_metadata_for_each_clock(backend_csf->info->metadata, clk) { - if (kbase_hwcnt_clk_enable_map_enabled( - backend_csf->clk_enable_map, clk)) { + kbase_hwcnt_metadata_for_each_clock(backend_csf->info->metadata, clk) + { + if (kbase_hwcnt_clk_enable_map_enabled(backend_csf->clk_enable_map, clk)) { backend_csf->cycle_count_elapsed[clk] = - cycle_counts[clk] - - backend_csf->prev_cycle_count[clk]; + cycle_counts[clk] - backend_csf->prev_cycle_count[clk]; backend_csf->prev_cycle_count[clk] = cycle_counts[clk]; } } } /* CSF backend implementation of kbase_hwcnt_backend_timestamp_ns_fn */ -static u64 -kbasep_hwcnt_backend_csf_timestamp_ns(struct kbase_hwcnt_backend *backend) +static u64 kbasep_hwcnt_backend_csf_timestamp_ns(struct kbase_hwcnt_backend *backend) { - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; if (!backend_csf || !backend_csf->info || !backend_csf->info->csf_if) return 0; - return backend_csf->info->csf_if->timestamp_ns( - backend_csf->info->csf_if->ctx); + return backend_csf->info->csf_if->timestamp_ns(backend_csf->info->csf_if->ctx); } /** kbasep_hwcnt_backend_csf_process_enable_map() - Process the enable_map to @@ -347,8 +340,8 @@ kbasep_hwcnt_backend_csf_timestamp_ns(struct kbase_hwcnt_backend *backend) * required. *@phys_enable_map: HWC physical enable map to be processed. 
*/ -static void kbasep_hwcnt_backend_csf_process_enable_map( - struct kbase_hwcnt_physical_enable_map *phys_enable_map) +static void +kbasep_hwcnt_backend_csf_process_enable_map(struct kbase_hwcnt_physical_enable_map *phys_enable_map) { WARN_ON(!phys_enable_map); @@ -408,19 +401,19 @@ static void kbasep_hwcnt_backend_csf_init_layout( }; } -static void kbasep_hwcnt_backend_csf_reset_internal_buffers( - struct kbase_hwcnt_backend_csf *backend_csf) +static void +kbasep_hwcnt_backend_csf_reset_internal_buffers(struct kbase_hwcnt_backend_csf *backend_csf) { size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes; memset(backend_csf->to_user_buf, 0, user_buf_bytes); memset(backend_csf->accum_buf, 0, user_buf_bytes); - memset(backend_csf->old_sample_buf, 0, - backend_csf->info->prfcnt_info.dump_bytes); + memset(backend_csf->old_sample_buf, 0, backend_csf->info->prfcnt_info.dump_bytes); } -static void kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( - struct kbase_hwcnt_backend_csf *backend_csf, u32 *sample) +static void +kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header(struct kbase_hwcnt_backend_csf *backend_csf, + u32 *sample) { u32 block_idx; const struct kbase_hwcnt_csf_physical_layout *phys_layout; @@ -434,8 +427,8 @@ static void kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( } } -static void kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header( - struct kbase_hwcnt_backend_csf *backend_csf) +static void +kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header(struct kbase_hwcnt_backend_csf *backend_csf) { u32 idx; u32 *sample; @@ -446,19 +439,16 @@ static void kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header( for (idx = 0; idx < backend_csf->info->ring_buf_cnt; idx++) { sample = (u32 *)&cpu_dump_base[idx * dump_bytes]; - kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( - backend_csf, sample); + kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header(backend_csf, sample); } } -static void kbasep_hwcnt_backend_csf_update_user_sample( - struct kbase_hwcnt_backend_csf *backend_csf) +static void kbasep_hwcnt_backend_csf_update_user_sample(struct kbase_hwcnt_backend_csf *backend_csf) { size_t user_buf_bytes = backend_csf->info->metadata->dump_buf_bytes; /* Copy the data into the sample and wait for the user to get it. 
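The memcpy/clear pair above is the whole of the user-sample hand-off: the accumulator is published into a separate snapshot buffer and then reset so the next dump starts from zero. A minimal standalone sketch of that step, using hypothetical names and plain C types rather than the driver's structures:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void publish_user_sample(uint64_t *to_user, uint64_t *accum, size_t value_cnt)
{
	/* Publish the accumulated counters, then clear the accumulator so the
	 * next accumulation run starts from zero.
	 */
	memcpy(to_user, accum, value_cnt * sizeof(*accum));
	memset(accum, 0, value_cnt * sizeof(*accum));
}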
*/ - memcpy(backend_csf->to_user_buf, backend_csf->accum_buf, - user_buf_bytes); + memcpy(backend_csf->to_user_buf, backend_csf->accum_buf, user_buf_bytes); /* After copied data into user sample, clear the accumulator values to * prepare for the next accumulator, such as the next request or @@ -468,9 +458,8 @@ static void kbasep_hwcnt_backend_csf_update_user_sample( } static void kbasep_hwcnt_backend_csf_accumulate_sample( - const struct kbase_hwcnt_csf_physical_layout *phys_layout, - size_t dump_bytes, u64 *accum_buf, const u32 *old_sample_buf, - const u32 *new_sample_buf, bool clearing_samples) + const struct kbase_hwcnt_csf_physical_layout *phys_layout, size_t dump_bytes, + u64 *accum_buf, const u32 *old_sample_buf, const u32 *new_sample_buf, bool clearing_samples) { size_t block_idx; const u32 *old_block = old_sample_buf; @@ -487,10 +476,8 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( for (block_idx = phys_layout->fw_block_cnt; block_idx < phys_layout->block_cnt; block_idx++) { - const u32 old_enable_mask = - old_block[phys_layout->enable_mask_offset]; - const u32 new_enable_mask = - new_block[phys_layout->enable_mask_offset]; + const u32 old_enable_mask = old_block[phys_layout->enable_mask_offset]; + const u32 new_enable_mask = new_block[phys_layout->enable_mask_offset]; if (new_enable_mask == 0) { /* Hardware block was unavailable or we didn't turn on @@ -503,9 +490,7 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( size_t ctr_idx; /* Unconditionally copy the headers. */ - for (ctr_idx = 0; - ctr_idx < phys_layout->headers_per_block; - ctr_idx++) { + for (ctr_idx = 0; ctr_idx < phys_layout->headers_per_block; ctr_idx++) { acc_block[ctr_idx] = new_block[ctr_idx]; } @@ -534,34 +519,25 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( * counters only, as we know previous * values are zeroes. */ - for (ctr_idx = - phys_layout - ->headers_per_block; - ctr_idx < values_per_block; - ctr_idx++) { - acc_block[ctr_idx] += - new_block[ctr_idx]; + for (ctr_idx = phys_layout->headers_per_block; + ctr_idx < values_per_block; ctr_idx++) { + acc_block[ctr_idx] += new_block[ctr_idx]; } } else { /* Hardware block was previously * available. Accumulate the delta * between old and new counter values. 
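The branches above handle the two counter-sampling modes the backend supports: hardware that clears counters after each sample, and free-running wrapping counters. A simplified, self-contained sketch of the arithmetic only, with hypothetical names, ignoring block headers, enable masks and the previously-unavailable-block case:

#include <stddef.h>
#include <stdint.h>

static void accumulate_block(uint64_t *acc, const uint32_t *new_vals, const uint32_t *old_vals,
			     size_t cnt, int clearing_samples)
{
	size_t i;

	for (i = 0; i < cnt; i++) {
		if (clearing_samples) {
			/* Counters restart from zero after every sample, so the
			 * raw value is already the delta for this interval.
			 */
			acc[i] += new_vals[i];
		} else {
			/* Free-running, wrapping counters: accumulate the
			 * difference to the previous sample; unsigned 32-bit
			 * arithmetic absorbs a single wrap between samples.
			 */
			acc[i] += (uint32_t)(new_vals[i] - old_vals[i]);
		}
	}
}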
*/ - for (ctr_idx = - phys_layout - ->headers_per_block; - ctr_idx < values_per_block; - ctr_idx++) { + for (ctr_idx = phys_layout->headers_per_block; + ctr_idx < values_per_block; ctr_idx++) { acc_block[ctr_idx] += - new_block[ctr_idx] - - old_block[ctr_idx]; + new_block[ctr_idx] - old_block[ctr_idx]; } } } else { for (ctr_idx = phys_layout->headers_per_block; ctr_idx < values_per_block; ctr_idx++) { - acc_block[ctr_idx] += - new_block[ctr_idx]; + acc_block[ctr_idx] += new_block[ctr_idx]; } } } @@ -570,18 +546,16 @@ static void kbasep_hwcnt_backend_csf_accumulate_sample( acc_block += values_per_block; } - WARN_ON(old_block != - old_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); - WARN_ON(new_block != - new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(old_block != old_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(new_block != new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); WARN_ON(acc_block != accum_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES) - (values_per_block * phys_layout->fw_block_cnt)); (void)dump_bytes; } -static void kbasep_hwcnt_backend_csf_accumulate_samples( - struct kbase_hwcnt_backend_csf *backend_csf, u32 extract_index_to_start, - u32 insert_index_to_stop) +static void kbasep_hwcnt_backend_csf_accumulate_samples(struct kbase_hwcnt_backend_csf *backend_csf, + u32 extract_index_to_start, + u32 insert_index_to_stop) { u32 raw_idx; unsigned long flags; @@ -598,25 +572,22 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples( /* Sync all the buffers to CPU side before read the data. */ backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx, - backend_csf->ring_buf, - extract_index_to_start, + backend_csf->ring_buf, extract_index_to_start, insert_index_to_stop, true); /* Consider u32 wrap case, '!=' is used here instead of '<' operator */ - for (raw_idx = extract_index_to_start; raw_idx != insert_index_to_stop; - raw_idx++) { + for (raw_idx = extract_index_to_start; raw_idx != insert_index_to_stop; raw_idx++) { /* The logical "&" acts as a modulo operation since buf_count * must be a power of two. */ const u32 buf_idx = raw_idx & (ring_buf_cnt - 1); - new_sample_buf = - (u32 *)&cpu_dump_base[buf_idx * buf_dump_bytes]; + new_sample_buf = (u32 *)&cpu_dump_base[buf_idx * buf_dump_bytes]; - kbasep_hwcnt_backend_csf_accumulate_sample( - &backend_csf->phys_layout, buf_dump_bytes, - backend_csf->accum_buf, old_sample_buf, new_sample_buf, - clearing_samples); + kbasep_hwcnt_backend_csf_accumulate_sample(&backend_csf->phys_layout, + buf_dump_bytes, backend_csf->accum_buf, + old_sample_buf, new_sample_buf, + clearing_samples); old_sample_buf = new_sample_buf; } @@ -625,19 +596,16 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples( memcpy(backend_csf->old_sample_buf, new_sample_buf, buf_dump_bytes); /* Reset the prfcnt_en header on each sample before releasing them. */ - for (raw_idx = extract_index_to_start; raw_idx != insert_index_to_stop; - raw_idx++) { + for (raw_idx = extract_index_to_start; raw_idx != insert_index_to_stop; raw_idx++) { const u32 buf_idx = raw_idx & (ring_buf_cnt - 1); u32 *sample = (u32 *)&cpu_dump_base[buf_idx * buf_dump_bytes]; - kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header( - backend_csf, sample); + kbasep_hwcnt_backend_csf_zero_sample_prfcnt_en_header(backend_csf, sample); } /* Sync zeroed buffers to avoid coherency issues on future use. 
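The accumulation loop above relies on two properties of the ring indexes: they are free-running u32 values (hence '!=' rather than '<' as the loop bound) and the buffer count is a power of two (hence '&' as a cheap modulo). A small standalone C program, not taken from the driver, that exercises the same indexing across a 32-bit wrap:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void walk_ring(uint32_t extract_idx, uint32_t insert_idx, uint32_t buf_cnt)
{
	uint32_t raw_idx;

	/* '!=' copes with the raw indexes wrapping past UINT32_MAX;
	 * '&' equals modulo only because buf_cnt is a power of two.
	 */
	for (raw_idx = extract_idx; raw_idx != insert_idx; raw_idx++) {
		uint32_t slot = raw_idx & (buf_cnt - 1);

		printf("raw %" PRIu32 " -> slot %" PRIu32 "\n", raw_idx, slot);
	}
}

int main(void)
{
	/* Chosen so the raw index wraps: the loop visits the slots for raw
	 * indexes 4294967294, 4294967295, 0 and 1, then stops at 2.
	 */
	walk_ring(UINT32_MAX - 1, 2, 8);
	return 0;
}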
*/ backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx, - backend_csf->ring_buf, - extract_index_to_start, + backend_csf->ring_buf, extract_index_to_start, insert_index_to_stop, false); /* After consuming all samples between extract_idx and insert_idx, @@ -645,22 +613,20 @@ static void kbasep_hwcnt_backend_csf_accumulate_samples( * can be released back to the ring buffer pool. */ backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); - backend_csf->info->csf_if->set_extract_index( - backend_csf->info->csf_if->ctx, insert_index_to_stop); + backend_csf->info->csf_if->set_extract_index(backend_csf->info->csf_if->ctx, + insert_index_to_stop); /* Update the watchdog last seen index to check any new FW auto samples * in next watchdog callback. */ backend_csf->watchdog_last_seen_insert_idx = insert_index_to_stop; - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); } static void kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( struct kbase_hwcnt_backend_csf *backend_csf, enum kbase_hwcnt_backend_csf_enable_state new_state) { - backend_csf->info->csf_if->assert_lock_held( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx); if (backend_csf->enable_state != new_state) { backend_csf->enable_state = new_state; @@ -691,26 +657,22 @@ static void kbasep_hwcnt_backend_watchdog_timer_cb(void *info) (!csf_info->fw_in_protected_mode) && /* 3. dump state indicates no other dumping is in progress. */ ((backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE) || - (backend_csf->dump_state == - KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED))) { + (backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED))) { u32 extract_index; u32 insert_index; /* Read the raw extract and insert indexes from the CSF interface. */ - csf_info->csf_if->get_indexes(csf_info->csf_if->ctx, - &extract_index, &insert_index); + csf_info->csf_if->get_indexes(csf_info->csf_if->ctx, &extract_index, &insert_index); /* Do watchdog request if no new FW auto samples. */ - if (insert_index == - backend_csf->watchdog_last_seen_insert_idx) { + if (insert_index == backend_csf->watchdog_last_seen_insert_idx) { /* Trigger the watchdog request. */ csf_info->csf_if->dump_request(csf_info->csf_if->ctx); /* A watchdog dump is required, change the state to * start the request process. */ - backend_csf->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED; + backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED; } } @@ -719,12 +681,10 @@ static void kbasep_hwcnt_backend_watchdog_timer_cb(void *info) * counter enabled interrupt. */ if ((backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) || - (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED)) { + (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED)) { /* Reschedule the timer for next watchdog callback. 
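The watchdog callback above only issues its own dump request when the insert index has not moved since the last time samples were consumed. A tiny illustrative helper, with made-up names, expressing just that check:

#include <stdbool.h>
#include <stdint.h>

static bool watchdog_needs_request(uint32_t insert_idx, uint32_t last_seen_insert_idx)
{
	/* Equal indexes mean the firmware produced no automatic samples since
	 * samples were last consumed, so the watchdog asks for one itself;
	 * last_seen_insert_idx is refreshed whenever samples are consumed.
	 */
	return insert_idx == last_seen_insert_idx;
}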
*/ - csf_info->watchdog_if->modify( - csf_info->watchdog_if->timer, - HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS); + csf_info->watchdog_if->modify(csf_info->watchdog_if->timer, + HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS); } csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags); @@ -747,8 +707,7 @@ static void kbasep_hwcnt_backend_csf_dump_worker(struct work_struct *work) u32 insert_index; WARN_ON(!work); - backend_csf = container_of(work, struct kbase_hwcnt_backend_csf, - hwc_dump_work); + backend_csf = container_of(work, struct kbase_hwcnt_backend_csf, hwc_dump_work); backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); /* Assert the backend is not destroyed. */ WARN_ON(backend_csf != backend_csf->info->backend); @@ -757,26 +716,22 @@ static void kbasep_hwcnt_backend_csf_dump_worker(struct work_struct *work) * launched. */ if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { - WARN_ON(backend_csf->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); + WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); WARN_ON(!completion_done(&backend_csf->dump_completed)); - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return; } - WARN_ON(backend_csf->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_WORKER_LAUNCHED); + WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_WORKER_LAUNCHED); backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_ACCUMULATING; insert_index_to_acc = backend_csf->insert_index_to_accumulate; /* Read the raw extract and insert indexes from the CSF interface. */ - backend_csf->info->csf_if->get_indexes(backend_csf->info->csf_if->ctx, - &extract_index, &insert_index); + backend_csf->info->csf_if->get_indexes(backend_csf->info->csf_if->ctx, &extract_index, + &insert_index); - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); /* Accumulate up to the insert we grabbed at the prfcnt request * interrupt. @@ -797,22 +752,18 @@ static void kbasep_hwcnt_backend_csf_dump_worker(struct work_struct *work) /* The backend was disabled or had an error while we were accumulating. */ if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { - WARN_ON(backend_csf->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); + WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); WARN_ON(!completion_done(&backend_csf->dump_completed)); - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return; } - WARN_ON(backend_csf->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_ACCUMULATING); + WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_ACCUMULATING); /* Our work here is done - set the wait object and unblock waiters. 
*/ backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; complete_all(&backend_csf->dump_completed); - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); } /** @@ -832,23 +783,21 @@ static void kbasep_hwcnt_backend_csf_threshold_worker(struct work_struct *work) WARN_ON(!work); - backend_csf = container_of(work, struct kbase_hwcnt_backend_csf, - hwc_threshold_work); + backend_csf = container_of(work, struct kbase_hwcnt_backend_csf, hwc_threshold_work); backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); /* Assert the backend is not destroyed. */ WARN_ON(backend_csf != backend_csf->info->backend); /* Read the raw extract and insert indexes from the CSF interface. */ - backend_csf->info->csf_if->get_indexes(backend_csf->info->csf_if->ctx, - &extract_index, &insert_index); + backend_csf->info->csf_if->get_indexes(backend_csf->info->csf_if->ctx, &extract_index, + &insert_index); /* The backend was disabled or had an error while the worker was being * launched. */ if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return; } @@ -857,14 +806,11 @@ static void kbasep_hwcnt_backend_csf_threshold_worker(struct work_struct *work) * interfere. */ if ((backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE) && - (backend_csf->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED)) { - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED)) { + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return; } - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); /* Accumulate everything we possibly can. We grabbed the insert index * immediately after we acquired the lock but before we checked whether @@ -873,14 +819,13 @@ static void kbasep_hwcnt_backend_csf_threshold_worker(struct work_struct *work) * fact that our insert will not exceed the concurrent dump's * insert_to_accumulate, so we don't risk accumulating too much data. */ - kbasep_hwcnt_backend_csf_accumulate_samples(backend_csf, extract_index, - insert_index); + kbasep_hwcnt_backend_csf_accumulate_samples(backend_csf, extract_index, insert_index); /* No need to wake up anything since it is not a user dump request. */ } -static void kbase_hwcnt_backend_csf_submit_dump_worker( - struct kbase_hwcnt_backend_csf_info *csf_info) +static void +kbase_hwcnt_backend_csf_submit_dump_worker(struct kbase_hwcnt_backend_csf_info *csf_info) { u32 extract_index; @@ -888,31 +833,26 @@ static void kbase_hwcnt_backend_csf_submit_dump_worker( csf_info->csf_if->assert_lock_held(csf_info->csf_if->ctx); WARN_ON(!kbasep_hwcnt_backend_csf_backend_exists(csf_info)); - WARN_ON(csf_info->backend->enable_state != - KBASE_HWCNT_BACKEND_CSF_ENABLED); - WARN_ON(csf_info->backend->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT); + WARN_ON(csf_info->backend->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED); + WARN_ON(csf_info->backend->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT); /* Save insert index now so that the dump worker only accumulates the * HWC data associated with this request. 
Extract index is not stored * as that needs to be checked when accumulating to prevent re-reading * buffers that have already been read and returned to the GPU. */ - csf_info->csf_if->get_indexes( - csf_info->csf_if->ctx, &extract_index, - &csf_info->backend->insert_index_to_accumulate); - csf_info->backend->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_WORKER_LAUNCHED; + csf_info->csf_if->get_indexes(csf_info->csf_if->ctx, &extract_index, + &csf_info->backend->insert_index_to_accumulate); + csf_info->backend->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_WORKER_LAUNCHED; /* Submit the accumulator task into the work queue. */ - queue_work(csf_info->backend->hwc_dump_workq, - &csf_info->backend->hwc_dump_work); + queue_work(csf_info->backend->hwc_dump_workq, &csf_info->backend->hwc_dump_work); } -static void kbasep_hwcnt_backend_csf_get_physical_enable( - struct kbase_hwcnt_backend_csf *backend_csf, - const struct kbase_hwcnt_enable_map *enable_map, - struct kbase_hwcnt_backend_csf_if_enable *enable) +static void +kbasep_hwcnt_backend_csf_get_physical_enable(struct kbase_hwcnt_backend_csf *backend_csf, + const struct kbase_hwcnt_enable_map *enable_map, + struct kbase_hwcnt_backend_csf_if_enable *enable) { enum kbase_hwcnt_physical_set phys_counter_set; struct kbase_hwcnt_physical_enable_map phys_enable_map; @@ -924,8 +864,7 @@ static void kbasep_hwcnt_backend_csf_get_physical_enable( */ kbasep_hwcnt_backend_csf_process_enable_map(&phys_enable_map); - kbase_hwcnt_gpu_set_to_physical(&phys_counter_set, - backend_csf->info->counter_set); + kbase_hwcnt_gpu_set_to_physical(&phys_counter_set, backend_csf->info->counter_set); /* Use processed enable_map to enable HWC in HW level. */ enable->fe_bm = phys_enable_map.fe_bm; @@ -937,33 +876,29 @@ static void kbasep_hwcnt_backend_csf_get_physical_enable( } /* CSF backend implementation of kbase_hwcnt_backend_dump_enable_nolock_fn */ -static int kbasep_hwcnt_backend_csf_dump_enable_nolock( - struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map) +static int +kbasep_hwcnt_backend_csf_dump_enable_nolock(struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map) { - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; struct kbase_hwcnt_backend_csf_if_enable enable; int err; - if (!backend_csf || !enable_map || - (enable_map->metadata != backend_csf->info->metadata)) + if (!backend_csf || !enable_map || (enable_map->metadata != backend_csf->info->metadata)) return -EINVAL; - backend_csf->info->csf_if->assert_lock_held( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx); - kbasep_hwcnt_backend_csf_get_physical_enable(backend_csf, enable_map, - &enable); + kbasep_hwcnt_backend_csf_get_physical_enable(backend_csf, enable_map, &enable); /* enable_state should be DISABLED before we transfer it to enabled */ if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_DISABLED) return -EIO; - err = backend_csf->info->watchdog_if->enable( - backend_csf->info->watchdog_if->timer, - HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS, - kbasep_hwcnt_backend_watchdog_timer_cb, backend_csf->info); + err = backend_csf->info->watchdog_if->enable(backend_csf->info->watchdog_if->timer, + HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS, + kbasep_hwcnt_backend_watchdog_timer_cb, + backend_csf->info); if (err) return err; @@ -981,58 +916,46 @@ static 
int kbasep_hwcnt_backend_csf_dump_enable_nolock( } /* CSF backend implementation of kbase_hwcnt_backend_dump_enable_fn */ -static int kbasep_hwcnt_backend_csf_dump_enable( - struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map) +static int kbasep_hwcnt_backend_csf_dump_enable(struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map) { int errcode; unsigned long flags; - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; if (!backend_csf) return -EINVAL; backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); - errcode = kbasep_hwcnt_backend_csf_dump_enable_nolock(backend, - enable_map); - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + errcode = kbasep_hwcnt_backend_csf_dump_enable_nolock(backend, enable_map); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return errcode; } static void kbasep_hwcnt_backend_csf_wait_enable_transition_complete( struct kbase_hwcnt_backend_csf *backend_csf, unsigned long *lock_flags) { - backend_csf->info->csf_if->assert_lock_held( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx); - while ((backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) || - (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED)) { - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, *lock_flags); + while ((backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) || + (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED)) { + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, *lock_flags); - wait_event( - backend_csf->enable_state_waitq, - (backend_csf->enable_state != - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) && - (backend_csf->enable_state != - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED)); + wait_event(backend_csf->enable_state_waitq, + (backend_csf->enable_state != + KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) && + (backend_csf->enable_state != + KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED)); - backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, - lock_flags); + backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, lock_flags); } } /* CSF backend implementation of kbase_hwcnt_backend_dump_disable_fn */ -static void -kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) +static void kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) { unsigned long flags; - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; bool do_disable = false; WARN_ON(!backend_csf); @@ -1042,24 +965,20 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) /* Make sure we wait until any previous enable or disable have completed * before doing anything. 
*/ - kbasep_hwcnt_backend_csf_wait_enable_transition_complete(backend_csf, - &flags); + kbasep_hwcnt_backend_csf_wait_enable_transition_complete(backend_csf, &flags); if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_DISABLED || - backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR) { + backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR) { /* If we are already disabled or in an unrecoverable error * state, there is nothing for us to do. */ - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return; } if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) { kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( - backend_csf, - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED); + backend_csf, KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED); backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE; complete_all(&backend_csf->dump_completed); /* Only disable if we were previously enabled - in all other @@ -1071,15 +990,13 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) WARN_ON(backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE); WARN_ON(!completion_done(&backend_csf->dump_completed)); - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); /* Deregister the timer and block until any timer callback has completed. * We've transitioned out of the ENABLED state so we can guarantee it * won't reschedule itself. */ - backend_csf->info->watchdog_if->disable( - backend_csf->info->watchdog_if->timer); + backend_csf->info->watchdog_if->disable(backend_csf->info->watchdog_if->timer); /* Block until any async work has completed. We have transitioned out of * the ENABLED state so we can guarantee no new work will concurrently @@ -1090,11 +1007,9 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); if (do_disable) - backend_csf->info->csf_if->dump_disable( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->dump_disable(backend_csf->info->csf_if->ctx); - kbasep_hwcnt_backend_csf_wait_enable_transition_complete(backend_csf, - &flags); + kbasep_hwcnt_backend_csf_wait_enable_transition_complete(backend_csf, &flags); switch (backend_csf->enable_state) { case KBASE_HWCNT_BACKEND_CSF_DISABLED_WAIT_FOR_WORKER: @@ -1103,8 +1018,7 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) break; case KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR_WAIT_FOR_WORKER: kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( - backend_csf, - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR); + backend_csf, KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR); break; default: WARN_ON(true); @@ -1114,8 +1028,7 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) backend_csf->user_requested = false; backend_csf->watchdog_last_seen_insert_idx = 0; - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); /* After disable, zero the header of all buffers in the ring buffer back * to 0 to prepare for the next enable. 
@@ -1123,9 +1036,9 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header(backend_csf); /* Sync zeroed buffers to avoid coherency issues on future use. */ - backend_csf->info->csf_if->ring_buf_sync( - backend_csf->info->csf_if->ctx, backend_csf->ring_buf, 0, - backend_csf->info->ring_buf_cnt, false); + backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx, + backend_csf->ring_buf, 0, + backend_csf->info->ring_buf_cnt, false); /* Reset accumulator, old_sample_buf and user_sample to all-0 to prepare * for next enable. @@ -1134,13 +1047,11 @@ kbasep_hwcnt_backend_csf_dump_disable(struct kbase_hwcnt_backend *backend) } /* CSF backend implementation of kbase_hwcnt_backend_dump_request_fn */ -static int -kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, - u64 *dump_time_ns) +static int kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, + u64 *dump_time_ns) { unsigned long flags; - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; bool do_request = false; bool watchdog_dumping = false; @@ -1153,22 +1064,18 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, * the user dump buffer is already zeroed. We can just short circuit to * the DUMP_COMPLETED state. */ - if (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) { - backend_csf->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; + if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) { + backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; *dump_time_ns = kbasep_hwcnt_backend_csf_timestamp_ns(backend); kbasep_hwcnt_backend_csf_cc_update(backend_csf); backend_csf->user_requested = true; - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return 0; } /* Otherwise, make sure we're already enabled. */ if (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_ENABLED) { - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return -EIO; } @@ -1181,15 +1088,12 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, * request can be processed instead of ignored. */ if ((backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE) && - (backend_csf->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) && - (backend_csf->dump_state != - KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED)) { + (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) && + (backend_csf->dump_state != KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED)) { /* HWC is disabled or another user dump is ongoing, * or we're on fault. */ - backend_csf->info->csf_if->unlock( - backend_csf->info->csf_if->ctx, flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); /* HWC is disabled or another dump is ongoing, or we are on * fault. */ @@ -1199,8 +1103,7 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, /* Reset the completion so dump_wait() has something to wait on. 
*/ reinit_completion(&backend_csf->dump_completed); - if (backend_csf->dump_state == - KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED) + if (backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED) watchdog_dumping = true; if ((backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) && @@ -1208,15 +1111,13 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, /* Only do the request if we are fully enabled and not in * protected mode. */ - backend_csf->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED; + backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_REQUESTED; do_request = true; } else { /* Skip the request and waiting for ack and go straight to * checking the insert and kicking off the worker to do the dump */ - backend_csf->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT; + backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_QUERYING_INSERT; } /* CSF firmware might enter protected mode now, but still call request. @@ -1238,31 +1139,26 @@ kbasep_hwcnt_backend_csf_dump_request(struct kbase_hwcnt_backend *backend, * ownership of the sample which watchdog requested. */ if (!watchdog_dumping) - backend_csf->info->csf_if->dump_request( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->dump_request(backend_csf->info->csf_if->ctx); } else kbase_hwcnt_backend_csf_submit_dump_worker(backend_csf->info); - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); /* Modify watchdog timer to delay the regular check time since * just requested. */ - backend_csf->info->watchdog_if->modify( - backend_csf->info->watchdog_if->timer, - HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS); + backend_csf->info->watchdog_if->modify(backend_csf->info->watchdog_if->timer, + HWCNT_BACKEND_WATCHDOG_TIMER_INTERVAL_MS); return 0; } /* CSF backend implementation of kbase_hwcnt_backend_dump_wait_fn */ -static int -kbasep_hwcnt_backend_csf_dump_wait(struct kbase_hwcnt_backend *backend) +static int kbasep_hwcnt_backend_csf_dump_wait(struct kbase_hwcnt_backend *backend) { unsigned long flags; - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; int errcode; if (!backend_csf) @@ -1275,26 +1171,21 @@ kbasep_hwcnt_backend_csf_dump_wait(struct kbase_hwcnt_backend *backend) * set. 
*/ if (backend_csf->user_requested && - ((backend_csf->dump_state == - KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) || - (backend_csf->dump_state == - KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED))) + ((backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED) || + (backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED))) errcode = 0; else errcode = -EIO; - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); return errcode; } /* CSF backend implementation of kbase_hwcnt_backend_dump_clear_fn */ -static int -kbasep_hwcnt_backend_csf_dump_clear(struct kbase_hwcnt_backend *backend) +static int kbasep_hwcnt_backend_csf_dump_clear(struct kbase_hwcnt_backend *backend) { - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; int errcode; u64 ts; @@ -1313,13 +1204,12 @@ kbasep_hwcnt_backend_csf_dump_clear(struct kbase_hwcnt_backend *backend) } /* CSF backend implementation of kbase_hwcnt_backend_dump_get_fn */ -static int kbasep_hwcnt_backend_csf_dump_get( - struct kbase_hwcnt_backend *backend, - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate) +static int kbasep_hwcnt_backend_csf_dump_get(struct kbase_hwcnt_backend *backend, + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map, + bool accumulate) { - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; int ret; size_t clk; @@ -1329,9 +1219,9 @@ static int kbasep_hwcnt_backend_csf_dump_get( return -EINVAL; /* Extract elapsed cycle count for each clock domain if enabled. */ - kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) { - if (!kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk)) + kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) + { + if (!kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk)) continue; /* Reset the counter to zero if accumulation is off. */ @@ -1344,8 +1234,7 @@ static int kbasep_hwcnt_backend_csf_dump_get( * as it is undefined to call this function without a prior succeeding * one to dump_wait(). */ - ret = kbase_hwcnt_csf_dump_get(dst, backend_csf->to_user_buf, - dst_enable_map, accumulate); + ret = kbase_hwcnt_csf_dump_get(dst, backend_csf->to_user_buf, dst_enable_map, accumulate); return ret; } @@ -1357,8 +1246,7 @@ static int kbasep_hwcnt_backend_csf_dump_get( * Can be safely called on a backend in any state of partial construction. * */ -static void -kbasep_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_csf *backend_csf) +static void kbasep_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_csf *backend_csf) { if (!backend_csf) return; @@ -1388,9 +1276,8 @@ kbasep_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_csf *backend_csf) * * Return: 0 on success, else error code. 
*/ -static int -kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, - struct kbase_hwcnt_backend_csf **out_backend) +static int kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, + struct kbase_hwcnt_backend_csf **out_backend) { struct kbase_hwcnt_backend_csf *backend_csf = NULL; int errcode = -ENOMEM; @@ -1403,27 +1290,23 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, goto alloc_error; backend_csf->info = csf_info; - kbasep_hwcnt_backend_csf_init_layout(&csf_info->prfcnt_info, - &backend_csf->phys_layout); + kbasep_hwcnt_backend_csf_init_layout(&csf_info->prfcnt_info, &backend_csf->phys_layout); - backend_csf->accum_buf = - kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); + backend_csf->accum_buf = kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); if (!backend_csf->accum_buf) goto err_alloc_acc_buf; - backend_csf->old_sample_buf = - kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); + backend_csf->old_sample_buf = kzalloc(csf_info->prfcnt_info.dump_bytes, GFP_KERNEL); if (!backend_csf->old_sample_buf) goto err_alloc_pre_sample_buf; - backend_csf->to_user_buf = - kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); + backend_csf->to_user_buf = kzalloc(csf_info->metadata->dump_buf_bytes, GFP_KERNEL); if (!backend_csf->to_user_buf) goto err_alloc_user_sample_buf; - errcode = csf_info->csf_if->ring_buf_alloc( - csf_info->csf_if->ctx, csf_info->ring_buf_cnt, - &backend_csf->ring_buf_cpu_base, &backend_csf->ring_buf); + errcode = csf_info->csf_if->ring_buf_alloc(csf_info->csf_if->ctx, csf_info->ring_buf_cnt, + &backend_csf->ring_buf_cpu_base, + &backend_csf->ring_buf); if (errcode) goto err_ring_buf_alloc; errcode = -ENOMEM; @@ -1432,9 +1315,9 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, kbasep_hwcnt_backend_csf_zero_all_prfcnt_en_header(backend_csf); /* Sync zeroed buffers to avoid coherency issues on use. 
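The create function above uses the usual goto-based unwind: each allocation failure jumps to a label that releases only what was already set up, in reverse order. A generic, self-contained sketch of that pattern using plain libc allocations rather than the driver's buffers and ring-buffer allocator:

#include <stdlib.h>

struct two_bufs {
	void *a;
	void *b;
};

static struct two_bufs *two_bufs_create(size_t a_len, size_t b_len)
{
	struct two_bufs *obj = calloc(1, sizeof(*obj));

	if (!obj)
		goto err_alloc_obj;

	obj->a = calloc(1, a_len);
	if (!obj->a)
		goto err_alloc_a;

	obj->b = calloc(1, b_len);
	if (!obj->b)
		goto err_alloc_b;

	return obj;

err_alloc_b:
	free(obj->a);
err_alloc_a:
	free(obj);
err_alloc_obj:
	return NULL;
}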
*/ - backend_csf->info->csf_if->ring_buf_sync( - backend_csf->info->csf_if->ctx, backend_csf->ring_buf, 0, - backend_csf->info->ring_buf_cnt, false); + backend_csf->info->csf_if->ring_buf_sync(backend_csf->info->csf_if->ctx, + backend_csf->ring_buf, 0, + backend_csf->info->ring_buf_cnt, false); init_completion(&backend_csf->dump_completed); @@ -1448,10 +1331,8 @@ kbasep_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_info *csf_info, if (!backend_csf->hwc_dump_workq) goto err_alloc_workqueue; - INIT_WORK(&backend_csf->hwc_dump_work, - kbasep_hwcnt_backend_csf_dump_worker); - INIT_WORK(&backend_csf->hwc_threshold_work, - kbasep_hwcnt_backend_csf_threshold_worker); + INIT_WORK(&backend_csf->hwc_dump_work, kbasep_hwcnt_backend_csf_dump_worker); + INIT_WORK(&backend_csf->hwc_threshold_work, kbasep_hwcnt_backend_csf_threshold_worker); backend_csf->enable_state = KBASE_HWCNT_BACKEND_CSF_DISABLED; backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE; @@ -1481,14 +1362,12 @@ alloc_error: } /* CSF backend implementation of kbase_hwcnt_backend_init_fn */ -static int -kbasep_hwcnt_backend_csf_init(const struct kbase_hwcnt_backend_info *info, - struct kbase_hwcnt_backend **out_backend) +static int kbasep_hwcnt_backend_csf_init(const struct kbase_hwcnt_backend_info *info, + struct kbase_hwcnt_backend **out_backend) { unsigned long flags; struct kbase_hwcnt_backend_csf *backend_csf = NULL; - struct kbase_hwcnt_backend_csf_info *csf_info = - (struct kbase_hwcnt_backend_csf_info *)info; + struct kbase_hwcnt_backend_csf_info *csf_info = (struct kbase_hwcnt_backend_csf_info *)info; int errcode; bool success = false; @@ -1509,11 +1388,9 @@ kbasep_hwcnt_backend_csf_init(const struct kbase_hwcnt_backend_info *info, *out_backend = (struct kbase_hwcnt_backend *)backend_csf; success = true; if (csf_info->unrecoverable_error_happened) - backend_csf->enable_state = - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR; + backend_csf->enable_state = KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR; } - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); /* Destroy the new created backend if the backend has already created * before. In normal case, this won't happen if the client call init() @@ -1531,8 +1408,7 @@ kbasep_hwcnt_backend_csf_init(const struct kbase_hwcnt_backend_info *info, static void kbasep_hwcnt_backend_csf_term(struct kbase_hwcnt_backend *backend) { unsigned long flags; - struct kbase_hwcnt_backend_csf *backend_csf = - (struct kbase_hwcnt_backend_csf *)backend; + struct kbase_hwcnt_backend_csf *backend_csf = (struct kbase_hwcnt_backend_csf *)backend; if (!backend) return; @@ -1544,8 +1420,7 @@ static void kbasep_hwcnt_backend_csf_term(struct kbase_hwcnt_backend *backend) */ backend_csf->info->csf_if->lock(backend_csf->info->csf_if->ctx, &flags); backend_csf->info->backend = NULL; - backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, - flags); + backend_csf->info->csf_if->unlock(backend_csf->info->csf_if->ctx, flags); kbasep_hwcnt_backend_csf_destroy(backend_csf); } @@ -1557,8 +1432,7 @@ static void kbasep_hwcnt_backend_csf_term(struct kbase_hwcnt_backend *backend) * Can be safely called on a backend info in any state of partial construction. 
* */ -static void kbasep_hwcnt_backend_csf_info_destroy( - const struct kbase_hwcnt_backend_csf_info *info) +static void kbasep_hwcnt_backend_csf_info_destroy(const struct kbase_hwcnt_backend_csf_info *info) { if (!info) return; @@ -1585,10 +1459,10 @@ static void kbasep_hwcnt_backend_csf_info_destroy( * * Return: 0 on success, else error code. */ -static int kbasep_hwcnt_backend_csf_info_create( - struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt, - struct kbase_hwcnt_watchdog_interface *watchdog_if, - const struct kbase_hwcnt_backend_csf_info **out_info) +static int +kbasep_hwcnt_backend_csf_info_create(struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt, + struct kbase_hwcnt_watchdog_interface *watchdog_if, + const struct kbase_hwcnt_backend_csf_info **out_info) { struct kbase_hwcnt_backend_csf_info *info = NULL; @@ -1611,8 +1485,7 @@ static int kbasep_hwcnt_backend_csf_info_create( .counter_set = KBASE_HWCNT_SET_PRIMARY, #endif .backend = NULL, .csf_if = csf_if, .ring_buf_cnt = ring_buf_cnt, - .fw_in_protected_mode = false, - .unrecoverable_error_happened = false, + .fw_in_protected_mode = false, .unrecoverable_error_happened = false, .watchdog_if = watchdog_if, }; *out_info = info; @@ -1632,19 +1505,17 @@ kbasep_hwcnt_backend_csf_metadata(const struct kbase_hwcnt_backend_info *info) return ((const struct kbase_hwcnt_backend_csf_info *)info)->metadata; } -static void kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - struct kbase_hwcnt_backend_csf *backend_csf) +static void +kbasep_hwcnt_backend_csf_handle_unrecoverable_error(struct kbase_hwcnt_backend_csf *backend_csf) { bool do_disable = false; - backend_csf->info->csf_if->assert_lock_held( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx); /* We are already in or transitioning to the unrecoverable error state. * Early out. */ - if ((backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR) || + if ((backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR) || (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR_WAIT_FOR_WORKER)) return; @@ -1654,8 +1525,7 @@ static void kbasep_hwcnt_backend_csf_handle_unrecoverable_error( */ if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_DISABLED) { kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( - backend_csf, - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR); + backend_csf, KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR); return; } @@ -1663,12 +1533,11 @@ static void kbasep_hwcnt_backend_csf_handle_unrecoverable_error( * disabled, we don't want to disable twice if an unrecoverable error * happens while we are disabling. */ - do_disable = (backend_csf->enable_state != - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED); + do_disable = + (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED); kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( - backend_csf, - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR_WAIT_FOR_WORKER); + backend_csf, KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR_WAIT_FOR_WORKER); /* Transition the dump to the IDLE state and unblock any waiters. The * IDLE state signifies an error. @@ -1681,15 +1550,13 @@ static void kbasep_hwcnt_backend_csf_handle_unrecoverable_error( * happens while we are disabling. 
*/ if (do_disable) - backend_csf->info->csf_if->dump_disable( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->dump_disable(backend_csf->info->csf_if->ctx); } -static void kbasep_hwcnt_backend_csf_handle_recoverable_error( - struct kbase_hwcnt_backend_csf *backend_csf) +static void +kbasep_hwcnt_backend_csf_handle_recoverable_error(struct kbase_hwcnt_backend_csf *backend_csf) { - backend_csf->info->csf_if->assert_lock_held( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->assert_lock_held(backend_csf->info->csf_if->ctx); switch (backend_csf->enable_state) { case KBASE_HWCNT_BACKEND_CSF_DISABLED: @@ -1705,8 +1572,7 @@ static void kbasep_hwcnt_backend_csf_handle_recoverable_error( /* A seemingly recoverable error that occurs while we are * transitioning to enabled is probably unrecoverable. */ - kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - backend_csf); + kbasep_hwcnt_backend_csf_handle_unrecoverable_error(backend_csf); return; case KBASE_HWCNT_BACKEND_CSF_ENABLED: /* Start transitioning to the disabled state. We can't wait for @@ -1715,22 +1581,19 @@ static void kbasep_hwcnt_backend_csf_handle_recoverable_error( * disable(). */ kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( - backend_csf, - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED); + backend_csf, KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED); /* Transition the dump to the IDLE state and unblock any * waiters. The IDLE state signifies an error. */ backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_IDLE; complete_all(&backend_csf->dump_completed); - backend_csf->info->csf_if->dump_disable( - backend_csf->info->csf_if->ctx); + backend_csf->info->csf_if->dump_disable(backend_csf->info->csf_if->ctx); return; } } -void kbase_hwcnt_backend_csf_protm_entered( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_protm_entered(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info = (struct kbase_hwcnt_backend_csf_info *)iface->info; @@ -1744,8 +1607,7 @@ void kbase_hwcnt_backend_csf_protm_entered( kbase_hwcnt_backend_csf_on_prfcnt_sample(iface); } -void kbase_hwcnt_backend_csf_protm_exited( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_protm_exited(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; @@ -1755,8 +1617,7 @@ void kbase_hwcnt_backend_csf_protm_exited( csf_info->fw_in_protected_mode = false; } -void kbase_hwcnt_backend_csf_on_unrecoverable_error( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_on_unrecoverable_error(struct kbase_hwcnt_backend_interface *iface) { unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; @@ -1776,8 +1637,7 @@ void kbase_hwcnt_backend_csf_on_unrecoverable_error( csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags); } -void kbase_hwcnt_backend_csf_on_before_reset( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_on_before_reset(struct kbase_hwcnt_backend_interface *iface) { unsigned long flags; struct kbase_hwcnt_backend_csf_info *csf_info; @@ -1795,8 +1655,7 @@ void kbase_hwcnt_backend_csf_on_before_reset( backend_csf = csf_info->backend; if ((backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_DISABLED) && - (backend_csf->enable_state != - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR)) { + (backend_csf->enable_state != KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR)) { /* Before a reset occurs, we must either have been disabled * (else we lose 
data) or we should have encountered an * unrecoverable error. Either way, we will have disabled the @@ -1807,13 +1666,11 @@ void kbase_hwcnt_backend_csf_on_before_reset( * We can't wait for this disable to complete, but it doesn't * really matter, the power is being pulled. */ - kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - csf_info->backend); + kbasep_hwcnt_backend_csf_handle_unrecoverable_error(csf_info->backend); } /* A reset is the only way to exit the unrecoverable error state */ - if (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR) { + if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_UNRECOVERABLE_ERROR) { kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( backend_csf, KBASE_HWCNT_BACKEND_CSF_DISABLED); } @@ -1821,8 +1678,7 @@ void kbase_hwcnt_backend_csf_on_before_reset( csf_info->csf_if->unlock(csf_info->csf_if->ctx, flags); } -void kbase_hwcnt_backend_csf_on_prfcnt_sample( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_on_prfcnt_sample(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; @@ -1836,10 +1692,8 @@ void kbase_hwcnt_backend_csf_on_prfcnt_sample( backend_csf = csf_info->backend; /* Skip the dump_work if it's a watchdog request. */ - if (backend_csf->dump_state == - KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED) { - backend_csf->dump_state = - KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; + if (backend_csf->dump_state == KBASE_HWCNT_BACKEND_CSF_DUMP_WATCHDOG_REQUESTED) { + backend_csf->dump_state = KBASE_HWCNT_BACKEND_CSF_DUMP_COMPLETED; return; } @@ -1853,8 +1707,7 @@ void kbase_hwcnt_backend_csf_on_prfcnt_sample( kbase_hwcnt_backend_csf_submit_dump_worker(csf_info); } -void kbase_hwcnt_backend_csf_on_prfcnt_threshold( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_on_prfcnt_threshold(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; @@ -1871,12 +1724,10 @@ void kbase_hwcnt_backend_csf_on_prfcnt_threshold( /* Submit the threshold work into the work queue to consume the * available samples. 
*/ - queue_work(backend_csf->hwc_dump_workq, - &backend_csf->hwc_threshold_work); + queue_work(backend_csf->hwc_dump_workq, &backend_csf->hwc_threshold_work); } -void kbase_hwcnt_backend_csf_on_prfcnt_overflow( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_on_prfcnt_overflow(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; @@ -1897,8 +1748,7 @@ void kbase_hwcnt_backend_csf_on_prfcnt_overflow( kbasep_hwcnt_backend_csf_handle_recoverable_error(csf_info->backend); } -void kbase_hwcnt_backend_csf_on_prfcnt_enable( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_on_prfcnt_enable(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; @@ -1911,12 +1761,10 @@ void kbase_hwcnt_backend_csf_on_prfcnt_enable( return; backend_csf = csf_info->backend; - if (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) { + if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_ENABLED) { kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( backend_csf, KBASE_HWCNT_BACKEND_CSF_ENABLED); - } else if (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_ENABLED) { + } else if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_ENABLED) { /* Unexpected, but we are already in the right state so just * ignore it. */ @@ -1924,13 +1772,11 @@ void kbase_hwcnt_backend_csf_on_prfcnt_enable( /* Unexpected state change, assume everything is broken until * we reset. */ - kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - csf_info->backend); + kbasep_hwcnt_backend_csf_handle_unrecoverable_error(csf_info->backend); } } -void kbase_hwcnt_backend_csf_on_prfcnt_disable( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_on_prfcnt_disable(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_backend_csf *backend_csf; @@ -1943,13 +1789,10 @@ void kbase_hwcnt_backend_csf_on_prfcnt_disable( return; backend_csf = csf_info->backend; - if (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED) { + if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_TRANSITIONING_TO_DISABLED) { kbasep_hwcnt_backend_csf_change_es_and_wake_waiters( - backend_csf, - KBASE_HWCNT_BACKEND_CSF_DISABLED_WAIT_FOR_WORKER); - } else if (backend_csf->enable_state == - KBASE_HWCNT_BACKEND_CSF_DISABLED) { + backend_csf, KBASE_HWCNT_BACKEND_CSF_DISABLED_WAIT_FOR_WORKER); + } else if (backend_csf->enable_state == KBASE_HWCNT_BACKEND_CSF_DISABLED) { /* Unexpected, but we are already in the right state so just * ignore it. */ @@ -1957,13 +1800,11 @@ void kbase_hwcnt_backend_csf_on_prfcnt_disable( /* Unexpected state change, assume everything is broken until * we reset. 
*/ - kbasep_hwcnt_backend_csf_handle_unrecoverable_error( - csf_info->backend); + kbasep_hwcnt_backend_csf_handle_unrecoverable_error(csf_info->backend); } } -int kbase_hwcnt_backend_csf_metadata_init( - struct kbase_hwcnt_backend_interface *iface) +int kbase_hwcnt_backend_csf_metadata_init(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; struct kbase_hwcnt_gpu_info gpu_info; @@ -1975,8 +1816,7 @@ int kbase_hwcnt_backend_csf_metadata_init( WARN_ON(!csf_info->csf_if->get_prfcnt_info); - csf_info->csf_if->get_prfcnt_info(csf_info->csf_if->ctx, - &csf_info->prfcnt_info); + csf_info->csf_if->get_prfcnt_info(csf_info->csf_if->ctx, &csf_info->prfcnt_info); /* The clock domain counts should not exceed the number of maximum * number of clock regulators. @@ -1988,14 +1828,12 @@ int kbase_hwcnt_backend_csf_metadata_init( gpu_info.core_mask = csf_info->prfcnt_info.core_mask; gpu_info.clk_cnt = csf_info->prfcnt_info.clk_cnt; gpu_info.prfcnt_values_per_block = - csf_info->prfcnt_info.prfcnt_block_size / - KBASE_HWCNT_VALUE_HW_BYTES; + csf_info->prfcnt_info.prfcnt_block_size / KBASE_HWCNT_VALUE_HW_BYTES; return kbase_hwcnt_csf_metadata_create(&gpu_info, csf_info->counter_set, &csf_info->metadata); } -void kbase_hwcnt_backend_csf_metadata_term( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_csf_metadata_term(struct kbase_hwcnt_backend_interface *iface) { struct kbase_hwcnt_backend_csf_info *csf_info; @@ -2009,10 +1847,9 @@ void kbase_hwcnt_backend_csf_metadata_term( } } -int kbase_hwcnt_backend_csf_create( - struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt, - struct kbase_hwcnt_watchdog_interface *watchdog_if, - struct kbase_hwcnt_backend_interface *iface) +int kbase_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt, + struct kbase_hwcnt_watchdog_interface *watchdog_if, + struct kbase_hwcnt_backend_interface *iface) { int errcode; const struct kbase_hwcnt_backend_csf_info *info = NULL; @@ -2024,8 +1861,7 @@ int kbase_hwcnt_backend_csf_create( if (!is_power_of_2(ring_buf_cnt)) return -EINVAL; - errcode = kbasep_hwcnt_backend_csf_info_create(csf_if, ring_buf_cnt, - watchdog_if, &info); + errcode = kbasep_hwcnt_backend_csf_info_create(csf_if, ring_buf_cnt, watchdog_if, &info); if (errcode) return errcode; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf.h b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h similarity index 77% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf.h rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h index e0cafbe70660..9c5a5c996ebd 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf.h +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -27,9 +27,9 @@ #ifndef _KBASE_HWCNT_BACKEND_CSF_H_ #define _KBASE_HWCNT_BACKEND_CSF_H_ -#include "mali_kbase_hwcnt_backend.h" -#include "mali_kbase_hwcnt_backend_csf_if.h" -#include "mali_kbase_hwcnt_watchdog_if.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h" +#include "hwcnt/mali_kbase_hwcnt_watchdog_if.h" /** * kbase_hwcnt_backend_csf_create() - Create a CSF hardware counter backend @@ -47,10 +47,9 @@ * * Return: 0 on success, else error code. */ -int kbase_hwcnt_backend_csf_create( - struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt, - struct kbase_hwcnt_watchdog_interface *watchdog_if, - struct kbase_hwcnt_backend_interface *iface); +int kbase_hwcnt_backend_csf_create(struct kbase_hwcnt_backend_csf_if *csf_if, u32 ring_buf_cnt, + struct kbase_hwcnt_watchdog_interface *watchdog_if, + struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_metadata_init() - Initialize the metadata for a CSF @@ -58,16 +57,14 @@ int kbase_hwcnt_backend_csf_create( * @iface: Non-NULL pointer to backend interface structure * Return: 0 on success, else error code. */ -int kbase_hwcnt_backend_csf_metadata_init( - struct kbase_hwcnt_backend_interface *iface); +int kbase_hwcnt_backend_csf_metadata_init(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_metadata_term() - Terminate the metadata for a CSF * hardware counter backend. * @iface: Non-NULL pointer to backend interface structure. */ -void kbase_hwcnt_backend_csf_metadata_term( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_metadata_term(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_destroy() - Destroy a CSF hardware counter backend @@ -77,8 +74,7 @@ void kbase_hwcnt_backend_csf_metadata_term( * Can be safely called on an all-zeroed interface, or on an already destroyed * interface. */ -void kbase_hwcnt_backend_csf_destroy( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_destroy(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_protm_entered() - CSF HWC backend function to receive @@ -86,8 +82,7 @@ void kbase_hwcnt_backend_csf_destroy( * has been entered. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_protm_entered( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_protm_entered(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_protm_exited() - CSF HWC backend function to receive @@ -95,8 +90,7 @@ void kbase_hwcnt_backend_csf_protm_entered( * been exited. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_protm_exited( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_protm_exited(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_on_unrecoverable_error() - CSF HWC backend function @@ -108,8 +102,7 @@ void kbase_hwcnt_backend_csf_protm_exited( * with reset, or that may put HWC logic in state that could result in hang. For * example, on bus error, or when FW becomes unresponsive. 
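[Editorial aside, not part of the patch] The declarations above imply a create / metadata_init / metadata_term / destroy lifecycle for the CSF backend. A minimal sketch of how a caller might sequence these; example_csf_backend_setup, my_csf_if and my_watchdog_if are illustrative names, and 128 is just an example ring-buffer count (the count must be a power of two, as checked in kbase_hwcnt_backend_csf_create()):

static int example_csf_backend_setup(struct kbase_hwcnt_backend_csf_if *my_csf_if,
                                     struct kbase_hwcnt_watchdog_interface *my_watchdog_if,
                                     struct kbase_hwcnt_backend_interface *iface)
{
        int err;

        /* Create the backend interface first; 128 is an illustrative
         * ring-buffer count and must be a power of two.
         */
        err = kbase_hwcnt_backend_csf_create(my_csf_if, 128, my_watchdog_if, iface);
        if (err)
                return err;

        /* Metadata is derived from prfcnt_info queried through csf_if, so it
         * is initialised as a separate step after creation.
         */
        err = kbase_hwcnt_backend_csf_metadata_init(iface);
        if (err) {
                /* Tear the partially initialised backend back down on failure. */
                kbase_hwcnt_backend_csf_destroy(iface);
                return err;
        }

        return 0;
}

Teardown would be the reverse: kbase_hwcnt_backend_csf_metadata_term() followed by kbase_hwcnt_backend_csf_destroy().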
*/ -void kbase_hwcnt_backend_csf_on_unrecoverable_error( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_on_unrecoverable_error(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_on_before_reset() - CSF HWC backend function to be @@ -119,16 +112,14 @@ void kbase_hwcnt_backend_csf_on_unrecoverable_error( * were in it. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_on_before_reset( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_on_before_reset(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_on_prfcnt_sample() - CSF performance counter sample * complete interrupt handler. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_on_prfcnt_sample( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_on_prfcnt_sample(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_on_prfcnt_threshold() - CSF performance counter @@ -136,31 +127,27 @@ void kbase_hwcnt_backend_csf_on_prfcnt_sample( * interrupt handler. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_on_prfcnt_threshold( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_on_prfcnt_threshold(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_on_prfcnt_overflow() - CSF performance counter buffer * overflow interrupt handler. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_on_prfcnt_overflow( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_on_prfcnt_overflow(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_on_prfcnt_enable() - CSF performance counter enabled * interrupt handler. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_on_prfcnt_enable( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_on_prfcnt_enable(struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_csf_on_prfcnt_disable() - CSF performance counter * disabled interrupt handler. * @iface: Non-NULL pointer to HWC backend interface. */ -void kbase_hwcnt_backend_csf_on_prfcnt_disable( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_csf_on_prfcnt_disable(struct kbase_hwcnt_backend_interface *iface); #endif /* _KBASE_HWCNT_BACKEND_CSF_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if.h b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h similarity index 85% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if.h rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h index 24b26c2bd6f4..382a3adaa127 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if.h +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h @@ -85,8 +85,8 @@ struct kbase_hwcnt_backend_csf_if_prfcnt_info { * held. * @ctx: Non-NULL pointer to a CSF context. */ -typedef void kbase_hwcnt_backend_csf_if_assert_lock_held_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx); +typedef void +kbase_hwcnt_backend_csf_if_assert_lock_held_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx); /** * typedef kbase_hwcnt_backend_csf_if_lock_fn - Acquire backend spinlock. 
@@ -95,9 +95,8 @@ typedef void kbase_hwcnt_backend_csf_if_assert_lock_held_fn( * @flags: Pointer to the memory location that would store the previous * interrupt state. */ -typedef void kbase_hwcnt_backend_csf_if_lock_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - unsigned long *flags); +typedef void kbase_hwcnt_backend_csf_if_lock_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + unsigned long *flags); /** * typedef kbase_hwcnt_backend_csf_if_unlock_fn - Release backend spinlock. @@ -106,9 +105,8 @@ typedef void kbase_hwcnt_backend_csf_if_lock_fn( * @flags: Previously stored interrupt state when Scheduler interrupt * spinlock was acquired. */ -typedef void kbase_hwcnt_backend_csf_if_unlock_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - unsigned long flags); +typedef void kbase_hwcnt_backend_csf_if_unlock_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + unsigned long flags); /** * typedef kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn - Get performance @@ -137,10 +135,10 @@ typedef void kbase_hwcnt_backend_csf_if_get_prfcnt_info_fn( * * Return: 0 on success, else error code. */ -typedef int kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 buf_count, - void **cpu_dump_base, - struct kbase_hwcnt_backend_csf_if_ring_buf **ring_buf); +typedef int +kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + u32 buf_count, void **cpu_dump_base, + struct kbase_hwcnt_backend_csf_if_ring_buf **ring_buf); /** * typedef kbase_hwcnt_backend_csf_if_ring_buf_sync_fn - Sync HWC dump buffers @@ -159,10 +157,10 @@ typedef int kbase_hwcnt_backend_csf_if_ring_buf_alloc_fn( * Flush cached HWC dump buffer data to ensure that all writes from GPU and CPU * are correctly observed. */ -typedef void kbase_hwcnt_backend_csf_if_ring_buf_sync_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, - u32 buf_index_first, u32 buf_index_last, bool for_cpu); +typedef void +kbase_hwcnt_backend_csf_if_ring_buf_sync_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, + u32 buf_index_first, u32 buf_index_last, bool for_cpu); /** * typedef kbase_hwcnt_backend_csf_if_ring_buf_free_fn - Free a ring buffer for @@ -171,9 +169,9 @@ typedef void kbase_hwcnt_backend_csf_if_ring_buf_sync_fn( * @ctx: Non-NULL pointer to a CSF interface context. * @ring_buf: Non-NULL pointer to the ring buffer which to be freed. */ -typedef void kbase_hwcnt_backend_csf_if_ring_buf_free_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf); +typedef void +kbase_hwcnt_backend_csf_if_ring_buf_free_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf); /** * typedef kbase_hwcnt_backend_csf_if_timestamp_ns_fn - Get the current @@ -183,8 +181,7 @@ typedef void kbase_hwcnt_backend_csf_if_ring_buf_free_fn( * * Return: CSF interface timestamp in nanoseconds. */ -typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx); +typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx); /** * typedef kbase_hwcnt_backend_csf_if_dump_enable_fn - Setup and enable hardware @@ -195,10 +192,10 @@ typedef u64 kbase_hwcnt_backend_csf_if_timestamp_ns_fn( * * Requires lock to be taken before calling. 
*/ -typedef void kbase_hwcnt_backend_csf_if_dump_enable_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, - struct kbase_hwcnt_backend_csf_if_enable *enable); +typedef void +kbase_hwcnt_backend_csf_if_dump_enable_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, + struct kbase_hwcnt_backend_csf_if_enable *enable); /** * typedef kbase_hwcnt_backend_csf_if_dump_disable_fn - Disable hardware counter @@ -207,8 +204,7 @@ typedef void kbase_hwcnt_backend_csf_if_dump_enable_fn( * * Requires lock to be taken before calling. */ -typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx); +typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx); /** * typedef kbase_hwcnt_backend_csf_if_dump_request_fn - Request a HWC dump. @@ -217,8 +213,7 @@ typedef void kbase_hwcnt_backend_csf_if_dump_disable_fn( * * Requires lock to be taken before calling. */ -typedef void kbase_hwcnt_backend_csf_if_dump_request_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx); +typedef void kbase_hwcnt_backend_csf_if_dump_request_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx); /** * typedef kbase_hwcnt_backend_csf_if_get_indexes_fn - Get current extract and @@ -231,9 +226,8 @@ typedef void kbase_hwcnt_backend_csf_if_dump_request_fn( * * Requires lock to be taken before calling. */ -typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 *extract_index, - u32 *insert_index); +typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + u32 *extract_index, u32 *insert_index); /** * typedef kbase_hwcnt_backend_csf_if_set_extract_index_fn - Update the extract @@ -245,8 +239,9 @@ typedef void kbase_hwcnt_backend_csf_if_get_indexes_fn( * * Requires lock to be taken before calling. */ -typedef void kbase_hwcnt_backend_csf_if_set_extract_index_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 extract_index); +typedef void +kbase_hwcnt_backend_csf_if_set_extract_index_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + u32 extract_index); /** * typedef kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn - Get the current @@ -260,9 +255,9 @@ typedef void kbase_hwcnt_backend_csf_if_set_extract_index_fn( * * Requires lock to be taken before calling. 
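[Editorial aside, not part of the patch] Several of the hooks above are annotated "Requires lock to be taken before calling"; the expectation is that callers bracket them with the interface's own lock/unlock hooks. A hypothetical caller using only members visible in this patch (the function name example_locked_dump_request is illustrative):

static void example_locked_dump_request(struct kbase_hwcnt_backend_csf_if *csf_if)
{
        unsigned long flags;

        /* The lock hook stores the previous interrupt state into flags... */
        csf_if->lock(csf_if->ctx, &flags);

        /* ...so that hooks documented as requiring the lock can be called... */
        csf_if->dump_request(csf_if->ctx);

        /* ...and the unlock hook restores the saved state afterwards. */
        csf_if->unlock(csf_if->ctx, flags);
}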
*/ -typedef void kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u64 *cycle_counts, - u64 clk_enable_map); +typedef void +kbase_hwcnt_backend_csf_if_get_gpu_cycle_count_fn(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + u64 *cycle_counts, u64 clk_enable_map); /** * struct kbase_hwcnt_backend_csf_if - Hardware counter backend CSF virtual diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if_fw.c b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c similarity index 73% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if_fw.c rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c index ab33a0b26486..9985752a3748 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if_fw.c +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.c @@ -26,12 +26,12 @@ #include #include #include -#include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_gpu.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include #include "csf/mali_kbase_csf_firmware.h" -#include "mali_kbase_hwcnt_backend_csf_if_fw.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h" #include "mali_kbase_hwaccess_time.h" #include "backend/gpu/mali_kbase_clk_rate_trace_mgr.h" @@ -42,9 +42,6 @@ #include #endif /* CONFIG_MALI_BIFROST_NO_MALI */ -/** The number of nanoseconds in a second. */ -#define NSECS_IN_SEC 1000000000ull /* ns */ - /* Ring buffer virtual address start at 4GB */ #define KBASE_HWC_CSF_RING_BUFFER_VA_START (1ull << 32) @@ -90,8 +87,8 @@ struct kbase_hwcnt_backend_csf_if_fw_ctx { struct kbase_ccswe ccswe_shader_cores; }; -static void kbasep_hwcnt_backend_csf_if_fw_assert_lock_held( - struct kbase_hwcnt_backend_csf_if_ctx *ctx) +static void +kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(struct kbase_hwcnt_backend_csf_if_ctx *ctx) { struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx; struct kbase_device *kbdev; @@ -104,9 +101,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_assert_lock_held( kbase_csf_scheduler_spin_lock_assert_held(kbdev); } -static void -kbasep_hwcnt_backend_csf_if_fw_lock(struct kbase_hwcnt_backend_csf_if_ctx *ctx, - unsigned long *flags) +static void kbasep_hwcnt_backend_csf_if_fw_lock(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + unsigned long *flags) { struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx; struct kbase_device *kbdev; @@ -119,8 +115,8 @@ kbasep_hwcnt_backend_csf_if_fw_lock(struct kbase_hwcnt_backend_csf_if_ctx *ctx, kbase_csf_scheduler_spin_lock(kbdev, flags); } -static void kbasep_hwcnt_backend_csf_if_fw_unlock( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, unsigned long flags) +static void kbasep_hwcnt_backend_csf_if_fw_unlock(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + unsigned long flags) { struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx; struct kbase_device *kbdev; @@ -141,22 +137,19 @@ static void kbasep_hwcnt_backend_csf_if_fw_unlock( * @clk_index: Clock index * @clk_rate_hz: Clock frequency(hz) */ -static void kbasep_hwcnt_backend_csf_if_fw_on_freq_change( - struct kbase_clk_rate_listener *rate_listener, u32 clk_index, - u32 clk_rate_hz) +static void +kbasep_hwcnt_backend_csf_if_fw_on_freq_change(struct kbase_clk_rate_listener *rate_listener, + u32 clk_index, u32 clk_rate_hz) { - struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = - container_of(rate_listener, - struct kbase_hwcnt_backend_csf_if_fw_ctx, - rate_listener); + struct kbase_hwcnt_backend_csf_if_fw_ctx 
*fw_ctx = container_of( + rate_listener, struct kbase_hwcnt_backend_csf_if_fw_ctx, rate_listener); u64 timestamp_ns; if (clk_index != KBASE_CLOCK_DOMAIN_SHADER_CORES) return; timestamp_ns = ktime_get_raw_ns(); - kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores, timestamp_ns, - clk_rate_hz); + kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores, timestamp_ns, clk_rate_hz); } /** @@ -165,17 +158,16 @@ static void kbasep_hwcnt_backend_csf_if_fw_on_freq_change( * @fw_ctx: Non-NULL pointer to CSF firmware interface context. * @clk_enable_map: Non-NULL pointer to enable map specifying enabled counters. */ -static void kbasep_hwcnt_backend_csf_if_fw_cc_enable( - struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx, u64 clk_enable_map) +static void +kbasep_hwcnt_backend_csf_if_fw_cc_enable(struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx, + u64 clk_enable_map) { struct kbase_device *kbdev = fw_ctx->kbdev; - if (kbase_hwcnt_clk_enable_map_enabled( - clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) { + if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) { /* software estimation for non-top clock domains */ struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm; - const struct kbase_clk_data *clk_data = - rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES]; + const struct kbase_clk_data *clk_data = rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES]; u32 cur_freq; unsigned long flags; u64 timestamp_ns; @@ -186,11 +178,9 @@ static void kbasep_hwcnt_backend_csf_if_fw_cc_enable( cur_freq = (u32)clk_data->clock_val; kbase_ccswe_reset(&fw_ctx->ccswe_shader_cores); - kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores, - timestamp_ns, cur_freq); + kbase_ccswe_freq_change(&fw_ctx->ccswe_shader_cores, timestamp_ns, cur_freq); - kbase_clk_rate_trace_manager_subscribe_no_lock( - rtm, &fw_ctx->rate_listener); + kbase_clk_rate_trace_manager_subscribe_no_lock(rtm, &fw_ctx->rate_listener); spin_unlock_irqrestore(&rtm->lock, flags); } @@ -203,17 +193,15 @@ static void kbasep_hwcnt_backend_csf_if_fw_cc_enable( * * @fw_ctx: Non-NULL pointer to CSF firmware interface context. 
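[Editorial aside, not part of the patch] As the comments above note, cycle counts for non-top clock domains (the shader cores) are software-estimated rather than read from the GPU: a rate listener records frequency changes and kbase_ccswe derives cycles from that history. A simplified, standalone illustration of the idea follows; it is not the driver's kbase_ccswe implementation, and estimate_cycles / freq_segment are made-up names. Overflow handling and partial segments are ignored for brevity.

#include <stddef.h>
#include <stdint.h>

struct freq_segment {
        uint64_t duration_ns; /* time spent at this frequency */
        uint32_t freq_hz;     /* frequency during that time   */
};

/* Accumulate cycles = sum(frequency * time) over the recorded segments. */
static uint64_t estimate_cycles(const struct freq_segment *segs, size_t n)
{
        uint64_t cycles = 0;
        size_t i;

        for (i = 0; i < n; i++)
                cycles += ((uint64_t)segs[i].freq_hz * segs[i].duration_ns) / 1000000000ull;

        return cycles;
}

The driver's version appears to do the equivalent incrementally: kbase_ccswe_freq_change() records each (timestamp, frequency) transition and kbase_ccswe_cycle_at() evaluates the running total at a given timestamp.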
*/ -static void kbasep_hwcnt_backend_csf_if_fw_cc_disable( - struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx) +static void +kbasep_hwcnt_backend_csf_if_fw_cc_disable(struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx) { struct kbase_device *kbdev = fw_ctx->kbdev; struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm; u64 clk_enable_map = fw_ctx->clk_enable_map; - if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, - KBASE_CLOCK_DOMAIN_SHADER_CORES)) - kbase_clk_rate_trace_manager_unsubscribe( - rtm, &fw_ctx->rate_listener); + if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) + kbase_clk_rate_trace_manager_unsubscribe(rtm, &fw_ctx->rate_listener); } static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( @@ -244,8 +232,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( u32 prfcnt_size; u32 prfcnt_hw_size; u32 prfcnt_fw_size; - u32 prfcnt_block_size = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK * - KBASE_HWCNT_VALUE_HW_BYTES; + u32 prfcnt_block_size = + KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK * KBASE_HWCNT_VALUE_HW_BYTES; WARN_ON(!ctx); WARN_ON(!prfcnt_info); @@ -262,10 +250,9 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( */ if ((kbdev->gpu_props.props.raw_props.gpu_id & GPU_ID2_PRODUCT_MODEL) >= GPU_ID2_PRODUCT_TTUX) { - prfcnt_block_size = - PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET(kbase_reg_read( - kbdev, GPU_CONTROL_REG(PRFCNT_FEATURES))) - << 8; + prfcnt_block_size = PRFCNT_FEATURES_COUNTER_BLOCK_SIZE_GET( + kbase_reg_read(kbdev, GPU_CONTROL_REG(PRFCNT_FEATURES))) + << 8; } *prfcnt_info = (struct kbase_hwcnt_backend_csf_if_prfcnt_info){ @@ -280,17 +267,14 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info( }; /* Block size must be multiple of counter size. */ - WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_HW_BYTES) != - 0); + WARN_ON((prfcnt_info->prfcnt_block_size % KBASE_HWCNT_VALUE_HW_BYTES) != 0); /* Total size must be multiple of block size. 
*/ - WARN_ON((prfcnt_info->dump_bytes % prfcnt_info->prfcnt_block_size) != - 0); + WARN_ON((prfcnt_info->dump_bytes % prfcnt_info->prfcnt_block_size) != 0); #endif } static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 buf_count, - void **cpu_dump_base, + struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 buf_count, void **cpu_dump_base, struct kbase_hwcnt_backend_csf_if_ring_buf **out_ring_buf) { struct kbase_device *kbdev; @@ -342,9 +326,8 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( goto page_list_alloc_error; /* Get physical page for the buffer */ - ret = kbase_mem_pool_alloc_pages( - &kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages, - phys, false); + ret = kbase_mem_pool_alloc_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages, + phys, false); if (ret != num_pages) goto phys_mem_pool_alloc_error; @@ -360,9 +343,8 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( KBASE_REG_MEMATTR_INDEX(AS_MEMATTR_INDEX_NON_CACHEABLE); /* Update MMU table */ - ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, - gpu_va_base >> PAGE_SHIFT, phys, num_pages, - flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW, + ret = kbase_mmu_insert_pages(kbdev, &kbdev->csf.mcu_mmu, gpu_va_base >> PAGE_SHIFT, phys, + num_pages, flags, MCU_AS_NR, KBASE_MEM_GROUP_CSF_FW, mmu_sync_info); if (ret) goto mmu_insert_failed; @@ -381,17 +363,15 @@ static int kbasep_hwcnt_backend_csf_if_fw_ring_buf_alloc( fw_ring_buf->as_nr = MCU_AS_NR; *cpu_dump_base = fw_ring_buf->cpu_dump_base; - *out_ring_buf = - (struct kbase_hwcnt_backend_csf_if_ring_buf *)fw_ring_buf; + *out_ring_buf = (struct kbase_hwcnt_backend_csf_if_ring_buf *)fw_ring_buf; return 0; mmu_insert_failed: vunmap(cpu_addr); vmap_error: - kbase_mem_pool_free_pages( - &kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages, - phys, false, false); + kbase_mem_pool_free_pages(&kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], num_pages, phys, + false, false); phys_mem_pool_alloc_error: kfree(page_list); page_list_alloc_error: @@ -401,10 +381,10 @@ phys_alloc_error: return -ENOMEM; } -static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, - u32 buf_index_first, u32 buf_index_last, bool for_cpu) +static void +kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, + u32 buf_index_first, u32 buf_index_last, bool for_cpu) { struct kbase_hwcnt_backend_csf_if_fw_ring_buf *fw_ring_buf = (struct kbase_hwcnt_backend_csf_if_fw_ring_buf *)ring_buf; @@ -435,8 +415,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync( * inclusive at both ends so full flushes are not 0 -> 0. */ ring_buf_index_first = buf_index_first & (fw_ring_buf->buf_count - 1); - ring_buf_index_last = - (buf_index_last - 1) & (fw_ring_buf->buf_count - 1); + ring_buf_index_last = (buf_index_last - 1) & (fw_ring_buf->buf_count - 1); /* The start address is the offset of the first buffer. 
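[Editorial aside, not part of the patch] The index masking above relies on buf_count being a power of two. A quick worked example with illustrative numbers:

/*
 * Example: buf_count = 4, syncing buffers [2, 6), i.e. a full ring:
 *
 *   ring_buf_index_first = 2 & (4 - 1)       = 2
 *   ring_buf_index_last  = (6 - 1) & (4 - 1) = 1   (inclusive)
 *
 * Because first > last the range wraps, which is presumably why the
 * function syncs pages in two passes rather than as one contiguous
 * span; keeping both ends inclusive also stops a full flush from
 * degenerating into an empty 0 -> 0 range, as the comment above says.
 */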
*/ start_address = fw_ctx->buf_bytes * ring_buf_index_first; @@ -453,15 +432,11 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync( struct page *pg = as_page(fw_ring_buf->phys[i]); if (for_cpu) { - kbase_sync_single_for_cpu(fw_ctx->kbdev, - kbase_dma_addr(pg), - PAGE_SIZE, - DMA_BIDIRECTIONAL); + kbase_sync_single_for_cpu(fw_ctx->kbdev, kbase_dma_addr(pg), + PAGE_SIZE, DMA_BIDIRECTIONAL); } else { - kbase_sync_single_for_device(fw_ctx->kbdev, - kbase_dma_addr(pg), - PAGE_SIZE, - DMA_BIDIRECTIONAL); + kbase_sync_single_for_device(fw_ctx->kbdev, kbase_dma_addr(pg), + PAGE_SIZE, DMA_BIDIRECTIONAL); } } @@ -473,28 +448,24 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_sync( struct page *pg = as_page(fw_ring_buf->phys[i]); if (for_cpu) { - kbase_sync_single_for_cpu(fw_ctx->kbdev, - kbase_dma_addr(pg), PAGE_SIZE, + kbase_sync_single_for_cpu(fw_ctx->kbdev, kbase_dma_addr(pg), PAGE_SIZE, DMA_BIDIRECTIONAL); } else { - kbase_sync_single_for_device(fw_ctx->kbdev, - kbase_dma_addr(pg), - PAGE_SIZE, + kbase_sync_single_for_device(fw_ctx->kbdev, kbase_dma_addr(pg), PAGE_SIZE, DMA_BIDIRECTIONAL); } } } -static u64 kbasep_hwcnt_backend_csf_if_fw_timestamp_ns( - struct kbase_hwcnt_backend_csf_if_ctx *ctx) +static u64 kbasep_hwcnt_backend_csf_if_fw_timestamp_ns(struct kbase_hwcnt_backend_csf_if_ctx *ctx) { CSTD_UNUSED(ctx); return ktime_get_raw_ns(); } -static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_free( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf) +static void +kbasep_hwcnt_backend_csf_if_fw_ring_buf_free(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf) { struct kbase_hwcnt_backend_csf_if_fw_ring_buf *fw_ring_buf = (struct kbase_hwcnt_backend_csf_if_fw_ring_buf *)ring_buf; @@ -513,10 +484,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_free( vunmap(fw_ring_buf->cpu_dump_base); - kbase_mem_pool_free_pages( - &fw_ctx->kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], - fw_ring_buf->num_pages, fw_ring_buf->phys, false, - false); + kbase_mem_pool_free_pages(&fw_ctx->kbdev->mem_pools.small[KBASE_MEM_GROUP_CSF_FW], + fw_ring_buf->num_pages, fw_ring_buf->phys, false, false); kfree(fw_ring_buf->phys); @@ -524,10 +493,10 @@ static void kbasep_hwcnt_backend_csf_if_fw_ring_buf_free( } } -static void kbasep_hwcnt_backend_csf_if_fw_dump_enable( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, - struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, - struct kbase_hwcnt_backend_csf_if_enable *enable) +static void +kbasep_hwcnt_backend_csf_if_fw_dump_enable(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + struct kbase_hwcnt_backend_csf_if_ring_buf *ring_buf, + struct kbase_hwcnt_backend_csf_if_enable *enable) { u32 prfcnt_config; struct kbase_device *kbdev; @@ -550,8 +519,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable( prfcnt_config = GLB_PRFCNT_CONFIG_SET_SELECT_SET(prfcnt_config, enable->counter_set); /* Configure the ring buffer base address */ - kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_JASID, - fw_ring_buf->as_nr); + kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_JASID, fw_ring_buf->as_nr); kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_BASE_LO, fw_ring_buf->gpu_dump_base & U32_MAX); kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_BASE_HI, @@ -561,38 +529,29 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable( kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_EXTRACT, 0); /* Configure the enable bitmap */ - 
kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CSF_EN, - enable->fe_bm); - kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_SHADER_EN, - enable->shader_bm); - kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_MMU_L2_EN, - enable->mmu_l2_bm); - kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_TILER_EN, - enable->tiler_bm); + kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CSF_EN, enable->fe_bm); + kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_SHADER_EN, enable->shader_bm); + kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_MMU_L2_EN, enable->mmu_l2_bm); + kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_TILER_EN, enable->tiler_bm); /* Configure the HWC set and buffer size */ - kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CONFIG, - prfcnt_config); + kbase_csf_firmware_global_input(global_iface, GLB_PRFCNT_CONFIG, prfcnt_config); kbdev->csf.hwcnt.enable_pending = true; /* Unmask the interrupts */ - kbase_csf_firmware_global_input_mask( - global_iface, GLB_ACK_IRQ_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK); - kbase_csf_firmware_global_input_mask( - global_iface, GLB_ACK_IRQ_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK); - kbase_csf_firmware_global_input_mask( - global_iface, GLB_ACK_IRQ_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK); - kbase_csf_firmware_global_input_mask( - global_iface, GLB_ACK_IRQ_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK, - GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK, + GLB_ACK_IRQ_MASK_PRFCNT_ENABLE_MASK); /* Enable the HWC */ kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, @@ -600,15 +559,12 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_enable( GLB_REQ_PRFCNT_ENABLE_MASK); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); - prfcnt_config = kbase_csf_firmware_global_input_read(global_iface, - GLB_PRFCNT_CONFIG); + prfcnt_config = kbase_csf_firmware_global_input_read(global_iface, GLB_PRFCNT_CONFIG); - kbasep_hwcnt_backend_csf_if_fw_cc_enable(fw_ctx, - enable->clk_enable_map); + kbasep_hwcnt_backend_csf_if_fw_cc_enable(fw_ctx, enable->clk_enable_map); } -static void kbasep_hwcnt_backend_csf_if_fw_dump_disable( - struct kbase_hwcnt_backend_csf_if_ctx *ctx) +static void kbasep_hwcnt_backend_csf_if_fw_dump_disable(struct kbase_hwcnt_backend_csf_if_ctx *ctx) { struct kbase_device *kbdev; struct kbase_csf_global_iface *global_iface; @@ -623,20 +579,16 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_disable( /* Disable the HWC */ kbdev->csf.hwcnt.enable_pending = true; - kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, 0, - GLB_REQ_PRFCNT_ENABLE_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_REQ, 0, GLB_REQ_PRFCNT_ENABLE_MASK); kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); /* mask the interrupts */ - kbase_csf_firmware_global_input_mask( - global_iface, 
GLB_ACK_IRQ_MASK, 0, - GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK); - kbase_csf_firmware_global_input_mask( - global_iface, GLB_ACK_IRQ_MASK, 0, - GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK); - kbase_csf_firmware_global_input_mask( - global_iface, GLB_ACK_IRQ_MASK, 0, - GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, 0, + GLB_ACK_IRQ_MASK_PRFCNT_SAMPLE_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, 0, + GLB_ACK_IRQ_MASK_PRFCNT_THRESHOLD_MASK); + kbase_csf_firmware_global_input_mask(global_iface, GLB_ACK_IRQ_MASK, 0, + GLB_ACK_IRQ_MASK_PRFCNT_OVERFLOW_MASK); /* In case we have a previous request in flight when the disable * happens. @@ -646,8 +598,7 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_disable( kbasep_hwcnt_backend_csf_if_fw_cc_disable(fw_ctx); } -static void kbasep_hwcnt_backend_csf_if_fw_dump_request( - struct kbase_hwcnt_backend_csf_if_ctx *ctx) +static void kbasep_hwcnt_backend_csf_if_fw_dump_request(struct kbase_hwcnt_backend_csf_if_ctx *ctx) { u32 glb_req; struct kbase_device *kbdev; @@ -670,9 +621,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_dump_request( kbase_csf_ring_doorbell(kbdev, CSF_KERNEL_DOORBELL_NR); } -static void kbasep_hwcnt_backend_csf_if_fw_get_indexes( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 *extract_index, - u32 *insert_index) +static void kbasep_hwcnt_backend_csf_if_fw_get_indexes(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + u32 *extract_index, u32 *insert_index) { struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; @@ -682,14 +632,15 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_indexes( WARN_ON(!insert_index); kbasep_hwcnt_backend_csf_if_fw_assert_lock_held(ctx); - *extract_index = kbase_csf_firmware_global_input_read( - &fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_EXTRACT); - *insert_index = kbase_csf_firmware_global_output( - &fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_INSERT); + *extract_index = kbase_csf_firmware_global_input_read(&fw_ctx->kbdev->csf.global_iface, + GLB_PRFCNT_EXTRACT); + *insert_index = kbase_csf_firmware_global_output(&fw_ctx->kbdev->csf.global_iface, + GLB_PRFCNT_INSERT); } -static void kbasep_hwcnt_backend_csf_if_fw_set_extract_index( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u32 extract_idx) +static void +kbasep_hwcnt_backend_csf_if_fw_set_extract_index(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + u32 extract_idx) { struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; @@ -700,13 +651,13 @@ static void kbasep_hwcnt_backend_csf_if_fw_set_extract_index( /* Set the raw extract index to release the buffer back to the ring * buffer. */ - kbase_csf_firmware_global_input(&fw_ctx->kbdev->csf.global_iface, - GLB_PRFCNT_EXTRACT, extract_idx); + kbase_csf_firmware_global_input(&fw_ctx->kbdev->csf.global_iface, GLB_PRFCNT_EXTRACT, + extract_idx); } -static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count( - struct kbase_hwcnt_backend_csf_if_ctx *ctx, u64 *cycle_counts, - u64 clk_enable_map) +static void +kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count(struct kbase_hwcnt_backend_csf_if_ctx *ctx, + u64 *cycle_counts, u64 clk_enable_map) { struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx = (struct kbase_hwcnt_backend_csf_if_fw_ctx *)ctx; @@ -723,12 +674,12 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count( if (clk == KBASE_CLOCK_DOMAIN_TOP) { /* Read cycle count for top clock domain. 
*/ - kbase_backend_get_gpu_time_norequest( - fw_ctx->kbdev, &cycle_counts[clk], NULL, NULL); + kbase_backend_get_gpu_time_norequest(fw_ctx->kbdev, &cycle_counts[clk], + NULL, NULL); } else { /* Estimate cycle count for non-top clock domain. */ - cycle_counts[clk] = kbase_ccswe_cycle_at( - &fw_ctx->ccswe_shader_cores, timestamp_ns); + cycle_counts[clk] = + kbase_ccswe_cycle_at(&fw_ctx->ccswe_shader_cores, timestamp_ns); } } } @@ -738,8 +689,8 @@ static void kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count( * * @fw_ctx: Pointer to context to destroy. */ -static void kbasep_hwcnt_backend_csf_if_fw_ctx_destroy( - struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx) +static void +kbasep_hwcnt_backend_csf_if_fw_ctx_destroy(struct kbase_hwcnt_backend_csf_if_fw_ctx *fw_ctx) { if (!fw_ctx) return; @@ -754,9 +705,9 @@ static void kbasep_hwcnt_backend_csf_if_fw_ctx_destroy( * @out_ctx: Non-NULL pointer to where info is stored on success. * Return: 0 on success, else error code. */ -static int kbasep_hwcnt_backend_csf_if_fw_ctx_create( - struct kbase_device *kbdev, - struct kbase_hwcnt_backend_csf_if_fw_ctx **out_ctx) +static int +kbasep_hwcnt_backend_csf_if_fw_ctx_create(struct kbase_device *kbdev, + struct kbase_hwcnt_backend_csf_if_fw_ctx **out_ctx) { u8 clk; int errcode = -ENOMEM; @@ -780,8 +731,7 @@ static int kbasep_hwcnt_backend_csf_if_fw_ctx_create( ctx->clk_enable_map = 0; kbase_ccswe_init(&ctx->ccswe_shader_cores); - ctx->rate_listener.notify = - kbasep_hwcnt_backend_csf_if_fw_on_freq_change; + ctx->rate_listener.notify = kbasep_hwcnt_backend_csf_if_fw_on_freq_change; *out_ctx = ctx; @@ -791,8 +741,7 @@ error: return errcode; } -void kbase_hwcnt_backend_csf_if_fw_destroy( - struct kbase_hwcnt_backend_csf_if *if_fw) +void kbase_hwcnt_backend_csf_if_fw_destroy(struct kbase_hwcnt_backend_csf_if *if_fw) { if (!if_fw) return; @@ -802,8 +751,8 @@ void kbase_hwcnt_backend_csf_if_fw_destroy( memset(if_fw, 0, sizeof(*if_fw)); } -int kbase_hwcnt_backend_csf_if_fw_create( - struct kbase_device *kbdev, struct kbase_hwcnt_backend_csf_if *if_fw) +int kbase_hwcnt_backend_csf_if_fw_create(struct kbase_device *kbdev, + struct kbase_hwcnt_backend_csf_if *if_fw) { int errcode; struct kbase_hwcnt_backend_csf_if_fw_ctx *ctx = NULL; @@ -816,8 +765,7 @@ int kbase_hwcnt_backend_csf_if_fw_create( return errcode; if_fw->ctx = (struct kbase_hwcnt_backend_csf_if_ctx *)ctx; - if_fw->assert_lock_held = - kbasep_hwcnt_backend_csf_if_fw_assert_lock_held; + if_fw->assert_lock_held = kbasep_hwcnt_backend_csf_if_fw_assert_lock_held; if_fw->lock = kbasep_hwcnt_backend_csf_if_fw_lock; if_fw->unlock = kbasep_hwcnt_backend_csf_if_fw_unlock; if_fw->get_prfcnt_info = kbasep_hwcnt_backend_csf_if_fw_get_prfcnt_info; @@ -828,11 +776,9 @@ int kbase_hwcnt_backend_csf_if_fw_create( if_fw->dump_enable = kbasep_hwcnt_backend_csf_if_fw_dump_enable; if_fw->dump_disable = kbasep_hwcnt_backend_csf_if_fw_dump_disable; if_fw->dump_request = kbasep_hwcnt_backend_csf_if_fw_dump_request; - if_fw->get_gpu_cycle_count = - kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count; + if_fw->get_gpu_cycle_count = kbasep_hwcnt_backend_csf_if_fw_get_gpu_cycle_count; if_fw->get_indexes = kbasep_hwcnt_backend_csf_if_fw_get_indexes; - if_fw->set_extract_index = - kbasep_hwcnt_backend_csf_if_fw_set_extract_index; + if_fw->set_extract_index = kbasep_hwcnt_backend_csf_if_fw_set_extract_index; return 0; } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if_fw.h b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h similarity index 
82% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if_fw.h rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h index b69668b2e822..71d1506694f4 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_csf_if_fw.h +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_csf_if_fw.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -26,7 +26,7 @@ #ifndef _KBASE_HWCNT_BACKEND_CSF_IF_FW_H_ #define _KBASE_HWCNT_BACKEND_CSF_IF_FW_H_ -#include "mali_kbase_hwcnt_backend_csf_if.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend_csf_if.h" /** * kbase_hwcnt_backend_csf_if_fw_create() - Create a firmware CSF interface @@ -36,15 +36,14 @@ * creation success. * Return: 0 on success, else error code. */ -int kbase_hwcnt_backend_csf_if_fw_create( - struct kbase_device *kbdev, struct kbase_hwcnt_backend_csf_if *if_fw); +int kbase_hwcnt_backend_csf_if_fw_create(struct kbase_device *kbdev, + struct kbase_hwcnt_backend_csf_if *if_fw); /** * kbase_hwcnt_backend_csf_if_fw_destroy() - Destroy a firmware CSF interface of * hardware counter backend. * @if_fw: Pointer to a CSF interface to destroy. */ -void kbase_hwcnt_backend_csf_if_fw_destroy( - struct kbase_hwcnt_backend_csf_if *if_fw); +void kbase_hwcnt_backend_csf_if_fw_destroy(struct kbase_hwcnt_backend_csf_if *if_fw); #endif /* _KBASE_HWCNT_BACKEND_CSF_IF_FW_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm.c b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c similarity index 73% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm.c rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c index c995a1923583..9d9889a0e426 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm.c +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.c @@ -19,9 +19,9 @@ * */ -#include "mali_kbase_hwcnt_backend_jm.h" -#include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend_jm.h" +#include "hwcnt/mali_kbase_hwcnt_gpu.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include "mali_kbase.h" #include "backend/gpu/mali_kbase_pm_ca.h" #include "mali_kbase_hwaccess_instr.h" @@ -136,9 +136,8 @@ struct kbase_hwcnt_backend_jm { * * Return: 0 on success, else error code. 
*/ -static int -kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, - struct kbase_hwcnt_gpu_info *info) +static int kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, + struct kbase_hwcnt_gpu_info *info) { size_t clk; @@ -153,13 +152,11 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, { const struct base_gpu_props *props = &kbdev->gpu_props.props; const size_t l2_count = props->l2_props.num_l2_slices; - const size_t core_mask = - props->coherency_info.group[0].core_mask; + const size_t core_mask = props->coherency_info.group[0].core_mask; info->l2_count = l2_count; info->core_mask = core_mask; - info->prfcnt_values_per_block = - KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK; + info->prfcnt_values_per_block = KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK; } #endif /* CONFIG_MALI_BIFROST_NO_MALI */ @@ -173,9 +170,8 @@ kbasep_hwcnt_backend_jm_gpu_info_init(struct kbase_device *kbdev, return 0; } -static void kbasep_hwcnt_backend_jm_init_layout( - const struct kbase_hwcnt_gpu_info *gpu_info, - struct kbase_hwcnt_jm_physical_layout *phys_layout) +static void kbasep_hwcnt_backend_jm_init_layout(const struct kbase_hwcnt_gpu_info *gpu_info, + struct kbase_hwcnt_jm_physical_layout *phys_layout) { u8 shader_core_cnt; @@ -189,32 +185,29 @@ static void kbasep_hwcnt_backend_jm_init_layout( .tiler_cnt = KBASE_HWCNT_V5_TILER_BLOCK_COUNT, .mmu_l2_cnt = gpu_info->l2_count, .shader_cnt = shader_core_cnt, - .block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + - KBASE_HWCNT_V5_TILER_BLOCK_COUNT + + .block_cnt = KBASE_HWCNT_V5_FE_BLOCK_COUNT + KBASE_HWCNT_V5_TILER_BLOCK_COUNT + gpu_info->l2_count + shader_core_cnt, .shader_avail_mask = gpu_info->core_mask, .headers_per_block = KBASE_HWCNT_V5_HEADERS_PER_BLOCK, .values_per_block = gpu_info->prfcnt_values_per_block, - .counters_per_block = gpu_info->prfcnt_values_per_block - - KBASE_HWCNT_V5_HEADERS_PER_BLOCK, + .counters_per_block = + gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK, .enable_mask_offset = KBASE_HWCNT_V5_PRFCNT_EN_HEADER, }; } -static void kbasep_hwcnt_backend_jm_dump_sample( - const struct kbase_hwcnt_backend_jm *const backend_jm) +static void +kbasep_hwcnt_backend_jm_dump_sample(const struct kbase_hwcnt_backend_jm *const backend_jm) { size_t block_idx; const u32 *new_sample_buf = backend_jm->cpu_dump_va; const u32 *new_block = new_sample_buf; u64 *dst_buf = backend_jm->to_user_buf; u64 *dst_block = dst_buf; - const size_t values_per_block = - backend_jm->phys_layout.values_per_block; + const size_t values_per_block = backend_jm->phys_layout.values_per_block; const size_t dump_bytes = backend_jm->info->dump_bytes; - for (block_idx = 0; block_idx < backend_jm->phys_layout.block_cnt; - block_idx++) { + for (block_idx = 0; block_idx < backend_jm->phys_layout.block_cnt; block_idx++) { size_t ctr_idx; for (ctr_idx = 0; ctr_idx < values_per_block; ctr_idx++) @@ -224,10 +217,8 @@ static void kbasep_hwcnt_backend_jm_dump_sample( dst_block += values_per_block; } - WARN_ON(new_block != - new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); - WARN_ON(dst_block != - dst_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(new_block != new_sample_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); + WARN_ON(dst_block != dst_buf + (dump_bytes / KBASE_HWCNT_VALUE_HW_BYTES)); } /** @@ -237,21 +228,18 @@ static void kbasep_hwcnt_backend_jm_dump_sample( * @clk_index: Clock index * @clk_rate_hz: Clock frequency(hz) */ -static void kbasep_hwcnt_backend_jm_on_freq_change( - struct 
kbase_clk_rate_listener *rate_listener, - u32 clk_index, - u32 clk_rate_hz) +static void kbasep_hwcnt_backend_jm_on_freq_change(struct kbase_clk_rate_listener *rate_listener, + u32 clk_index, u32 clk_rate_hz) { - struct kbase_hwcnt_backend_jm *backend_jm = container_of( - rate_listener, struct kbase_hwcnt_backend_jm, rate_listener); + struct kbase_hwcnt_backend_jm *backend_jm = + container_of(rate_listener, struct kbase_hwcnt_backend_jm, rate_listener); u64 timestamp_ns; if (clk_index != KBASE_CLOCK_DOMAIN_SHADER_CORES) return; timestamp_ns = ktime_get_raw_ns(); - kbase_ccswe_freq_change( - &backend_jm->ccswe_shader_cores, timestamp_ns, clk_rate_hz); + kbase_ccswe_freq_change(&backend_jm->ccswe_shader_cores, timestamp_ns, clk_rate_hz); } /** @@ -261,53 +249,42 @@ static void kbasep_hwcnt_backend_jm_on_freq_change( * @enable_map: Non-NULL pointer to enable map specifying enabled counters. * @timestamp_ns: Timestamp(ns) when HWCNT were enabled. */ -static void kbasep_hwcnt_backend_jm_cc_enable( - struct kbase_hwcnt_backend_jm *backend_jm, - const struct kbase_hwcnt_enable_map *enable_map, - u64 timestamp_ns) +static void kbasep_hwcnt_backend_jm_cc_enable(struct kbase_hwcnt_backend_jm *backend_jm, + const struct kbase_hwcnt_enable_map *enable_map, + u64 timestamp_ns) { struct kbase_device *kbdev = backend_jm->kctx->kbdev; u64 clk_enable_map = enable_map->clk_enable_map; u64 cycle_count; - if (kbase_hwcnt_clk_enable_map_enabled( - clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) { + if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) { /* turn on the cycle counter */ kbase_pm_request_gpu_cycle_counter_l2_is_on(kbdev); /* Read cycle count for top clock domain. */ - kbase_backend_get_gpu_time_norequest( - kbdev, &cycle_count, NULL, NULL); + kbase_backend_get_gpu_time_norequest(kbdev, &cycle_count, NULL, NULL); - backend_jm->prev_cycle_count[KBASE_CLOCK_DOMAIN_TOP] = - cycle_count; + backend_jm->prev_cycle_count[KBASE_CLOCK_DOMAIN_TOP] = cycle_count; } - if (kbase_hwcnt_clk_enable_map_enabled( - clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) { + if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) { /* software estimation for non-top clock domains */ struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm; - const struct kbase_clk_data *clk_data = - rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES]; + const struct kbase_clk_data *clk_data = rtm->clks[KBASE_CLOCK_DOMAIN_SHADER_CORES]; u32 cur_freq; unsigned long flags; spin_lock_irqsave(&rtm->lock, flags); - cur_freq = (u32) clk_data->clock_val; + cur_freq = (u32)clk_data->clock_val; kbase_ccswe_reset(&backend_jm->ccswe_shader_cores); - kbase_ccswe_freq_change( - &backend_jm->ccswe_shader_cores, - timestamp_ns, - cur_freq); + kbase_ccswe_freq_change(&backend_jm->ccswe_shader_cores, timestamp_ns, cur_freq); - kbase_clk_rate_trace_manager_subscribe_no_lock( - rtm, &backend_jm->rate_listener); + kbase_clk_rate_trace_manager_subscribe_no_lock(rtm, &backend_jm->rate_listener); spin_unlock_irqrestore(&rtm->lock, flags); /* ccswe was reset. The estimated cycle is zero. */ - backend_jm->prev_cycle_count[ - KBASE_CLOCK_DOMAIN_SHADER_CORES] = 0; + backend_jm->prev_cycle_count[KBASE_CLOCK_DOMAIN_SHADER_CORES] = 0; } /* Keep clk_enable_map for dump_request. */ @@ -319,28 +296,22 @@ static void kbasep_hwcnt_backend_jm_cc_enable( * * @backend_jm: Non-NULL pointer to backend. 
*/ -static void kbasep_hwcnt_backend_jm_cc_disable( - struct kbase_hwcnt_backend_jm *backend_jm) +static void kbasep_hwcnt_backend_jm_cc_disable(struct kbase_hwcnt_backend_jm *backend_jm) { struct kbase_device *kbdev = backend_jm->kctx->kbdev; struct kbase_clk_rate_trace_manager *rtm = &kbdev->pm.clk_rtm; u64 clk_enable_map = backend_jm->clk_enable_map; - if (kbase_hwcnt_clk_enable_map_enabled( - clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) { + if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_TOP)) { /* turn off the cycle counter */ kbase_pm_release_gpu_cycle_counter(kbdev); } - if (kbase_hwcnt_clk_enable_map_enabled( - clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) { - - kbase_clk_rate_trace_manager_unsubscribe( - rtm, &backend_jm->rate_listener); + if (kbase_hwcnt_clk_enable_map_enabled(clk_enable_map, KBASE_CLOCK_DOMAIN_SHADER_CORES)) { + kbase_clk_rate_trace_manager_unsubscribe(rtm, &backend_jm->rate_listener); } } - /** * kbasep_hwcnt_gpu_update_curr_config() - Update the destination buffer with * current config information. @@ -356,38 +327,33 @@ static void kbasep_hwcnt_backend_jm_cc_disable( * * Return: 0 on success, else error code. */ -static int kbasep_hwcnt_gpu_update_curr_config( - struct kbase_device *kbdev, - struct kbase_hwcnt_curr_config *curr_config) +static int kbasep_hwcnt_gpu_update_curr_config(struct kbase_device *kbdev, + struct kbase_hwcnt_curr_config *curr_config) { if (WARN_ON(!kbdev) || WARN_ON(!curr_config)) return -EINVAL; lockdep_assert_held(&kbdev->hwaccess_lock); - curr_config->num_l2_slices = - kbdev->gpu_props.curr_config.l2_slices; - curr_config->shader_present = - kbdev->gpu_props.curr_config.shader_present; + curr_config->num_l2_slices = kbdev->gpu_props.curr_config.l2_slices; + curr_config->shader_present = kbdev->gpu_props.curr_config.shader_present; return 0; } /* JM backend implementation of kbase_hwcnt_backend_timestamp_ns_fn */ -static u64 kbasep_hwcnt_backend_jm_timestamp_ns( - struct kbase_hwcnt_backend *backend) +static u64 kbasep_hwcnt_backend_jm_timestamp_ns(struct kbase_hwcnt_backend *backend) { (void)backend; return ktime_get_raw_ns(); } /* JM backend implementation of kbase_hwcnt_backend_dump_enable_nolock_fn */ -static int kbasep_hwcnt_backend_jm_dump_enable_nolock( - struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map) +static int +kbasep_hwcnt_backend_jm_dump_enable_nolock(struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map) { int errcode; - struct kbase_hwcnt_backend_jm *backend_jm = - (struct kbase_hwcnt_backend_jm *)backend; + struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; struct kbase_context *kctx; struct kbase_device *kbdev; struct kbase_hwcnt_physical_enable_map phys_enable_map; @@ -406,8 +372,7 @@ static int kbasep_hwcnt_backend_jm_dump_enable_nolock( kbase_hwcnt_gpu_enable_map_to_physical(&phys_enable_map, enable_map); - kbase_hwcnt_gpu_set_to_physical(&phys_counter_set, - backend_jm->info->counter_set); + kbase_hwcnt_gpu_set_to_physical(&phys_counter_set, backend_jm->info->counter_set); enable.fe_bm = phys_enable_map.fe_bm; enable.shader_bm = phys_enable_map.shader_bm; @@ -425,8 +390,7 @@ static int kbasep_hwcnt_backend_jm_dump_enable_nolock( timestamp_ns = kbasep_hwcnt_backend_jm_timestamp_ns(backend); /* Update the current configuration information. 
*/ - errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, - &backend_jm->curr_config); + errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, &backend_jm->curr_config); if (errcode) goto error; @@ -446,14 +410,12 @@ error: } /* JM backend implementation of kbase_hwcnt_backend_dump_enable_fn */ -static int kbasep_hwcnt_backend_jm_dump_enable( - struct kbase_hwcnt_backend *backend, - const struct kbase_hwcnt_enable_map *enable_map) +static int kbasep_hwcnt_backend_jm_dump_enable(struct kbase_hwcnt_backend *backend, + const struct kbase_hwcnt_enable_map *enable_map) { unsigned long flags; int errcode; - struct kbase_hwcnt_backend_jm *backend_jm = - (struct kbase_hwcnt_backend_jm *)backend; + struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; struct kbase_device *kbdev; if (!backend_jm) @@ -463,8 +425,7 @@ static int kbasep_hwcnt_backend_jm_dump_enable( spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - errcode = kbasep_hwcnt_backend_jm_dump_enable_nolock( - backend, enable_map); + errcode = kbasep_hwcnt_backend_jm_dump_enable_nolock(backend, enable_map); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); @@ -472,12 +433,10 @@ static int kbasep_hwcnt_backend_jm_dump_enable( } /* JM backend implementation of kbase_hwcnt_backend_dump_disable_fn */ -static void kbasep_hwcnt_backend_jm_dump_disable( - struct kbase_hwcnt_backend *backend) +static void kbasep_hwcnt_backend_jm_dump_disable(struct kbase_hwcnt_backend *backend) { int errcode; - struct kbase_hwcnt_backend_jm *backend_jm = - (struct kbase_hwcnt_backend_jm *)backend; + struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; if (WARN_ON(!backend_jm) || !backend_jm->enabled) return; @@ -491,11 +450,9 @@ static void kbasep_hwcnt_backend_jm_dump_disable( } /* JM backend implementation of kbase_hwcnt_backend_dump_clear_fn */ -static int kbasep_hwcnt_backend_jm_dump_clear( - struct kbase_hwcnt_backend *backend) +static int kbasep_hwcnt_backend_jm_dump_clear(struct kbase_hwcnt_backend *backend) { - struct kbase_hwcnt_backend_jm *backend_jm = - (struct kbase_hwcnt_backend_jm *)backend; + struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; if (!backend_jm || !backend_jm->enabled) return -EINVAL; @@ -504,12 +461,10 @@ static int kbasep_hwcnt_backend_jm_dump_clear( } /* JM backend implementation of kbase_hwcnt_backend_dump_request_fn */ -static int kbasep_hwcnt_backend_jm_dump_request( - struct kbase_hwcnt_backend *backend, - u64 *dump_time_ns) +static int kbasep_hwcnt_backend_jm_dump_request(struct kbase_hwcnt_backend *backend, + u64 *dump_time_ns) { - struct kbase_hwcnt_backend_jm *backend_jm = - (struct kbase_hwcnt_backend_jm *)backend; + struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; struct kbase_device *kbdev; const struct kbase_hwcnt_metadata *metadata; u64 current_cycle_count; @@ -528,28 +483,25 @@ static int kbasep_hwcnt_backend_jm_dump_request( *dump_time_ns = kbasep_hwcnt_backend_jm_timestamp_ns(backend); ret = kbase_instr_hwcnt_request_dump(backend_jm->kctx); - kbase_hwcnt_metadata_for_each_clock(metadata, clk) { - if (!kbase_hwcnt_clk_enable_map_enabled( - backend_jm->clk_enable_map, clk)) + kbase_hwcnt_metadata_for_each_clock(metadata, clk) + { + if (!kbase_hwcnt_clk_enable_map_enabled(backend_jm->clk_enable_map, clk)) continue; if (clk == KBASE_CLOCK_DOMAIN_TOP) { /* Read cycle count for top clock domain. 
*/ - kbase_backend_get_gpu_time_norequest( - kbdev, &current_cycle_count, - NULL, NULL); + kbase_backend_get_gpu_time_norequest(kbdev, &current_cycle_count, + NULL, NULL); } else { /* * Estimate cycle count for non-top clock * domain. */ current_cycle_count = kbase_ccswe_cycle_at( - &backend_jm->ccswe_shader_cores, - *dump_time_ns); + &backend_jm->ccswe_shader_cores, *dump_time_ns); } backend_jm->cycle_count_elapsed[clk] = - current_cycle_count - - backend_jm->prev_cycle_count[clk]; /* * Keep the current cycle count for later calculation. @@ -563,11 +515,9 @@ static int kbasep_hwcnt_backend_jm_dump_request( } /* JM backend implementation of kbase_hwcnt_backend_dump_wait_fn */ -static int kbasep_hwcnt_backend_jm_dump_wait( - struct kbase_hwcnt_backend *backend) +static int kbasep_hwcnt_backend_jm_dump_wait(struct kbase_hwcnt_backend *backend) { - struct kbase_hwcnt_backend_jm *backend_jm = - (struct kbase_hwcnt_backend_jm *)backend; + struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; if (!backend_jm || !backend_jm->enabled) return -EINVAL; @@ -576,14 +526,12 @@ static int kbasep_hwcnt_backend_jm_dump_wait( } /* JM backend implementation of kbase_hwcnt_backend_dump_get_fn */ -static int kbasep_hwcnt_backend_jm_dump_get( - struct kbase_hwcnt_backend *backend, - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_enable_map *dst_enable_map, - bool accumulate) +static int kbasep_hwcnt_backend_jm_dump_get(struct kbase_hwcnt_backend *backend, + struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map, + bool accumulate) { - struct kbase_hwcnt_backend_jm *backend_jm = - (struct kbase_hwcnt_backend_jm *)backend; + struct kbase_hwcnt_backend_jm *backend_jm = (struct kbase_hwcnt_backend_jm *)backend; size_t clk; #if IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) struct kbase_device *kbdev; @@ -597,16 +545,15 @@ static int kbasep_hwcnt_backend_jm_dump_get( return -EINVAL; /* Invalidate the kernel buffer before reading from it. */ - kbase_sync_mem_regions( - backend_jm->kctx, backend_jm->vmap, KBASE_SYNC_TO_CPU); + kbase_sync_mem_regions(backend_jm->kctx, backend_jm->vmap, KBASE_SYNC_TO_CPU); /* Dump sample to the internal 64-bit user buffer. */ kbasep_hwcnt_backend_jm_dump_sample(backend_jm); /* Extract elapsed cycle count for each clock domain if enabled. */ - kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) { - if (!kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk)) + kbase_hwcnt_metadata_for_each_clock(dst_enable_map->metadata, clk) + { + if (!kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk)) continue; /* Reset the counter to zero if accumulation is off. */ @@ -621,17 +568,16 @@ static int kbasep_hwcnt_backend_jm_dump_get( spin_lock_irqsave(&kbdev->hwaccess_lock, flags); /* Update the current configuration information.
*/ - errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, - &backend_jm->curr_config); + errcode = kbasep_hwcnt_gpu_update_curr_config(kbdev, &backend_jm->curr_config); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); if (errcode) return errcode; #endif /* CONFIG_MALI_BIFROST_NO_MALI */ - return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf, - dst_enable_map, backend_jm->pm_core_mask, - &backend_jm->curr_config, accumulate); + return kbase_hwcnt_jm_dump_get(dst, backend_jm->to_user_buf, dst_enable_map, + backend_jm->pm_core_mask, &backend_jm->curr_config, + accumulate); } /** @@ -643,10 +589,8 @@ static int kbasep_hwcnt_backend_jm_dump_get( * * Return: 0 on success, else error code. */ -static int kbasep_hwcnt_backend_jm_dump_alloc( - const struct kbase_hwcnt_backend_jm_info *info, - struct kbase_context *kctx, - u64 *gpu_dump_va) +static int kbasep_hwcnt_backend_jm_dump_alloc(const struct kbase_hwcnt_backend_jm_info *info, + struct kbase_context *kctx, u64 *gpu_dump_va) { struct kbase_va_region *reg; u64 flags; @@ -661,16 +605,12 @@ static int kbasep_hwcnt_backend_jm_dump_alloc( WARN_ON(!kctx); WARN_ON(!gpu_dump_va); - flags = BASE_MEM_PROT_CPU_RD | - BASE_MEM_PROT_GPU_WR | - BASEP_MEM_PERMANENT_KERNEL_MAPPING | - BASE_MEM_CACHED_CPU | - BASE_MEM_UNCACHED_GPU; + flags = BASE_MEM_PROT_CPU_RD | BASE_MEM_PROT_GPU_WR | BASEP_MEM_PERMANENT_KERNEL_MAPPING | + BASE_MEM_CACHED_CPU | BASE_MEM_UNCACHED_GPU; nr_pages = PFN_UP(info->dump_bytes); - reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va, - mmu_sync_info); + reg = kbase_mem_alloc(kctx, nr_pages, nr_pages, 0, &flags, gpu_dump_va, mmu_sync_info); if (!reg) return -ENOMEM; @@ -683,9 +623,7 @@ static int kbasep_hwcnt_backend_jm_dump_alloc( * @kctx: Non-NULL pointer to kbase context. * @gpu_dump_va: GPU dump buffer virtual address. */ -static void kbasep_hwcnt_backend_jm_dump_free( - struct kbase_context *kctx, - u64 gpu_dump_va) +static void kbasep_hwcnt_backend_jm_dump_free(struct kbase_context *kctx, u64 gpu_dump_va) { WARN_ON(!kctx); if (gpu_dump_va) @@ -698,8 +636,7 @@ static void kbasep_hwcnt_backend_jm_dump_free( * * Can be safely called on a backend in any state of partial construction. */ -static void kbasep_hwcnt_backend_jm_destroy( - struct kbase_hwcnt_backend_jm *backend) +static void kbasep_hwcnt_backend_jm_destroy(struct kbase_hwcnt_backend_jm *backend) { if (!backend) return; @@ -712,8 +649,7 @@ static void kbasep_hwcnt_backend_jm_destroy( kbase_phy_alloc_mapping_put(kctx, backend->vmap); if (backend->gpu_dump_va) - kbasep_hwcnt_backend_jm_dump_free( - kctx, backend->gpu_dump_va); + kbasep_hwcnt_backend_jm_dump_free(kctx, backend->gpu_dump_va); kbasep_js_release_privileged_ctx(kbdev, kctx); kbase_destroy_context(kctx); @@ -731,9 +667,8 @@ static void kbasep_hwcnt_backend_jm_destroy( * * Return: 0 on success, else error code. 
*/ -static int kbasep_hwcnt_backend_jm_create( - const struct kbase_hwcnt_backend_jm_info *info, - struct kbase_hwcnt_backend_jm **out_backend) +static int kbasep_hwcnt_backend_jm_create(const struct kbase_hwcnt_backend_jm_info *info, + struct kbase_hwcnt_backend_jm **out_backend) { int errcode; struct kbase_device *kbdev; @@ -749,28 +684,25 @@ static int kbasep_hwcnt_backend_jm_create( goto alloc_error; backend->info = info; - kbasep_hwcnt_backend_jm_init_layout(&info->hwcnt_gpu_info, - &backend->phys_layout); + kbasep_hwcnt_backend_jm_init_layout(&info->hwcnt_gpu_info, &backend->phys_layout); backend->kctx = kbase_create_context(kbdev, true, - BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED, 0, NULL); + BASE_CONTEXT_SYSTEM_MONITOR_SUBMIT_DISABLED, 0, NULL); if (!backend->kctx) goto alloc_error; kbasep_js_schedule_privileged_ctx(kbdev, backend->kctx); - errcode = kbasep_hwcnt_backend_jm_dump_alloc( - info, backend->kctx, &backend->gpu_dump_va); + errcode = kbasep_hwcnt_backend_jm_dump_alloc(info, backend->kctx, &backend->gpu_dump_va); if (errcode) goto error; - backend->cpu_dump_va = kbase_phy_alloc_mapping_get(backend->kctx, - backend->gpu_dump_va, &backend->vmap); + backend->cpu_dump_va = + kbase_phy_alloc_mapping_get(backend->kctx, backend->gpu_dump_va, &backend->vmap); if (!backend->cpu_dump_va || !backend->vmap) goto alloc_error; - backend->to_user_buf = - kzalloc(info->metadata->dump_buf_bytes, GFP_KERNEL); + backend->to_user_buf = kzalloc(info->metadata->dump_buf_bytes, GFP_KERNEL); if (!backend->to_user_buf) goto alloc_error; @@ -798,9 +730,8 @@ kbasep_hwcnt_backend_jm_metadata(const struct kbase_hwcnt_backend_info *info) } /* JM backend implementation of kbase_hwcnt_backend_init_fn */ -static int kbasep_hwcnt_backend_jm_init( - const struct kbase_hwcnt_backend_info *info, - struct kbase_hwcnt_backend **out_backend) +static int kbasep_hwcnt_backend_jm_init(const struct kbase_hwcnt_backend_info *info, + struct kbase_hwcnt_backend **out_backend) { int errcode; struct kbase_hwcnt_backend_jm *backend = NULL; @@ -808,8 +739,8 @@ static int kbasep_hwcnt_backend_jm_init( if (!info || !out_backend) return -EINVAL; - errcode = kbasep_hwcnt_backend_jm_create( - (const struct kbase_hwcnt_backend_jm_info *) info, &backend); + errcode = kbasep_hwcnt_backend_jm_create((const struct kbase_hwcnt_backend_jm_info *)info, + &backend); if (errcode) return errcode; @@ -825,8 +756,7 @@ static void kbasep_hwcnt_backend_jm_term(struct kbase_hwcnt_backend *backend) return; kbasep_hwcnt_backend_jm_dump_disable(backend); - kbasep_hwcnt_backend_jm_destroy( - (struct kbase_hwcnt_backend_jm *)backend); + kbasep_hwcnt_backend_jm_destroy((struct kbase_hwcnt_backend_jm *)backend); } /** @@ -835,8 +765,7 @@ static void kbasep_hwcnt_backend_jm_term(struct kbase_hwcnt_backend *backend) * * Can be safely called on a backend info in any state of partial construction. */ -static void kbasep_hwcnt_backend_jm_info_destroy( - const struct kbase_hwcnt_backend_jm_info *info) +static void kbasep_hwcnt_backend_jm_info_destroy(const struct kbase_hwcnt_backend_jm_info *info) { if (!info) return; @@ -852,9 +781,8 @@ static void kbasep_hwcnt_backend_jm_info_destroy( * * Return: 0 on success, else error code. 
*/ -static int kbasep_hwcnt_backend_jm_info_create( - struct kbase_device *kbdev, - const struct kbase_hwcnt_backend_jm_info **out_info) +static int kbasep_hwcnt_backend_jm_info_create(struct kbase_device *kbdev, + const struct kbase_hwcnt_backend_jm_info **out_info) { int errcode = -ENOMEM; struct kbase_hwcnt_backend_jm_info *info = NULL; @@ -877,15 +805,12 @@ static int kbasep_hwcnt_backend_jm_info_create( info->counter_set = KBASE_HWCNT_SET_PRIMARY; #endif - errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, - &info->hwcnt_gpu_info); + errcode = kbasep_hwcnt_backend_jm_gpu_info_init(kbdev, &info->hwcnt_gpu_info); if (errcode) goto error; - errcode = kbase_hwcnt_jm_metadata_create(&info->hwcnt_gpu_info, - info->counter_set, - &info->metadata, - &info->dump_bytes); + errcode = kbase_hwcnt_jm_metadata_create(&info->hwcnt_gpu_info, info->counter_set, + &info->metadata, &info->dump_bytes); if (errcode) goto error; @@ -897,9 +822,8 @@ error: return errcode; } -int kbase_hwcnt_backend_jm_create( - struct kbase_device *kbdev, - struct kbase_hwcnt_backend_interface *iface) +int kbase_hwcnt_backend_jm_create(struct kbase_device *kbdev, + struct kbase_hwcnt_backend_interface *iface) { int errcode; const struct kbase_hwcnt_backend_jm_info *info = NULL; @@ -928,8 +852,7 @@ int kbase_hwcnt_backend_jm_create( return 0; } -void kbase_hwcnt_backend_jm_destroy( - struct kbase_hwcnt_backend_interface *iface) +void kbase_hwcnt_backend_jm_destroy(struct kbase_hwcnt_backend_interface *iface) { if (!iface) return; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm.h b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.h similarity index 84% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm.h rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.h index 1bc39066b414..4a6293c25473 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm.h +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -27,7 +27,7 @@ #ifndef _KBASE_HWCNT_BACKEND_JM_H_ #define _KBASE_HWCNT_BACKEND_JM_H_ -#include "mali_kbase_hwcnt_backend.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend.h" struct kbase_device; @@ -42,9 +42,8 @@ struct kbase_device; * * Return: 0 on success, else error code. */ -int kbase_hwcnt_backend_jm_create( - struct kbase_device *kbdev, - struct kbase_hwcnt_backend_interface *iface); +int kbase_hwcnt_backend_jm_create(struct kbase_device *kbdev, + struct kbase_hwcnt_backend_interface *iface); /** * kbase_hwcnt_backend_jm_destroy() - Destroy a JM hardware counter backend @@ -54,7 +53,6 @@ int kbase_hwcnt_backend_jm_create( * Can be safely called on an all-zeroed interface, or on an already destroyed * interface. 
*/ -void kbase_hwcnt_backend_jm_destroy( - struct kbase_hwcnt_backend_interface *iface); +void kbase_hwcnt_backend_jm_destroy(struct kbase_hwcnt_backend_interface *iface); #endif /* _KBASE_HWCNT_BACKEND_JM_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm_watchdog.c b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c similarity index 97% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm_watchdog.c rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c index 8bb7ccb49a64..564700b2d978 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm_watchdog.c +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.c @@ -21,11 +21,12 @@ #include -#include -#include +#include +#include -#include -#include +#include +#include +#include #if IS_ENABLED(CONFIG_MALI_IS_FPGA) && !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) /* Backend watch dog timer interval in milliseconds: 18 seconds. */ @@ -118,8 +119,7 @@ enum backend_watchdog_state { */ enum wd_init_state { HWCNT_JM_WD_INIT_START, - HWCNT_JM_WD_INIT_ALLOC = HWCNT_JM_WD_INIT_START, - HWCNT_JM_WD_INIT_BACKEND, + HWCNT_JM_WD_INIT_BACKEND = HWCNT_JM_WD_INIT_START, HWCNT_JM_WD_INIT_ENABLE_MAP, HWCNT_JM_WD_INIT_DUMP_BUFFER, HWCNT_JM_WD_INIT_END @@ -296,16 +296,10 @@ kbasep_hwcnt_backend_jm_watchdog_term_partial(struct kbase_hwcnt_backend_jm_watc if (!wd_backend) return; - /* disable timer thread to avoid concurrent access to shared resources */ - wd_backend->info->dump_watchdog_iface->disable( - wd_backend->info->dump_watchdog_iface->timer); + WARN_ON(state > HWCNT_JM_WD_INIT_END); - /*will exit the loop when state reaches HWCNT_JM_WD_INIT_START*/ while (state-- > HWCNT_JM_WD_INIT_START) { switch (state) { - case HWCNT_JM_WD_INIT_ALLOC: - kfree(wd_backend); - break; case HWCNT_JM_WD_INIT_BACKEND: wd_backend->info->jm_backend_iface->term(wd_backend->jm_backend); break; @@ -319,6 +313,8 @@ kbasep_hwcnt_backend_jm_watchdog_term_partial(struct kbase_hwcnt_backend_jm_watc break; } } + + kfree(wd_backend); } /* Job manager watchdog backend, implementation of kbase_hwcnt_backend_term_fn @@ -326,11 +322,17 @@ kbasep_hwcnt_backend_jm_watchdog_term_partial(struct kbase_hwcnt_backend_jm_watc */ static void kbasep_hwcnt_backend_jm_watchdog_term(struct kbase_hwcnt_backend *backend) { + struct kbase_hwcnt_backend_jm_watchdog *wd_backend = + (struct kbase_hwcnt_backend_jm_watchdog *)backend; + if (!backend) return; - kbasep_hwcnt_backend_jm_watchdog_term_partial( - (struct kbase_hwcnt_backend_jm_watchdog *)backend, HWCNT_JM_WD_INIT_END); + /* disable timer thread to avoid concurrent access to shared resources */ + wd_backend->info->dump_watchdog_iface->disable( + wd_backend->info->dump_watchdog_iface->timer); + + kbasep_hwcnt_backend_jm_watchdog_term_partial(wd_backend, HWCNT_JM_WD_INIT_END); } /* Job manager watchdog backend, implementation of kbase_hwcnt_backend_init_fn */ @@ -350,20 +352,20 @@ static int kbasep_hwcnt_backend_jm_watchdog_init(const struct kbase_hwcnt_backen jm_info = wd_info->jm_backend_iface->info; metadata = wd_info->jm_backend_iface->metadata(wd_info->jm_backend_iface->info); + wd_backend = kmalloc(sizeof(*wd_backend), GFP_KERNEL); + if (!wd_backend) { + *out_backend = NULL; + return -ENOMEM; + } + + *wd_backend = (struct kbase_hwcnt_backend_jm_watchdog){ + .info = wd_info, + .timeout_ms = hwcnt_backend_watchdog_timer_interval_ms, + .locked = { .state = HWCNT_JM_WD_IDLE_BUFFER_EMPTY, .is_enabled = false } + }; + while (state < 
HWCNT_JM_WD_INIT_END && !errcode) { switch (state) { - case HWCNT_JM_WD_INIT_ALLOC: - wd_backend = kmalloc(sizeof(*wd_backend), GFP_KERNEL); - if (wd_backend) { - *wd_backend = (struct kbase_hwcnt_backend_jm_watchdog){ - .info = wd_info, - .timeout_ms = hwcnt_backend_watchdog_timer_interval_ms, - .locked = { .state = HWCNT_JM_WD_IDLE_BUFFER_EMPTY, - .is_enabled = false } - }; - } else - errcode = -ENOMEM; - break; case HWCNT_JM_WD_INIT_BACKEND: errcode = wd_info->jm_backend_iface->init(jm_info, &wd_backend->jm_backend); break; @@ -823,5 +825,5 @@ void kbase_hwcnt_backend_jm_watchdog_destroy(struct kbase_hwcnt_backend_interfac kfree((struct kbase_hwcnt_backend_jm_watchdog_info *)iface->info); /*blanking the watchdog backend interface*/ - *iface = (struct kbase_hwcnt_backend_interface){ NULL }; + memset(iface, 0, sizeof(*iface)); } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm_watchdog.h b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.h similarity index 94% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm_watchdog.h rename to drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.h index 5021b4fdb966..02a7952cced2 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_backend_jm_watchdog.h +++ b/drivers/gpu/arm/bifrost/hwcnt/backend/mali_kbase_hwcnt_backend_jm_watchdog.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -32,8 +32,8 @@ #ifndef _KBASE_HWCNT_BACKEND_JM_WATCHDOG_H_ #define _KBASE_HWCNT_BACKEND_JM_WATCHDOG_H_ -#include -#include +#include +#include /** * kbase_hwcnt_backend_jm_watchdog_create() - Create a job manager hardware counter watchdog diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt.c b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt.c similarity index 87% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt.c rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt.c index a54f005915ae..e724572560d5 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt.c +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt.c @@ -23,10 +23,10 @@ * Implementation of hardware counter context and accumulator APIs. */ -#include "mali_kbase_hwcnt_context.h" -#include "mali_kbase_hwcnt_accumulator.h" -#include "mali_kbase_hwcnt_backend.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_context.h" +#include "hwcnt/mali_kbase_hwcnt_accumulator.h" +#include "hwcnt/backend/mali_kbase_hwcnt_backend.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include #include @@ -39,11 +39,7 @@ * @ACCUM_STATE_ENABLED: Enabled state, where dumping is enabled if there are * any enabled counters. */ -enum kbase_hwcnt_accum_state { - ACCUM_STATE_ERROR, - ACCUM_STATE_DISABLED, - ACCUM_STATE_ENABLED -}; +enum kbase_hwcnt_accum_state { ACCUM_STATE_ERROR, ACCUM_STATE_DISABLED, ACCUM_STATE_ENABLED }; /** * struct kbase_hwcnt_accumulator - Hardware counter accumulator structure. 
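Note on the watchdog backend hunks above: the refactor pulls the wd_backend allocation out of the init state machine and the kfree() out of the partial-teardown switch, so each enum state now tracks only a resource owned by one init stage and the enclosing struct is freed unconditionally at the end of the unwind. A minimal sketch of that staged init/unwind pattern follows; struct wd, init_backend(), term_backend(), alloc_map() and free_map() are hypothetical placeholders, not the driver's actual API.

        #include <linux/slab.h>

        struct backend;                          /* hypothetical sub-object owned by INIT_BACKEND */
        struct map { void *buf; };               /* hypothetical enable map owned by INIT_MAP */
        struct wd { struct backend *backend; struct map map; };

        int init_backend(struct backend **out);  /* hypothetical helpers */
        void term_backend(struct backend *b);
        int alloc_map(struct map *m);
        void free_map(struct map *m);

        enum init_state { INIT_START, INIT_BACKEND = INIT_START, INIT_MAP, INIT_END };

        /* Undo every stage below 'state', then free the struct itself. */
        static void term_partial(struct wd *wd, enum init_state state)
        {
                if (!wd)
                        return;

                while (state-- > INIT_START) {
                        switch (state) {
                        case INIT_BACKEND:
                                term_backend(wd->backend);  /* undo INIT_BACKEND */
                                break;
                        case INIT_MAP:
                                free_map(&wd->map);         /* undo INIT_MAP */
                                break;
                        default:
                                break;
                        }
                }

                kfree(wd);  /* freed outside the switch, matching the refactor above */
        }

        static int wd_init(struct wd **out)
        {
                enum init_state state = INIT_START;
                int err = 0;
                struct wd *wd = kmalloc(sizeof(*wd), GFP_KERNEL);  /* allocated before the loop */

                if (!wd) {
                        *out = NULL;
                        return -ENOMEM;
                }

                while (state < INIT_END && !err) {
                        switch (state) {
                        case INIT_BACKEND:
                                err = init_backend(&wd->backend);
                                break;
                        case INIT_MAP:
                                err = alloc_map(&wd->map);
                                break;
                        default:
                                break;
                        }
                        if (!err)
                                state++;
                }

                if (err) {
                        /* 'state' is the stage that failed; only earlier stages are unwound. */
                        term_partial(wd, state);
                        *out = NULL;
                        return err;
                }

                *out = wd;
                return 0;
        }

On full teardown the same unwind loop is reused with term_partial(wd, INIT_END), which mirrors how the watchdog term path above first disables the timer and then calls the partial-teardown helper with HWCNT_JM_WD_INIT_END.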
@@ -130,9 +126,8 @@ struct kbase_hwcnt_context { struct workqueue_struct *wq; }; -int kbase_hwcnt_context_init( - const struct kbase_hwcnt_backend_interface *iface, - struct kbase_hwcnt_context **out_hctx) +int kbase_hwcnt_context_init(const struct kbase_hwcnt_backend_interface *iface, + struct kbase_hwcnt_context **out_hctx) { struct kbase_hwcnt_context *hctx = NULL; @@ -149,8 +144,7 @@ int kbase_hwcnt_context_init( mutex_init(&hctx->accum_lock); hctx->accum_inited = false; - hctx->wq = - alloc_workqueue("mali_kbase_hwcnt", WQ_HIGHPRI | WQ_UNBOUND, 0); + hctx->wq = alloc_workqueue("mali_kbase_hwcnt", WQ_HIGHPRI | WQ_UNBOUND, 0); if (!hctx->wq) goto err_alloc_workqueue; @@ -208,35 +202,30 @@ static int kbasep_hwcnt_accumulator_init(struct kbase_hwcnt_context *hctx) WARN_ON(!hctx); WARN_ON(!hctx->accum_inited); - errcode = hctx->iface->init( - hctx->iface->info, &hctx->accum.backend); + errcode = hctx->iface->init(hctx->iface->info, &hctx->accum.backend); if (errcode) goto error; hctx->accum.metadata = hctx->iface->metadata(hctx->iface->info); hctx->accum.state = ACCUM_STATE_ERROR; - errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata, - &hctx->accum.enable_map); + errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata, &hctx->accum.enable_map); if (errcode) goto error; hctx->accum.enable_map_any_enabled = false; - errcode = kbase_hwcnt_dump_buffer_alloc(hctx->accum.metadata, - &hctx->accum.accum_buf); + errcode = kbase_hwcnt_dump_buffer_alloc(hctx->accum.metadata, &hctx->accum.accum_buf); if (errcode) goto error; - errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata, - &hctx->accum.scratch_map); + errcode = kbase_hwcnt_enable_map_alloc(hctx->accum.metadata, &hctx->accum.scratch_map); if (errcode) goto error; hctx->accum.accumulated = false; - hctx->accum.ts_last_dump_ns = - hctx->iface->timestamp_ns(hctx->accum.backend); + hctx->accum.ts_last_dump_ns = hctx->iface->timestamp_ns(hctx->accum.backend); return 0; @@ -252,8 +241,7 @@ error: * @hctx: Non-NULL pointer to hardware counter context. * @accumulate: True if we should accumulate before disabling, else false. 
*/ -static void kbasep_hwcnt_accumulator_disable( - struct kbase_hwcnt_context *hctx, bool accumulate) +static void kbasep_hwcnt_accumulator_disable(struct kbase_hwcnt_context *hctx, bool accumulate) { int errcode = 0; bool backend_enabled = false; @@ -272,8 +260,7 @@ static void kbasep_hwcnt_accumulator_disable( WARN_ON(hctx->disable_count != 0); WARN_ON(hctx->accum.state == ACCUM_STATE_DISABLED); - if ((hctx->accum.state == ACCUM_STATE_ENABLED) && - (accum->enable_map_any_enabled)) + if ((hctx->accum.state == ACCUM_STATE_ENABLED) && (accum->enable_map_any_enabled)) backend_enabled = true; if (!backend_enabled) @@ -297,8 +284,8 @@ static void kbasep_hwcnt_accumulator_disable( if (errcode) goto disable; - errcode = hctx->iface->dump_get(accum->backend, - &accum->accum_buf, &accum->enable_map, accum->accumulated); + errcode = hctx->iface->dump_get(accum->backend, &accum->accum_buf, &accum->enable_map, + accum->accumulated); if (errcode) goto disable; @@ -336,8 +323,7 @@ static void kbasep_hwcnt_accumulator_enable(struct kbase_hwcnt_context *hctx) /* The backend only needs enabling if any counters are enabled */ if (accum->enable_map_any_enabled) - errcode = hctx->iface->dump_enable_nolock( - accum->backend, &accum->enable_map); + errcode = hctx->iface->dump_enable_nolock(accum->backend, &accum->enable_map); if (!errcode) accum->state = ACCUM_STATE_ENABLED; @@ -364,12 +350,9 @@ static void kbasep_hwcnt_accumulator_enable(struct kbase_hwcnt_context *hctx) * * Return: 0 on success, else error code. */ -static int kbasep_hwcnt_accumulator_dump( - struct kbase_hwcnt_context *hctx, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf, - const struct kbase_hwcnt_enable_map *new_map) +static int kbasep_hwcnt_accumulator_dump(struct kbase_hwcnt_context *hctx, u64 *ts_start_ns, + u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf, + const struct kbase_hwcnt_enable_map *new_map) { int errcode = 0; unsigned long flags; @@ -398,8 +381,7 @@ static int kbasep_hwcnt_accumulator_dump( kbase_hwcnt_enable_map_copy(cur_map, &accum->enable_map); if (new_map) - new_map_any_enabled = - kbase_hwcnt_enable_map_any_enabled(new_map); + new_map_any_enabled = kbase_hwcnt_enable_map_any_enabled(new_map); /* * We're holding accum_lock, so the accumulator state might transition @@ -426,8 +408,7 @@ static int kbasep_hwcnt_accumulator_dump( * then we'll do it ourselves after the dump. */ if (new_map) { - kbase_hwcnt_enable_map_copy( - &accum->enable_map, new_map); + kbase_hwcnt_enable_map_copy(&accum->enable_map, new_map); accum->enable_map_any_enabled = new_map_any_enabled; } @@ -440,12 +421,10 @@ static int kbasep_hwcnt_accumulator_dump( /* Initiate the dump if the backend is enabled. 
*/ if ((state == ACCUM_STATE_ENABLED) && cur_map_any_enabled) { if (dump_buf) { - errcode = hctx->iface->dump_request( - accum->backend, &dump_time_ns); + errcode = hctx->iface->dump_request(accum->backend, &dump_time_ns); dump_requested = true; } else { - dump_time_ns = hctx->iface->timestamp_ns( - accum->backend); + dump_time_ns = hctx->iface->timestamp_ns(accum->backend); errcode = hctx->iface->dump_clear(accum->backend); } @@ -457,8 +436,7 @@ static int kbasep_hwcnt_accumulator_dump( /* Copy any accumulation into the dest buffer */ if (accum->accumulated && dump_buf) { - kbase_hwcnt_dump_buffer_copy( - dump_buf, &accum->accum_buf, cur_map); + kbase_hwcnt_dump_buffer_copy(dump_buf, &accum->accum_buf, cur_map); dump_written = true; } @@ -483,8 +461,7 @@ static int kbasep_hwcnt_accumulator_dump( * we're already enabled and holding accum_lock is impossible. */ if (new_map_any_enabled) { - errcode = hctx->iface->dump_enable( - accum->backend, new_map); + errcode = hctx->iface->dump_enable(accum->backend, new_map); if (errcode) goto error; } @@ -495,11 +472,8 @@ static int kbasep_hwcnt_accumulator_dump( /* If we dumped, copy or accumulate it into the destination */ if (dump_requested) { WARN_ON(state != ACCUM_STATE_ENABLED); - errcode = hctx->iface->dump_get( - accum->backend, - dump_buf, - cur_map, - dump_written); + errcode = hctx->iface->dump_get(accum->backend, dump_buf, cur_map, + dump_written); if (errcode) goto error; dump_written = true; @@ -540,8 +514,7 @@ error: * @hctx: Non-NULL pointer to hardware counter context. * @accumulate: True if we should accumulate before disabling, else false. */ -static void kbasep_hwcnt_context_disable( - struct kbase_hwcnt_context *hctx, bool accumulate) +static void kbasep_hwcnt_context_disable(struct kbase_hwcnt_context *hctx, bool accumulate) { unsigned long flags; @@ -563,9 +536,8 @@ static void kbasep_hwcnt_context_disable( } } -int kbase_hwcnt_accumulator_acquire( - struct kbase_hwcnt_context *hctx, - struct kbase_hwcnt_accumulator **accum) +int kbase_hwcnt_accumulator_acquire(struct kbase_hwcnt_context *hctx, + struct kbase_hwcnt_accumulator **accum) { int errcode = 0; unsigned long flags; @@ -618,9 +590,7 @@ int kbase_hwcnt_accumulator_acquire( * Regardless of initial state, counters don't need to be enabled via * the backend, as the initial enable map has no enabled counters. */ - hctx->accum.state = (hctx->disable_count == 0) ? - ACCUM_STATE_ENABLED : - ACCUM_STATE_DISABLED; + hctx->accum.state = (hctx->disable_count == 0) ? 
ACCUM_STATE_ENABLED : ACCUM_STATE_DISABLED; spin_unlock_irqrestore(&hctx->state_lock, flags); @@ -728,8 +698,7 @@ void kbase_hwcnt_context_enable(struct kbase_hwcnt_context *hctx) spin_unlock_irqrestore(&hctx->state_lock, flags); } -const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata( - struct kbase_hwcnt_context *hctx) +const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata(struct kbase_hwcnt_context *hctx) { if (!hctx) return NULL; @@ -737,8 +706,7 @@ const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata( return hctx->iface->metadata(hctx->iface->info); } -bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx, - struct work_struct *work) +bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx, struct work_struct *work) { if (WARN_ON(!hctx) || WARN_ON(!work)) return false; @@ -746,12 +714,10 @@ bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx, return queue_work(hctx->wq, work); } -int kbase_hwcnt_accumulator_set_counters( - struct kbase_hwcnt_accumulator *accum, - const struct kbase_hwcnt_enable_map *new_map, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf) +int kbase_hwcnt_accumulator_set_counters(struct kbase_hwcnt_accumulator *accum, + const struct kbase_hwcnt_enable_map *new_map, + u64 *ts_start_ns, u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) { int errcode; struct kbase_hwcnt_context *hctx; @@ -767,19 +733,15 @@ int kbase_hwcnt_accumulator_set_counters( mutex_lock(&hctx->accum_lock); - errcode = kbasep_hwcnt_accumulator_dump( - hctx, ts_start_ns, ts_end_ns, dump_buf, new_map); + errcode = kbasep_hwcnt_accumulator_dump(hctx, ts_start_ns, ts_end_ns, dump_buf, new_map); mutex_unlock(&hctx->accum_lock); return errcode; } -int kbase_hwcnt_accumulator_dump( - struct kbase_hwcnt_accumulator *accum, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf) +int kbase_hwcnt_accumulator_dump(struct kbase_hwcnt_accumulator *accum, u64 *ts_start_ns, + u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf) { int errcode; struct kbase_hwcnt_context *hctx; @@ -794,8 +756,7 @@ int kbase_hwcnt_accumulator_dump( mutex_lock(&hctx->accum_lock); - errcode = kbasep_hwcnt_accumulator_dump( - hctx, ts_start_ns, ts_end_ns, dump_buf, NULL); + errcode = kbasep_hwcnt_accumulator_dump(hctx, ts_start_ns, ts_end_ns, dump_buf, NULL); mutex_unlock(&hctx->accum_lock); diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_accumulator.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_accumulator.h similarity index 90% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_accumulator.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_accumulator.h index af542ea5b56b..069e02068902 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_accumulator.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_accumulator.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -67,9 +67,8 @@ struct kbase_hwcnt_dump_buffer; * * Return: 0 on success or error code. 
*/ -int kbase_hwcnt_accumulator_acquire( - struct kbase_hwcnt_context *hctx, - struct kbase_hwcnt_accumulator **accum); +int kbase_hwcnt_accumulator_acquire(struct kbase_hwcnt_context *hctx, + struct kbase_hwcnt_accumulator **accum); /** * kbase_hwcnt_accumulator_release() - Release a hardware counter accumulator. @@ -102,12 +101,10 @@ void kbase_hwcnt_accumulator_release(struct kbase_hwcnt_accumulator *accum); * * Return: 0 on success or error code. */ -int kbase_hwcnt_accumulator_set_counters( - struct kbase_hwcnt_accumulator *accum, - const struct kbase_hwcnt_enable_map *new_map, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf); +int kbase_hwcnt_accumulator_set_counters(struct kbase_hwcnt_accumulator *accum, + const struct kbase_hwcnt_enable_map *new_map, + u64 *ts_start_ns, u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf); /** * kbase_hwcnt_accumulator_dump() - Perform a dump of the currently enabled @@ -127,11 +124,8 @@ int kbase_hwcnt_accumulator_set_counters( * * Return: 0 on success or error code. */ -int kbase_hwcnt_accumulator_dump( - struct kbase_hwcnt_accumulator *accum, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf); +int kbase_hwcnt_accumulator_dump(struct kbase_hwcnt_accumulator *accum, u64 *ts_start_ns, + u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf); /** * kbase_hwcnt_accumulator_timestamp_ns() - Get the current accumulator backend diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_context.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_context.h similarity index 95% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_context.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_context.h index 34423d1b60c7..89732a908789 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_context.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_context.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -43,9 +43,8 @@ struct kbase_hwcnt_context; * * Return: 0 on success, else error code. */ -int kbase_hwcnt_context_init( - const struct kbase_hwcnt_backend_interface *iface, - struct kbase_hwcnt_context **out_hctx); +int kbase_hwcnt_context_init(const struct kbase_hwcnt_backend_interface *iface, + struct kbase_hwcnt_context **out_hctx); /** * kbase_hwcnt_context_term() - Terminate a hardware counter context. @@ -61,8 +60,7 @@ void kbase_hwcnt_context_term(struct kbase_hwcnt_context *hctx); * * Return: Non-NULL pointer to metadata, or NULL on error. */ -const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata( - struct kbase_hwcnt_context *hctx); +const struct kbase_hwcnt_metadata *kbase_hwcnt_context_metadata(struct kbase_hwcnt_context *hctx); /** * kbase_hwcnt_context_disable() - Increment the disable count of the context. @@ -145,7 +143,6 @@ void kbase_hwcnt_context_enable(struct kbase_hwcnt_context *hctx); * this meant progress through the power management states could be stalled * for however long that higher priority thread took. 
*/ -bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx, - struct work_struct *work); +bool kbase_hwcnt_context_queue_work(struct kbase_hwcnt_context *hctx, struct work_struct *work); #endif /* _KBASE_HWCNT_CONTEXT_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu.c b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.c similarity index 78% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu.c rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.c index 5f5c36f33d41..74916dab060d 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu.c +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.c @@ -19,8 +19,8 @@ * */ -#include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_gpu.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include @@ -32,8 +32,7 @@ enum enable_map_idx { EM_COUNT, }; -static void kbasep_get_fe_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, - bool is_csf) +static void kbasep_get_fe_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, bool is_csf) { switch (counter_set) { case KBASE_HWCNT_SET_PRIMARY: @@ -56,8 +55,7 @@ static void kbasep_get_fe_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, } } -static void kbasep_get_tiler_block_type(u64 *dst, - enum kbase_hwcnt_set counter_set) +static void kbasep_get_tiler_block_type(u64 *dst, enum kbase_hwcnt_set counter_set) { switch (counter_set) { case KBASE_HWCNT_SET_PRIMARY: @@ -72,8 +70,7 @@ static void kbasep_get_tiler_block_type(u64 *dst, } } -static void kbasep_get_sc_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, - bool is_csf) +static void kbasep_get_sc_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, bool is_csf) { switch (counter_set) { case KBASE_HWCNT_SET_PRIMARY: @@ -93,8 +90,7 @@ static void kbasep_get_sc_block_type(u64 *dst, enum kbase_hwcnt_set counter_set, } } -static void kbasep_get_memsys_block_type(u64 *dst, - enum kbase_hwcnt_set counter_set) +static void kbasep_get_memsys_block_type(u64 *dst, enum kbase_hwcnt_set counter_set) { switch (counter_set) { case KBASE_HWCNT_SET_PRIMARY: @@ -122,15 +118,14 @@ static void kbasep_get_memsys_block_type(u64 *dst, * * Return: 0 on success, else error code. 
*/ -static int kbasep_hwcnt_backend_gpu_metadata_create( - const struct kbase_hwcnt_gpu_info *gpu_info, const bool is_csf, - enum kbase_hwcnt_set counter_set, - const struct kbase_hwcnt_metadata **metadata) +static int kbasep_hwcnt_backend_gpu_metadata_create(const struct kbase_hwcnt_gpu_info *gpu_info, + const bool is_csf, + enum kbase_hwcnt_set counter_set, + const struct kbase_hwcnt_metadata **metadata) { struct kbase_hwcnt_description desc; struct kbase_hwcnt_group_description group; - struct kbase_hwcnt_block_description - blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; + struct kbase_hwcnt_block_description blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; size_t non_sc_block_count; size_t sc_block_count; @@ -156,22 +151,19 @@ static int kbasep_hwcnt_backend_gpu_metadata_create( kbasep_get_fe_block_type(&blks[0].type, counter_set, is_csf); blks[0].inst_cnt = 1; blks[0].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; - blks[0].ctr_cnt = gpu_info->prfcnt_values_per_block - - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[0].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; /* One Tiler block */ kbasep_get_tiler_block_type(&blks[1].type, counter_set); blks[1].inst_cnt = 1; blks[1].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; - blks[1].ctr_cnt = gpu_info->prfcnt_values_per_block - - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[1].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; /* l2_count memsys blks */ kbasep_get_memsys_block_type(&blks[2].type, counter_set); blks[2].inst_cnt = gpu_info->l2_count; blks[2].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; - blks[2].ctr_cnt = gpu_info->prfcnt_values_per_block - - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[2].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; /* * There are as many shader cores in the system as there are bits set in @@ -192,8 +184,7 @@ static int kbasep_hwcnt_backend_gpu_metadata_create( kbasep_get_sc_block_type(&blks[3].type, counter_set, is_csf); blks[3].inst_cnt = sc_block_count; blks[3].hdr_cnt = KBASE_HWCNT_V5_HEADERS_PER_BLOCK; - blks[3].ctr_cnt = gpu_info->prfcnt_values_per_block - - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; + blks[3].ctr_cnt = gpu_info->prfcnt_values_per_block - KBASE_HWCNT_V5_HEADERS_PER_BLOCK; WARN_ON(KBASE_HWCNT_V5_BLOCK_TYPE_COUNT != 4); @@ -220,8 +211,7 @@ static int kbasep_hwcnt_backend_gpu_metadata_create( * * Return: Size of buffer the GPU needs to perform a counter dump. */ -static size_t -kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info) +static size_t kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info) { WARN_ON(!gpu_info); @@ -229,11 +219,10 @@ kbasep_hwcnt_backend_jm_dump_bytes(const struct kbase_hwcnt_gpu_info *gpu_info) gpu_info->prfcnt_values_per_block * KBASE_HWCNT_VALUE_HW_BYTES; } -int kbase_hwcnt_jm_metadata_create( - const struct kbase_hwcnt_gpu_info *gpu_info, - enum kbase_hwcnt_set counter_set, - const struct kbase_hwcnt_metadata **out_metadata, - size_t *out_dump_bytes) +int kbase_hwcnt_jm_metadata_create(const struct kbase_hwcnt_gpu_info *gpu_info, + enum kbase_hwcnt_set counter_set, + const struct kbase_hwcnt_metadata **out_metadata, + size_t *out_dump_bytes) { int errcode; const struct kbase_hwcnt_metadata *metadata; @@ -250,8 +239,7 @@ int kbase_hwcnt_jm_metadata_create( * all the available L2 cache and Shader cores are allocated. 
*/ dump_bytes = kbasep_hwcnt_backend_jm_dump_bytes(gpu_info); - errcode = kbasep_hwcnt_backend_gpu_metadata_create( - gpu_info, false, counter_set, &metadata); + errcode = kbasep_hwcnt_backend_gpu_metadata_create(gpu_info, false, counter_set, &metadata); if (errcode) return errcode; @@ -276,10 +264,9 @@ void kbase_hwcnt_jm_metadata_destroy(const struct kbase_hwcnt_metadata *metadata kbase_hwcnt_metadata_destroy(metadata); } -int kbase_hwcnt_csf_metadata_create( - const struct kbase_hwcnt_gpu_info *gpu_info, - enum kbase_hwcnt_set counter_set, - const struct kbase_hwcnt_metadata **out_metadata) +int kbase_hwcnt_csf_metadata_create(const struct kbase_hwcnt_gpu_info *gpu_info, + enum kbase_hwcnt_set counter_set, + const struct kbase_hwcnt_metadata **out_metadata) { int errcode; const struct kbase_hwcnt_metadata *metadata; @@ -287,8 +274,7 @@ int kbase_hwcnt_csf_metadata_create( if (!gpu_info || !out_metadata) return -EINVAL; - errcode = kbasep_hwcnt_backend_gpu_metadata_create( - gpu_info, true, counter_set, &metadata); + errcode = kbasep_hwcnt_backend_gpu_metadata_create(gpu_info, true, counter_set, &metadata); if (errcode) return errcode; @@ -297,8 +283,7 @@ int kbase_hwcnt_csf_metadata_create( return 0; } -void kbase_hwcnt_csf_metadata_destroy( - const struct kbase_hwcnt_metadata *metadata) +void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadata) { if (!metadata) return; @@ -306,10 +291,7 @@ void kbase_hwcnt_csf_metadata_destroy( kbase_hwcnt_metadata_destroy(metadata); } -static bool is_block_type_shader( - const u64 grp_type, - const u64 blk_type, - const size_t blk) +static bool is_block_type_shader(const u64 grp_type, const u64 blk_type, const size_t blk) { bool is_shader = false; @@ -326,9 +308,7 @@ static bool is_block_type_shader( return is_shader; } -static bool is_block_type_l2_cache( - const u64 grp_type, - const u64 blk_type) +static bool is_block_type_l2_cache(const u64 grp_type, const u64 blk_type) { bool is_l2_cache = false; @@ -348,10 +328,8 @@ static bool is_block_type_l2_cache( } int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, - const struct kbase_hwcnt_enable_map *dst_enable_map, - u64 pm_core_mask, - const struct kbase_hwcnt_curr_config *curr_config, - bool accumulate) + const struct kbase_hwcnt_enable_map *dst_enable_map, u64 pm_core_mask, + const struct kbase_hwcnt_curr_config *curr_config, bool accumulate) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; @@ -362,28 +340,21 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, /* Variables to deal with the current configuration */ int l2_count = 0; - if (!dst || !src || !dst_enable_map || - (dst_enable_map->metadata != dst->metadata)) + if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata)) return -EINVAL; metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block( - metadata, grp, blk, blk_inst) { - const size_t hdr_cnt = - kbase_hwcnt_metadata_block_headers_count( - metadata, grp, blk); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk); const size_t ctr_cnt = - kbase_hwcnt_metadata_block_counters_count( - metadata, grp, blk); - const u64 blk_type = kbase_hwcnt_metadata_block_type( - metadata, grp, blk); + kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk); + const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk); const bool is_shader_core = 
is_block_type_shader( - kbase_hwcnt_metadata_group_type(metadata, grp), - blk_type, blk); + kbase_hwcnt_metadata_group_type(metadata, grp), blk_type, blk); const bool is_l2_cache = is_block_type_l2_cache( - kbase_hwcnt_metadata_group_type(metadata, grp), - blk_type); + kbase_hwcnt_metadata_group_type(metadata, grp), blk_type); const bool is_undefined = kbase_hwcnt_is_block_type_undefined( kbase_hwcnt_metadata_group_type(metadata, grp), blk_type); bool hw_res_available = true; @@ -412,10 +383,9 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, /* * Skip block if no values in the destination block are enabled. */ - if (kbase_hwcnt_enable_map_block_enabled( - dst_enable_map, grp, blk, blk_inst)) { - u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); + if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) { + u64 *dst_blk = + kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); const u64 *src_blk = dump_src + src_offset; bool blk_powered; @@ -435,13 +405,11 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, if (blk_powered && !is_undefined && hw_res_available) { /* Only powered and defined blocks have valid data. */ if (accumulate) { - kbase_hwcnt_dump_buffer_block_accumulate( - dst_blk, src_blk, hdr_cnt, - ctr_cnt); + kbase_hwcnt_dump_buffer_block_accumulate(dst_blk, src_blk, + hdr_cnt, ctr_cnt); } else { - kbase_hwcnt_dump_buffer_block_copy( - dst_blk, src_blk, - (hdr_cnt + ctr_cnt)); + kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk, + (hdr_cnt + ctr_cnt)); } } else { /* Even though the block might be undefined, the @@ -469,26 +437,23 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, } int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, - const struct kbase_hwcnt_enable_map *dst_enable_map, - bool accumulate) + const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate) { const struct kbase_hwcnt_metadata *metadata; const u64 *dump_src = src; size_t src_offset = 0; size_t grp, blk, blk_inst; - if (!dst || !src || !dst_enable_map || - (dst_enable_map->metadata != dst->metadata)) + if (!dst || !src || !dst_enable_map || (dst_enable_map->metadata != dst->metadata)) return -EINVAL; metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count( - metadata, grp, blk); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + const size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk); const size_t ctr_cnt = - kbase_hwcnt_metadata_block_counters_count(metadata, grp, - blk); + kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk); const uint64_t blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk); const bool is_undefined = kbase_hwcnt_is_block_type_undefined( kbase_hwcnt_metadata_group_type(metadata, grp), blk_type); @@ -496,10 +461,9 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, /* * Skip block if no values in the destination block are enabled. 
*/ - if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, - blk, blk_inst)) { - u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); + if (kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) { + u64 *dst_blk = + kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); const u64 *src_blk = dump_src + src_offset; if (!is_undefined) { @@ -542,12 +506,9 @@ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, * @hi: Non-NULL pointer to where high 64 bits of block enable map abstraction * will be stored. */ -static inline void kbasep_hwcnt_backend_gpu_block_map_from_physical( - u32 phys, - u64 *lo, - u64 *hi) +static inline void kbasep_hwcnt_backend_gpu_block_map_from_physical(u32 phys, u64 *lo, u64 *hi) { - u64 dwords[2] = {0, 0}; + u64 dwords[2] = { 0, 0 }; size_t dword_idx; @@ -572,9 +533,8 @@ static inline void kbasep_hwcnt_backend_gpu_block_map_from_physical( *hi = dwords[1]; } -void kbase_hwcnt_gpu_enable_map_to_physical( - struct kbase_hwcnt_physical_enable_map *dst, - const struct kbase_hwcnt_enable_map *src) +void kbase_hwcnt_gpu_enable_map_to_physical(struct kbase_hwcnt_physical_enable_map *dst, + const struct kbase_hwcnt_enable_map *src) { const struct kbase_hwcnt_metadata *metadata; u64 fe_bm[EM_COUNT] = { 0 }; @@ -588,17 +548,13 @@ void kbase_hwcnt_gpu_enable_map_to_physical( metadata = src->metadata; - kbase_hwcnt_metadata_for_each_block( - metadata, grp, blk, blk_inst) { - const u64 grp_type = kbase_hwcnt_metadata_group_type( - metadata, grp); - const u64 blk_type = kbase_hwcnt_metadata_block_type( - metadata, grp, blk); - const u64 *blk_map = kbase_hwcnt_enable_map_block_instance( - src, grp, blk, blk_inst); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp); + const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk); + const u64 *blk_map = kbase_hwcnt_enable_map_block_instance(src, grp, blk, blk_inst); - if ((enum kbase_hwcnt_gpu_group_type)grp_type == - KBASE_HWCNT_GPU_GROUP_TYPE_V5) { + if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) { const size_t map_stride = kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk); size_t map_idx; @@ -649,8 +605,7 @@ void kbase_hwcnt_gpu_enable_map_to_physical( kbase_hwcnt_backend_gpu_block_map_to_physical(mmu_l2_bm[EM_LO], mmu_l2_bm[EM_HI]); } -void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, - enum kbase_hwcnt_set src) +void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src) { switch (src) { case KBASE_HWCNT_SET_PRIMARY: @@ -667,9 +622,8 @@ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, } } -void kbase_hwcnt_gpu_enable_map_from_physical( - struct kbase_hwcnt_enable_map *dst, - const struct kbase_hwcnt_physical_enable_map *src) +void kbase_hwcnt_gpu_enable_map_from_physical(struct kbase_hwcnt_enable_map *dst, + const struct kbase_hwcnt_physical_enable_map *src) { const struct kbase_hwcnt_metadata *metadata; @@ -692,16 +646,13 @@ void kbase_hwcnt_gpu_enable_map_from_physical( kbasep_hwcnt_backend_gpu_block_map_from_physical(src->mmu_l2_bm, &mmu_l2_bm[EM_LO], &mmu_l2_bm[EM_HI]); - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - const u64 grp_type = kbase_hwcnt_metadata_group_type( - metadata, grp); - const u64 blk_type = kbase_hwcnt_metadata_block_type( - metadata, grp, blk); - u64 *blk_map = 
kbase_hwcnt_enable_map_block_instance( - dst, grp, blk, blk_inst); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp); + const u64 blk_type = kbase_hwcnt_metadata_block_type(metadata, grp, blk); + u64 *blk_map = kbase_hwcnt_enable_map_block_instance(dst, grp, blk, blk_inst); - if ((enum kbase_hwcnt_gpu_group_type)grp_type == - KBASE_HWCNT_GPU_GROUP_TYPE_V5) { + if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) { const size_t map_stride = kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk); size_t map_idx; @@ -744,29 +695,25 @@ void kbase_hwcnt_gpu_enable_map_from_physical( } } -void kbase_hwcnt_gpu_patch_dump_headers( - struct kbase_hwcnt_dump_buffer *buf, - const struct kbase_hwcnt_enable_map *enable_map) +void kbase_hwcnt_gpu_patch_dump_headers(struct kbase_hwcnt_dump_buffer *buf, + const struct kbase_hwcnt_enable_map *enable_map) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; - if (WARN_ON(!buf) || WARN_ON(!enable_map) || - WARN_ON(buf->metadata != enable_map->metadata)) + if (WARN_ON(!buf) || WARN_ON(!enable_map) || WARN_ON(buf->metadata != enable_map->metadata)) return; metadata = buf->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - const u64 grp_type = - kbase_hwcnt_metadata_group_type(metadata, grp); - u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance( - buf, grp, blk, blk_inst); - const u64 *blk_map = kbase_hwcnt_enable_map_block_instance( - enable_map, grp, blk, blk_inst); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + const u64 grp_type = kbase_hwcnt_metadata_group_type(metadata, grp); + u64 *buf_blk = kbase_hwcnt_dump_buffer_block_instance(buf, grp, blk, blk_inst); + const u64 *blk_map = + kbase_hwcnt_enable_map_block_instance(enable_map, grp, blk, blk_inst); - if ((enum kbase_hwcnt_gpu_group_type)grp_type == - KBASE_HWCNT_GPU_GROUP_TYPE_V5) { + if ((enum kbase_hwcnt_gpu_group_type)grp_type == KBASE_HWCNT_GPU_GROUP_TYPE_V5) { const size_t map_stride = kbase_hwcnt_metadata_block_enable_map_stride(metadata, grp, blk); u64 prfcnt_bm[EM_COUNT] = { 0 }; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.h similarity index 92% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.h index f890d451c2c1..a49c31e52f98 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu.h @@ -34,9 +34,8 @@ struct kbase_hwcnt_dump_buffer; #define KBASE_HWCNT_V5_BLOCK_TYPE_COUNT 4 #define KBASE_HWCNT_V5_HEADERS_PER_BLOCK 4 #define KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK 60 -#define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK \ - (KBASE_HWCNT_V5_HEADERS_PER_BLOCK + \ - KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK) +#define KBASE_HWCNT_V5_DEFAULT_VALUES_PER_BLOCK \ + (KBASE_HWCNT_V5_HEADERS_PER_BLOCK + KBASE_HWCNT_V5_DEFAULT_COUNTERS_PER_BLOCK) /* FrontEnd block count in V5 GPU hardware counter. */ #define KBASE_HWCNT_V5_FE_BLOCK_COUNT 1 @@ -228,19 +227,17 @@ static inline bool kbase_hwcnt_is_block_type_undefined(const uint64_t grp_type, * * Return: 0 on success, else error code. 
*/ -int kbase_hwcnt_jm_metadata_create( - const struct kbase_hwcnt_gpu_info *info, - enum kbase_hwcnt_set counter_set, - const struct kbase_hwcnt_metadata **out_metadata, - size_t *out_dump_bytes); +int kbase_hwcnt_jm_metadata_create(const struct kbase_hwcnt_gpu_info *info, + enum kbase_hwcnt_set counter_set, + const struct kbase_hwcnt_metadata **out_metadata, + size_t *out_dump_bytes); /** * kbase_hwcnt_jm_metadata_destroy() - Destroy JM GPU hardware counter metadata. * * @metadata: Pointer to metadata to destroy. */ -void kbase_hwcnt_jm_metadata_destroy( - const struct kbase_hwcnt_metadata *metadata); +void kbase_hwcnt_jm_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); /** * kbase_hwcnt_csf_metadata_create() - Create hardware counter metadata for the @@ -252,18 +249,16 @@ void kbase_hwcnt_jm_metadata_destroy( * * Return: 0 on success, else error code. */ -int kbase_hwcnt_csf_metadata_create( - const struct kbase_hwcnt_gpu_info *info, - enum kbase_hwcnt_set counter_set, - const struct kbase_hwcnt_metadata **out_metadata); +int kbase_hwcnt_csf_metadata_create(const struct kbase_hwcnt_gpu_info *info, + enum kbase_hwcnt_set counter_set, + const struct kbase_hwcnt_metadata **out_metadata); /** * kbase_hwcnt_csf_metadata_destroy() - Destroy CSF GPU hardware counter * metadata. * @metadata: Pointer to metadata to destroy. */ -void kbase_hwcnt_csf_metadata_destroy( - const struct kbase_hwcnt_metadata *metadata); +void kbase_hwcnt_csf_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); /** * kbase_hwcnt_jm_dump_get() - Copy or accumulate enabled counters from the raw @@ -289,8 +284,7 @@ void kbase_hwcnt_csf_metadata_destroy( int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, const struct kbase_hwcnt_enable_map *dst_enable_map, const u64 pm_core_mask, - const struct kbase_hwcnt_curr_config *curr_config, - bool accumulate); + const struct kbase_hwcnt_curr_config *curr_config, bool accumulate); /** * kbase_hwcnt_csf_dump_get() - Copy or accumulate enabled counters from the raw @@ -310,8 +304,7 @@ int kbase_hwcnt_jm_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, * Return: 0 on success, else error code. */ int kbase_hwcnt_csf_dump_get(struct kbase_hwcnt_dump_buffer *dst, u64 *src, - const struct kbase_hwcnt_enable_map *dst_enable_map, - bool accumulate); + const struct kbase_hwcnt_enable_map *dst_enable_map, bool accumulate); /** * kbase_hwcnt_backend_gpu_block_map_to_physical() - Convert from a block @@ -365,9 +358,8 @@ static inline u32 kbase_hwcnt_backend_gpu_block_map_to_physical(u64 lo, u64 hi) * individual counter block value, but the physical enable map uses 1 bit for * every 4 counters, shared over all instances of a block. */ -void kbase_hwcnt_gpu_enable_map_to_physical( - struct kbase_hwcnt_physical_enable_map *dst, - const struct kbase_hwcnt_enable_map *src); +void kbase_hwcnt_gpu_enable_map_to_physical(struct kbase_hwcnt_physical_enable_map *dst, + const struct kbase_hwcnt_enable_map *src); /** * kbase_hwcnt_gpu_set_to_physical() - Map counter set selection to physical @@ -376,8 +368,7 @@ void kbase_hwcnt_gpu_enable_map_to_physical( * @dst: Non-NULL pointer to destination physical SET_SELECT value. * @src: Non-NULL pointer to source counter set selection. 
*/ -void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, - enum kbase_hwcnt_set src); +void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, enum kbase_hwcnt_set src); /** * kbase_hwcnt_gpu_enable_map_from_physical() - Convert a physical enable map to @@ -393,9 +384,8 @@ void kbase_hwcnt_gpu_set_to_physical(enum kbase_hwcnt_physical_set *dst, * more than 64, so the enable map abstraction has nowhere to store the enable * information for the 64 non-existent counters. */ -void kbase_hwcnt_gpu_enable_map_from_physical( - struct kbase_hwcnt_enable_map *dst, - const struct kbase_hwcnt_physical_enable_map *src); +void kbase_hwcnt_gpu_enable_map_from_physical(struct kbase_hwcnt_enable_map *dst, + const struct kbase_hwcnt_physical_enable_map *src); /** * kbase_hwcnt_gpu_patch_dump_headers() - Patch all the performance counter @@ -411,8 +401,7 @@ void kbase_hwcnt_gpu_enable_map_from_physical( * kernel-user boundary, to ensure the header is accurate for the enable map * used by the user. */ -void kbase_hwcnt_gpu_patch_dump_headers( - struct kbase_hwcnt_dump_buffer *buf, - const struct kbase_hwcnt_enable_map *enable_map); +void kbase_hwcnt_gpu_patch_dump_headers(struct kbase_hwcnt_dump_buffer *buf, + const struct kbase_hwcnt_enable_map *enable_map); #endif /* _KBASE_HWCNT_GPU_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu_narrow.c b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.c similarity index 68% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu_narrow.c rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.c index 2a1cde79709b..0cf2f94cfb87 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu_narrow.c +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.c @@ -19,21 +19,19 @@ * */ -#include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_hwcnt_gpu_narrow.h" +#include "hwcnt/mali_kbase_hwcnt_gpu.h" +#include "hwcnt/mali_kbase_hwcnt_gpu_narrow.h" #include #include #include -int kbase_hwcnt_gpu_metadata_narrow_create( - const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, - const struct kbase_hwcnt_metadata *src_md) +int kbase_hwcnt_gpu_metadata_narrow_create(const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, + const struct kbase_hwcnt_metadata *src_md) { struct kbase_hwcnt_description desc; struct kbase_hwcnt_group_description group; - struct kbase_hwcnt_block_description - blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; + struct kbase_hwcnt_block_description blks[KBASE_HWCNT_V5_BLOCK_TYPE_COUNT]; size_t prfcnt_values_per_block; size_t blk; int err; @@ -47,18 +45,15 @@ int kbase_hwcnt_gpu_metadata_narrow_create( * count in the metadata. */ if ((kbase_hwcnt_metadata_group_count(src_md) != 1) || - (kbase_hwcnt_metadata_block_count(src_md, 0) != - KBASE_HWCNT_V5_BLOCK_TYPE_COUNT)) + (kbase_hwcnt_metadata_block_count(src_md, 0) != KBASE_HWCNT_V5_BLOCK_TYPE_COUNT)) return -EINVAL; /* Get the values count in the first block. */ - prfcnt_values_per_block = - kbase_hwcnt_metadata_block_values_count(src_md, 0, 0); + prfcnt_values_per_block = kbase_hwcnt_metadata_block_values_count(src_md, 0, 0); /* check all blocks should have same values count. 
*/ for (blk = 1; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { - size_t val_cnt = - kbase_hwcnt_metadata_block_values_count(src_md, 0, blk); + size_t val_cnt = kbase_hwcnt_metadata_block_values_count(src_md, 0, blk); if (val_cnt != prfcnt_values_per_block) return -EINVAL; } @@ -75,12 +70,10 @@ int kbase_hwcnt_gpu_metadata_narrow_create( prfcnt_values_per_block = 64; for (blk = 0; blk < KBASE_HWCNT_V5_BLOCK_TYPE_COUNT; blk++) { - size_t blk_hdr_cnt = kbase_hwcnt_metadata_block_headers_count( - src_md, 0, blk); + size_t blk_hdr_cnt = kbase_hwcnt_metadata_block_headers_count(src_md, 0, blk); blks[blk] = (struct kbase_hwcnt_block_description){ .type = kbase_hwcnt_metadata_block_type(src_md, 0, blk), - .inst_cnt = kbase_hwcnt_metadata_block_instance_count( - src_md, 0, blk), + .inst_cnt = kbase_hwcnt_metadata_block_instance_count(src_md, 0, blk), .hdr_cnt = blk_hdr_cnt, .ctr_cnt = prfcnt_values_per_block - blk_hdr_cnt, }; @@ -105,8 +98,7 @@ int kbase_hwcnt_gpu_metadata_narrow_create( * only supports 32-bit but the created metadata uses 64-bit for * block entry. */ - metadata_narrow->dump_buf_bytes = - metadata_narrow->metadata->dump_buf_bytes >> 1; + metadata_narrow->dump_buf_bytes = metadata_narrow->metadata->dump_buf_bytes >> 1; *dst_md_narrow = metadata_narrow; } else { kfree(metadata_narrow); @@ -115,8 +107,7 @@ int kbase_hwcnt_gpu_metadata_narrow_create( return err; } -void kbase_hwcnt_gpu_metadata_narrow_destroy( - const struct kbase_hwcnt_metadata_narrow *md_narrow) +void kbase_hwcnt_gpu_metadata_narrow_destroy(const struct kbase_hwcnt_metadata_narrow *md_narrow) { if (!md_narrow) return; @@ -125,9 +116,8 @@ void kbase_hwcnt_gpu_metadata_narrow_destroy( kfree(md_narrow); } -int kbase_hwcnt_dump_buffer_narrow_alloc( - const struct kbase_hwcnt_metadata_narrow *md_narrow, - struct kbase_hwcnt_dump_buffer_narrow *dump_buf) +int kbase_hwcnt_dump_buffer_narrow_alloc(const struct kbase_hwcnt_metadata_narrow *md_narrow, + struct kbase_hwcnt_dump_buffer_narrow *dump_buf) { size_t dump_buf_bytes; size_t clk_cnt_buf_bytes; @@ -137,8 +127,7 @@ int kbase_hwcnt_dump_buffer_narrow_alloc( return -EINVAL; dump_buf_bytes = md_narrow->dump_buf_bytes; - clk_cnt_buf_bytes = - sizeof(*dump_buf->clk_cnt_buf) * md_narrow->metadata->clk_cnt; + clk_cnt_buf_bytes = sizeof(*dump_buf->clk_cnt_buf) * md_narrow->metadata->clk_cnt; /* Make a single allocation for both dump_buf and clk_cnt_buf. 
*/ buf = kmalloc(dump_buf_bytes + clk_cnt_buf_bytes, GFP_KERNEL); @@ -154,14 +143,15 @@ int kbase_hwcnt_dump_buffer_narrow_alloc( return 0; } -void kbase_hwcnt_dump_buffer_narrow_free( - struct kbase_hwcnt_dump_buffer_narrow *dump_buf_narrow) +void kbase_hwcnt_dump_buffer_narrow_free(struct kbase_hwcnt_dump_buffer_narrow *dump_buf_narrow) { if (!dump_buf_narrow) return; kfree(dump_buf_narrow->dump_buf); - *dump_buf_narrow = (struct kbase_hwcnt_dump_buffer_narrow){ NULL }; + *dump_buf_narrow = (struct kbase_hwcnt_dump_buffer_narrow){ .md_narrow = NULL, + .dump_buf = NULL, + .clk_cnt_buf = NULL }; } int kbase_hwcnt_dump_buffer_narrow_array_alloc( @@ -180,8 +170,7 @@ int kbase_hwcnt_dump_buffer_narrow_array_alloc( return -EINVAL; dump_buf_bytes = md_narrow->dump_buf_bytes; - clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * - md_narrow->metadata->clk_cnt; + clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * md_narrow->metadata->clk_cnt; /* Allocate memory for the dump buffer struct array */ buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL); @@ -234,27 +223,22 @@ void kbase_hwcnt_dump_buffer_narrow_array_free( memset(dump_bufs, 0, sizeof(*dump_bufs)); } -void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, - const u64 *src_blk, - const u64 *blk_em, - size_t val_cnt) +void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, const u64 *src_blk, + const u64 *blk_em, size_t val_cnt) { size_t val; for (val = 0; val < val_cnt; val++) { - bool val_enabled = - kbase_hwcnt_enable_map_block_value_enabled(blk_em, val); - u32 src_val = - (src_blk[val] > U32_MAX) ? U32_MAX : (u32)src_blk[val]; + bool val_enabled = kbase_hwcnt_enable_map_block_value_enabled(blk_em, val); + u32 src_val = (src_blk[val] > U32_MAX) ? U32_MAX : (u32)src_blk[val]; dst_blk[val] = val_enabled ? src_val : 0; } } -void kbase_hwcnt_dump_buffer_copy_strict_narrow( - struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map) +void kbase_hwcnt_dump_buffer_copy_strict_narrow(struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) { const struct kbase_hwcnt_metadata_narrow *metadata_narrow; size_t grp; @@ -262,68 +246,53 @@ void kbase_hwcnt_dump_buffer_copy_strict_narrow( if (WARN_ON(!dst_narrow) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst_narrow->md_narrow->metadata == src->metadata) || - WARN_ON(dst_narrow->md_narrow->metadata->grp_cnt != - src->metadata->grp_cnt) || + WARN_ON(dst_narrow->md_narrow->metadata->grp_cnt != src->metadata->grp_cnt) || WARN_ON(src->metadata->grp_cnt != 1) || WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt != src->metadata->grp_metadata[0].blk_cnt) || WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_cnt != KBASE_HWCNT_V5_BLOCK_TYPE_COUNT) || - WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0] - .blk_metadata[0] - .ctr_cnt > + WARN_ON(dst_narrow->md_narrow->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt > src->metadata->grp_metadata[0].blk_metadata[0].ctr_cnt)) return; /* Don't use src metadata since src buffer is bigger than dst buffer. 
*/ metadata_narrow = dst_narrow->md_narrow; - for (grp = 0; - grp < kbase_hwcnt_metadata_narrow_group_count(metadata_narrow); - grp++) { + for (grp = 0; grp < kbase_hwcnt_metadata_narrow_group_count(metadata_narrow); grp++) { size_t blk; - size_t blk_cnt = kbase_hwcnt_metadata_narrow_block_count( - metadata_narrow, grp); + size_t blk_cnt = kbase_hwcnt_metadata_narrow_block_count(metadata_narrow, grp); for (blk = 0; blk < blk_cnt; blk++) { size_t blk_inst; - size_t blk_inst_cnt = - kbase_hwcnt_metadata_narrow_block_instance_count( - metadata_narrow, grp, blk); + size_t blk_inst_cnt = kbase_hwcnt_metadata_narrow_block_instance_count( + metadata_narrow, grp, blk); - for (blk_inst = 0; blk_inst < blk_inst_cnt; - blk_inst++) { + for (blk_inst = 0; blk_inst < blk_inst_cnt; blk_inst++) { /* The narrowed down buffer is only 32-bit. */ - u32 *dst_blk = - kbase_hwcnt_dump_buffer_narrow_block_instance( - dst_narrow, grp, blk, blk_inst); - const u64 *src_blk = - kbase_hwcnt_dump_buffer_block_instance( - src, grp, blk, blk_inst); - const u64 *blk_em = - kbase_hwcnt_enable_map_block_instance( - dst_enable_map, grp, blk, - blk_inst); - size_t val_cnt = - kbase_hwcnt_metadata_narrow_block_values_count( - metadata_narrow, grp, blk); + u32 *dst_blk = kbase_hwcnt_dump_buffer_narrow_block_instance( + dst_narrow, grp, blk, blk_inst); + const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance( + src, grp, blk, blk_inst); + const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( + dst_enable_map, grp, blk, blk_inst); + size_t val_cnt = kbase_hwcnt_metadata_narrow_block_values_count( + metadata_narrow, grp, blk); /* Align upwards to include padding bytes */ val_cnt = KBASE_HWCNT_ALIGN_UPWARDS( - val_cnt, - (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / - KBASE_HWCNT_VALUE_BYTES)); + val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / + KBASE_HWCNT_VALUE_BYTES)); - kbase_hwcnt_dump_buffer_block_copy_strict_narrow( - dst_blk, src_blk, blk_em, val_cnt); + kbase_hwcnt_dump_buffer_block_copy_strict_narrow(dst_blk, src_blk, + blk_em, val_cnt); } } } for (clk = 0; clk < metadata_narrow->metadata->clk_cnt; clk++) { - bool clk_enabled = kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk); + bool clk_enabled = + kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk); - dst_narrow->clk_cnt_buf[clk] = - clk_enabled ? src->clk_cnt_buf[clk] : 0; + dst_narrow->clk_cnt_buf[clk] = clk_enabled ? src->clk_cnt_buf[clk] : 0; } } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu_narrow.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.h similarity index 84% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu_narrow.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.h index af6fa19f71e3..afd236d71a7c 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_gpu_narrow.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_gpu_narrow.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,7 +22,7 @@ #ifndef _KBASE_HWCNT_GPU_NARROW_H_ #define _KBASE_HWCNT_GPU_NARROW_H_ -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include struct kbase_device; @@ -86,8 +86,8 @@ struct kbase_hwcnt_dump_buffer_narrow_array { * * Return: Number of hardware counter groups described by narrow metadata. */ -static inline size_t kbase_hwcnt_metadata_narrow_group_count( - const struct kbase_hwcnt_metadata_narrow *md_narrow) +static inline size_t +kbase_hwcnt_metadata_narrow_group_count(const struct kbase_hwcnt_metadata_narrow *md_narrow) { return kbase_hwcnt_metadata_group_count(md_narrow->metadata); } @@ -100,8 +100,9 @@ static inline size_t kbase_hwcnt_metadata_narrow_group_count( * * Return: Type of the group grp. */ -static inline u64 kbase_hwcnt_metadata_narrow_group_type( - const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp) +static inline u64 +kbase_hwcnt_metadata_narrow_group_type(const struct kbase_hwcnt_metadata_narrow *md_narrow, + size_t grp) { return kbase_hwcnt_metadata_group_type(md_narrow->metadata, grp); } @@ -114,8 +115,9 @@ static inline u64 kbase_hwcnt_metadata_narrow_group_type( * * Return: Number of blocks in group grp. */ -static inline size_t kbase_hwcnt_metadata_narrow_block_count( - const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp) +static inline size_t +kbase_hwcnt_metadata_narrow_block_count(const struct kbase_hwcnt_metadata_narrow *md_narrow, + size_t grp) { return kbase_hwcnt_metadata_block_count(md_narrow->metadata, grp); } @@ -131,11 +133,9 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_count( * Return: Number of instances of block blk in group grp. */ static inline size_t kbase_hwcnt_metadata_narrow_block_instance_count( - const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, - size_t blk) + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, size_t blk) { - return kbase_hwcnt_metadata_block_instance_count(md_narrow->metadata, - grp, blk); + return kbase_hwcnt_metadata_block_instance_count(md_narrow->metadata, grp, blk); } /** @@ -148,12 +148,11 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_instance_count( * * Return: Number of counter headers in each instance of block blk in group grp. */ -static inline size_t kbase_hwcnt_metadata_narrow_block_headers_count( - const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, - size_t blk) +static inline size_t +kbase_hwcnt_metadata_narrow_block_headers_count(const struct kbase_hwcnt_metadata_narrow *md_narrow, + size_t grp, size_t blk) { - return kbase_hwcnt_metadata_block_headers_count(md_narrow->metadata, - grp, blk); + return kbase_hwcnt_metadata_block_headers_count(md_narrow->metadata, grp, blk); } /** @@ -167,11 +166,9 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_headers_count( * Return: Number of counters in each instance of block blk in group grp. 
*/ static inline size_t kbase_hwcnt_metadata_narrow_block_counters_count( - const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, - size_t blk) + const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, size_t blk) { - return kbase_hwcnt_metadata_block_counters_count(md_narrow->metadata, - grp, blk); + return kbase_hwcnt_metadata_block_counters_count(md_narrow->metadata, grp, blk); } /** @@ -184,14 +181,12 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_counters_count( * Return: Number of headers plus counters in each instance of block blk * in group grp. */ -static inline size_t kbase_hwcnt_metadata_narrow_block_values_count( - const struct kbase_hwcnt_metadata_narrow *md_narrow, size_t grp, - size_t blk) +static inline size_t +kbase_hwcnt_metadata_narrow_block_values_count(const struct kbase_hwcnt_metadata_narrow *md_narrow, + size_t grp, size_t blk) { - return kbase_hwcnt_metadata_narrow_block_counters_count(md_narrow, grp, - blk) + - kbase_hwcnt_metadata_narrow_block_headers_count(md_narrow, grp, - blk); + return kbase_hwcnt_metadata_narrow_block_counters_count(md_narrow, grp, blk) + + kbase_hwcnt_metadata_narrow_block_headers_count(md_narrow, grp, blk); } /** @@ -205,18 +200,13 @@ static inline size_t kbase_hwcnt_metadata_narrow_block_values_count( * * Return: u32* to the dump buffer for the block instance. */ -static inline u32 *kbase_hwcnt_dump_buffer_narrow_block_instance( - const struct kbase_hwcnt_dump_buffer_narrow *buf, size_t grp, - size_t blk, size_t blk_inst) +static inline u32 * +kbase_hwcnt_dump_buffer_narrow_block_instance(const struct kbase_hwcnt_dump_buffer_narrow *buf, + size_t grp, size_t blk, size_t blk_inst) { - return buf->dump_buf + - buf->md_narrow->metadata->grp_metadata[grp].dump_buf_index + - buf->md_narrow->metadata->grp_metadata[grp] - .blk_metadata[blk] - .dump_buf_index + - (buf->md_narrow->metadata->grp_metadata[grp] - .blk_metadata[blk] - .dump_buf_stride * + return buf->dump_buf + buf->md_narrow->metadata->grp_metadata[grp].dump_buf_index + + buf->md_narrow->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_index + + (buf->md_narrow->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_stride * blk_inst); } @@ -239,17 +229,15 @@ static inline u32 *kbase_hwcnt_dump_buffer_narrow_block_instance( * * Return: 0 on success, else error code. */ -int kbase_hwcnt_gpu_metadata_narrow_create( - const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, - const struct kbase_hwcnt_metadata *src_md); +int kbase_hwcnt_gpu_metadata_narrow_create(const struct kbase_hwcnt_metadata_narrow **dst_md_narrow, + const struct kbase_hwcnt_metadata *src_md); /** * kbase_hwcnt_gpu_metadata_narrow_destroy() - Destroy a hardware counter narrow * metadata object. * @md_narrow: Pointer to hardware counter narrow metadata. */ -void kbase_hwcnt_gpu_metadata_narrow_destroy( - const struct kbase_hwcnt_metadata_narrow *md_narrow); +void kbase_hwcnt_gpu_metadata_narrow_destroy(const struct kbase_hwcnt_metadata_narrow *md_narrow); /** * kbase_hwcnt_dump_buffer_narrow_alloc() - Allocate a narrow dump buffer. @@ -260,9 +248,8 @@ void kbase_hwcnt_gpu_metadata_narrow_destroy( * * Return: 0 on success, else error code. 
*/ -int kbase_hwcnt_dump_buffer_narrow_alloc( - const struct kbase_hwcnt_metadata_narrow *md_narrow, - struct kbase_hwcnt_dump_buffer_narrow *dump_buf); +int kbase_hwcnt_dump_buffer_narrow_alloc(const struct kbase_hwcnt_metadata_narrow *md_narrow, + struct kbase_hwcnt_dump_buffer_narrow *dump_buf); /** * kbase_hwcnt_dump_buffer_narrow_free() - Free a narrow dump buffer. @@ -271,8 +258,7 @@ int kbase_hwcnt_dump_buffer_narrow_alloc( * Can be safely called on an all-zeroed narrow dump buffer structure, or on an * already freed narrow dump buffer. */ -void kbase_hwcnt_dump_buffer_narrow_free( - struct kbase_hwcnt_dump_buffer_narrow *dump_buf); +void kbase_hwcnt_dump_buffer_narrow_free(struct kbase_hwcnt_dump_buffer_narrow *dump_buf); /** * kbase_hwcnt_dump_buffer_narrow_array_alloc() - Allocate an array of narrow @@ -320,10 +306,8 @@ void kbase_hwcnt_dump_buffer_narrow_array_free( * source value is bigger than U32_MAX, or copy the value from source if the * corresponding source value is less than or equal to U32_MAX. */ -void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, - const u64 *src_blk, - const u64 *blk_em, - size_t val_cnt); +void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, const u64 *src_blk, + const u64 *blk_em, size_t val_cnt); /** * kbase_hwcnt_dump_buffer_copy_strict_narrow() - Copy all enabled values to a @@ -339,9 +323,8 @@ void kbase_hwcnt_dump_buffer_block_copy_strict_narrow(u32 *dst_blk, * corresponding source value is bigger than U32_MAX, or copy the value from * source if the corresponding source value is less than or equal to U32_MAX. */ -void kbase_hwcnt_dump_buffer_copy_strict_narrow( - struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map); +void kbase_hwcnt_dump_buffer_copy_strict_narrow(struct kbase_hwcnt_dump_buffer_narrow *dst_narrow, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); #endif /* _KBASE_HWCNT_GPU_NARROW_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_types.c b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.c similarity index 56% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_types.c rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.c index d925ed744d3d..763eb315d9a2 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_types.c +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,13 +19,12 @@ * */ -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include -int kbase_hwcnt_metadata_create( - const struct kbase_hwcnt_description *desc, - const struct kbase_hwcnt_metadata **out_metadata) +int kbase_hwcnt_metadata_create(const struct kbase_hwcnt_description *desc, + const struct kbase_hwcnt_metadata **out_metadata) { char *buf; struct kbase_hwcnt_metadata *metadata; @@ -56,8 +55,7 @@ int kbase_hwcnt_metadata_create( /* Block metadata */ for (grp = 0; grp < desc->grp_cnt; grp++) { - size += sizeof(struct kbase_hwcnt_block_metadata) * - desc->grps[grp].blk_cnt; + size += sizeof(struct kbase_hwcnt_block_metadata) * desc->grps[grp].blk_cnt; } /* Single allocation for the entire metadata */ @@ -83,8 +81,7 @@ int kbase_hwcnt_metadata_create( for (grp = 0; grp < desc->grp_cnt; grp++) { size_t blk; - const struct kbase_hwcnt_group_description *grp_desc = - desc->grps + grp; + const struct kbase_hwcnt_group_description *grp_desc = desc->grps + grp; struct kbase_hwcnt_group_metadata *grp_md = grp_mds + grp; size_t group_enable_map_count = 0; @@ -94,37 +91,28 @@ int kbase_hwcnt_metadata_create( /* Bump allocate this group's block metadata */ struct kbase_hwcnt_block_metadata *blk_mds = (struct kbase_hwcnt_block_metadata *)(buf + offset); - offset += sizeof(struct kbase_hwcnt_block_metadata) * - grp_desc->blk_cnt; + offset += sizeof(struct kbase_hwcnt_block_metadata) * grp_desc->blk_cnt; /* Fill in each block in the group's information */ for (blk = 0; blk < grp_desc->blk_cnt; blk++) { - const struct kbase_hwcnt_block_description *blk_desc = - grp_desc->blks + blk; - struct kbase_hwcnt_block_metadata *blk_md = - blk_mds + blk; - const size_t n_values = - blk_desc->hdr_cnt + blk_desc->ctr_cnt; + const struct kbase_hwcnt_block_description *blk_desc = grp_desc->blks + blk; + struct kbase_hwcnt_block_metadata *blk_md = blk_mds + blk; + const size_t n_values = blk_desc->hdr_cnt + blk_desc->ctr_cnt; blk_md->type = blk_desc->type; blk_md->inst_cnt = blk_desc->inst_cnt; blk_md->hdr_cnt = blk_desc->hdr_cnt; blk_md->ctr_cnt = blk_desc->ctr_cnt; blk_md->enable_map_index = group_enable_map_count; - blk_md->enable_map_stride = - kbase_hwcnt_bitfield_count(n_values); + blk_md->enable_map_stride = kbase_hwcnt_bitfield_count(n_values); blk_md->dump_buf_index = group_dump_buffer_count; - blk_md->dump_buf_stride = - KBASE_HWCNT_ALIGN_UPWARDS( - n_values, - (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / - KBASE_HWCNT_VALUE_BYTES)); + blk_md->dump_buf_stride = KBASE_HWCNT_ALIGN_UPWARDS( + n_values, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES)); blk_md->avail_mask_index = group_avail_mask_bits; - group_enable_map_count += - blk_md->enable_map_stride * blk_md->inst_cnt; - group_dump_buffer_count += - blk_md->dump_buf_stride * blk_md->inst_cnt; + group_enable_map_count += blk_md->enable_map_stride * blk_md->inst_cnt; + group_dump_buffer_count += blk_md->dump_buf_stride * blk_md->inst_cnt; group_avail_mask_bits += blk_md->inst_cnt; } @@ -144,8 +132,7 @@ int kbase_hwcnt_metadata_create( /* Fill in the top level metadata's information */ metadata->grp_cnt = desc->grp_cnt; metadata->grp_metadata = grp_mds; - metadata->enable_map_bytes = - enable_map_count * KBASE_HWCNT_BITFIELD_BYTES; + metadata->enable_map_bytes = enable_map_count * KBASE_HWCNT_BITFIELD_BYTES; metadata->dump_buf_bytes = dump_buf_count * 
KBASE_HWCNT_VALUE_BYTES; metadata->avail_mask = desc->avail_mask; metadata->clk_cnt = desc->clk_cnt; @@ -155,8 +142,7 @@ int kbase_hwcnt_metadata_create( * bit per 4 bytes in the dump buffer. */ WARN_ON(metadata->dump_buf_bytes != - (metadata->enable_map_bytes * - BITS_PER_BYTE * KBASE_HWCNT_VALUE_BYTES)); + (metadata->enable_map_bytes * BITS_PER_BYTE * KBASE_HWCNT_VALUE_BYTES)); *out_metadata = metadata; return 0; @@ -167,9 +153,8 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata) kfree(metadata); } -int kbase_hwcnt_enable_map_alloc( - const struct kbase_hwcnt_metadata *metadata, - struct kbase_hwcnt_enable_map *enable_map) +int kbase_hwcnt_enable_map_alloc(const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_enable_map *enable_map) { u64 *enable_map_buf; @@ -177,8 +162,7 @@ int kbase_hwcnt_enable_map_alloc( return -EINVAL; if (metadata->enable_map_bytes > 0) { - enable_map_buf = - kzalloc(metadata->enable_map_bytes, GFP_KERNEL); + enable_map_buf = kzalloc(metadata->enable_map_bytes, GFP_KERNEL); if (!enable_map_buf) return -ENOMEM; } else { @@ -200,9 +184,8 @@ void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map) enable_map->metadata = NULL; } -int kbase_hwcnt_dump_buffer_alloc( - const struct kbase_hwcnt_metadata *metadata, - struct kbase_hwcnt_dump_buffer *dump_buf) +int kbase_hwcnt_dump_buffer_alloc(const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_dump_buffer *dump_buf) { size_t dump_buf_bytes; size_t clk_cnt_buf_bytes; @@ -235,10 +218,8 @@ void kbase_hwcnt_dump_buffer_free(struct kbase_hwcnt_dump_buffer *dump_buf) memset(dump_buf, 0, sizeof(*dump_buf)); } -int kbase_hwcnt_dump_buffer_array_alloc( - const struct kbase_hwcnt_metadata *metadata, - size_t n, - struct kbase_hwcnt_dump_buffer_array *dump_bufs) +int kbase_hwcnt_dump_buffer_array_alloc(const struct kbase_hwcnt_metadata *metadata, size_t n, + struct kbase_hwcnt_dump_buffer_array *dump_bufs) { struct kbase_hwcnt_dump_buffer *buffers; size_t buf_idx; @@ -251,8 +232,7 @@ int kbase_hwcnt_dump_buffer_array_alloc( return -EINVAL; dump_buf_bytes = metadata->dump_buf_bytes; - clk_cnt_buf_bytes = - sizeof(*dump_bufs->bufs->clk_cnt_buf) * metadata->clk_cnt; + clk_cnt_buf_bytes = sizeof(*dump_bufs->bufs->clk_cnt_buf) * metadata->clk_cnt; /* Allocate memory for the dump buffer struct array */ buffers = kmalloc_array(n, sizeof(*buffers), GFP_KERNEL); @@ -283,15 +263,13 @@ int kbase_hwcnt_dump_buffer_array_alloc( buffers[buf_idx].metadata = metadata; buffers[buf_idx].dump_buf = (u64 *)(addr + dump_buf_offset); - buffers[buf_idx].clk_cnt_buf = - (u64 *)(addr + clk_cnt_buf_offset); + buffers[buf_idx].clk_cnt_buf = (u64 *)(addr + clk_cnt_buf_offset); } return 0; } -void kbase_hwcnt_dump_buffer_array_free( - struct kbase_hwcnt_dump_buffer_array *dump_bufs) +void kbase_hwcnt_dump_buffer_array_free(struct kbase_hwcnt_dump_buffer_array *dump_bufs) { if (!dump_bufs) return; @@ -301,84 +279,71 @@ void kbase_hwcnt_dump_buffer_array_free( memset(dump_bufs, 0, sizeof(*dump_bufs)); } -void kbase_hwcnt_dump_buffer_zero( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_enable_map *dst_enable_map) +void kbase_hwcnt_dump_buffer_zero(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; - if (WARN_ON(!dst) || - WARN_ON(!dst_enable_map) || + if (WARN_ON(!dst) || WARN_ON(!dst_enable_map) || WARN_ON(dst->metadata != dst_enable_map->metadata)) return; 
metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { u64 *dst_blk; size_t val_cnt; - if (!kbase_hwcnt_enable_map_block_enabled( - dst_enable_map, grp, blk, blk_inst)) + if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) continue; - dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - val_cnt = kbase_hwcnt_metadata_block_values_count( - metadata, grp, blk); + dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); + val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk); kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt); } - memset(dst->clk_cnt_buf, 0, - sizeof(*dst->clk_cnt_buf) * metadata->clk_cnt); + memset(dst->clk_cnt_buf, 0, sizeof(*dst->clk_cnt_buf) * metadata->clk_cnt); } -void kbase_hwcnt_dump_buffer_zero_strict( - struct kbase_hwcnt_dump_buffer *dst) +void kbase_hwcnt_dump_buffer_zero_strict(struct kbase_hwcnt_dump_buffer *dst) { if (WARN_ON(!dst)) return; memset(dst->dump_buf, 0, dst->metadata->dump_buf_bytes); - memset(dst->clk_cnt_buf, 0, - sizeof(*dst->clk_cnt_buf) * dst->metadata->clk_cnt); + memset(dst->clk_cnt_buf, 0, sizeof(*dst->clk_cnt_buf) * dst->metadata->clk_cnt); } -void kbase_hwcnt_dump_buffer_zero_non_enabled( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_enable_map *dst_enable_map) +void kbase_hwcnt_dump_buffer_zero_non_enabled(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; - if (WARN_ON(!dst) || - WARN_ON(!dst_enable_map) || + if (WARN_ON(!dst) || WARN_ON(!dst_enable_map) || WARN_ON(dst->metadata != dst_enable_map->metadata)) return; metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( - dst_enable_map, grp, blk, blk_inst); - size_t val_cnt = kbase_hwcnt_metadata_block_values_count( - metadata, grp, blk); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); + const u64 *blk_em = + kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst); + size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk); /* Align upwards to include padding bytes */ - val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(val_cnt, - (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / - KBASE_HWCNT_VALUE_BYTES)); + val_cnt = KBASE_HWCNT_ALIGN_UPWARDS( + val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES)); - if (kbase_hwcnt_metadata_block_instance_avail( - metadata, grp, blk, blk_inst)) { + if (kbase_hwcnt_metadata_block_instance_avail(metadata, grp, blk, blk_inst)) { /* Block available, so only zero non-enabled values */ - kbase_hwcnt_dump_buffer_block_zero_non_enabled( - dst_blk, blk_em, val_cnt); + kbase_hwcnt_dump_buffer_block_zero_non_enabled(dst_blk, blk_em, val_cnt); } else { /* Block not available, so zero the entire thing */ kbase_hwcnt_dump_buffer_block_zero(dst_blk, val_cnt); @@ -386,188 +351,159 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled( } } -void kbase_hwcnt_dump_buffer_copy( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map) +void 
kbase_hwcnt_dump_buffer_copy(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; size_t clk; - if (WARN_ON(!dst) || - WARN_ON(!src) || - WARN_ON(!dst_enable_map) || - WARN_ON(dst == src) || + if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) || WARN_ON(dst->metadata != src->metadata) || WARN_ON(dst->metadata != dst_enable_map->metadata)) return; metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { u64 *dst_blk; const u64 *src_blk; size_t val_cnt; - if (!kbase_hwcnt_enable_map_block_enabled( - dst_enable_map, grp, blk, blk_inst)) + if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) continue; - dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - src_blk = kbase_hwcnt_dump_buffer_block_instance( - src, grp, blk, blk_inst); - val_cnt = kbase_hwcnt_metadata_block_values_count( - metadata, grp, blk); + dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); + src_blk = kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst); + val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk); kbase_hwcnt_dump_buffer_block_copy(dst_blk, src_blk, val_cnt); } - kbase_hwcnt_metadata_for_each_clock(metadata, clk) { - if (kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk)) + kbase_hwcnt_metadata_for_each_clock(metadata, clk) + { + if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk)) dst->clk_cnt_buf[clk] = src->clk_cnt_buf[clk]; } } -void kbase_hwcnt_dump_buffer_copy_strict( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map) +void kbase_hwcnt_dump_buffer_copy_strict(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; size_t clk; - if (WARN_ON(!dst) || - WARN_ON(!src) || - WARN_ON(!dst_enable_map) || - WARN_ON(dst == src) || + if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) || WARN_ON(dst->metadata != src->metadata) || WARN_ON(dst->metadata != dst_enable_map->metadata)) return; metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance( - src, grp, blk, blk_inst); - const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( - dst_enable_map, grp, blk, blk_inst); - size_t val_cnt = kbase_hwcnt_metadata_block_values_count( - metadata, grp, blk); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); + const u64 *src_blk = + kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst); + const u64 *blk_em = + kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst); + size_t val_cnt = kbase_hwcnt_metadata_block_values_count(metadata, grp, blk); /* Align upwards to include padding bytes */ - val_cnt = KBASE_HWCNT_ALIGN_UPWARDS(val_cnt, - (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / - 
KBASE_HWCNT_VALUE_BYTES)); + val_cnt = KBASE_HWCNT_ALIGN_UPWARDS( + val_cnt, (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES)); - kbase_hwcnt_dump_buffer_block_copy_strict( - dst_blk, src_blk, blk_em, val_cnt); + kbase_hwcnt_dump_buffer_block_copy_strict(dst_blk, src_blk, blk_em, val_cnt); } - kbase_hwcnt_metadata_for_each_clock(metadata, clk) { + kbase_hwcnt_metadata_for_each_clock(metadata, clk) + { bool clk_enabled = - kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk); + kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk); dst->clk_cnt_buf[clk] = clk_enabled ? src->clk_cnt_buf[clk] : 0; } } -void kbase_hwcnt_dump_buffer_accumulate( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map) +void kbase_hwcnt_dump_buffer_accumulate(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; size_t clk; - if (WARN_ON(!dst) || - WARN_ON(!src) || - WARN_ON(!dst_enable_map) || - WARN_ON(dst == src) || + if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) || WARN_ON(dst->metadata != src->metadata) || WARN_ON(dst->metadata != dst_enable_map->metadata)) return; metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { u64 *dst_blk; const u64 *src_blk; size_t hdr_cnt; size_t ctr_cnt; - if (!kbase_hwcnt_enable_map_block_enabled( - dst_enable_map, grp, blk, blk_inst)) + if (!kbase_hwcnt_enable_map_block_enabled(dst_enable_map, grp, blk, blk_inst)) continue; - dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - src_blk = kbase_hwcnt_dump_buffer_block_instance( - src, grp, blk, blk_inst); - hdr_cnt = kbase_hwcnt_metadata_block_headers_count( - metadata, grp, blk); - ctr_cnt = kbase_hwcnt_metadata_block_counters_count( - metadata, grp, blk); + dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); + src_blk = kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst); + hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk); + ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk); - kbase_hwcnt_dump_buffer_block_accumulate( - dst_blk, src_blk, hdr_cnt, ctr_cnt); + kbase_hwcnt_dump_buffer_block_accumulate(dst_blk, src_blk, hdr_cnt, ctr_cnt); } - kbase_hwcnt_metadata_for_each_clock(metadata, clk) { - if (kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk)) + kbase_hwcnt_metadata_for_each_clock(metadata, clk) + { + if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk)) dst->clk_cnt_buf[clk] += src->clk_cnt_buf[clk]; } } -void kbase_hwcnt_dump_buffer_accumulate_strict( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map) +void kbase_hwcnt_dump_buffer_accumulate_strict(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map) { const struct kbase_hwcnt_metadata *metadata; size_t grp, blk, blk_inst; size_t clk; - if (WARN_ON(!dst) || - WARN_ON(!src) || - WARN_ON(!dst_enable_map) || - WARN_ON(dst == src) || + if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst_enable_map) || WARN_ON(dst == src) 
|| WARN_ON(dst->metadata != src->metadata) || WARN_ON(dst->metadata != dst_enable_map->metadata)) return; metadata = dst->metadata; - kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) { - u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance( - dst, grp, blk, blk_inst); - const u64 *src_blk = kbase_hwcnt_dump_buffer_block_instance( - src, grp, blk, blk_inst); - const u64 *blk_em = kbase_hwcnt_enable_map_block_instance( - dst_enable_map, grp, blk, blk_inst); - size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count( - metadata, grp, blk); - size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count( - metadata, grp, blk); + kbase_hwcnt_metadata_for_each_block(metadata, grp, blk, blk_inst) + { + u64 *dst_blk = kbase_hwcnt_dump_buffer_block_instance(dst, grp, blk, blk_inst); + const u64 *src_blk = + kbase_hwcnt_dump_buffer_block_instance(src, grp, blk, blk_inst); + const u64 *blk_em = + kbase_hwcnt_enable_map_block_instance(dst_enable_map, grp, blk, blk_inst); + size_t hdr_cnt = kbase_hwcnt_metadata_block_headers_count(metadata, grp, blk); + size_t ctr_cnt = kbase_hwcnt_metadata_block_counters_count(metadata, grp, blk); /* Align upwards to include padding bytes */ - ctr_cnt = KBASE_HWCNT_ALIGN_UPWARDS(hdr_cnt + ctr_cnt, - (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / - KBASE_HWCNT_VALUE_BYTES) - hdr_cnt); + ctr_cnt = KBASE_HWCNT_ALIGN_UPWARDS( + hdr_cnt + ctr_cnt, + (KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT / KBASE_HWCNT_VALUE_BYTES) - hdr_cnt); - kbase_hwcnt_dump_buffer_block_accumulate_strict( - dst_blk, src_blk, blk_em, hdr_cnt, ctr_cnt); + kbase_hwcnt_dump_buffer_block_accumulate_strict(dst_blk, src_blk, blk_em, hdr_cnt, + ctr_cnt); } - kbase_hwcnt_metadata_for_each_clock(metadata, clk) { - if (kbase_hwcnt_clk_enable_map_enabled( - dst_enable_map->clk_enable_map, clk)) + kbase_hwcnt_metadata_for_each_clock(metadata, clk) + { + if (kbase_hwcnt_clk_enable_map_enabled(dst_enable_map->clk_enable_map, clk)) dst->clk_cnt_buf[clk] += src->clk_cnt_buf[clk]; else dst->clk_cnt_buf[clk] = 0; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_types.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.h similarity index 84% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_types.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.h index 9397840146b4..5c5ada401768 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_types.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_types.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -104,8 +104,7 @@ #define KBASE_HWCNT_AVAIL_MASK_BITS (sizeof(u64) * BITS_PER_BYTE) /* Minimum alignment of each block of hardware counters */ -#define KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT \ - (KBASE_HWCNT_BITFIELD_BITS * KBASE_HWCNT_VALUE_BYTES) +#define KBASE_HWCNT_BLOCK_BYTE_ALIGNMENT (KBASE_HWCNT_BITFIELD_BITS * KBASE_HWCNT_VALUE_BYTES) /** * KBASE_HWCNT_ALIGN_UPWARDS() - Calculate next aligned value. @@ -115,7 +114,7 @@ * Return: Input value if already aligned to the specified boundary, or next * (incrementing upwards) aligned value. 
*/ -#define KBASE_HWCNT_ALIGN_UPWARDS(value, alignment) \ +#define KBASE_HWCNT_ALIGN_UPWARDS(value, alignment) \ (value + ((alignment - (value % alignment)) % alignment)) /** @@ -307,9 +306,8 @@ struct kbase_hwcnt_dump_buffer_array { * * Return: 0 on success, else error code. */ -int kbase_hwcnt_metadata_create( - const struct kbase_hwcnt_description *desc, - const struct kbase_hwcnt_metadata **metadata); +int kbase_hwcnt_metadata_create(const struct kbase_hwcnt_description *desc, + const struct kbase_hwcnt_metadata **metadata); /** * kbase_hwcnt_metadata_destroy() - Destroy a hardware counter metadata object. @@ -323,8 +321,7 @@ void kbase_hwcnt_metadata_destroy(const struct kbase_hwcnt_metadata *metadata); * * Return: Number of hardware counter groups described by metadata. */ -static inline size_t -kbase_hwcnt_metadata_group_count(const struct kbase_hwcnt_metadata *metadata) +static inline size_t kbase_hwcnt_metadata_group_count(const struct kbase_hwcnt_metadata *metadata) { if (WARN_ON(!metadata)) return 0; @@ -339,9 +336,8 @@ kbase_hwcnt_metadata_group_count(const struct kbase_hwcnt_metadata *metadata) * * Return: Type of the group grp. */ -static inline u64 -kbase_hwcnt_metadata_group_type(const struct kbase_hwcnt_metadata *metadata, - size_t grp) +static inline u64 kbase_hwcnt_metadata_group_type(const struct kbase_hwcnt_metadata *metadata, + size_t grp) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt)) return 0; @@ -356,9 +352,8 @@ kbase_hwcnt_metadata_group_type(const struct kbase_hwcnt_metadata *metadata, * * Return: Number of blocks in group grp. */ -static inline size_t -kbase_hwcnt_metadata_block_count(const struct kbase_hwcnt_metadata *metadata, - size_t grp) +static inline size_t kbase_hwcnt_metadata_block_count(const struct kbase_hwcnt_metadata *metadata, + size_t grp) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt)) return 0; @@ -374,9 +369,8 @@ kbase_hwcnt_metadata_block_count(const struct kbase_hwcnt_metadata *metadata, * * Return: Type of the block blk in group grp. */ -static inline u64 -kbase_hwcnt_metadata_block_type(const struct kbase_hwcnt_metadata *metadata, - size_t grp, size_t blk) +static inline u64 kbase_hwcnt_metadata_block_type(const struct kbase_hwcnt_metadata *metadata, + size_t grp, size_t blk) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) @@ -394,8 +388,9 @@ kbase_hwcnt_metadata_block_type(const struct kbase_hwcnt_metadata *metadata, * * Return: Number of instances of block blk in group grp. */ -static inline size_t kbase_hwcnt_metadata_block_instance_count( - const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +static inline size_t +kbase_hwcnt_metadata_block_instance_count(const struct kbase_hwcnt_metadata *metadata, size_t grp, + size_t blk) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) @@ -413,8 +408,9 @@ static inline size_t kbase_hwcnt_metadata_block_instance_count( * * Return: Number of counter headers in each instance of block blk in group grp. 
*/ -static inline size_t kbase_hwcnt_metadata_block_headers_count( - const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +static inline size_t +kbase_hwcnt_metadata_block_headers_count(const struct kbase_hwcnt_metadata *metadata, size_t grp, + size_t blk) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) @@ -431,8 +427,9 @@ static inline size_t kbase_hwcnt_metadata_block_headers_count( * * Return: Number of counters in each instance of block blk in group grp. */ -static inline size_t kbase_hwcnt_metadata_block_counters_count( - const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +static inline size_t +kbase_hwcnt_metadata_block_counters_count(const struct kbase_hwcnt_metadata *metadata, size_t grp, + size_t blk) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) @@ -449,8 +446,9 @@ static inline size_t kbase_hwcnt_metadata_block_counters_count( * * Return: enable map stride in each instance of block blk in group grp. */ -static inline size_t kbase_hwcnt_metadata_block_enable_map_stride( - const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +static inline size_t +kbase_hwcnt_metadata_block_enable_map_stride(const struct kbase_hwcnt_metadata *metadata, + size_t grp, size_t blk) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) @@ -468,8 +466,9 @@ static inline size_t kbase_hwcnt_metadata_block_enable_map_stride( * Return: Number of headers plus counters in each instance of block blk * in group grp. */ -static inline size_t kbase_hwcnt_metadata_block_values_count( - const struct kbase_hwcnt_metadata *metadata, size_t grp, size_t blk) +static inline size_t +kbase_hwcnt_metadata_block_values_count(const struct kbase_hwcnt_metadata *metadata, size_t grp, + size_t blk) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) @@ -490,10 +489,13 @@ static inline size_t kbase_hwcnt_metadata_block_values_count( * Iteration order is group, then block, then block instance (i.e. linearly * through memory). */ -#define kbase_hwcnt_metadata_for_each_block(md, grp, blk, blk_inst) \ - for ((grp) = 0; (grp) < kbase_hwcnt_metadata_group_count((md)); (grp)++) \ - for ((blk) = 0; (blk) < kbase_hwcnt_metadata_block_count((md), (grp)); (blk)++) \ - for ((blk_inst) = 0; (blk_inst) < kbase_hwcnt_metadata_block_instance_count((md), (grp), (blk)); (blk_inst)++) +#define kbase_hwcnt_metadata_for_each_block(md, grp, blk, blk_inst) \ + for ((grp) = 0; (grp) < kbase_hwcnt_metadata_group_count((md)); (grp)++) \ + for ((blk) = 0; (blk) < kbase_hwcnt_metadata_block_count((md), (grp)); (blk)++) \ + for ((blk_inst) = 0; \ + (blk_inst) < \ + kbase_hwcnt_metadata_block_instance_count((md), (grp), (blk)); \ + (blk_inst)++) /** * kbase_hwcnt_metadata_block_avail_bit() - Get the bit index into the avail @@ -504,10 +506,9 @@ static inline size_t kbase_hwcnt_metadata_block_values_count( * * Return: The bit index into the avail mask for the block. 
*/ -static inline size_t kbase_hwcnt_metadata_block_avail_bit( - const struct kbase_hwcnt_metadata *metadata, - size_t grp, - size_t blk) +static inline size_t +kbase_hwcnt_metadata_block_avail_bit(const struct kbase_hwcnt_metadata *metadata, size_t grp, + size_t blk) { if (WARN_ON(!metadata) || WARN_ON(grp >= metadata->grp_cnt) || WARN_ON(blk >= metadata->grp_metadata[grp].blk_cnt)) @@ -527,11 +528,9 @@ static inline size_t kbase_hwcnt_metadata_block_avail_bit( * * Return: true if the block instance is available, else false. */ -static inline bool kbase_hwcnt_metadata_block_instance_avail( - const struct kbase_hwcnt_metadata *metadata, - size_t grp, - size_t blk, - size_t blk_inst) +static inline bool +kbase_hwcnt_metadata_block_instance_avail(const struct kbase_hwcnt_metadata *metadata, size_t grp, + size_t blk, size_t blk_inst) { size_t bit; u64 mask; @@ -553,9 +552,8 @@ static inline bool kbase_hwcnt_metadata_block_instance_avail( * * Return: 0 on success, else error code. */ -int kbase_hwcnt_enable_map_alloc( - const struct kbase_hwcnt_metadata *metadata, - struct kbase_hwcnt_enable_map *enable_map); +int kbase_hwcnt_enable_map_alloc(const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_enable_map *enable_map); /** * kbase_hwcnt_enable_map_free() - Free an enable map. @@ -577,9 +575,8 @@ void kbase_hwcnt_enable_map_free(struct kbase_hwcnt_enable_map *enable_map); * Return: u64* to the bitfield(s) used as the enable map for the * block instance. */ -static inline u64 * -kbase_hwcnt_enable_map_block_instance(const struct kbase_hwcnt_enable_map *map, - size_t grp, size_t blk, size_t blk_inst) +static inline u64 *kbase_hwcnt_enable_map_block_instance(const struct kbase_hwcnt_enable_map *map, + size_t grp, size_t blk, size_t blk_inst) { if (WARN_ON(!map) || WARN_ON(!map->hwcnt_enable_map)) return NULL; @@ -589,15 +586,9 @@ kbase_hwcnt_enable_map_block_instance(const struct kbase_hwcnt_enable_map *map, WARN_ON(blk_inst >= map->metadata->grp_metadata[grp].blk_metadata[blk].inst_cnt)) return map->hwcnt_enable_map; - return map->hwcnt_enable_map + - map->metadata->grp_metadata[grp].enable_map_index + - map->metadata->grp_metadata[grp] - .blk_metadata[blk] - .enable_map_index + - (map->metadata->grp_metadata[grp] - .blk_metadata[blk] - .enable_map_stride * - blk_inst); + return map->hwcnt_enable_map + map->metadata->grp_metadata[grp].enable_map_index + + map->metadata->grp_metadata[grp].blk_metadata[blk].enable_map_index + + (map->metadata->grp_metadata[grp].blk_metadata[blk].enable_map_stride * blk_inst); } /** @@ -609,8 +600,7 @@ kbase_hwcnt_enable_map_block_instance(const struct kbase_hwcnt_enable_map *map, */ static inline size_t kbase_hwcnt_bitfield_count(size_t val_cnt) { - return (val_cnt + KBASE_HWCNT_BITFIELD_BITS - 1) / - KBASE_HWCNT_BITFIELD_BITS; + return (val_cnt + KBASE_HWCNT_BITFIELD_BITS - 1) / KBASE_HWCNT_BITFIELD_BITS; } /** @@ -620,11 +610,8 @@ static inline size_t kbase_hwcnt_bitfield_count(size_t val_cnt) * @blk: Index of the block in the group. * @blk_inst: Index of the block instance in the block. 
*/ -static inline void kbase_hwcnt_enable_map_block_disable_all( - struct kbase_hwcnt_enable_map *dst, - size_t grp, - size_t blk, - size_t blk_inst) +static inline void kbase_hwcnt_enable_map_block_disable_all(struct kbase_hwcnt_enable_map *dst, + size_t grp, size_t blk, size_t blk_inst) { size_t val_cnt; size_t bitfld_cnt; @@ -644,15 +631,13 @@ static inline void kbase_hwcnt_enable_map_block_disable_all( * kbase_hwcnt_enable_map_disable_all() - Disable all values in the enable map. * @dst: Non-NULL pointer to enable map to zero. */ -static inline void kbase_hwcnt_enable_map_disable_all( - struct kbase_hwcnt_enable_map *dst) +static inline void kbase_hwcnt_enable_map_disable_all(struct kbase_hwcnt_enable_map *dst) { if (WARN_ON(!dst) || WARN_ON(!dst->metadata)) return; if (dst->hwcnt_enable_map != NULL) - memset(dst->hwcnt_enable_map, 0, - dst->metadata->enable_map_bytes); + memset(dst->hwcnt_enable_map, 0, dst->metadata->enable_map_bytes); dst->clk_enable_map = 0; } @@ -664,11 +649,8 @@ static inline void kbase_hwcnt_enable_map_disable_all( * @blk: Index of the block in the group. * @blk_inst: Index of the block instance in the block. */ -static inline void kbase_hwcnt_enable_map_block_enable_all( - struct kbase_hwcnt_enable_map *dst, - size_t grp, - size_t blk, - size_t blk_inst) +static inline void kbase_hwcnt_enable_map_block_enable_all(struct kbase_hwcnt_enable_map *dst, + size_t grp, size_t blk, size_t blk_inst) { size_t val_cnt; size_t bitfld_cnt; @@ -683,8 +665,7 @@ static inline void kbase_hwcnt_enable_map_block_enable_all( bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt); for (bitfld_idx = 0; bitfld_idx < bitfld_cnt; bitfld_idx++) { - const u64 remaining_values = val_cnt - - (bitfld_idx * KBASE_HWCNT_BITFIELD_BITS); + const u64 remaining_values = val_cnt - (bitfld_idx * KBASE_HWCNT_BITFIELD_BITS); u64 block_enable_map_mask = U64_MAX; if (remaining_values < KBASE_HWCNT_BITFIELD_BITS) @@ -699,8 +680,7 @@ static inline void kbase_hwcnt_enable_map_block_enable_all( * map. * @dst: Non-NULL pointer to enable map. */ -static inline void kbase_hwcnt_enable_map_enable_all( - struct kbase_hwcnt_enable_map *dst) +static inline void kbase_hwcnt_enable_map_enable_all(struct kbase_hwcnt_enable_map *dst) { size_t grp, blk, blk_inst; @@ -708,8 +688,7 @@ static inline void kbase_hwcnt_enable_map_enable_all( return; kbase_hwcnt_metadata_for_each_block(dst->metadata, grp, blk, blk_inst) - kbase_hwcnt_enable_map_block_enable_all( - dst, grp, blk, blk_inst); + kbase_hwcnt_enable_map_block_enable_all(dst, grp, blk, blk_inst); dst->clk_enable_map = (1ull << dst->metadata->clk_cnt) - 1; } @@ -721,9 +700,8 @@ static inline void kbase_hwcnt_enable_map_enable_all( * * The dst and src MUST have been created from the same metadata. */ -static inline void kbase_hwcnt_enable_map_copy( - struct kbase_hwcnt_enable_map *dst, - const struct kbase_hwcnt_enable_map *src) +static inline void kbase_hwcnt_enable_map_copy(struct kbase_hwcnt_enable_map *dst, + const struct kbase_hwcnt_enable_map *src) { if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst->metadata) || WARN_ON(dst->metadata != src->metadata)) @@ -733,8 +711,7 @@ static inline void kbase_hwcnt_enable_map_copy( if (WARN_ON(!src->hwcnt_enable_map)) return; - memcpy(dst->hwcnt_enable_map, - src->hwcnt_enable_map, + memcpy(dst->hwcnt_enable_map, src->hwcnt_enable_map, dst->metadata->enable_map_bytes); } @@ -748,9 +725,8 @@ static inline void kbase_hwcnt_enable_map_copy( * * The dst and src MUST have been created from the same metadata. 
*/ -static inline void kbase_hwcnt_enable_map_union( - struct kbase_hwcnt_enable_map *dst, - const struct kbase_hwcnt_enable_map *src) +static inline void kbase_hwcnt_enable_map_union(struct kbase_hwcnt_enable_map *dst, + const struct kbase_hwcnt_enable_map *src) { if (WARN_ON(!dst) || WARN_ON(!src) || WARN_ON(!dst->metadata) || WARN_ON(dst->metadata != src->metadata)) @@ -781,11 +757,9 @@ static inline void kbase_hwcnt_enable_map_union( * * Return: true if any values in the block are enabled, else false. */ -static inline bool kbase_hwcnt_enable_map_block_enabled( - const struct kbase_hwcnt_enable_map *enable_map, - size_t grp, - size_t blk, - size_t blk_inst) +static inline bool +kbase_hwcnt_enable_map_block_enabled(const struct kbase_hwcnt_enable_map *enable_map, size_t grp, + size_t blk, size_t blk_inst) { bool any_enabled = false; size_t val_cnt; @@ -801,15 +775,13 @@ static inline bool kbase_hwcnt_enable_map_block_enabled( bitfld_cnt = kbase_hwcnt_bitfield_count(val_cnt); for (bitfld_idx = 0; bitfld_idx < bitfld_cnt; bitfld_idx++) { - const u64 remaining_values = val_cnt - - (bitfld_idx * KBASE_HWCNT_BITFIELD_BITS); + const u64 remaining_values = val_cnt - (bitfld_idx * KBASE_HWCNT_BITFIELD_BITS); u64 block_enable_map_mask = U64_MAX; if (remaining_values < KBASE_HWCNT_BITFIELD_BITS) block_enable_map_mask = (1ull << remaining_values) - 1; - any_enabled = any_enabled || - (block_enable_map[bitfld_idx] & block_enable_map_mask); + any_enabled = any_enabled || (block_enable_map[bitfld_idx] & block_enable_map_mask); } return any_enabled; @@ -821,8 +793,8 @@ static inline bool kbase_hwcnt_enable_map_block_enabled( * * Return: true if any values are enabled, else false. */ -static inline bool kbase_hwcnt_enable_map_any_enabled( - const struct kbase_hwcnt_enable_map *enable_map) +static inline bool +kbase_hwcnt_enable_map_any_enabled(const struct kbase_hwcnt_enable_map *enable_map) { size_t grp, blk, blk_inst; u64 clk_enable_map_mask; @@ -832,14 +804,12 @@ static inline bool kbase_hwcnt_enable_map_any_enabled( clk_enable_map_mask = (1ull << enable_map->metadata->clk_cnt) - 1; - if (enable_map->metadata->clk_cnt > 0 && - (enable_map->clk_enable_map & clk_enable_map_mask)) + if (enable_map->metadata->clk_cnt > 0 && (enable_map->clk_enable_map & clk_enable_map_mask)) return true; - kbase_hwcnt_metadata_for_each_block( - enable_map->metadata, grp, blk, blk_inst) { - if (kbase_hwcnt_enable_map_block_enabled( - enable_map, grp, blk, blk_inst)) + kbase_hwcnt_metadata_for_each_block(enable_map->metadata, grp, blk, blk_inst) + { + if (kbase_hwcnt_enable_map_block_enabled(enable_map, grp, blk, blk_inst)) return true; } @@ -855,9 +825,7 @@ static inline bool kbase_hwcnt_enable_map_any_enabled( * * Return: true if the value was enabled, else false. */ -static inline bool kbase_hwcnt_enable_map_block_value_enabled( - const u64 *bitfld, - size_t val_idx) +static inline bool kbase_hwcnt_enable_map_block_value_enabled(const u64 *bitfld, size_t val_idx) { const size_t idx = val_idx / KBASE_HWCNT_BITFIELD_BITS; const size_t bit = val_idx % KBASE_HWCNT_BITFIELD_BITS; @@ -873,9 +841,7 @@ static inline bool kbase_hwcnt_enable_map_block_value_enabled( * kbase_hwcnt_enable_map_block_instance. * @val_idx: Index of the value to enable in the block instance. 
*/ -static inline void kbase_hwcnt_enable_map_block_enable_value( - u64 *bitfld, - size_t val_idx) +static inline void kbase_hwcnt_enable_map_block_enable_value(u64 *bitfld, size_t val_idx) { const size_t idx = val_idx / KBASE_HWCNT_BITFIELD_BITS; const size_t bit = val_idx % KBASE_HWCNT_BITFIELD_BITS; @@ -891,9 +857,7 @@ static inline void kbase_hwcnt_enable_map_block_enable_value( * kbase_hwcnt_enable_map_block_instance. * @val_idx: Index of the value to disable in the block instance. */ -static inline void kbase_hwcnt_enable_map_block_disable_value( - u64 *bitfld, - size_t val_idx) +static inline void kbase_hwcnt_enable_map_block_disable_value(u64 *bitfld, size_t val_idx) { const size_t idx = val_idx / KBASE_HWCNT_BITFIELD_BITS; const size_t bit = val_idx % KBASE_HWCNT_BITFIELD_BITS; @@ -911,9 +875,8 @@ static inline void kbase_hwcnt_enable_map_block_disable_value( * * Return: 0 on success, else error code. */ -int kbase_hwcnt_dump_buffer_alloc( - const struct kbase_hwcnt_metadata *metadata, - struct kbase_hwcnt_dump_buffer *dump_buf); +int kbase_hwcnt_dump_buffer_alloc(const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_dump_buffer *dump_buf); /** * kbase_hwcnt_dump_buffer_free() - Free a dump buffer. @@ -936,10 +899,8 @@ void kbase_hwcnt_dump_buffer_free(struct kbase_hwcnt_dump_buffer *dump_buf); * * Return: 0 on success, else error code. */ -int kbase_hwcnt_dump_buffer_array_alloc( - const struct kbase_hwcnt_metadata *metadata, - size_t n, - struct kbase_hwcnt_dump_buffer_array *dump_bufs); +int kbase_hwcnt_dump_buffer_array_alloc(const struct kbase_hwcnt_metadata *metadata, size_t n, + struct kbase_hwcnt_dump_buffer_array *dump_bufs); /** * kbase_hwcnt_dump_buffer_array_free() - Free a dump buffer array. @@ -948,8 +909,7 @@ int kbase_hwcnt_dump_buffer_array_alloc( * Can be safely called on an all-zeroed dump buffer array structure, or on an * already freed dump buffer array. */ -void kbase_hwcnt_dump_buffer_array_free( - struct kbase_hwcnt_dump_buffer_array *dump_bufs); +void kbase_hwcnt_dump_buffer_array_free(struct kbase_hwcnt_dump_buffer_array *dump_bufs); /** * kbase_hwcnt_dump_buffer_block_instance() - Get the pointer to a block @@ -961,9 +921,8 @@ void kbase_hwcnt_dump_buffer_array_free( * * Return: u64* to the dump buffer for the block instance. */ -static inline u64 *kbase_hwcnt_dump_buffer_block_instance( - const struct kbase_hwcnt_dump_buffer *buf, size_t grp, size_t blk, - size_t blk_inst) +static inline u64 *kbase_hwcnt_dump_buffer_block_instance(const struct kbase_hwcnt_dump_buffer *buf, + size_t grp, size_t blk, size_t blk_inst) { if (WARN_ON(!buf) || WARN_ON(!buf->dump_buf)) return NULL; @@ -975,10 +934,7 @@ static inline u64 *kbase_hwcnt_dump_buffer_block_instance( return buf->dump_buf + buf->metadata->grp_metadata[grp].dump_buf_index + buf->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_index + - (buf->metadata->grp_metadata[grp] - .blk_metadata[blk] - .dump_buf_stride * - blk_inst); + (buf->metadata->grp_metadata[grp].blk_metadata[blk].dump_buf_stride * blk_inst); } /** @@ -990,9 +946,8 @@ static inline u64 *kbase_hwcnt_dump_buffer_block_instance( * * The dst and dst_enable_map MUST have been created from the same metadata. 
*/ -void kbase_hwcnt_dump_buffer_zero( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_enable_map *dst_enable_map); +void kbase_hwcnt_dump_buffer_zero(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map); /** * kbase_hwcnt_dump_buffer_block_zero() - Zero all values in a block. @@ -1000,8 +955,7 @@ void kbase_hwcnt_dump_buffer_zero( * kbase_hwcnt_dump_buffer_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_zero(u64 *dst_blk, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_zero(u64 *dst_blk, size_t val_cnt) { if (WARN_ON(!dst_blk)) return; @@ -1017,8 +971,7 @@ static inline void kbase_hwcnt_dump_buffer_block_zero(u64 *dst_blk, * Slower than the non-strict variant. * @dst: Non-NULL pointer to dump buffer. */ -void kbase_hwcnt_dump_buffer_zero_strict( - struct kbase_hwcnt_dump_buffer *dst); +void kbase_hwcnt_dump_buffer_zero_strict(struct kbase_hwcnt_dump_buffer *dst); /** * kbase_hwcnt_dump_buffer_zero_non_enabled() - Zero all non-enabled values in @@ -1031,9 +984,8 @@ void kbase_hwcnt_dump_buffer_zero_strict( * * The dst and dst_enable_map MUST have been created from the same metadata. */ -void kbase_hwcnt_dump_buffer_zero_non_enabled( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_enable_map *dst_enable_map); +void kbase_hwcnt_dump_buffer_zero_non_enabled(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_enable_map *dst_enable_map); /** * kbase_hwcnt_dump_buffer_block_zero_non_enabled() - Zero all non-enabled @@ -1047,9 +999,8 @@ void kbase_hwcnt_dump_buffer_zero_non_enabled( * kbase_hwcnt_enable_map_block_instance. * @val_cnt: Number of values in the block. */ -static inline void -kbase_hwcnt_dump_buffer_block_zero_non_enabled(u64 *dst_blk, const u64 *blk_em, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_zero_non_enabled(u64 *dst_blk, const u64 *blk_em, + size_t val_cnt) { size_t val; @@ -1073,10 +1024,9 @@ kbase_hwcnt_dump_buffer_block_zero_non_enabled(u64 *dst_blk, const u64 *blk_em, * The dst, src, and dst_enable_map MUST have been created from the same * metadata. */ -void kbase_hwcnt_dump_buffer_copy( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map); +void kbase_hwcnt_dump_buffer_copy(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); /** * kbase_hwcnt_dump_buffer_block_copy() - Copy all block values from src to dst. @@ -1086,8 +1036,7 @@ void kbase_hwcnt_dump_buffer_copy( * kbase_hwcnt_dump_buffer_block_instance. * @val_cnt: Number of values in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_copy(u64 *dst_blk, - const u64 *src_blk, +static inline void kbase_hwcnt_dump_buffer_block_copy(u64 *dst_blk, const u64 *src_blk, size_t val_cnt) { if (WARN_ON(!dst_blk) || WARN_ON(!src_blk)) @@ -1113,10 +1062,9 @@ static inline void kbase_hwcnt_dump_buffer_block_copy(u64 *dst_blk, * The dst, src, and dst_enable_map MUST have been created from the same * metadata. 
*/ -void kbase_hwcnt_dump_buffer_copy_strict( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map); +void kbase_hwcnt_dump_buffer_copy_strict(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); /** * kbase_hwcnt_dump_buffer_block_copy_strict() - Copy all enabled block values @@ -1134,10 +1082,8 @@ void kbase_hwcnt_dump_buffer_copy_strict( * * After the copy, any disabled values in dst will be zero. */ -static inline void kbase_hwcnt_dump_buffer_block_copy_strict(u64 *dst_blk, - const u64 *src_blk, - const u64 *blk_em, - size_t val_cnt) +static inline void kbase_hwcnt_dump_buffer_block_copy_strict(u64 *dst_blk, const u64 *src_blk, + const u64 *blk_em, size_t val_cnt) { size_t val; @@ -1145,8 +1091,7 @@ static inline void kbase_hwcnt_dump_buffer_block_copy_strict(u64 *dst_blk, return; for (val = 0; val < val_cnt; val++) { - bool val_enabled = kbase_hwcnt_enable_map_block_value_enabled( - blk_em, val); + bool val_enabled = kbase_hwcnt_enable_map_block_value_enabled(blk_em, val); dst_blk[val] = val_enabled ? src_blk[val] : 0; } @@ -1165,10 +1110,9 @@ static inline void kbase_hwcnt_dump_buffer_block_copy_strict(u64 *dst_blk, * The dst, src, and dst_enable_map MUST have been created from the same * metadata. */ -void kbase_hwcnt_dump_buffer_accumulate( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map); +void kbase_hwcnt_dump_buffer_accumulate(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); /** * kbase_hwcnt_dump_buffer_block_accumulate() - Copy all block headers and @@ -1181,10 +1125,8 @@ void kbase_hwcnt_dump_buffer_accumulate( * @hdr_cnt: Number of headers in the block. * @ctr_cnt: Number of counters in the block. */ -static inline void kbase_hwcnt_dump_buffer_block_accumulate(u64 *dst_blk, - const u64 *src_blk, - size_t hdr_cnt, - size_t ctr_cnt) +static inline void kbase_hwcnt_dump_buffer_block_accumulate(u64 *dst_blk, const u64 *src_blk, + size_t hdr_cnt, size_t ctr_cnt) { size_t ctr; @@ -1219,10 +1161,9 @@ static inline void kbase_hwcnt_dump_buffer_block_accumulate(u64 *dst_blk, * The dst, src, and dst_enable_map MUST have been created from the same * metadata. */ -void kbase_hwcnt_dump_buffer_accumulate_strict( - struct kbase_hwcnt_dump_buffer *dst, - const struct kbase_hwcnt_dump_buffer *src, - const struct kbase_hwcnt_enable_map *dst_enable_map); +void kbase_hwcnt_dump_buffer_accumulate_strict(struct kbase_hwcnt_dump_buffer *dst, + const struct kbase_hwcnt_dump_buffer *src, + const struct kbase_hwcnt_enable_map *dst_enable_map); /** * kbase_hwcnt_dump_buffer_block_accumulate_strict() - Copy all enabled block @@ -1241,21 +1182,19 @@ void kbase_hwcnt_dump_buffer_accumulate_strict( * @hdr_cnt: Number of headers in the block. * @ctr_cnt: Number of counters in the block. 
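 *
 * Worked illustration (counts are hypothetical): with hdr_cnt = 4 and
 * ctr_cnt = 60, values [0..3] are treated as headers and strict-copied
 * (disabled headers become 0), while values [4..63] are counters and are
 * added into dst_blk only where blk_em marks them enabled.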
*/ -static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( - u64 *dst_blk, const u64 *src_blk, const u64 *blk_em, size_t hdr_cnt, - size_t ctr_cnt) +static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict(u64 *dst_blk, const u64 *src_blk, + const u64 *blk_em, + size_t hdr_cnt, size_t ctr_cnt) { size_t ctr; if (WARN_ON(!dst_blk) || WARN_ON(!src_blk)) return; - kbase_hwcnt_dump_buffer_block_copy_strict( - dst_blk, src_blk, blk_em, hdr_cnt); + kbase_hwcnt_dump_buffer_block_copy_strict(dst_blk, src_blk, blk_em, hdr_cnt); for (ctr = hdr_cnt; ctr < ctr_cnt + hdr_cnt; ctr++) { - bool ctr_enabled = kbase_hwcnt_enable_map_block_value_enabled( - blk_em, ctr); + bool ctr_enabled = kbase_hwcnt_enable_map_block_value_enabled(blk_em, ctr); if (ctr_enabled) dst_blk[ctr] += src_blk[ctr]; @@ -1270,8 +1209,7 @@ static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( * @md: Non-NULL pointer to metadata. * @clk: size_t variable used as clock iterator. */ -#define kbase_hwcnt_metadata_for_each_clock(md, clk) \ - for ((clk) = 0; (clk) < (md)->clk_cnt; (clk)++) +#define kbase_hwcnt_metadata_for_each_clock(md, clk) for ((clk) = 0; (clk) < (md)->clk_cnt; (clk)++) /** * kbase_hwcnt_clk_enable_map_enabled() - Check if the given index is enabled @@ -1281,8 +1219,7 @@ static inline void kbase_hwcnt_dump_buffer_block_accumulate_strict( * * Return: true if the index of the clock domain is enabled, else false. */ -static inline bool kbase_hwcnt_clk_enable_map_enabled( - const u64 clk_enable_map, const size_t index) +static inline bool kbase_hwcnt_clk_enable_map_enabled(const u64 clk_enable_map, const size_t index) { if (WARN_ON(index >= 64)) return false; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_virtualizer.c b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_virtualizer.c similarity index 75% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_virtualizer.c rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_virtualizer.c index 52ecb7bed03f..d618764d3b32 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_virtualizer.c +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_virtualizer.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,10 +19,10 @@ * */ -#include "mali_kbase_hwcnt_virtualizer.h" -#include "mali_kbase_hwcnt_accumulator.h" -#include "mali_kbase_hwcnt_context.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_virtualizer.h" +#include "hwcnt/mali_kbase_hwcnt_accumulator.h" +#include "hwcnt/mali_kbase_hwcnt_context.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include #include @@ -75,8 +75,8 @@ struct kbase_hwcnt_virtualizer_client { u64 ts_start_ns; }; -const struct kbase_hwcnt_metadata *kbase_hwcnt_virtualizer_metadata( - struct kbase_hwcnt_virtualizer *hvirt) +const struct kbase_hwcnt_metadata * +kbase_hwcnt_virtualizer_metadata(struct kbase_hwcnt_virtualizer *hvirt) { if (!hvirt) return NULL; @@ -90,8 +90,7 @@ const struct kbase_hwcnt_metadata *kbase_hwcnt_virtualizer_metadata( * * Will safely free a client in any partial state of construction. 
*/ -static void kbasep_hwcnt_virtualizer_client_free( - struct kbase_hwcnt_virtualizer_client *hvcli) +static void kbasep_hwcnt_virtualizer_client_free(struct kbase_hwcnt_virtualizer_client *hvcli) { if (!hvcli) return; @@ -110,9 +109,8 @@ static void kbasep_hwcnt_virtualizer_client_free( * * Return: 0 on success, else error code. */ -static int kbasep_hwcnt_virtualizer_client_alloc( - const struct kbase_hwcnt_metadata *metadata, - struct kbase_hwcnt_virtualizer_client **out_hvcli) +static int kbasep_hwcnt_virtualizer_client_alloc(const struct kbase_hwcnt_metadata *metadata, + struct kbase_hwcnt_virtualizer_client **out_hvcli) { int errcode; struct kbase_hwcnt_virtualizer_client *hvcli = NULL; @@ -145,9 +143,9 @@ error: * @hvcli: Non-NULL pointer to virtualizer client. * @dump_buf: Non-NULL pointer to dump buffer to accumulate from. */ -static void kbasep_hwcnt_virtualizer_client_accumulate( - struct kbase_hwcnt_virtualizer_client *hvcli, - const struct kbase_hwcnt_dump_buffer *dump_buf) +static void +kbasep_hwcnt_virtualizer_client_accumulate(struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_dump_buffer *dump_buf) { WARN_ON(!hvcli); WARN_ON(!dump_buf); @@ -155,12 +153,10 @@ static void kbasep_hwcnt_virtualizer_client_accumulate( if (hvcli->has_accum) { /* If already some accumulation, accumulate */ - kbase_hwcnt_dump_buffer_accumulate( - &hvcli->accum_buf, dump_buf, &hvcli->enable_map); + kbase_hwcnt_dump_buffer_accumulate(&hvcli->accum_buf, dump_buf, &hvcli->enable_map); } else { /* If no accumulation, copy */ - kbase_hwcnt_dump_buffer_copy( - &hvcli->accum_buf, dump_buf, &hvcli->enable_map); + kbase_hwcnt_dump_buffer_copy(&hvcli->accum_buf, dump_buf, &hvcli->enable_map); } hvcli->has_accum = true; } @@ -173,8 +169,7 @@ static void kbasep_hwcnt_virtualizer_client_accumulate( * * Will safely terminate the accumulator in any partial state of initialisation. */ -static void kbasep_hwcnt_virtualizer_accumulator_term( - struct kbase_hwcnt_virtualizer *hvirt) +static void kbasep_hwcnt_virtualizer_accumulator_term(struct kbase_hwcnt_virtualizer *hvirt) { WARN_ON(!hvirt); lockdep_assert_held(&hvirt->lock); @@ -194,8 +189,7 @@ static void kbasep_hwcnt_virtualizer_accumulator_term( * * Return: 0 on success, else error code. */ -static int kbasep_hwcnt_virtualizer_accumulator_init( - struct kbase_hwcnt_virtualizer *hvirt) +static int kbasep_hwcnt_virtualizer_accumulator_init(struct kbase_hwcnt_virtualizer *hvirt) { int errcode; @@ -204,18 +198,15 @@ static int kbasep_hwcnt_virtualizer_accumulator_init( WARN_ON(hvirt->client_count); WARN_ON(hvirt->accum); - errcode = kbase_hwcnt_accumulator_acquire( - hvirt->hctx, &hvirt->accum); + errcode = kbase_hwcnt_accumulator_acquire(hvirt->hctx, &hvirt->accum); if (errcode) goto error; - errcode = kbase_hwcnt_enable_map_alloc( - hvirt->metadata, &hvirt->scratch_map); + errcode = kbase_hwcnt_enable_map_alloc(hvirt->metadata, &hvirt->scratch_map); if (errcode) goto error; - errcode = kbase_hwcnt_dump_buffer_alloc( - hvirt->metadata, &hvirt->scratch_buf); + errcode = kbase_hwcnt_dump_buffer_alloc(hvirt->metadata, &hvirt->scratch_buf); if (errcode) goto error; @@ -234,10 +225,9 @@ error: * * Return: 0 on success, else error code. 
*/ -static int kbasep_hwcnt_virtualizer_client_add( - struct kbase_hwcnt_virtualizer *hvirt, - struct kbase_hwcnt_virtualizer_client *hvcli, - const struct kbase_hwcnt_enable_map *enable_map) +static int kbasep_hwcnt_virtualizer_client_add(struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map) { int errcode = 0; u64 ts_start_ns; @@ -258,28 +248,25 @@ static int kbasep_hwcnt_virtualizer_client_add( if (hvirt->client_count == 1) { /* First client, so just pass the enable map onwards as is */ - errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, - enable_map, &ts_start_ns, &ts_end_ns, NULL); + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, enable_map, + &ts_start_ns, &ts_end_ns, NULL); } else { struct kbase_hwcnt_virtualizer_client *pos; /* Make the scratch enable map the union of all enable maps */ - kbase_hwcnt_enable_map_copy( - &hvirt->scratch_map, enable_map); - list_for_each_entry(pos, &hvirt->clients, node) - kbase_hwcnt_enable_map_union( - &hvirt->scratch_map, &pos->enable_map); + kbase_hwcnt_enable_map_copy(&hvirt->scratch_map, enable_map); + list_for_each_entry (pos, &hvirt->clients, node) + kbase_hwcnt_enable_map_union(&hvirt->scratch_map, &pos->enable_map); /* Set the counters with the new union enable map */ - errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, - &hvirt->scratch_map, - &ts_start_ns, &ts_end_ns, - &hvirt->scratch_buf); + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, &hvirt->scratch_map, + &ts_start_ns, &ts_end_ns, + &hvirt->scratch_buf); /* Accumulate into only existing clients' accumulation bufs */ if (!errcode) - list_for_each_entry(pos, &hvirt->clients, node) - kbasep_hwcnt_virtualizer_client_accumulate( - pos, &hvirt->scratch_buf); + list_for_each_entry (pos, &hvirt->clients, node) + kbasep_hwcnt_virtualizer_client_accumulate(pos, + &hvirt->scratch_buf); } if (errcode) goto error; @@ -307,9 +294,8 @@ error: * @hvirt: Non-NULL pointer to the hardware counter virtualizer. * @hvcli: Non-NULL pointer to the virtualizer client to remove. 
*/ -static void kbasep_hwcnt_virtualizer_client_remove( - struct kbase_hwcnt_virtualizer *hvirt, - struct kbase_hwcnt_virtualizer_client *hvcli) +static void kbasep_hwcnt_virtualizer_client_remove(struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_hwcnt_virtualizer_client *hvcli) { int errcode = 0; u64 ts_start_ns; @@ -329,22 +315,21 @@ static void kbasep_hwcnt_virtualizer_client_remove( struct kbase_hwcnt_virtualizer_client *pos; /* Make the scratch enable map the union of all enable maps */ kbase_hwcnt_enable_map_disable_all(&hvirt->scratch_map); - list_for_each_entry(pos, &hvirt->clients, node) - kbase_hwcnt_enable_map_union( - &hvirt->scratch_map, &pos->enable_map); + list_for_each_entry (pos, &hvirt->clients, node) + kbase_hwcnt_enable_map_union(&hvirt->scratch_map, &pos->enable_map); /* Set the counters with the new union enable map */ - errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, - &hvirt->scratch_map, - &ts_start_ns, &ts_end_ns, - &hvirt->scratch_buf); + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, &hvirt->scratch_map, + &ts_start_ns, &ts_end_ns, + &hvirt->scratch_buf); /* Accumulate into remaining clients' accumulation bufs */ - if (!errcode) - list_for_each_entry(pos, &hvirt->clients, node) - kbasep_hwcnt_virtualizer_client_accumulate( - pos, &hvirt->scratch_buf); + if (!errcode) { + list_for_each_entry (pos, &hvirt->clients, node) + kbasep_hwcnt_virtualizer_client_accumulate(pos, + &hvirt->scratch_buf); - /* Store the most recent dump time for rate limiting */ - hvirt->ts_last_dump_ns = ts_end_ns; + /* Store the most recent dump time for rate limiting */ + hvirt->ts_last_dump_ns = ts_end_ns; + } } WARN_ON(errcode); } @@ -370,11 +355,8 @@ static void kbasep_hwcnt_virtualizer_client_remove( * Return: 0 on success or error code. 
*/ static int kbasep_hwcnt_virtualizer_client_set_counters( - struct kbase_hwcnt_virtualizer *hvirt, - struct kbase_hwcnt_virtualizer_client *hvcli, - const struct kbase_hwcnt_enable_map *enable_map, - u64 *ts_start_ns, - u64 *ts_end_ns, + struct kbase_hwcnt_virtualizer *hvirt, struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map, u64 *ts_start_ns, u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf) { int errcode; @@ -391,32 +373,29 @@ static int kbasep_hwcnt_virtualizer_client_set_counters( /* Make the scratch enable map the union of all enable maps */ kbase_hwcnt_enable_map_copy(&hvirt->scratch_map, enable_map); - list_for_each_entry(pos, &hvirt->clients, node) + list_for_each_entry (pos, &hvirt->clients, node) /* Ignore the enable map of the selected client */ if (pos != hvcli) - kbase_hwcnt_enable_map_union( - &hvirt->scratch_map, &pos->enable_map); + kbase_hwcnt_enable_map_union(&hvirt->scratch_map, &pos->enable_map); /* Set the counters with the new union enable map */ - errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, - &hvirt->scratch_map, ts_start_ns, ts_end_ns, - &hvirt->scratch_buf); + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, &hvirt->scratch_map, + ts_start_ns, ts_end_ns, &hvirt->scratch_buf); if (errcode) return errcode; /* Accumulate into all accumulation bufs except the selected client's */ - list_for_each_entry(pos, &hvirt->clients, node) + list_for_each_entry (pos, &hvirt->clients, node) if (pos != hvcli) - kbasep_hwcnt_virtualizer_client_accumulate( - pos, &hvirt->scratch_buf); + kbasep_hwcnt_virtualizer_client_accumulate(pos, &hvirt->scratch_buf); /* Finally, write into the dump buf */ if (dump_buf) { const struct kbase_hwcnt_dump_buffer *src = &hvirt->scratch_buf; if (hvcli->has_accum) { - kbase_hwcnt_dump_buffer_accumulate( - &hvcli->accum_buf, src, &hvcli->enable_map); + kbase_hwcnt_dump_buffer_accumulate(&hvcli->accum_buf, src, + &hvcli->enable_map); src = &hvcli->accum_buf; } kbase_hwcnt_dump_buffer_copy(dump_buf, src, &hvcli->enable_map); @@ -436,12 +415,10 @@ static int kbasep_hwcnt_virtualizer_client_set_counters( return errcode; } -int kbase_hwcnt_virtualizer_client_set_counters( - struct kbase_hwcnt_virtualizer_client *hvcli, - const struct kbase_hwcnt_enable_map *enable_map, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf) +int kbase_hwcnt_virtualizer_client_set_counters(struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map, + u64 *ts_start_ns, u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) { int errcode; struct kbase_hwcnt_virtualizer *hvirt; @@ -464,14 +441,12 @@ int kbase_hwcnt_virtualizer_client_set_counters( * to the accumulator, saving a fair few copies and * accumulations. 
*/ - errcode = kbase_hwcnt_accumulator_set_counters( - hvirt->accum, enable_map, - ts_start_ns, ts_end_ns, dump_buf); + errcode = kbase_hwcnt_accumulator_set_counters(hvirt->accum, enable_map, + ts_start_ns, ts_end_ns, dump_buf); if (!errcode) { /* Update the selected client's enable map */ - kbase_hwcnt_enable_map_copy( - &hvcli->enable_map, enable_map); + kbase_hwcnt_enable_map_copy(&hvcli->enable_map, enable_map); /* Fix up the timestamps */ *ts_start_ns = hvcli->ts_start_ns; @@ -483,8 +458,7 @@ int kbase_hwcnt_virtualizer_client_set_counters( } else { /* Otherwise, do the full virtualize */ errcode = kbasep_hwcnt_virtualizer_client_set_counters( - hvirt, hvcli, enable_map, - ts_start_ns, ts_end_ns, dump_buf); + hvirt, hvcli, enable_map, ts_start_ns, ts_end_ns, dump_buf); } mutex_unlock(&hvirt->lock); @@ -507,12 +481,10 @@ int kbase_hwcnt_virtualizer_client_set_counters( * * Return: 0 on success or error code. */ -static int kbasep_hwcnt_virtualizer_client_dump( - struct kbase_hwcnt_virtualizer *hvirt, - struct kbase_hwcnt_virtualizer_client *hvcli, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf) +static int kbasep_hwcnt_virtualizer_client_dump(struct kbase_hwcnt_virtualizer *hvirt, + struct kbase_hwcnt_virtualizer_client *hvcli, + u64 *ts_start_ns, u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) { int errcode; struct kbase_hwcnt_virtualizer_client *pos; @@ -525,24 +497,23 @@ static int kbasep_hwcnt_virtualizer_client_dump( lockdep_assert_held(&hvirt->lock); /* Perform the dump */ - errcode = kbase_hwcnt_accumulator_dump(hvirt->accum, - ts_start_ns, ts_end_ns, &hvirt->scratch_buf); + errcode = kbase_hwcnt_accumulator_dump(hvirt->accum, ts_start_ns, ts_end_ns, + &hvirt->scratch_buf); if (errcode) return errcode; /* Accumulate into all accumulation bufs except the selected client's */ - list_for_each_entry(pos, &hvirt->clients, node) + list_for_each_entry (pos, &hvirt->clients, node) if (pos != hvcli) - kbasep_hwcnt_virtualizer_client_accumulate( - pos, &hvirt->scratch_buf); + kbasep_hwcnt_virtualizer_client_accumulate(pos, &hvirt->scratch_buf); /* Finally, write into the dump buf */ if (dump_buf) { const struct kbase_hwcnt_dump_buffer *src = &hvirt->scratch_buf; if (hvcli->has_accum) { - kbase_hwcnt_dump_buffer_accumulate( - &hvcli->accum_buf, src, &hvcli->enable_map); + kbase_hwcnt_dump_buffer_accumulate(&hvcli->accum_buf, src, + &hvcli->enable_map); src = &hvcli->accum_buf; } kbase_hwcnt_dump_buffer_copy(dump_buf, src, &hvcli->enable_map); @@ -578,11 +549,8 @@ static int kbasep_hwcnt_virtualizer_client_dump( * Return: 0 on success or error code. 
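 *
 * Worked illustration (numbers are hypothetical): with a 200ms threshold in
 * dump_threshold_ns and only 150ms elapsed since ts_last_dump_ns, the
 * request is rate limited, so the client's accumulation buffer is copied
 * straight into dump_buf (hence the WARN_ON below if nothing has been
 * accumulated); once at least 200ms have elapsed, a real dump is performed
 * via kbasep_hwcnt_virtualizer_client_dump() instead.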
*/ static int kbasep_hwcnt_virtualizer_client_dump_rate_limited( - struct kbase_hwcnt_virtualizer *hvirt, - struct kbase_hwcnt_virtualizer_client *hvcli, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf) + struct kbase_hwcnt_virtualizer *hvirt, struct kbase_hwcnt_virtualizer_client *hvcli, + u64 *ts_start_ns, u64 *ts_end_ns, struct kbase_hwcnt_dump_buffer *dump_buf) { bool rate_limited = true; @@ -602,10 +570,8 @@ static int kbasep_hwcnt_virtualizer_client_dump_rate_limited( */ rate_limited = false; } else { - const u64 ts_ns = - kbase_hwcnt_accumulator_timestamp_ns(hvirt->accum); - const u64 time_since_last_dump_ns = - ts_ns - hvirt->ts_last_dump_ns; + const u64 ts_ns = kbase_hwcnt_accumulator_timestamp_ns(hvirt->accum); + const u64 time_since_last_dump_ns = ts_ns - hvirt->ts_last_dump_ns; /* Dump period equals or exceeds the threshold */ if (time_since_last_dump_ns >= hvirt->dump_threshold_ns) @@ -613,8 +579,8 @@ static int kbasep_hwcnt_virtualizer_client_dump_rate_limited( } if (!rate_limited) - return kbasep_hwcnt_virtualizer_client_dump( - hvirt, hvcli, ts_start_ns, ts_end_ns, dump_buf); + return kbasep_hwcnt_virtualizer_client_dump(hvirt, hvcli, ts_start_ns, ts_end_ns, + dump_buf); /* If we've gotten this far, the client must have something accumulated * otherwise it is a logic error @@ -622,8 +588,7 @@ static int kbasep_hwcnt_virtualizer_client_dump_rate_limited( WARN_ON(!hvcli->has_accum); if (dump_buf) - kbase_hwcnt_dump_buffer_copy( - dump_buf, &hvcli->accum_buf, &hvcli->enable_map); + kbase_hwcnt_dump_buffer_copy(dump_buf, &hvcli->accum_buf, &hvcli->enable_map); hvcli->has_accum = false; *ts_start_ns = hvcli->ts_start_ns; @@ -633,11 +598,9 @@ static int kbasep_hwcnt_virtualizer_client_dump_rate_limited( return 0; } -int kbase_hwcnt_virtualizer_client_dump( - struct kbase_hwcnt_virtualizer_client *hvcli, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf) +int kbase_hwcnt_virtualizer_client_dump(struct kbase_hwcnt_virtualizer_client *hvcli, + u64 *ts_start_ns, u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf) { int errcode; struct kbase_hwcnt_virtualizer *hvirt; @@ -659,8 +622,8 @@ int kbase_hwcnt_virtualizer_client_dump( * to the accumulator, saving a fair few copies and * accumulations. 
*/ - errcode = kbase_hwcnt_accumulator_dump( - hvirt->accum, ts_start_ns, ts_end_ns, dump_buf); + errcode = kbase_hwcnt_accumulator_dump(hvirt->accum, ts_start_ns, ts_end_ns, + dump_buf); if (!errcode) { /* Fix up the timestamps */ @@ -681,20 +644,17 @@ int kbase_hwcnt_virtualizer_client_dump( return errcode; } -int kbase_hwcnt_virtualizer_client_create( - struct kbase_hwcnt_virtualizer *hvirt, - const struct kbase_hwcnt_enable_map *enable_map, - struct kbase_hwcnt_virtualizer_client **out_hvcli) +int kbase_hwcnt_virtualizer_client_create(struct kbase_hwcnt_virtualizer *hvirt, + const struct kbase_hwcnt_enable_map *enable_map, + struct kbase_hwcnt_virtualizer_client **out_hvcli) { int errcode; struct kbase_hwcnt_virtualizer_client *hvcli; - if (!hvirt || !enable_map || !out_hvcli || - (enable_map->metadata != hvirt->metadata)) + if (!hvirt || !enable_map || !out_hvcli || (enable_map->metadata != hvirt->metadata)) return -EINVAL; - errcode = kbasep_hwcnt_virtualizer_client_alloc( - hvirt->metadata, &hvcli); + errcode = kbasep_hwcnt_virtualizer_client_alloc(hvirt->metadata, &hvcli); if (errcode) return errcode; @@ -713,8 +673,7 @@ int kbase_hwcnt_virtualizer_client_create( return 0; } -void kbase_hwcnt_virtualizer_client_destroy( - struct kbase_hwcnt_virtualizer_client *hvcli) +void kbase_hwcnt_virtualizer_client_destroy(struct kbase_hwcnt_virtualizer_client *hvcli) { if (!hvcli) return; @@ -728,10 +687,8 @@ void kbase_hwcnt_virtualizer_client_destroy( kbasep_hwcnt_virtualizer_client_free(hvcli); } -int kbase_hwcnt_virtualizer_init( - struct kbase_hwcnt_context *hctx, - u64 dump_threshold_ns, - struct kbase_hwcnt_virtualizer **out_hvirt) +int kbase_hwcnt_virtualizer_init(struct kbase_hwcnt_context *hctx, u64 dump_threshold_ns, + struct kbase_hwcnt_virtualizer **out_hvirt) { struct kbase_hwcnt_virtualizer *virt; const struct kbase_hwcnt_metadata *metadata; @@ -758,8 +715,7 @@ int kbase_hwcnt_virtualizer_init( return 0; } -void kbase_hwcnt_virtualizer_term( - struct kbase_hwcnt_virtualizer *hvirt) +void kbase_hwcnt_virtualizer_term(struct kbase_hwcnt_virtualizer *hvirt) { if (!hvirt) return; @@ -768,7 +724,7 @@ void kbase_hwcnt_virtualizer_term( if (WARN_ON(hvirt->client_count != 0)) { struct kbase_hwcnt_virtualizer_client *pos, *n; - list_for_each_entry_safe(pos, n, &hvirt->clients, node) + list_for_each_entry_safe (pos, n, &hvirt->clients, node) kbase_hwcnt_virtualizer_client_destroy(pos); } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_virtualizer.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_virtualizer.h similarity index 83% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_virtualizer.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_virtualizer.h index 08e8e9f1d596..485ba74960f6 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_virtualizer.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_virtualizer.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2018, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2018, 2020-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -51,17 +51,14 @@ struct kbase_hwcnt_dump_buffer; * * Return: 0 on success, else error code. 
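 *
 * Minimal lifecycle sketch (illustrative only; error paths are trimmed, and
 * "hctx", "dump_threshold_ns" and "enable_map" are assumed to be set up by
 * the caller):
 *
 *   struct kbase_hwcnt_virtualizer *hvirt;
 *   struct kbase_hwcnt_virtualizer_client *hvcli;
 *
 *   if (kbase_hwcnt_virtualizer_init(hctx, dump_threshold_ns, &hvirt))
 *           return;
 *   if (kbase_hwcnt_virtualizer_client_create(hvirt, &enable_map, &hvcli)) {
 *           kbase_hwcnt_virtualizer_term(hvirt);
 *           return;
 *   }
 *   ...
 *   kbase_hwcnt_virtualizer_client_destroy(hvcli);
 *   kbase_hwcnt_virtualizer_term(hvirt);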
*/ -int kbase_hwcnt_virtualizer_init( - struct kbase_hwcnt_context *hctx, - u64 dump_threshold_ns, - struct kbase_hwcnt_virtualizer **out_hvirt); +int kbase_hwcnt_virtualizer_init(struct kbase_hwcnt_context *hctx, u64 dump_threshold_ns, + struct kbase_hwcnt_virtualizer **out_hvirt); /** * kbase_hwcnt_virtualizer_term - Terminate a hardware counter virtualizer. * @hvirt: Pointer to virtualizer to be terminated. */ -void kbase_hwcnt_virtualizer_term( - struct kbase_hwcnt_virtualizer *hvirt); +void kbase_hwcnt_virtualizer_term(struct kbase_hwcnt_virtualizer *hvirt); /** * kbase_hwcnt_virtualizer_metadata - Get the hardware counter metadata used by @@ -71,8 +68,8 @@ void kbase_hwcnt_virtualizer_term( * * Return: Non-NULL pointer to metadata, or NULL on error. */ -const struct kbase_hwcnt_metadata *kbase_hwcnt_virtualizer_metadata( - struct kbase_hwcnt_virtualizer *hvirt); +const struct kbase_hwcnt_metadata * +kbase_hwcnt_virtualizer_metadata(struct kbase_hwcnt_virtualizer *hvirt); /** * kbase_hwcnt_virtualizer_client_create - Create a new virtualizer client. @@ -84,17 +81,15 @@ const struct kbase_hwcnt_metadata *kbase_hwcnt_virtualizer_metadata( * * Return: 0 on success, else error code. */ -int kbase_hwcnt_virtualizer_client_create( - struct kbase_hwcnt_virtualizer *hvirt, - const struct kbase_hwcnt_enable_map *enable_map, - struct kbase_hwcnt_virtualizer_client **out_hvcli); +int kbase_hwcnt_virtualizer_client_create(struct kbase_hwcnt_virtualizer *hvirt, + const struct kbase_hwcnt_enable_map *enable_map, + struct kbase_hwcnt_virtualizer_client **out_hvcli); /** * kbase_hwcnt_virtualizer_client_destroy() - Destroy a virtualizer client. * @hvcli: Pointer to the hardware counter client. */ -void kbase_hwcnt_virtualizer_client_destroy( - struct kbase_hwcnt_virtualizer_client *hvcli); +void kbase_hwcnt_virtualizer_client_destroy(struct kbase_hwcnt_virtualizer_client *hvcli); /** * kbase_hwcnt_virtualizer_client_set_counters - Perform a dump of the client's @@ -115,12 +110,10 @@ void kbase_hwcnt_virtualizer_client_destroy( * * Return: 0 on success or error code. */ -int kbase_hwcnt_virtualizer_client_set_counters( - struct kbase_hwcnt_virtualizer_client *hvcli, - const struct kbase_hwcnt_enable_map *enable_map, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf); +int kbase_hwcnt_virtualizer_client_set_counters(struct kbase_hwcnt_virtualizer_client *hvcli, + const struct kbase_hwcnt_enable_map *enable_map, + u64 *ts_start_ns, u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf); /** * kbase_hwcnt_virtualizer_client_dump - Perform a dump of the client's @@ -136,11 +129,9 @@ int kbase_hwcnt_virtualizer_client_set_counters( * * Return: 0 on success or error code. 
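 *
 * Illustrative call pattern (dump_buf is assumed to have been allocated with
 * kbase_hwcnt_dump_buffer_alloc() against kbase_hwcnt_virtualizer_metadata(),
 * and process_counters() is a hypothetical consumer):
 *
 *   u64 ts_start_ns, ts_end_ns;
 *
 *   if (!kbase_hwcnt_virtualizer_client_dump(hvcli, &ts_start_ns,
 *                                            &ts_end_ns, &dump_buf))
 *           process_counters(&dump_buf, ts_start_ns, ts_end_ns);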
*/ -int kbase_hwcnt_virtualizer_client_dump( - struct kbase_hwcnt_virtualizer_client *hvcli, - u64 *ts_start_ns, - u64 *ts_end_ns, - struct kbase_hwcnt_dump_buffer *dump_buf); +int kbase_hwcnt_virtualizer_client_dump(struct kbase_hwcnt_virtualizer_client *hvcli, + u64 *ts_start_ns, u64 *ts_end_ns, + struct kbase_hwcnt_dump_buffer *dump_buf); /** * kbase_hwcnt_virtualizer_queue_work() - Queue hardware counter related async diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if.h similarity index 84% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if.h index 187331866428..501c0087b7e6 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -50,17 +50,17 @@ typedef void kbase_hwcnt_watchdog_callback_fn(void *user_data); * * Return: 0 if the watchdog timer enabled successfully, error code otherwise. */ -typedef int kbase_hwcnt_watchdog_enable_fn( - const struct kbase_hwcnt_watchdog_info *timer, u32 period_ms, - kbase_hwcnt_watchdog_callback_fn *callback, void *user_data); +typedef int kbase_hwcnt_watchdog_enable_fn(const struct kbase_hwcnt_watchdog_info *timer, + u32 period_ms, + kbase_hwcnt_watchdog_callback_fn *callback, + void *user_data); /** * typedef kbase_hwcnt_watchdog_disable_fn - Disable watchdog timer * * @timer: Non-NULL pointer to a watchdog timer interface context */ -typedef void -kbase_hwcnt_watchdog_disable_fn(const struct kbase_hwcnt_watchdog_info *timer); +typedef void kbase_hwcnt_watchdog_disable_fn(const struct kbase_hwcnt_watchdog_info *timer); /** * typedef kbase_hwcnt_watchdog_modify_fn - Modify watchdog timer's timeout @@ -68,9 +68,8 @@ kbase_hwcnt_watchdog_disable_fn(const struct kbase_hwcnt_watchdog_info *timer); * @timer: Non-NULL pointer to a watchdog timer interface context * @delay_ms: Watchdog timer expiration in milliseconds */ -typedef void -kbase_hwcnt_watchdog_modify_fn(const struct kbase_hwcnt_watchdog_info *timer, - u32 delay_ms); +typedef void kbase_hwcnt_watchdog_modify_fn(const struct kbase_hwcnt_watchdog_info *timer, + u32 delay_ms); /** * struct kbase_hwcnt_watchdog_interface - Hardware counter watchdog virtual interface. diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if_timer.c b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if_timer.c similarity index 76% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if_timer.c rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if_timer.c index 69b957adc4dd..4caa832cd587 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if_timer.c +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if_timer.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -20,8 +20,8 @@ */ #include "mali_kbase.h" -#include "mali_kbase_hwcnt_watchdog_if.h" -#include "mali_kbase_hwcnt_watchdog_if_timer.h" +#include "hwcnt/mali_kbase_hwcnt_watchdog_if.h" +#include "hwcnt/mali_kbase_hwcnt_watchdog_if_timer.h" #include #include @@ -62,12 +62,10 @@ static void kbasep_hwcnt_watchdog_callback(struct work_struct *const work) } static int kbasep_hwcnt_watchdog_if_timer_enable( - const struct kbase_hwcnt_watchdog_info *const timer, - u32 const period_ms, kbase_hwcnt_watchdog_callback_fn *const callback, - void *const user_data) + const struct kbase_hwcnt_watchdog_info *const timer, u32 const period_ms, + kbase_hwcnt_watchdog_callback_fn *const callback, void *const user_data) { - struct kbase_hwcnt_watchdog_if_timer_info *const timer_info = - (void *)timer; + struct kbase_hwcnt_watchdog_if_timer_info *const timer_info = (void *)timer; if (WARN_ON(!timer) || WARN_ON(!callback) || WARN_ON(timer_info->timer_enabled)) return -EINVAL; @@ -81,11 +79,10 @@ static int kbasep_hwcnt_watchdog_if_timer_enable( return 0; } -static void kbasep_hwcnt_watchdog_if_timer_disable( - const struct kbase_hwcnt_watchdog_info *const timer) +static void +kbasep_hwcnt_watchdog_if_timer_disable(const struct kbase_hwcnt_watchdog_info *const timer) { - struct kbase_hwcnt_watchdog_if_timer_info *const timer_info = - (void *)timer; + struct kbase_hwcnt_watchdog_if_timer_info *const timer_info = (void *)timer; if (WARN_ON(!timer)) return; @@ -97,11 +94,11 @@ static void kbasep_hwcnt_watchdog_if_timer_disable( timer_info->timer_enabled = false; } -static void kbasep_hwcnt_watchdog_if_timer_modify( - const struct kbase_hwcnt_watchdog_info *const timer, u32 const delay_ms) +static void +kbasep_hwcnt_watchdog_if_timer_modify(const struct kbase_hwcnt_watchdog_info *const timer, + u32 const delay_ms) { - struct kbase_hwcnt_watchdog_if_timer_info *const timer_info = - (void *)timer; + struct kbase_hwcnt_watchdog_if_timer_info *const timer_info = (void *)timer; if (WARN_ON(!timer) || WARN_ON(!timer_info->timer_enabled)) return; @@ -109,8 +106,7 @@ static void kbasep_hwcnt_watchdog_if_timer_modify( mod_delayed_work(timer_info->workq, &timer_info->dwork, msecs_to_jiffies(delay_ms)); } -void kbase_hwcnt_watchdog_if_timer_destroy( - struct kbase_hwcnt_watchdog_interface *const watchdog_if) +void kbase_hwcnt_watchdog_if_timer_destroy(struct kbase_hwcnt_watchdog_interface *const watchdog_if) { struct kbase_hwcnt_watchdog_if_timer_info *timer_info; @@ -125,11 +121,12 @@ void kbase_hwcnt_watchdog_if_timer_destroy( destroy_workqueue(timer_info->workq); kfree(timer_info); - *watchdog_if = (struct kbase_hwcnt_watchdog_interface){ NULL }; + *watchdog_if = (struct kbase_hwcnt_watchdog_interface){ + .timer = NULL, .enable = NULL, .disable = NULL, .modify = NULL + }; } -int kbase_hwcnt_watchdog_if_timer_create( - struct kbase_hwcnt_watchdog_interface *const watchdog_if) +int kbase_hwcnt_watchdog_if_timer_create(struct kbase_hwcnt_watchdog_interface *const watchdog_if) { struct kbase_hwcnt_watchdog_if_timer_info *timer_info; @@ -140,9 +137,7 @@ int kbase_hwcnt_watchdog_if_timer_create( if (!timer_info) return -ENOMEM; - *timer_info = - (struct kbase_hwcnt_watchdog_if_timer_info){ .timer_enabled = - false }; + *timer_info = (struct kbase_hwcnt_watchdog_if_timer_info){ .timer_enabled = false }; INIT_DELAYED_WORK(&timer_info->dwork, kbasep_hwcnt_watchdog_callback); diff 
--git a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if_timer.h b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if_timer.h similarity index 85% rename from drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if_timer.h rename to drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if_timer.h index 3bd69c3401c4..a545ad3e39e3 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hwcnt_watchdog_if_timer.h +++ b/drivers/gpu/arm/bifrost/hwcnt/mali_kbase_hwcnt_watchdog_if_timer.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -35,8 +35,7 @@ struct kbase_hwcnt_watchdog_interface; * * Return: 0 on success, error otherwise. */ -int kbase_hwcnt_watchdog_if_timer_create( - struct kbase_hwcnt_watchdog_interface *watchdog_if); +int kbase_hwcnt_watchdog_if_timer_create(struct kbase_hwcnt_watchdog_interface *watchdog_if); /** * kbase_hwcnt_watchdog_if_timer_destroy() - Destroy a watchdog interface of hardware counter @@ -44,7 +43,6 @@ int kbase_hwcnt_watchdog_if_timer_create( * * @watchdog_if: Pointer to watchdog interface to destroy */ -void kbase_hwcnt_watchdog_if_timer_destroy( - struct kbase_hwcnt_watchdog_interface *watchdog_if); +void kbase_hwcnt_watchdog_if_timer_destroy(struct kbase_hwcnt_watchdog_interface *watchdog_if); #endif /* _KBASE_HWCNT_WATCHDOG_IF_TIMER_H_ */ diff --git a/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_common_jm.h b/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_common_jm.h index 4479a4b8665f..6089610847b4 100644 --- a/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_common_jm.h +++ b/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_common_jm.h @@ -23,8 +23,8 @@ #define _KBASE_IPA_COUNTER_COMMON_JM_H_ #include "mali_kbase.h" -#include "mali_kbase_hwcnt_virtualizer.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_virtualizer.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" /* Maximum number of IPA groups for an IPA model. */ #define KBASE_IPA_MAX_GROUP_DEF_NUM 16 diff --git a/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_csf.c b/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_csf.c index 66e56e267e68..43cdf18a5e3b 100644 --- a/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_csf.c +++ b/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_csf.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2020-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -59,9 +59,6 @@ .counter_block_type = block_type, \ } -#define CSHW_COUNTER_DEF(cnt_name, coeff, cnt_idx) \ - COUNTER_DEF(cnt_name, coeff, cnt_idx, KBASE_IPA_CORE_TYPE_CSHW) - #define MEMSYS_COUNTER_DEF(cnt_name, coeff, cnt_idx) \ COUNTER_DEF(cnt_name, coeff, cnt_idx, KBASE_IPA_CORE_TYPE_MEMSYS) diff --git a/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_jm.c b/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_jm.c index 4bb880e79b4a..a32a2c207163 100644 --- a/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_jm.c +++ b/drivers/gpu/arm/bifrost/ipa/backend/mali_kbase_ipa_counter_jm.c @@ -30,21 +30,15 @@ /* Performance counter blocks base offsets */ #define JM_BASE (0 * KBASE_IPA_NR_BYTES_PER_BLOCK) -#define TILER_BASE (1 * KBASE_IPA_NR_BYTES_PER_BLOCK) #define MEMSYS_BASE (2 * KBASE_IPA_NR_BYTES_PER_BLOCK) /* JM counter block offsets */ #define JM_GPU_ACTIVE (KBASE_IPA_NR_BYTES_PER_CNT * 6) -/* Tiler counter block offsets */ -#define TILER_ACTIVE (KBASE_IPA_NR_BYTES_PER_CNT * 45) - /* MEMSYS counter block offsets */ #define MEMSYS_L2_ANY_LOOKUP (KBASE_IPA_NR_BYTES_PER_CNT * 25) /* SC counter block offsets */ -#define SC_FRAG_ACTIVE (KBASE_IPA_NR_BYTES_PER_CNT * 4) -#define SC_EXEC_CORE_ACTIVE (KBASE_IPA_NR_BYTES_PER_CNT * 26) #define SC_EXEC_INSTR_FMA (KBASE_IPA_NR_BYTES_PER_CNT * 27) #define SC_EXEC_INSTR_COUNT (KBASE_IPA_NR_BYTES_PER_CNT * 28) #define SC_EXEC_INSTR_MSG (KBASE_IPA_NR_BYTES_PER_CNT * 30) @@ -52,10 +46,6 @@ #define SC_TEX_COORD_ISSUE (KBASE_IPA_NR_BYTES_PER_CNT * 40) #define SC_TEX_TFCH_NUM_OPERATIONS (KBASE_IPA_NR_BYTES_PER_CNT * 42) #define SC_VARY_INSTR (KBASE_IPA_NR_BYTES_PER_CNT * 49) -#define SC_VARY_SLOT_32 (KBASE_IPA_NR_BYTES_PER_CNT * 50) -#define SC_VARY_SLOT_16 (KBASE_IPA_NR_BYTES_PER_CNT * 51) -#define SC_BEATS_RD_LSC (KBASE_IPA_NR_BYTES_PER_CNT * 56) -#define SC_BEATS_WR_LSC (KBASE_IPA_NR_BYTES_PER_CNT * 61) #define SC_BEATS_WR_TIB (KBASE_IPA_NR_BYTES_PER_CNT * 62) /** diff --git a/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa.h b/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa.h index f722f5fadff9..c875ffb4990e 100644 --- a/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa.h +++ b/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2016-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2016-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -271,7 +271,6 @@ int kbase_get_real_power(struct devfreq *df, u32 *power, unsigned long freq, unsigned long voltage); -#if MALI_UNIT_TEST /* Called by kbase_get_real_power() to invoke the power models. * Must be called with kbdev->ipa.lock held. * This function is only exposed for use by unit tests. 
@@ -279,7 +278,6 @@ int kbase_get_real_power(struct devfreq *df, u32 *power, int kbase_get_real_power_locked(struct kbase_device *kbdev, u32 *power, unsigned long freq, unsigned long voltage); -#endif /* MALI_UNIT_TEST */ extern struct devfreq_cooling_power kbase_ipa_power_model_ops; diff --git a/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa_simple.c b/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa_simple.c index 78c343cf249f..57508eb24749 100644 --- a/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa_simple.c +++ b/drivers/gpu/arm/bifrost/ipa/mali_kbase_ipa_simple.c @@ -34,6 +34,8 @@ #include "mali_kbase_ipa_simple.h" #include "mali_kbase_ipa_debugfs.h" +#if MALI_USE_CSF + /* This is used if the dynamic power for top-level is estimated separately * through the counter model. To roughly match the contribution of top-level * power in the total dynamic power, when calculated through counter model, @@ -44,6 +46,8 @@ */ #define TOP_LEVEL_DYN_COEFF_SCALER (3) +#endif /* MALI_USE_CSF */ + #if MALI_UNIT_TEST static int dummy_temp; diff --git a/drivers/gpu/arm/bifrost/jm/mali_kbase_jm_defs.h b/drivers/gpu/arm/bifrost/jm/mali_kbase_jm_defs.h index f9e0099a5cbf..e4316981e635 100644 --- a/drivers/gpu/arm/bifrost/jm/mali_kbase_jm_defs.h +++ b/drivers/gpu/arm/bifrost/jm/mali_kbase_jm_defs.h @@ -344,19 +344,6 @@ enum kbase_atom_exit_protected_state { KBASE_ATOM_EXIT_PROTECTED_RESET_WAIT, }; -/** - * struct kbase_ext_res - Contains the info for external resources referred - * by an atom, which have been mapped on GPU side. - * @gpu_address: Start address of the memory region allocated for - * the resource from GPU virtual address space. - * @alloc: pointer to physical pages tracking object, set on - * mapping the external resource on GPU side. - */ -struct kbase_ext_res { - u64 gpu_address; - struct kbase_mem_phy_alloc *alloc; -}; - /** * struct kbase_jd_atom - object representing the atom, containing the complete * state and attributes of an atom. @@ -390,7 +377,8 @@ struct kbase_ext_res { * each allocation is read in order to enforce an * overall physical memory usage limit. * @nr_extres: number of external resources referenced by the atom. - * @extres: pointer to the location containing info about + * @extres: Pointer to @nr_extres VA regions containing the external + * resource allocation and other information. * @nr_extres external resources referenced by the atom. * @device_nr: indicates the coregroup with which the atom is * associated, when @@ -408,16 +396,21 @@ struct kbase_ext_res { * sync through soft jobs and for the implicit * synchronization required on access to external * resources. - * @dma_fence.fence_in: Input fence + * @dma_fence.fence_in: Points to the dma-buf input fence for this atom. + * The atom would complete only after the fence is + * signaled. * @dma_fence.fence: Points to the dma-buf output fence for this atom. + * @dma_fence.fence_cb: The object that is passed at the time of adding the + * callback that gets invoked when @dma_fence.fence_in + * is signaled. + * @dma_fence.fence_cb_added: Flag to keep a track if the callback was successfully + * added for @dma_fence.fence_in, which is supposed to be + * invoked on the signaling of fence. * @dma_fence.context: The dma-buf fence context number for this atom. A * unique context number is allocated to each katom in * the context on context creation. * @dma_fence.seqno: The dma-buf fence sequence number for this atom. 
This * is increased every time this katom uses dma-buf fence - * @dma_fence.callbacks: List of all callbacks set up to wait on other fences - * @dma_fence.dep_count: Atomic counter of number of outstandind dma-buf fence - * dependencies for this atom. * @event_code: Event code for the job chain represented by the atom, * both HW and low-level SW events are represented by * event codes. @@ -519,21 +512,17 @@ struct kbase_jd_atom { #endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ u16 nr_extres; - struct kbase_ext_res *extres; + struct kbase_va_region **extres; u32 device_nr; u64 jc; void *softjob_data; -#if defined(CONFIG_SYNC) - struct sync_fence *fence; - struct sync_fence_waiter sync_waiter; -#endif /* CONFIG_SYNC */ -#if defined(CONFIG_MALI_BIFROST_DMA_FENCE) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) struct { /* Use the functions/API defined in mali_kbase_fence.h to * when working with this sub struct */ -#if defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) #if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) struct fence *fence_in; #else @@ -556,38 +545,21 @@ struct kbase_jd_atom { #else struct dma_fence *fence; #endif + + /* This is the callback object that is registered for the fence_in. + * The callback is invoked when the fence_in is signaled. + */ +#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) + struct fence_cb fence_cb; +#else + struct dma_fence_cb fence_cb; +#endif + bool fence_cb_added; + unsigned int context; atomic_t seqno; - /* This contains a list of all callbacks set up to wait on - * other fences. This atom must be held back from JS until all - * these callbacks have been called and dep_count have reached - * 0. The initial value of dep_count must be equal to the - * number of callbacks on this list. - * - * This list is protected by jctx.lock. Callbacks are added to - * this list when the atom is built and the wait are set up. - * All the callbacks then stay on the list until all callbacks - * have been called and the atom is queued, or cancelled, and - * then all callbacks are taken off the list and freed. - */ - struct list_head callbacks; - /* Atomic counter of number of outstandind dma-buf fence - * dependencies for this atom. When dep_count reaches 0 the - * atom may be queued. - * - * The special value "-1" may only be set after the count - * reaches 0, while holding jctx.lock. This indicates that the - * atom has been handled, either queued in JS or cancelled. - * - * If anyone but the dma-fence worker sets this to -1 they must - * ensure that any potentially queued worker must have - * completed before allowing the atom to be marked as unused. - * This can be done by flushing the fence work queue: - * kctx->dma_fence.wq. 
- */ - atomic_t dep_count; } dma_fence; -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE || CONFIG_SYNC_FILE */ +#endif /* CONFIG_SYNC_FILE */ /* Note: refer to kbasep_js_atom_retained_state, which will take a copy * of some of the following members diff --git a/drivers/gpu/arm/bifrost/mali_base_hwconfig_features.h b/drivers/gpu/arm/bifrost/mali_base_hwconfig_features.h index 3669f7e23fa6..ea143ab49642 100644 --- a/drivers/gpu/arm/bifrost/mali_base_hwconfig_features.h +++ b/drivers/gpu/arm/bifrost/mali_base_hwconfig_features.h @@ -39,6 +39,7 @@ enum base_hw_feature { BASE_HW_FEATURE_GPU_SLEEP, BASE_HW_FEATURE_FLUSH_INV_SHADER_OTHER, BASE_HW_FEATURE_CORE_FEATURES, + BASE_HW_FEATURE_PBHA_HWU, BASE_HW_FEATURE_END }; @@ -177,5 +178,17 @@ __attribute__((unused)) static const enum base_hw_feature base_hw_features_tTUx[ BASE_HW_FEATURE_END }; +__attribute__((unused)) static const enum base_hw_feature base_hw_features_tTIx[] = { + BASE_HW_FEATURE_FLUSH_REDUCTION, + BASE_HW_FEATURE_PROTECTED_DEBUG_MODE, + BASE_HW_FEATURE_L2_CONFIG, + BASE_HW_FEATURE_CLEAN_ONLY_SAFE, + BASE_HW_FEATURE_ASN_HASH, + BASE_HW_FEATURE_GPU_SLEEP, + BASE_HW_FEATURE_CORE_FEATURES, + BASE_HW_FEATURE_PBHA_HWU, + BASE_HW_FEATURE_END +}; + #endif /* _BASE_HWCONFIG_FEATURES_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_base_hwconfig_issues.h b/drivers/gpu/arm/bifrost/mali_base_hwconfig_issues.h index 391730106f6d..a360984acca5 100644 --- a/drivers/gpu/arm/bifrost/mali_base_hwconfig_issues.h +++ b/drivers/gpu/arm/bifrost/mali_base_hwconfig_issues.h @@ -700,5 +700,35 @@ __attribute__((unused)) static const enum base_hw_issue base_hw_issues_tTUx_r1p1 BASE_HW_ISSUE_END }; +__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tTUx_r1p2[] = { + BASE_HW_ISSUE_9435, + BASE_HW_ISSUE_TSIX_2033, + BASE_HW_ISSUE_TTRX_1337, + BASE_HW_ISSUE_GPU2019_3878, + BASE_HW_ISSUE_TURSEHW_2716, + BASE_HW_ISSUE_GPU2019_3901, + BASE_HW_ISSUE_GPU2021PRO_290, + BASE_HW_ISSUE_END +}; + +__attribute__((unused)) static const enum base_hw_issue base_hw_issues_model_tTIx[] = { + BASE_HW_ISSUE_5736, + BASE_HW_ISSUE_9435, + BASE_HW_ISSUE_TSIX_2033, + BASE_HW_ISSUE_TTRX_1337, + BASE_HW_ISSUE_TURSEHW_2716, + BASE_HW_ISSUE_GPU2021PRO_290, + BASE_HW_ISSUE_END +}; + +__attribute__((unused)) static const enum base_hw_issue base_hw_issues_tTIx_r0p0[] = { + BASE_HW_ISSUE_9435, + BASE_HW_ISSUE_TSIX_2033, + BASE_HW_ISSUE_TTRX_1337, + BASE_HW_ISSUE_TURSEHW_2716, + BASE_HW_ISSUE_GPU2021PRO_290, + BASE_HW_ISSUE_END +}; + #endif /* _BASE_HWCONFIG_ISSUES_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase.h b/drivers/gpu/arm/bifrost/mali_kbase.h index 7d0d0dae0279..8e4d36141368 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase.h +++ b/drivers/gpu/arm/bifrost/mali_kbase.h @@ -70,7 +70,9 @@ #include "mali_kbase_jd_debugfs.h" #include "mali_kbase_jm.h" #include "mali_kbase_js.h" -#endif /* !MALI_USE_CSF */ +#else /* !MALI_USE_CSF */ +#include "csf/mali_kbase_debug_csf_fault.h" +#endif /* MALI_USE_CSF */ #include "ipa/mali_kbase_ipa.h" @@ -466,7 +468,7 @@ void kbase_finish_soft_job(struct kbase_jd_atom *katom); void kbase_cancel_soft_job(struct kbase_jd_atom *katom); void kbase_resume_suspended_soft_jobs(struct kbase_device *kbdev); void kbasep_remove_waiting_soft_job(struct kbase_jd_atom *katom); -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) void kbase_soft_event_wait_callback(struct kbase_jd_atom *katom); #endif int kbase_soft_event_update(struct kbase_context *kctx, diff --git a/drivers/gpu/arm/bifrost/mali_kbase_core_linux.c 
b/drivers/gpu/arm/bifrost/mali_kbase_core_linux.c index 0c8f653a9bff..480e693f3c61 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_core_linux.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_core_linux.c @@ -35,7 +35,7 @@ #include "backend/gpu/mali_kbase_model_linux.h" #include #endif /* CONFIG_MALI_BIFROST_NO_MALI */ -#include "mali_kbase_mem_profile_debugfs_buf_size.h" +#include "uapi/gpu/arm/bifrost/mali_kbase_mem_profile_debugfs_buf_size.h" #include "mali_kbase_mem.h" #include "mali_kbase_mem_pool_debugfs.h" #include "mali_kbase_mem_pool_group.h" @@ -54,8 +54,8 @@ #if !MALI_USE_CSF #include "mali_kbase_kinstr_jm.h" #endif -#include "mali_kbase_hwcnt_context.h" -#include "mali_kbase_hwcnt_virtualizer.h" +#include "hwcnt/mali_kbase_hwcnt_context.h" +#include "hwcnt/mali_kbase_hwcnt_virtualizer.h" #include "mali_kbase_kinstr_prfcnt.h" #include "mali_kbase_vinstr.h" #if MALI_USE_CSF @@ -95,15 +95,16 @@ #include #include #include +#include #include #include /* is_compat_task/in_compat_syscall */ #include #include #include #include -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) #include -#endif /* CONFIG_SYNC || CONFIG_SYNC_FILE */ +#endif /* CONFIG_SYNC_FILE */ #include #include #include @@ -123,11 +124,6 @@ #include -/* GPU IRQ Tags */ -#define JOB_IRQ_TAG 0 -#define MMU_IRQ_TAG 1 -#define GPU_IRQ_TAG 2 - #define KERNEL_SIDE_DDK_VERSION_STRING "K:" MALI_RELEASE_NAME "(GPL)" /** @@ -139,9 +135,6 @@ (((minor) & 0xFFF) << 8) | \ ((0 & 0xFF) << 0)) -#define KBASE_API_MIN(api_version) ((api_version >> 8) & 0xFFF) -#define KBASE_API_MAJ(api_version) ((api_version >> 20) & 0xFFF) - /** * struct mali_kbase_capability_def - kbase capabilities table * @@ -173,6 +166,11 @@ static const struct mali_kbase_capability_def kbase_caps_table[MALI_KBASE_NUM_CA #endif }; +#if (KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) +/* Mutex to synchronize the probe of multiple kbase instances */ +static struct mutex kbase_probe_mutex; +#endif + /** * mali_kbase_supports_cap - Query whether a kbase capability is supported * @@ -432,6 +430,12 @@ static struct kbase_device *to_kbase_device(struct device *dev) int assign_irqs(struct kbase_device *kbdev) { + static const char *const irq_names_caps[] = { "JOB", "MMU", "GPU" }; + +#if IS_ENABLED(CONFIG_OF) + static const char *const irq_names[] = { "job", "mmu", "gpu" }; +#endif + struct platform_device *pdev; int i; @@ -439,34 +443,31 @@ int assign_irqs(struct kbase_device *kbdev) return -ENODEV; pdev = to_platform_device(kbdev->dev); - /* 3 IRQ resources */ - for (i = 0; i < 3; i++) { - struct resource *irq_res; - int irqtag; - irq_res = platform_get_resource(pdev, IORESOURCE_IRQ, i); - if (!irq_res) { - dev_err(kbdev->dev, "No IRQ resource at index %d\n", i); - return -ENOENT; - } + for (i = 0; i < ARRAY_SIZE(irq_names_caps); i++) { + int irq; #if IS_ENABLED(CONFIG_OF) - if (!strncasecmp(irq_res->name, "JOB", 4)) { - irqtag = JOB_IRQ_TAG; - } else if (!strncasecmp(irq_res->name, "MMU", 4)) { - irqtag = MMU_IRQ_TAG; - } else if (!strncasecmp(irq_res->name, "GPU", 4)) { - irqtag = GPU_IRQ_TAG; - } else { - dev_err(&pdev->dev, "Invalid irq res name: '%s'\n", - irq_res->name); - return -EINVAL; - } + /* We recommend using Upper case for the irq names in dts, but if + * there are devices in the world using Lower case then we should + * avoid breaking support for them. So try using names in Upper case + * first then try using Lower case names. If both attempts fail then + * we assume there is no IRQ resource specified for the GPU. 
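+		 * An illustrative (not normative) node following the
+		 * recommendation would therefore carry something like:
+		 *   interrupt-names = "JOB", "MMU", "GPU";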
+ */ + irq = platform_get_irq_byname(pdev, irq_names_caps[i]); + if (irq < 0) + irq = platform_get_irq_byname(pdev, irq_names[i]); #else - irqtag = i; + irq = platform_get_irq(pdev, i); #endif /* CONFIG_OF */ - kbdev->irqs[irqtag].irq = irq_res->start; - kbdev->irqs[irqtag].flags = irq_res->flags & IRQF_TRIGGER_MASK; + + if (irq < 0) { + dev_err(kbdev->dev, "No IRQ resource '%s'\n", irq_names_caps[i]); + return irq; + } + + kbdev->irqs[i].irq = irq; + kbdev->irqs[i].flags = irqd_get_trigger_type(irq_get_irq_data(irq)); } return 0; @@ -663,6 +664,9 @@ static int kbase_open(struct inode *inode, struct file *filp) if (!kbdev) return -ENODEV; + /* Set address space operation for page migration */ + kbase_mem_migrate_set_address_space_ops(kbdev, filp); + /* Device-wide firmware load is moved here from probing to comply with * Android GKI vendor guideline. */ @@ -1040,52 +1044,11 @@ static int kbase_api_get_ddk_version(struct kbase_context *kctx, return len; } -/* Defaults for legacy just-in-time memory allocator initialization - * kernel calls - */ -#define DEFAULT_MAX_JIT_ALLOCATIONS 255 -#define JIT_LEGACY_TRIM_LEVEL (0) /* No trimming */ - -static int kbase_api_mem_jit_init_10_2(struct kbase_context *kctx, - struct kbase_ioctl_mem_jit_init_10_2 *jit_init) -{ - kctx->jit_version = 1; - - /* since no phys_pages parameter, use the maximum: va_pages */ - return kbase_region_tracker_init_jit(kctx, jit_init->va_pages, - DEFAULT_MAX_JIT_ALLOCATIONS, - JIT_LEGACY_TRIM_LEVEL, BASE_MEM_GROUP_DEFAULT, - jit_init->va_pages); -} - -static int kbase_api_mem_jit_init_11_5(struct kbase_context *kctx, - struct kbase_ioctl_mem_jit_init_11_5 *jit_init) -{ - int i; - - kctx->jit_version = 2; - - for (i = 0; i < sizeof(jit_init->padding); i++) { - /* Ensure all padding bytes are 0 for potential future - * extension - */ - if (jit_init->padding[i]) - return -EINVAL; - } - - /* since no phys_pages parameter, use the maximum: va_pages */ - return kbase_region_tracker_init_jit(kctx, jit_init->va_pages, - jit_init->max_allocations, jit_init->trim_level, - jit_init->group_id, jit_init->va_pages); -} - static int kbase_api_mem_jit_init(struct kbase_context *kctx, struct kbase_ioctl_mem_jit_init *jit_init) { int i; - kctx->jit_version = 3; - for (i = 0; i < sizeof(jit_init->padding); i++) { /* Ensure all padding bytes are 0 for potential future * extension @@ -1243,7 +1206,7 @@ static int kbase_api_mem_flags_change(struct kbase_context *kctx, static int kbase_api_stream_create(struct kbase_context *kctx, struct kbase_ioctl_stream_create *stream) { -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) int fd, ret; /* Name must be NULL-terminated and padded with NULLs, so check last @@ -1265,7 +1228,7 @@ static int kbase_api_stream_create(struct kbase_context *kctx, static int kbase_api_fence_validate(struct kbase_context *kctx, struct kbase_ioctl_fence_validate *validate) { -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) return kbase_sync_fence_validate(validate->fd); #else return -ENOENT; @@ -1279,12 +1242,18 @@ static int kbase_api_mem_profile_add(struct kbase_context *kctx, int err; if (data->len > KBASE_MEM_PROFILE_MAX_BUF_SIZE) { - dev_err(kctx->kbdev->dev, "mem_profile_add: buffer too big\n"); + dev_err(kctx->kbdev->dev, "mem_profile_add: buffer too big"); return -EINVAL; } + if (!data->len) { + dev_err(kctx->kbdev->dev, "mem_profile_add: buffer size is 0"); + /* Should return -EINVAL, but returning -ENOMEM for backwards compat */ + return 
-ENOMEM; + } + buf = kmalloc(data->len, GFP_KERNEL); - if (ZERO_OR_NULL_PTR(buf)) + if (!buf) return -ENOMEM; err = copy_from_user(buf, u64_to_user_ptr(data->buffer), @@ -1494,9 +1463,22 @@ static int kbasep_cs_tiler_heap_init(struct kbase_context *kctx, kctx->jit_group_id = heap_init->in.group_id; return kbase_csf_tiler_heap_init(kctx, heap_init->in.chunk_size, - heap_init->in.initial_chunks, heap_init->in.max_chunks, - heap_init->in.target_in_flight, - &heap_init->out.gpu_heap_va, &heap_init->out.first_chunk_va); + heap_init->in.initial_chunks, heap_init->in.max_chunks, + heap_init->in.target_in_flight, heap_init->in.buf_desc_va, + &heap_init->out.gpu_heap_va, + &heap_init->out.first_chunk_va); +} + +static int kbasep_cs_tiler_heap_init_1_13(struct kbase_context *kctx, + union kbase_ioctl_cs_tiler_heap_init_1_13 *heap_init) +{ + kctx->jit_group_id = heap_init->in.group_id; + + return kbase_csf_tiler_heap_init(kctx, heap_init->in.chunk_size, + heap_init->in.initial_chunks, heap_init->in.max_chunks, + heap_init->in.target_in_flight, 0, + &heap_init->out.gpu_heap_va, + &heap_init->out.first_chunk_va); } static int kbasep_cs_tiler_heap_term(struct kbase_context *kctx, @@ -1578,6 +1560,31 @@ static int kbasep_ioctl_cs_cpu_queue_dump(struct kbase_context *kctx, cpu_queue_info->size); } +#define POWER_DOWN_LATEST_FLUSH_VALUE ((u32)1) +static int kbase_ioctl_read_user_page(struct kbase_context *kctx, + union kbase_ioctl_read_user_page *user_page) +{ + struct kbase_device *kbdev = kctx->kbdev; + unsigned long flags; + + /* As of now, only LATEST_FLUSH is supported */ + if (unlikely(user_page->in.offset != LATEST_FLUSH)) + return -EINVAL; + + /* Validating padding that must be zero */ + if (unlikely(user_page->in.padding != 0)) + return -EINVAL; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + if (!kbdev->pm.backend.gpu_powered) + user_page->out.val_lo = POWER_DOWN_LATEST_FLUSH_VALUE; + else + user_page->out.val_lo = kbase_reg_read(kbdev, USER_REG(LATEST_FLUSH)); + user_page->out.val_hi = 0; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + + return 0; +} #endif /* MALI_USE_CSF */ static int kbasep_ioctl_context_priority_check(struct kbase_context *kctx, @@ -1779,18 +1786,6 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct kbase_ioctl_get_ddk_version, kctx); break; - case KBASE_IOCTL_MEM_JIT_INIT_10_2: - KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_MEM_JIT_INIT_10_2, - kbase_api_mem_jit_init_10_2, - struct kbase_ioctl_mem_jit_init_10_2, - kctx); - break; - case KBASE_IOCTL_MEM_JIT_INIT_11_5: - KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_MEM_JIT_INIT_11_5, - kbase_api_mem_jit_init_11_5, - struct kbase_ioctl_mem_jit_init_11_5, - kctx); - break; case KBASE_IOCTL_MEM_JIT_INIT: KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_MEM_JIT_INIT, kbase_api_mem_jit_init, @@ -2028,6 +2023,11 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) union kbase_ioctl_cs_tiler_heap_init, kctx); break; + case KBASE_IOCTL_CS_TILER_HEAP_INIT_1_13: + KBASE_HANDLE_IOCTL_INOUT(KBASE_IOCTL_CS_TILER_HEAP_INIT_1_13, + kbasep_cs_tiler_heap_init_1_13, + union kbase_ioctl_cs_tiler_heap_init_1_13, kctx); + break; case KBASE_IOCTL_CS_TILER_HEAP_TERM: KBASE_HANDLE_IOCTL_IN(KBASE_IOCTL_CS_TILER_HEAP_TERM, kbasep_cs_tiler_heap_term, @@ -2046,6 +2046,10 @@ static long kbase_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct kbase_ioctl_cs_cpu_queue_info, kctx); break; + case KBASE_IOCTL_READ_USER_PAGE: + KBASE_HANDLE_IOCTL_INOUT(KBASE_IOCTL_READ_USER_PAGE, 
kbase_ioctl_read_user_page,
+						     union kbase_ioctl_read_user_page, kctx);
+		break;
 #endif /* MALI_USE_CSF */
 #if MALI_UNIT_TEST
 	case KBASE_IOCTL_TLSTREAM_STATS:
@@ -2088,6 +2092,9 @@ static ssize_t kbase_read(struct file *filp, char __user *buf, size_t count, lof
 	if (unlikely(!kctx))
 		return -EPERM;
 
+	if (count < data_size)
+		return -ENOBUFS;
+
 	if (atomic_read(&kctx->event_count))
 		read_event = true;
 	else
@@ -3185,10 +3192,10 @@ static ssize_t gpuinfo_show(struct device *dev,
 	  .name = "Mali-G510" },
 	{ .id = GPU_ID2_PRODUCT_TVAX >> KBASE_GPU_ID_VERSION_PRODUCT_ID_SHIFT,
 	  .name = "Mali-G310" },
-	{ .id = GPU_ID2_PRODUCT_TTUX >> KBASE_GPU_ID_VERSION_PRODUCT_ID_SHIFT,
-	  .name = "Mali-TTUX" },
-	{ .id = GPU_ID2_PRODUCT_LTUX >> KBASE_GPU_ID_VERSION_PRODUCT_ID_SHIFT,
-	  .name = "Mali-LTUX" },
+	{ .id = GPU_ID2_PRODUCT_TTIX >> KBASE_GPU_ID_VERSION_PRODUCT_ID_SHIFT,
+	  .name = "Mali-TTIX" },
+	{ .id = GPU_ID2_PRODUCT_LTIX >> KBASE_GPU_ID_VERSION_PRODUCT_ID_SHIFT,
+	  .name = "Mali-LTIX" },
 	};
 	const char *product_name = "(Unknown Mali GPU)";
 	struct kbase_device *kbdev;
@@ -3223,19 +3230,19 @@ static ssize_t gpuinfo_show(struct device *dev,
 		GPU_FEATURES_RAY_TRACING_GET(gpu_props->props.raw_props.gpu_features);
 	const u8 nr_cores = gpu_props->num_cores;
 
-	/* Mali-TTUX_B(ig) if 10 < number of cores with ray tracing supproted.
-	 * Mali-TTUX if 10 < number of cores without ray tracing supported.
-	 * Mali-TTUX if 7 <= number of cores <= 10 regardless ray tracing.
-	 * Mali-LTUX if number of cores < 7.
+	/* Mali-G715-Immortalis if 10 < number of cores with ray tracing supported.
+	 * Mali-G715 if 10 < number of cores without ray tracing supported.
+	 * Mali-G715 if 7 <= number of cores <= 10 regardless of ray tracing.
+	 * Mali-G615 if number of cores < 7.
 	 */
 	if ((nr_cores > 10) && rt_supported)
-		product_name = "Mali-TTUX_B";
+		product_name = "Mali-G715-Immortalis";
 	else if (nr_cores >= 7)
-		product_name = "Mali-TTUX";
+		product_name = "Mali-G715";
 
 	if (nr_cores < 7) {
-		dev_warn(kbdev->dev, "nr_cores(%u) GPU ID must be LTUX", nr_cores);
-		product_name = "Mali-LTUX";
+		dev_warn(kbdev->dev, "nr_cores(%u) GPU ID must be G615", nr_cores);
+		product_name = "Mali-G615";
 	} else
 		dev_dbg(kbdev->dev, "GPU ID_Name: %s, nr_cores(%u)\n",
			product_name, nr_cores);
@@ -4511,7 +4518,7 @@ int power_control_init(struct kbase_device *kbdev)
 	for (i = 0; i < ARRAY_SIZE(regulator_names); i++) {
 		kbdev->regulators[i] = regulator_get_optional(kbdev->dev,
			regulator_names[i]);
-		if (IS_ERR_OR_NULL(kbdev->regulators[i])) {
+		if (IS_ERR(kbdev->regulators[i])) {
 			err = PTR_ERR(kbdev->regulators[i]);
 			kbdev->regulators[i] = NULL;
 			break;
@@ -4539,7 +4546,7 @@ int power_control_init(struct kbase_device *kbdev)
 	 */
 	for (i = 0; i < BASE_MAX_NR_CLOCKS_REGULATORS; i++) {
 		kbdev->clocks[i] = of_clk_get(kbdev->dev->of_node, i);
-		if (IS_ERR_OR_NULL(kbdev->clocks[i])) {
+		if (IS_ERR(kbdev->clocks[i])) {
 			err = PTR_ERR(kbdev->clocks[i]);
 			kbdev->clocks[i] = NULL;
 			break;
@@ -4801,52 +4808,84 @@ static const struct file_operations
 	.release = single_release,
 };
 
-int kbase_device_debugfs_init(struct kbase_device *kbdev)
+/**
+ * debugfs_ctx_defaults_init - Create the default configuration of new contexts in debugfs
+ * @kbdev: An instance of the GPU platform device, allocated from the probe method of the driver.
+ * Return: A pointer to the last dentry that it tried to create, whether successful or not.
+ * Could be NULL or encode another error value.
+ */ +static struct dentry *debugfs_ctx_defaults_init(struct kbase_device *const kbdev) { - struct dentry *debugfs_ctx_defaults_directory; - int err; /* prevent unprivileged use of debug file system * in old kernel version */ const mode_t mode = 0644; + struct dentry *dentry = debugfs_create_dir("defaults", kbdev->debugfs_ctx_directory); + struct dentry *debugfs_ctx_defaults_directory = dentry; - kbdev->mali_debugfs_directory = debugfs_create_dir(kbdev->devname, - NULL); - if (IS_ERR_OR_NULL(kbdev->mali_debugfs_directory)) { + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Couldn't create mali debugfs ctx defaults directory\n"); + return dentry; + } + + debugfs_create_bool("infinite_cache", mode, + debugfs_ctx_defaults_directory, + &kbdev->infinite_cache_active_default); + + dentry = debugfs_create_file("mem_pool_max_size", mode, debugfs_ctx_defaults_directory, + &kbdev->mem_pool_defaults.small, + &kbase_device_debugfs_mem_pool_max_size_fops); + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create mem_pool_max_size debugfs entry\n"); + return dentry; + } + + dentry = debugfs_create_file("lp_mem_pool_max_size", mode, debugfs_ctx_defaults_directory, + &kbdev->mem_pool_defaults.large, + &kbase_device_debugfs_mem_pool_max_size_fops); + if (IS_ERR_OR_NULL(dentry)) + dev_err(kbdev->dev, "Unable to create lp_mem_pool_max_size debugfs entry\n"); + + return dentry; +} + +/** + * init_debugfs - Create device-wide debugfs directories and files for the Mali driver + * @kbdev: An instance of the GPU platform device, allocated from the probe method of the driver. + * Return: A pointer to the last dentry that it tried to create, whether successful or not. + * Could be NULL or encode another error value. + */ +static struct dentry *init_debugfs(struct kbase_device *kbdev) +{ + struct dentry *dentry = debugfs_create_dir(kbdev->devname, NULL); + + kbdev->mali_debugfs_directory = dentry; + if (IS_ERR_OR_NULL(dentry)) { dev_err(kbdev->dev, "Couldn't create mali debugfs directory: %s\n", kbdev->devname); - err = -ENOMEM; - goto out; + return dentry; } - kbdev->debugfs_ctx_directory = debugfs_create_dir("ctx", - kbdev->mali_debugfs_directory); - if (IS_ERR_OR_NULL(kbdev->debugfs_ctx_directory)) { + dentry = debugfs_create_dir("ctx", kbdev->mali_debugfs_directory); + kbdev->debugfs_ctx_directory = dentry; + if (IS_ERR_OR_NULL(dentry)) { dev_err(kbdev->dev, "Couldn't create mali debugfs ctx directory\n"); - err = -ENOMEM; - goto out; + return dentry; } - kbdev->debugfs_instr_directory = debugfs_create_dir("instrumentation", - kbdev->mali_debugfs_directory); - if (IS_ERR_OR_NULL(kbdev->debugfs_instr_directory)) { + dentry = debugfs_create_dir("instrumentation", kbdev->mali_debugfs_directory); + kbdev->debugfs_instr_directory = dentry; + if (IS_ERR_OR_NULL(dentry)) { dev_err(kbdev->dev, "Couldn't create mali debugfs instrumentation directory\n"); - err = -ENOMEM; - goto out; - } - - debugfs_ctx_defaults_directory = debugfs_create_dir("defaults", - kbdev->debugfs_ctx_directory); - if (IS_ERR_OR_NULL(debugfs_ctx_defaults_directory)) { - dev_err(kbdev->dev, "Couldn't create mali debugfs ctx defaults directory\n"); - err = -ENOMEM; - goto out; + return dentry; } kbasep_regs_history_debugfs_init(kbdev); -#if !MALI_USE_CSF +#if MALI_USE_CSF + kbase_debug_csf_fault_debugfs_init(kbdev); +#else /* MALI_USE_CSF */ kbase_debug_job_fault_debugfs_init(kbdev); #endif /* !MALI_USE_CSF */ @@ -4860,41 +4899,58 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) /* fops_* variables created by 
invocations of macro * MAKE_QUIRK_ACCESSORS() above. */ - debugfs_create_file("quirks_sc", 0644, + dentry = debugfs_create_file("quirks_sc", 0644, kbdev->mali_debugfs_directory, kbdev, &fops_sc_quirks); - debugfs_create_file("quirks_tiler", 0644, - kbdev->mali_debugfs_directory, kbdev, - &fops_tiler_quirks); - debugfs_create_file("quirks_mmu", 0644, - kbdev->mali_debugfs_directory, kbdev, - &fops_mmu_quirks); - debugfs_create_file("quirks_gpu", 0644, kbdev->mali_debugfs_directory, - kbdev, &fops_gpu_quirks); - - debugfs_create_bool("infinite_cache", mode, - debugfs_ctx_defaults_directory, - &kbdev->infinite_cache_active_default); - - debugfs_create_file("mem_pool_max_size", mode, - debugfs_ctx_defaults_directory, - &kbdev->mem_pool_defaults.small, - &kbase_device_debugfs_mem_pool_max_size_fops); - - debugfs_create_file("lp_mem_pool_max_size", mode, - debugfs_ctx_defaults_directory, - &kbdev->mem_pool_defaults.large, - &kbase_device_debugfs_mem_pool_max_size_fops); - - if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE)) { - debugfs_create_file("protected_debug_mode", 0444, - kbdev->mali_debugfs_directory, kbdev, - &fops_protected_debug_mode); + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create quirks_sc debugfs entry\n"); + return dentry; } - debugfs_create_file("reset", 0644, + dentry = debugfs_create_file("quirks_tiler", 0644, + kbdev->mali_debugfs_directory, kbdev, + &fops_tiler_quirks); + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create quirks_tiler debugfs entry\n"); + return dentry; + } + + dentry = debugfs_create_file("quirks_mmu", 0644, + kbdev->mali_debugfs_directory, kbdev, + &fops_mmu_quirks); + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create quirks_mmu debugfs entry\n"); + return dentry; + } + + dentry = debugfs_create_file("quirks_gpu", 0644, kbdev->mali_debugfs_directory, + kbdev, &fops_gpu_quirks); + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create quirks_gpu debugfs entry\n"); + return dentry; + } + + dentry = debugfs_ctx_defaults_init(kbdev); + if (IS_ERR_OR_NULL(dentry)) + return dentry; + + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PROTECTED_DEBUG_MODE)) { + dentry = debugfs_create_file("protected_debug_mode", 0444, + kbdev->mali_debugfs_directory, kbdev, + &fops_protected_debug_mode); + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create protected_debug_mode debugfs entry\n"); + return dentry; + } + } + + dentry = debugfs_create_file("reset", 0644, kbdev->mali_debugfs_directory, kbdev, &fops_trigger_reset); + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create reset debugfs entry\n"); + return dentry; + } kbase_ktrace_debugfs_init(kbdev); @@ -4907,19 +4963,30 @@ int kbase_device_debugfs_init(struct kbase_device *kbdev) #endif /* CONFIG_MALI_BIFROST_DEVFREQ */ #if !MALI_USE_CSF - debugfs_create_file("serialize_jobs", 0644, + dentry = debugfs_create_file("serialize_jobs", 0644, kbdev->mali_debugfs_directory, kbdev, &kbasep_serialize_jobs_debugfs_fops); - + if (IS_ERR_OR_NULL(dentry)) { + dev_err(kbdev->dev, "Unable to create serialize_jobs debugfs entry\n"); + return dentry; + } + kbase_timeline_io_debugfs_init(kbdev); #endif kbase_dvfs_status_debugfs_init(kbdev); - return 0; + return dentry; +} -out: - debugfs_remove_recursive(kbdev->mali_debugfs_directory); - return err; +int kbase_device_debugfs_init(struct kbase_device *kbdev) +{ + struct dentry *dentry = init_debugfs(kbdev); + + if (IS_ERR_OR_NULL(dentry)) { + 
debugfs_remove_recursive(kbdev->mali_debugfs_directory); + return IS_ERR(dentry) ? PTR_ERR(dentry) : -ENOMEM; + } + return 0; } void kbase_device_debugfs_term(struct kbase_device *kbdev) @@ -5441,7 +5508,9 @@ static int kbase_platform_device_probe(struct platform_device *pdev) kbdev->dev = &pdev->dev; dev_set_drvdata(kbdev->dev, kbdev); - +#if (KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) + mutex_lock(&kbase_probe_mutex); +#endif err = kbase_device_init(kbdev); if (err) { @@ -5453,10 +5522,16 @@ static int kbase_platform_device_probe(struct platform_device *pdev) dev_set_drvdata(kbdev->dev, NULL); kbase_device_free(kbdev); +#if (KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) + mutex_unlock(&kbase_probe_mutex); +#endif } else { dev_info(kbdev->dev, "Probed as %s\n", dev_name(kbdev->mdev.this_device)); kbase_increment_device_id(); +#if (KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) + mutex_unlock(&kbase_probe_mutex); +#endif #ifdef CONFIG_MALI_ARBITER_SUPPORT mutex_lock(&kbdev->pm.lock); kbase_arbiter_pm_vm_event(kbdev, KBASE_VM_GPU_INITIALIZED_EVT); @@ -5690,10 +5765,11 @@ static const struct dev_pm_ops kbase_pm_ops = { }; #if IS_ENABLED(CONFIG_OF) -static const struct of_device_id kbase_dt_ids[] = { - { .compatible = "arm,mali-bifrost" }, - { /* sentinel */ } -}; +static const struct of_device_id kbase_dt_ids[] = { { .compatible = "arm,malit6xx" }, + { .compatible = "arm,mali-midgard" }, + { .compatible = "arm,mali-bifrost" }, + { .compatible = "arm,mali-valhall" }, + { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, kbase_dt_ids); #endif @@ -5708,26 +5784,29 @@ static struct platform_driver kbase_platform_driver = { }, }; -/* - * The driver will not provide a shortcut to create the Mali platform device - * anymore when using Device Tree. - */ -#if IS_ENABLED(CONFIG_OF) +#if (KERNEL_VERSION(5, 3, 0) > LINUX_VERSION_CODE) && IS_ENABLED(CONFIG_OF) module_platform_driver(kbase_platform_driver); #else - static int __init kbase_driver_init(void) { int ret; +#if (KERNEL_VERSION(5, 3, 0) <= LINUX_VERSION_CODE) + mutex_init(&kbase_probe_mutex); +#endif + +#ifndef CONFIG_OF ret = kbase_platform_register(); if (ret) return ret; - +#endif ret = platform_driver_register(&kbase_platform_driver); - - if (ret) +#ifndef CONFIG_OF + if (ret) { kbase_platform_unregister(); + return ret; + } +#endif return ret; } @@ -5735,14 +5814,14 @@ static int __init kbase_driver_init(void) static void __exit kbase_driver_exit(void) { platform_driver_unregister(&kbase_platform_driver); +#ifndef CONFIG_OF kbase_platform_unregister(); +#endif } module_init(kbase_driver_init); module_exit(kbase_driver_exit); - -#endif /* CONFIG_OF */ - +#endif MODULE_LICENSE("GPL"); MODULE_VERSION(MALI_RELEASE_NAME " (UK version " \ __stringify(BASE_UK_VERSION_MAJOR) "." 
\
diff --git a/drivers/gpu/arm/bifrost/mali_kbase_ctx_sched.c b/drivers/gpu/arm/bifrost/mali_kbase_ctx_sched.c
index f4a46c12ac92..60afde2ceb7f 100644
--- a/drivers/gpu/arm/bifrost/mali_kbase_ctx_sched.c
+++ b/drivers/gpu/arm/bifrost/mali_kbase_ctx_sched.c
@@ -327,16 +327,14 @@ struct kbase_context *kbase_ctx_sched_as_to_ctx_nolock(
 bool kbase_ctx_sched_inc_refcount_nolock(struct kbase_context *kctx)
 {
 	bool result = false;
-	int as_nr;
 
 	if (WARN_ON(kctx == NULL))
 		return result;
 
 	lockdep_assert_held(&kctx->kbdev->hwaccess_lock);
 
-	as_nr = kctx->as_nr;
 	if (atomic_read(&kctx->refcount) > 0) {
-		KBASE_DEBUG_ASSERT(as_nr >= 0);
+		KBASE_DEBUG_ASSERT(kctx->as_nr >= 0);
 
 		kbase_ctx_sched_retain_ctx_refcount(kctx);
 		KBASE_KTRACE_ADD(kctx->kbdev, SCHED_RETAIN_CTX_NOLOCK, kctx,
diff --git a/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_allocs.c b/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_allocs.c
new file mode 100644
index 000000000000..598d8f594644
--- /dev/null
+++ b/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_allocs.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note
+/*
+ *
+ * (C) COPYRIGHT 2022 ARM Limited. All rights reserved.
+ *
+ * This program is free software and is provided to you under the terms of the
+ * GNU General Public License version 2 as published by the Free Software
+ * Foundation, and any use by you of this program is subject to the terms
+ * of such GNU license.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ */
+
+/*
+ * Debugfs interface to dump information about GPU allocations in kctx
+ */
+
+#include "mali_kbase_debug_mem_allocs.h"
+#include "mali_kbase.h"
+
+#include
+#include
+#include
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+
+/**
+ * debug_zone_mem_allocs_show - Show information from specific rbtree
+ * @zone: Name of GPU virtual memory zone
+ * @rbtree: Pointer to the root of the rbtree associated with @zone
+ * @sfile: The debugfs entry
+ *
+ * This function is called to show information about all the GPU allocations of
+ * a particular zone within GPU virtual memory space of a context.
+ * Information such as the start virtual address and size (in bytes) is shown for
+ * every GPU allocation mapped in the zone.
+ */ +static void debug_zone_mem_allocs_show(char *zone, struct rb_root *rbtree, struct seq_file *sfile) +{ + struct rb_node *p; + struct kbase_va_region *reg; + const char *type_names[5] = { + "Native", + "Imported UMM", + "Imported user buf", + "Alias", + "Raw" + }; + +#define MEM_ALLOCS_HEADER \ + " VA, VA size, Commit size, Flags, Mem type\n" + seq_printf(sfile, "Zone name: %s\n:", zone); + seq_printf(sfile, MEM_ALLOCS_HEADER); + for (p = rb_first(rbtree); p; p = rb_next(p)) { + reg = rb_entry(p, struct kbase_va_region, rblink); + if (!(reg->flags & KBASE_REG_FREE)) { + seq_printf(sfile, "%16llx, %16zx, %16zx, %8lx, %s\n", + reg->start_pfn << PAGE_SHIFT, reg->nr_pages << PAGE_SHIFT, + kbase_reg_current_backed_size(reg) << PAGE_SHIFT, + reg->flags, type_names[reg->gpu_alloc->type]); + } + } +} + +/** + * debug_ctx_mem_allocs_show - Show information about GPU allocations in a kctx + * @sfile: The debugfs entry + * @data: Data associated with the entry + * + * Return: + * 0 if successfully prints data in debugfs entry file + * -1 if it encountered an error + */ +static int debug_ctx_mem_allocs_show(struct seq_file *sfile, void *data) +{ + struct kbase_context *const kctx = sfile->private; + + kbase_gpu_vm_lock(kctx); + + debug_zone_mem_allocs_show("SAME_VA:", &kctx->reg_rbtree_same, sfile); + debug_zone_mem_allocs_show("CUSTOM_VA:", &kctx->reg_rbtree_custom, sfile); + debug_zone_mem_allocs_show("EXEC_VA:", &kctx->reg_rbtree_exec, sfile); + +#if MALI_USE_CSF + debug_zone_mem_allocs_show("EXEC_VA_FIXED:", &kctx->reg_rbtree_exec_fixed, sfile); + debug_zone_mem_allocs_show("FIXED_VA:", &kctx->reg_rbtree_fixed, sfile); +#endif /* MALI_USE_CSF */ + + kbase_gpu_vm_unlock(kctx); + return 0; +} + +/* + * File operations related to debugfs entry for mem_zones + */ +static int debug_mem_allocs_open(struct inode *in, struct file *file) +{ + return single_open(file, debug_ctx_mem_allocs_show, in->i_private); +} + +static const struct file_operations kbase_debug_mem_allocs_fops = { + .owner = THIS_MODULE, + .open = debug_mem_allocs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Initialize debugfs entry for mem_allocs + */ +void kbase_debug_mem_allocs_init(struct kbase_context *const kctx) +{ + /* Caller already ensures this, but we keep the pattern for + * maintenance safety. + */ + if (WARN_ON(!kctx) || WARN_ON(IS_ERR_OR_NULL(kctx->kctx_dentry))) + return; + + debugfs_create_file("mem_allocs", 0400, kctx->kctx_dentry, kctx, + &kbase_debug_mem_allocs_fops); +} +#else +/* + * Stub functions for when debugfs is disabled + */ +void kbase_debug_mem_allocs_init(struct kbase_context *const kctx) +{ +} +#endif diff --git a/drivers/base/arm/dma_buf_lock/src/dma_buf_lock.h b/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_allocs.h similarity index 56% rename from drivers/base/arm/dma_buf_lock/src/dma_buf_lock.h rename to drivers/gpu/arm/bifrost/mali_kbase_debug_mem_allocs.h index b5bb22553fe2..8cf69c2cbaf9 100644 --- a/drivers/base/arm/dma_buf_lock/src/dma_buf_lock.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_allocs.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2012, 2020-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. 
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@@ -19,26 +19,21 @@
 *
 */
 
-#ifndef _DMA_BUF_LOCK_H
-#define _DMA_BUF_LOCK_H
+#ifndef _KBASE_DEBUG_MEM_ALLOCS_H
+#define _KBASE_DEBUG_MEM_ALLOCS_H
 
-enum dma_buf_lock_exclusive {
-	DMA_BUF_LOCK_NONEXCLUSIVE = 0,
-	DMA_BUF_LOCK_EXCLUSIVE = -1
-};
+#include
 
-struct dma_buf_lock_k_request {
-	int count;
-	int *list_of_dma_buf_fds;
-	int timeout;
-	enum dma_buf_lock_exclusive exclusive;
-};
+/**
+ * kbase_debug_mem_allocs_init() - Initialize the mem_allocs debugfs file
+ * @kctx: Pointer to kernel base context
+ *
+ * This function creates a "mem_allocs" file for a context to show info about the
+ * GPU allocations created for that context.
+ *
+ * The file is cleaned up by a call to debugfs_remove_recursive() deleting the
+ * parent directory.
+ */
+void kbase_debug_mem_allocs_init(struct kbase_context *kctx);
 
-#define DMA_BUF_LOCK_IOC_MAGIC '~'
-
-#define DMA_BUF_LOCK_FUNC_LOCK_ASYNC _IOW(DMA_BUF_LOCK_IOC_MAGIC, 11, struct dma_buf_lock_k_request)
-
-#define DMA_BUF_LOCK_IOC_MINNR 11
-#define DMA_BUF_LOCK_IOC_MAXNR 11
-
-#endif /* _DMA_BUF_LOCK_H */
+#endif
diff --git a/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_view.h b/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_view.h
index d03483219b08..cb8050d9b32c 100644
--- a/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_view.h
+++ b/drivers/gpu/arm/bifrost/mali_kbase_debug_mem_view.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *
- * (C) COPYRIGHT 2013-2015, 2019-2021 ARM Limited. All rights reserved.
+ * (C) COPYRIGHT 2013-2015, 2019-2022 ARM Limited. All rights reserved.
 *
 * This program is free software and is provided to you under the terms of the
 * GNU General Public License version 2 as published by the Free Software
@@ -25,7 +25,7 @@
 #include
 
 /**
- * kbase_debug_mem_view_init - Initialize the mem_view sysfs file
+ * kbase_debug_mem_view_init - Initialize the mem_view debugfs file
 * @kctx: Pointer to kernel base context
 *
 * This function creates a "mem_view" file which can be used to get a view of
diff --git a/drivers/gpu/arm/bifrost/mali_kbase_defs.h b/drivers/gpu/arm/bifrost/mali_kbase_defs.h
index 9fafe96d14a8..1072eac6d186 100755
--- a/drivers/gpu/arm/bifrost/mali_kbase_defs.h
+++ b/drivers/gpu/arm/bifrost/mali_kbase_defs.h
@@ -35,13 +35,13 @@
 #include
 #include
 #include
-#include
+#include
 
 #if MALI_USE_CSF
-#include
+#include
 #else
-#include
-#include
+#include
+#include
 #endif
 
 #include
@@ -53,11 +53,7 @@
 
 #include
 
-#if defined(CONFIG_SYNC)
-#include
-#else
 #include "mali_kbase_fence_defs.h"
-#endif
 
 #if IS_ENABLED(CONFIG_DEBUG_FS)
 #include
@@ -133,8 +129,7 @@
 /* Maximum number of pages of memory that require a permanent mapping, per
 * kbase_context
 */
-#define KBASE_PERMANENTLY_MAPPED_MEM_LIMIT_PAGES ((32 * 1024ul * 1024ul) >> \
-	PAGE_SHIFT)
+#define KBASE_PERMANENTLY_MAPPED_MEM_LIMIT_PAGES ((64 * 1024ul * 1024ul) >> PAGE_SHIFT)
 
 /* Minimum threshold period for hwcnt dumps between different hwcnt virtualizer
 * clients, to reduce undesired system load.
* If a virtualizer client requests a dump within this threshold period after @@ -446,36 +441,40 @@ struct kbase_pm_device_data { /** * struct kbase_mem_pool - Page based memory pool for kctx/kbdev - * @kbdev: Kbase device where memory is used - * @cur_size: Number of free pages currently in the pool (may exceed - * @max_size in some corner cases) - * @max_size: Maximum number of free pages in the pool - * @order: order = 0 refers to a pool of 4 KB pages - * order = 9 refers to a pool of 2 MB pages (2^9 * 4KB = 2 MB) - * @group_id: A memory group ID to be passed to a platform-specific - * memory group manager, if present. Immutable. - * Valid range is 0..(MEMORY_GROUP_MANAGER_NR_GROUPS-1). - * @pool_lock: Lock protecting the pool - must be held when modifying - * @cur_size and @page_list - * @page_list: List of free pages in the pool - * @reclaim: Shrinker for kernel reclaim of free pages - * @next_pool: Pointer to next pool where pages can be allocated when this - * pool is empty. Pages will spill over to the next pool when - * this pool is full. Can be NULL if there is no next pool. - * @dying: true if the pool is being terminated, and any ongoing - * operations should be abandoned - * @dont_reclaim: true if the shrinker is forbidden from reclaiming memory from - * this pool, eg during a grow operation + * @kbdev: Kbase device where memory is used + * @cur_size: Number of free pages currently in the pool (may exceed + * @max_size in some corner cases) + * @max_size: Maximum number of free pages in the pool + * @order: order = 0 refers to a pool of 4 KB pages + * order = 9 refers to a pool of 2 MB pages (2^9 * 4KB = 2 MB) + * @group_id: A memory group ID to be passed to a platform-specific + * memory group manager, if present. Immutable. + * Valid range is 0..(MEMORY_GROUP_MANAGER_NR_GROUPS-1). + * @pool_lock: Lock protecting the pool - must be held when modifying + * @cur_size and @page_list + * @page_list: List of free pages in the pool + * @reclaim: Shrinker for kernel reclaim of free pages + * @isolation_in_progress_cnt: Number of pages in pool undergoing page isolation. + * This is used to avoid race condition between pool termination + * and page isolation for page migration. + * @next_pool: Pointer to next pool where pages can be allocated when this + * pool is empty. Pages will spill over to the next pool when + * this pool is full. Can be NULL if there is no next pool. + * @dying: true if the pool is being terminated, and any ongoing + * operations should be abandoned + * @dont_reclaim: true if the shrinker is forbidden from reclaiming memory from + * this pool, eg during a grow operation */ struct kbase_mem_pool { struct kbase_device *kbdev; - size_t cur_size; - size_t max_size; - u8 order; - u8 group_id; - spinlock_t pool_lock; - struct list_head page_list; - struct shrinker reclaim; + size_t cur_size; + size_t max_size; + u8 order; + u8 group_id; + spinlock_t pool_lock; + struct list_head page_list; + struct shrinker reclaim; + atomic_t isolation_in_progress_cnt; struct kbase_mem_pool *next_pool; @@ -562,7 +561,7 @@ struct kbase_devfreq_opp { * @entry_set_pte: program the pte to be a valid entry to encode the physical * address of the next lower level page table and also update * the number of valid entries. - * @entry_invalidate: clear out or invalidate the pte. + * @entries_invalidate: clear out or invalidate a range of ptes. * @get_num_valid_entries: returns the number of valid entries for a specific pgd. 
* @set_num_valid_entries: sets the number of valid entries for a specific pgd * @flags: bitmask of MMU mode flags. Refer to KBASE_MMU_MODE_ constants. @@ -580,7 +579,7 @@ struct kbase_mmu_mode { void (*entry_set_ate)(u64 *entry, struct tagged_addr phy, unsigned long flags, int level); void (*entry_set_pte)(u64 *entry, phys_addr_t phy); - void (*entry_invalidate)(u64 *entry); + void (*entries_invalidate)(u64 *entry, u32 count); unsigned int (*get_num_valid_entries)(u64 *pgd); void (*set_num_valid_entries)(u64 *pgd, unsigned int num_of_valid_entries); @@ -647,6 +646,30 @@ struct kbase_process { struct rb_root dma_buf_root; }; +/** + * struct kbase_mem_migrate - Object representing an instance for managing + * page migration. + * + * @mapping: Pointer to address space struct used for page migration. + * @free_pages_list: List of deferred pages to free. Mostly used when page migration + * is enabled. Pages in memory pool that require migrating + * will be freed instead. However page cannot be freed + * right away as Linux will need to release the page lock. + * Therefore page will be added to this list and freed later. + * @free_pages_lock: This lock should be held when adding or removing pages + * from @free_pages_list. + * @free_pages_workq: Work queue to process the work items queued to free + * pages in @free_pages_list. + * @free_pages_work: Work item to free pages in @free_pages_list. + */ +struct kbase_mem_migrate { + struct address_space *mapping; + struct list_head free_pages_list; + spinlock_t free_pages_lock; + struct workqueue_struct *free_pages_workq; + struct work_struct free_pages_work; +}; + /** * struct kbase_device - Object representing an instance of GPU platform device, * allocated from the probe method of mali driver. @@ -962,6 +985,7 @@ struct kbase_process { * @pcm_dev: The priority control manager device. * @oom_notifier_block: notifier_block containing kernel-registered out-of- * memory handler. + * @mem_migrate: Per device object for managing page migration. */ struct kbase_device { u32 hw_quirks_sc; @@ -1029,6 +1053,12 @@ struct kbase_device { s8 nr_hw_address_spaces; s8 nr_user_address_spaces; + /** + * @pbha_propagate_bits: Record of Page-Based Hardware Attribute Propagate bits to + * restore to L2_CONFIG upon GPU reset. + */ + u8 pbha_propagate_bits; + #if MALI_USE_CSF struct kbase_hwcnt_backend_csf_if hwcnt_backend_csf_if_fw; #else @@ -1115,7 +1145,9 @@ struct kbase_device { #endif /* CONFIG_MALI_BIFROST_DEVFREQ */ unsigned long previous_frequency; +#if !MALI_USE_CSF atomic_t job_fault_debug; +#endif /* !MALI_USE_CSF */ #if IS_ENABLED(CONFIG_DEBUG_FS) struct dentry *mali_debugfs_directory; @@ -1126,11 +1158,13 @@ struct kbase_device { u64 debugfs_as_read_bitmap; #endif /* CONFIG_MALI_BIFROST_DEBUG */ +#if !MALI_USE_CSF wait_queue_head_t job_fault_wq; wait_queue_head_t job_fault_resume_wq; struct workqueue_struct *job_fault_resume_workq; struct list_head job_fault_event_list; spinlock_t job_fault_event_lock; +#endif /* !MALI_USE_CSF */ #if !MALI_CUSTOMER_RELEASE struct { @@ -1250,6 +1284,8 @@ struct kbase_device { */ u32 num_of_atoms_hw_completed; #endif + + struct kbase_mem_migrate mem_migrate; }; /** @@ -1332,10 +1368,6 @@ struct kbase_file { * * @KCTX_DYING: Set when the context process is in the process of being evicted. * - * @KCTX_NO_IMPLICIT_SYNC: Set when explicit Android fences are in use on this - * context, to disable use of implicit dma-buf fences. This is used to avoid - * potential synchronization deadlocks. 
- * * @KCTX_FORCE_SAME_VA: Set when BASE_MEM_SAME_VA should be forced on memory * allocations. For 64-bit clients it is enabled by default, and disabled by * default on 32-bit clients. Being able to clear this flag is only used for @@ -1378,7 +1410,6 @@ enum kbase_context_flags { KCTX_PRIVILEGED = 1U << 7, KCTX_SCHEDULED = 1U << 8, KCTX_DYING = 1U << 9, - KCTX_NO_IMPLICIT_SYNC = 1U << 10, KCTX_FORCE_SAME_VA = 1U << 11, KCTX_PULLED_SINCE_ACTIVE_JS0 = 1U << 12, KCTX_PULLED_SINCE_ACTIVE_JS1 = 1U << 13, @@ -1417,9 +1448,6 @@ enum kbase_context_flags { * * @KCTX_DYING: Set when the context process is in the process of being evicted. * - * @KCTX_NO_IMPLICIT_SYNC: Set when explicit Android fences are in use on this - * context, to disable use of implicit dma-buf fences. This is used to avoid - * potential synchronization deadlocks. * * @KCTX_FORCE_SAME_VA: Set when BASE_MEM_SAME_VA should be forced on memory * allocations. For 64-bit clients it is enabled by default, and disabled by @@ -1460,7 +1488,6 @@ enum kbase_context_flags { KCTX_PRIVILEGED = 1U << 7, KCTX_SCHEDULED = 1U << 8, KCTX_DYING = 1U << 9, - KCTX_NO_IMPLICIT_SYNC = 1U << 10, KCTX_FORCE_SAME_VA = 1U << 11, KCTX_PULLED_SINCE_ACTIVE_JS0 = 1U << 12, KCTX_PULLED_SINCE_ACTIVE_JS1 = 1U << 13, @@ -1667,12 +1694,6 @@ struct kbase_sub_alloc { * memory allocations. * @jit_current_allocations_per_bin: Current number of in-flight just-in-time * memory allocations per bin. - * @jit_version: Version number indicating whether userspace is using - * old or new version of interface for just-in-time - * memory allocations. - * 1 -> client used KBASE_IOCTL_MEM_JIT_INIT_10_2 - * 2 -> client used KBASE_IOCTL_MEM_JIT_INIT_11_5 - * 3 -> client used KBASE_IOCTL_MEM_JIT_INIT * @jit_group_id: A memory group ID to be passed to a platform-specific * memory group manager. * Valid range is 0..(MEMORY_GROUP_MANAGER_NR_GROUPS-1). @@ -1826,12 +1847,6 @@ struct kbase_context { struct list_head waiting_soft_jobs; spinlock_t waiting_soft_jobs_lock; -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - struct { - struct list_head waiting_resource; - struct workqueue_struct *wq; - } dma_fence; -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ int as_nr; @@ -1863,7 +1878,6 @@ struct kbase_context { u8 jit_max_allocations; u8 jit_current_allocations; u8 jit_current_allocations_per_bin[256]; - u8 jit_version; u8 jit_group_id; #if MALI_JIT_PRESSURE_LIMIT_BASE u64 jit_phys_pages_limit; @@ -1930,17 +1944,15 @@ struct kbasep_gwt_list_element { * to a @kbase_context. * @ext_res_node: List head for adding the metadata to a * @kbase_context. - * @alloc: The physical memory allocation structure - * which is mapped. - * @gpu_addr: The GPU virtual address the resource is - * mapped to. + * @reg: External resource information, containing + * the corresponding VA region * @ref: Reference count. * * External resources can be mapped into multiple contexts as well as the same * context multiple times. - * As kbase_va_region itself isn't refcounted we can't attach our extra - * information to it as it could be removed under our feet leaving external - * resources pinned. + * As kbase_va_region is refcounted, we guarantee that it will be available + * for the duration of the external resource, meaning it is sufficient to use + * it to rederive any additional data, like the GPU address. 
* This metadata structure binds a single external resource to a single * context, ensuring that per context mapping is tracked separately so it can * be overridden when needed and abuses by the application (freeing the resource @@ -1948,8 +1960,7 @@ struct kbasep_gwt_list_element { */ struct kbase_ctx_ext_res_meta { struct list_head ext_res_node; - struct kbase_mem_phy_alloc *alloc; - u64 gpu_addr; + struct kbase_va_region *reg; u32 ref; }; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_dma_fence.c b/drivers/gpu/arm/bifrost/mali_kbase_dma_fence.c deleted file mode 100644 index d5f4fae091e8..000000000000 --- a/drivers/gpu/arm/bifrost/mali_kbase_dma_fence.c +++ /dev/null @@ -1,491 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * - * (C) COPYRIGHT 2011-2016, 2020-2021 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - */ - -/* Include mali_kbase_dma_fence.h before checking for CONFIG_MALI_BIFROST_DMA_FENCE as - * it will be set there. - */ -#include "mali_kbase_dma_fence.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static void -kbase_dma_fence_work(struct work_struct *pwork); - -static void -kbase_dma_fence_waiters_add(struct kbase_jd_atom *katom) -{ - struct kbase_context *kctx = katom->kctx; - - list_add_tail(&katom->queue, &kctx->dma_fence.waiting_resource); -} - -static void -kbase_dma_fence_waiters_remove(struct kbase_jd_atom *katom) -{ - list_del(&katom->queue); -} - -static int -kbase_dma_fence_lock_reservations(struct kbase_dma_fence_resv_info *info, - struct ww_acquire_ctx *ctx) -{ -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - struct reservation_object *content_res = NULL; -#else - struct dma_resv *content_res = NULL; -#endif - unsigned int content_res_idx = 0; - unsigned int r; - int err = 0; - - ww_acquire_init(ctx, &reservation_ww_class); - -retry: - for (r = 0; r < info->dma_fence_resv_count; r++) { - if (info->resv_objs[r] == content_res) { - content_res = NULL; - continue; - } - - err = ww_mutex_lock(&info->resv_objs[r]->lock, ctx); - if (err) - goto error; - } - - ww_acquire_done(ctx); - return err; - -error: - content_res_idx = r; - - /* Unlock the locked one ones */ - while (r--) - ww_mutex_unlock(&info->resv_objs[r]->lock); - - if (content_res) - ww_mutex_unlock(&content_res->lock); - - /* If we deadlock try with lock_slow and retry */ - if (err == -EDEADLK) { - content_res = info->resv_objs[content_res_idx]; - ww_mutex_lock_slow(&content_res->lock, ctx); - goto retry; - } - - /* If we are here the function failed */ - ww_acquire_fini(ctx); - return err; -} - -static void -kbase_dma_fence_unlock_reservations(struct kbase_dma_fence_resv_info *info, - struct ww_acquire_ctx *ctx) -{ - unsigned int r; - - for (r = 0; r < info->dma_fence_resv_count; r++) - ww_mutex_unlock(&info->resv_objs[r]->lock); - 
ww_acquire_fini(ctx); -} - - - -/** - * kbase_dma_fence_queue_work() - Queue work to handle @katom - * @katom: Pointer to atom for which to queue work - * - * Queue kbase_dma_fence_work() for @katom to clean up the fence callbacks and - * submit the atom. - */ -static void -kbase_dma_fence_queue_work(struct kbase_jd_atom *katom) -{ - struct kbase_context *kctx = katom->kctx; - bool ret; - - INIT_WORK(&katom->work, kbase_dma_fence_work); - ret = queue_work(kctx->dma_fence.wq, &katom->work); - /* Warn if work was already queued, that should not happen. */ - WARN_ON(!ret); -} - -/** - * kbase_dma_fence_cancel_atom() - Cancels waiting on an atom - * @katom: Katom to cancel - * - * Locking: katom->dma_fence.callbacks list assumes jctx.lock is held. - */ -static void -kbase_dma_fence_cancel_atom(struct kbase_jd_atom *katom) -{ - lockdep_assert_held(&katom->kctx->jctx.lock); - - /* Cancel callbacks and clean up. */ - kbase_fence_free_callbacks(katom); - - /* Mark the atom as handled in case all fences signaled just before - * canceling the callbacks and the worker was queued. - */ - kbase_fence_dep_count_set(katom, -1); - - /* Prevent job_done_nolock from being called twice on an atom when - * there is a race between job completion and cancellation. - */ - - if (katom->status == KBASE_JD_ATOM_STATE_QUEUED) { - /* Wait was cancelled - zap the atom */ - katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; - if (kbase_jd_done_nolock(katom, true)) - kbase_js_sched_all(katom->kctx->kbdev); - } -} - -/** - * kbase_dma_fence_work() - Worker thread called when a fence is signaled - * @pwork: work_struct containing a pointer to a katom - * - * This function will clean and mark all dependencies as satisfied - */ -static void -kbase_dma_fence_work(struct work_struct *pwork) -{ - struct kbase_jd_atom *katom; - struct kbase_jd_context *ctx; - - katom = container_of(pwork, struct kbase_jd_atom, work); - ctx = &katom->kctx->jctx; - - mutex_lock(&ctx->lock); - if (kbase_fence_dep_count_read(katom) != 0) - goto out; - - kbase_fence_dep_count_set(katom, -1); - - /* Remove atom from list of dma-fence waiting atoms. */ - kbase_dma_fence_waiters_remove(katom); - /* Cleanup callbacks. */ - kbase_fence_free_callbacks(katom); - /* - * Queue atom on GPU, unless it has already completed due to a failing - * dependency. Run kbase_jd_done_nolock() on the katom if it is completed. - */ - if (unlikely(katom->status == KBASE_JD_ATOM_STATE_COMPLETED)) - kbase_jd_done_nolock(katom, true); - else - kbase_jd_dep_clear_locked(katom); - -out: - mutex_unlock(&ctx->lock); -} - -static void -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -kbase_dma_fence_cb(struct fence *fence, struct fence_cb *cb) -#else -kbase_dma_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb) -#endif -{ - struct kbase_fence_cb *kcb = container_of(cb, - struct kbase_fence_cb, - fence_cb); - struct kbase_jd_atom *katom = kcb->katom; - - /* If the atom is zapped dep_count will be forced to a negative number - * preventing this callback from ever scheduling work. Which in turn - * would reschedule the atom. 
- */ - - if (kbase_fence_dep_count_dec_and_test(katom)) - kbase_dma_fence_queue_work(katom); -} - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) -static int -kbase_dma_fence_add_reservation_callback(struct kbase_jd_atom *katom, - struct reservation_object *resv, - bool exclusive) -#else -static int -kbase_dma_fence_add_reservation_callback(struct kbase_jd_atom *katom, - struct dma_resv *resv, - bool exclusive) -#endif -{ -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence *excl_fence = NULL; - struct fence **shared_fences = NULL; -#else - struct dma_fence *excl_fence = NULL; - struct dma_fence **shared_fences = NULL; -#endif - unsigned int shared_count = 0; - int err, i; - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - err = reservation_object_get_fences_rcu( -#elif (KERNEL_VERSION(5, 14, 0) > LINUX_VERSION_CODE) - err = dma_resv_get_fences_rcu( -#else - err = dma_resv_get_fences( -#endif - resv, - &excl_fence, - &shared_count, - &shared_fences); - if (err) - return err; - - if (excl_fence) { - err = kbase_fence_add_callback(katom, - excl_fence, - kbase_dma_fence_cb); - - /* Release our reference, taken by reservation_object_get_fences_rcu(), - * to the fence. We have set up our callback (if that was possible), - * and it's the fence's owner is responsible for singling the fence - * before allowing it to disappear. - */ - dma_fence_put(excl_fence); - - if (err) - goto out; - } - - if (exclusive) { - for (i = 0; i < shared_count; i++) { - err = kbase_fence_add_callback(katom, - shared_fences[i], - kbase_dma_fence_cb); - if (err) - goto out; - } - } - - /* Release all our references to the shared fences, taken by - * reservation_object_get_fences_rcu(). We have set up our callback (if - * that was possible), and it's the fence's owner is responsible for - * signaling the fence before allowing it to disappear. - */ -out: - for (i = 0; i < shared_count; i++) - dma_fence_put(shared_fences[i]); - kfree(shared_fences); - - if (err) { - /* - * On error, cancel and clean up all callbacks that was set up - * before the error. 
- */ - kbase_fence_free_callbacks(katom); - } - - return err; -} - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) -void kbase_dma_fence_add_reservation(struct reservation_object *resv, - struct kbase_dma_fence_resv_info *info, - bool exclusive) -#else -void kbase_dma_fence_add_reservation(struct dma_resv *resv, - struct kbase_dma_fence_resv_info *info, - bool exclusive) -#endif -{ - unsigned int i; - - for (i = 0; i < info->dma_fence_resv_count; i++) { - /* Duplicate resource, ignore */ - if (info->resv_objs[i] == resv) - return; - } - - info->resv_objs[info->dma_fence_resv_count] = resv; - if (exclusive) - set_bit(info->dma_fence_resv_count, - info->dma_fence_excl_bitmap); - (info->dma_fence_resv_count)++; -} - -int kbase_dma_fence_wait(struct kbase_jd_atom *katom, - struct kbase_dma_fence_resv_info *info) -{ - int err, i; -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence *fence; -#else - struct dma_fence *fence; -#endif - struct ww_acquire_ctx ww_ctx; - - lockdep_assert_held(&katom->kctx->jctx.lock); - - fence = kbase_fence_out_new(katom); - if (!fence) { - err = -ENOMEM; - dev_err(katom->kctx->kbdev->dev, - "Error %d creating fence.\n", err); - return err; - } - - kbase_fence_dep_count_set(katom, 1); - - err = kbase_dma_fence_lock_reservations(info, &ww_ctx); - if (err) { - dev_err(katom->kctx->kbdev->dev, - "Error %d locking reservations.\n", err); - kbase_fence_dep_count_set(katom, -1); - kbase_fence_out_remove(katom); - return err; - } - - for (i = 0; i < info->dma_fence_resv_count; i++) { -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - struct reservation_object *obj = info->resv_objs[i]; -#else - struct dma_resv *obj = info->resv_objs[i]; -#endif - if (!test_bit(i, info->dma_fence_excl_bitmap)) { -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - err = reservation_object_reserve_shared(obj); -#else - err = dma_resv_reserve_shared(obj, 0); -#endif - if (err) { - dev_err(katom->kctx->kbdev->dev, - "Error %d reserving space for shared fence.\n", err); - goto end; - } - - err = kbase_dma_fence_add_reservation_callback(katom, obj, false); - if (err) { - dev_err(katom->kctx->kbdev->dev, - "Error %d adding reservation to callback.\n", err); - goto end; - } - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - reservation_object_add_shared_fence(obj, fence); -#else - dma_resv_add_shared_fence(obj, fence); -#endif - } else { - err = kbase_dma_fence_add_reservation_callback(katom, obj, true); - if (err) { - dev_err(katom->kctx->kbdev->dev, - "Error %d adding reservation to callback.\n", err); - goto end; - } - -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - reservation_object_add_excl_fence(obj, fence); -#else - dma_resv_add_excl_fence(obj, fence); -#endif - } - } - -end: - kbase_dma_fence_unlock_reservations(info, &ww_ctx); - - if (likely(!err)) { - /* Test if the callbacks are already triggered */ - if (kbase_fence_dep_count_dec_and_test(katom)) { - kbase_fence_dep_count_set(katom, -1); - kbase_fence_free_callbacks(katom); - } else { - /* Add katom to the list of dma-buf fence waiting atoms - * only if it is still waiting. - */ - kbase_dma_fence_waiters_add(katom); - } - } else { - /* There was an error, cancel callbacks, set dep_count to -1 to - * indicate that the atom has been handled (the caller will - * kill it for us), signal the fence, free callbacks and the - * fence. 
- */ - kbase_fence_free_callbacks(katom); - kbase_fence_dep_count_set(katom, -1); - kbase_dma_fence_signal(katom); - } - - return err; -} - -void kbase_dma_fence_cancel_all_atoms(struct kbase_context *kctx) -{ - struct list_head *list = &kctx->dma_fence.waiting_resource; - - while (!list_empty(list)) { - struct kbase_jd_atom *katom; - - katom = list_first_entry(list, struct kbase_jd_atom, queue); - kbase_dma_fence_waiters_remove(katom); - kbase_dma_fence_cancel_atom(katom); - } -} - -void kbase_dma_fence_cancel_callbacks(struct kbase_jd_atom *katom) -{ - /* Cancel callbacks and clean up. */ - if (kbase_fence_free_callbacks(katom)) - kbase_dma_fence_queue_work(katom); -} - -void kbase_dma_fence_signal(struct kbase_jd_atom *katom) -{ - if (!katom->dma_fence.fence) - return; - - /* Signal the atom's fence. */ - dma_fence_signal(katom->dma_fence.fence); - - kbase_fence_out_remove(katom); - - kbase_fence_free_callbacks(katom); -} - -void kbase_dma_fence_term(struct kbase_context *kctx) -{ - destroy_workqueue(kctx->dma_fence.wq); - kctx->dma_fence.wq = NULL; -} - -int kbase_dma_fence_init(struct kbase_context *kctx) -{ - INIT_LIST_HEAD(&kctx->dma_fence.waiting_resource); - - kctx->dma_fence.wq = alloc_workqueue("mali-fence-%d", - WQ_UNBOUND, 1, kctx->pid); - if (!kctx->dma_fence.wq) - return -ENOMEM; - - return 0; -} diff --git a/drivers/gpu/arm/bifrost/mali_kbase_dma_fence.h b/drivers/gpu/arm/bifrost/mali_kbase_dma_fence.h deleted file mode 100644 index f0c8d069b02c..000000000000 --- a/drivers/gpu/arm/bifrost/mali_kbase_dma_fence.h +++ /dev/null @@ -1,150 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * - * (C) COPYRIGHT 2010-2016, 2020-2022 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - */ - -#ifndef _KBASE_DMA_FENCE_H_ -#define _KBASE_DMA_FENCE_H_ - -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - -#include -#include -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) -#include -#else -#include -#endif -#include - -/* Forward declaration from mali_kbase_defs.h */ -struct kbase_jd_atom; -struct kbase_context; - -/** - * struct kbase_dma_fence_resv_info - Structure with list of reservation objects - * @resv_objs: Array of reservation objects to attach the - * new fence to. - * @dma_fence_resv_count: Number of reservation objects in the array. - * @dma_fence_excl_bitmap: Specifies which resv_obj are exclusive. - * - * This is used by some functions to pass around a collection of data about - * reservation objects. 
- */ -struct kbase_dma_fence_resv_info { -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - struct reservation_object **resv_objs; -#else - struct dma_resv **resv_objs; -#endif - unsigned int dma_fence_resv_count; - unsigned long *dma_fence_excl_bitmap; -}; - -/** - * kbase_dma_fence_add_reservation() - Adds a resv to the array of resv_objs - * @resv: Reservation object to add to the array. - * @info: Pointer to struct with current reservation info - * @exclusive: Boolean indicating if exclusive access is needed - * - * The function adds a new reservation_object to an existing array of - * reservation_objects. At the same time keeps track of which objects require - * exclusive access in dma_fence_excl_bitmap. - */ -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) -void kbase_dma_fence_add_reservation(struct reservation_object *resv, - struct kbase_dma_fence_resv_info *info, - bool exclusive); -#else -void kbase_dma_fence_add_reservation(struct dma_resv *resv, - struct kbase_dma_fence_resv_info *info, - bool exclusive); -#endif - -/** - * kbase_dma_fence_wait() - Creates a new fence and attaches it to the resv_objs - * @katom: Katom with the external dependency. - * @info: Pointer to struct with current reservation info - * - * Return: An error code or 0 if succeeds - */ -int kbase_dma_fence_wait(struct kbase_jd_atom *katom, - struct kbase_dma_fence_resv_info *info); - -/** - * kbase_dma_fence_cancel_all_atoms() - Cancel all dma-fences blocked atoms on kctx - * @kctx: Pointer to kbase context - * - * This function will cancel and clean up all katoms on @kctx that is waiting - * on dma-buf fences. - * - * Locking: jctx.lock needs to be held when calling this function. - */ -void kbase_dma_fence_cancel_all_atoms(struct kbase_context *kctx); - -/** - * kbase_dma_fence_cancel_callbacks() - Cancel only callbacks on katom - * @katom: Pointer to katom whose callbacks are to be canceled - * - * This function cancels all dma-buf fence callbacks on @katom, but does not - * cancel the katom itself. - * - * The caller is responsible for ensuring that kbase_jd_done_nolock is called on - * @katom. - * - * Locking: jctx.lock must be held when calling this function. - */ -void kbase_dma_fence_cancel_callbacks(struct kbase_jd_atom *katom); - -/** - * kbase_dma_fence_signal() - Signal katom's fence and clean up after wait - * @katom: Pointer to katom to signal and clean up - * - * This function will signal the @katom's fence, if it has one, and clean up - * the callback data from the katom's wait on earlier fences. - * - * Locking: jctx.lock must be held while calling this function. - */ -void kbase_dma_fence_signal(struct kbase_jd_atom *katom); - -/** - * kbase_dma_fence_term() - Terminate Mali dma-fence context - * @kctx: kbase context to terminate - */ -void kbase_dma_fence_term(struct kbase_context *kctx); - -/** - * kbase_dma_fence_init() - Initialize Mali dma-fence context - * @kctx: kbase context to initialize - * - * Return: 0 on success, error code otherwise. - */ -int kbase_dma_fence_init(struct kbase_context *kctx); - -#else /* CONFIG_MALI_BIFROST_DMA_FENCE */ -/* Dummy functions for when dma-buf fence isn't enabled. 
*/ - -static inline int kbase_dma_fence_init(struct kbase_context *kctx) -{ - return 0; -} - -static inline void kbase_dma_fence_term(struct kbase_context *kctx) {} -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ -#endif diff --git a/drivers/gpu/arm/bifrost/mali_kbase_fence.c b/drivers/gpu/arm/bifrost/mali_kbase_fence.c index 01557cd3ba47..b16b27659e61 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_fence.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_fence.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2011-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2011-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -59,95 +59,3 @@ kbase_fence_out_new(struct kbase_jd_atom *katom) return fence; } -bool -kbase_fence_free_callbacks(struct kbase_jd_atom *katom) -{ - struct kbase_fence_cb *cb, *tmp; - bool res = false; - - lockdep_assert_held(&katom->kctx->jctx.lock); - - /* Clean up and free callbacks. */ - list_for_each_entry_safe(cb, tmp, &katom->dma_fence.callbacks, node) { - bool ret; - - /* Cancel callbacks that hasn't been called yet. */ - ret = dma_fence_remove_callback(cb->fence, &cb->fence_cb); - if (ret) { - int ret; - - /* Fence had not signaled, clean up after - * canceling. - */ - ret = atomic_dec_return(&katom->dma_fence.dep_count); - - if (unlikely(ret == 0)) - res = true; - } - - /* - * Release the reference taken in - * kbase_fence_add_callback(). - */ - dma_fence_put(cb->fence); - list_del(&cb->node); - kfree(cb); - } - - return res; -} - -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -int -kbase_fence_add_callback(struct kbase_jd_atom *katom, - struct fence *fence, - fence_func_t callback) -#else -int -kbase_fence_add_callback(struct kbase_jd_atom *katom, - struct dma_fence *fence, - dma_fence_func_t callback) -#endif -{ - int err = 0; - struct kbase_fence_cb *kbase_fence_cb; - - if (!fence) - return -EINVAL; - - kbase_fence_cb = kmalloc(sizeof(*kbase_fence_cb), GFP_KERNEL); - if (!kbase_fence_cb) - return -ENOMEM; - - kbase_fence_cb->fence = fence; - kbase_fence_cb->katom = katom; - INIT_LIST_HEAD(&kbase_fence_cb->node); - atomic_inc(&katom->dma_fence.dep_count); - - err = dma_fence_add_callback(fence, &kbase_fence_cb->fence_cb, - callback); - if (err == -ENOENT) { - /* Fence signaled, get the completion result */ - err = dma_fence_get_status(fence); - - /* remap success completion to err code */ - if (err == 1) - err = 0; - - kfree(kbase_fence_cb); - atomic_dec(&katom->dma_fence.dep_count); - } else if (err) { - kfree(kbase_fence_cb); - atomic_dec(&katom->dma_fence.dep_count); - } else { - /* - * Get reference to fence that will be kept until callback gets - * cleaned up in kbase_fence_free_callbacks(). 
- */ - dma_fence_get(fence); - /* Add callback to katom's list of callbacks */ - list_add(&kbase_fence_cb->node, &katom->dma_fence.callbacks); - } - - return err; -} diff --git a/drivers/gpu/arm/bifrost/mali_kbase_fence.h b/drivers/gpu/arm/bifrost/mali_kbase_fence.h index 4f952ad4d509..dfe33e52b4ce 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_fence.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_fence.h @@ -23,12 +23,11 @@ #define _KBASE_FENCE_H_ /* - * mali_kbase_fence.[hc] has common fence code used by both - * - CONFIG_MALI_BIFROST_DMA_FENCE - implicit DMA fences - * - CONFIG_SYNC_FILE - explicit fences beginning with 4.9 kernel + * mali_kbase_fence.[hc] has fence code used only by + * - CONFIG_SYNC_FILE - explicit fences */ -#if defined(CONFIG_MALI_BIFROST_DMA_FENCE) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) #include #include "mali_kbase_fence_defs.h" @@ -40,25 +39,6 @@ extern const struct fence_ops kbase_fence_ops; extern const struct dma_fence_ops kbase_fence_ops; #endif -/** - * struct kbase_fence_cb - Mali dma-fence callback data struct - * @fence_cb: Callback function - * @katom: Pointer to katom that is waiting on this callback - * @fence: Pointer to the fence object on which this callback is waiting - * @node: List head for linking this callback to the katom - */ -struct kbase_fence_cb { -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence_cb fence_cb; - struct fence *fence; -#else - struct dma_fence_cb fence_cb; - struct dma_fence *fence; -#endif - struct kbase_jd_atom *katom; - struct list_head node; -}; - /** * kbase_fence_out_new() - Creates a new output fence and puts it on the atom * @katom: Atom to create an output fence for @@ -71,7 +51,7 @@ struct fence *kbase_fence_out_new(struct kbase_jd_atom *katom); struct dma_fence *kbase_fence_out_new(struct kbase_jd_atom *katom); #endif -#if defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) /** * kbase_fence_fence_in_set() - Assign input fence to atom * @katom: Atom to assign input fence to @@ -102,7 +82,7 @@ static inline void kbase_fence_out_remove(struct kbase_jd_atom *katom) } } -#if defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) /** * kbase_fence_in_remove() - Removes the input fence from atom * @katom: Atom to remove input fence for @@ -153,101 +133,7 @@ static inline int kbase_fence_out_signal(struct kbase_jd_atom *katom, return dma_fence_signal(katom->dma_fence.fence); } -/** - * kbase_fence_add_callback() - Add callback on @fence to block @katom - * @katom: Pointer to katom that will be blocked by @fence - * @fence: Pointer to fence on which to set up the callback - * @callback: Pointer to function to be called when fence is signaled - * - * Caller needs to hold a reference to @fence when calling this function, and - * the caller is responsible for releasing that reference. An additional - * reference to @fence will be taken when the callback was successfully set up - * and @fence needs to be kept valid until the callback has been called and - * cleanup have been done. - * - * Return: 0 on success: fence was either already signaled, or callback was - * set up. Negative error code is returned on error. 
- */ -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) -int kbase_fence_add_callback(struct kbase_jd_atom *katom, - struct fence *fence, - fence_func_t callback); -#else -int kbase_fence_add_callback(struct kbase_jd_atom *katom, - struct dma_fence *fence, - dma_fence_func_t callback); -#endif - -/** - * kbase_fence_dep_count_set() - Set dep_count value on atom to specified value - * @katom: Atom to set dep_count for - * @val: value to set dep_count to - * - * The dep_count is available to the users of this module so that they can - * synchronize completion of the wait with cancellation and adding of more - * callbacks. For instance, a user could do the following: - * - * dep_count set to 1 - * callback #1 added, dep_count is increased to 2 - * callback #1 happens, dep_count decremented to 1 - * since dep_count > 0, no completion is done - * callback #2 is added, dep_count is increased to 2 - * dep_count decremented to 1 - * callback #2 happens, dep_count decremented to 0 - * since dep_count now is zero, completion executes - * - * The dep_count can also be used to make sure that the completion only - * executes once. This is typically done by setting dep_count to -1 for the - * thread that takes on this responsibility. - */ -static inline void -kbase_fence_dep_count_set(struct kbase_jd_atom *katom, int val) -{ - atomic_set(&katom->dma_fence.dep_count, val); -} - -/** - * kbase_fence_dep_count_dec_and_test() - Decrements dep_count - * @katom: Atom to decrement dep_count for - * - * See @kbase_fence_dep_count_set for general description about dep_count - * - * Return: true if value was decremented to zero, otherwise false - */ -static inline bool -kbase_fence_dep_count_dec_and_test(struct kbase_jd_atom *katom) -{ - return atomic_dec_and_test(&katom->dma_fence.dep_count); -} - -/** - * kbase_fence_dep_count_read() - Returns the current dep_count value - * @katom: Pointer to katom - * - * See @kbase_fence_dep_count_set for general description about dep_count - * - * Return: The current dep_count value - */ -static inline int kbase_fence_dep_count_read(struct kbase_jd_atom *katom) -{ - return atomic_read(&katom->dma_fence.dep_count); -} - -/** - * kbase_fence_free_callbacks() - Free dma-fence callbacks on a katom - * @katom: Pointer to katom - * - * This function will free all fence callbacks on the katom's list of - * callbacks. Callbacks that have not yet been called, because their fence - * hasn't yet signaled, will first be removed from the fence. - * - * Locking: katom->dma_fence.callbacks list assumes jctx.lock is held. - * - * Return: true if dep_count reached 0, otherwise false. - */ -bool kbase_fence_free_callbacks(struct kbase_jd_atom *katom); - -#if defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) /** * kbase_fence_in_get() - Retrieve input fence for atom. 
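The dep_count scheme documented in the removed comment is an ordinary atomic countdown: one base reference for the submitter, one per registered fence callback, and completion runs only when the count hits zero. A minimal generic sketch of that pattern, outside the driver:

#include <linux/atomic.h>
#include <linux/types.h>

struct demo_waiter {
	atomic_t dep_count;	/* -1 means "not waiting on any fences" */
};

static void demo_waiter_arm(struct demo_waiter *w)
{
	atomic_set(&w->dep_count, 1);	/* base reference held while adding callbacks */
}

static void demo_waiter_callback_added(struct demo_waiter *w)
{
	atomic_inc(&w->dep_count);	/* one reference per outstanding callback */
}

/* Called by each fired callback, and once by the submitter when it has
 * finished adding callbacks. Whoever drops the count to zero completes.
 */
static bool demo_waiter_put(struct demo_waiter *w)
{
	return atomic_dec_and_test(&w->dep_count);
}

The removed kbase code also reserved -1 to mean "no fence wait in flight", which is why the job dispatcher checked kbase_fence_dep_count_read() against -1 before treating an atom as blocked.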
* @katom: Atom to get input fence from @@ -288,6 +174,6 @@ bool kbase_fence_free_callbacks(struct kbase_jd_atom *katom); #define kbase_fence_put(fence) dma_fence_put(fence) -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE || defined(CONFIG_SYNC_FILE */ +#endif /* IS_ENABLED(CONFIG_SYNC_FILE) */ #endif /* _KBASE_FENCE_H_ */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_gpuprops.c b/drivers/gpu/arm/bifrost/mali_kbase_gpuprops.c index 0bea655178d5..0282aaf8eb3a 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_gpuprops.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_gpuprops.c @@ -677,9 +677,11 @@ int kbase_gpuprops_update_l2_features(struct kbase_device *kbdev) int idx; const bool asn_he = regdump.l2_config & L2_CONFIG_ASN_HASH_ENABLE_MASK; +#if !IS_ENABLED(CONFIG_MALI_BIFROST_NO_MALI) if (!asn_he && kbdev->l2_hash_values_override) dev_err(kbdev->dev, "Failed to use requested ASN_HASH, fallback to default"); +#endif for (idx = 0; idx < ASN_HASH_COUNT; idx++) dev_info(kbdev->dev, "%s ASN_HASH[%d] is [0x%08x]\n", @@ -705,10 +707,6 @@ static struct { #define PROP(name, member) \ {KBASE_GPUPROP_ ## name, offsetof(struct base_gpu_props, member), \ sizeof(((struct base_gpu_props *)0)->member)} -#define BACKWARDS_COMPAT_PROP(name, type) \ - { \ - KBASE_GPUPROP_##name, SIZE_MAX, sizeof(type) \ - } PROP(PRODUCT_ID, core_props.product_id), PROP(VERSION_STATUS, core_props.version_status), PROP(MINOR_REVISION, core_props.minor_revision), @@ -722,6 +720,10 @@ static struct { PROP(GPU_AVAILABLE_MEMORY_SIZE, core_props.gpu_available_memory_size), #if MALI_USE_CSF +#define BACKWARDS_COMPAT_PROP(name, type) \ + { \ + KBASE_GPUPROP_##name, SIZE_MAX, sizeof(type) \ + } BACKWARDS_COMPAT_PROP(NUM_EXEC_ENGINES, u8), #else PROP(NUM_EXEC_ENGINES, core_props.num_exec_engines), @@ -820,7 +822,7 @@ int kbase_gpuprops_populate_user_buffer(struct kbase_device *kbdev) } kprops->prop_buffer_size = size; - kprops->prop_buffer = kmalloc(size, GFP_KERNEL); + kprops->prop_buffer = kzalloc(size, GFP_KERNEL); if (!kprops->prop_buffer) { kprops->prop_buffer_size = 0; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_hw.c b/drivers/gpu/arm/bifrost/mali_kbase_hw.c index 1de1e29fcb75..b6a8a2e5608f 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_hw.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_hw.c @@ -85,6 +85,10 @@ void kbase_hw_set_features_mask(struct kbase_device *kbdev) case GPU_ID2_PRODUCT_LTUX: features = base_hw_features_tTUx; break; + case GPU_ID2_PRODUCT_TTIX: + case GPU_ID2_PRODUCT_LTIX: + features = base_hw_features_tTIx; + break; default: features = base_hw_features_generic; break; @@ -233,12 +237,22 @@ static const enum base_hw_issue *kbase_hw_get_issues_for_new_id( { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTUx_r0p0 }, { GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tTUx_r1p0 }, { GPU_ID2_VERSION_MAKE(1, 1, 0), base_hw_issues_tTUx_r1p1 }, + { GPU_ID2_VERSION_MAKE(1, 2, 0), base_hw_issues_tTUx_r1p2 }, { U32_MAX, NULL } } }, { GPU_ID2_PRODUCT_LTUX, { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTUx_r0p0 }, { GPU_ID2_VERSION_MAKE(1, 0, 0), base_hw_issues_tTUx_r1p0 }, { GPU_ID2_VERSION_MAKE(1, 1, 0), base_hw_issues_tTUx_r1p1 }, + { GPU_ID2_VERSION_MAKE(1, 2, 0), base_hw_issues_tTUx_r1p2 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_TTIX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTIx_r0p0 }, + { U32_MAX, NULL } } }, + + { GPU_ID2_PRODUCT_LTIX, + { { GPU_ID2_VERSION_MAKE(0, 0, 0), base_hw_issues_tTIx_r0p0 }, { U32_MAX, NULL } } }, }; @@ -396,6 +410,10 @@ int kbase_hw_set_issues_mask(struct kbase_device *kbdev) 
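Several of the preprocessor checks above move from #if defined(CONFIG_SYNC_FILE) to #if IS_ENABLED(CONFIG_SYNC_FILE). The behavioural difference only matters for tristate options, but IS_ENABLED() is the idiomatic kconfig test either way; a small illustration:

#include <linux/kconfig.h>

/* For a tristate CONFIG_FOO, "#ifdef CONFIG_FOO" is only true for =y, because
 * =m defines CONFIG_FOO_MODULE instead. IS_ENABLED() covers both cases.
 */
#if IS_ENABLED(CONFIG_SYNC_FILE)
#define DEMO_HAVE_SYNC_FILE 1
#else
#define DEMO_HAVE_SYNC_FILE 0
#endif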
case GPU_ID2_PRODUCT_LTUX: issues = base_hw_issues_model_tTUx; break; + case GPU_ID2_PRODUCT_TTIX: + case GPU_ID2_PRODUCT_LTIX: + issues = base_hw_issues_model_tTIx; + break; default: dev_err(kbdev->dev, diff --git a/drivers/gpu/arm/bifrost/mali_kbase_jd.c b/drivers/gpu/arm/bifrost/mali_kbase_jd.c index 5a96f924bfbd..f5faa92525c5 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_jd.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_jd.c @@ -35,7 +35,6 @@ #include #include -#include "mali_kbase_dma_fence.h" #include #include @@ -158,15 +157,6 @@ void kbase_jd_dep_clear_locked(struct kbase_jd_atom *katom) void kbase_jd_free_external_resources(struct kbase_jd_atom *katom) { -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - /* Flush dma-fence workqueue to ensure that any callbacks that may have - * been queued are done before continuing. - * Any successfully completed atom would have had all it's callbacks - * completed before the atom was run, so only flush for failed atoms. - */ - if (katom->event_code != BASE_JD_EVENT_DONE) - flush_workqueue(katom->kctx->dma_fence.wq); -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ } static void kbase_jd_post_external_resources(struct kbase_jd_atom *katom) @@ -174,10 +164,6 @@ static void kbase_jd_post_external_resources(struct kbase_jd_atom *katom) KBASE_DEBUG_ASSERT(katom); KBASE_DEBUG_ASSERT(katom->core_req & BASE_JD_REQ_EXTERNAL_RESOURCES); -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - kbase_dma_fence_signal(katom); -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ - kbase_gpu_vm_lock(katom->kctx); /* only roll back if extres is non-NULL */ if (katom->extres) { @@ -185,13 +171,7 @@ static void kbase_jd_post_external_resources(struct kbase_jd_atom *katom) res_no = katom->nr_extres; while (res_no-- > 0) { - struct kbase_mem_phy_alloc *alloc = katom->extres[res_no].alloc; - struct kbase_va_region *reg; - - reg = kbase_region_tracker_find_region_base_address( - katom->kctx, - katom->extres[res_no].gpu_address); - kbase_unmap_external_resource(katom->kctx, reg, alloc); + kbase_unmap_external_resource(katom->kctx, katom->extres[res_no]); } kfree(katom->extres); katom->extres = NULL; @@ -207,26 +187,8 @@ static void kbase_jd_post_external_resources(struct kbase_jd_atom *katom) static int kbase_jd_pre_external_resources(struct kbase_jd_atom *katom, const struct base_jd_atom *user_atom) { - int err_ret_val = -EINVAL; + int err = -EINVAL; u32 res_no; -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - struct kbase_dma_fence_resv_info info = { - .resv_objs = NULL, - .dma_fence_resv_count = 0, - .dma_fence_excl_bitmap = NULL - }; -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) - /* - * When both dma-buf fence and Android native sync is enabled, we - * disable dma-buf fence for contexts that are using Android native - * fences. - */ - const bool implicit_sync = !kbase_ctx_flag(katom->kctx, - KCTX_NO_IMPLICIT_SYNC); -#else /* CONFIG_SYNC || CONFIG_SYNC_FILE*/ - const bool implicit_sync = true; -#endif /* CONFIG_SYNC || CONFIG_SYNC_FILE */ -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ struct base_external_resource *input_extres; KBASE_DEBUG_ASSERT(katom); @@ -240,68 +202,32 @@ static int kbase_jd_pre_external_resources(struct kbase_jd_atom *katom, const st if (!katom->extres) return -ENOMEM; - /* copy user buffer to the end of our real buffer. 
- * Make sure the struct sizes haven't changed in a way - * we don't support - */ - BUILD_BUG_ON(sizeof(*input_extres) > sizeof(*katom->extres)); - input_extres = (struct base_external_resource *) - (((unsigned char *)katom->extres) + - (sizeof(*katom->extres) - sizeof(*input_extres)) * - katom->nr_extres); + input_extres = kmalloc_array(katom->nr_extres, sizeof(*input_extres), GFP_KERNEL); + if (!input_extres) { + err = -ENOMEM; + goto failed_input_alloc; + } if (copy_from_user(input_extres, get_compat_pointer(katom->kctx, user_atom->extres_list), sizeof(*input_extres) * katom->nr_extres) != 0) { - err_ret_val = -EINVAL; - goto early_err_out; + err = -EINVAL; + goto failed_input_copy; } -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - if (implicit_sync) { - info.resv_objs = - kmalloc_array(katom->nr_extres, -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - sizeof(struct reservation_object *), -#else - sizeof(struct dma_resv *), -#endif - GFP_KERNEL); - if (!info.resv_objs) { - err_ret_val = -ENOMEM; - goto early_err_out; - } - - info.dma_fence_excl_bitmap = - kcalloc(BITS_TO_LONGS(katom->nr_extres), - sizeof(unsigned long), GFP_KERNEL); - if (!info.dma_fence_excl_bitmap) { - err_ret_val = -ENOMEM; - goto early_err_out; - } - } -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ - /* Take the processes mmap lock */ down_read(kbase_mem_get_process_mmap_lock()); /* need to keep the GPU VM locked while we set up UMM buffers */ kbase_gpu_vm_lock(katom->kctx); for (res_no = 0; res_no < katom->nr_extres; res_no++) { - struct base_external_resource *res = &input_extres[res_no]; + struct base_external_resource *user_res = &input_extres[res_no]; struct kbase_va_region *reg; - struct kbase_mem_phy_alloc *alloc; -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - bool exclusive; - exclusive = (res->ext_resource & BASE_EXT_RES_ACCESS_EXCLUSIVE) - ? true : false; -#endif reg = kbase_region_tracker_find_region_enclosing_address( - katom->kctx, - res->ext_resource & ~BASE_EXT_RES_ACCESS_EXCLUSIVE); + katom->kctx, user_res->ext_resource & ~BASE_EXT_RES_ACCESS_EXCLUSIVE); /* did we find a matching region object? */ - if (kbase_is_region_invalid_or_free(reg)) { + if (unlikely(kbase_is_region_invalid_or_free(reg))) { /* roll back */ goto failed_loop; } @@ -311,36 +237,11 @@ static int kbase_jd_pre_external_resources(struct kbase_jd_atom *katom, const st katom->atom_flags |= KBASE_KATOM_FLAG_PROTECTED; } - alloc = kbase_map_external_resource(katom->kctx, reg, - current->mm); - if (!alloc) { - err_ret_val = -EINVAL; + err = kbase_map_external_resource(katom->kctx, reg, current->mm); + if (err) goto failed_loop; - } -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - if (implicit_sync && - reg->gpu_alloc->type == KBASE_MEM_TYPE_IMPORTED_UMM) { -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) - struct reservation_object *resv; -#else - struct dma_resv *resv; -#endif - resv = reg->gpu_alloc->imported.umm.dma_buf->resv; - if (resv) - kbase_dma_fence_add_reservation(resv, &info, - exclusive); - } -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ - - /* finish with updating out array with the data we found */ - /* NOTE: It is important that this is the last thing we do (or - * at least not before the first write) as we overwrite elements - * as we loop and could be overwriting ourself, so no writes - * until the last read for an element. 
- */ - katom->extres[res_no].gpu_address = reg->start_pfn << PAGE_SHIFT; /* save the start_pfn (as an address, not pfn) to use fast lookup later */ - katom->extres[res_no].alloc = alloc; + katom->extres[res_no] = reg; } /* successfully parsed the extres array */ /* drop the vm lock now */ @@ -349,57 +250,33 @@ static int kbase_jd_pre_external_resources(struct kbase_jd_atom *katom, const st /* Release the processes mmap lock */ up_read(kbase_mem_get_process_mmap_lock()); -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - if (implicit_sync) { - if (info.dma_fence_resv_count) { - int ret; - - ret = kbase_dma_fence_wait(katom, &info); - if (ret < 0) - goto failed_dma_fence_setup; - } - - kfree(info.resv_objs); - kfree(info.dma_fence_excl_bitmap); - } -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ + /* Free the buffer holding data from userspace */ + kfree(input_extres); /* all done OK */ return 0; /* error handling section */ - -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE -failed_dma_fence_setup: - /* Lock the processes mmap lock */ - down_read(kbase_mem_get_process_mmap_lock()); - - /* lock before we unmap */ - kbase_gpu_vm_lock(katom->kctx); -#endif - - failed_loop: - /* undo the loop work */ +failed_loop: + /* undo the loop work. We are guaranteed to have access to the VA region + * as we hold a reference to it until it's unmapped + */ while (res_no-- > 0) { - struct kbase_mem_phy_alloc *alloc = katom->extres[res_no].alloc; + struct kbase_va_region *reg = katom->extres[res_no]; - kbase_unmap_external_resource(katom->kctx, NULL, alloc); + kbase_unmap_external_resource(katom->kctx, reg); } kbase_gpu_vm_unlock(katom->kctx); /* Release the processes mmap lock */ up_read(kbase_mem_get_process_mmap_lock()); - early_err_out: +failed_input_copy: + kfree(input_extres); +failed_input_alloc: kfree(katom->extres); katom->extres = NULL; -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - if (implicit_sync) { - kfree(info.resv_objs); - kfree(info.dma_fence_excl_bitmap); - } -#endif - return err_ret_val; + return err; } static inline void jd_resolve_dep(struct list_head *out_list, @@ -422,10 +299,6 @@ static inline void jd_resolve_dep(struct list_head *out_list, if (katom->event_code != BASE_JD_EVENT_DONE && (dep_type != BASE_JD_DEP_TYPE_ORDER)) { -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - kbase_dma_fence_cancel_callbacks(dep_atom); -#endif - dep_atom->event_code = katom->event_code; KBASE_DEBUG_ASSERT(dep_atom->status != KBASE_JD_ATOM_STATE_UNUSED); @@ -439,35 +312,8 @@ static inline void jd_resolve_dep(struct list_head *out_list, (IS_GPU_ATOM(dep_atom) && !ctx_is_dying && !dep_atom->will_fail_event_code && !other_dep_atom->will_fail_event_code))) { - bool dep_satisfied = true; -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - int dep_count; - - dep_count = kbase_fence_dep_count_read(dep_atom); - if (likely(dep_count == -1)) { - dep_satisfied = true; - } else { - /* - * There are either still active callbacks, or - * all fences for this @dep_atom has signaled, - * but the worker that will queue the atom has - * not yet run. - * - * Wait for the fences to signal and the fence - * worker to run and handle @dep_atom. If - * @dep_atom was completed due to error on - * @katom, then the fence worker will pick up - * the complete status and error code set on - * @dep_atom above. 
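The reworked kbase_jd_pre_external_resources() above now copies the userspace list into its own allocation and then maps each resource, rolling back exactly the resources already mapped if anything fails. A condensed sketch of that allocate / copy / map / unwind shape, with made-up names and stub map calls standing in for the driver's:

#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>

static int demo_map_one(u64 handle) { return 0; }	/* stand-in for the real mapping */
static void demo_unmap_one(u64 handle) { }

static int demo_map_user_resources(u64 __user *user_list, u32 count)
{
	u64 *handles;
	u32 i;
	int err;

	handles = kmalloc_array(count, sizeof(*handles), GFP_KERNEL);
	if (!handles)
		return -ENOMEM;

	if (copy_from_user(handles, user_list, sizeof(*handles) * count)) {
		err = -EINVAL;
		goto free_input;
	}

	for (i = 0; i < count; i++) {
		err = demo_map_one(handles[i]);
		if (err)
			goto unwind;
	}

	kfree(handles);
	return 0;

unwind:
	while (i--)			/* undo only what was actually mapped */
		demo_unmap_one(handles[i]);
free_input:
	kfree(handles);
	return err;
}

Keeping the user-supplied array separate from katom->extres is also what lets extres hold kbase_va_region pointers directly, which the new failed_loop path relies on.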
- */ - dep_satisfied = false; - } -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ - - if (dep_satisfied) { - dep_atom->in_jd_list = true; - list_add_tail(&dep_atom->jd_item, out_list); - } + dep_atom->in_jd_list = true; + list_add_tail(&dep_atom->jd_item, out_list); } } } @@ -526,33 +372,8 @@ static void jd_try_submitting_deps(struct list_head *out_list, dep_atom->dep[0].atom); bool dep1_valid = is_dep_valid( dep_atom->dep[1].atom); - bool dep_satisfied = true; -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - int dep_count; - dep_count = kbase_fence_dep_count_read( - dep_atom); - if (likely(dep_count == -1)) { - dep_satisfied = true; - } else { - /* - * There are either still active callbacks, or - * all fences for this @dep_atom has signaled, - * but the worker that will queue the atom has - * not yet run. - * - * Wait for the fences to signal and the fence - * worker to run and handle @dep_atom. If - * @dep_atom was completed due to error on - * @katom, then the fence worker will pick up - * the complete status and error code set on - * @dep_atom above. - */ - dep_satisfied = false; - } -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ - - if (dep0_valid && dep1_valid && dep_satisfied) { + if (dep0_valid && dep1_valid) { dep_atom->in_jd_list = true; list_add(&dep_atom->jd_item, out_list); } @@ -963,9 +784,6 @@ static bool jd_submit_atom(struct kbase_context *const kctx, INIT_LIST_HEAD(&katom->queue); INIT_LIST_HEAD(&katom->jd_item); -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - kbase_fence_dep_count_set(katom, -1); -#endif /* Don't do anything if there is a mess up with dependencies. * This is done in a separate cycle to check both the dependencies at ones, otherwise @@ -1185,12 +1003,6 @@ static bool jd_submit_atom(struct kbase_context *const kctx, if (queued && !IS_GPU_ATOM(katom)) return false; -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - if (kbase_fence_dep_count_read(katom) != -1) - return false; - -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ - if (katom->core_req & BASE_JD_REQ_SOFT_JOB) { if (kbase_process_soft_job(katom) == 0) { kbase_finish_soft_job(katom); @@ -1273,7 +1085,7 @@ int kbase_jd_submit(struct kbase_context *kctx, if (unlikely(jd_atom_is_v2)) { if (copy_from_user(&user_atom.jc, user_addr, sizeof(struct base_jd_atom_v2)) != 0) { dev_dbg(kbdev->dev, - "Invalid atom address %p passed to job_submit\n", + "Invalid atom address %pK passed to job_submit\n", user_addr); err = -EFAULT; break; @@ -1284,7 +1096,7 @@ int kbase_jd_submit(struct kbase_context *kctx, } else { if (copy_from_user(&user_atom, user_addr, stride) != 0) { dev_dbg(kbdev->dev, - "Invalid atom address %p passed to job_submit\n", + "Invalid atom address %pK passed to job_submit\n", user_addr); err = -EFAULT; break; @@ -1599,6 +1411,7 @@ static void jd_cancel_worker(struct work_struct *data) bool need_to_try_schedule_context; bool attr_state_changed; struct kbase_device *kbdev; + CSTD_UNUSED(need_to_try_schedule_context); /* Soft jobs should never reach this function */ KBASE_DEBUG_ASSERT((katom->core_req & BASE_JD_REQ_SOFT_JOB) == 0); @@ -1746,20 +1559,8 @@ void kbase_jd_zap_context(struct kbase_context *kctx) kbase_cancel_soft_job(katom); } - -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - kbase_dma_fence_cancel_all_atoms(kctx); -#endif - mutex_unlock(&kctx->jctx.lock); -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - /* Flush dma-fence workqueue to ensure that any callbacks that may have - * been queued are done before continuing. 
- */ - flush_workqueue(kctx->dma_fence.wq); -#endif - #if IS_ENABLED(CONFIG_DEBUG_FS) kbase_debug_job_fault_kctx_unblock(kctx); #endif @@ -1796,11 +1597,10 @@ int kbase_jd_init(struct kbase_context *kctx) kctx->jctx.atoms[i].event_code = BASE_JD_EVENT_JOB_INVALID; kctx->jctx.atoms[i].status = KBASE_JD_ATOM_STATE_UNUSED; -#if defined(CONFIG_MALI_BIFROST_DMA_FENCE) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) kctx->jctx.atoms[i].dma_fence.context = dma_fence_context_alloc(1); atomic_set(&kctx->jctx.atoms[i].dma_fence.seqno, 0); - INIT_LIST_HEAD(&kctx->jctx.atoms[i].dma_fence.callbacks); #endif } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_jd_debugfs.c b/drivers/gpu/arm/bifrost/mali_kbase_jd_debugfs.c index 87c92330dfe2..6196c0985c7e 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_jd_debugfs.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_jd_debugfs.c @@ -24,8 +24,7 @@ #include #include #include -#include -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) #include #endif #include @@ -38,7 +37,7 @@ struct kbase_jd_debugfs_depinfo { static void kbase_jd_debugfs_fence_info(struct kbase_jd_atom *atom, struct seq_file *sfile) { -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) struct kbase_sync_fence_info info; int res; @@ -58,51 +57,7 @@ static void kbase_jd_debugfs_fence_info(struct kbase_jd_atom *atom, default: break; } -#endif /* CONFIG_SYNC || CONFIG_SYNC_FILE */ - -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - if (atom->core_req & BASE_JD_REQ_EXTERNAL_RESOURCES) { - struct kbase_fence_cb *cb; - - if (atom->dma_fence.fence) { -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence *fence = atom->dma_fence.fence; -#else - struct dma_fence *fence = atom->dma_fence.fence; -#endif - - seq_printf(sfile, -#if (KERNEL_VERSION(5, 1, 0) > LINUX_VERSION_CODE) - "Sd(%llu#%u: %s) ", -#else - "Sd(%llu#%llu: %s) ", -#endif - fence->context, fence->seqno, - dma_fence_is_signaled(fence) ? "signaled" : - "active"); - } - - list_for_each_entry(cb, &atom->dma_fence.callbacks, - node) { -#if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) - struct fence *fence = cb->fence; -#else - struct dma_fence *fence = cb->fence; -#endif - - seq_printf(sfile, -#if (KERNEL_VERSION(5, 1, 0) > LINUX_VERSION_CODE) - "Wd(%llu#%u: %s) ", -#else - "Wd(%llu#%llu: %s) ", -#endif - fence->context, fence->seqno, - dma_fence_is_signaled(fence) ? 
"signaled" : - "active"); - } - } -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ - +#endif /* CONFIG_SYNC_FILE */ } static void kbasep_jd_debugfs_atom_deps( diff --git a/drivers/gpu/arm/bifrost/mali_kbase_js.c b/drivers/gpu/arm/bifrost/mali_kbase_js.c index 1991bfa9532d..a64d7327a76b 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_js.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_js.c @@ -621,6 +621,7 @@ void kbasep_js_devdata_term(struct kbase_device *kbdev) { struct kbasep_js_device_data *js_devdata; s8 zero_ctx_attr_ref_count[KBASEP_JS_CTX_ATTR_COUNT] = { 0, }; + CSTD_UNUSED(js_devdata); KBASE_DEBUG_ASSERT(kbdev != NULL); @@ -638,15 +639,12 @@ void kbasep_js_devdata_term(struct kbase_device *kbdev) int kbasep_js_kctx_init(struct kbase_context *const kctx) { - struct kbase_device *kbdev; struct kbasep_js_kctx_info *js_kctx_info; int i, j; + CSTD_UNUSED(js_kctx_info); KBASE_DEBUG_ASSERT(kctx != NULL); - kbdev = kctx->kbdev; - KBASE_DEBUG_ASSERT(kbdev != NULL); - for (i = 0; i < BASE_JM_MAX_NR_SLOTS; ++i) INIT_LIST_HEAD(&kctx->jctx.sched_info.ctx.ctx_list_entry[i]); @@ -688,6 +686,7 @@ void kbasep_js_kctx_term(struct kbase_context *kctx) int js; bool update_ctx_count = false; unsigned long flags; + CSTD_UNUSED(js_kctx_info); KBASE_DEBUG_ASSERT(kctx != NULL); @@ -1800,6 +1799,7 @@ static kbasep_js_release_result kbasep_js_runpool_release_ctx_internal( bool runpool_ctx_attr_change = false; int kctx_as_nr; int new_ref_count; + CSTD_UNUSED(kctx_as_nr); KBASE_DEBUG_ASSERT(kbdev != NULL); KBASE_DEBUG_ASSERT(kctx != NULL); @@ -2183,6 +2183,7 @@ static bool kbasep_js_schedule_ctx(struct kbase_device *kbdev, #endif /* Cause it to leave at some later point */ bool retained; + CSTD_UNUSED(retained); retained = kbase_ctx_sched_inc_refcount_nolock(kctx); KBASE_DEBUG_ASSERT(retained); @@ -3918,6 +3919,7 @@ void kbase_js_zap_context(struct kbase_context *kctx) } else { unsigned long flags; bool was_retained; + CSTD_UNUSED(was_retained); /* Case c: didn't evict, but it is scheduled - it's in the Run * Pool diff --git a/drivers/gpu/arm/bifrost/mali_kbase_kinstr_jm.c b/drivers/gpu/arm/bifrost/mali_kbase_kinstr_jm.c index 78fa6f37ef6c..7b8961679a10 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_kinstr_jm.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_kinstr_jm.c @@ -61,10 +61,6 @@ #define __static_assert(e, msg, ...) _Static_assert(e, msg) #endif -#ifndef ENOTSUP -#define ENOTSUP EOPNOTSUPP -#endif - /* The module printing prefix */ #define PR_ "mali_kbase_kinstr_jm: " @@ -224,11 +220,8 @@ static inline bool reader_changes_is_valid_size(const size_t size) * * Return: * (0, U16_MAX] - the number of data elements allocated - * -EINVAL - a pointer was invalid - * -ENOTSUP - we do not support allocation of the context * -ERANGE - the requested memory size was invalid * -ENOMEM - could not allocate the memory - * -EADDRINUSE - the buffer memory was already allocated */ static int reader_changes_init(struct reader_changes *const changes, const size_t size) @@ -623,31 +616,34 @@ exit: * * Return: * * 0 - no data ready - * * POLLIN - state changes have been buffered - * * -EBADF - the file descriptor did not have an attached reader - * * -EINVAL - the IO control arguments were invalid + * * EPOLLIN | EPOLLRDNORM - state changes have been buffered + * * EPOLLHUP | EPOLLERR - IO control arguments were invalid or the file + * descriptor did not have an attached reader. 
*/ static __poll_t reader_poll(struct file *const file, struct poll_table_struct *const wait) { struct reader *reader; struct reader_changes *changes; + __poll_t mask = 0; if (unlikely(!file || !wait)) - return (__poll_t)-EINVAL; + return EPOLLHUP | EPOLLERR; reader = file->private_data; if (unlikely(!reader)) - return (__poll_t)-EBADF; + return EPOLLHUP | EPOLLERR; changes = &reader->changes; - if (reader_changes_count(changes) >= changes->threshold) - return POLLIN; + return EPOLLIN | EPOLLRDNORM; poll_wait(file, &reader->wait_queue, wait); - return (reader_changes_count(changes) > 0) ? POLLIN : 0; + if (reader_changes_count(changes) > 0) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; } /* The file operations virtual function table */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.c b/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.c index 81758c32259c..5fb11b7b94c5 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.c @@ -21,8 +21,8 @@ #include "mali_kbase.h" #include "mali_kbase_kinstr_prfcnt.h" -#include "mali_kbase_hwcnt_virtualizer.h" -#include "mali_kbase_hwcnt_gpu.h" +#include "hwcnt/mali_kbase_hwcnt_virtualizer.h" +#include "hwcnt/mali_kbase_hwcnt_gpu.h" #include #include "mali_malisw.h" #include "mali_kbase_debug.h" @@ -47,9 +47,6 @@ /* The maximum allowed buffers per client */ #define MAX_BUFFER_COUNT 32 -/* The module printing prefix */ -#define KINSTR_PRFCNT_PREFIX "mali_kbase_kinstr_prfcnt: " - /** * struct kbase_kinstr_prfcnt_context - IOCTL interface for userspace hardware * counters. @@ -224,8 +221,8 @@ static struct prfcnt_enum_item kinstr_prfcnt_supported_requests[] = { * @filp: Non-NULL pointer to file structure. * @wait: Non-NULL pointer to poll table. * - * Return: POLLIN if data can be read without blocking, 0 if data can not be - * read without blocking, else error code. + * Return: EPOLLIN | EPOLLRDNORM if data can be read without blocking, 0 if + * data can not be read without blocking, else EPOLLHUP | EPOLLERR. */ static __poll_t kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp, @@ -234,19 +231,19 @@ kbasep_kinstr_prfcnt_hwcnt_reader_poll(struct file *filp, struct kbase_kinstr_prfcnt_client *cli; if (!filp || !wait) - return (__poll_t)-EINVAL; + return EPOLLHUP | EPOLLERR; cli = filp->private_data; if (!cli) - return (__poll_t)-EINVAL; + return EPOLLHUP | EPOLLERR; poll_wait(filp, &cli->waitq, wait); if (atomic_read(&cli->write_idx) != atomic_read(&cli->fetch_idx)) - return POLLIN; + return EPOLLIN | EPOLLRDNORM; - return 0; + return (__poll_t)0; } /** diff --git a/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.h b/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.h index 0ffc10e5c496..bbe33796e62f 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_kinstr_prfcnt.h @@ -26,7 +26,7 @@ #ifndef _KBASE_KINSTR_PRFCNT_H_ #define _KBASE_KINSTR_PRFCNT_H_ -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include struct kbase_kinstr_prfcnt_context; @@ -80,7 +80,6 @@ void kbase_kinstr_prfcnt_suspend(struct kbase_kinstr_prfcnt_context *kinstr_ctx) */ void kbase_kinstr_prfcnt_resume(struct kbase_kinstr_prfcnt_context *kinstr_ctx); -#if MALI_KERNEL_TEST_API /** * kbasep_kinstr_prfcnt_get_block_info_list() - Get list of all block types * with their information. @@ -158,7 +157,6 @@ int kbasep_kinstr_prfcnt_cmd(struct kbase_kinstr_prfcnt_client *cli, * @cli: kinstr_prfcnt client. 
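Both poll handlers are converted above to the standard convention for __poll_t: the return value is a bitmask of EPOLL* flags, and error conditions are reported as EPOLLHUP | EPOLLERR rather than as a negative errno cast to __poll_t. A minimal sketch of that shape with a generic data-ready test:

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/wait.h>

struct demo_reader {
	wait_queue_head_t wait_queue;
	bool data_ready;		/* assumption: set elsewhere when data arrives */
};

static __poll_t demo_poll(struct file *file, struct poll_table_struct *wait)
{
	struct demo_reader *reader = file->private_data;
	__poll_t mask = 0;

	if (unlikely(!reader))
		return EPOLLHUP | EPOLLERR;

	poll_wait(file, &reader->wait_queue, wait);

	if (reader->data_ready)
		mask |= EPOLLIN | EPOLLRDNORM;

	return mask;
}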
Must not be attached to a kinstr_prfcnt context. */ void kbasep_kinstr_prfcnt_client_destroy(struct kbase_kinstr_prfcnt_client *cli); -#endif /* MALI_KERNEL_TEST_API */ /** * kbase_kinstr_prfcnt_enum_info - Enumerate performance counter information. diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem.c b/drivers/gpu/arm/bifrost/mali_kbase_mem.c index e0785793e26a..3743b4df999f 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_mem.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem.c @@ -44,6 +44,8 @@ #include #include +#if MALI_JIT_PRESSURE_LIMIT_BASE + /* * Alignment of objects allocated by the GPU inside a just-in-time memory * region whose size is given by an end address @@ -66,6 +68,7 @@ */ #define KBASE_GPU_ALLOCATED_OBJECT_MAX_BYTES (512u) +#endif /* MALI_JIT_PRESSURE_LIMIT_BASE */ /* Forward declarations */ static void free_partial_locked(struct kbase_context *kctx, @@ -429,15 +432,15 @@ void kbase_remove_va_region(struct kbase_device *kbdev, next->nr_pages += reg->nr_pages; rb_erase(&(reg->rblink), reg_rbtree); merged_back = 1; - if (merged_front) { - /* We already merged with prev, free it */ - kfree(reg); - } } } - /* If we failed to merge then we need to add a new block */ - if (!(merged_front || merged_back)) { + if (merged_front && merged_back) { + /* We already merged with prev, free it */ + kfree(reg); + } else if (!(merged_front || merged_back)) { + /* If we failed to merge then we need to add a new block */ + /* * We didn't merge anything. Try to add a new free * placeholder, and in any case, remove the original one. @@ -1416,6 +1419,7 @@ int kbase_mem_init(struct kbase_device *kbdev) memdev = &kbdev->memdev; + kbase_mem_migrate_init(kbdev); kbase_mem_pool_group_config_set_max_size(&kbdev->mem_pool_defaults, KBASE_MEM_POOL_MAX_SIZE_KCTX); @@ -1478,8 +1482,7 @@ int kbase_mem_init(struct kbase_device *kbdev) kbase_mem_pool_group_config_set_max_size(&mem_pool_defaults, KBASE_MEM_POOL_MAX_SIZE_KBDEV); - err = kbase_mem_pool_group_init(&kbdev->mem_pools, kbdev, - &mem_pool_defaults, NULL); + err = kbase_mem_pool_group_init(&kbdev->mem_pools, kbdev, &mem_pool_defaults, NULL); } return err; @@ -1505,6 +1508,8 @@ void kbase_mem_term(struct kbase_device *kbdev) kbase_mem_pool_group_term(&kbdev->mem_pools); + kbase_mem_migrate_term(kbdev); + WARN_ON(kbdev->total_gpu_pages); WARN_ON(!RB_EMPTY_ROOT(&kbdev->process_root)); WARN_ON(!RB_EMPTY_ROOT(&kbdev->dma_buf_root)); @@ -1613,6 +1618,7 @@ static struct kbase_context *kbase_reg_flags_to_kctx( * alloc object will be released. * It is a bug if no alloc object exists for non-free regions. * + * If region is KBASE_REG_ZONE_MCU_SHARED it is freed */ void kbase_free_alloced_region(struct kbase_va_region *reg) { @@ -1636,6 +1642,13 @@ void kbase_free_alloced_region(struct kbase_va_region *reg) (void *)reg); #if MALI_USE_CSF if (reg->flags & KBASE_REG_CSF_EVENT) + /* + * This should not be reachable if called from 'mcu_shared' functions + * such as: + * kbase_csf_firmware_mcu_shared_mapping_init + * kbase_csf_firmware_mcu_shared_mapping_term + */ + kbase_unlink_event_mem_page(kctx, reg); #endif @@ -1649,8 +1662,6 @@ void kbase_free_alloced_region(struct kbase_va_region *reg) * on the list at termination time of the region tracker. 
*/ if (!list_empty(®->gpu_alloc->evict_node)) { - mutex_unlock(&kctx->jit_evict_lock); - /* * Unlink the physical allocation before unmaking it * evictable so that the allocation isn't grown back to @@ -1661,6 +1672,8 @@ void kbase_free_alloced_region(struct kbase_va_region *reg) if (reg->cpu_alloc != reg->gpu_alloc) reg->gpu_alloc->reg = NULL; + mutex_unlock(&kctx->jit_evict_lock); + /* * If a region has been made evictable then we must * unmake it before trying to free it. @@ -1812,8 +1825,8 @@ bad_insert: KBASE_EXPORT_TEST_API(kbase_gpu_mmap); -static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, - struct kbase_mem_phy_alloc *alloc, bool writeable); +static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem_phy_alloc *alloc, + struct kbase_va_region *reg, bool writeable); int kbase_gpu_munmap(struct kbase_context *kctx, struct kbase_va_region *reg) { @@ -1879,7 +1892,7 @@ int kbase_gpu_munmap(struct kbase_context *kctx, struct kbase_va_region *reg) /* The allocation could still have active mappings. */ if (user_buf->current_mapping_usage_count == 0) { - kbase_jd_user_buf_unmap(kctx, alloc, + kbase_jd_user_buf_unmap(kctx, alloc, reg, (reg->flags & (KBASE_REG_CPU_WR | KBASE_REG_GPU_WR))); } @@ -2004,7 +2017,8 @@ void kbase_sync_single(struct kbase_context *kctx, BUG_ON(!cpu_page); BUG_ON(offset + size > PAGE_SIZE); - dma_addr = kbase_dma_addr(cpu_page) + offset; + dma_addr = kbase_dma_addr_from_tagged(t_cpu_pa) + offset; + if (sync_fn == KBASE_SYNC_TO_CPU) dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, size, DMA_BIDIRECTIONAL); @@ -2015,19 +2029,20 @@ void kbase_sync_single(struct kbase_context *kctx, void *src = NULL; void *dst = NULL; struct page *gpu_page; + dma_addr_t dma_addr; if (WARN(!gpu_pa, "No GPU PA found for infinite cache op")) return; gpu_page = pfn_to_page(PFN_DOWN(gpu_pa)); + dma_addr = kbase_dma_addr_from_tagged(t_gpu_pa) + offset; if (sync_fn == KBASE_SYNC_TO_DEVICE) { src = ((unsigned char *)kmap(cpu_page)) + offset; dst = ((unsigned char *)kmap(gpu_page)) + offset; } else if (sync_fn == KBASE_SYNC_TO_CPU) { - dma_sync_single_for_cpu(kctx->kbdev->dev, - kbase_dma_addr(gpu_page) + offset, - size, DMA_BIDIRECTIONAL); + dma_sync_single_for_cpu(kctx->kbdev->dev, dma_addr, size, + DMA_BIDIRECTIONAL); src = ((unsigned char *)kmap(gpu_page)) + offset; dst = ((unsigned char *)kmap(cpu_page)) + offset; } @@ -2035,9 +2050,8 @@ void kbase_sync_single(struct kbase_context *kctx, kunmap(gpu_page); kunmap(cpu_page); if (sync_fn == KBASE_SYNC_TO_DEVICE) - dma_sync_single_for_device(kctx->kbdev->dev, - kbase_dma_addr(gpu_page) + offset, - size, DMA_BIDIRECTIONAL); + dma_sync_single_for_device(kctx->kbdev->dev, dma_addr, size, + DMA_BIDIRECTIONAL); } } @@ -2188,24 +2202,22 @@ int kbase_mem_free_region(struct kbase_context *kctx, struct kbase_va_region *re return -EINVAL; } - /* - * Unlink the physical allocation before unmaking it evictable so - * that the allocation isn't grown back to its last backed size - * as we're going to unmap it anyway. - */ - reg->cpu_alloc->reg = NULL; - if (reg->cpu_alloc != reg->gpu_alloc) - reg->gpu_alloc->reg = NULL; - - /* - * If a region has been made evictable then we must unmake it + /* If a region has been made evictable then we must unmake it * before trying to free it. * If the memory hasn't been reclaimed it will be unmapped and freed * below, if it has been reclaimed then the operations below are no-ops. 
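kbase_sync_single() above now takes the DMA address from the tagged physical address (kbase_dma_addr_from_tagged) and syncs only the touched sub-range. The underlying streaming-DMA pattern, shown in isolation with generic names:

#include <linux/dma-mapping.h>
#include <linux/string.h>

/* Sync a sub-range of an already-mapped page around a CPU access. */
static void demo_cpu_access(struct device *dev, dma_addr_t dma_addr,
			    void *cpu_va, size_t offset, size_t size)
{
	/* Give the CPU a coherent view before it reads or modifies the range. */
	dma_sync_single_for_cpu(dev, dma_addr + offset, size, DMA_BIDIRECTIONAL);

	memset((char *)cpu_va + offset, 0, size);	/* example CPU write */

	/* Hand ownership of the range back to the device afterwards. */
	dma_sync_single_for_device(dev, dma_addr + offset, size, DMA_BIDIRECTIONAL);
}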
*/ if (reg->flags & KBASE_REG_DONT_NEED) { - KBASE_DEBUG_ASSERT(reg->cpu_alloc->type == - KBASE_MEM_TYPE_NATIVE); + WARN_ON(reg->cpu_alloc->type != KBASE_MEM_TYPE_NATIVE); + mutex_lock(&kctx->jit_evict_lock); + /* Unlink the physical allocation before unmaking it evictable so + * that the allocation isn't grown back to its last backed size + * as we're going to unmap it anyway. + */ + reg->cpu_alloc->reg = NULL; + if (reg->cpu_alloc != reg->gpu_alloc) + reg->gpu_alloc->reg = NULL; + mutex_unlock(&kctx->jit_evict_lock); kbase_mem_evictable_unmake(reg->gpu_alloc); } @@ -2464,11 +2476,8 @@ int kbase_alloc_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, if (nr_left >= (SZ_2M / SZ_4K)) { int nr_lp = nr_left / (SZ_2M / SZ_4K); - res = kbase_mem_pool_alloc_pages( - &kctx->mem_pools.large[alloc->group_id], - nr_lp * (SZ_2M / SZ_4K), - tp, - true); + res = kbase_mem_pool_alloc_pages(&kctx->mem_pools.large[alloc->group_id], + nr_lp * (SZ_2M / SZ_4K), tp, true); if (res > 0) { nr_left -= res; @@ -2567,9 +2576,8 @@ no_new_partial: #endif if (nr_left) { - res = kbase_mem_pool_alloc_pages( - &kctx->mem_pools.small[alloc->group_id], - nr_left, tp, false); + res = kbase_mem_pool_alloc_pages(&kctx->mem_pools.small[alloc->group_id], nr_left, + tp, false); if (res <= 0) goto alloc_failed; } @@ -3061,6 +3069,13 @@ KBASE_EXPORT_TEST_API(kbase_free_phy_pages_helper_locked); /** * kbase_jd_user_buf_unpin_pages - Release the pinned pages of a user buffer. * @alloc: The allocation for the imported user buffer. + * + * This must only be called when terminating an alloc, when its refcount + * (number of users) has become 0. This also ensures it is only called once all + * CPU mappings have been closed. + * + * Instead call kbase_jd_user_buf_unmap() if you need to unpin pages on active + * allocations */ static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc); #endif @@ -3434,10 +3449,6 @@ int kbase_check_alloc_sizes(struct kbase_context *kctx, unsigned long flags, #undef KBASE_MSG_PRE } -/** - * kbase_gpu_vm_lock() - Acquire the per-context region list lock - * @kctx: KBase context - */ void kbase_gpu_vm_lock(struct kbase_context *kctx) { KBASE_DEBUG_ASSERT(kctx != NULL); @@ -3446,10 +3457,6 @@ void kbase_gpu_vm_lock(struct kbase_context *kctx) KBASE_EXPORT_TEST_API(kbase_gpu_vm_lock); -/** - * kbase_gpu_vm_unlock() - Release the per-context region list lock - * @kctx: KBase context - */ void kbase_gpu_vm_unlock(struct kbase_context *kctx) { KBASE_DEBUG_ASSERT(kctx != NULL); @@ -3774,6 +3781,7 @@ int kbase_jit_init(struct kbase_context *kctx) INIT_WORK(&kctx->jit_work, kbase_jit_destroy_worker); #if MALI_USE_CSF + mutex_init(&kctx->csf.kcpu_queues.jit_lock); INIT_LIST_HEAD(&kctx->csf.kcpu_queues.jit_cmds_head); INIT_LIST_HEAD(&kctx->csf.kcpu_queues.jit_blocked_queues); #else /* !MALI_USE_CSF */ @@ -4211,11 +4219,11 @@ static bool jit_allow_allocate(struct kbase_context *kctx, const struct base_jit_alloc_info *info, bool ignore_pressure_limit) { -#if MALI_USE_CSF - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); -#else +#if !MALI_USE_CSF lockdep_assert_held(&kctx->jctx.lock); -#endif +#else /* MALI_USE_CSF */ + lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock); +#endif /* !MALI_USE_CSF */ #if MALI_JIT_PRESSURE_LIMIT_BASE if (!ignore_pressure_limit && @@ -4306,11 +4314,11 @@ struct kbase_va_region *kbase_jit_allocate(struct kbase_context *kctx, */ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_SYNC; -#if MALI_USE_CSF - lockdep_assert_held(&kctx->csf.kcpu_queues.lock); 
-#else +#if !MALI_USE_CSF lockdep_assert_held(&kctx->jctx.lock); -#endif +#else /* MALI_USE_CSF */ + lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock); +#endif /* !MALI_USE_CSF */ if (!jit_allow_allocate(kctx, info, ignore_pressure_limit)) return NULL; @@ -4518,6 +4526,12 @@ void kbase_jit_free(struct kbase_context *kctx, struct kbase_va_region *reg) { u64 old_pages; +#if !MALI_USE_CSF + lockdep_assert_held(&kctx->jctx.lock); +#else /* MALI_USE_CSF */ + lockdep_assert_held(&kctx->csf.kcpu_queues.jit_lock); +#endif /* !MALI_USE_CSF */ + /* JIT id not immediately available here, so use 0u */ trace_mali_jit_free(reg, 0u); @@ -4764,7 +4778,23 @@ void kbase_unpin_user_buf_page(struct page *page) #if MALI_USE_CSF static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc) { - if (alloc->nents) { + /* In CSF builds, we keep pages pinned until the last reference is + * released on the alloc. A refcount of 0 also means we can be sure + * that all CPU mappings have been closed on this alloc, and no more + * mappings of it will be created. + * + * Further, the WARN() below captures the restriction that this + * function will not handle anything other than the alloc termination + * path, because the caller of kbase_mem_phy_alloc_put() is not + * required to hold the kctx's reg_lock, and so we could not handle + * removing an existing CPU mapping here. + * + * Refer to this function's kernel-doc comments for alternatives for + * unpinning a User buffer. + */ + + if (alloc->nents && !WARN(kref_read(&alloc->kref) != 0, + "must only be called on terminating an allocation")) { struct page **pages = alloc->imported.user_buf.pages; long i; @@ -4772,6 +4802,8 @@ static void kbase_jd_user_buf_unpin_pages(struct kbase_mem_phy_alloc *alloc) for (i = 0; i < alloc->nents; i++) kbase_unpin_user_buf_page(pages[i]); + + alloc->nents = 0; } } #endif @@ -4787,6 +4819,8 @@ int kbase_jd_user_buf_pin_pages(struct kbase_context *kctx, long i; int write; + lockdep_assert_held(&kctx->reg_lock); + if (WARN_ON(alloc->type != KBASE_MEM_TYPE_IMPORTED_USER_BUF)) return -EINVAL; @@ -4817,6 +4851,9 @@ int kbase_jd_user_buf_pin_pages(struct kbase_context *kctx, return pinned_pages; if (pinned_pages != alloc->imported.user_buf.nr_pages) { + /* Above code already ensures there will not have been a CPU + * mapping by ensuring alloc->nents is 0 + */ for (i = 0; i < pinned_pages; i++) kbase_unpin_user_buf_page(pages[i]); return -ENOMEM; @@ -4830,23 +4867,26 @@ int kbase_jd_user_buf_pin_pages(struct kbase_context *kctx, static int kbase_jd_user_buf_map(struct kbase_context *kctx, struct kbase_va_region *reg) { - long pinned_pages; + int err; + long pinned_pages = 0; struct kbase_mem_phy_alloc *alloc; struct page **pages; struct tagged_addr *pa; - long i; + long i, dma_mapped_pages; unsigned long address; struct device *dev; - unsigned long offset; - unsigned long local_size; + unsigned long offset_within_page; + unsigned long remaining_size; unsigned long gwt_mask = ~0; - int err = kbase_jd_user_buf_pin_pages(kctx, reg); - /* Calls to this function are inherently asynchronous, with respect to * MMU operations. 
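The lockdep_assert_held() calls added above make the JIT locking contract (jctx.lock on JM builds, kcpu_queues.jit_lock on CSF builds) checkable at runtime on lockdep-enabled kernels, and compile away otherwise. The pattern in its simplest form, with a hypothetical context:

#include <linux/lockdep.h>
#include <linux/mutex.h>

struct demo_ctx {
	struct mutex jit_lock;
	int jit_state;
};

/* Caller must hold @ctx->jit_lock. On CONFIG_LOCKDEP kernels a violation is
 * reported immediately; on production kernels the assert has no cost.
 */
static void demo_jit_update(struct demo_ctx *ctx, int state)
{
	lockdep_assert_held(&ctx->jit_lock);
	ctx->jit_state = state;
}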
*/ const enum kbase_caller_mmu_sync_info mmu_sync_info = CALLER_MMU_ASYNC; + lockdep_assert_held(&kctx->reg_lock); + + err = kbase_jd_user_buf_pin_pages(kctx, reg); + if (err) return err; @@ -4856,17 +4896,16 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx, pinned_pages = alloc->nents; pages = alloc->imported.user_buf.pages; dev = kctx->kbdev->dev; - offset = address & ~PAGE_MASK; - local_size = alloc->imported.user_buf.size; + offset_within_page = address & ~PAGE_MASK; + remaining_size = alloc->imported.user_buf.size; for (i = 0; i < pinned_pages; i++) { - dma_addr_t dma_addr; - unsigned long min; - - min = MIN(PAGE_SIZE - offset, local_size); - dma_addr = dma_map_page(dev, pages[i], - offset, min, + unsigned long map_size = + MIN(PAGE_SIZE - offset_within_page, remaining_size); + dma_addr_t dma_addr = dma_map_page(dev, pages[i], + offset_within_page, map_size, DMA_BIDIRECTIONAL); + err = dma_mapping_error(dev, dma_addr); if (err) goto unwind; @@ -4874,8 +4913,8 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx, alloc->imported.user_buf.dma_addrs[i] = dma_addr; pa[i] = as_tagged(page_to_phys(pages[i])); - local_size -= min; - offset = 0; + remaining_size -= map_size; + offset_within_page = 0; } #ifdef CONFIG_MALI_CINSTR_GWT @@ -4893,13 +4932,28 @@ static int kbase_jd_user_buf_map(struct kbase_context *kctx, /* fall down */ unwind: alloc->nents = 0; - while (i--) { + offset_within_page = address & ~PAGE_MASK; + remaining_size = alloc->imported.user_buf.size; + dma_mapped_pages = i; + /* Run the unmap loop in the same order as map loop */ + for (i = 0; i < dma_mapped_pages; i++) { + unsigned long unmap_size = + MIN(PAGE_SIZE - offset_within_page, remaining_size); + dma_unmap_page(kctx->kbdev->dev, alloc->imported.user_buf.dma_addrs[i], - PAGE_SIZE, DMA_BIDIRECTIONAL); + unmap_size, DMA_BIDIRECTIONAL); + remaining_size -= unmap_size; + offset_within_page = 0; } - while (++i < pinned_pages) { + /* The user buffer could already have been previously pinned before + * entering this function, and hence there could potentially be CPU + * mappings of it + */ + kbase_mem_shrink_cpu_mapping(kctx, reg, 0, pinned_pages); + + for (i = 0; i < pinned_pages; i++) { kbase_unpin_user_buf_page(pages[i]); pages[i] = NULL; } @@ -4911,21 +4965,31 @@ unwind: * GPUs, which implies that a call to kbase_jd_user_buf_pin_pages() will NOT * have a corresponding call to kbase_jd_user_buf_unpin_pages(). 
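The rewritten mapping loop above tracks offset_within_page and remaining_size so that only the first page can be partially mapped, validates each handle with dma_mapping_error(), and on failure unmaps in the same order with the same sizes. A stripped-down sketch of that map/unwind pairing (hypothetical arrays and sizes):

#include <linux/dma-mapping.h>
#include <linux/minmax.h>
#include <linux/mm.h>

static int demo_map_user_pages(struct device *dev, struct page **pages,
			       dma_addr_t *dma_addrs, long nr_pages,
			       unsigned long first_page_offset,
			       unsigned long total_size)
{
	unsigned long offset = first_page_offset;
	unsigned long remaining = total_size;
	long i, mapped;
	int err = 0;

	for (i = 0; i < nr_pages; i++) {
		unsigned long map_size = min(PAGE_SIZE - offset, remaining);
		dma_addr_t dma_addr = dma_map_page(dev, pages[i], offset,
						   map_size, DMA_BIDIRECTIONAL);

		err = dma_mapping_error(dev, dma_addr);
		if (err)
			goto unwind;

		dma_addrs[i] = dma_addr;
		remaining -= map_size;
		offset = 0;		/* only the first page may start mid-page */
	}
	return 0;

unwind:
	/* Unmap in the same order, recomputing the same offsets and sizes. */
	mapped = i;
	offset = first_page_offset;
	remaining = total_size;
	for (i = 0; i < mapped; i++) {
		unsigned long unmap_size = min(PAGE_SIZE - offset, remaining);

		dma_unmap_page(dev, dma_addrs[i], unmap_size, DMA_BIDIRECTIONAL);
		remaining -= unmap_size;
		offset = 0;
	}
	return err;
}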
*/ -static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, - struct kbase_mem_phy_alloc *alloc, bool writeable) +static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, struct kbase_mem_phy_alloc *alloc, + struct kbase_va_region *reg, bool writeable) { long i; struct page **pages; - unsigned long size = alloc->imported.user_buf.size; + unsigned long offset_within_page = alloc->imported.user_buf.address & ~PAGE_MASK; + unsigned long remaining_size = alloc->imported.user_buf.size; + + lockdep_assert_held(&kctx->reg_lock); KBASE_DEBUG_ASSERT(alloc->type == KBASE_MEM_TYPE_IMPORTED_USER_BUF); pages = alloc->imported.user_buf.pages; + +#if !MALI_USE_CSF + kbase_mem_shrink_cpu_mapping(kctx, reg, 0, alloc->nents); +#else + CSTD_UNUSED(reg); +#endif + for (i = 0; i < alloc->imported.user_buf.nr_pages; i++) { - unsigned long local_size; + unsigned long unmap_size = + MIN(remaining_size, PAGE_SIZE - offset_within_page); dma_addr_t dma_addr = alloc->imported.user_buf.dma_addrs[i]; - local_size = MIN(size, PAGE_SIZE - (dma_addr & ~PAGE_MASK)); - dma_unmap_page(kctx->kbdev->dev, dma_addr, local_size, + dma_unmap_page(kctx->kbdev->dev, dma_addr, unmap_size, DMA_BIDIRECTIONAL); if (writeable) set_page_dirty_lock(pages[i]); @@ -4934,7 +4998,8 @@ static void kbase_jd_user_buf_unmap(struct kbase_context *kctx, pages[i] = NULL; #endif - size -= local_size; + remaining_size -= unmap_size; + offset_within_page = 0; } #if !MALI_USE_CSF alloc->nents = 0; @@ -4981,11 +5046,11 @@ int kbase_mem_copy_to_pinned_user_pages(struct page **dest_pages, return 0; } -struct kbase_mem_phy_alloc *kbase_map_external_resource( - struct kbase_context *kctx, struct kbase_va_region *reg, - struct mm_struct *locked_mm) +int kbase_map_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg, + struct mm_struct *locked_mm) { - int err; + int err = 0; + struct kbase_mem_phy_alloc *alloc = reg->gpu_alloc; lockdep_assert_held(&kctx->reg_lock); @@ -4994,7 +5059,7 @@ struct kbase_mem_phy_alloc *kbase_map_external_resource( case KBASE_MEM_TYPE_IMPORTED_USER_BUF: { if ((reg->gpu_alloc->imported.user_buf.mm != locked_mm) && (!reg->gpu_alloc->nents)) - goto exit; + return -EINVAL; reg->gpu_alloc->imported.user_buf.current_mapping_usage_count++; if (reg->gpu_alloc->imported.user_buf @@ -5002,7 +5067,7 @@ struct kbase_mem_phy_alloc *kbase_map_external_resource( err = kbase_jd_user_buf_map(kctx, reg); if (err) { reg->gpu_alloc->imported.user_buf.current_mapping_usage_count--; - goto exit; + return err; } } } @@ -5010,21 +5075,29 @@ struct kbase_mem_phy_alloc *kbase_map_external_resource( case KBASE_MEM_TYPE_IMPORTED_UMM: { err = kbase_mem_umm_map(kctx, reg); if (err) - goto exit; + return err; break; } default: - goto exit; + WARN(1, "Invalid external resource GPU allocation type (%x) on mapping", + alloc->type); + return -EINVAL; } - return kbase_mem_phy_alloc_get(reg->gpu_alloc); -exit: - return NULL; + kbase_va_region_alloc_get(kctx, reg); + kbase_mem_phy_alloc_get(alloc); + return err; } -void kbase_unmap_external_resource(struct kbase_context *kctx, - struct kbase_va_region *reg, struct kbase_mem_phy_alloc *alloc) +void kbase_unmap_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg) { + /* gpu_alloc was used in kbase_map_external_resources, so we need to use it for the + * unmapping operation. 
+ */ + struct kbase_mem_phy_alloc *alloc = reg->gpu_alloc; + + lockdep_assert_held(&kctx->reg_lock); + switch (alloc->type) { case KBASE_MEM_TYPE_IMPORTED_UMM: { kbase_mem_umm_unmap(kctx, reg, alloc); @@ -5036,24 +5109,32 @@ void kbase_unmap_external_resource(struct kbase_context *kctx, if (alloc->imported.user_buf.current_mapping_usage_count == 0) { bool writeable = true; - if (!kbase_is_region_invalid_or_free(reg) && - reg->gpu_alloc == alloc) + if (!kbase_is_region_invalid_or_free(reg)) { kbase_mmu_teardown_pages(kctx->kbdev, &kctx->mmu, reg->start_pfn, alloc->pages, kbase_reg_current_backed_size(reg), kctx->as_nr); + } - if (reg && ((reg->flags & (KBASE_REG_CPU_WR | KBASE_REG_GPU_WR)) == 0)) + if ((reg->flags & (KBASE_REG_CPU_WR | KBASE_REG_GPU_WR)) == 0) writeable = false; - kbase_jd_user_buf_unmap(kctx, alloc, writeable); + kbase_jd_user_buf_unmap(kctx, alloc, reg, writeable); + } } - } break; default: - break; + WARN(1, "Invalid external resource GPU allocation type (%x) on unmapping", + alloc->type); + return; } kbase_mem_phy_alloc_put(alloc); + kbase_va_region_alloc_put(kctx, reg); +} + +static inline u64 kbasep_get_va_gpu_addr(struct kbase_va_region *reg) +{ + return reg->start_pfn << PAGE_SHIFT; } struct kbase_ctx_ext_res_meta *kbase_sticky_resource_acquire( @@ -5069,7 +5150,7 @@ struct kbase_ctx_ext_res_meta *kbase_sticky_resource_acquire( * metadata which matches the region which is being acquired. */ list_for_each_entry(walker, &kctx->ext_res_meta_head, ext_res_node) { - if (walker->gpu_addr == gpu_addr) { + if (kbasep_get_va_gpu_addr(walker->reg) == gpu_addr) { meta = walker; meta->ref++; break; @@ -5081,8 +5162,7 @@ struct kbase_ctx_ext_res_meta *kbase_sticky_resource_acquire( struct kbase_va_region *reg; /* Find the region */ - reg = kbase_region_tracker_find_region_enclosing_address( - kctx, gpu_addr); + reg = kbase_region_tracker_find_region_enclosing_address(kctx, gpu_addr); if (kbase_is_region_invalid_or_free(reg)) goto failed; @@ -5090,18 +5170,18 @@ struct kbase_ctx_ext_res_meta *kbase_sticky_resource_acquire( meta = kzalloc(sizeof(*meta), GFP_KERNEL); if (!meta) goto failed; - /* * Fill in the metadata object and acquire a reference * for the physical resource. */ - meta->alloc = kbase_map_external_resource(kctx, reg, NULL); - meta->ref = 1; + meta->reg = reg; - if (!meta->alloc) + /* Map the external resource to the GPU allocation of the region + * and acquire the reference to the VA region + */ + if (kbase_map_external_resource(kctx, meta->reg, NULL)) goto fail_map; - - meta->gpu_addr = reg->start_pfn << PAGE_SHIFT; + meta->ref = 1; list_add(&meta->ext_res_node, &kctx->ext_res_meta_head); } @@ -5126,7 +5206,7 @@ find_sticky_resource_meta(struct kbase_context *kctx, u64 gpu_addr) * metadata which matches the region which is being released. */ list_for_each_entry(walker, &kctx->ext_res_meta_head, ext_res_node) - if (walker->gpu_addr == gpu_addr) + if (kbasep_get_va_gpu_addr(walker->reg) == gpu_addr) return walker; return NULL; @@ -5135,14 +5215,7 @@ find_sticky_resource_meta(struct kbase_context *kctx, u64 gpu_addr) static void release_sticky_resource_meta(struct kbase_context *kctx, struct kbase_ctx_ext_res_meta *meta) { - struct kbase_va_region *reg; - - /* Drop the physical memory reference and free the metadata. 
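kbase_map_external_resource() above now returns an error code and, on success, takes a reference on both the VA region and its physical allocation; kbase_unmap_external_resource() drops them again. A generic sketch of that "both references or none" shape (all names hypothetical):

#include <linux/types.h>

struct demo_region { int dummy; };
struct demo_alloc { int dummy; };

/* Stand-ins for the driver's get/put and map/unmap calls. */
static void demo_region_get(struct demo_region *reg) { }
static void demo_region_put(struct demo_region *reg) { }
static void demo_alloc_get(struct demo_alloc *alloc) { }
static void demo_alloc_put(struct demo_alloc *alloc) { }
static int demo_do_map(struct demo_region *reg) { return 0; }
static void demo_do_unmap(struct demo_region *reg) { }

static int demo_map_external(struct demo_region *reg, struct demo_alloc *alloc)
{
	int err = demo_do_map(reg);

	if (err)
		return err;		/* nothing referenced on failure */

	demo_region_get(reg);		/* keep the VA region alive while mapped */
	demo_alloc_get(alloc);		/* and the physical backing with it */
	return 0;
}

static void demo_unmap_external(struct demo_region *reg, struct demo_alloc *alloc)
{
	demo_do_unmap(reg);
	demo_alloc_put(alloc);
	demo_region_put(reg);
}

Holding the region reference is what lets the new failed_loop path and the sticky-resource metadata key off the kbase_va_region pointer instead of a stored GPU address.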
*/ - reg = kbase_region_tracker_find_region_enclosing_address( - kctx, - meta->gpu_addr); - - kbase_unmap_external_resource(kctx, reg, meta->alloc); + kbase_unmap_external_resource(kctx, meta->reg); list_del(&meta->ext_res_node); kfree(meta); } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem.h b/drivers/gpu/arm/bifrost/mali_kbase_mem.h index f590a449504a..5820f6d8a556 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_mem.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem.h @@ -37,6 +37,7 @@ #include "mali_kbase_defs.h" /* Required for kbase_mem_evictable_unmake */ #include "mali_kbase_mem_linux.h" +#include "mali_kbase_mem_migrate.h" static inline void kbase_process_page_usage_inc(struct kbase_context *kctx, int pages); @@ -182,6 +183,89 @@ struct kbase_mem_phy_alloc { } imported; }; +/** + * enum kbase_page_status - Status of a page used for page migration. + * + * @MEM_POOL: Stable state. Page is located in a memory pool and can safely + * be migrated. + * @ALLOCATE_IN_PROGRESS: Transitory state. A page is set to this status as + * soon as it leaves a memory pool. + * @SPILL_IN_PROGRESS: Transitory state. Corner case where pages in a memory + * pool of a dying context are being moved to the device + * memory pool. + * @ALLOCATED_MAPPED: Stable state. Page has been allocated, mapped to GPU + * and has reference to kbase_mem_phy_alloc object. + * @MULTI_MAPPED: Stable state. This state is used to manage all use cases + * where a page may have "unusual" mappings. + * @PT_MAPPED: Stable state. Similar to ALLOCATED_MAPPED, but page doesn't + * reference kbase_mem_phy_alloc object. Used as a page in MMU + * page table. + * @FREE_IN_PROGRESS: Transitory state. A page is set to this status as soon as + * the driver manages to acquire a lock on the page while + * unmapping it. This status means that a memory release is + * happening and it's still not complete. + * @FREE_ISOLATED_IN_PROGRESS: Transitory state. This is a very particular corner case. + * A page is isolated while it is in ALLOCATED_MAPPED or + * PT_MAPPED state, but then the driver tries to destroy the + * allocation. + * + * Pages can only be migrated in stable states. + */ +enum kbase_page_status { + MEM_POOL = 0, + ALLOCATE_IN_PROGRESS, + SPILL_IN_PROGRESS, + ALLOCATED_MAPPED, + MULTI_MAPPED, + PT_MAPPED, + FREE_IN_PROGRESS, + FREE_ISOLATED_IN_PROGRESS, +}; + +/** + * struct kbase_page_metadata - Metadata for each page in kbase + * + * @kbdev: Pointer to kbase device. + * @dma_addr: DMA address mapped to page. + * @migrate_lock: A spinlock to protect the private metadata. + * @status: Status to keep track if page can be migrated at any + * given moment. MSB will indicate if page is isolated. + * Protected by @migrate_lock. + * @data: Member in union valid based on @status. + * + * Each 4KB page will have a reference to this struct in the private field. + * This will be used to keep track of information required for Linux page + * migration functionality as well as address for DMA mapping. + */ +struct kbase_page_metadata { + dma_addr_t dma_addr; + spinlock_t migrate_lock; + u8 status; + + union { + struct { + struct kbase_mem_pool *pool; + /* Pool could be terminated after page is isolated and therefore + * won't be able to get reference to kbase device. 
+ */ + struct kbase_device *kbdev; + } mem_pool; + struct { + struct kbase_mem_phy_alloc *phy_alloc; + struct kbase_va_region *reg; + struct kbase_mmu_table *mmut; + struct page *pgd; + u64 vpfn; + size_t page_array_index; + } mapped; + struct { + struct kbase_mmu_table *mmut; + struct page *pgd; + u16 entry_info; + } pt_mapped; + } data; +}; + /* The top bit of kbase_alloc_import_user_buf::current_mapping_usage_count is * used to signify that a buffer was pinned when it was imported. Since the * reference count is limited by the number of atoms that can be submitted at @@ -224,8 +308,9 @@ static inline void kbase_mem_phy_alloc_gpu_unmapped(struct kbase_mem_phy_alloc * } /** - * kbase_mem_phy_alloc_kernel_mapped - Increment kernel_mappings - * counter for a memory region to prevent commit and flag changes + * kbase_mem_phy_alloc_kernel_mapped - Increment kernel_mappings counter for a + * memory region to prevent commit and flag + * changes * * @alloc: Pointer to physical pages tracking object */ @@ -387,6 +472,13 @@ struct kbase_va_region { #define KBASE_REG_PROTECTED (1ul << 19) +/* Region belongs to a shrinker. + * + * This can either mean that it is part of the JIT/Ephemeral or tiler heap + * shrinker paths. Should be removed only after making sure that there are + * no references remaining to it in these paths, as it may cause the physical + * backing of the region to disappear during use. + */ #define KBASE_REG_DONT_NEED (1ul << 20) /* Imported buffer is padded? */ @@ -862,12 +954,9 @@ static inline size_t kbase_mem_pool_config_get_max_size( * * Return: 0 on success, negative -errno on error */ -int kbase_mem_pool_init(struct kbase_mem_pool *pool, - const struct kbase_mem_pool_config *config, - unsigned int order, - int group_id, - struct kbase_device *kbdev, - struct kbase_mem_pool *next_pool); +int kbase_mem_pool_init(struct kbase_mem_pool *pool, const struct kbase_mem_pool_config *config, + unsigned int order, int group_id, struct kbase_device *kbdev, + struct kbase_mem_pool *next_pool); /** * kbase_mem_pool_term - Destroy a memory pool @@ -963,7 +1052,7 @@ void kbase_mem_pool_free_locked(struct kbase_mem_pool *pool, struct page *p, * this lock, it should use kbase_mem_pool_alloc_pages_locked() instead. */ int kbase_mem_pool_alloc_pages(struct kbase_mem_pool *pool, size_t nr_4k_pages, - struct tagged_addr *pages, bool partial_allowed); + struct tagged_addr *pages, bool partial_allowed); /** * kbase_mem_pool_alloc_pages_locked - Allocate pages from memory pool @@ -1114,6 +1203,16 @@ void kbase_mem_pool_mark_dying(struct kbase_mem_pool *pool); */ struct page *kbase_mem_alloc_page(struct kbase_mem_pool *pool); +/** + * kbase_mem_pool_free_page - Free a page from a memory pool. + * @pool: Memory pool to free a page from + * @p: Page to free + * + * This will free any associated data stored for the page and release + * the page back to the kernel. 
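/*
 * Illustrative sketch (not part of the patch): pairing kbase_mem_alloc_page()
 * with kbase_mem_pool_free_page() as alloc/free inverses, as the kernel-doc
 * above suggests. The helper name is made up, and it assumes no extra locking
 * is required around the two calls; check the pool implementation before
 * relying on that.
 */
static int kbase_example_page_roundtrip(struct kbase_mem_pool *pool)
{
	struct page *p = kbase_mem_alloc_page(pool);

	if (!p)
		return -ENOMEM;

	/* ... temporary use of the page ... */

	/* Frees any per-page data (e.g. migration metadata) and the page itself. */
	kbase_mem_pool_free_page(pool, p);
	return 0;
}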
+ */ +void kbase_mem_pool_free_page(struct kbase_mem_pool *pool, struct page *p); + /** * kbase_region_tracker_init - Initialize the region tracker data structure * @kctx: kbase context @@ -1233,7 +1332,55 @@ int kbase_check_alloc_sizes(struct kbase_context *kctx, unsigned long flags, int kbase_update_region_flags(struct kbase_context *kctx, struct kbase_va_region *reg, unsigned long flags); +/** + * kbase_gpu_vm_lock() - Acquire the per-context region list lock + * @kctx: KBase context + * + * Care must be taken when making an allocation whilst holding this lock, because of interaction + * with the Kernel's OoM-killer and use of this lock in &vm_operations_struct close() handlers. + * + * If this lock is taken during a syscall, and/or the allocation is 'small' then it is safe to use. + * + * If the caller is not in a syscall, and the allocation is 'large', then it must not hold this + * lock. + * + * This is because the kernel OoM killer might target the process corresponding to that same kbase + * context, and attempt to call the context's close() handlers for its open VMAs. This is safe if + * the allocating caller is in a syscall, because the VMA close() handlers are delayed until all + * syscalls have finished (noting that no new syscalls can start as the remaining user threads will + * have been killed too), and so there is no possibility of contention between the thread + * allocating with this lock held, and the VMA close() handler. + * + * However, outside of a syscall (e.g. a kworker or other kthread), one of kbase's VMA close() + * handlers (kbase_cpu_vm_close()) also takes this lock, and so prevents the process from being + * killed until the caller of the function allocating memory has released this lock. On subsequent + * retries for allocating a page, the OoM killer would be re-invoked but skips over the process + * stuck in its close() handler. + * + * Also because the caller is not in a syscall, the page allocation code in the kernel is not aware + * that the allocation is being done on behalf of another process, and so does not realize that + * process has received a kill signal due to an OoM, and so will continually retry with the OoM + * killer until enough memory has been released, or until all other killable processes have been + * killed (at which point the kernel halts with a panic). + * + * However, if the allocation outside of a syscall is small enough to be satisfied by killing + * another process, then the allocation completes, the caller releases this lock, and + * kbase_cpu_vm_close() can unblock and allow the process to be killed. + * + * Hence, this is effectively a deadlock with kbase_cpu_vm_close(), except that if the memory + * allocation is small enough the deadlock can be resolved. For that reason, such a memory deadlock + * is NOT discovered with CONFIG_PROVE_LOCKING. + * + * If this may be called outside of a syscall, consider moving allocations outside of this lock, or + * use __GFP_NORETRY for such allocations (which will allow direct-reclaim attempts, but will + * prevent OoM kills to satisfy the allocation, and will just fail the allocation instead). 
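/*
 * Illustrative sketch (not part of the patch): following the guidance above
 * for a caller that is not in a syscall (e.g. a kworker) and must allocate
 * while holding the region list lock. The helper name is made up; it assumes
 * <linux/slab.h> and the kbase headers.
 */
static void *kbase_example_alloc_outside_syscall(struct kbase_context *kctx, size_t bytes)
{
	void *buf;

	kbase_gpu_vm_lock(kctx);
	/*
	 * __GFP_NORETRY still allows direct reclaim, but fails the allocation
	 * instead of invoking the OoM killer, which could otherwise deadlock
	 * against kbase_cpu_vm_close() waiting on this same lock.
	 */
	buf = kzalloc(bytes, GFP_KERNEL | __GFP_NORETRY);
	kbase_gpu_vm_unlock(kctx);

	return buf; /* May be NULL; the caller must tolerate failure. */
}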
+ */ void kbase_gpu_vm_lock(struct kbase_context *kctx); + +/** + * kbase_gpu_vm_unlock() - Release the per-context region list lock + * @kctx: KBase context + */ void kbase_gpu_vm_unlock(struct kbase_context *kctx); int kbase_alloc_phy_pages(struct kbase_va_region *reg, size_t vsize, size_t size); @@ -1311,6 +1458,7 @@ void kbase_mmu_disable_as(struct kbase_device *kbdev, int as_nr); void kbase_mmu_interrupt(struct kbase_device *kbdev, u32 irq_stat); +#if defined(CONFIG_MALI_VECTOR_DUMP) /** * kbase_mmu_dump() - Dump the MMU tables to a buffer. * @@ -1330,6 +1478,7 @@ void kbase_mmu_interrupt(struct kbase_device *kbdev, u32 irq_stat); * (including if the @c nr_pages is too small) */ void *kbase_mmu_dump(struct kbase_context *kctx, int nr_pages); +#endif /** * kbase_sync_now - Perform cache maintenance on a memory region @@ -1449,15 +1598,21 @@ int kbasep_find_enclosing_gpu_mapping_start_and_offset( * @alloc: allocation object to add pages to * @nr_pages_requested: number of physical pages to allocate * - * Allocates \a nr_pages_requested and updates the alloc object. + * Allocates @nr_pages_requested and updates the alloc object. * - * Return: 0 if all pages have been successfully allocated. Error code otherwise + * Note: if kbase_gpu_vm_lock() is to be held around this function to ensure thread-safe updating + * of @alloc, then refer to the documentation of kbase_gpu_vm_lock() about the requirements of + * either calling during a syscall, or ensuring the allocation is small. These requirements prevent + * an effective deadlock between the kernel's OoM killer and kbase's VMA close() handlers, which + * could take kbase_gpu_vm_lock() too. * - * Note : The caller must not hold vm_lock, as this could cause a deadlock if - * the kernel OoM killer runs. If the caller must allocate pages while holding - * this lock, it should use kbase_mem_pool_alloc_pages_locked() instead. + * If the requirements of kbase_gpu_vm_lock() cannot be satisfied when calling this function, but + * @alloc must still be updated in a thread-safe way, then instead use + * kbase_alloc_phy_pages_helper_locked() and restructure callers into the sequence outlined there. * * This function cannot be used from interrupt context + * + * Return: 0 if all pages have been successfully allocated. Error code otherwise */ int kbase_alloc_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, size_t nr_pages_requested); @@ -1467,17 +1622,19 @@ int kbase_alloc_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, * @alloc: allocation object to add pages to * @pool: Memory pool to allocate from * @nr_pages_requested: number of physical pages to allocate - * @prealloc_sa: Information about the partial allocation if the amount - * of memory requested is not a multiple of 2MB. One - * instance of struct kbase_sub_alloc must be allocated by - * the caller iff CONFIG_MALI_2MB_ALLOC is enabled. * - * Allocates \a nr_pages_requested and updates the alloc object. This function - * does not allocate new pages from the kernel, and therefore will never trigger - * the OoM killer. Therefore, it can be run while the vm_lock is held. + * @prealloc_sa: Information about the partial allocation if the amount of memory requested + * is not a multiple of 2MB. One instance of struct kbase_sub_alloc must be + * allocated by the caller iff CONFIG_MALI_2MB_ALLOC is enabled. * - * As new pages can not be allocated, the caller must ensure there are - * sufficient pages in the pool. 
Usage of this function should look like : + * Allocates @nr_pages_requested and updates the alloc object. This function does not allocate new + * pages from the kernel, and therefore will never trigger the OoM killer. Therefore, it can be + * called whilst a thread operating outside of a syscall has held the region list lock + * (kbase_gpu_vm_lock()), as it will not cause an effective deadlock with VMA close() handlers used + * by the OoM killer. + * + * As new pages can not be allocated, the caller must ensure there are sufficient pages in the + * pool. Usage of this function should look like : * * kbase_gpu_vm_lock(kctx); * kbase_mem_pool_lock(pool) @@ -1490,24 +1647,24 @@ int kbase_alloc_phy_pages_helper(struct kbase_mem_phy_alloc *alloc, * } * kbase_alloc_phy_pages_helper_locked(pool) * kbase_mem_pool_unlock(pool) - * Perform other processing that requires vm_lock... + * // Perform other processing that requires vm_lock... * kbase_gpu_vm_unlock(kctx); * - * This ensures that the pool can be grown to the required size and that the - * allocation can complete without another thread using the newly grown pages. + * This ensures that the pool can be grown to the required size and that the allocation can + * complete without another thread using the newly grown pages. * - * If CONFIG_MALI_2MB_ALLOC is defined and the allocation is >= 2MB, then - * @pool must be alloc->imported.native.kctx->lp_mem_pool. Otherwise it must be - * alloc->imported.native.kctx->mem_pool. - * @prealloc_sa is used to manage the non-2MB sub-allocation. It has to be - * pre-allocated because we must not sleep (due to the usage of kmalloc()) - * whilst holding pool->pool_lock. - * @prealloc_sa shall be set to NULL if it has been consumed by this function - * to indicate that the caller must not free it. + * If CONFIG_MALI_2MB_ALLOC is defined and the allocation is >= 2MB, then @pool must be one of the + * pools from alloc->imported.native.kctx->mem_pools.large[]. Otherwise it must be one of the + * mempools from alloc->imported.native.kctx->mem_pools.small[]. + * + * @prealloc_sa is used to manage the non-2MB sub-allocation. It has to be pre-allocated because we + * must not sleep (due to the usage of kmalloc()) whilst holding pool->pool_lock. @prealloc_sa + * shall be set to NULL if it has been consumed by this function to indicate that the caller no + * longer owns it and should not access it further. + * + * Note: Caller must hold @pool->pool_lock * * Return: Pointer to array of allocated pages. NULL on failure. 
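/*
 * Illustrative sketch (not part of the patch): one way the lock/grow/retry
 * sequence sketched in the kernel-doc above can be written out. The helper
 * name is made up, error handling is minimal, and the exact prototypes of
 * kbase_mem_pool_grow() and kbase_alloc_phy_pages_helper_locked() should be
 * taken from the headers rather than from this sketch.
 */
static struct tagged_addr *kbase_example_alloc_locked(struct kbase_context *kctx,
						      struct kbase_mem_phy_alloc *alloc,
						      struct kbase_mem_pool *pool,
						      size_t nr_pages,
						      struct kbase_sub_alloc **prealloc_sa)
{
	struct tagged_addr *pages;

	kbase_gpu_vm_lock(kctx);
	kbase_mem_pool_lock(pool);

	/* Grow the pool with both locks dropped, then re-check its size. */
	while (kbase_mem_pool_size(pool) < nr_pages) {
		size_t shortfall = nr_pages - kbase_mem_pool_size(pool);

		kbase_mem_pool_unlock(pool);
		kbase_gpu_vm_unlock(kctx);

		if (kbase_mem_pool_grow(pool, shortfall))
			return NULL;

		kbase_gpu_vm_lock(kctx);
		kbase_mem_pool_lock(pool);
	}

	/* Pool is now large enough and still locked: no OoM killer involvement. */
	pages = kbase_alloc_phy_pages_helper_locked(alloc, pool, nr_pages, prealloc_sa);

	kbase_mem_pool_unlock(pool);
	/* ... other processing that requires the region lock ... */
	kbase_gpu_vm_unlock(kctx);

	return pages;
}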
- * - * Note : Caller must hold pool->pool_lock */ struct tagged_addr *kbase_alloc_phy_pages_helper_locked( struct kbase_mem_phy_alloc *alloc, struct kbase_mem_pool *pool, @@ -1546,7 +1703,7 @@ void kbase_free_phy_pages_helper_locked(struct kbase_mem_phy_alloc *alloc, struct kbase_mem_pool *pool, struct tagged_addr *pages, size_t nr_pages_to_free); -static inline void kbase_set_dma_addr(struct page *p, dma_addr_t dma_addr) +static inline void kbase_set_dma_addr_as_priv(struct page *p, dma_addr_t dma_addr) { SetPagePrivate(p); if (sizeof(dma_addr_t) > sizeof(p->private)) { @@ -1562,7 +1719,7 @@ static inline void kbase_set_dma_addr(struct page *p, dma_addr_t dma_addr) } } -static inline dma_addr_t kbase_dma_addr(struct page *p) +static inline dma_addr_t kbase_dma_addr_as_priv(struct page *p) { if (sizeof(dma_addr_t) > sizeof(p->private)) return ((dma_addr_t)page_private(p)) << PAGE_SHIFT; @@ -1570,11 +1727,34 @@ static inline dma_addr_t kbase_dma_addr(struct page *p) return (dma_addr_t)page_private(p); } -static inline void kbase_clear_dma_addr(struct page *p) +static inline void kbase_clear_dma_addr_as_priv(struct page *p) { ClearPagePrivate(p); } +static inline struct kbase_page_metadata *kbase_page_private(struct page *p) +{ + return (struct kbase_page_metadata *)page_private(p); +} + +static inline dma_addr_t kbase_dma_addr(struct page *p) +{ + if (kbase_page_migration_enabled) + return kbase_page_private(p)->dma_addr; + + return kbase_dma_addr_as_priv(p); +} + +static inline dma_addr_t kbase_dma_addr_from_tagged(struct tagged_addr tagged_pa) +{ + phys_addr_t pa = as_phys_addr_t(tagged_pa); + struct page *page = pfn_to_page(PFN_DOWN(pa)); + dma_addr_t dma_addr = + is_huge(tagged_pa) ? kbase_dma_addr_as_priv(page) : kbase_dma_addr(page); + + return dma_addr; +} + /** * kbase_flush_mmu_wqs() - Flush MMU workqueues. * @kbdev: Device pointer. @@ -1868,28 +2048,36 @@ bool kbase_has_exec_va_zone(struct kbase_context *kctx); /** * kbase_map_external_resource - Map an external resource to the GPU. * @kctx: kbase context. - * @reg: The region to map. + * @reg: External resource to map. * @locked_mm: The mm_struct which has been locked for this operation. * - * Return: The physical allocation which backs the region on success or NULL - * on failure. + * On successful mapping, the VA region and the gpu_alloc refcounts will be + * increased, making it safe to use and store both values directly. + * + * Return: Zero on success, or negative error code. */ -struct kbase_mem_phy_alloc *kbase_map_external_resource( - struct kbase_context *kctx, struct kbase_va_region *reg, - struct mm_struct *locked_mm); +int kbase_map_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg, + struct mm_struct *locked_mm); /** * kbase_unmap_external_resource - Unmap an external resource from the GPU. * @kctx: kbase context. - * @reg: The region to unmap or NULL if it has already been released. - * @alloc: The physical allocation being unmapped. + * @reg: VA region corresponding to external resource + * + * On successful unmapping, the VA region and the gpu_alloc refcounts will + * be decreased. If the refcount reaches zero, both @reg and the corresponding + * allocation may be freed, so using them after returning from this function + * requires the caller to explicitly check their state. 
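/*
 * Illustrative sketch (not part of the patch): the intended pairing of the
 * two calls documented above, with the new int-returning
 * kbase_map_external_resource() and region-based
 * kbase_unmap_external_resource(). The helper name is made up; it assumes
 * the region lock is held across both calls, as the lockdep assertion added
 * to kbase_unmap_external_resource() suggests.
 */
static int kbase_example_with_ext_res(struct kbase_context *kctx, u64 gpu_addr)
{
	struct kbase_va_region *reg;
	int err;

	kbase_gpu_vm_lock(kctx);

	reg = kbase_region_tracker_find_region_enclosing_address(kctx, gpu_addr);
	if (kbase_is_region_invalid_or_free(reg)) {
		err = -ENOENT;
		goto out_unlock;
	}

	/* On success this takes a reference on both the VA region and its gpu_alloc. */
	err = kbase_map_external_resource(kctx, reg, NULL);
	if (err)
		goto out_unlock;

	/* ... use the mapped resource ... */

	/* Drops both references; reg may be freed once this returns. */
	kbase_unmap_external_resource(kctx, reg);

out_unlock:
	kbase_gpu_vm_unlock(kctx);
	return err;
}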
*/ -void kbase_unmap_external_resource(struct kbase_context *kctx, - struct kbase_va_region *reg, struct kbase_mem_phy_alloc *alloc); +void kbase_unmap_external_resource(struct kbase_context *kctx, struct kbase_va_region *reg); /** * kbase_unpin_user_buf_page - Unpin a page of a user buffer. * @page: page to unpin + * + * The caller must have ensured that there are no CPU mappings for @page (as + * might be created from the struct kbase_mem_phy_alloc that tracks @page), and + * that userspace will not be able to recreate the CPU mappings again. */ void kbase_unpin_user_buf_page(struct page *page); diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.c b/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.c index c373cf82ea37..9899cef317ac 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -83,24 +84,18 @@ #define IR_THRESHOLD_STEPS (256u) #if MALI_USE_CSF -static int kbase_csf_cpu_mmap_user_reg_page(struct kbase_context *kctx, - struct vm_area_struct *vma); -static int kbase_csf_cpu_mmap_user_io_pages(struct kbase_context *kctx, - struct vm_area_struct *vma); +static int kbase_csf_cpu_mmap_user_reg_page(struct kbase_context *kctx, struct vm_area_struct *vma); +static int kbase_csf_cpu_mmap_user_io_pages(struct kbase_context *kctx, struct vm_area_struct *vma); #endif -static int kbase_vmap_phy_pages(struct kbase_context *kctx, - struct kbase_va_region *reg, u64 offset_bytes, size_t size, - struct kbase_vmap_struct *map); +static int kbase_vmap_phy_pages(struct kbase_context *kctx, struct kbase_va_region *reg, + u64 offset_bytes, size_t size, struct kbase_vmap_struct *map, + kbase_vmap_flag vmap_flags); static void kbase_vunmap_phy_pages(struct kbase_context *kctx, struct kbase_vmap_struct *map); static int kbase_tracking_page_setup(struct kbase_context *kctx, struct vm_area_struct *vma); -static int kbase_mem_shrink_gpu_mapping(struct kbase_context *kctx, - struct kbase_va_region *reg, - u64 new_pages, u64 old_pages); - static bool is_process_exiting(struct vm_area_struct *vma) { /* PF_EXITING flag can't be reliably used here for the detection @@ -198,20 +193,12 @@ static int kbase_phy_alloc_mapping_init(struct kbase_context *kctx, reg->cpu_alloc->type != KBASE_MEM_TYPE_NATIVE) return -EINVAL; - if (size > (KBASE_PERMANENTLY_MAPPED_MEM_LIMIT_PAGES - - atomic_read(&kctx->permanent_mapped_pages))) { - dev_warn(kctx->kbdev->dev, "Request for %llu more pages mem needing a permanent mapping would breach limit %lu, currently at %d pages", - (u64)size, - KBASE_PERMANENTLY_MAPPED_MEM_LIMIT_PAGES, - atomic_read(&kctx->permanent_mapped_pages)); - return -ENOMEM; - } - kern_mapping = kzalloc(sizeof(*kern_mapping), GFP_KERNEL); if (!kern_mapping) return -ENOMEM; - err = kbase_vmap_phy_pages(kctx, reg, 0u, size_bytes, kern_mapping); + err = kbase_vmap_phy_pages(kctx, reg, 0u, size_bytes, kern_mapping, + KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING); if (err < 0) goto vmap_fail; @@ -219,7 +206,6 @@ static int kbase_phy_alloc_mapping_init(struct kbase_context *kctx, reg->flags &= ~KBASE_REG_GROWABLE; reg->cpu_alloc->permanent_map = kern_mapping; - atomic_add(size, &kctx->permanent_mapped_pages); return 0; vmap_fail: @@ -235,13 +221,6 @@ void kbase_phy_alloc_mapping_term(struct kbase_context *kctx, kfree(alloc->permanent_map); alloc->permanent_map = NULL; - - /* Mappings are only done on cpu_alloc, so don't need to worry about - * this being reduced a second time if a separate 
gpu_alloc is - * freed - */ - WARN_ON(alloc->nents > atomic_read(&kctx->permanent_mapped_pages)); - atomic_sub(alloc->nents, &kctx->permanent_mapped_pages); } void *kbase_phy_alloc_mapping_get(struct kbase_context *kctx, @@ -499,7 +478,25 @@ struct kbase_va_region *kbase_mem_alloc(struct kbase_context *kctx, u64 va_pages *gpu_va = (u64) cookie; } else /* we control the VA */ { - if (kbase_gpu_mmap(kctx, reg, *gpu_va, va_pages, 1, + size_t align = 1; + +#ifdef CONFIG_MALI_2MB_ALLOC + /* If there's enough (> 33 bits) of GPU VA space, align to 2MB + * boundaries. The similar condition is used for mapping from + * the SAME_VA zone inside kbase_context_get_unmapped_area(). + */ + if (kctx->kbdev->gpu_props.mmu.va_bits > 33) { + if (va_pages >= (SZ_2M / SZ_4K)) + align = (SZ_2M / SZ_4K); + } + if (*gpu_va) + align = 1; +#if !MALI_USE_CSF + if (reg->flags & KBASE_REG_TILER_ALIGN_TOP) + align = 1; +#endif /* !MALI_USE_CSF */ +#endif /* CONFIG_MALI_2MB_ALLOC */ + if (kbase_gpu_mmap(kctx, reg, *gpu_va, va_pages, align, mmu_sync_info) != 0) { dev_warn(dev, "Failed to map memory on GPU"); kbase_gpu_vm_unlock(kctx); @@ -675,24 +672,36 @@ out_unlock: * @s: Shrinker * @sc: Shrinker control * - * Return: Number of pages which can be freed. + * Return: Number of pages which can be freed or SHRINK_EMPTY if no page remains. */ static unsigned long kbase_mem_evictable_reclaim_count_objects(struct shrinker *s, struct shrink_control *sc) { - struct kbase_context *kctx; - - kctx = container_of(s, struct kbase_context, reclaim); + struct kbase_context *kctx = container_of(s, struct kbase_context, reclaim); + int evict_nents = atomic_read(&kctx->evict_nents); + unsigned long nr_freeable_items; WARN((sc->gfp_mask & __GFP_ATOMIC), "Shrinkers cannot be called for GFP_ATOMIC allocations. Check kernel mm for problems. gfp_mask==%x\n", sc->gfp_mask); WARN(in_atomic(), - "Shrinker called whilst in atomic context. The caller must switch to using GFP_ATOMIC or similar. gfp_mask==%x\n", + "Shrinker called in atomic context. The caller must use GFP_ATOMIC or similar, then Shrinkers must not be called. gfp_mask==%x\n", sc->gfp_mask); - return atomic_read(&kctx->evict_nents); + if (unlikely(evict_nents < 0)) { + dev_err(kctx->kbdev->dev, "invalid evict_nents(%d)", evict_nents); + nr_freeable_items = 0; + } else { + nr_freeable_items = evict_nents; + } + +#if KERNEL_VERSION(4, 19, 0) <= LINUX_VERSION_CODE + if (nr_freeable_items == 0) + nr_freeable_items = SHRINK_EMPTY; +#endif + + return nr_freeable_items; } /** @@ -701,8 +710,8 @@ unsigned long kbase_mem_evictable_reclaim_count_objects(struct shrinker *s, * @s: Shrinker * @sc: Shrinker control * - * Return: Number of pages freed (can be less then requested) or -1 if the - * shrinker failed to free pages in its pool. + * Return: Number of pages freed (can be less then requested) or + * SHRINK_STOP if reclaim isn't possible. * * Note: * This function accesses region structures without taking the region lock, @@ -730,17 +739,15 @@ unsigned long kbase_mem_evictable_reclaim_scan_objects(struct shrinker *s, list_for_each_entry_safe(alloc, tmp, &kctx->evict_list, evict_node) { int err; + if (!alloc->reg) + continue; + err = kbase_mem_shrink_gpu_mapping(kctx, alloc->reg, 0, alloc->nents); - if (err != 0) { - /* - * Failed to remove GPU mapping, tell the shrinker - * to stop trying to shrink our slab even though we - * have pages in it. - */ - freed = -1; - goto out_unlock; - } + + /* Failed to remove GPU mapping, proceed to next one. 
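/*
 * Illustrative sketch (not part of the patch): the count_objects convention
 * adopted by the shrinker hunk above - report the freeable count, or
 * SHRINK_EMPTY on kernels >= 4.19 when there is nothing to reclaim. Names
 * prefixed "example_" are made up; assumes <linux/shrinker.h> and
 * <linux/version.h>.
 */
static atomic_t example_freeable_count = ATOMIC_INIT(0);

static unsigned long example_reclaim_count_objects(struct shrinker *s,
						   struct shrink_control *sc)
{
	int nr = atomic_read(&example_freeable_count);

	if (nr <= 0) {
#if KERNEL_VERSION(4, 19, 0) <= LINUX_VERSION_CODE
		/* Tell the shrinker core this shrinker is empty rather than returning 0. */
		return SHRINK_EMPTY;
#else
		return 0;
#endif
	}

	return (unsigned long)nr;
}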
*/ + if (err != 0) + continue; /* * Update alloc->evicted before freeing the backing so the @@ -764,7 +771,7 @@ unsigned long kbase_mem_evictable_reclaim_scan_objects(struct shrinker *s, if (freed > sc->nr_to_scan) break; } -out_unlock: + mutex_unlock(&kctx->jit_evict_lock); return freed; @@ -964,6 +971,15 @@ int kbase_mem_flags_change(struct kbase_context *kctx, u64 gpu_addr, unsigned in if (kbase_is_region_invalid_or_free(reg)) goto out_unlock; + /* There is no use case to support MEM_FLAGS_CHANGE ioctl for allocations + * that have NO_USER_FREE flag set, to mark them as evictable/reclaimable. + * This would usually include JIT allocations, Tiler heap related allocations + * & GPU queue ringbuffer and none of them needs to be explicitly marked + * as evictable by Userspace. + */ + if (reg->flags & KBASE_REG_NO_USER_FREE) + goto out_unlock; + /* Is the region being transitioning between not needed and needed? */ prev_needed = (KBASE_REG_DONT_NEED & reg->flags) == KBASE_REG_DONT_NEED; new_needed = (BASE_MEM_DONT_NEED & flags) == BASE_MEM_DONT_NEED; @@ -1536,13 +1552,15 @@ static struct kbase_va_region *kbase_mem_from_user_buffer( struct kbase_context *kctx, unsigned long address, unsigned long size, u64 *va_pages, u64 *flags) { - long i; + long i, dma_mapped_pages; struct kbase_va_region *reg; struct rb_root *rbtree; long faulted_pages; int zone = KBASE_REG_ZONE_CUSTOM_VA; bool shared_zone = false; u32 cache_line_alignment = kbase_get_cache_line_alignment(kctx->kbdev); + unsigned long offset_within_page; + unsigned long remaining_size; struct kbase_alloc_import_user_buf *user_buf; struct page **pages = NULL; int write; @@ -1688,29 +1706,27 @@ static struct kbase_va_region *kbase_mem_from_user_buffer( if (pages) { struct device *dev = kctx->kbdev->dev; - unsigned long local_size = user_buf->size; - unsigned long offset = user_buf->address & ~PAGE_MASK; struct tagged_addr *pa = kbase_get_gpu_phy_pages(reg); /* Top bit signifies that this was pinned on import */ user_buf->current_mapping_usage_count |= PINNED_ON_IMPORT; + offset_within_page = user_buf->address & ~PAGE_MASK; + remaining_size = user_buf->size; for (i = 0; i < faulted_pages; i++) { - dma_addr_t dma_addr; - unsigned long min; + unsigned long map_size = + MIN(PAGE_SIZE - offset_within_page, remaining_size); + dma_addr_t dma_addr = dma_map_page(dev, pages[i], + offset_within_page, map_size, DMA_BIDIRECTIONAL); - min = MIN(PAGE_SIZE - offset, local_size); - dma_addr = dma_map_page(dev, pages[i], - offset, min, - DMA_BIDIRECTIONAL); if (dma_mapping_error(dev, dma_addr)) goto unwind_dma_map; user_buf->dma_addrs[i] = dma_addr; pa[i] = as_tagged(page_to_phys(pages[i])); - local_size -= min; - offset = 0; + remaining_size -= map_size; + offset_within_page = 0; } reg->gpu_alloc->nents = faulted_pages; @@ -1719,13 +1735,26 @@ static struct kbase_va_region *kbase_mem_from_user_buffer( return reg; unwind_dma_map: - while (i--) { + offset_within_page = user_buf->address & ~PAGE_MASK; + remaining_size = user_buf->size; + dma_mapped_pages = i; + /* Run the unmap loop in the same order as map loop */ + for (i = 0; i < dma_mapped_pages; i++) { + unsigned long unmap_size = + MIN(PAGE_SIZE - offset_within_page, remaining_size); + dma_unmap_page(kctx->kbdev->dev, user_buf->dma_addrs[i], - PAGE_SIZE, DMA_BIDIRECTIONAL); + unmap_size, DMA_BIDIRECTIONAL); + remaining_size -= unmap_size; + offset_within_page = 0; } fault_mismatch: if (pages) { + /* In this case, the region was not yet in the region tracker, + * and so there are no CPU mappings to remove 
before we unpin + * the page + */ for (i = 0; i < faulted_pages; i++) kbase_unpin_user_buf_page(pages[i]); } @@ -1750,6 +1779,7 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, u64 gpu_va; size_t i; bool coherent; + uint64_t max_stride; /* Calls to this function are inherently asynchronous, with respect to * MMU operations. @@ -1782,7 +1812,9 @@ u64 kbase_mem_alias(struct kbase_context *kctx, u64 *flags, u64 stride, if (!nents) goto bad_nents; - if (stride > U64_MAX / nents) + max_stride = div64_u64(U64_MAX, nents); + + if (stride > max_stride) goto bad_size; if ((nents * stride) > (U64_MAX / PAGE_SIZE)) @@ -2156,22 +2188,9 @@ void kbase_mem_shrink_cpu_mapping(struct kbase_context *kctx, (old_pages - new_pages)<gpu_alloc; @@ -2242,10 +2261,13 @@ int kbase_mem_commit(struct kbase_context *kctx, u64 gpu_addr, u64 new_pages) if (atomic_read(®->cpu_alloc->kernel_mappings) > 0) goto out_unlock; - /* can't grow regions which are ephemeral */ + if (reg->flags & KBASE_REG_DONT_NEED) goto out_unlock; + if (reg->flags & KBASE_REG_NO_USER_FREE) + goto out_unlock; + #ifdef CONFIG_MALI_MEMORY_FULLY_BACKED /* Reject resizing commit size */ if (reg->flags & KBASE_REG_PF_GROW) @@ -2628,7 +2650,6 @@ static void kbase_free_unused_jit_allocations(struct kbase_context *kctx) while (kbase_jit_evict(kctx)) ; } -#endif static int kbase_mmu_dump_mmap(struct kbase_context *kctx, struct vm_area_struct *vma, @@ -2645,9 +2666,7 @@ static int kbase_mmu_dump_mmap(struct kbase_context *kctx, size = (vma->vm_end - vma->vm_start); nr_pages = size >> PAGE_SHIFT; -#ifdef CONFIG_MALI_VECTOR_DUMP kbase_free_unused_jit_allocations(kctx); -#endif kaddr = kbase_mmu_dump(kctx, nr_pages); @@ -2695,7 +2714,7 @@ out_va_region: out: return err; } - +#endif void kbase_os_mem_map_lock(struct kbase_context *kctx) { @@ -2836,6 +2855,7 @@ int kbase_context_mmap(struct kbase_context *const kctx, err = -EINVAL; goto out_unlock; case PFN_DOWN(BASE_MEM_MMU_DUMP_HANDLE): +#if defined(CONFIG_MALI_VECTOR_DUMP) /* MMU dump */ err = kbase_mmu_dump_mmap(kctx, vma, ®, &kaddr); if (err != 0) @@ -2843,6 +2863,11 @@ int kbase_context_mmap(struct kbase_context *const kctx, /* free the region on munmap */ free_on_close = 1; break; +#else + /* Illegal handle for direct map */ + err = -EINVAL; + goto out_unlock; +#endif /* defined(CONFIG_MALI_VECTOR_DUMP) */ #if MALI_USE_CSF case PFN_DOWN(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE): kbase_gpu_vm_unlock(kctx); @@ -2930,7 +2955,7 @@ int kbase_context_mmap(struct kbase_context *const kctx, err = kbase_cpu_mmap(kctx, reg, vma, kaddr, nr_pages, aligned_offset, free_on_close); - +#if defined(CONFIG_MALI_VECTOR_DUMP) if (vma->vm_pgoff == PFN_DOWN(BASE_MEM_MMU_DUMP_HANDLE)) { /* MMU dump - userspace should now have a reference on * the pages, so we can now free the kernel mapping @@ -2949,7 +2974,7 @@ int kbase_context_mmap(struct kbase_context *const kctx, */ vma->vm_pgoff = PFN_DOWN(vma->vm_start); } - +#endif /* defined(CONFIG_MALI_VECTOR_DUMP) */ out_unlock: kbase_gpu_vm_unlock(kctx); out: @@ -2991,9 +3016,9 @@ void kbase_sync_mem_regions(struct kbase_context *kctx, } } -static int kbase_vmap_phy_pages(struct kbase_context *kctx, - struct kbase_va_region *reg, u64 offset_bytes, size_t size, - struct kbase_vmap_struct *map) +static int kbase_vmap_phy_pages(struct kbase_context *kctx, struct kbase_va_region *reg, + u64 offset_bytes, size_t size, struct kbase_vmap_struct *map, + kbase_vmap_flag vmap_flags) { unsigned long page_index; unsigned int offset_in_page = offset_bytes & ~PAGE_MASK; @@ 
-3004,6 +3029,12 @@ static int kbase_vmap_phy_pages(struct kbase_context *kctx, pgprot_t prot; size_t i; + if (WARN_ON(vmap_flags & ~KBASE_VMAP_INPUT_FLAGS)) + return -EINVAL; + + if (WARN_ON(kbase_is_region_invalid_or_free(reg))) + return -EINVAL; + if (!size || !map || !reg->cpu_alloc || !reg->gpu_alloc) return -EINVAL; @@ -3020,6 +3051,17 @@ static int kbase_vmap_phy_pages(struct kbase_context *kctx, if (page_index + page_count > kbase_reg_current_backed_size(reg)) return -ENOMEM; + if ((vmap_flags & KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING) && + (page_count > (KBASE_PERMANENTLY_MAPPED_MEM_LIMIT_PAGES - + atomic_read(&kctx->permanent_mapped_pages)))) { + dev_warn( + kctx->kbdev->dev, + "Request for %llu more pages mem needing a permanent mapping would breach limit %lu, currently at %d pages", + (u64)page_count, KBASE_PERMANENTLY_MAPPED_MEM_LIMIT_PAGES, + atomic_read(&kctx->permanent_mapped_pages)); + return -ENOMEM; + } + if (reg->flags & KBASE_REG_DONT_NEED) return -EINVAL; @@ -3058,59 +3100,73 @@ static int kbase_vmap_phy_pages(struct kbase_context *kctx, map->gpu_pages = &kbase_get_gpu_phy_pages(reg)[page_index]; map->addr = (void *)((uintptr_t)cpu_addr + offset_in_page); map->size = size; - map->sync_needed = ((reg->flags & KBASE_REG_CPU_CACHED) != 0) && - !kbase_mem_is_imported(map->gpu_alloc->type); + map->flags = vmap_flags; + if ((reg->flags & KBASE_REG_CPU_CACHED) && !kbase_mem_is_imported(map->gpu_alloc->type)) + map->flags |= KBASE_VMAP_FLAG_SYNC_NEEDED; - if (map->sync_needed) + if (map->flags & KBASE_VMAP_FLAG_SYNC_NEEDED) kbase_sync_mem_regions(kctx, map, KBASE_SYNC_TO_CPU); + if (vmap_flags & KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING) + atomic_add(page_count, &kctx->permanent_mapped_pages); + kbase_mem_phy_alloc_kernel_mapped(reg->cpu_alloc); return 0; } +void *kbase_vmap_reg(struct kbase_context *kctx, struct kbase_va_region *reg, u64 gpu_addr, + size_t size, unsigned long prot_request, struct kbase_vmap_struct *map, + kbase_vmap_flag vmap_flags) +{ + u64 offset_bytes; + struct kbase_mem_phy_alloc *cpu_alloc; + struct kbase_mem_phy_alloc *gpu_alloc; + int err; + + lockdep_assert_held(&kctx->reg_lock); + + if (WARN_ON(kbase_is_region_invalid_or_free(reg))) + return NULL; + + /* check access permissions can be satisfied + * Intended only for checking KBASE_REG_{CPU,GPU}_{RD,WR} + */ + if ((reg->flags & prot_request) != prot_request) + return NULL; + + offset_bytes = gpu_addr - (reg->start_pfn << PAGE_SHIFT); + cpu_alloc = kbase_mem_phy_alloc_get(reg->cpu_alloc); + gpu_alloc = kbase_mem_phy_alloc_get(reg->gpu_alloc); + + err = kbase_vmap_phy_pages(kctx, reg, offset_bytes, size, map, vmap_flags); + if (err < 0) + goto fail_vmap_phy_pages; + + return map->addr; + +fail_vmap_phy_pages: + kbase_mem_phy_alloc_put(cpu_alloc); + kbase_mem_phy_alloc_put(gpu_alloc); + return NULL; +} + void *kbase_vmap_prot(struct kbase_context *kctx, u64 gpu_addr, size_t size, unsigned long prot_request, struct kbase_vmap_struct *map) { struct kbase_va_region *reg; void *addr = NULL; - u64 offset_bytes; - struct kbase_mem_phy_alloc *cpu_alloc; - struct kbase_mem_phy_alloc *gpu_alloc; - int err; kbase_gpu_vm_lock(kctx); - reg = kbase_region_tracker_find_region_enclosing_address(kctx, - gpu_addr); + reg = kbase_region_tracker_find_region_enclosing_address(kctx, gpu_addr); if (kbase_is_region_invalid_or_free(reg)) goto out_unlock; - /* check access permissions can be satisfied - * Intended only for checking KBASE_REG_{CPU,GPU}_{RD,WR} - */ - if ((reg->flags & prot_request) != prot_request) - goto 
out_unlock; - - offset_bytes = gpu_addr - (reg->start_pfn << PAGE_SHIFT); - cpu_alloc = kbase_mem_phy_alloc_get(reg->cpu_alloc); - gpu_alloc = kbase_mem_phy_alloc_get(reg->gpu_alloc); - - err = kbase_vmap_phy_pages(kctx, reg, offset_bytes, size, map); - if (err < 0) - goto fail_vmap_phy_pages; - - addr = map->addr; + addr = kbase_vmap_reg(kctx, reg, gpu_addr, size, prot_request, map, 0u); out_unlock: kbase_gpu_vm_unlock(kctx); return addr; - -fail_vmap_phy_pages: - kbase_gpu_vm_unlock(kctx); - kbase_mem_phy_alloc_put(cpu_alloc); - kbase_mem_phy_alloc_put(gpu_alloc); - - return NULL; } void *kbase_vmap(struct kbase_context *kctx, u64 gpu_addr, size_t size, @@ -3133,16 +3189,23 @@ static void kbase_vunmap_phy_pages(struct kbase_context *kctx, vunmap(addr); - if (map->sync_needed) + if (map->flags & KBASE_VMAP_FLAG_SYNC_NEEDED) kbase_sync_mem_regions(kctx, map, KBASE_SYNC_TO_DEVICE); + if (map->flags & KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING) { + size_t page_count = PFN_UP(map->offset_in_page + map->size); + + WARN_ON(page_count > atomic_read(&kctx->permanent_mapped_pages)); + atomic_sub(page_count, &kctx->permanent_mapped_pages); + } kbase_mem_phy_alloc_kernel_unmapped(map->cpu_alloc); + map->offset_in_page = 0; map->cpu_pages = NULL; map->gpu_pages = NULL; map->addr = NULL; map->size = 0; - map->sync_needed = false; + map->flags = 0; } void kbase_vunmap(struct kbase_context *kctx, struct kbase_vmap_struct *map) @@ -3266,9 +3329,27 @@ static unsigned long get_queue_doorbell_pfn(struct kbase_device *kbdev, (u64)queue->doorbell_nr * CSF_HW_DOORBELL_PAGE_SIZE)); } +static int +#if (KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE || \ + KERNEL_VERSION(5, 11, 0) > LINUX_VERSION_CODE) +kbase_csf_user_io_pages_vm_mremap(struct vm_area_struct *vma) +#else +kbase_csf_user_io_pages_vm_mremap(struct vm_area_struct *vma, unsigned long flags) +#endif +{ + pr_debug("Unexpected call to mremap method for User IO pages mapping vma\n"); + return -EINVAL; +} + +static int kbase_csf_user_io_pages_vm_split(struct vm_area_struct *vma, unsigned long addr) +{ + pr_debug("Unexpected call to split method for User IO pages mapping vma\n"); + return -EINVAL; +} + static void kbase_csf_user_io_pages_vm_open(struct vm_area_struct *vma) { - WARN(1, "Unexpected attempt to clone private vma\n"); + pr_debug("Unexpected call to the open method for User IO pages mapping vma\n"); vma->vm_private_data = NULL; } @@ -3280,8 +3361,10 @@ static void kbase_csf_user_io_pages_vm_close(struct vm_area_struct *vma) int err; bool reset_prevented = false; - if (WARN_ON(!queue)) + if (!queue) { + pr_debug("Close method called for the new User IO pages mapping vma\n"); return; + } kctx = queue->kctx; kbdev = kctx->kbdev; @@ -3325,9 +3408,12 @@ static vm_fault_t kbase_csf_user_io_pages_vm_fault(struct vm_fault *vmf) struct memory_group_manager_device *mgm_dev; /* Few sanity checks up front */ - if ((nr_pages != BASEP_QUEUE_NR_MMAP_USER_PAGES) || - (vma->vm_pgoff != queue->db_file_offset)) + if (!queue || (nr_pages != BASEP_QUEUE_NR_MMAP_USER_PAGES) || + (vma->vm_pgoff != queue->db_file_offset)) { + pr_warn("Unexpected CPU page fault on User IO pages mapping for process %s tgid %d pid %d\n", + current->comm, current->tgid, current->pid); return VM_FAULT_SIGBUS; + } kbdev = queue->kctx->kbdev; mgm_dev = kbdev->mgm_dev; @@ -3382,6 +3468,12 @@ exit: static const struct vm_operations_struct kbase_csf_user_io_pages_vm_ops = { .open = kbase_csf_user_io_pages_vm_open, .close = kbase_csf_user_io_pages_vm_close, +#if KERNEL_VERSION(5, 11, 0) <= 
LINUX_VERSION_CODE + .may_split = kbase_csf_user_io_pages_vm_split, +#else + .split = kbase_csf_user_io_pages_vm_split, +#endif + .mremap = kbase_csf_user_io_pages_vm_mremap, .fault = kbase_csf_user_io_pages_vm_fault }; @@ -3461,13 +3553,71 @@ map_failed: return err; } +/** + * kbase_csf_user_reg_vm_open - VMA open function for the USER page + * + * @vma: Pointer to the struct containing information about + * the userspace mapping of USER page. + * Note: + * This function isn't expected to be called. If called (i.e> mremap), + * set private_data as NULL to indicate to close() and fault() functions. + */ +static void kbase_csf_user_reg_vm_open(struct vm_area_struct *vma) +{ + pr_debug("Unexpected call to the open method for USER register mapping"); + vma->vm_private_data = NULL; +} + +/** + * kbase_csf_user_reg_vm_close - VMA close function for the USER page + * + * @vma: Pointer to the struct containing information about + * the userspace mapping of USER page. + */ static void kbase_csf_user_reg_vm_close(struct vm_area_struct *vma) { struct kbase_context *kctx = vma->vm_private_data; - WARN_ON(!kctx->csf.user_reg_vma); + if (!kctx) { + pr_debug("Close function called for the unexpected mapping"); + return; + } + + if (unlikely(!kctx->csf.user_reg_vma)) + dev_warn(kctx->kbdev->dev, "user_reg_vma pointer unexpectedly NULL"); kctx->csf.user_reg_vma = NULL; + + mutex_lock(&kctx->kbdev->csf.reg_lock); + if (unlikely(kctx->kbdev->csf.nr_user_page_mapped == 0)) + dev_warn(kctx->kbdev->dev, "Unexpected value for the USER page mapping counter"); + else + kctx->kbdev->csf.nr_user_page_mapped--; + mutex_unlock(&kctx->kbdev->csf.reg_lock); +} + +/** + * kbase_csf_user_reg_vm_mremap - VMA mremap function for the USER page + * + * @vma: Pointer to the struct containing information about + * the userspace mapping of USER page. + * + * Return: -EINVAL + * + * Note: + * User space must not attempt mremap on USER page mapping. + * This function will return an error to fail the attempt. 
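/*
 * Illustrative sketch (not part of the patch): the overall shape of a
 * vm_operations_struct that rejects splitting and mremap(), consolidating
 * the kernel-version guards used by the User IO pages and USER page
 * mappings above (.may_split replaced .split in 5.11, and the mremap
 * prototype differs only in the 5.11..5.12 window). "example_" names are
 * made up.
 */
static int example_vm_split(struct vm_area_struct *vma, unsigned long addr)
{
	return -EINVAL;
}

static int
#if ((KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) || \
     (KERNEL_VERSION(5, 11, 0) > LINUX_VERSION_CODE))
example_vm_mremap(struct vm_area_struct *vma)
#else
example_vm_mremap(struct vm_area_struct *vma, unsigned long flags)
#endif
{
	return -EINVAL;
}

static const struct vm_operations_struct example_pinned_vm_ops = {
#if KERNEL_VERSION(5, 11, 0) <= LINUX_VERSION_CODE
	.may_split = example_vm_split,
#else
	.split = example_vm_split,
#endif
	.mremap = example_vm_mremap,
};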
+ */ +static int +#if ((KERNEL_VERSION(5, 13, 0) <= LINUX_VERSION_CODE) || \ + (KERNEL_VERSION(5, 11, 0) > LINUX_VERSION_CODE)) +kbase_csf_user_reg_vm_mremap(struct vm_area_struct *vma) +#else +kbase_csf_user_reg_vm_mremap(struct vm_area_struct *vma, unsigned long flags) +#endif +{ + pr_debug("Unexpected call to mremap method for USER page mapping vma\n"); + return -EINVAL; } #if (KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE) @@ -3480,19 +3630,24 @@ static vm_fault_t kbase_csf_user_reg_vm_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; #endif struct kbase_context *kctx = vma->vm_private_data; - struct kbase_device *kbdev = kctx->kbdev; - struct memory_group_manager_device *mgm_dev = kbdev->mgm_dev; - unsigned long pfn = PFN_DOWN(kbdev->reg_start + USER_BASE); + struct kbase_device *kbdev; + struct memory_group_manager_device *mgm_dev; + unsigned long pfn; size_t nr_pages = PFN_DOWN(vma->vm_end - vma->vm_start); vm_fault_t ret = VM_FAULT_SIGBUS; unsigned long flags; /* Few sanity checks up front */ - if (WARN_ON(nr_pages != 1) || - WARN_ON(vma != kctx->csf.user_reg_vma) || - WARN_ON(vma->vm_pgoff != - PFN_DOWN(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE))) + if (!kctx || (nr_pages != 1) || (vma != kctx->csf.user_reg_vma) || + (vma->vm_pgoff != PFN_DOWN(BASEP_MEM_CSF_USER_REG_PAGE_HANDLE))) { + pr_warn("Unexpected CPU page fault on USER page mapping for process %s tgid %d pid %d\n", + current->comm, current->tgid, current->pid); return VM_FAULT_SIGBUS; + } + + kbdev = kctx->kbdev; + mgm_dev = kbdev->mgm_dev; + pfn = PFN_DOWN(kbdev->reg_start + USER_BASE); mutex_lock(&kbdev->csf.reg_lock); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); @@ -3517,14 +3672,31 @@ static vm_fault_t kbase_csf_user_reg_vm_fault(struct vm_fault *vmf) } static const struct vm_operations_struct kbase_csf_user_reg_vm_ops = { + .open = kbase_csf_user_reg_vm_open, .close = kbase_csf_user_reg_vm_close, + .mremap = kbase_csf_user_reg_vm_mremap, .fault = kbase_csf_user_reg_vm_fault }; +/** + * kbase_csf_cpu_mmap_user_reg_page - Memory map method for USER page. + * + * @kctx: Pointer of the kernel context. + * @vma: Pointer to the struct containing the information about + * the userspace mapping of USER page. + * + * Return: 0 on success, error code otherwise. + * + * Note: + * New Base will request Kbase to read the LATEST_FLUSH of USER page on its behalf. + * But this function needs to be kept for backward-compatibility as old Base (<=1.12) + * will try to mmap USER page for direct access when it creates a base context. 
+ */ static int kbase_csf_cpu_mmap_user_reg_page(struct kbase_context *kctx, struct vm_area_struct *vma) { size_t nr_pages = PFN_DOWN(vma->vm_end - vma->vm_start); + struct kbase_device *kbdev = kctx->kbdev; /* Few sanity checks */ if (kctx->csf.user_reg_vma) @@ -3548,6 +3720,17 @@ static int kbase_csf_cpu_mmap_user_reg_page(struct kbase_context *kctx, kctx->csf.user_reg_vma = vma; + mutex_lock(&kbdev->csf.reg_lock); + kbdev->csf.nr_user_page_mapped++; + + if (!kbdev->csf.mali_file_inode) + kbdev->csf.mali_file_inode = kctx->filp->f_inode; + + if (unlikely(kbdev->csf.mali_file_inode != kctx->filp->f_inode)) + dev_warn(kbdev->dev, "Device file inode pointer not same for all contexts"); + + mutex_unlock(&kbdev->csf.reg_lock); + vma->vm_ops = &kbase_csf_user_reg_vm_ops; vma->vm_private_data = kctx; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.h b/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.h index 5e5d991105a6..5b12e181bf4c 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem_linux.h @@ -217,6 +217,26 @@ int kbase_mem_evictable_make(struct kbase_mem_phy_alloc *gpu_alloc); */ bool kbase_mem_evictable_unmake(struct kbase_mem_phy_alloc *alloc); +typedef unsigned int kbase_vmap_flag; + +/* Sync operations are needed on beginning and ending of access to kernel-mapped GPU memory. + * + * This is internal to the struct kbase_vmap_struct and should not be passed in by callers of + * kbase_vmap-related functions. + */ +#define KBASE_VMAP_FLAG_SYNC_NEEDED (((kbase_vmap_flag)1) << 0) + +/* Permanently mapped memory accounting (including enforcing limits) should be done on the + * kernel-mapped GPU memory. + * + * This should be used if the kernel mapping is going to live for a potentially long time, for + * example if it will persist after the caller has returned. + */ +#define KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING (((kbase_vmap_flag)1) << 1) + +/* Set of flags that can be passed into kbase_vmap-related functions */ +#define KBASE_VMAP_INPUT_FLAGS (KBASE_VMAP_FLAG_PERMANENT_MAP_ACCOUNTING) + struct kbase_vmap_struct { off_t offset_in_page; struct kbase_mem_phy_alloc *cpu_alloc; @@ -225,9 +245,55 @@ struct kbase_vmap_struct { struct tagged_addr *gpu_pages; void *addr; size_t size; - bool sync_needed; + kbase_vmap_flag flags; }; +/** + * kbase_mem_shrink_gpu_mapping - Shrink the GPU mapping of an allocation + * @kctx: Context the region belongs to + * @reg: The GPU region or NULL if there isn't one + * @new_pages: The number of pages after the shrink + * @old_pages: The number of pages before the shrink + * + * Return: 0 on success, negative -errno on error + * + * Unmap the shrunk pages from the GPU mapping. Note that the size of the region + * itself is unmodified as we still need to reserve the VA, only the page tables + * will be modified by this function. 
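/*
 * Illustrative sketch (not part of the patch): a short-lived kernel mapping
 * made through kbase_vmap_prot() and torn down with kbase_vunmap(), using
 * the kbase_vmap_struct declared in this header. The helper name is made up
 * and error handling is minimal.
 */
static int kbase_example_read_u32(struct kbase_context *kctx, u64 gpu_addr, u32 *out)
{
	struct kbase_vmap_struct map;
	u32 *cpu_ptr;

	/* Refuse the mapping unless the region is CPU-readable. */
	cpu_ptr = kbase_vmap_prot(kctx, gpu_addr, sizeof(*out), KBASE_REG_CPU_RD, &map);
	if (!cpu_ptr)
		return -EINVAL;

	*out = *cpu_ptr;

	kbase_vunmap(kctx, &map);
	return 0;
}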
+ */ +int kbase_mem_shrink_gpu_mapping(struct kbase_context *kctx, struct kbase_va_region *reg, + u64 new_pages, u64 old_pages); + +/** + * kbase_vmap_reg - Map part of an existing region into the kernel safely, only if the requested + * access permissions are supported + * @kctx: Context @reg belongs to + * @reg: The GPU region to map part of + * @gpu_addr: Start address of VA range to map, which must be within @reg + * @size: Size of VA range, which when added to @gpu_addr must be within @reg + * @prot_request: Flags indicating how the caller will then access the memory + * @map: Structure to be given to kbase_vunmap() on freeing + * @vmap_flags: Flags of type kbase_vmap_flag + * + * Return: Kernel-accessible CPU pointer to the VA range, or NULL on error + * + * Variant of kbase_vmap_prot() that can be used given an existing region. + * + * The caller must satisfy one of the following for @reg: + * * It must have been obtained by finding it on the region tracker, and the region lock must not + * have been released in the mean time. + * * Or, it must have been refcounted with a call to kbase_va_region_alloc_get(), and the region + * lock is now held again. + * * Or, @reg has had KBASE_REG_NO_USER_FREE set at creation time or under the region lock, and the + * region lock is now held again. + * + * The acceptable @vmap_flags are those in %KBASE_VMAP_INPUT_FLAGS. + * + * Refer to kbase_vmap_prot() for more information on the operation of this function. + */ +void *kbase_vmap_reg(struct kbase_context *kctx, struct kbase_va_region *reg, u64 gpu_addr, + size_t size, unsigned long prot_request, struct kbase_vmap_struct *map, + kbase_vmap_flag vmap_flags); /** * kbase_vmap_prot - Map a GPU VA range into the kernel safely, only if the diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_migrate.c b/drivers/gpu/arm/bifrost/mali_kbase_mem_migrate.c new file mode 100644 index 000000000000..dfa70252bcf1 --- /dev/null +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem_migrate.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +/** + * DOC: Base kernel page migration implementation. + */ + +#include + +#include +#include + +/* Global integer used to determine if module parameter value has been + * provided and if page migration feature is enabled. + * Feature is disabled on all platforms by default. 
+ */ +int kbase_page_migration_enabled; +module_param(kbase_page_migration_enabled, int, 0444); +KBASE_EXPORT_TEST_API(kbase_page_migration_enabled); + +bool kbase_alloc_page_metadata(struct kbase_device *kbdev, struct page *p, dma_addr_t dma_addr) +{ + struct kbase_page_metadata *page_md = + kzalloc(sizeof(struct kbase_page_metadata), GFP_KERNEL); + + if (!page_md) + return false; + + SetPagePrivate(p); + set_page_private(p, (unsigned long)page_md); + page_md->dma_addr = dma_addr; + page_md->status = PAGE_STATUS_SET(page_md->status, (u8)ALLOCATE_IN_PROGRESS); + spin_lock_init(&page_md->migrate_lock); + + lock_page(p); + if (kbdev->mem_migrate.mapping) + __SetPageMovable(p, kbdev->mem_migrate.mapping); + unlock_page(p); + + return true; +} + +static void kbase_free_page_metadata(struct kbase_device *kbdev, struct page *p) +{ + struct device *const dev = kbdev->dev; + struct kbase_page_metadata *page_md; + dma_addr_t dma_addr; + + page_md = kbase_page_private(p); + if (!page_md) + return; + + dma_addr = kbase_dma_addr(p); + dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + + kfree(page_md); + ClearPagePrivate(p); +} + +static void kbase_free_pages_worker(struct work_struct *work) +{ + struct kbase_mem_migrate *mem_migrate = + container_of(work, struct kbase_mem_migrate, free_pages_work); + struct kbase_device *kbdev = container_of(mem_migrate, struct kbase_device, mem_migrate); + struct page *p, *tmp; + LIST_HEAD(free_list); + + spin_lock(&mem_migrate->free_pages_lock); + list_splice_init(&mem_migrate->free_pages_list, &free_list); + spin_unlock(&mem_migrate->free_pages_lock); + + list_for_each_entry_safe(p, tmp, &free_list, lru) { + list_del_init(&p->lru); + + lock_page(p); + if (PageMovable(p)) + __ClearPageMovable(p); + unlock_page(p); + + kbase_free_page_metadata(kbdev, p); + __free_pages(p, 0); + } +} + +void kbase_free_page_later(struct kbase_device *kbdev, struct page *p) +{ + struct kbase_mem_migrate *mem_migrate = &kbdev->mem_migrate; + + spin_lock(&mem_migrate->free_pages_lock); + list_add(&p->lru, &mem_migrate->free_pages_list); + spin_unlock(&mem_migrate->free_pages_lock); +} + +/** + * kbase_page_isolate - Isolate a page for migration. + * + * @p: Pointer of the page struct of page to isolate. + * @mode: LRU Isolation modes. + * + * Callback function for Linux to isolate a page and prepare it for migration. + * + * Return: true on success, false otherwise. + */ +static bool kbase_page_isolate(struct page *p, isolate_mode_t mode) +{ + bool status_mem_pool = false; + struct kbase_mem_pool *mem_pool = NULL; + struct kbase_page_metadata *page_md = kbase_page_private(p); + + CSTD_UNUSED(mode); + + if (!spin_trylock(&page_md->migrate_lock)) + return false; + + if (WARN_ON(IS_PAGE_ISOLATED(page_md->status))) { + spin_unlock(&page_md->migrate_lock); + return false; + } + + switch (PAGE_STATUS_GET(page_md->status)) { + case MEM_POOL: + /* Prepare to remove page from memory pool later only if pool is not + * in the process of termination. + */ + mem_pool = page_md->data.mem_pool.pool; + status_mem_pool = true; + preempt_disable(); + atomic_inc(&mem_pool->isolation_in_progress_cnt); + break; + case ALLOCATED_MAPPED: + case PT_MAPPED: + /* Only pages in a memory pool can be isolated for now. */ + break; + case SPILL_IN_PROGRESS: + case ALLOCATE_IN_PROGRESS: + case FREE_IN_PROGRESS: + /* Transitory state: do nothing. */ + break; + default: + /* State should always fall in one of the previous cases! 
+ * Also notice that FREE_ISOLATED_IN_PROGRESS is impossible because + * that state only applies to pages that are already isolated. + */ + page_md->status = PAGE_ISOLATE_SET(page_md->status, 0); + break; + } + + spin_unlock(&page_md->migrate_lock); + + /* If the page is still in the memory pool: try to remove it. This will fail + * if pool lock is taken which could mean page no longer exists in pool. + */ + if (status_mem_pool) { + if (!spin_trylock(&mem_pool->pool_lock)) { + atomic_dec(&mem_pool->isolation_in_progress_cnt); + preempt_enable(); + return false; + } + + spin_lock(&page_md->migrate_lock); + /* Check status again to ensure page has not been removed from memory pool. */ + if (PAGE_STATUS_GET(page_md->status) == MEM_POOL) { + page_md->status = PAGE_ISOLATE_SET(page_md->status, 1); + list_del_init(&p->lru); + mem_pool->cur_size--; + } + spin_unlock(&page_md->migrate_lock); + spin_unlock(&mem_pool->pool_lock); + atomic_dec(&mem_pool->isolation_in_progress_cnt); + preempt_enable(); + } + + return IS_PAGE_ISOLATED(page_md->status); +} + +/** + * kbase_page_migrate - Migrate content of old page to new page provided. + * + * @mapping: Pointer to address_space struct associated with pages. + * @new_page: Pointer to the page struct of new page. + * @old_page: Pointer to the page struct of old page. + * @mode: Mode to determine if migration will be synchronised. + * + * Callback function for Linux to migrate the content of the old page to the + * new page provided. + * + * Return: 0 on success, error code otherwise. + */ +static int kbase_page_migrate(struct address_space *mapping, struct page *new_page, + struct page *old_page, enum migrate_mode mode) +{ + int err = 0; + bool status_mem_pool = false; + struct kbase_page_metadata *page_md = kbase_page_private(old_page); + struct kbase_device *kbdev; + + CSTD_UNUSED(mapping); + CSTD_UNUSED(mode); + + if (!spin_trylock(&page_md->migrate_lock)) + return -EAGAIN; + + if (WARN_ON(!IS_PAGE_ISOLATED(page_md->status))) { + spin_unlock(&page_md->migrate_lock); + return -EINVAL; + } + + switch (PAGE_STATUS_GET(page_md->status)) { + case MEM_POOL: + status_mem_pool = true; + kbdev = page_md->data.mem_pool.kbdev; + break; + case ALLOCATED_MAPPED: + case PT_MAPPED: + case FREE_ISOLATED_IN_PROGRESS: + case MULTI_MAPPED: + /* So far, only pages in a memory pool can be migrated. */ + default: + /* State should always fall in one of the previous cases! */ + err = -EAGAIN; + break; + } + + spin_unlock(&page_md->migrate_lock); + + if (status_mem_pool) { + struct kbase_mem_migrate *mem_migrate = &kbdev->mem_migrate; + + kbase_free_page_metadata(kbdev, old_page); + __ClearPageMovable(old_page); + + /* Just free new page to avoid lock contention. */ + INIT_LIST_HEAD(&new_page->lru); + set_page_private(new_page, 0); + kbase_free_page_later(kbdev, new_page); + queue_work(mem_migrate->free_pages_workq, &mem_migrate->free_pages_work); + } + + return err; +} + +/** + * kbase_page_putback - Return isolated page back to kbase. + * + * @p: Pointer of the page struct of page. + * + * Callback function for Linux to return isolated page back to kbase. This + * will only be called for a page that has been isolated but failed to + * migrate. This function will put back the given page to the state it was + * in before it was isolated. 
+ */ +static void kbase_page_putback(struct page *p) +{ + bool status_mem_pool = false; + struct kbase_page_metadata *page_md = kbase_page_private(p); + struct kbase_device *kbdev; + + spin_lock(&page_md->migrate_lock); + + /* Page must have been isolated to reach here but metadata is incorrect. */ + WARN_ON(!IS_PAGE_ISOLATED(page_md->status)); + + switch (PAGE_STATUS_GET(page_md->status)) { + case MEM_POOL: + status_mem_pool = true; + kbdev = page_md->data.mem_pool.kbdev; + break; + case ALLOCATED_MAPPED: + case PT_MAPPED: + case FREE_ISOLATED_IN_PROGRESS: + /* Only pages in a memory pool can be isolated for now. + * Therefore only pages in a memory pool can be 'putback'. + */ + break; + default: + /* State should always fall in one of the previous cases! */ + break; + } + + spin_unlock(&page_md->migrate_lock); + + /* If page was in a memory pool then just free it to avoid lock contention. */ + if (!WARN_ON(!status_mem_pool)) { + struct kbase_mem_migrate *mem_migrate = &kbdev->mem_migrate; + + __ClearPageMovable(p); + list_del_init(&p->lru); + kbase_free_page_later(kbdev, p); + queue_work(mem_migrate->free_pages_workq, &mem_migrate->free_pages_work); + } +} + +static const struct address_space_operations kbase_address_space_ops = { + .isolate_page = kbase_page_isolate, + .migratepage = kbase_page_migrate, + .putback_page = kbase_page_putback, +}; + +void kbase_mem_migrate_set_address_space_ops(struct kbase_device *kbdev, struct file *const filp) +{ + if (filp) { + filp->f_inode->i_mapping->a_ops = &kbase_address_space_ops; + + if (!kbdev->mem_migrate.mapping) + kbdev->mem_migrate.mapping = filp->f_inode->i_mapping; + else + WARN_ON(kbdev->mem_migrate.mapping != filp->f_inode->i_mapping); + } +} + +void kbase_mem_migrate_init(struct kbase_device *kbdev) +{ + struct kbase_mem_migrate *mem_migrate = &kbdev->mem_migrate; + + if (kbase_page_migration_enabled < 0) + kbase_page_migration_enabled = 0; + + spin_lock_init(&mem_migrate->free_pages_lock); + INIT_LIST_HEAD(&mem_migrate->free_pages_list); + + mem_migrate->free_pages_workq = + alloc_workqueue("free_pages_workq", WQ_UNBOUND | WQ_MEM_RECLAIM, 1); + INIT_WORK(&mem_migrate->free_pages_work, kbase_free_pages_worker); +} + +void kbase_mem_migrate_term(struct kbase_device *kbdev) +{ + struct kbase_mem_migrate *mem_migrate = &kbdev->mem_migrate; + + if (mem_migrate->free_pages_workq) + destroy_workqueue(mem_migrate->free_pages_workq); +} diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_migrate.h b/drivers/gpu/arm/bifrost/mali_kbase_mem_migrate.h new file mode 100644 index 000000000000..6610c0ccc40c --- /dev/null +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem_migrate.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
+ * + */ + +/** + * DOC: Base kernel page migration implementation. + */ + +#define PAGE_STATUS_MASK ((u8)0x7F) +#define PAGE_STATUS_GET(status) (status & PAGE_STATUS_MASK) +#define PAGE_STATUS_SET(status, value) ((status & ~PAGE_STATUS_MASK) | (value & PAGE_STATUS_MASK)) +#define PAGE_ISOLATE_SHIFT (7) +#define PAGE_ISOLATE_SET(status, value) \ + ((status & PAGE_STATUS_MASK) | (value << PAGE_ISOLATE_SHIFT)) +#define IS_PAGE_ISOLATED(status) ((bool)(status & ~PAGE_STATUS_MASK)) + +/* Global integer used to determine if module parameter value has been + * provided and if page migration feature is enabled. + */ +extern int kbase_page_migration_enabled; + +/** + * kbase_alloc_page_metadata - Allocate and initialize page metadata + * @kbdev: Pointer to kbase device. + * @p: Page to assign metadata to. + * @dma_addr: DMA address mapped to paged. + * + * This will allocate memory for the page's metadata, initialize it and + * assign a reference to the page's private field. Importantly, once + * the metadata is set and ready this function will mark the page as + * movable. + * + * Return: true if successful or false otherwise. + */ +bool kbase_alloc_page_metadata(struct kbase_device *kbdev, struct page *p, dma_addr_t dma_addr); + +/** + * kbase_free_page_later - Defer freeing of given page. + * @kbdev: Pointer to kbase device + * @p: Page to free + * + * This will add given page to a list of pages which will be freed at + * a later time. + */ +void kbase_free_page_later(struct kbase_device *kbdev, struct page *p); + +/* + * kbase_mem_migrate_set_address_space_ops - Set address space operations + * + * @kbdev: Pointer to object representing an instance of GPU platform device. + * @filp: Pointer to the struct file corresponding to device file + * /dev/malixx instance, passed to the file's open method. + * + * Assign address space operations to the given file struct @filp and + * add a reference to @kbdev. + */ +void kbase_mem_migrate_set_address_space_ops(struct kbase_device *kbdev, struct file *const filp); + +/* + * kbase_mem_migrate_init - Initialise kbase page migration + * + * @kbdev: Pointer to kbase device + * + * Enables page migration by default based on GPU and setup work queue to + * defer freeing pages during page migration callbacks. + */ +void kbase_mem_migrate_init(struct kbase_device *kbdev); + +/* + * kbase_mem_migrate_term - Terminate kbase page migration + * + * @kbdev: Pointer to kbase device + * + * This will flush any work left to free pages from page migration + * and destroy workqueue associated. + */ +void kbase_mem_migrate_term(struct kbase_device *kbdev); diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_pool.c b/drivers/gpu/arm/bifrost/mali_kbase_mem_pool.c index 4103bd1c93d0..dce066db7385 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_mem_pool.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem_pool.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2015-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2015-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -56,13 +57,36 @@ static bool kbase_mem_pool_is_empty(struct kbase_mem_pool *pool) return kbase_mem_pool_size(pool) == 0; } +static void set_pool_new_page_metadata(struct kbase_mem_pool *pool, struct page *p, + struct list_head *page_list, size_t *list_size) +{ + struct kbase_page_metadata *page_md = kbase_page_private(p); + + lockdep_assert_held(&pool->pool_lock); + + spin_lock(&page_md->migrate_lock); + /* Only update page status and add the page to the memory pool if it is not isolated */ + if (!WARN_ON(IS_PAGE_ISOLATED(page_md->status))) { + page_md->status = PAGE_STATUS_SET(page_md->status, (u8)MEM_POOL); + page_md->data.mem_pool.pool = pool; + page_md->data.mem_pool.kbdev = pool->kbdev; + list_move(&p->lru, page_list); + (*list_size)++; + } + spin_unlock(&page_md->migrate_lock); +} + static void kbase_mem_pool_add_locked(struct kbase_mem_pool *pool, struct page *p) { lockdep_assert_held(&pool->pool_lock); - list_add(&p->lru, &pool->page_list); - pool->cur_size++; + if (!pool->order && kbase_page_migration_enabled) + set_pool_new_page_metadata(pool, p, &pool->page_list, &pool->cur_size); + else { + list_add(&p->lru, &pool->page_list); + pool->cur_size++; + } pool_dbg(pool, "added page\n"); } @@ -79,8 +103,15 @@ static void kbase_mem_pool_add_list_locked(struct kbase_mem_pool *pool, { lockdep_assert_held(&pool->pool_lock); - list_splice(page_list, &pool->page_list); - pool->cur_size += nr_pages; + if (!pool->order && kbase_page_migration_enabled) { + struct page *p, *tmp; + + list_for_each_entry_safe(p, tmp, page_list, lru) + set_pool_new_page_metadata(pool, p, &pool->page_list, &pool->cur_size); + } else { + list_splice(page_list, &pool->page_list); + pool->cur_size += nr_pages; + } pool_dbg(pool, "added %zu pages\n", nr_pages); } @@ -93,7 +124,8 @@ static void kbase_mem_pool_add_list(struct kbase_mem_pool *pool, kbase_mem_pool_unlock(pool); } -static struct page *kbase_mem_pool_remove_locked(struct kbase_mem_pool *pool) +static struct page *kbase_mem_pool_remove_locked(struct kbase_mem_pool *pool, + enum kbase_page_status status) { struct page *p; @@ -103,6 +135,16 @@ static struct page *kbase_mem_pool_remove_locked(struct kbase_mem_pool *pool) return NULL; p = list_first_entry(&pool->page_list, struct page, lru); + + if (!pool->order && kbase_page_migration_enabled) { + struct kbase_page_metadata *page_md = kbase_page_private(p); + + spin_lock(&page_md->migrate_lock); + WARN_ON(PAGE_STATUS_GET(page_md->status) != (u8)MEM_POOL); + page_md->status = PAGE_STATUS_SET(page_md->status, (u8)status); + spin_unlock(&page_md->migrate_lock); + } + list_del_init(&p->lru); pool->cur_size--; @@ -111,12 +153,13 @@ static struct page *kbase_mem_pool_remove_locked(struct kbase_mem_pool *pool) return p; } -static struct page *kbase_mem_pool_remove(struct kbase_mem_pool *pool) +static struct page *kbase_mem_pool_remove(struct kbase_mem_pool *pool, + enum kbase_page_status status) { struct page *p; kbase_mem_pool_lock(pool); - p = kbase_mem_pool_remove_locked(pool); + p = kbase_mem_pool_remove_locked(pool, status); kbase_mem_pool_unlock(pool); return p; @@ -126,9 +169,9 @@ static void kbase_mem_pool_sync_page(struct kbase_mem_pool *pool, struct page *p) { struct device *dev = pool->kbdev->dev; + dma_addr_t dma_addr = pool->order ? 
kbase_dma_addr_as_priv(p) : kbase_dma_addr(p); - dma_sync_single_for_device(dev, kbase_dma_addr(p), - (PAGE_SIZE << pool->order), DMA_BIDIRECTIONAL); + dma_sync_single_for_device(dev, dma_addr, (PAGE_SIZE << pool->order), DMA_BIDIRECTIONAL); } static void kbase_mem_pool_zero_page(struct kbase_mem_pool *pool, @@ -154,7 +197,7 @@ static void kbase_mem_pool_spill(struct kbase_mem_pool *next_pool, struct page *kbase_mem_alloc_page(struct kbase_mem_pool *pool) { struct page *p; - gfp_t gfp = GFP_HIGHUSER | __GFP_ZERO; + gfp_t gfp = __GFP_ZERO; struct kbase_device *const kbdev = pool->kbdev; struct device *const dev = kbdev->dev; dma_addr_t dma_addr; @@ -162,7 +205,9 @@ struct page *kbase_mem_alloc_page(struct kbase_mem_pool *pool) /* don't warn on higher order failures */ if (pool->order) - gfp |= __GFP_NOWARN; + gfp |= GFP_HIGHUSER | __GFP_NOWARN; + else + gfp |= kbase_page_migration_enabled ? GFP_HIGHUSER_MOVABLE : GFP_HIGHUSER; p = kbdev->mgm_dev->ops.mgm_alloc_page(kbdev->mgm_dev, pool->group_id, gfp, pool->order); @@ -178,30 +223,52 @@ struct page *kbase_mem_alloc_page(struct kbase_mem_pool *pool) return NULL; } - WARN_ON(dma_addr != page_to_phys(p)); - for (i = 0; i < (1u << pool->order); i++) - kbase_set_dma_addr(p+i, dma_addr + PAGE_SIZE * i); + /* Setup page metadata for 4KB pages when page migration is enabled */ + if (!pool->order && kbase_page_migration_enabled) { + INIT_LIST_HEAD(&p->lru); + if (!kbase_alloc_page_metadata(kbdev, p, dma_addr)) { + dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + kbdev->mgm_dev->ops.mgm_free_page(kbdev->mgm_dev, pool->group_id, p, + pool->order); + return NULL; + } + } else { + WARN_ON(dma_addr != page_to_phys(p)); + for (i = 0; i < (1u << pool->order); i++) + kbase_set_dma_addr_as_priv(p + i, dma_addr + PAGE_SIZE * i); + } return p; } -static void kbase_mem_pool_free_page(struct kbase_mem_pool *pool, - struct page *p) +static void enqueue_free_pool_pages_work(struct kbase_mem_pool *pool) { - struct kbase_device *const kbdev = pool->kbdev; - struct device *const dev = kbdev->dev; - dma_addr_t dma_addr = kbase_dma_addr(p); - int i; + struct kbase_mem_migrate *mem_migrate = &pool->kbdev->mem_migrate; - dma_unmap_page(dev, dma_addr, (PAGE_SIZE << pool->order), - DMA_BIDIRECTIONAL); - for (i = 0; i < (1u << pool->order); i++) - kbase_clear_dma_addr(p+i); + if (!pool->order && kbase_page_migration_enabled) + queue_work(mem_migrate->free_pages_workq, &mem_migrate->free_pages_work); +} - kbdev->mgm_dev->ops.mgm_free_page(kbdev->mgm_dev, - pool->group_id, p, pool->order); +void kbase_mem_pool_free_page(struct kbase_mem_pool *pool, struct page *p) +{ + struct kbase_device *kbdev = pool->kbdev; - pool_dbg(pool, "freed page to kernel\n"); + if (!pool->order && kbase_page_migration_enabled) { + kbase_free_page_later(kbdev, p); + pool_dbg(pool, "page to be freed to kernel later\n"); + } else { + int i; + dma_addr_t dma_addr = kbase_dma_addr_as_priv(p); + + for (i = 0; i < (1u << pool->order); i++) + kbase_clear_dma_addr_as_priv(p + i); + + dma_unmap_page(kbdev->dev, dma_addr, (PAGE_SIZE << pool->order), DMA_BIDIRECTIONAL); + + kbdev->mgm_dev->ops.mgm_free_page(kbdev->mgm_dev, pool->group_id, p, pool->order); + + pool_dbg(pool, "freed page to kernel\n"); + } } static size_t kbase_mem_pool_shrink_locked(struct kbase_mem_pool *pool, @@ -213,10 +280,13 @@ static size_t kbase_mem_pool_shrink_locked(struct kbase_mem_pool *pool, lockdep_assert_held(&pool->pool_lock); for (i = 0; i < nr_to_shrink && !kbase_mem_pool_is_empty(pool); i++) { - p = 
kbase_mem_pool_remove_locked(pool); + p = kbase_mem_pool_remove_locked(pool, FREE_IN_PROGRESS); kbase_mem_pool_free_page(pool, p); } + /* Freeing of pages will be deferred when page migration is enabled. */ + enqueue_free_pool_pages_work(pool); + return i; } @@ -232,8 +302,7 @@ static size_t kbase_mem_pool_shrink(struct kbase_mem_pool *pool, return nr_freed; } -int kbase_mem_pool_grow(struct kbase_mem_pool *pool, - size_t nr_to_grow) +int kbase_mem_pool_grow(struct kbase_mem_pool *pool, size_t nr_to_grow) { struct page *p; size_t i; @@ -268,6 +337,7 @@ int kbase_mem_pool_grow(struct kbase_mem_pool *pool, return 0; } +KBASE_EXPORT_TEST_API(kbase_mem_pool_grow); void kbase_mem_pool_trim(struct kbase_mem_pool *pool, size_t new_size) { @@ -323,6 +393,9 @@ static unsigned long kbase_mem_pool_reclaim_count_objects(struct shrinker *s, kbase_mem_pool_lock(pool); if (pool->dont_reclaim && !pool->dying) { kbase_mem_pool_unlock(pool); + /* Tell shrinker to skip reclaim + * even though freeable pages are available + */ return 0; } pool_size = kbase_mem_pool_size(pool); @@ -342,7 +415,10 @@ static unsigned long kbase_mem_pool_reclaim_scan_objects(struct shrinker *s, kbase_mem_pool_lock(pool); if (pool->dont_reclaim && !pool->dying) { kbase_mem_pool_unlock(pool); - return 0; + /* Tell shrinker that reclaim can't be made and + * do not attempt again for this reclaim context. + */ + return SHRINK_STOP; } pool_dbg(pool, "reclaim scan %ld:\n", sc->nr_to_scan); @@ -356,12 +432,9 @@ static unsigned long kbase_mem_pool_reclaim_scan_objects(struct shrinker *s, return freed; } -int kbase_mem_pool_init(struct kbase_mem_pool *pool, - const struct kbase_mem_pool_config *config, - unsigned int order, - int group_id, - struct kbase_device *kbdev, - struct kbase_mem_pool *next_pool) +int kbase_mem_pool_init(struct kbase_mem_pool *pool, const struct kbase_mem_pool_config *config, + unsigned int order, int group_id, struct kbase_device *kbdev, + struct kbase_mem_pool *next_pool) { if (WARN_ON(group_id < 0) || WARN_ON(group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) { @@ -375,6 +448,7 @@ int kbase_mem_pool_init(struct kbase_mem_pool *pool, pool->kbdev = kbdev; pool->next_pool = next_pool; pool->dying = false; + atomic_set(&pool->isolation_in_progress_cnt, 0); spin_lock_init(&pool->pool_lock); INIT_LIST_HEAD(&pool->page_list); @@ -392,6 +466,7 @@ int kbase_mem_pool_init(struct kbase_mem_pool *pool, return 0; } +KBASE_EXPORT_TEST_API(kbase_mem_pool_init); void kbase_mem_pool_mark_dying(struct kbase_mem_pool *pool) { @@ -423,14 +498,14 @@ void kbase_mem_pool_term(struct kbase_mem_pool *pool) /* Zero pages first without holding the next_pool lock */ for (i = 0; i < nr_to_spill; i++) { - p = kbase_mem_pool_remove_locked(pool); + p = kbase_mem_pool_remove_locked(pool, SPILL_IN_PROGRESS); list_add(&p->lru, &spill_list); } } while (!kbase_mem_pool_is_empty(pool)) { /* Free remaining pages to kernel */ - p = kbase_mem_pool_remove_locked(pool); + p = kbase_mem_pool_remove_locked(pool, FREE_IN_PROGRESS); list_add(&p->lru, &free_list); } @@ -451,8 +526,18 @@ void kbase_mem_pool_term(struct kbase_mem_pool *pool) kbase_mem_pool_free_page(pool, p); } + /* Freeing of pages will be deferred when page migration is enabled. */ + enqueue_free_pool_pages_work(pool); + + /* Before returning wait to make sure there are no pages undergoing page isolation + * which will require reference to this pool. 
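The busy-wait just above pairs with the isolation_in_progress_cnt handling in kbase_page_isolate() earlier in this patch: the counter is only raised across a short critical section that ends with preempt_enable() in the isolate path, so the spin in pool termination is expected to be brief. A condensed sketch of the protocol being relied upon (illustrative, not driver code):

    /* Isolation side: counter held only across a short critical section. */
    atomic_inc(&mem_pool->isolation_in_progress_cnt);
    /* ... inspect page metadata, optionally take pool_lock, update status ... */
    atomic_dec(&mem_pool->isolation_in_progress_cnt);
    preempt_enable();

    /* Termination side: wait for any in-flight isolation before the pool goes away. */
    while (atomic_read(&pool->isolation_in_progress_cnt))
            cpu_relax();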
+ */ + while (atomic_read(&pool->isolation_in_progress_cnt)) + cpu_relax(); + pool_dbg(pool, "terminated\n"); } +KBASE_EXPORT_TEST_API(kbase_mem_pool_term); struct page *kbase_mem_pool_alloc(struct kbase_mem_pool *pool) { @@ -460,7 +545,7 @@ struct page *kbase_mem_pool_alloc(struct kbase_mem_pool *pool) do { pool_dbg(pool, "alloc()\n"); - p = kbase_mem_pool_remove(pool); + p = kbase_mem_pool_remove(pool, ALLOCATE_IN_PROGRESS); if (p) return p; @@ -478,7 +563,7 @@ struct page *kbase_mem_pool_alloc_locked(struct kbase_mem_pool *pool) lockdep_assert_held(&pool->pool_lock); pool_dbg(pool, "alloc_locked()\n"); - p = kbase_mem_pool_remove_locked(pool); + p = kbase_mem_pool_remove_locked(pool, ALLOCATE_IN_PROGRESS); if (p) return p; @@ -505,6 +590,8 @@ void kbase_mem_pool_free(struct kbase_mem_pool *pool, struct page *p, } else { /* Free page */ kbase_mem_pool_free_page(pool, p); + /* Freeing of pages will be deferred when page migration is enabled. */ + enqueue_free_pool_pages_work(pool); } } @@ -524,11 +611,13 @@ void kbase_mem_pool_free_locked(struct kbase_mem_pool *pool, struct page *p, } else { /* Free page */ kbase_mem_pool_free_page(pool, p); + /* Freeing of pages will be deferred when page migration is enabled. */ + enqueue_free_pool_pages_work(pool); } } int kbase_mem_pool_alloc_pages(struct kbase_mem_pool *pool, size_t nr_4k_pages, - struct tagged_addr *pages, bool partial_allowed) + struct tagged_addr *pages, bool partial_allowed) { struct page *p; size_t nr_from_pool; @@ -550,7 +639,7 @@ int kbase_mem_pool_alloc_pages(struct kbase_mem_pool *pool, size_t nr_4k_pages, while (nr_from_pool--) { int j; - p = kbase_mem_pool_remove_locked(pool); + p = kbase_mem_pool_remove_locked(pool, ALLOCATE_IN_PROGRESS); if (pool->order) { pages[i++] = as_tagged_tag(page_to_phys(p), HUGE_HEAD | HUGE_PAGE); @@ -566,8 +655,8 @@ int kbase_mem_pool_alloc_pages(struct kbase_mem_pool *pool, size_t nr_4k_pages, if (i != nr_4k_pages && pool->next_pool) { /* Allocate via next pool */ - err = kbase_mem_pool_alloc_pages(pool->next_pool, - nr_4k_pages - i, pages + i, partial_allowed); + err = kbase_mem_pool_alloc_pages(pool->next_pool, nr_4k_pages - i, pages + i, + partial_allowed); if (err < 0) goto err_rollback; @@ -638,7 +727,7 @@ int kbase_mem_pool_alloc_pages_locked(struct kbase_mem_pool *pool, for (i = 0; i < nr_pages_internal; i++) { int j; - p = kbase_mem_pool_remove_locked(pool); + p = kbase_mem_pool_remove_locked(pool, ALLOCATE_IN_PROGRESS); if (pool->order) { *pages++ = as_tagged_tag(page_to_phys(p), HUGE_HEAD | HUGE_PAGE); @@ -745,6 +834,7 @@ void kbase_mem_pool_free_pages(struct kbase_mem_pool *pool, size_t nr_pages, size_t nr_to_pool; LIST_HEAD(to_pool_list); size_t i = 0; + bool pages_released = false; pool_dbg(pool, "free_pages(%zu):\n", nr_pages); @@ -782,8 +872,13 @@ void kbase_mem_pool_free_pages(struct kbase_mem_pool *pool, size_t nr_pages, kbase_mem_pool_free_page(pool, p); pages[i] = as_tagged(0); + pages_released = true; } + /* Freeing of pages will be deferred when page migration is enabled. 
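The deferred freeing referred to in this and the surrounding comments works by parking pages on kbdev->mem_migrate.free_pages_list via kbase_free_page_later() and draining them from the free_pages_workq worker. The real drain worker (kbase_free_pages_worker) is defined earlier in this patch, outside this excerpt; the sketch below only shows the general shape such a worker takes and may differ in detail:

    static void example_free_pages_worker(struct work_struct *work)
    {
            struct kbase_mem_migrate *mem_migrate =
                    container_of(work, struct kbase_mem_migrate, free_pages_work);
            struct page *p, *tmp;
            LIST_HEAD(local_list);

            /* Detach the pending pages so the lock is not held while freeing. */
            spin_lock(&mem_migrate->free_pages_lock);
            list_splice_init(&mem_migrate->free_pages_list, &local_list);
            spin_unlock(&mem_migrate->free_pages_lock);

            list_for_each_entry_safe(p, tmp, &local_list, lru) {
                    list_del_init(&p->lru);
                    /* DMA-unmap, release page metadata and return the page to
                     * the kernel here (omitted in this sketch).
                     */
            }
    }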
*/ + if (pages_released) + enqueue_free_pool_pages_work(pool); + pool_dbg(pool, "free_pages(%zu) done\n", nr_pages); } @@ -796,6 +891,7 @@ void kbase_mem_pool_free_pages_locked(struct kbase_mem_pool *pool, size_t nr_to_pool; LIST_HEAD(to_pool_list); size_t i = 0; + bool pages_released = false; lockdep_assert_held(&pool->pool_lock); @@ -826,7 +922,12 @@ void kbase_mem_pool_free_pages_locked(struct kbase_mem_pool *pool, kbase_mem_pool_free_page(pool, p); pages[i] = as_tagged(0); + pages_released = true; } + /* Freeing of pages will be deferred when page migration is enabled. */ + if (pages_released) + enqueue_free_pool_pages_work(pool); + pool_dbg(pool, "free_pages_locked(%zu) done\n", nr_pages); } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.c b/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.c index 8d7bb4d68854..49c4b041e13a 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -43,29 +43,22 @@ void kbase_mem_pool_group_config_set_max_size( } } -int kbase_mem_pool_group_init( - struct kbase_mem_pool_group *const mem_pools, - struct kbase_device *const kbdev, - const struct kbase_mem_pool_group_config *const configs, - struct kbase_mem_pool_group *next_pools) +int kbase_mem_pool_group_init(struct kbase_mem_pool_group *const mem_pools, + struct kbase_device *const kbdev, + const struct kbase_mem_pool_group_config *const configs, + struct kbase_mem_pool_group *next_pools) { int gid, err = 0; for (gid = 0; gid < MEMORY_GROUP_MANAGER_NR_GROUPS; ++gid) { - err = kbase_mem_pool_init(&mem_pools->small[gid], - &configs->small[gid], - KBASE_MEM_POOL_4KB_PAGE_TABLE_ORDER, - gid, - kbdev, - next_pools ? &next_pools->small[gid] : NULL); + err = kbase_mem_pool_init(&mem_pools->small[gid], &configs->small[gid], + KBASE_MEM_POOL_4KB_PAGE_TABLE_ORDER, gid, kbdev, + next_pools ? &next_pools->small[gid] : NULL); if (!err) { - err = kbase_mem_pool_init(&mem_pools->large[gid], - &configs->large[gid], - KBASE_MEM_POOL_2MB_PAGE_TABLE_ORDER, - gid, - kbdev, - next_pools ? &next_pools->large[gid] : NULL); + err = kbase_mem_pool_init(&mem_pools->large[gid], &configs->large[gid], + KBASE_MEM_POOL_2MB_PAGE_TABLE_ORDER, gid, kbdev, + next_pools ? 
&next_pools->large[gid] : NULL); if (err) kbase_mem_pool_term(&mem_pools->small[gid]); } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.h b/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.h index f97f47d15b7d..fe8ce775258f 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_mem_pool_group.h @@ -86,10 +86,9 @@ void kbase_mem_pool_group_config_set_max_size( * * Return: 0 on success, otherwise a negative error code */ -int kbase_mem_pool_group_init(struct kbase_mem_pool_group *mem_pools, - struct kbase_device *kbdev, - const struct kbase_mem_pool_group_config *configs, - struct kbase_mem_pool_group *next_pools); +int kbase_mem_pool_group_init(struct kbase_mem_pool_group *mem_pools, struct kbase_device *kbdev, + const struct kbase_mem_pool_group_config *configs, + struct kbase_mem_pool_group *next_pools); /** * kbase_mem_pool_group_mark_dying - Mark a set of memory pools as dying diff --git a/drivers/gpu/arm/bifrost/mali_kbase_pbha.c b/drivers/gpu/arm/bifrost/mali_kbase_pbha.c index 90406b2bf1fe..b65f9e7b5162 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_pbha.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_pbha.c @@ -209,20 +209,13 @@ void kbase_pbha_write_settings(struct kbase_device *kbdev) } } -int kbase_pbha_read_dtb(struct kbase_device *kbdev) +static int kbase_pbha_read_int_id_override_property(struct kbase_device *kbdev, + const struct device_node *pbha_node) { u32 dtb_data[SYSC_ALLOC_COUNT * sizeof(u32) * DTB_SET_SIZE]; - const struct device_node *pbha_node; int sz, i; bool valid = true; - if (!kbasep_pbha_supported(kbdev)) - return 0; - - pbha_node = of_get_child_by_name(kbdev->dev->of_node, "pbha"); - if (!pbha_node) - return 0; - sz = of_property_count_elems_of_size(pbha_node, "int_id_override", sizeof(u32)); if (sz <= 0 || (sz % DTB_SET_SIZE != 0)) { @@ -256,3 +249,58 @@ int kbase_pbha_read_dtb(struct kbase_device *kbdev) } return 0; } + +#if MALI_USE_CSF +static int kbase_pbha_read_propagate_bits_property(struct kbase_device *kbdev, + const struct device_node *pbha_node) +{ + u32 bits; + int err; + + if (!kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PBHA_HWU)) + return 0; + + err = of_property_read_u32(pbha_node, "propagate_bits", &bits); + + if (err < 0) { + if (err != -EINVAL) { + dev_err(kbdev->dev, + "DTB value for propagate_bits is improperly formed (err=%d)\n", + err); + return err; + } + } + + if (bits > (L2_CONFIG_PBHA_HWU_MASK >> L2_CONFIG_PBHA_HWU_SHIFT)) { + dev_err(kbdev->dev, "Bad DTB value for propagate_bits: 0x%x\n", bits); + return -EINVAL; + } + + kbdev->pbha_propagate_bits = bits; + return 0; +} +#endif + +int kbase_pbha_read_dtb(struct kbase_device *kbdev) +{ + const struct device_node *pbha_node; + int err; + + if (!kbasep_pbha_supported(kbdev)) + return 0; + + pbha_node = of_get_child_by_name(kbdev->dev->of_node, "pbha"); + if (!pbha_node) + return 0; + + err = kbase_pbha_read_int_id_override_property(kbdev, pbha_node); + +#if MALI_USE_CSF + if (err < 0) + return err; + + err = kbase_pbha_read_propagate_bits_property(kbdev, pbha_node); +#endif + + return err; +} diff --git a/drivers/gpu/arm/bifrost/mali_kbase_pbha_debugfs.c b/drivers/gpu/arm/bifrost/mali_kbase_pbha_debugfs.c index 4130dd609157..1cc29c700e5a 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_pbha_debugfs.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_pbha_debugfs.c @@ -20,13 +20,15 @@ */ #include "mali_kbase_pbha_debugfs.h" - #include "mali_kbase_pbha.h" - #include #include #include +#if MALI_USE_CSF +#include 
"backend/gpu/mali_kbase_pm_internal.h" +#endif + static int int_id_overrides_show(struct seq_file *sfile, void *data) { struct kbase_device *kbdev = sfile->private; @@ -108,6 +110,90 @@ static int int_id_overrides_open(struct inode *in, struct file *file) return single_open(file, int_id_overrides_show, in->i_private); } +#if MALI_USE_CSF +/** + * propagate_bits_show - Read PBHA bits from L2_CONFIG out to debugfs. + * + * @sfile: The debugfs entry. + * @data: Data associated with the entry. + * + * Return: 0 in all cases. + */ +static int propagate_bits_show(struct seq_file *sfile, void *data) +{ + struct kbase_device *kbdev = sfile->private; + u32 l2_config_val; + + kbase_csf_scheduler_pm_active(kbdev); + kbase_pm_wait_for_l2_powered(kbdev); + l2_config_val = L2_CONFIG_PBHA_HWU_GET(kbase_reg_read(kbdev, GPU_CONTROL_REG(L2_CONFIG))); + kbase_csf_scheduler_pm_idle(kbdev); + + seq_printf(sfile, "PBHA Propagate Bits: 0x%x\n", l2_config_val); + return 0; +} + +static int propagate_bits_open(struct inode *in, struct file *file) +{ + return single_open(file, propagate_bits_show, in->i_private); +} + +/** + * propagate_bits_write - Write input value from debugfs to PBHA bits of L2_CONFIG register. + * + * @file: Pointer to file struct of debugfs node. + * @ubuf: Pointer to user buffer with value to be written. + * @count: Size of user buffer. + * @ppos: Not used. + * + * Return: Size of buffer passed in when successful, but error code E2BIG/EINVAL otherwise. + */ +static ssize_t propagate_bits_write(struct file *file, const char __user *ubuf, size_t count, + loff_t *ppos) +{ + struct seq_file *sfile = file->private_data; + struct kbase_device *kbdev = sfile->private; + /* 32 characters should be enough for the input string in any base */ + char raw_str[32]; + unsigned long propagate_bits; + + if (count >= sizeof(raw_str)) + return -E2BIG; + if (copy_from_user(raw_str, ubuf, count)) + return -EINVAL; + raw_str[count] = '\0'; + if (kstrtoul(raw_str, 0, &propagate_bits)) + return -EINVAL; + + /* Check propagate_bits input argument does not + * exceed the maximum size of the propagate_bits mask. + */ + if (propagate_bits > (L2_CONFIG_PBHA_HWU_MASK >> L2_CONFIG_PBHA_HWU_SHIFT)) + return -EINVAL; + /* Cast to u8 is safe as check is done already to ensure size is within + * correct limits. 
+ */ + kbdev->pbha_propagate_bits = (u8)propagate_bits; + + /* GPU Reset will set new values in L2 config */ + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_NONE)) { + kbase_reset_gpu(kbdev); + kbase_reset_gpu_wait(kbdev); + } + + return count; +} + +static const struct file_operations pbha_propagate_bits_fops = { + .owner = THIS_MODULE, + .open = propagate_bits_open, + .read = seq_read, + .write = propagate_bits_write, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* MALI_USE_CSF */ + static const struct file_operations pbha_int_id_overrides_fops = { .owner = THIS_MODULE, .open = int_id_overrides_open, @@ -132,5 +218,10 @@ void kbase_pbha_debugfs_init(struct kbase_device *kbdev) debugfs_create_file("int_id_overrides", mode, debugfs_pbha_dir, kbdev, &pbha_int_id_overrides_fops); +#if MALI_USE_CSF + if (kbase_hw_has_feature(kbdev, BASE_HW_FEATURE_PBHA_HWU)) + debugfs_create_file("propagate_bits", mode, debugfs_pbha_dir, kbdev, + &pbha_propagate_bits_fops); +#endif /* MALI_USE_CSF */ } } diff --git a/drivers/gpu/arm/bifrost/mali_kbase_platform_fake.c b/drivers/gpu/arm/bifrost/mali_kbase_platform_fake.c index 761a636b4cbf..265c676f13fa 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_platform_fake.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_platform_fake.c @@ -32,12 +32,12 @@ */ #include +#ifndef CONFIG_OF + #define PLATFORM_CONFIG_RESOURCE_COUNT 4 -#define PLATFORM_CONFIG_IRQ_RES_COUNT 3 static struct platform_device *mali_device; -#ifndef CONFIG_OF /** * kbasep_config_parse_io_resources - Convert data in struct kbase_io_resources * struct to Linux-specific resources @@ -73,14 +73,11 @@ static void kbasep_config_parse_io_resources(const struct kbase_io_resources *io linux_resources[3].end = io_resources->gpu_irq_number; linux_resources[3].flags = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL; } -#endif /* CONFIG_OF */ int kbase_platform_register(void) { struct kbase_platform_config *config; -#ifndef CONFIG_OF struct resource resources[PLATFORM_CONFIG_RESOURCE_COUNT]; -#endif int err; config = kbase_get_platform_config(); /* declared in midgard/mali_kbase_config.h but defined in platform folder */ @@ -93,7 +90,6 @@ int kbase_platform_register(void) if (mali_device == NULL) return -ENOMEM; -#ifndef CONFIG_OF kbasep_config_parse_io_resources(config->io_resources, resources); err = platform_device_add_resources(mali_device, resources, PLATFORM_CONFIG_RESOURCE_COUNT); if (err) { @@ -101,7 +97,6 @@ int kbase_platform_register(void) mali_device = NULL; return err; } -#endif /* CONFIG_OF */ err = platform_device_add(mali_device); if (err) { @@ -120,3 +115,5 @@ void kbase_platform_unregister(void) platform_device_unregister(mali_device); } EXPORT_SYMBOL(kbase_platform_unregister); + +#endif /* CONFIG_OF */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_pm.c b/drivers/gpu/arm/bifrost/mali_kbase_pm.c index 68c1b9bb25e8..62a132816a42 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_pm.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_pm.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2010-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/gpu/arm/bifrost/mali_kbase_reset_gpu.h b/drivers/gpu/arm/bifrost/mali_kbase_reset_gpu.h index ff631e91824f..48ea9954b17c 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_reset_gpu.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_reset_gpu.h @@ -236,6 +236,18 @@ int kbase_reset_gpu_silent(struct kbase_device *kbdev); */ bool kbase_reset_gpu_is_active(struct kbase_device *kbdev); +/** + * kbase_reset_gpu_not_pending - Reports if the GPU reset isn't pending + * + * @kbdev: Device pointer + * + * Note that unless appropriate locks are held when using this function, the + * state could change immediately afterwards. + * + * Return: True if the GPU reset isn't pending. + */ +bool kbase_reset_gpu_is_not_pending(struct kbase_device *kbdev); + /** * kbase_reset_gpu_wait - Wait for a GPU reset to complete * @kbdev: Device pointer diff --git a/drivers/gpu/arm/bifrost/mali_kbase_softjobs.c b/drivers/gpu/arm/bifrost/mali_kbase_softjobs.c index 5808a2e893cc..b0c5126afcbe 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_softjobs.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_softjobs.c @@ -23,7 +23,7 @@ #include #include -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) #include #endif #include @@ -204,7 +204,7 @@ static int kbase_dump_cpu_gpu_time(struct kbase_jd_atom *katom) return 0; } -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) /* Called by the explicit fence mechanism when a fence wait has completed */ void kbase_soft_event_wait_callback(struct kbase_jd_atom *katom) { @@ -925,26 +925,6 @@ int kbasep_jit_alloc_validate(struct kbase_context *kctx, #if !MALI_USE_CSF -/* - * Sizes of user data to copy for each just-in-time memory interface version - * - * In interface version 2 onwards this is the same as the struct size, allowing - * copying of arrays of structures from userspace. - * - * In interface version 1 the structure size was variable, and hence arrays of - * structures cannot be supported easily, and were not a feature present in - * version 1 anyway. - */ -static const size_t jit_info_copy_size_for_jit_version[] = { - /* in jit_version 1, the structure did not have any end padding, hence - * it could be a different size on 32 and 64-bit clients. 
We therefore - * do not copy past the last member - */ - [1] = offsetofend(struct base_jit_alloc_info_10_2, id), - [2] = sizeof(struct base_jit_alloc_info_11_5), - [3] = sizeof(struct base_jit_alloc_info) -}; - static int kbase_jit_allocate_prepare(struct kbase_jd_atom *katom) { __user u8 *data = (__user u8 *)(uintptr_t) katom->jc; @@ -954,18 +934,11 @@ static int kbase_jit_allocate_prepare(struct kbase_jd_atom *katom) u32 count; int ret; u32 i; - size_t jit_info_user_copy_size; - - WARN_ON(kctx->jit_version >= - ARRAY_SIZE(jit_info_copy_size_for_jit_version)); - jit_info_user_copy_size = - jit_info_copy_size_for_jit_version[kctx->jit_version]; - WARN_ON(jit_info_user_copy_size > sizeof(*info)); /* For backwards compatibility, and to prevent reading more than 1 jit * info struct on jit version 1 */ - if (katom->nr_extres == 0 || kctx->jit_version == 1) + if (katom->nr_extres == 0) katom->nr_extres = 1; count = katom->nr_extres; @@ -985,8 +958,8 @@ static int kbase_jit_allocate_prepare(struct kbase_jd_atom *katom) katom->softjob_data = info; - for (i = 0; i < count; i++, info++, data += jit_info_user_copy_size) { - if (copy_from_user(info, data, jit_info_user_copy_size) != 0) { + for (i = 0; i < count; i++, info++, data += sizeof(*info)) { + if (copy_from_user(info, data, sizeof(*info)) != 0) { ret = -EINVAL; goto free_info; } @@ -994,8 +967,7 @@ static int kbase_jit_allocate_prepare(struct kbase_jd_atom *katom) * kernel struct. For jit version 1, this also clears the * padding bytes */ - memset(((u8 *)info) + jit_info_user_copy_size, 0, - sizeof(*info) - jit_info_user_copy_size); + memset(((u8 *)info) + sizeof(*info), 0, sizeof(*info) - sizeof(*info)); ret = kbasep_jit_alloc_validate(kctx, info); if (ret) @@ -1476,10 +1448,11 @@ static void kbase_ext_res_process(struct kbase_jd_atom *katom, bool map) if (!kbase_sticky_resource_acquire(katom->kctx, gpu_addr)) goto failed_loop; - } else + } else { if (!kbase_sticky_resource_release_force(katom->kctx, NULL, gpu_addr)) failed = true; + } } /* @@ -1539,7 +1512,7 @@ int kbase_process_soft_job(struct kbase_jd_atom *katom) ret = kbase_dump_cpu_gpu_time(katom); break; -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) case BASE_JD_REQ_SOFT_FENCE_TRIGGER: katom->event_code = kbase_sync_fence_out_trigger(katom, katom->event_code == BASE_JD_EVENT_DONE ? @@ -1599,7 +1572,7 @@ int kbase_process_soft_job(struct kbase_jd_atom *katom) void kbase_cancel_soft_job(struct kbase_jd_atom *katom) { switch (katom->core_req & BASE_JD_REQ_SOFT_JOB_TYPE) { -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) case BASE_JD_REQ_SOFT_FENCE_WAIT: kbase_sync_fence_in_cancel_wait(katom); break; @@ -1622,7 +1595,7 @@ int kbase_prepare_soft_job(struct kbase_jd_atom *katom) return -EINVAL; } break; -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) case BASE_JD_REQ_SOFT_FENCE_TRIGGER: { struct base_fence fence; @@ -1668,20 +1641,9 @@ int kbase_prepare_soft_job(struct kbase_jd_atom *katom) fence.basep.fd); if (ret < 0) return ret; - -#ifdef CONFIG_MALI_BIFROST_DMA_FENCE - /* - * Set KCTX_NO_IMPLICIT_FENCE in the context the first - * time a soft fence wait job is observed. This will - * prevent the implicit dma-buf fence to conflict with - * the Android native sync fences. 
- */ - if (!kbase_ctx_flag(katom->kctx, KCTX_NO_IMPLICIT_SYNC)) - kbase_ctx_flag_set(katom->kctx, KCTX_NO_IMPLICIT_SYNC); -#endif /* CONFIG_MALI_BIFROST_DMA_FENCE */ } break; -#endif /* CONFIG_SYNC || CONFIG_SYNC_FILE */ +#endif /* CONFIG_SYNC_FILE */ case BASE_JD_REQ_SOFT_JIT_ALLOC: return kbase_jit_allocate_prepare(katom); case BASE_JD_REQ_SOFT_JIT_FREE: @@ -1714,7 +1676,7 @@ void kbase_finish_soft_job(struct kbase_jd_atom *katom) case BASE_JD_REQ_SOFT_DUMP_CPU_GPU_TIME: /* Nothing to do */ break; -#if defined(CONFIG_SYNC) || defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) case BASE_JD_REQ_SOFT_FENCE_TRIGGER: /* If fence has not yet been signaled, do it now */ kbase_sync_fence_out_trigger(katom, katom->event_code == @@ -1724,7 +1686,7 @@ void kbase_finish_soft_job(struct kbase_jd_atom *katom) /* Release katom's reference to fence object */ kbase_sync_fence_in_remove(katom); break; -#endif /* CONFIG_SYNC || CONFIG_SYNC_FILE */ +#endif /* CONFIG_SYNC_FILE */ case BASE_JD_REQ_SOFT_DEBUG_COPY: kbase_debug_copy_finish(katom); break; diff --git a/drivers/gpu/arm/bifrost/mali_kbase_sync.h b/drivers/gpu/arm/bifrost/mali_kbase_sync.h index fcc9b6fe0195..3d2053bee08e 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_sync.h +++ b/drivers/gpu/arm/bifrost/mali_kbase_sync.h @@ -30,9 +30,6 @@ #include #include -#if IS_ENABLED(CONFIG_SYNC) -#include -#endif #if IS_ENABLED(CONFIG_SYNC_FILE) #include "mali_kbase_fence_defs.h" #include @@ -181,7 +178,7 @@ int kbase_sync_fence_out_info_get(struct kbase_jd_atom *katom, struct kbase_sync_fence_info *info); #endif /* !MALI_USE_CSF */ -#if defined(CONFIG_SYNC_FILE) +#if IS_ENABLED(CONFIG_SYNC_FILE) #if (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE) void kbase_sync_fence_info_get(struct fence *fence, struct kbase_sync_fence_info *info); diff --git a/drivers/gpu/arm/bifrost/mali_kbase_sync_android.c b/drivers/gpu/arm/bifrost/mali_kbase_sync_android.c deleted file mode 100644 index 8c5cb6c3838e..000000000000 --- a/drivers/gpu/arm/bifrost/mali_kbase_sync_android.c +++ /dev/null @@ -1,514 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * - * (C) COPYRIGHT 2012-2017, 2020-2022 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - */ - -/* - * Code for supporting explicit Android fences (CONFIG_SYNC) - * Known to be good for kernels 4.5 and earlier. 
- * Replaced with CONFIG_SYNC_FILE for 4.9 and later kernels - * (see mali_kbase_sync_file.c) - */ - -#include -#include -#include -#include -#include -#include -#include -#include "sync.h" -#include -#include - -struct mali_sync_timeline { - struct sync_timeline timeline; - atomic_t counter; - atomic_t signaled; -}; - -struct mali_sync_pt { - struct sync_pt pt; - int order; - int result; -}; - -static struct mali_sync_timeline *to_mali_sync_timeline( - struct sync_timeline *timeline) -{ - return container_of(timeline, struct mali_sync_timeline, timeline); -} - -static struct mali_sync_pt *to_mali_sync_pt(struct sync_pt *pt) -{ - return container_of(pt, struct mali_sync_pt, pt); -} - -static struct sync_pt *timeline_dup(struct sync_pt *pt) -{ - struct mali_sync_pt *mpt = to_mali_sync_pt(pt); - struct mali_sync_pt *new_mpt; - struct sync_pt *new_pt = sync_pt_create(sync_pt_parent(pt), - sizeof(struct mali_sync_pt)); - - if (!new_pt) - return NULL; - - new_mpt = to_mali_sync_pt(new_pt); - new_mpt->order = mpt->order; - new_mpt->result = mpt->result; - - return new_pt; -} - -static int timeline_has_signaled(struct sync_pt *pt) -{ - struct mali_sync_pt *mpt = to_mali_sync_pt(pt); - struct mali_sync_timeline *mtl = to_mali_sync_timeline( - sync_pt_parent(pt)); - int result = mpt->result; - - int diff = atomic_read(&mtl->signaled) - mpt->order; - - if (diff >= 0) - return (result < 0) ? result : 1; - - return 0; -} - -static int timeline_compare(struct sync_pt *a, struct sync_pt *b) -{ - struct mali_sync_pt *ma = container_of(a, struct mali_sync_pt, pt); - struct mali_sync_pt *mb = container_of(b, struct mali_sync_pt, pt); - - int diff = ma->order - mb->order; - - if (diff == 0) - return 0; - - return (diff < 0) ? -1 : 1; -} - -static void timeline_value_str(struct sync_timeline *timeline, char *str, - int size) -{ - struct mali_sync_timeline *mtl = to_mali_sync_timeline(timeline); - - snprintf(str, size, "%d", atomic_read(&mtl->signaled)); -} - -static void pt_value_str(struct sync_pt *pt, char *str, int size) -{ - struct mali_sync_pt *mpt = to_mali_sync_pt(pt); - - snprintf(str, size, "%d(%d)", mpt->order, mpt->result); -} - -static struct sync_timeline_ops mali_timeline_ops = { - .driver_name = "Mali", - .dup = timeline_dup, - .has_signaled = timeline_has_signaled, - .compare = timeline_compare, - .timeline_value_str = timeline_value_str, - .pt_value_str = pt_value_str, -}; - -/* Allocates a timeline for Mali - * - * One timeline should be allocated per API context. 
- */ -static struct sync_timeline *mali_sync_timeline_alloc(const char *name) -{ - struct sync_timeline *tl; - struct mali_sync_timeline *mtl; - - tl = sync_timeline_create(&mali_timeline_ops, - sizeof(struct mali_sync_timeline), name); - if (!tl) - return NULL; - - /* Set the counter in our private struct */ - mtl = to_mali_sync_timeline(tl); - atomic_set(&mtl->counter, 0); - atomic_set(&mtl->signaled, 0); - - return tl; -} - -static int kbase_stream_close(struct inode *inode, struct file *file) -{ - struct sync_timeline *tl; - - tl = (struct sync_timeline *)file->private_data; - sync_timeline_destroy(tl); - return 0; -} - -static const struct file_operations stream_fops = { - .owner = THIS_MODULE, - .release = kbase_stream_close, -}; - -int kbase_sync_fence_stream_create(const char *name, int *const out_fd) -{ - struct sync_timeline *tl; - - if (!out_fd) - return -EINVAL; - - tl = mali_sync_timeline_alloc(name); - if (!tl) - return -EINVAL; - - *out_fd = anon_inode_getfd(name, &stream_fops, tl, O_RDONLY|O_CLOEXEC); - - if (*out_fd < 0) { - sync_timeline_destroy(tl); - return -EINVAL; - } - - return 0; -} - -#if !MALI_USE_CSF -/* Allocates a sync point within the timeline. - * - * The timeline must be the one allocated by kbase_sync_timeline_alloc - * - * Sync points must be triggered in *exactly* the same order as they are - * allocated. - */ -static struct sync_pt *kbase_sync_pt_alloc(struct sync_timeline *parent) -{ - struct sync_pt *pt = sync_pt_create(parent, - sizeof(struct mali_sync_pt)); - struct mali_sync_timeline *mtl = to_mali_sync_timeline(parent); - struct mali_sync_pt *mpt; - - if (!pt) - return NULL; - - mpt = to_mali_sync_pt(pt); - mpt->order = atomic_inc_return(&mtl->counter); - mpt->result = 0; - - return pt; -} - -int kbase_sync_fence_out_create(struct kbase_jd_atom *katom, int tl_fd) -{ - struct sync_timeline *tl; - struct sync_pt *pt; - struct sync_fence *fence; - int fd; - struct file *tl_file; - - tl_file = fget(tl_fd); - if (tl_file == NULL) - return -EBADF; - - if (tl_file->f_op != &stream_fops) { - fd = -EBADF; - goto out; - } - - tl = tl_file->private_data; - - pt = kbase_sync_pt_alloc(tl); - if (!pt) { - fd = -EFAULT; - goto out; - } - - fence = sync_fence_create("mali_fence", pt); - if (!fence) { - sync_pt_free(pt); - fd = -EFAULT; - goto out; - } - - /* from here the fence owns the sync_pt */ - - /* create a fd representing the fence */ - fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); - if (fd < 0) { - sync_pt_free(pt); - sync_fence_put(fence); - katom->fence = NULL; - goto out; - } - - /* Place the successfully created fence in katom */ - katom->fence = fence; - - /* bind fence to the new fd */ - sync_fence_install(fence, fd); -out: - fput(tl_file); - - return fd; -} - -int kbase_sync_fence_in_from_fd(struct kbase_jd_atom *katom, int fd) -{ - katom->fence = sync_fence_fdget(fd); - return katom->fence ? 0 : -ENOENT; -} -#endif /* !MALI_USE_CSF */ - -int kbase_sync_fence_validate(int fd) -{ - struct sync_fence *fence; - - fence = sync_fence_fdget(fd); - if (!fence) - return -EINVAL; - - sync_fence_put(fence); - return 0; -} - -#if !MALI_USE_CSF -/* Returns true if the specified timeline is allocated by Mali */ -static int kbase_sync_timeline_is_ours(struct sync_timeline *timeline) -{ - return timeline->ops == &mali_timeline_ops; -} - -/* Signals a particular sync point - * - * Sync points must be triggered in *exactly* the same order as they are - * allocated. 
- * - * If they are signaled in the wrong order then a message will be printed in - * debug builds and otherwise attempts to signal order sync_pts will be ignored. - * - * result can be negative to indicate error, any other value is interpreted as - * success. - */ -static void kbase_sync_signal_pt(struct sync_pt *pt, int result) -{ - struct mali_sync_pt *mpt = to_mali_sync_pt(pt); - struct mali_sync_timeline *mtl = to_mali_sync_timeline( - sync_pt_parent(pt)); - int signaled; - int diff; - - mpt->result = result; - - do { - signaled = atomic_read(&mtl->signaled); - - diff = signaled - mpt->order; - - if (diff > 0) { - /* The timeline is already at or ahead of this point. - * This should not happen unless userspace has been - * signaling fences out of order, so warn but don't - * violate the sync_pt API. - * The warning is only in debug builds to prevent - * a malicious user being able to spam dmesg. - */ -#ifdef CONFIG_MALI_BIFROST_DEBUG - pr_err("Fences were triggered in a different order to allocation!"); -#endif /* CONFIG_MALI_BIFROST_DEBUG */ - return; - } - } while (atomic_cmpxchg(&mtl->signaled, - signaled, mpt->order) != signaled); -} - -enum base_jd_event_code -kbase_sync_fence_out_trigger(struct kbase_jd_atom *katom, int result) -{ - struct sync_pt *pt; - struct sync_timeline *timeline; - - if (!katom->fence) - return BASE_JD_EVENT_JOB_CANCELLED; - - if (katom->fence->num_fences != 1) { - /* Not exactly one item in the list - so it didn't (directly) - * come from us - */ - return BASE_JD_EVENT_JOB_CANCELLED; - } - - pt = container_of(katom->fence->cbs[0].sync_pt, struct sync_pt, base); - timeline = sync_pt_parent(pt); - - if (!kbase_sync_timeline_is_ours(timeline)) { - /* Fence has a sync_pt which isn't ours! */ - return BASE_JD_EVENT_JOB_CANCELLED; - } - - kbase_sync_signal_pt(pt, result); - - sync_timeline_signal(timeline); - - kbase_sync_fence_out_remove(katom); - - return (result < 0) ? BASE_JD_EVENT_JOB_CANCELLED : BASE_JD_EVENT_DONE; -} - -static inline int kbase_fence_get_status(struct sync_fence *fence) -{ - if (!fence) - return -ENOENT; - - return atomic_read(&fence->status); -} - -static void kbase_fence_wait_callback(struct sync_fence *fence, - struct sync_fence_waiter *waiter) -{ - struct kbase_jd_atom *katom = container_of(waiter, - struct kbase_jd_atom, sync_waiter); - struct kbase_context *kctx = katom->kctx; - - /* Propagate the fence status to the atom. - * If negative then cancel this atom and its dependencies. - */ - if (kbase_fence_get_status(fence) < 0) - katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; - - /* To prevent a potential deadlock we schedule the work onto the - * job_done_wq workqueue - * - * The issue is that we may signal the timeline while holding - * kctx->jctx.lock and the callbacks are run synchronously from - * sync_timeline_signal. So we simply defer the work. 
- */ - - INIT_WORK(&katom->work, kbase_sync_fence_wait_worker); - queue_work(kctx->jctx.job_done_wq, &katom->work); -} - -int kbase_sync_fence_in_wait(struct kbase_jd_atom *katom) -{ - int ret; - - sync_fence_waiter_init(&katom->sync_waiter, kbase_fence_wait_callback); - - ret = sync_fence_wait_async(katom->fence, &katom->sync_waiter); - - if (ret == 1) { - /* Already signaled */ - return 0; - } - - if (ret < 0) { - katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; - /* We should cause the dependent jobs in the bag to be failed, - * to do this we schedule the work queue to complete this job - */ - INIT_WORK(&katom->work, kbase_sync_fence_wait_worker); - queue_work(katom->kctx->jctx.job_done_wq, &katom->work); - } - - return 1; -} - -void kbase_sync_fence_in_cancel_wait(struct kbase_jd_atom *katom) -{ - if (sync_fence_cancel_async(katom->fence, &katom->sync_waiter) != 0) { - /* The wait wasn't cancelled - leave the cleanup for - * kbase_fence_wait_callback - */ - return; - } - - /* Wait was cancelled - zap the atoms */ - katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; - - kbasep_remove_waiting_soft_job(katom); - kbase_finish_soft_job(katom); - - if (kbase_jd_done_nolock(katom, true)) - kbase_js_sched_all(katom->kctx->kbdev); -} - -void kbase_sync_fence_out_remove(struct kbase_jd_atom *katom) -{ - if (katom->fence) { - sync_fence_put(katom->fence); - katom->fence = NULL; - } -} - -void kbase_sync_fence_in_remove(struct kbase_jd_atom *katom) -{ - if (katom->fence) { - sync_fence_put(katom->fence); - katom->fence = NULL; - } -} - -int kbase_sync_fence_in_info_get(struct kbase_jd_atom *katom, - struct kbase_sync_fence_info *info) -{ - u32 string_len; - - if (!katom->fence) - return -ENOENT; - - info->fence = katom->fence; - info->status = kbase_fence_get_status(katom->fence); - - string_len = strscpy(info->name, katom->fence->name, sizeof(info->name)); - string_len += sizeof(char); - /* Make sure that the source string fit into the buffer. */ - KBASE_DEBUG_ASSERT(string_len <= sizeof(info->name)); - CSTD_UNUSED(string_len); - - return 0; -} - -int kbase_sync_fence_out_info_get(struct kbase_jd_atom *katom, - struct kbase_sync_fence_info *info) -{ - u32 string_len; - - if (!katom->fence) - return -ENOENT; - - info->fence = katom->fence; - info->status = kbase_fence_get_status(katom->fence); - - string_len = strscpy(info->name, katom->fence->name, sizeof(info->name)); - string_len += sizeof(char); - /* Make sure that the source string fit into the buffer. */ - KBASE_DEBUG_ASSERT(string_len <= sizeof(info->name)); - CSTD_UNUSED(string_len); - - return 0; -} - -#ifdef CONFIG_MALI_BIFROST_FENCE_DEBUG -void kbase_sync_fence_in_dump(struct kbase_jd_atom *katom) -{ - /* Dump out the full state of all the Android sync fences. - * The function sync_dump() isn't exported to modules, so force - * sync_fence_wait() to time out to trigger sync_dump(). - */ - if (katom->fence) - sync_fence_wait(katom->fence, 1); -} -#endif -#endif /* !MALI_USE_CSF */ diff --git a/drivers/gpu/arm/bifrost/mali_kbase_sync_file.c b/drivers/gpu/arm/bifrost/mali_kbase_sync_file.c index e08a87210fbc..9360324cfee6 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_sync_file.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_sync_file.c @@ -21,9 +21,6 @@ /* * Code for supporting explicit Linux fences (CONFIG_SYNC_FILE) - * Introduced in kernel 4.9. 
- * Android explicit fences (CONFIG_SYNC) can be used for older kernels - * (see mali_kbase_sync_android.c) */ #include @@ -112,10 +109,13 @@ int kbase_sync_fence_in_from_fd(struct kbase_jd_atom *katom, int fd) struct dma_fence *fence = sync_file_get_fence(fd); #endif + lockdep_assert_held(&katom->kctx->jctx.lock); + if (!fence) return -ENOENT; kbase_fence_fence_in_set(katom, fence); + katom->dma_fence.fence_cb_added = false; return 0; } @@ -167,36 +167,31 @@ static void kbase_fence_wait_callback(struct dma_fence *fence, struct dma_fence_cb *cb) #endif { - struct kbase_fence_cb *kcb = container_of(cb, - struct kbase_fence_cb, - fence_cb); - struct kbase_jd_atom *katom = kcb->katom; + struct kbase_jd_atom *katom = container_of(cb, struct kbase_jd_atom, + dma_fence.fence_cb); struct kbase_context *kctx = katom->kctx; /* Cancel atom if fence is erroneous */ + if (dma_fence_is_signaled(katom->dma_fence.fence_in) && #if (KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE || \ (KERNEL_VERSION(4, 10, 0) > LINUX_VERSION_CODE && \ KERNEL_VERSION(4, 9, 68) <= LINUX_VERSION_CODE)) - if (dma_fence_is_signaled(kcb->fence) && kcb->fence->error < 0) + katom->dma_fence.fence_in->error < 0) #else - if (dma_fence_is_signaled(kcb->fence) && kcb->fence->status < 0) + katom->dma_fence.fence_in->status < 0) #endif katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; - if (kbase_fence_dep_count_dec_and_test(katom)) { - /* We take responsibility of handling this */ - kbase_fence_dep_count_set(katom, -1); - /* To prevent a potential deadlock we schedule the work onto the - * job_done_wq workqueue - * - * The issue is that we may signal the timeline while holding - * kctx->jctx.lock and the callbacks are run synchronously from - * sync_timeline_signal. So we simply defer the work. - */ - INIT_WORK(&katom->work, kbase_sync_fence_wait_worker); - queue_work(kctx->jctx.job_done_wq, &katom->work); - } + /* To prevent a potential deadlock we schedule the work onto the + * job_done_wq workqueue + * + * The issue is that we may signal the timeline while holding + * kctx->jctx.lock and the callbacks are run synchronously from + * sync_timeline_signal. So we simply defer the work. + */ + INIT_WORK(&katom->work, kbase_sync_fence_wait_worker); + queue_work(kctx->jctx.job_done_wq, &katom->work); } int kbase_sync_fence_in_wait(struct kbase_jd_atom *katom) @@ -208,53 +203,77 @@ int kbase_sync_fence_in_wait(struct kbase_jd_atom *katom) struct dma_fence *fence; #endif - fence = kbase_fence_in_get(katom); + lockdep_assert_held(&katom->kctx->jctx.lock); + + fence = katom->dma_fence.fence_in; if (!fence) return 0; /* no input fence to wait for, good to go! */ - kbase_fence_dep_count_set(katom, 1); + err = dma_fence_add_callback(fence, &katom->dma_fence.fence_cb, + kbase_fence_wait_callback); + if (err == -ENOENT) { + int fence_status = dma_fence_get_status(fence); - err = kbase_fence_add_callback(katom, fence, kbase_fence_wait_callback); - - kbase_fence_put(fence); - - if (likely(!err)) { - /* Test if the callbacks are already triggered */ - if (kbase_fence_dep_count_dec_and_test(katom)) { - kbase_fence_free_callbacks(katom); - kbase_fence_dep_count_set(katom, -1); - return 0; /* Already signaled, good to go right now */ + if (fence_status == 1) { + /* Fence is already signaled with no error. The completion + * for FENCE_WAIT softjob can be done right away. + */ + return 0; } - /* Callback installed, so we just need to wait for it... 
*/ - } else { - /* Failure */ - kbase_fence_free_callbacks(katom); - kbase_fence_dep_count_set(katom, -1); + /* Fence shouldn't be in not signaled state */ + if (!fence_status) { + struct kbase_sync_fence_info info; - katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; + kbase_sync_fence_in_info_get(katom, &info); - /* We should cause the dependent jobs in the bag to be failed, - * to do this we schedule the work queue to complete this job + dev_warn(katom->kctx->kbdev->dev, + "Unexpected status for fence %s of ctx:%d_%d atom:%d", + info.name, katom->kctx->tgid, katom->kctx->id, + kbase_jd_atom_id(katom->kctx, katom)); + } + + /* If fence is signaled with an error, then the FENCE_WAIT softjob is + * considered to be failed. */ - INIT_WORK(&katom->work, kbase_sync_fence_wait_worker); - queue_work(katom->kctx->jctx.job_done_wq, &katom->work); } - return 1; /* completion to be done later by callback/worker */ + if (unlikely(err)) { + /* We should cause the dependent jobs in the bag to be failed. */ + katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; + + /* The completion for FENCE_WAIT softjob can be done right away. */ + return 0; + } + + /* Callback was successfully installed */ + katom->dma_fence.fence_cb_added = true; + + /* Completion to be done later by callback/worker */ + return 1; } void kbase_sync_fence_in_cancel_wait(struct kbase_jd_atom *katom) { - if (!kbase_fence_free_callbacks(katom)) { - /* The wait wasn't cancelled - - * leave the cleanup for kbase_fence_wait_callback - */ - return; - } + lockdep_assert_held(&katom->kctx->jctx.lock); - /* Take responsibility of completion */ - kbase_fence_dep_count_set(katom, -1); + if (katom->dma_fence.fence_cb_added) { + if (!dma_fence_remove_callback(katom->dma_fence.fence_in, + &katom->dma_fence.fence_cb)) { + /* The callback is already removed so leave the cleanup + * for kbase_fence_wait_callback. + */ + return; + } + } else { + struct kbase_sync_fence_info info; + + kbase_sync_fence_in_info_get(katom, &info); + dev_warn(katom->kctx->kbdev->dev, + "Callback was not added earlier for fence %s of ctx:%d_%d atom:%d", + info.name, katom->kctx->tgid, katom->kctx->id, + kbase_jd_atom_id(katom->kctx, katom)); + } /* Wait was cancelled - zap the atoms */ katom->event_code = BASE_JD_EVENT_JOB_CANCELLED; @@ -273,8 +292,29 @@ void kbase_sync_fence_out_remove(struct kbase_jd_atom *katom) void kbase_sync_fence_in_remove(struct kbase_jd_atom *katom) { - kbase_fence_free_callbacks(katom); + lockdep_assert_held(&katom->kctx->jctx.lock); + + if (katom->dma_fence.fence_cb_added) { + bool removed = dma_fence_remove_callback(katom->dma_fence.fence_in, + &katom->dma_fence.fence_cb); + + /* Here it is expected that the callback should have already been removed + * previously either by kbase_sync_fence_in_cancel_wait() or when the fence + * was signaled and kbase_sync_fence_wait_worker() was called. 
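The warnings and early returns in these hunks rely on a few dma-fence API behaviours: dma_fence_add_callback() returns -ENOENT, without installing the callback, if the fence is already signalled; dma_fence_get_status() distinguishes pending (0), signalled without error (1) and signalled with error (negative); and dma_fence_remove_callback() returns true only if the callback is removed before it has run. A condensed illustration, where my_callback and the example_* helpers are placeholder names rather than driver code:

    #include <linux/dma-fence.h>

    static void my_callback(struct dma_fence *fence, struct dma_fence_cb *cb)
    {
            /* Placeholder: the real driver defers completion work to a workqueue here. */
    }

    static int example_wait_on_fence(struct dma_fence *fence, struct dma_fence_cb *cb)
    {
            int err = dma_fence_add_callback(fence, cb, my_callback);

            if (err == -ENOENT) {
                    /* Already signalled: the callback was not installed and will
                     * never run, so check how the fence completed right away.
                     */
                    return dma_fence_get_status(fence); /* 1 = OK, < 0 = error */
            }
            if (err)
                    return err; /* callback could not be installed */

            return 0; /* callback installed; completion handled asynchronously */
    }

    static bool example_cancel_wait(struct dma_fence *fence, struct dma_fence_cb *cb)
    {
            /* True means the callback was removed before it ran, so the caller
             * owns cleanup; false means the callback has run (or is running).
             */
            return dma_fence_remove_callback(fence, cb);
    }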
+ */ + if (removed) { + struct kbase_sync_fence_info info; + + kbase_sync_fence_in_info_get(katom, &info); + dev_warn(katom->kctx->kbdev->dev, + "Callback was not removed earlier for fence %s of ctx:%d_%d atom:%d", + info.name, katom->kctx->tgid, katom->kctx->id, + kbase_jd_atom_id(katom->kctx, katom)); + } + } + kbase_fence_in_remove(katom); + katom->dma_fence.fence_cb_added = false; } #endif /* !MALI_USE_CSF */ @@ -288,7 +328,7 @@ void kbase_sync_fence_info_get(struct dma_fence *fence, { info->fence = fence; - /* translate into CONFIG_SYNC status: + /* Translate into the following status, with support for error handling: * < 0 : error * 0 : active * 1 : signaled diff --git a/drivers/gpu/arm/bifrost/mali_kbase_vinstr.c b/drivers/gpu/arm/bifrost/mali_kbase_vinstr.c index abcf53041069..853c89796d44 100644 --- a/drivers/gpu/arm/bifrost/mali_kbase_vinstr.c +++ b/drivers/gpu/arm/bifrost/mali_kbase_vinstr.c @@ -20,11 +20,11 @@ */ #include "mali_kbase_vinstr.h" -#include "mali_kbase_hwcnt_virtualizer.h" -#include "mali_kbase_hwcnt_types.h" +#include "hwcnt/mali_kbase_hwcnt_virtualizer.h" +#include "hwcnt/mali_kbase_hwcnt_types.h" #include -#include "mali_kbase_hwcnt_gpu.h" -#include "mali_kbase_hwcnt_gpu_narrow.h" +#include "hwcnt/mali_kbase_hwcnt_gpu.h" +#include "hwcnt/mali_kbase_hwcnt_gpu_narrow.h" #include #include "mali_malisw.h" #include "mali_kbase_debug.h" @@ -1034,24 +1034,25 @@ static long kbasep_vinstr_hwcnt_reader_ioctl( * @filp: Non-NULL pointer to file structure. * @wait: Non-NULL pointer to poll table. * - * Return: POLLIN if data can be read without blocking, 0 if data can not be - * read without blocking, else error code. + * Return: EPOLLIN | EPOLLRDNORM if data can be read without blocking, 0 if + * data can not be read without blocking, else EPOLLHUP | EPOLLERR. */ static __poll_t kbasep_vinstr_hwcnt_reader_poll(struct file *filp, poll_table *wait) { struct kbase_vinstr_client *cli; if (!filp || !wait) - return (__poll_t)-EINVAL; + return EPOLLHUP | EPOLLERR; cli = filp->private_data; if (!cli) - return (__poll_t)-EINVAL; + return EPOLLHUP | EPOLLERR; poll_wait(filp, &cli->waitq, wait); if (kbasep_vinstr_hwcnt_reader_buffer_ready(cli)) - return POLLIN; - return 0; + return EPOLLIN | EPOLLRDNORM; + + return (__poll_t)0; } /** diff --git a/drivers/gpu/arm/bifrost/mali_malisw.h b/drivers/gpu/arm/bifrost/mali_malisw.h index d25c29fda63b..d9db189e8684 100644 --- a/drivers/gpu/arm/bifrost/mali_malisw.h +++ b/drivers/gpu/arm/bifrost/mali_malisw.h @@ -97,16 +97,12 @@ */ #define CSTD_STR2(x) CSTD_STR1(x) -/* LINUX_VERSION_CODE < 5.4 */ -#if (KERNEL_VERSION(5, 4, 0) > LINUX_VERSION_CODE) -#if defined(GCC_VERSION) && GCC_VERSION >= 70000 + #ifndef fallthrough + #define fallthrough __fallthrough + #endif /* fallthrough */ + #ifndef __fallthrough #define __fallthrough __attribute__((fallthrough)) #endif /* __fallthrough */ -#define fallthrough __fallthrough -#else -#define fallthrough CSTD_NOP(...) 
/* fallthrough */ -#endif /* GCC_VERSION >= 70000 */ -#endif /* KERNEL_VERSION(5, 4, 0) */ #endif /* _MALISW_H_ */ diff --git a/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_csf.c b/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_csf.c index 04f5cdf42b84..db2086079c14 100644 --- a/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_csf.c +++ b/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_csf.c @@ -122,6 +122,8 @@ void kbase_mmu_report_mcu_as_fault_and_reset(struct kbase_device *kbdev, access_type, kbase_gpu_access_type_name(fault->status), source_id); + kbase_debug_csf_fault_notify(kbdev, NULL, DF_GPU_PAGE_FAULT); + /* Report MMU fault for all address spaces (except MCU_AS_NR) */ for (as_no = 1; as_no < kbdev->nr_hw_address_spaces; as_no++) submit_work_pagefault(kbdev, as_no, fault); @@ -188,6 +190,7 @@ void kbase_gpu_report_bus_fault_and_kill(struct kbase_context *kctx, kbase_reg_write(kbdev, GPU_CONTROL_REG(GPU_COMMAND), GPU_COMMAND_CLEAR_FAULT); spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } /* @@ -249,6 +252,7 @@ void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, mutex_unlock(&kbdev->mmu_hw_mutex); /* AS transaction end */ + kbase_debug_csf_fault_notify(kbdev, kctx, DF_GPU_PAGE_FAULT); /* Switching to UNMAPPED mode above would have enabled the firmware to * recover from the fault (if the memory access was made by firmware) * and it can then respond to CSG termination requests to be sent now. @@ -262,6 +266,7 @@ void kbase_mmu_report_fault_and_kill(struct kbase_context *kctx, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); kbase_mmu_hw_enable_fault(kbdev, as, KBASE_MMU_FAULT_TYPE_PAGE_UNEXPECTED); + } /** diff --git a/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_jm.c b/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_jm.c index 3130b332dec2..22786f0748ce 100644 --- a/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_jm.c +++ b/drivers/gpu/arm/bifrost/mmu/backend/mali_kbase_mmu_jm.c @@ -94,6 +94,7 @@ void kbase_gpu_report_bus_fault_and_kill(struct kbase_context *kctx, KBASE_MMU_FAULT_TYPE_BUS_UNEXPECTED); kbase_mmu_hw_enable_fault(kbdev, as, KBASE_MMU_FAULT_TYPE_BUS_UNEXPECTED); + } /* diff --git a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.c b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.c index fc7c8923ab07..8f261d439909 100644 --- a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.c +++ b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.c @@ -110,7 +110,8 @@ static void mmu_hw_operation_end(struct kbase_device *kbdev) /** * mmu_flush_cache_on_gpu_ctrl() - Check if cache flush needs to be done - * through GPU_CONTROL interface + * through GPU_CONTROL interface. + * * @kbdev: kbase device to check GPU model ID on. * * This function returns whether a cache flush for page table update should @@ -137,6 +138,42 @@ static bool mmu_flush_cache_on_gpu_ctrl(struct kbase_device *kbdev) * * Issue a cache flush physical range command. 
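 *
 * As implemented below, KBASE_MMU_OP_FLUSH_PT is issued as
 * GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2 (L2 clean + invalidate only) and
 * KBASE_MMU_OP_FLUSH_MEM as GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2_LSC
 * (L2 plus load/store caches); any other operation type is rejected with a
 * warning. A flush that fails to complete is treated as a GPU hang and
 * triggers a soft-reset.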
*/ +#if MALI_USE_CSF +static void mmu_flush_pa_range(struct kbase_device *kbdev, phys_addr_t phys, size_t nr_bytes, + enum kbase_mmu_op_type op) +{ + u32 flush_op; + int ret; + + if (WARN_ON(kbdev == NULL)) + return; + + lockdep_assert_held(&kbdev->hwaccess_lock); + + /* Translate operation to command */ + if (op == KBASE_MMU_OP_FLUSH_PT) { + flush_op = GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2; + } else if (op == KBASE_MMU_OP_FLUSH_MEM) { + flush_op = GPU_COMMAND_FLUSH_PA_RANGE_CLN_INV_L2_LSC; + } else { + dev_warn(kbdev->dev, "Invalid flush request (op = %d)\n", op); + return; + } + + ret = kbase_gpu_cache_flush_pa_range_and_busy_wait(kbdev, phys, nr_bytes, flush_op); + + if (ret) { + /* Flush failed to complete, assume the GPU has hung and + * perform a reset to recover + */ + dev_err(kbdev->dev, + "Flush for physical address range did not complete. Issuing GPU soft-reset to recover\n"); + + if (kbase_prepare_to_reset_gpu(kbdev, RESET_FLAGS_HWC_UNRECOVERABLE_ERROR)) + kbase_reset_gpu(kbdev); + } +} +#endif /** * mmu_invalidate() - Perform an invalidate operation on MMU caches. @@ -177,39 +214,15 @@ static void mmu_invalidate(struct kbase_device *kbdev, struct kbase_context *kct static void mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as *as, const struct kbase_mmu_hw_op_param *op_param) { - int err; - bool gpu_powered; + int err = 0; unsigned long flags; - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - gpu_powered = kbdev->pm.backend.gpu_powered; - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - - /* GPU is off so there's no need to perform flush/invalidate. - * But even if GPU is not actually powered down, after gpu_powered flag - * was set to false, it is still safe to skip the flush/invalidate. - * The TLB invalidation will anyways be performed due to AS_COMMAND_UPDATE - * which is sent when address spaces are restored after gpu_powered flag - * is set to true. Flushing of L2 cache is certainly not required as L2 - * cache is definitely off if gpu_powered is false. - */ - if (!gpu_powered) - return; - - if (kbase_pm_context_active_handle_suspend(kbdev, - KBASE_PM_SUSPEND_HANDLER_DONT_REACTIVATE)) { - /* GPU has just been powered off due to system suspend. - * So again, no need to perform flush/invalidate. - */ - return; - } - /* AS transaction begin */ mutex_lock(&kbdev->mmu_hw_mutex); + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); - mmu_hw_operation_begin(kbdev); - err = kbase_mmu_hw_do_flush(kbdev, as, op_param); - mmu_hw_operation_end(kbdev); + if (kbdev->pm.backend.gpu_powered) + err = kbase_mmu_hw_do_flush_locked(kbdev, as, op_param); if (err) { /* Flush failed to complete, assume the GPU has hung and @@ -222,10 +235,9 @@ static void mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as kbase_reset_gpu(kbdev); } + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); mutex_unlock(&kbdev->mmu_hw_mutex); /* AS transaction end */ - - kbase_pm_context_idle(kbdev); } /** @@ -246,9 +258,6 @@ static void mmu_flush_invalidate_as(struct kbase_device *kbdev, struct kbase_as * If operation is set to KBASE_MMU_OP_FLUSH_MEM then this function will issue * a cache flush + invalidate to the L2 and GPU Load/Store caches as well as * invalidating the TLBs. - * - * If operation is set to KBASE_MMU_OP_UNLOCK then this function will only - * invalidate the MMU caches and TLBs. 
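 *
 * A typical call site fills in a struct kbase_mmu_hw_op_param first, for
 * example (values illustrative):
 *
 *   struct kbase_mmu_hw_op_param op_param = {
 *           .vpfn = vpfn,
 *           .nr = nr,
 *           .op = KBASE_MMU_OP_FLUSH_PT,
 *           .kctx_id = kctx ? kctx->id : 0xFFFFFFFF,
 *           .mmu_sync_info = mmu_sync_info,
 *           .flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds),
 *   };
 *   mmu_flush_invalidate(kbdev, kctx, kctx->as_nr, &op_param);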
*/ static void mmu_flush_invalidate(struct kbase_device *kbdev, struct kbase_context *kctx, int as_nr, const struct kbase_mmu_hw_op_param *op_param) @@ -327,6 +336,31 @@ static void mmu_flush_invalidate_on_gpu_ctrl(struct kbase_device *kbdev, struct mutex_unlock(&kbdev->mmu_hw_mutex); } +static void kbase_mmu_sync_pgd_gpu(struct kbase_device *kbdev, struct kbase_context *kctx, + phys_addr_t phys, size_t size, + enum kbase_mmu_op_type flush_op) +{ +#if MALI_USE_CSF + unsigned long irq_flags; + + spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); + if (mmu_flush_cache_on_gpu_ctrl(kbdev) && (flush_op != KBASE_MMU_OP_NONE) && + kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0)) + mmu_flush_pa_range(kbdev, phys, size, KBASE_MMU_OP_FLUSH_PT); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); +#endif +} + +static void kbase_mmu_sync_pgd_cpu(struct kbase_device *kbdev, dma_addr_t handle, size_t size) +{ + /* In non-coherent system, ensure the GPU can read + * the pages from memory + */ + if (kbdev->system_coherency == COHERENCY_NONE) + dma_sync_single_for_device(kbdev->dev, handle, size, + DMA_TO_DEVICE); +} + /** * kbase_mmu_sync_pgd() - sync page directory to memory when needed. * @kbdev: Device pointer. @@ -350,13 +384,9 @@ static void kbase_mmu_sync_pgd(struct kbase_device *kbdev, struct kbase_context phys_addr_t phys, dma_addr_t handle, size_t size, enum kbase_mmu_op_type flush_op) { - /* In non-coherent system, ensure the GPU can read - * the pages from memory - */ - if (kbdev->system_coherency == COHERENCY_NONE) - dma_sync_single_for_device(kbdev->dev, handle, size, - DMA_TO_DEVICE); + kbase_mmu_sync_pgd_cpu(kbdev, handle, size); + kbase_mmu_sync_pgd_gpu(kbdev, kctx, phys, size, flush_op); } /* @@ -383,22 +413,75 @@ static int kbase_mmu_update_pages_no_flush(struct kbase_context *kctx, u64 vpfn, * @level: The level of MMU page table. * @flush_op: The type of MMU flush operation to perform. * @dirty_pgds: Flags to track every level where a PGD has been updated. + * @free_pgds_list: Linked list of the page directory pages to free. */ static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, phys_addr_t *pgds, u64 vpfn, int level, - enum kbase_mmu_op_type flush_op, u64 *dirty_pgds); + enum kbase_mmu_op_type flush_op, u64 *dirty_pgds, + struct list_head *free_pgds_list); /** * kbase_mmu_free_pgd() - Free memory of the page directory * * @kbdev: Device pointer. * @mmut: GPU MMU page table. * @pgd: Physical address of page directory to be freed. - * @dirty: Flag to indicate whether the page may be dirty in the cache. + * + * This function is supposed to be called with mmu_lock held and after + * ensuring that GPU won't be able to access the page. */ -static void kbase_mmu_free_pgd(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, phys_addr_t pgd, - bool dirty); +static void kbase_mmu_free_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, + phys_addr_t pgd) +{ + struct page *p; + + lockdep_assert_held(&mmut->mmu_lock); + + p = pfn_to_page(PFN_DOWN(pgd)); + + kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], p, true); + + atomic_sub(1, &kbdev->memdev.used_pages); + + /* If MMU tables belong to a context then pages will have been accounted + * against it, so we must decrement the usage counts here. 
+ */ + if (mmut->kctx) { + kbase_process_page_usage_dec(mmut->kctx, 1); + atomic_sub(1, &mmut->kctx->used_pages); + } + + kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); +} + +/** + * kbase_mmu_free_pgds_list() - Free the PGD pages present in the list + * + * @kbdev: Device pointer. + * @mmut: GPU MMU page table. + * @free_pgds_list: Linked list of the page directory pages to free. + * + * This function will call kbase_mmu_free_pgd() on each page directory page + * present in the @free_pgds_list. + * + * The function is supposed to be called after the GPU cache and MMU TLB has + * been invalidated post the teardown loop. + */ +static void kbase_mmu_free_pgds_list(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, + struct list_head *free_pgds_list) +{ + struct page *page, *next_page; + + mutex_lock(&mmut->mmu_lock); + + list_for_each_entry_safe(page, next_page, free_pgds_list, lru) { + list_del_init(&page->lru); + kbase_mmu_free_pgd(kbdev, mmut, page_to_phys(page)); + } + + mutex_unlock(&mmut->mmu_lock); +} + /** * reg_grow_calc_extra_pages() - Calculate the number of backed pages to add to * a region on a GPU page fault @@ -484,8 +567,6 @@ static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev, u64 start_pfn, size_t nr, u32 kctx_id, u64 dirty_pgds) { - int err; - /* Calls to this function are inherently synchronous, with respect to * MMU operations. */ @@ -509,12 +590,11 @@ static void kbase_gpu_mmu_handle_write_faulting_as(struct kbase_device *kbdev, spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); op_param.flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds); - err = kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, faulting_as, - &op_param); + kbase_mmu_hw_do_flush_on_gpu_ctrl(kbdev, faulting_as, &op_param); spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); } else { mmu_hw_operation_begin(kbdev); - err = kbase_mmu_hw_do_flush(kbdev, faulting_as, &op_param); + kbase_mmu_hw_do_flush(kbdev, faulting_as, &op_param); mmu_hw_operation_end(kbdev); } @@ -552,7 +632,6 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, struct tagged_addr *fault_phys_addr; struct kbase_fault *fault; u64 fault_pfn, pfn_offset; - int ret; int as_no; u64 dirty_pgds = 0; @@ -613,8 +692,8 @@ static void kbase_gpu_mmu_handle_write_fault(struct kbase_context *kctx, } /* Now make this faulting page writable to GPU. */ - ret = kbase_mmu_update_pages_no_flush(kctx, fault_pfn, fault_phys_addr, 1, region->flags, - region->gpu_alloc->group_id, &dirty_pgds); + kbase_mmu_update_pages_no_flush(kctx, fault_pfn, fault_phys_addr, 1, region->flags, + region->gpu_alloc->group_id, &dirty_pgds); kbase_gpu_mmu_handle_write_faulting_as(kbdev, faulting_as, fault_pfn, 1, kctx->id, dirty_pgds); @@ -648,31 +727,68 @@ static void kbase_gpu_mmu_handle_permission_fault(struct kbase_context *kctx, } #endif -#define MAX_POOL_LEVEL 2 +/** + * estimate_pool_space_required - Determine how much a pool should be grown by to support a future + * allocation + * @pool: The memory pool to check, including its linked pools + * @pages_required: Number of 4KiB pages require for the pool to support a future allocation + * + * The value returned is accounting for the size of @pool and the size of each memory pool linked to + * @pool. Hence, the caller should use @pool and (if not already satisfied) all its linked pools to + * allocate from. + * + * Note: this is only an estimate, because even during the calculation the memory pool(s) involved + * can be updated to be larger or smaller. 
Hence, the result is only a guide as to whether an + * allocation could succeed, or an estimate of the correct amount to grow the pool by. The caller + * should keep attempting an allocation and then re-growing with a new value queried form this + * function until the allocation succeeds. + * + * Return: an estimate of the amount of extra 4KiB pages in @pool that are required to satisfy an + * allocation, or 0 if @pool (including its linked pools) is likely to already satisfy the + * allocation. + */ +static size_t estimate_pool_space_required(struct kbase_mem_pool *pool, const size_t pages_required) +{ + size_t pages_still_required; + + for (pages_still_required = pages_required; pool != NULL && pages_still_required; + pool = pool->next_pool) { + size_t pool_size_4k; + + kbase_mem_pool_lock(pool); + + pool_size_4k = kbase_mem_pool_size(pool) << pool->order; + if (pool_size_4k >= pages_still_required) + pages_still_required = 0; + else + pages_still_required -= pool_size_4k; + + kbase_mem_pool_unlock(pool); + } + return pages_still_required; +} /** * page_fault_try_alloc - Try to allocate memory from a context pool * @kctx: Context pointer * @region: Region to grow - * @new_pages: Number of 4 kB pages to allocate - * @pages_to_grow: Pointer to variable to store number of outstanding pages on - * failure. This can be either 4 kB or 2 MB pages, depending on - * the number of pages requested. - * @grow_2mb_pool: Pointer to variable to store which pool needs to grow - true - * for 2 MB, false for 4 kB. + * @new_pages: Number of 4 KiB pages to allocate + * @pages_to_grow: Pointer to variable to store number of outstanding pages on failure. This can be + * either 4 KiB or 2 MiB pages, depending on the number of pages requested. + * @grow_2mb_pool: Pointer to variable to store which pool needs to grow - true for 2 MiB, false for + * 4 KiB. * @prealloc_sas: Pointer to kbase_sub_alloc structures * - * This function will try to allocate as many pages as possible from the context - * pool, then if required will try to allocate the remaining pages from the - * device pool. + * This function will try to allocate as many pages as possible from the context pool, then if + * required will try to allocate the remaining pages from the device pool. * - * This function will not allocate any new memory beyond that is already - * present in the context or device pools. This is because it is intended to be - * called with the vm_lock held, which could cause recursive locking if the - * allocation caused the out-of-memory killer to run. + * This function will not allocate any new memory beyond that is already present in the context or + * device pools. This is because it is intended to be called whilst the thread has acquired the + * region list lock with kbase_gpu_vm_lock(), and a large enough memory allocation whilst that is + * held could invoke the OoM killer and cause an effective deadlock with kbase_cpu_vm_close(). * - * If 2 MB pages are enabled and new_pages is >= 2 MB then pages_to_grow will be - * a count of 2 MB pages, otherwise it will be a count of 4 kB pages. + * If 2 MiB pages are enabled and new_pages is >= 2 MiB then pages_to_grow will be a count of 2 MiB + * pages, otherwise it will be a count of 4 KiB pages. 
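 *
 * For example (illustrative numbers): with 2 MiB pages enabled, a fault needing
 * new_pages = 1024 (4 MiB) reports any shortfall through *pages_to_grow as a
 * count of 2 MiB pages, whereas one needing new_pages = 16 (64 KiB) reports it
 * as a count of 4 KiB pages.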
* * Return: true if successful, false on failure */ @@ -681,13 +797,15 @@ static bool page_fault_try_alloc(struct kbase_context *kctx, int *pages_to_grow, bool *grow_2mb_pool, struct kbase_sub_alloc **prealloc_sas) { - struct tagged_addr *gpu_pages[MAX_POOL_LEVEL] = {NULL}; - struct tagged_addr *cpu_pages[MAX_POOL_LEVEL] = {NULL}; - size_t pages_alloced[MAX_POOL_LEVEL] = {0}; + size_t total_gpu_pages_alloced = 0; + size_t total_cpu_pages_alloced = 0; struct kbase_mem_pool *pool, *root_pool; - int pool_level = 0; bool alloc_failed = false; size_t pages_still_required; + size_t total_mempools_free_4k = 0; + + lockdep_assert_held(&kctx->reg_lock); + lockdep_assert_held(&kctx->mem_partials_lock); if (WARN_ON(region->gpu_alloc->group_id >= MEMORY_GROUP_MANAGER_NR_GROUPS)) { @@ -711,27 +829,10 @@ static bool page_fault_try_alloc(struct kbase_context *kctx, if (region->gpu_alloc != region->cpu_alloc) new_pages *= 2; - pages_still_required = new_pages; - /* Determine how many pages are in the pools before trying to allocate. * Don't attempt to allocate & free if the allocation can't succeed. */ - for (pool = root_pool; pool != NULL; pool = pool->next_pool) { - size_t pool_size_4k; - - kbase_mem_pool_lock(pool); - - pool_size_4k = kbase_mem_pool_size(pool) << pool->order; - if (pool_size_4k >= pages_still_required) - pages_still_required = 0; - else - pages_still_required -= pool_size_4k; - - kbase_mem_pool_unlock(pool); - - if (!pages_still_required) - break; - } + pages_still_required = estimate_pool_space_required(root_pool, new_pages); if (pages_still_required) { /* Insufficient pages in pools. Don't try to allocate - just @@ -742,11 +843,11 @@ static bool page_fault_try_alloc(struct kbase_context *kctx, return false; } - /* Since we've dropped the pool locks, the amount of memory in the pools - * may change between the above check and the actual allocation. + /* Since we're not holding any of the mempool locks, the amount of memory in the pools may + * change between the above estimate and the actual allocation. 
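+	 * For example, another thread may allocate from or return pages to the
+	 * same pools in parallel, which is why the loop below re-reads each
+	 * pool's size under the pool lock before allocating from it.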
*/ - pool = root_pool; - for (pool_level = 0; pool_level < MAX_POOL_LEVEL; pool_level++) { + pages_still_required = new_pages; + for (pool = root_pool; pool != NULL && pages_still_required; pool = pool->next_pool) { size_t pool_size_4k; size_t pages_to_alloc_4k; size_t pages_to_alloc_4k_per_alloc; @@ -755,93 +856,91 @@ static bool page_fault_try_alloc(struct kbase_context *kctx, /* Allocate as much as possible from this pool*/ pool_size_4k = kbase_mem_pool_size(pool) << pool->order; - pages_to_alloc_4k = MIN(new_pages, pool_size_4k); + total_mempools_free_4k += pool_size_4k; + pages_to_alloc_4k = MIN(pages_still_required, pool_size_4k); if (region->gpu_alloc == region->cpu_alloc) pages_to_alloc_4k_per_alloc = pages_to_alloc_4k; else pages_to_alloc_4k_per_alloc = pages_to_alloc_4k >> 1; - pages_alloced[pool_level] = pages_to_alloc_4k; if (pages_to_alloc_4k) { - gpu_pages[pool_level] = - kbase_alloc_phy_pages_helper_locked( - region->gpu_alloc, pool, - pages_to_alloc_4k_per_alloc, - &prealloc_sas[0]); + struct tagged_addr *gpu_pages = + kbase_alloc_phy_pages_helper_locked(region->gpu_alloc, pool, + pages_to_alloc_4k_per_alloc, + &prealloc_sas[0]); - if (!gpu_pages[pool_level]) { + if (!gpu_pages) alloc_failed = true; - } else if (region->gpu_alloc != region->cpu_alloc) { - cpu_pages[pool_level] = - kbase_alloc_phy_pages_helper_locked( - region->cpu_alloc, pool, - pages_to_alloc_4k_per_alloc, - &prealloc_sas[1]); + else + total_gpu_pages_alloced += pages_to_alloc_4k_per_alloc; - if (!cpu_pages[pool_level]) + if (!alloc_failed && region->gpu_alloc != region->cpu_alloc) { + struct tagged_addr *cpu_pages = kbase_alloc_phy_pages_helper_locked( + region->cpu_alloc, pool, pages_to_alloc_4k_per_alloc, + &prealloc_sas[1]); + + if (!cpu_pages) alloc_failed = true; + else + total_cpu_pages_alloced += pages_to_alloc_4k_per_alloc; } } kbase_mem_pool_unlock(pool); if (alloc_failed) { - WARN_ON(!new_pages); - WARN_ON(pages_to_alloc_4k >= new_pages); - WARN_ON(pages_to_alloc_4k_per_alloc >= new_pages); + WARN_ON(!pages_still_required); + WARN_ON(pages_to_alloc_4k >= pages_still_required); + WARN_ON(pages_to_alloc_4k_per_alloc >= pages_still_required); break; } - new_pages -= pages_to_alloc_4k; - - if (!new_pages) - break; - - pool = pool->next_pool; - if (!pool) - break; + pages_still_required -= pages_to_alloc_4k; } - if (new_pages) { - /* Allocation was unsuccessful */ - int max_pool_level = pool_level; - - pool = root_pool; - - /* Free memory allocated so far */ - for (pool_level = 0; pool_level <= max_pool_level; - pool_level++) { - kbase_mem_pool_lock(pool); - - if (region->gpu_alloc != region->cpu_alloc) { - if (pages_alloced[pool_level] && - cpu_pages[pool_level]) - kbase_free_phy_pages_helper_locked( - region->cpu_alloc, - pool, cpu_pages[pool_level], - pages_alloced[pool_level]); - } - - if (pages_alloced[pool_level] && gpu_pages[pool_level]) - kbase_free_phy_pages_helper_locked( - region->gpu_alloc, - pool, gpu_pages[pool_level], - pages_alloced[pool_level]); - - kbase_mem_pool_unlock(pool); - - pool = pool->next_pool; - } - - /* - * If the allocation failed despite there being enough memory in - * the pool, then just fail. Otherwise, try to grow the memory - * pool. + if (pages_still_required) { + /* Allocation was unsuccessful. 
We have dropped the mem_pool lock after allocation, + * so must in any case use kbase_free_phy_pages_helper() rather than + * kbase_free_phy_pages_helper_locked() */ - if (alloc_failed) + if (total_gpu_pages_alloced > 0) + kbase_free_phy_pages_helper(region->gpu_alloc, total_gpu_pages_alloced); + if (region->gpu_alloc != region->cpu_alloc && total_cpu_pages_alloced > 0) + kbase_free_phy_pages_helper(region->cpu_alloc, total_cpu_pages_alloced); + + if (alloc_failed) { + /* Note that in allocating from the above memory pools, we always ensure + * never to request more than is available in each pool with the pool's + * lock held. Hence failing to allocate in such situations would be unusual + * and we should cancel the growth instead (as re-growing the memory pool + * might not fix the situation) + */ + dev_warn( + kctx->kbdev->dev, + "Page allocation failure of %zu pages: managed %zu pages, mempool (inc linked pools) had %zu pages available\n", + new_pages, total_gpu_pages_alloced + total_cpu_pages_alloced, + total_mempools_free_4k); *pages_to_grow = 0; - else - *pages_to_grow = new_pages; + } else { + /* Tell the caller to try to grow the memory pool + * + * Freeing pages above may have spilled or returned them to the OS, so we + * have to take into account how many are still in the pool before giving a + * new estimate for growth required of the pool. We can just re-estimate a + * new value. + */ + pages_still_required = estimate_pool_space_required(root_pool, new_pages); + if (pages_still_required) { + *pages_to_grow = pages_still_required; + } else { + /* It's possible another thread could've grown the pool to be just + * big enough after we rolled back the allocation. Request at least + * one more page to ensure the caller doesn't fail the growth by + * conflating it with the alloc_failed case above + */ + *pages_to_grow = 1u; + } + } return false; } @@ -975,20 +1074,24 @@ void kbase_mmu_page_fault_worker(struct work_struct *data) goto fault_done; } +page_fault_retry: #ifdef CONFIG_MALI_2MB_ALLOC - /* Preallocate memory for the sub-allocation structs if necessary */ + /* Preallocate (or re-allocate) memory for the sub-allocation structs if necessary */ for (i = 0; i != ARRAY_SIZE(prealloc_sas); ++i) { - prealloc_sas[i] = kmalloc(sizeof(*prealloc_sas[i]), GFP_KERNEL); if (!prealloc_sas[i]) { - kbase_mmu_report_fault_and_kill(kctx, faulting_as, + prealloc_sas[i] = kmalloc(sizeof(*prealloc_sas[i]), GFP_KERNEL); + + if (!prealloc_sas[i]) { + kbase_mmu_report_fault_and_kill( + kctx, faulting_as, "Failed pre-allocating memory for sub-allocations' metadata", fault); - goto fault_done; + goto fault_done; + } } } #endif /* CONFIG_MALI_2MB_ALLOC */ -page_fault_retry: /* so we have a translation fault, * let's see if it is for growable memory */ @@ -1372,13 +1475,12 @@ static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev, struct kbase_mmu_table *mmut) { u64 *page; - int i; struct page *p; phys_addr_t pgd; p = kbase_mem_pool_alloc(&kbdev->mem_pools.small[mmut->group_id]); if (!p) - return 0; + return KBASE_MMU_INVALID_PGD_ADDRESS; page = kmap(p); if (page == NULL) @@ -1406,13 +1508,12 @@ static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev, kbase_trace_gpu_mem_usage_inc(kbdev, mmut->kctx, 1); - for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) - kbdev->mmu_mode->entry_invalidate(&page[i]); + kbdev->mmu_mode->entries_invalidate(page, KBASE_MMU_PAGE_ENTRIES); - /* MMU cache flush strategy is NONE because this page is newly created, therefore - * there is no content to clean or 
invalidate in the GPU caches. + /* As this page is newly created, therefore there is no content to + * clean or invalidate in the GPU caches. */ - kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd, kbase_dma_addr(p), PAGE_SIZE, KBASE_MMU_OP_NONE); + kbase_mmu_sync_pgd_cpu(kbdev, kbase_dma_addr(p), PAGE_SIZE); kunmap(p); return pgd; @@ -1420,7 +1521,7 @@ static phys_addr_t kbase_mmu_alloc_pgd(struct kbase_device *kbdev, alloc_free: kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], p, false); - return 0; + return KBASE_MMU_INVALID_PGD_ADDRESS; } /* Given PGD PFN for level N, return PGD PFN for level N+1, allocating the @@ -1452,16 +1553,12 @@ static int mmu_get_next_pgd(struct kbase_device *kbdev, struct kbase_mmu_table * return -EINVAL; } - target_pgd = kbdev->mmu_mode->pte_to_phy_addr( - kbdev->mgm_dev->ops.mgm_pte_to_original_pte( - kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[vpfn])); - - if (!target_pgd) { - enum kbase_mmu_op_type flush_op = KBASE_MMU_OP_NONE; + if (!kbdev->mmu_mode->pte_is_valid(page[vpfn], level)) { unsigned int current_valid_entries; u64 managed_pte; + target_pgd = kbase_mmu_alloc_pgd(kbdev, mmut); - if (!target_pgd) { + if (target_pgd == KBASE_MMU_INVALID_PGD_ADDRESS) { dev_dbg(kbdev->dev, "%s: kbase_mmu_alloc_pgd failure\n", __func__); kunmap(p); @@ -1477,22 +1574,24 @@ static int mmu_get_next_pgd(struct kbase_device *kbdev, struct kbase_mmu_table * /* Rely on the caller to update the address space flags. */ if (newly_created_pgd && !*newly_created_pgd) { *newly_created_pgd = true; - /* If code reaches here we know parent PGD of target PGD was - * not newly created and should be flushed. - */ - flush_op = KBASE_MMU_OP_FLUSH_PT; - if (dirty_pgds) *dirty_pgds |= 1ULL << level; } - /* MMU cache flush strategy is FLUSH_PT because a new entry is added - * to an existing PGD which may be stored in GPU caches and needs a - * "clean" operation. An "invalidation" operation is not required here - * as this entry points to a new page and cannot be present in GPU - * caches. + /* A new valid entry is added to an existing PGD. Perform the + * invalidate operation for GPU cache as it could be having a + * cacheline that contains the entry (in an invalid form). + * Even if the parent PGD was newly created, invalidation of + * GPU cache is still needed. For explanation, please refer + * the comment in kbase_mmu_insert_pages_no_flush(). 
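+	 * An example of the interleaving that the flush below guards against
+	 * (illustrative):
+	 *
+	 *   CPU: writes the ATEs into the new bottom level page (CPU cache only)
+	 *   CPU: writes the parent PGD entry pointing at that page
+	 *   GPU: accesses the mapped VA range and pulls the stale (invalid)
+	 *        entries into the L2 cache
+	 *   CPU: cleans its cache, but the GPU keeps serving the stale L2 line
+	 *        and faults repeatedly until that line is evicted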
*/ - kbase_mmu_sync_pgd(kbdev, mmut->kctx, *pgd, kbase_dma_addr(p), PAGE_SIZE, flush_op); + kbase_mmu_sync_pgd(kbdev, mmut->kctx, *pgd + (vpfn * sizeof(u64)), + kbase_dma_addr(p) + (vpfn * sizeof(u64)), sizeof(u64), + KBASE_MMU_OP_FLUSH_PT); + } else { + target_pgd = kbdev->mmu_mode->pte_to_phy_addr( + kbdev->mgm_dev->ops.mgm_pte_to_original_pte( + kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[vpfn])); } kunmap(p); @@ -1540,9 +1639,9 @@ static int mmu_get_bottom_pgd(struct kbase_device *kbdev, struct kbase_mmu_table static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 from_vpfn, - u64 to_vpfn, u64 *dirty_pgds) + u64 to_vpfn, u64 *dirty_pgds, + struct list_head *free_pgds_list) { - phys_addr_t pgd; u64 vpfn = from_vpfn; struct kbase_mmu_mode const *mmu_mode; @@ -1555,7 +1654,6 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, mmu_mode = kbdev->mmu_mode; while (vpfn < to_vpfn) { - unsigned int i; unsigned int idx = vpfn & 0x1FF; unsigned int count = KBASE_MMU_PAGE_ENTRIES - idx; unsigned int pcount = 0; @@ -1563,6 +1661,8 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, int level; u64 *page; phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; + phys_addr_t pgd = mmut->pgd; + struct page *p = phys_to_page(pgd); register unsigned int num_of_valid_entries; @@ -1570,18 +1670,17 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, count = left; /* need to check if this is a 2MB page or a 4kB */ - pgd = mmut->pgd; - for (level = MIDGARD_MMU_TOPLEVEL; level <= MIDGARD_MMU_BOTTOMLEVEL; level++) { idx = (vpfn >> ((3 - level) * 9)) & 0x1FF; pgds[level] = pgd; - page = kmap(phys_to_page(pgd)); + page = kmap(p); if (mmu_mode->ate_is_valid(page[idx], level)) break; /* keep the mapping */ - kunmap(phys_to_page(pgd)); + kunmap(p); pgd = mmu_mode->pte_to_phy_addr(kbdev->mgm_dev->ops.mgm_pte_to_original_pte( kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[idx])); + p = phys_to_page(pgd); } switch (level) { @@ -1608,35 +1707,70 @@ static void mmu_insert_pages_failure_recovery(struct kbase_device *kbdev, else num_of_valid_entries -= pcount; - if (!num_of_valid_entries) { - kunmap(phys_to_page(pgd)); + /* Invalidate the entries we added */ + mmu_mode->entries_invalidate(&page[idx], pcount); - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + if (!num_of_valid_entries) { + kunmap(p); + + list_add(&p->lru, free_pgds_list); kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, vpfn, level, - KBASE_MMU_OP_NONE, dirty_pgds); + KBASE_MMU_OP_NONE, dirty_pgds, + free_pgds_list); vpfn += count; continue; } - /* Invalidate the entries we added */ - for (i = 0; i < pcount; i++) - mmu_mode->entry_invalidate(&page[idx + i]); - mmu_mode->set_num_valid_entries(page, num_of_valid_entries); /* MMU cache flush strategy is NONE because GPU cache maintenance is * going to be done by the caller */ kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (idx * sizeof(u64)), - kbase_dma_addr(phys_to_page(pgd)) + 8 * idx, 8 * pcount, + kbase_dma_addr(p) + sizeof(u64) * idx, sizeof(u64) * pcount, KBASE_MMU_OP_NONE); - kunmap(phys_to_page(pgd)); + kunmap(p); next: vpfn += count; } } +static void mmu_flush_invalidate_insert_pages(struct kbase_device *kbdev, + struct kbase_mmu_table *mmut, const u64 vpfn, + size_t nr, u64 dirty_pgds, + enum kbase_caller_mmu_sync_info mmu_sync_info) +{ + struct kbase_mmu_hw_op_param op_param; + int as_nr = 0; + + op_param.vpfn = vpfn; + op_param.nr = nr; + op_param.op = 
KBASE_MMU_OP_FLUSH_PT; + op_param.mmu_sync_info = mmu_sync_info; + op_param.kctx_id = mmut->kctx ? mmut->kctx->id : 0xFFFFFFFF; + op_param.flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds); + +#if MALI_USE_CSF + as_nr = mmut->kctx ? mmut->kctx->as_nr : MCU_AS_NR; +#else + WARN_ON(!mmut->kctx); +#endif + + /* MMU cache flush strategy depends on whether GPU control commands for + * flushing physical address ranges are supported. The new physical pages + * are not present in GPU caches therefore they don't need any cache + * maintenance, but PGDs in the page table may or may not be created anew. + * + * Operations that affect the whole GPU cache shall only be done if it's + * impossible to update physical ranges. + */ + if (mmu_flush_cache_on_gpu_ctrl(kbdev)) + mmu_invalidate(kbdev, mmut->kctx, as_nr, &op_param); + else + mmu_flush_invalidate(kbdev, mmut->kctx, as_nr, &op_param); +} + /* * Map the single page 'phys' 'nr' of times, starting at GPU PFN 'vpfn' */ @@ -1657,8 +1791,8 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, int err; struct kbase_device *kbdev; enum kbase_mmu_op_type flush_op; - struct kbase_mmu_hw_op_param op_param; u64 dirty_pgds = 0; + LIST_HEAD(free_pgds_list); if (WARN_ON(kctx == NULL)) return -EINVAL; @@ -1672,15 +1806,6 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, if (nr == 0) return 0; - /* Set up MMU flush operation parameters. */ - op_param = (struct kbase_mmu_hw_op_param){ - .vpfn = vpfn, - .nr = nr, - .op = KBASE_MMU_OP_FLUSH_PT, - .kctx_id = kctx->id, - .mmu_sync_info = mmu_sync_info, - }; - mutex_lock(&kctx->mmu.mmu_lock); while (remain) { @@ -1725,7 +1850,7 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, */ mmu_insert_pages_failure_recovery(kbdev, &kctx->mmu, start_vpfn, start_vpfn + recover_count, - &dirty_pgds); + &dirty_pgds, &free_pgds_list); } goto fail_unlock; } @@ -1740,7 +1865,7 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, */ mmu_insert_pages_failure_recovery(kbdev, &kctx->mmu, start_vpfn, start_vpfn + recover_count, - &dirty_pgds); + &dirty_pgds, &free_pgds_list); } err = -ENOMEM; goto fail_unlock; @@ -1791,53 +1916,21 @@ int kbase_mmu_insert_single_page(struct kbase_context *kctx, u64 vpfn, } mutex_unlock(&kctx->mmu.mmu_lock); - op_param.flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds); - /* If FLUSH_PA_RANGE is supported then existing PGDs will have been flushed - * and all that remains is TLB (or MMU cache) invalidation which is done via - * MMU UNLOCK command. 
- */ - if (mmu_flush_cache_on_gpu_ctrl(kbdev)) - mmu_invalidate(kbdev, kctx, kctx->as_nr, &op_param); - else - mmu_flush_invalidate(kbdev, kctx, kctx->as_nr, &op_param); + mmu_flush_invalidate_insert_pages(kbdev, &kctx->mmu, start_vpfn, nr, dirty_pgds, + mmu_sync_info); + return 0; fail_unlock: mutex_unlock(&kctx->mmu.mmu_lock); - op_param.flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds); - if (mmu_flush_cache_on_gpu_ctrl(kbdev)) - mmu_flush_invalidate_on_gpu_ctrl(kbdev, kctx, kctx->as_nr, &op_param); - else - mmu_flush_invalidate(kbdev, kctx, kctx->as_nr, &op_param); + + mmu_flush_invalidate_insert_pages(kbdev, &kctx->mmu, start_vpfn, nr, dirty_pgds, + mmu_sync_info); + kbase_mmu_free_pgds_list(kbdev, &kctx->mmu, &free_pgds_list); + return err; } -static void kbase_mmu_free_pgd(struct kbase_device *kbdev, - struct kbase_mmu_table *mmut, phys_addr_t pgd, - bool dirty) -{ - struct page *p; - - lockdep_assert_held(&mmut->mmu_lock); - - p = pfn_to_page(PFN_DOWN(pgd)); - - kbase_mem_pool_free(&kbdev->mem_pools.small[mmut->group_id], - p, dirty); - - atomic_sub(1, &kbdev->memdev.used_pages); - - /* If MMU tables belong to a context then pages will have been accounted - * against it, so we must decrement the usage counts here. - */ - if (mmut->kctx) { - kbase_process_page_usage_dec(mmut->kctx, 1); - atomic_sub(1, &mmut->kctx->used_pages); - } - - kbase_trace_gpu_mem_usage_dec(kbdev, mmut->kctx, 1); -} - u64 kbase_mmu_create_ate(struct kbase_device *const kbdev, struct tagged_addr const phy, unsigned long const flags, int const level, int const group_id) @@ -1859,6 +1952,7 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu size_t remain = nr; int err; struct kbase_mmu_mode const *mmu_mode; + LIST_HEAD(free_pgds_list); /* Note that 0 is a valid start_vpfn */ /* 64-bit address range is the max */ @@ -1879,7 +1973,6 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu struct page *p; int cur_level; register unsigned int num_of_valid_entries; - enum kbase_mmu_op_type flush_op; bool newly_created_pgd = false; if (count > remain) @@ -1919,7 +2012,8 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu * completed */ mmu_insert_pages_failure_recovery(kbdev, mmut, start_vpfn, - insert_vpfn, dirty_pgds); + insert_vpfn, dirty_pgds, + &free_pgds_list); } goto fail_unlock; } @@ -1934,7 +2028,8 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu * completed */ mmu_insert_pages_failure_recovery(kbdev, mmut, start_vpfn, - insert_vpfn, dirty_pgds); + insert_vpfn, dirty_pgds, + &free_pgds_list); } err = -ENOMEM; goto fail_unlock; @@ -1945,20 +2040,8 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu if (cur_level == MIDGARD_MMU_LEVEL(2)) { int level_index = (insert_vpfn >> 9) & 0x1FF; - u64 *target = &pgd_page[level_index]; - - if (mmu_mode->pte_is_valid(*target, cur_level)) { - kbase_mmu_free_pgd( - kbdev, mmut, - kbdev->mmu_mode->pte_to_phy_addr( - kbdev->mgm_dev->ops.mgm_pte_to_original_pte( - kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, - cur_level, *target)), - false); - num_of_valid_entries--; - } - *target = kbase_mmu_create_ate(kbdev, *phys, flags, - cur_level, group_id); + pgd_page[level_index] = + kbase_mmu_create_ate(kbdev, *phys, flags, cur_level, group_id); num_of_valid_entries++; } else { @@ -1983,36 +2066,48 @@ int kbase_mmu_insert_pages_no_flush(struct kbase_device *kbdev, struct kbase_mmu mmu_mode->set_num_valid_entries(pgd_page, 
num_of_valid_entries); - if (dirty_pgds && count > 0 && !newly_created_pgd) + if (dirty_pgds && !newly_created_pgd) *dirty_pgds |= 1ULL << cur_level; phys += count; insert_vpfn += count; remain -= count; - /* For the most part, the creation of a new virtual memory mapping does - * not require cache flush operations, because the operation results - * into the creation of new memory pages which are not present in GPU - * caches. Therefore the defaul operation is NONE. - * - * However, it is quite common for the mapping to start and/or finish - * at an already existing PGD. Moreover, the PTEs modified are not - * necessarily aligned with GPU cache lines. Therefore, GPU cache - * maintenance is required for existing PGDs. + /* Even if mmu_get_pgd_at_level() allocated a new bottom level + * table page, the invalidation of L2 cache is still needed for + * for the valid entries written in that page. This is because a + * race can happen as soon as the entry of parent level table is + * updated to point to the page of bottom level table. + * GPU can try to access within the the same virtual range that + * is being mapped, before the valid entries of bottom level table + * page are flushed to the memory from the CPU's cache. And if that + * happens then the invalid entries from memory could get fetched + * into the L2 cache and so those entries won't be affected by the + * MMU TLB invalidation done by sending the UNLOCK command. + * If the memory is growable then this could result in unexpected + * page faults happening repeatedly, until the invalid entry is + * evicted from the L2 cache, as Driver would consider the page + * faults for mapped memory as duplicate and won't take any action + * effectively. */ - flush_op = newly_created_pgd ? KBASE_MMU_OP_NONE : KBASE_MMU_OP_FLUSH_PT; - kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (vindex * sizeof(u64)), kbase_dma_addr(p) + (vindex * sizeof(u64)), count * sizeof(u64), - flush_op); + KBASE_MMU_OP_FLUSH_PT); kunmap(p); } - err = 0; + mutex_unlock(&mmut->mmu_lock); + + return 0; fail_unlock: mutex_unlock(&mmut->mmu_lock); + + mmu_flush_invalidate_insert_pages(kbdev, mmut, start_vpfn, nr, *dirty_pgds, + CALLER_MMU_ASYNC); + kbase_mmu_free_pgds_list(kbdev, mmut, &free_pgds_list); + return err; } @@ -2027,8 +2122,8 @@ int kbase_mmu_insert_pages(struct kbase_device *kbdev, enum kbase_caller_mmu_sync_info mmu_sync_info) { int err; - struct kbase_mmu_hw_op_param op_param = { 0 }; u64 dirty_pgds = 0; + LIST_HEAD(free_pgds_list); /* Early out if there is nothing to do */ if (nr == 0) @@ -2036,28 +2131,12 @@ int kbase_mmu_insert_pages(struct kbase_device *kbdev, err = kbase_mmu_insert_pages_no_flush(kbdev, mmut, vpfn, phys, nr, flags, group_id, &dirty_pgds); + if (err) + return err; - op_param.vpfn = vpfn; - op_param.nr = nr; - op_param.op = KBASE_MMU_OP_FLUSH_PT; - op_param.mmu_sync_info = mmu_sync_info; - op_param.kctx_id = mmut->kctx ? mmut->kctx->id : 0xFFFFFFFF; - op_param.flush_skip_levels = pgd_level_to_skip_flush(dirty_pgds); + mmu_flush_invalidate_insert_pages(kbdev, mmut, vpfn, nr, dirty_pgds, mmu_sync_info); - /* MMU cache flush strategy depends on whether GPU control commands for - * flushing physical address ranges are supported. The new physical pages - * are not present in GPU caches there for they don't need any cache - * maintenance, but PGDs in the page table may or may not be created anew. - * - * Operations that affect the whole GPU cache shall only be done if it's - * impossible to update physical ranges. 
- */ - if (mmu_flush_cache_on_gpu_ctrl(kbdev)) - mmu_invalidate(kbdev, mmut->kctx, as_nr, &op_param); - else - mmu_flush_invalidate(kbdev, mmut->kctx, as_nr, &op_param); - - return err; + return 0; } KBASE_EXPORT_TEST_API(kbase_mmu_insert_pages); @@ -2173,7 +2252,8 @@ KBASE_EXPORT_TEST_API(kbase_mmu_disable); static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, phys_addr_t *pgds, u64 vpfn, int level, - enum kbase_mmu_op_type flush_op, u64 *dirty_pgds) + enum kbase_mmu_op_type flush_op, u64 *dirty_pgds, + struct list_head *free_pgds_list) { int current_level; @@ -2181,36 +2261,42 @@ static void kbase_mmu_update_and_free_parent_pgds(struct kbase_device *kbdev, for (current_level = level - 1; current_level >= MIDGARD_MMU_LEVEL(0); current_level--) { - u64 *current_page = kmap(phys_to_page(pgds[current_level])); + phys_addr_t current_pgd = pgds[current_level]; + struct page *p = phys_to_page(current_pgd); + u64 *current_page = kmap(p); unsigned int current_valid_entries = kbdev->mmu_mode->get_num_valid_entries(current_page); + int index = (vpfn >> ((3 - current_level) * 9)) & 0x1FF; /* We need to track every level that needs updating */ if (dirty_pgds) *dirty_pgds |= 1ULL << current_level; + kbdev->mmu_mode->entries_invalidate(¤t_page[index], 1); if (current_valid_entries == 1 && current_level != MIDGARD_MMU_LEVEL(0)) { - kunmap(phys_to_page(pgds[current_level])); + kunmap(p); - kbase_mmu_free_pgd(kbdev, mmut, pgds[current_level], - true); + /* Ensure the cacheline containing the last valid entry + * of PGD is invalidated from the GPU cache, before the + * PGD page is freed. + */ + kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, + current_pgd + (index * sizeof(u64)), + sizeof(u64), flush_op); + + list_add(&p->lru, free_pgds_list); } else { - int index = (vpfn >> ((3 - current_level) * 9)) & 0x1FF; - - kbdev->mmu_mode->entry_invalidate(¤t_page[index]); - current_valid_entries--; kbdev->mmu_mode->set_num_valid_entries( current_page, current_valid_entries); - kbase_mmu_sync_pgd( - kbdev, mmut->kctx, pgds[current_level] + (index * sizeof(u64)), - kbase_dma_addr(phys_to_page(pgds[current_level])) + 8 * index, - 8 * 1, flush_op); + kunmap(p); - kunmap(phys_to_page(pgds[current_level])); + kbase_mmu_sync_pgd(kbdev, mmut->kctx, current_pgd + (index * sizeof(u64)), + kbase_dma_addr(p) + (index * sizeof(u64)), sizeof(u64), + flush_op); break; } } @@ -2239,15 +2325,33 @@ static void mmu_flush_invalidate_teardown_pages(struct kbase_device *kbdev, struct tagged_addr *phys, struct kbase_mmu_hw_op_param *op_param) { - if (!mmu_flush_cache_on_gpu_ctrl(kbdev)) { + /* Full cache flush through the MMU_COMMAND */ mmu_flush_invalidate(kbdev, kctx, as_nr, op_param); - return; } else if (op_param->op == KBASE_MMU_OP_FLUSH_MEM) { + /* Full cache flush through the GPU_CONTROL */ mmu_flush_invalidate_on_gpu_ctrl(kbdev, kctx, as_nr, op_param); - return; } +#if MALI_USE_CSF + else { + /* Partial GPU cache flush with MMU cache invalidation */ + unsigned long irq_flags; + unsigned int i; + bool flush_done = false; + mmu_invalidate(kbdev, kctx, as_nr, op_param); + + for (i = 0; !flush_done && i < op_param->nr; i++) { + spin_lock_irqsave(&kbdev->hwaccess_lock, irq_flags); + if (kbdev->pm.backend.gpu_powered && (!kctx || kctx->as_nr >= 0)) + mmu_flush_pa_range(kbdev, as_phys_addr_t(phys[i]), PAGE_SIZE, + KBASE_MMU_OP_FLUSH_MEM); + else + flush_done = true; + spin_unlock_irqrestore(&kbdev->hwaccess_lock, irq_flags); + } + } +#endif } /** @@ -2282,15 +2386,14 @@ static void 
mmu_flush_invalidate_teardown_pages(struct kbase_device *kbdev, int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, u64 vpfn, struct tagged_addr *phys, size_t nr, int as_nr) { - phys_addr_t pgd; u64 start_vpfn = vpfn; size_t requested_nr = nr; enum kbase_mmu_op_type flush_op = KBASE_MMU_OP_NONE; struct kbase_mmu_mode const *mmu_mode; struct kbase_mmu_hw_op_param op_param; - unsigned int i; int err = -EFAULT; u64 dirty_pgds = 0; + LIST_HEAD(free_pgds_list); /* Calls to this function are inherently asynchronous, with respect to * MMU operations. @@ -2328,19 +2431,19 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table u64 *page; phys_addr_t pgds[MIDGARD_MMU_BOTTOMLEVEL + 1]; register unsigned int num_of_valid_entries; + phys_addr_t pgd = mmut->pgd; + struct page *p = phys_to_page(pgd); if (count > nr) count = nr; - /* need to check if this is a 2MB or a 4kB page */ - pgd = mmut->pgd; - + /* need to check if this is a 2MB page or a 4kB */ for (level = MIDGARD_MMU_TOPLEVEL; level <= MIDGARD_MMU_BOTTOMLEVEL; level++) { phys_addr_t next_pgd; index = (vpfn >> ((3 - level) * 9)) & 0x1FF; - page = kmap(phys_to_page(pgd)); + page = kmap(p); if (mmu_mode->ate_is_valid(page[index], level)) break; /* keep the mapping */ else if (!mmu_mode->pte_is_valid(page[index], level)) { @@ -2366,9 +2469,10 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table next_pgd = mmu_mode->pte_to_phy_addr( kbdev->mgm_dev->ops.mgm_pte_to_original_pte( kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, level, page[index])); + kunmap(p); pgds[level] = pgd; - kunmap(phys_to_page(pgd)); pgd = next_pgd; + p = phys_to_page(pgd); } switch (level) { @@ -2377,7 +2481,7 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table dev_warn(kbdev->dev, "%s: No support for ATEs at level %d\n", __func__, level); - kunmap(phys_to_page(pgd)); + kunmap(p); goto out; case MIDGARD_MMU_LEVEL(2): /* can only teardown if count >= 512 */ @@ -2412,30 +2516,38 @@ int kbase_mmu_teardown_pages(struct kbase_device *kbdev, struct kbase_mmu_table else num_of_valid_entries -= pcount; - if (!num_of_valid_entries) { - kunmap(phys_to_page(pgd)); + /* Invalidate the entries we added */ + mmu_mode->entries_invalidate(&page[index], pcount); - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + if (!num_of_valid_entries) { + kunmap(p); + + /* Ensure the cacheline(s) containing the last valid entries + * of PGD is invalidated from the GPU cache, before the + * PGD page is freed. 
+ */ + kbase_mmu_sync_pgd_gpu(kbdev, mmut->kctx, + pgd + (index * sizeof(u64)), + pcount * sizeof(u64), flush_op); + + list_add(&p->lru, &free_pgds_list); kbase_mmu_update_and_free_parent_pgds(kbdev, mmut, pgds, vpfn, level, - flush_op, &dirty_pgds); + flush_op, &dirty_pgds, + &free_pgds_list); vpfn += count; nr -= count; continue; } - /* Invalidate the entries we added */ - for (i = 0; i < pcount; i++) - mmu_mode->entry_invalidate(&page[index + i]); - mmu_mode->set_num_valid_entries(page, num_of_valid_entries); kbase_mmu_sync_pgd(kbdev, mmut->kctx, pgd + (index * sizeof(u64)), - kbase_dma_addr(phys_to_page(pgd)) + 8 * index, 8 * pcount, + kbase_dma_addr(p) + (index * sizeof(u64)), pcount * sizeof(u64), flush_op); next: - kunmap(phys_to_page(pgd)); + kunmap(p); vpfn += count; nr -= count; } @@ -2454,6 +2566,8 @@ out: }; mmu_flush_invalidate_teardown_pages(kbdev, mmut->kctx, as_nr, phys, &op_param); + kbase_mmu_free_pgds_list(kbdev, mmut, &free_pgds_list); + return err; } @@ -2627,49 +2741,45 @@ static void mmu_teardown_level(struct kbase_device *kbdev, struct kbase_mmu_table *mmut, phys_addr_t pgd, int level) { - phys_addr_t target_pgd; u64 *pgd_page; int i; - struct kbase_mmu_mode const *mmu_mode; - u64 *pgd_page_buffer; + struct memory_group_manager_device *mgm_dev = kbdev->mgm_dev; + struct kbase_mmu_mode const *mmu_mode = kbdev->mmu_mode; + u64 *pgd_page_buffer = NULL; lockdep_assert_held(&mmut->mmu_lock); - /* Early-out. No need to kmap to check entries for L3 PGD. */ - if (level == MIDGARD_MMU_BOTTOMLEVEL) { - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); - return; - } - pgd_page = kmap_atomic(pfn_to_page(PFN_DOWN(pgd))); /* kmap_atomic should NEVER fail. */ if (WARN_ON(pgd_page == NULL)) return; - /* Copy the page to our preallocated buffer so that we can minimize - * kmap_atomic usage - */ - pgd_page_buffer = mmut->mmu_teardown_pages[level]; - memcpy(pgd_page_buffer, pgd_page, PAGE_SIZE); + if (level < MIDGARD_MMU_BOTTOMLEVEL) { + /* Copy the page to our preallocated buffer so that we can minimize + * kmap_atomic usage + */ + pgd_page_buffer = mmut->mmu_teardown_pages[level]; + memcpy(pgd_page_buffer, pgd_page, PAGE_SIZE); + } + + /* Invalidate page after copying */ + mmu_mode->entries_invalidate(pgd_page, KBASE_MMU_PAGE_ENTRIES); kunmap_atomic(pgd_page); pgd_page = pgd_page_buffer; - mmu_mode = kbdev->mmu_mode; - - for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) { - target_pgd = mmu_mode->pte_to_phy_addr(kbdev->mgm_dev->ops.mgm_pte_to_original_pte( - kbdev->mgm_dev, MGM_DEFAULT_PTE_GROUP, - level, pgd_page[i])); - - if (target_pgd) { + if (level < MIDGARD_MMU_BOTTOMLEVEL) { + for (i = 0; i < KBASE_MMU_PAGE_ENTRIES; i++) { if (mmu_mode->pte_is_valid(pgd_page[i], level)) { - mmu_teardown_level(kbdev, mmut, - target_pgd, - level + 1); + phys_addr_t target_pgd = mmu_mode->pte_to_phy_addr( + mgm_dev->ops.mgm_pte_to_original_pte(mgm_dev, + MGM_DEFAULT_PTE_GROUP, + level, pgd_page[i])); + + mmu_teardown_level(kbdev, mmut, target_pgd, level + 1); } } } - kbase_mmu_free_pgd(kbdev, mmut, pgd, true); + kbase_mmu_free_pgd(kbdev, mmut, pgd); } int kbase_mmu_init(struct kbase_device *const kbdev, @@ -2685,7 +2795,7 @@ int kbase_mmu_init(struct kbase_device *const kbdev, mmut->group_id = group_id; mutex_init(&mmut->mmu_lock); mmut->kctx = kctx; - mmut->pgd = 0; + mmut->pgd = KBASE_MMU_INVALID_PGD_ADDRESS; /* Preallocate MMU depth of 3 pages for mmu_teardown_level to use */ for (level = MIDGARD_MMU_TOPLEVEL; @@ -2703,7 +2813,7 @@ int kbase_mmu_init(struct kbase_device *const kbdev, * 
kbase_mmu_alloc_pgd will allocate out of that pool. This is done to * avoid allocations from the kernel happening with the lock held. */ - while (!mmut->pgd) { + while (mmut->pgd == KBASE_MMU_INVALID_PGD_ADDRESS) { int err; err = kbase_mem_pool_grow( @@ -2726,7 +2836,7 @@ void kbase_mmu_term(struct kbase_device *kbdev, struct kbase_mmu_table *mmut) { int level; - if (mmut->pgd) { + if (mmut->pgd != KBASE_MMU_INVALID_PGD_ADDRESS) { mutex_lock(&mmut->mmu_lock); mmu_teardown_level(kbdev, mmut, mmut->pgd, MIDGARD_MMU_TOPLEVEL); mutex_unlock(&mmut->mmu_lock); @@ -2750,6 +2860,7 @@ void kbase_mmu_as_term(struct kbase_device *kbdev, int i) destroy_workqueue(kbdev->as[i].pf_wq); } +#ifdef CONFIG_MALI_VECTOR_DUMP static size_t kbasep_mmu_dump_level(struct kbase_context *kctx, phys_addr_t pgd, int level, char ** const buffer, size_t *size_left) { @@ -2891,6 +3002,7 @@ fail_free: return NULL; } KBASE_EXPORT_TEST_API(kbase_mmu_dump); +#endif /* CONFIG_MALI_VECTOR_DUMP */ void kbase_mmu_bus_fault_worker(struct work_struct *data) { diff --git a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.h b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.h index 53d1d194eca7..848570f2a6dd 100644 --- a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.h +++ b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu.h @@ -25,6 +25,7 @@ #include #define KBASE_MMU_PAGE_ENTRIES 512 +#define KBASE_MMU_INVALID_PGD_ADDRESS (~(phys_addr_t)0) struct kbase_context; struct kbase_mmu_table; diff --git a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_hw_direct.c b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_hw_direct.c index c9e5ef288ff8..cc764b483f05 100644 --- a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_hw_direct.c +++ b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_hw_direct.c @@ -28,6 +28,26 @@ #include #include +#if MALI_USE_CSF +/** + * mmu_has_flush_skip_pgd_levels() - Check if the GPU has the feature + * AS_LOCKADDR_FLUSH_SKIP_LEVELS + * + * @gpu_props: GPU properties for the GPU instance. + * + * This function returns whether a cache flush can apply the skip flags of + * AS_LOCKADDR_FLUSH_SKIP_LEVELS. + * + * Return: True if cache flush has the said feature. + */ +static bool mmu_has_flush_skip_pgd_levels(struct kbase_gpu_props const *gpu_props) +{ + u32 const signature = + gpu_props->props.raw_props.gpu_id & (GPU_ID2_ARCH_MAJOR | GPU_ID2_ARCH_REV); + + return signature >= (u32)GPU_ID2_PRODUCT_MAKE(12, 0, 4, 0); +} +#endif /** * lock_region() - Generate lockaddr to lock memory region in MMU @@ -105,7 +125,7 @@ static int lock_region(struct kbase_gpu_props const *gpu_props, u64 *lockaddr, * therefore the highest bit that differs is bit #16 * and the region size (as a logarithm) is 16 + 1 = 17, i.e. 128 kB. */ - lockaddr_size_log2 = fls(lockaddr_base ^ lockaddr_end); + lockaddr_size_log2 = fls64(lockaddr_base ^ lockaddr_end); /* Cap the size against minimum and maximum values allowed. */ if (lockaddr_size_log2 > KBASE_LOCK_REGION_MAX_SIZE_LOG2) @@ -126,6 +146,13 @@ static int lock_region(struct kbase_gpu_props const *gpu_props, u64 *lockaddr, */ *lockaddr = lockaddr_base & ~((1ull << lockaddr_size_log2) - 1); *lockaddr |= lockaddr_size_log2 - 1; + +#if MALI_USE_CSF + if (mmu_has_flush_skip_pgd_levels(gpu_props)) + *lockaddr = + AS_LOCKADDR_FLUSH_SKIP_LEVELS_SET(*lockaddr, op_param->flush_skip_levels); +#endif + return 0; } @@ -207,21 +234,18 @@ static int wait_cores_power_trans_complete(struct kbase_device *kbdev) * implicit unlock. * @as_nr: Address space number for which MMU command needs to be * sent. 
- * @hwaccess_locked: Flag to indicate if hwaccess_lock is held by the caller. * - * This functions ensures that the flush of LSC is not missed for the pages that + * This function ensures that the flush of LSC is not missed for the pages that * were unmapped from the GPU, due to the power down transition of shader cores. * * Return: 0 if the WA was successfully applied, non-zero otherwise. */ -static int apply_hw_issue_GPU2019_3901_wa(struct kbase_device *kbdev, - u32 *mmu_cmd, unsigned int as_nr, bool hwaccess_locked) +static int apply_hw_issue_GPU2019_3901_wa(struct kbase_device *kbdev, u32 *mmu_cmd, + unsigned int as_nr) { - unsigned long flags = 0; int ret = 0; - if (!hwaccess_locked) - spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + lockdep_assert_held(&kbdev->hwaccess_lock); /* Check if L2 is OFF. The cores also must be OFF if L2 is not up, so * the workaround can be safely skipped. @@ -230,23 +254,22 @@ static int apply_hw_issue_GPU2019_3901_wa(struct kbase_device *kbdev, if (*mmu_cmd != AS_COMMAND_FLUSH_MEM) { dev_warn(kbdev->dev, "Unexpected mmu command received"); - ret = -EINVAL; - goto unlock; + return -EINVAL; } /* Wait for the LOCK MMU command to complete, issued by the caller */ ret = wait_ready(kbdev, as_nr); if (ret) - goto unlock; + return ret; ret = kbase_gpu_cache_flush_and_busy_wait(kbdev, GPU_COMMAND_CACHE_CLN_INV_LSC); if (ret) - goto unlock; + return ret; ret = wait_cores_power_trans_complete(kbdev); if (ret) - goto unlock; + return ret; /* As LSC is guaranteed to have been flushed we can use FLUSH_PT * MMU command to only flush the L2. @@ -254,10 +277,6 @@ static int apply_hw_issue_GPU2019_3901_wa(struct kbase_device *kbdev, *mmu_cmd = AS_COMMAND_FLUSH_PT; } -unlock: - if (!hwaccess_locked) - spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); - return ret; } #endif @@ -487,8 +506,16 @@ static int mmu_hw_do_flush(struct kbase_device *kbdev, struct kbase_as *as, * FLUSH_MEM/PT command is deprecated. 
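 *
 * In outline, apply_hw_issue_GPU2019_3901_wa() (defined above) performs the
 * following with the hwaccess lock held and the L2 powered up:
 *
 *   wait_ready(kbdev, as_nr);                   - LOCK issued by the caller
 *   kbase_gpu_cache_flush_and_busy_wait(kbdev,
 *           GPU_COMMAND_CACHE_CLN_INV_LSC);     - flush the LSC explicitly
 *   wait_cores_power_trans_complete(kbdev);
 *   *mmu_cmd = AS_COMMAND_FLUSH_PT;             - only the L2 is left to flush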
*/ if (mmu_cmd == AS_COMMAND_FLUSH_MEM) { - ret = apply_hw_issue_GPU2019_3901_wa(kbdev, &mmu_cmd, - as->number, hwaccess_locked); + if (!hwaccess_locked) { + unsigned long flags = 0; + + spin_lock_irqsave(&kbdev->hwaccess_lock, flags); + ret = apply_hw_issue_GPU2019_3901_wa(kbdev, &mmu_cmd, as->number); + spin_unlock_irqrestore(&kbdev->hwaccess_lock, flags); + } else { + ret = apply_hw_issue_GPU2019_3901_wa(kbdev, &mmu_cmd, as->number); + } + if (ret) return ret; } diff --git a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_mode_aarch64.c b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_mode_aarch64.c index dfbdee17782b..f2c627482c18 100644 --- a/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_mode_aarch64.c +++ b/drivers/gpu/arm/bifrost/mmu/mali_kbase_mmu_mode_aarch64.c @@ -35,10 +35,8 @@ #define ENTRY_IS_INVAL 2ULL #define ENTRY_IS_PTE 3ULL -#define ENTRY_ATTR_BITS (7ULL << 2) /* bits 4:2 */ #define ENTRY_ACCESS_RW (1ULL << 6) /* bits 6:7 */ #define ENTRY_ACCESS_RO (3ULL << 6) -#define ENTRY_SHARE_BITS (3ULL << 8) /* bits 9:8 */ #define ENTRY_ACCESS_BIT (1ULL << 10) #define ENTRY_NX_BIT (1ULL << 54) @@ -194,25 +192,26 @@ static void entry_set_pte(u64 *entry, phys_addr_t phy) page_table_entry_set(entry, (phy & PAGE_MASK) | ENTRY_ACCESS_BIT | ENTRY_IS_PTE); } -static void entry_invalidate(u64 *entry) +static void entries_invalidate(u64 *entry, u32 count) { - page_table_entry_set(entry, ENTRY_IS_INVAL); + u32 i; + + for (i = 0; i < count; i++) + page_table_entry_set(entry + i, ENTRY_IS_INVAL); } -static const struct kbase_mmu_mode aarch64_mode = { - .update = mmu_update, - .get_as_setup = kbase_mmu_get_as_setup, - .disable_as = mmu_disable_as, - .pte_to_phy_addr = pte_to_phy_addr, - .ate_is_valid = ate_is_valid, - .pte_is_valid = pte_is_valid, - .entry_set_ate = entry_set_ate, - .entry_set_pte = entry_set_pte, - .entry_invalidate = entry_invalidate, - .get_num_valid_entries = get_num_valid_entries, - .set_num_valid_entries = set_num_valid_entries, - .flags = KBASE_MMU_MODE_HAS_NON_CACHEABLE -}; +static const struct kbase_mmu_mode aarch64_mode = { .update = mmu_update, + .get_as_setup = kbase_mmu_get_as_setup, + .disable_as = mmu_disable_as, + .pte_to_phy_addr = pte_to_phy_addr, + .ate_is_valid = ate_is_valid, + .pte_is_valid = pte_is_valid, + .entry_set_ate = entry_set_ate, + .entry_set_pte = entry_set_pte, + .entries_invalidate = entries_invalidate, + .get_num_valid_entries = get_num_valid_entries, + .set_num_valid_entries = set_num_valid_entries, + .flags = KBASE_MMU_MODE_HAS_NON_CACHEABLE }; struct kbase_mmu_mode const *kbase_mmu_mode_get_aarch64(void) { diff --git a/drivers/gpu/arm/bifrost/platform/devicetree/Kbuild b/drivers/gpu/arm/bifrost/platform/devicetree/Kbuild index 2eecd6635948..60a52d80fa8e 100644 --- a/drivers/gpu/arm/bifrost/platform/devicetree/Kbuild +++ b/drivers/gpu/arm/bifrost/platform/devicetree/Kbuild @@ -20,6 +20,5 @@ bifrost_kbase-y += \ platform/$(MALI_PLATFORM_DIR)/mali_kbase_config_devicetree.o \ - platform/$(MALI_PLATFORM_DIR)/mali_kbase_config_platform.o \ platform/$(MALI_PLATFORM_DIR)/mali_kbase_runtime_pm.o \ platform/$(MALI_PLATFORM_DIR)/mali_kbase_clk_rate_trace.o diff --git a/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_config_platform.c b/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_config_platform.c deleted file mode 100644 index 2eebed022a59..000000000000 --- a/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_config_platform.c +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note -/* - * - * (C) 
COPYRIGHT 2021-2022 ARM Limited. All rights reserved. - * - * This program is free software and is provided to you under the terms of the - * GNU General Public License version 2 as published by the Free Software - * Foundation, and any use by you of this program is subject to the terms - * of such GNU license. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - */ - -#include -#include -#include -#include "mali_kbase_config_platform.h" -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -struct kbase_platform_funcs_conf platform_funcs = { - .platform_init_func = NULL, - .platform_term_func = NULL, - .platform_late_init_func = NULL, - .platform_late_term_func = NULL, -}; diff --git a/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_config_platform.h b/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_config_platform.h index 743885ffad0e..584a7217d300 100644 --- a/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_config_platform.h +++ b/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_config_platform.h @@ -33,13 +33,12 @@ * Attached value: pointer to @ref kbase_platform_funcs_conf * Default value: See @ref kbase_platform_funcs_conf */ -#define PLATFORM_FUNCS (&platform_funcs) +#define PLATFORM_FUNCS (NULL) #define CLK_RATE_TRACE_OPS (&clk_rate_trace_ops) extern struct kbase_pm_callback_conf pm_callbacks; extern struct kbase_clk_rate_trace_op_conf clk_rate_trace_ops; -extern struct kbase_platform_funcs_conf platform_funcs; /** * AUTO_SUSPEND_DELAY - Autosuspend delay * diff --git a/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_runtime_pm.c b/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_runtime_pm.c index 07b09f868735..2687bee96ec9 100644 --- a/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_runtime_pm.c +++ b/drivers/gpu/arm/bifrost/platform/devicetree/mali_kbase_runtime_pm.c @@ -29,7 +29,6 @@ #include "mali_kbase_config_platform.h" - static void enable_gpu_power_control(struct kbase_device *kbdev) { unsigned int i; @@ -82,8 +81,7 @@ static int pm_callback_power_on(struct kbase_device *kbdev) int error; unsigned long flags; - dev_dbg(kbdev->dev, "%s %p\n", __func__, - (void *)kbdev->dev->pm_domain); + dev_dbg(kbdev->dev, "%s %pK\n", __func__, (void *)kbdev->dev->pm_domain); spin_lock_irqsave(&kbdev->hwaccess_lock, flags); WARN_ON(kbdev->pm.backend.gpu_powered); @@ -298,5 +296,3 @@ struct kbase_pm_callback_conf pm_callbacks = { .power_runtime_gpu_active_callback = NULL, #endif }; - - diff --git a/drivers/gpu/arm/bifrost/platform/meson/mali_kbase_runtime_pm.c b/drivers/gpu/arm/bifrost/platform/meson/mali_kbase_runtime_pm.c index c00cbcb17d39..910d4b4fd3e1 100644 --- a/drivers/gpu/arm/bifrost/platform/meson/mali_kbase_runtime_pm.c +++ b/drivers/gpu/arm/bifrost/platform/meson/mali_kbase_runtime_pm.c @@ -149,7 +149,7 @@ static int pm_callback_power_on(struct kbase_device *kbdev) int ret = 1; /* Assume GPU has been powered off */ int error; - dev_dbg(kbdev->dev, "%s %p\n", __func__, (void *)kbdev->dev->pm_domain); + dev_dbg(kbdev->dev, "%s %pK\n", __func__, (void *)kbdev->dev->pm_domain); #ifdef KBASE_PM_RUNTIME 
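/*
 * Illustrative note (hypothetical demo_* name, not the kbase code): the
 * dev_dbg() lines above switch from "%p" to "%pK", which lets the kernel
 * hash or hide the pointer according to the kptr_restrict sysctl instead of
 * leaking a raw kernel address into the log. Minimal usage sketch:
 */
#include <linux/device.h>

static void demo_log_pm_domain(struct device *dev)
{
	/* %pK respects kptr_restrict; plain %p is hashed on recent kernels,
	 * but %pK states the intent explicitly and behaves on older ones too.
	 */
	dev_dbg(dev, "%s %pK\n", __func__, (void *)dev->pm_domain);
}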
error = pm_runtime_get_sync(kbdev->dev); diff --git a/drivers/gpu/arm/bifrost/tests/Mconfig b/drivers/gpu/arm/bifrost/tests/Mconfig index 738dbd42aac7..67b38a28cf96 100644 --- a/drivers/gpu/arm/bifrost/tests/Mconfig +++ b/drivers/gpu/arm/bifrost/tests/Mconfig @@ -26,8 +26,8 @@ menuconfig MALI_KUTF This option will build the Mali testing framework modules. Modules: - - kutf.ko - - kutf_test.ko + - kutf.ko + - kutf_test.ko config MALI_KUTF_IRQ_TEST bool "Build Mali KUTF IRQ test module" @@ -38,7 +38,7 @@ config MALI_KUTF_IRQ_TEST It can determine the latency of the Mali GPU IRQ on your system. Modules: - - mali_kutf_irq_test.ko + - mali_kutf_irq_test.ko config MALI_KUTF_CLK_RATE_TRACE bool "Build Mali KUTF Clock rate trace test module" @@ -50,7 +50,7 @@ config MALI_KUTF_CLK_RATE_TRACE basic trace test in the system. Modules: - - mali_kutf_clk_rate_trace_test_portal.ko + - mali_kutf_clk_rate_trace_test_portal.ko config MALI_KUTF_MGM_INTEGRATION_TEST bool "Build Mali KUTF MGM integration test module" @@ -62,12 +62,12 @@ config MALI_KUTF_MGM_INTEGRATION_TEST group ids. Modules: - - mali_kutf_mgm_integration_test.ko + - mali_kutf_mgm_integration_test.ko # Enable MALI_BIFROST_DEBUG for KUTF modules support config UNIT_TEST_KERNEL_MODULES - bool - default y if UNIT_TEST_CODE && BACKEND_KERNEL - default n + bool + default y if UNIT_TEST_CODE && BACKEND_KERNEL + default n diff --git a/drivers/gpu/arm/bifrost/tests/build.bp b/drivers/gpu/arm/bifrost/tests/build.bp index 9d6137d17d5f..7abae237f9c3 100644 --- a/drivers/gpu/arm/bifrost/tests/build.bp +++ b/drivers/gpu/arm/bifrost/tests/build.bp @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2021-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -25,7 +25,7 @@ bob_defaults { "include", "./../../", "./../", - "./" + "./", ], } diff --git a/drivers/gpu/arm/bifrost/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c b/drivers/gpu/arm/bifrost/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c index 2d7289daca20..a6f54b61d4ad 100644 --- a/drivers/gpu/arm/bifrost/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c +++ b/drivers/gpu/arm/bifrost/tests/mali_kutf_clk_rate_trace/kernel/mali_kutf_clk_rate_trace_test.c @@ -825,7 +825,7 @@ static void *mali_kutf_clk_rate_trace_create_fixture( if (!data) return NULL; - *data = (const struct kutf_clk_rate_trace_fixture_data){ NULL }; + memset(data, 0, sizeof(*data)); pr_debug("Hooking up the test portal to kbdev clk rate trace\n"); spin_lock(&kbdev->pm.clk_rtm.lock); diff --git a/drivers/gpu/arm/bifrost/tests/mali_kutf_irq_test/mali_kutf_irq_test_main.c b/drivers/gpu/arm/bifrost/tests/mali_kutf_irq_test/mali_kutf_irq_test_main.c index 2d6e68946c00..f2a014d9b5ca 100644 --- a/drivers/gpu/arm/bifrost/tests/mali_kutf_irq_test/mali_kutf_irq_test_main.c +++ b/drivers/gpu/arm/bifrost/tests/mali_kutf_irq_test/mali_kutf_irq_test_main.c @@ -51,8 +51,6 @@ struct kutf_irq_fixture_data { struct kbase_device *kbdev; }; -#define SEC_TO_NANO(s) ((s)*1000000000LL) - /* ID for the GPU IRQ */ #define GPU_IRQ_HANDLER 2 @@ -212,6 +210,11 @@ static void mali_kutf_irq_latency(struct kutf_context *context) average_time += irq_time - start_time; udelay(10); + /* Sleep for a ms, every 10000 iterations, to avoid misleading warning + * of CPU softlockup when all GPU IRQs keep going to the same CPU. + */ + if (!(i % 10000)) + msleep(1); } /* Go back to default handler */ diff --git a/drivers/gpu/arm/bifrost/tests/mali_kutf_mgm_integration_test/build.bp b/drivers/gpu/arm/bifrost/tests/mali_kutf_mgm_integration_test/build.bp index 2e4a083863e4..8b995f8a0a07 100644 --- a/drivers/gpu/arm/bifrost/tests/mali_kutf_mgm_integration_test/build.bp +++ b/drivers/gpu/arm/bifrost/tests/mali_kutf_mgm_integration_test/build.bp @@ -38,4 +38,4 @@ bob_kernel_module { kbuild_options: ["CONFIG_MALI_KUTF_MGM_INTEGRATION_TEST=y"], enabled: true, }, -} \ No newline at end of file +} diff --git a/drivers/gpu/arm/bifrost/tl/Kbuild b/drivers/gpu/arm/bifrost/tl/Kbuild index 6e472dff8a78..1c684d489d6f 100644 --- a/drivers/gpu/arm/bifrost/tl/Kbuild +++ b/drivers/gpu/arm/bifrost/tl/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note # -# (C) COPYRIGHT 2021 ARM Limited. All rights reserved. +# (C) COPYRIGHT 2022 ARM Limited. All rights reserved. # # This program is free software and is provided to you under the terms of the # GNU General Public License version 2 as published by the Free Software diff --git a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.c b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.c index 7160522a0be6..334248867c7c 100644 --- a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.c +++ b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note /* * - * (C) COPYRIGHT 2015-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2015-2022 ARM Limited. All rights reserved. 
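/*
 * Illustrative sketch (hypothetical demo_* names, not the kbase test code):
 * the IRQ-latency loop above now calls msleep(1) once every 10000 iterations
 * so that a long run of udelay(10) busy-waits pinned to one CPU does not
 * trigger the soft-lockup watchdog. The general pattern:
 */
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/timekeeping.h>

static u64 demo_measure_average_ns(unsigned int iterations)
{
	u64 total_ns = 0;
	unsigned int i;

	for (i = 1; i <= iterations; i++) {
		u64 start = ktime_get_ns();

		/* ... trigger the event under test and wait for it here ... */

		total_ns += ktime_get_ns() - start;
		udelay(10);

		/* Let this CPU schedule periodically to keep the watchdog happy. */
		if (!(i % 10000))
			msleep(1);
	}

	return iterations ? total_ns / iterations : 0;
}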
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -35,7 +34,7 @@ #include #include #include - +#include /* The period of autoflush checker execution in milliseconds. */ #define AUTOFLUSH_INTERVAL 1000 /* ms */ @@ -184,90 +183,109 @@ static void kbase_tlstream_current_devfreq_target(struct kbase_device *kbdev) } #endif /* CONFIG_MALI_BIFROST_DEVFREQ */ -int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) +int kbase_timeline_acquire(struct kbase_device *kbdev, u32 flags) { - int ret = 0; + int err = 0; u32 timeline_flags = TLSTREAM_ENABLED | flags; - struct kbase_timeline *timeline = kbdev->timeline; + struct kbase_timeline *timeline; + int rcode; - if (!atomic_cmpxchg(timeline->timeline_flags, 0, timeline_flags)) { - int rcode; + if (WARN_ON(!kbdev) || WARN_ON(flags & ~BASE_TLSTREAM_FLAGS_MASK)) + return -EINVAL; + + timeline = kbdev->timeline; + if (WARN_ON(!timeline)) + return -EFAULT; + + if (atomic_cmpxchg(timeline->timeline_flags, 0, timeline_flags)) + return -EBUSY; #if MALI_USE_CSF - if (flags & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) { - ret = kbase_csf_tl_reader_start( - &timeline->csf_tl_reader, kbdev); - if (ret) { - atomic_set(timeline->timeline_flags, 0); - return ret; - } - } -#endif - ret = anon_inode_getfd( - "[mali_tlstream]", - &kbasep_tlstream_fops, - timeline, - O_RDONLY | O_CLOEXEC); - if (ret < 0) { + if (flags & BASE_TLSTREAM_ENABLE_CSFFW_TRACEPOINTS) { + err = kbase_csf_tl_reader_start(&timeline->csf_tl_reader, kbdev); + if (err) { atomic_set(timeline->timeline_flags, 0); -#if MALI_USE_CSF - kbase_csf_tl_reader_stop(&timeline->csf_tl_reader); -#endif - return ret; + return err; } + } +#endif - /* Reset and initialize header streams. */ - kbase_tlstream_reset( - &timeline->streams[TL_STREAM_TYPE_OBJ_SUMMARY]); + /* Reset and initialize header streams. */ + kbase_tlstream_reset(&timeline->streams[TL_STREAM_TYPE_OBJ_SUMMARY]); - timeline->obj_header_btc = obj_desc_header_size; - timeline->aux_header_btc = aux_desc_header_size; + timeline->obj_header_btc = obj_desc_header_size; + timeline->aux_header_btc = aux_desc_header_size; #if !MALI_USE_CSF - /* If job dumping is enabled, readjust the software event's - * timeout as the default value of 3 seconds is often - * insufficient. - */ - if (flags & BASE_TLSTREAM_JOB_DUMPING_ENABLED) { - dev_info(kbdev->dev, - "Job dumping is enabled, readjusting the software event's timeout\n"); - atomic_set(&kbdev->js_data.soft_job_timeout_ms, - 1800000); - } + /* If job dumping is enabled, readjust the software event's + * timeout as the default value of 3 seconds is often + * insufficient. + */ + if (flags & BASE_TLSTREAM_JOB_DUMPING_ENABLED) { + dev_info(kbdev->dev, + "Job dumping is enabled, readjusting the software event's timeout\n"); + atomic_set(&kbdev->js_data.soft_job_timeout_ms, 1800000); + } #endif /* !MALI_USE_CSF */ - /* Summary stream was cleared during acquire. - * Create static timeline objects that will be - * read by client. - */ - kbase_create_timeline_objects(kbdev); + /* Summary stream was cleared during acquire. + * Create static timeline objects that will be + * read by client. + */ + kbase_create_timeline_objects(kbdev); #ifdef CONFIG_MALI_BIFROST_DEVFREQ - /* Devfreq target tracepoints are only fired when the target - * changes, so we won't know the current target unless we - * send it now. 
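/*
 * Illustrative sketch (hypothetical demo_* names, not the kbase code):
 * kbase_timeline_acquire() above gates ownership with a single
 * atomic_cmpxchg() on timeline_flags, so a second client gets -EBUSY until
 * the first one releases. The pattern in isolation, assuming any non-zero
 * flags value marks the stream as owned:
 */
#include <linux/atomic.h>
#include <linux/errno.h>

static atomic_t demo_flags = ATOMIC_INIT(0);

static int demo_acquire(int flags)
{
	/* Succeeds only if demo_flags was still 0 (unowned). */
	if (atomic_cmpxchg(&demo_flags, 0, flags))
		return -EBUSY;

	/* ... set up streams here; on failure roll back with demo_release() ... */
	return 0;
}

static void demo_release(void)
{
	atomic_set(&demo_flags, 0);
}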
- */ - kbase_tlstream_current_devfreq_target(kbdev); + /* Devfreq target tracepoints are only fired when the target + * changes, so we won't know the current target unless we + * send it now. + */ + kbase_tlstream_current_devfreq_target(kbdev); #endif /* CONFIG_MALI_BIFROST_DEVFREQ */ - /* Start the autoflush timer. - * We must do this after creating timeline objects to ensure we - * don't auto-flush the streams which will be reset during the - * summarization process. - */ - atomic_set(&timeline->autoflush_timer_active, 1); - rcode = mod_timer(&timeline->autoflush_timer, - jiffies + - msecs_to_jiffies(AUTOFLUSH_INTERVAL)); - CSTD_UNUSED(rcode); - } else { - ret = -EBUSY; - } + /* Start the autoflush timer. + * We must do this after creating timeline objects to ensure we + * don't auto-flush the streams which will be reset during the + * summarization process. + */ + atomic_set(&timeline->autoflush_timer_active, 1); + rcode = mod_timer(&timeline->autoflush_timer, + jiffies + msecs_to_jiffies(AUTOFLUSH_INTERVAL)); + CSTD_UNUSED(rcode); - if (ret >= 0) - timeline->last_acquire_time = ktime_get_raw(); + timeline->last_acquire_time = ktime_get_raw(); - return ret; + return err; +} + +void kbase_timeline_release(struct kbase_timeline *timeline) +{ + ktime_t elapsed_time; + s64 elapsed_time_ms, time_to_sleep; + + if (WARN_ON(!timeline) || WARN_ON(!atomic_read(timeline->timeline_flags))) + return; + + /* Get the amount of time passed since the timeline was acquired and ensure + * we sleep for long enough such that it has been at least + * TIMELINE_HYSTERESIS_TIMEOUT_MS amount of time between acquire and release. + * This prevents userspace from spamming acquire and release too quickly. + */ + elapsed_time = ktime_sub(ktime_get_raw(), timeline->last_acquire_time); + elapsed_time_ms = ktime_to_ms(elapsed_time); + time_to_sleep = (elapsed_time_ms < 0 ? TIMELINE_HYSTERESIS_TIMEOUT_MS : + TIMELINE_HYSTERESIS_TIMEOUT_MS - elapsed_time_ms); + if (time_to_sleep > 0) + msleep_interruptible(time_to_sleep); + +#if MALI_USE_CSF + kbase_csf_tl_reader_stop(&timeline->csf_tl_reader); +#endif + + /* Stop autoflush timer before releasing access to streams. 
*/ + atomic_set(&timeline->autoflush_timer_active, 0); + del_timer_sync(&timeline->autoflush_timer); + + atomic_set(timeline->timeline_flags, 0); } int kbase_timeline_streams_flush(struct kbase_timeline *timeline) @@ -275,11 +293,17 @@ int kbase_timeline_streams_flush(struct kbase_timeline *timeline) enum tl_stream_type stype; bool has_bytes = false; size_t nbytes = 0; -#if MALI_USE_CSF - int ret = kbase_csf_tl_reader_flush_buffer(&timeline->csf_tl_reader); - if (ret > 0) - has_bytes = true; + if (WARN_ON(!timeline)) + return -EINVAL; + +#if MALI_USE_CSF + { + int ret = kbase_csf_tl_reader_flush_buffer(&timeline->csf_tl_reader); + + if (ret > 0) + has_bytes = true; + } #endif for (stype = 0; stype < TL_STREAM_TYPE_COUNT; stype++) { diff --git a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.h b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.h index 96a4b181a285..62be6c64c850 100644 --- a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.h +++ b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline.h @@ -117,4 +117,12 @@ void kbase_timeline_post_kbase_context_destroy(struct kbase_context *kctx); void kbase_timeline_stats(struct kbase_timeline *timeline, u32 *bytes_collected, u32 *bytes_generated); #endif /* MALI_UNIT_TEST */ +/** + * kbase_timeline_io_debugfs_init - Add a debugfs entry for reading timeline stream data + * + * @kbdev: An instance of the GPU platform device, allocated from the probe + * method of the driver. + */ +void kbase_timeline_io_debugfs_init(struct kbase_device *kbdev); + #endif /* _KBASE_TIMELINE_H */ diff --git a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_io.c b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_io.c index af8b3d8c8c35..644d69bc209d 100644 --- a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_io.c +++ b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_io.c @@ -24,9 +24,11 @@ #include "mali_kbase_tracepoints.h" #include "mali_kbase_timeline.h" -#include +#include + #include #include +#include /* The timeline stream file operations functions. */ static ssize_t kbasep_timeline_io_read(struct file *filp, char __user *buffer, @@ -36,15 +38,6 @@ static int kbasep_timeline_io_release(struct inode *inode, struct file *filp); static int kbasep_timeline_io_fsync(struct file *filp, loff_t start, loff_t end, int datasync); -/* The timeline stream file operations structure. */ -const struct file_operations kbasep_tlstream_fops = { - .owner = THIS_MODULE, - .release = kbasep_timeline_io_release, - .read = kbasep_timeline_io_read, - .poll = kbasep_timeline_io_poll, - .fsync = kbasep_timeline_io_fsync, -}; - /** * kbasep_timeline_io_packet_pending - check timeline streams for pending * packets @@ -290,7 +283,8 @@ static ssize_t kbasep_timeline_io_read(struct file *filp, char __user *buffer, * @filp: Pointer to file structure * @wait: Pointer to poll table * - * Return: POLLIN if data can be read without blocking, otherwise zero + * Return: EPOLLIN | EPOLLRDNORM if data can be read without blocking, + * otherwise zero, or EPOLLHUP | EPOLLERR on error. 
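/*
 * Illustrative sketch (hypothetical demo_* names, not the kbase code): the
 * poll handler below reports readiness with __poll_t masks -
 * EPOLLIN | EPOLLRDNORM when data can be read, EPOLLHUP | EPOLLERR for a
 * broken file - rather than returning a negative errno. A minimal .poll
 * callback with the same shape:
 */
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/wait.h>

struct demo_stream {
	wait_queue_head_t event_queue;
	bool data_ready;
};

static __poll_t demo_poll(struct file *filp, poll_table *wait)
{
	struct demo_stream *s = filp->private_data;

	if (!s)
		return EPOLLHUP | EPOLLERR;

	/* Register interest first, then re-check the condition. */
	poll_wait(filp, &s->event_queue, wait);

	if (s->data_ready)
		return EPOLLIN | EPOLLRDNORM;

	return 0;
}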
*/ static __poll_t kbasep_timeline_io_poll(struct file *filp, poll_table *wait) { @@ -302,20 +296,91 @@ static __poll_t kbasep_timeline_io_poll(struct file *filp, poll_table *wait) KBASE_DEBUG_ASSERT(wait); if (WARN_ON(!filp->private_data)) - return (__poll_t)-EFAULT; + return EPOLLHUP | EPOLLERR; timeline = (struct kbase_timeline *)filp->private_data; /* If there are header bytes to copy, read will not block */ if (kbasep_timeline_has_header_data(timeline)) - return POLLIN; + return EPOLLIN | EPOLLRDNORM; poll_wait(filp, &timeline->event_queue, wait); if (kbasep_timeline_io_packet_pending(timeline, &stream, &rb_idx)) - return POLLIN; - return 0; + return EPOLLIN | EPOLLRDNORM; + + return (__poll_t)0; } +int kbase_timeline_io_acquire(struct kbase_device *kbdev, u32 flags) +{ + /* The timeline stream file operations structure. */ + static const struct file_operations kbasep_tlstream_fops = { + .owner = THIS_MODULE, + .release = kbasep_timeline_io_release, + .read = kbasep_timeline_io_read, + .poll = kbasep_timeline_io_poll, + .fsync = kbasep_timeline_io_fsync, + }; + int err; + + if (WARN_ON(!kbdev) || (flags & ~BASE_TLSTREAM_FLAGS_MASK)) + return -EINVAL; + + err = kbase_timeline_acquire(kbdev, flags); + if (err) + return err; + + err = anon_inode_getfd("[mali_tlstream]", &kbasep_tlstream_fops, kbdev->timeline, + O_RDONLY | O_CLOEXEC); + if (err < 0) + kbase_timeline_release(kbdev->timeline); + + return err; +} + +#if IS_ENABLED(CONFIG_DEBUG_FS) +static int kbasep_timeline_io_open(struct inode *in, struct file *file) +{ + struct kbase_device *const kbdev = in->i_private; + + if (WARN_ON(!kbdev)) + return -EFAULT; + + file->private_data = kbdev->timeline; + return kbase_timeline_acquire(kbdev, BASE_TLSTREAM_FLAGS_MASK & + ~BASE_TLSTREAM_JOB_DUMPING_ENABLED); +} + +void kbase_timeline_io_debugfs_init(struct kbase_device *const kbdev) +{ + static const struct file_operations kbasep_tlstream_debugfs_fops = { + .owner = THIS_MODULE, + .open = kbasep_timeline_io_open, + .release = kbasep_timeline_io_release, + .read = kbasep_timeline_io_read, + .poll = kbasep_timeline_io_poll, + .fsync = kbasep_timeline_io_fsync, + }; + struct dentry *file; + + if (WARN_ON(!kbdev) || WARN_ON(IS_ERR_OR_NULL(kbdev->mali_debugfs_directory))) + return; + + file = debugfs_create_file("tlstream", 0444, kbdev->mali_debugfs_directory, kbdev, + &kbasep_tlstream_debugfs_fops); + + if (IS_ERR_OR_NULL(file)) + dev_warn(kbdev->dev, "Unable to create timeline debugfs entry"); +} +#else +/* + * Stub function for when debugfs is disabled + */ +void kbase_timeline_io_debugfs_init(struct kbase_device *const kbdev) +{ +} +#endif + /** * kbasep_timeline_io_release - release timeline stream descriptor * @inode: Pointer to inode structure @@ -325,55 +390,18 @@ static __poll_t kbasep_timeline_io_poll(struct file *filp, poll_table *wait) */ static int kbasep_timeline_io_release(struct inode *inode, struct file *filp) { - struct kbase_timeline *timeline; - ktime_t elapsed_time; - s64 elapsed_time_ms, time_to_sleep; - - KBASE_DEBUG_ASSERT(inode); - KBASE_DEBUG_ASSERT(filp); - KBASE_DEBUG_ASSERT(filp->private_data); - CSTD_UNUSED(inode); - timeline = (struct kbase_timeline *)filp->private_data; - - /* Get the amount of time passed since the timeline was acquired and ensure - * we sleep for long enough such that it has been at least - * TIMELINE_HYSTERESIS_TIMEOUT_MS amount of time between acquire and release. - * This prevents userspace from spamming acquire and release too quickly. 
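/*
 * Illustrative sketch (hypothetical demo_* names and constant, not the kbase
 * code): the release path enforces a minimum interval between acquire and
 * release by sleeping for whatever remains of the hysteresis window, so
 * userspace cannot spin the stream open and closed at high frequency. The
 * timing arithmetic in isolation, with DEMO_HYSTERESIS_MS standing in for
 * the driver's timeout constant:
 */
#include <linux/delay.h>
#include <linux/ktime.h>

#define DEMO_HYSTERESIS_MS 1000

static void demo_rate_limited_release(ktime_t last_acquire_time)
{
	s64 elapsed_ms = ktime_to_ms(ktime_sub(ktime_get_raw(), last_acquire_time));
	s64 time_to_sleep;

	/* A negative elapsed time (clock anomaly) falls back to the full window. */
	time_to_sleep = (elapsed_ms < 0) ? DEMO_HYSTERESIS_MS :
					   DEMO_HYSTERESIS_MS - elapsed_ms;
	if (time_to_sleep > 0)
		msleep_interruptible(time_to_sleep);

	/* ... then stop timers and clear the ownership flag ... */
}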
- */ - elapsed_time = ktime_sub(ktime_get_raw(), timeline->last_acquire_time); - elapsed_time_ms = ktime_to_ms(elapsed_time); - time_to_sleep = MIN(TIMELINE_HYSTERESIS_TIMEOUT_MS, - TIMELINE_HYSTERESIS_TIMEOUT_MS - elapsed_time_ms); - if (time_to_sleep > 0) - msleep(time_to_sleep); - -#if MALI_USE_CSF - kbase_csf_tl_reader_stop(&timeline->csf_tl_reader); -#endif - - /* Stop autoflush timer before releasing access to streams. */ - atomic_set(&timeline->autoflush_timer_active, 0); - del_timer_sync(&timeline->autoflush_timer); - - atomic_set(timeline->timeline_flags, 0); + kbase_timeline_release(filp->private_data); return 0; } static int kbasep_timeline_io_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct kbase_timeline *timeline; - CSTD_UNUSED(start); CSTD_UNUSED(end); CSTD_UNUSED(datasync); - if (WARN_ON(!filp->private_data)) - return -EFAULT; - - timeline = (struct kbase_timeline *)filp->private_data; - - return kbase_timeline_streams_flush(timeline); + return kbase_timeline_streams_flush(filp->private_data); } diff --git a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_priv.h b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_priv.h index bf2c3855434a..de30bccc7cca 100644 --- a/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_priv.h +++ b/drivers/gpu/arm/bifrost/tl/mali_kbase_timeline_priv.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -51,7 +51,7 @@ * @event_queue: Timeline stream event queue * @bytes_collected: Number of bytes read by user * @timeline_flags: Zero, if timeline is disabled. Timeline stream flags - * otherwise. See kbase_timeline_io_acquire(). + * otherwise. See kbase_timeline_acquire(). * @obj_header_btc: Remaining bytes to copy for the object stream header * @aux_header_btc: Remaining bytes to copy for the aux stream header * @last_acquire_time: The time at which timeline was last acquired. @@ -77,8 +77,27 @@ struct kbase_timeline { #endif }; -extern const struct file_operations kbasep_tlstream_fops; - void kbase_create_timeline_objects(struct kbase_device *kbdev); +/** + * kbase_timeline_acquire - acquire timeline for a userspace client. + * @kbdev: An instance of the GPU platform device, allocated from the probe + * method of the driver. + * @flags: Timeline stream flags + * + * Each timeline instance can be acquired by only one userspace client at a time. + * + * Return: Zero on success, error number on failure (e.g. if already acquired). + */ +int kbase_timeline_acquire(struct kbase_device *kbdev, u32 flags); + +/** + * kbase_timeline_release - release timeline for a userspace client. + * @timeline: Timeline instance to be stopped. It must be previously acquired + * with kbase_timeline_acquire(). + * + * Releasing the timeline instance allows it to be acquired by another userspace client. 
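/*
 * Illustrative sketch (hypothetical demo_* names, not the kbase code): with
 * acquire and release split into their own helpers, the ioctl path reduces
 * to "acquire, wrap the object in an anonymous fd, release again if fd
 * creation fails". The ordering, stripped to its skeleton:
 */
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/module.h>

static int demo_acquire_stream(void)  { return 0; }	/* stand-in: 0 or -errno */
static void demo_release_stream(void) { }		/* stand-in */

static const struct file_operations demo_stream_fops = {
	.owner = THIS_MODULE,
	/* .read / .poll / .release / .fsync would go here */
};

static int demo_io_acquire(void *stream)
{
	int err = demo_acquire_stream();

	if (err)
		return err;

	/* Returns the new fd (>= 0) on success, a negative errno otherwise. */
	err = anon_inode_getfd("[demo_stream]", &demo_stream_fops, stream,
			       O_RDONLY | O_CLOEXEC);
	if (err < 0)
		demo_release_stream();	/* undo the acquire on failure */

	return err;
}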
+ */ +void kbase_timeline_release(struct kbase_timeline *timeline); + #endif /* _KBASE_TIMELINE_PRIV_H */ diff --git a/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.c b/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.c index 3ac78503ce1f..fd0d0c01adde 100644 --- a/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.c +++ b/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.c @@ -100,14 +100,14 @@ enum tl_msg_id_obj { KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_MAP_IMPORT, KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_UNMAP_IMPORT, KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_UNMAP_IMPORT_FORCE, - KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER, - KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND, KBASE_TL_KBASE_ARRAY_BEGIN_KCPUQUEUE_ENQUEUE_JIT_ALLOC, KBASE_TL_KBASE_ARRAY_ITEM_KCPUQUEUE_ENQUEUE_JIT_ALLOC, KBASE_TL_KBASE_ARRAY_END_KCPUQUEUE_ENQUEUE_JIT_ALLOC, KBASE_TL_KBASE_ARRAY_BEGIN_KCPUQUEUE_ENQUEUE_JIT_FREE, KBASE_TL_KBASE_ARRAY_ITEM_KCPUQUEUE_ENQUEUE_JIT_FREE, KBASE_TL_KBASE_ARRAY_END_KCPUQUEUE_ENQUEUE_JIT_FREE, + KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER, + KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND, KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_FENCE_SIGNAL_START, KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_FENCE_SIGNAL_END, KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_FENCE_WAIT_START, @@ -416,14 +416,6 @@ enum tl_msg_id_obj { "KCPU Queue enqueues Unmap Import ignoring reference count", \ "@pL", \ "kcpu_queue,map_import_buf_gpu_addr") \ - TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER, \ - "KCPU Queue enqueues Error Barrier", \ - "@p", \ - "kcpu_queue") \ - TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND, \ - "KCPU Queue enqueues Group Suspend", \ - "@ppI", \ - "kcpu_queue,group_suspend_buf,gpu_cmdq_grp_handle") \ TRACEPOINT_DESC(KBASE_TL_KBASE_ARRAY_BEGIN_KCPUQUEUE_ENQUEUE_JIT_ALLOC, \ "Begin array of KCPU Queue enqueues JIT Alloc", \ "@p", \ @@ -448,6 +440,14 @@ enum tl_msg_id_obj { "End array of KCPU Queue enqueues JIT Free", \ "@p", \ "kcpu_queue") \ + TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER, \ + "KCPU Queue enqueues Error Barrier", \ + "@p", \ + "kcpu_queue") \ + TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND, \ + "KCPU Queue enqueues Group Suspend", \ + "@ppI", \ + "kcpu_queue,group_suspend_buf,gpu_cmdq_grp_handle") \ TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_FENCE_SIGNAL_START, \ "KCPU Queue starts a Signal on Fence", \ "@p", \ @@ -465,15 +465,15 @@ enum tl_msg_id_obj { "@pI", \ "kcpu_queue,execute_error") \ TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_WAIT_START, \ - "KCPU Queue starts a Wait on an array of Cross Queue Sync Objects", \ + "KCPU Queue starts a Wait on Cross Queue Sync Object", \ "@p", \ "kcpu_queue") \ TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_WAIT_END, \ - "KCPU Queue ends a Wait on an array of Cross Queue Sync Objects", \ + "KCPU Queue ends a Wait on Cross Queue Sync Object", \ "@pI", \ "kcpu_queue,execute_error") \ TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_SET, \ - "KCPU Queue executes a Set on an array of Cross Queue Sync Objects", \ + "KCPU Queue executes a Set on Cross Queue Sync Object", \ "@pI", \ "kcpu_queue,execute_error") \ TRACEPOINT_DESC(KBASE_TL_KBASE_KCPUQUEUE_EXECUTE_MAP_IMPORT_START, \ @@ -2540,60 +2540,6 @@ void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_unmap_import_force( kbase_tlstream_msgbuf_release(stream, acq_flags); } -void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_error_barrier( - struct kbase_tlstream *stream, - const void *kcpu_queue -) -{ - const u32 msg_id = 
KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER; - const size_t msg_size = sizeof(msg_id) + sizeof(u64) - + sizeof(kcpu_queue) - ; - char *buffer; - unsigned long acq_flags; - size_t pos = 0; - - buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); - - pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); - pos = kbasep_serialize_timestamp(buffer, pos); - pos = kbasep_serialize_bytes(buffer, - pos, &kcpu_queue, sizeof(kcpu_queue)); - - kbase_tlstream_msgbuf_release(stream, acq_flags); -} - -void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_group_suspend( - struct kbase_tlstream *stream, - const void *kcpu_queue, - const void *group_suspend_buf, - u32 gpu_cmdq_grp_handle -) -{ - const u32 msg_id = KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND; - const size_t msg_size = sizeof(msg_id) + sizeof(u64) - + sizeof(kcpu_queue) - + sizeof(group_suspend_buf) - + sizeof(gpu_cmdq_grp_handle) - ; - char *buffer; - unsigned long acq_flags; - size_t pos = 0; - - buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); - - pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); - pos = kbasep_serialize_timestamp(buffer, pos); - pos = kbasep_serialize_bytes(buffer, - pos, &kcpu_queue, sizeof(kcpu_queue)); - pos = kbasep_serialize_bytes(buffer, - pos, &group_suspend_buf, sizeof(group_suspend_buf)); - pos = kbasep_serialize_bytes(buffer, - pos, &gpu_cmdq_grp_handle, sizeof(gpu_cmdq_grp_handle)); - - kbase_tlstream_msgbuf_release(stream, acq_flags); -} - void __kbase_tlstream_tl_kbase_array_begin_kcpuqueue_enqueue_jit_alloc( struct kbase_tlstream *stream, const void *kcpu_queue @@ -2772,6 +2718,60 @@ void __kbase_tlstream_tl_kbase_array_end_kcpuqueue_enqueue_jit_free( kbase_tlstream_msgbuf_release(stream, acq_flags); } +void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_error_barrier( + struct kbase_tlstream *stream, + const void *kcpu_queue +) +{ + const u32 msg_id = KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(kcpu_queue) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &kcpu_queue, sizeof(kcpu_queue)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + +void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_group_suspend( + struct kbase_tlstream *stream, + const void *kcpu_queue, + const void *group_suspend_buf, + u32 gpu_cmdq_grp_handle +) +{ + const u32 msg_id = KBASE_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND; + const size_t msg_size = sizeof(msg_id) + sizeof(u64) + + sizeof(kcpu_queue) + + sizeof(group_suspend_buf) + + sizeof(gpu_cmdq_grp_handle) + ; + char *buffer; + unsigned long acq_flags; + size_t pos = 0; + + buffer = kbase_tlstream_msgbuf_acquire(stream, msg_size, &acq_flags); + + pos = kbasep_serialize_bytes(buffer, pos, &msg_id, sizeof(msg_id)); + pos = kbasep_serialize_timestamp(buffer, pos); + pos = kbasep_serialize_bytes(buffer, + pos, &kcpu_queue, sizeof(kcpu_queue)); + pos = kbasep_serialize_bytes(buffer, + pos, &group_suspend_buf, sizeof(group_suspend_buf)); + pos = kbasep_serialize_bytes(buffer, + pos, &gpu_cmdq_grp_handle, sizeof(gpu_cmdq_grp_handle)); + + kbase_tlstream_msgbuf_release(stream, acq_flags); +} + void __kbase_tlstream_tl_kbase_kcpuqueue_execute_fence_signal_start( struct kbase_tlstream *stream, const 
void *kcpu_queue diff --git a/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.h b/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.h index cb1e63ef56f5..be0c62edecd7 100644 --- a/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.h +++ b/drivers/gpu/arm/bifrost/tl/mali_kbase_tracepoints.h @@ -77,7 +77,7 @@ extern const size_t aux_desc_header_size; #define TL_JS_EVENT_STOP GATOR_JOB_SLOT_STOP #define TL_JS_EVENT_SOFT_STOP GATOR_JOB_SLOT_SOFT_STOPPED -#define TLSTREAM_ENABLED (1 << 31) +#define TLSTREAM_ENABLED (1u << 31) void __kbase_tlstream_tl_new_ctx( struct kbase_tlstream *stream, @@ -496,18 +496,6 @@ void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_unmap_import_force( u64 map_import_buf_gpu_addr ); -void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_error_barrier( - struct kbase_tlstream *stream, - const void *kcpu_queue -); - -void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_group_suspend( - struct kbase_tlstream *stream, - const void *kcpu_queue, - const void *group_suspend_buf, - u32 gpu_cmdq_grp_handle -); - void __kbase_tlstream_tl_kbase_array_begin_kcpuqueue_enqueue_jit_alloc( struct kbase_tlstream *stream, const void *kcpu_queue @@ -548,6 +536,18 @@ void __kbase_tlstream_tl_kbase_array_end_kcpuqueue_enqueue_jit_free( const void *kcpu_queue ); +void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_error_barrier( + struct kbase_tlstream *stream, + const void *kcpu_queue +); + +void __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_group_suspend( + struct kbase_tlstream *stream, + const void *kcpu_queue, + const void *group_suspend_buf, + u32 gpu_cmdq_grp_handle +); + void __kbase_tlstream_tl_kbase_kcpuqueue_execute_fence_signal_start( struct kbase_tlstream *stream, const void *kcpu_queue @@ -2493,68 +2493,6 @@ struct kbase_tlstream; do { } while (0) #endif /* MALI_USE_CSF */ -/** - * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER - KCPU Queue enqueues Error Barrier - * - * @kbdev: Kbase device - * @kcpu_queue: KCPU queue - */ -#if MALI_USE_CSF -#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER( \ - kbdev, \ - kcpu_queue \ - ) \ - do { \ - int enabled = atomic_read(&kbdev->timeline_flags); \ - if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ - __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_error_barrier( \ - __TL_DISPATCH_STREAM(kbdev, obj), \ - kcpu_queue \ - ); \ - } while (0) -#else -#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER( \ - kbdev, \ - kcpu_queue \ - ) \ - do { } while (0) -#endif /* MALI_USE_CSF */ - -/** - * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND - KCPU Queue enqueues Group Suspend - * - * @kbdev: Kbase device - * @kcpu_queue: KCPU queue - * @group_suspend_buf: Pointer to the suspend buffer structure - * @gpu_cmdq_grp_handle: GPU Command Queue Group handle which will match userspace - */ -#if MALI_USE_CSF -#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND( \ - kbdev, \ - kcpu_queue, \ - group_suspend_buf, \ - gpu_cmdq_grp_handle \ - ) \ - do { \ - int enabled = atomic_read(&kbdev->timeline_flags); \ - if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ - __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_group_suspend( \ - __TL_DISPATCH_STREAM(kbdev, obj), \ - kcpu_queue, \ - group_suspend_buf, \ - gpu_cmdq_grp_handle \ - ); \ - } while (0) -#else -#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND( \ - kbdev, \ - kcpu_queue, \ - group_suspend_buf, \ - gpu_cmdq_grp_handle \ - ) \ - do { } while (0) -#endif /* MALI_USE_CSF */ - /** * KBASE_TLSTREAM_TL_KBASE_ARRAY_BEGIN_KCPUQUEUE_ENQUEUE_JIT_ALLOC - 
Begin array of KCPU Queue enqueues JIT Alloc * @@ -2757,6 +2695,68 @@ struct kbase_tlstream; do { } while (0) #endif /* MALI_USE_CSF */ +/** + * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER - KCPU Queue enqueues Error Barrier + * + * @kbdev: Kbase device + * @kcpu_queue: KCPU queue + */ +#if MALI_USE_CSF +#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER( \ + kbdev, \ + kcpu_queue \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ + __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_error_barrier( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + kcpu_queue \ + ); \ + } while (0) +#else +#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_ERROR_BARRIER( \ + kbdev, \ + kcpu_queue \ + ) \ + do { } while (0) +#endif /* MALI_USE_CSF */ + +/** + * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND - KCPU Queue enqueues Group Suspend + * + * @kbdev: Kbase device + * @kcpu_queue: KCPU queue + * @group_suspend_buf: Pointer to the suspend buffer structure + * @gpu_cmdq_grp_handle: GPU Command Queue Group handle which will match userspace + */ +#if MALI_USE_CSF +#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND( \ + kbdev, \ + kcpu_queue, \ + group_suspend_buf, \ + gpu_cmdq_grp_handle \ + ) \ + do { \ + int enabled = atomic_read(&kbdev->timeline_flags); \ + if (enabled & BASE_TLSTREAM_ENABLE_CSF_TRACEPOINTS) \ + __kbase_tlstream_tl_kbase_kcpuqueue_enqueue_group_suspend( \ + __TL_DISPATCH_STREAM(kbdev, obj), \ + kcpu_queue, \ + group_suspend_buf, \ + gpu_cmdq_grp_handle \ + ); \ + } while (0) +#else +#define KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_ENQUEUE_GROUP_SUSPEND( \ + kbdev, \ + kcpu_queue, \ + group_suspend_buf, \ + gpu_cmdq_grp_handle \ + ) \ + do { } while (0) +#endif /* MALI_USE_CSF */ + /** * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_FENCE_SIGNAL_START - KCPU Queue starts a Signal on Fence * @@ -2874,7 +2874,7 @@ struct kbase_tlstream; #endif /* MALI_USE_CSF */ /** - * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_WAIT_START - KCPU Queue starts a Wait on an array of Cross Queue Sync Objects + * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_WAIT_START - KCPU Queue starts a Wait on Cross Queue Sync Object * * @kbdev: Kbase device * @kcpu_queue: KCPU queue @@ -2901,7 +2901,7 @@ struct kbase_tlstream; #endif /* MALI_USE_CSF */ /** - * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_WAIT_END - KCPU Queue ends a Wait on an array of Cross Queue Sync Objects + * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_WAIT_END - KCPU Queue ends a Wait on Cross Queue Sync Object * * @kbdev: Kbase device * @kcpu_queue: KCPU queue @@ -2932,7 +2932,7 @@ struct kbase_tlstream; #endif /* MALI_USE_CSF */ /** - * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_SET - KCPU Queue executes a Set on an array of Cross Queue Sync Objects + * KBASE_TLSTREAM_TL_KBASE_KCPUQUEUE_EXECUTE_CQS_SET - KCPU Queue executes a Set on Cross Queue Sync Object * * @kbdev: Kbase device * @kcpu_queue: KCPU queue diff --git a/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbiter_interface.h b/include/linux/mali_arbiter_interface.h similarity index 99% rename from drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbiter_interface.h rename to include/linux/mali_arbiter_interface.h index a0ca1ccddcc9..8e675ec2ad3b 100644 --- a/drivers/gpu/arm/bifrost/arbiter/mali_kbase_arbiter_interface.h +++ b/include/linux/mali_arbiter_interface.h @@ -41,7 +41,7 @@ * 4 - Added max_config support * 5 - Added GPU clock frequency reporting support from arbiter */ -#define 
MALI_KBASE_ARBITER_INTERFACE_VERSION 5 +#define MALI_ARBITER_INTERFACE_VERSION 5 /** * DOC: NO_FREQ is used in case platform doesn't support reporting frequency diff --git a/include/linux/memory_group_manager.h b/include/linux/memory_group_manager.h index c4667803b361..786e3b995f29 100644 --- a/include/linux/memory_group_manager.h +++ b/include/linux/memory_group_manager.h @@ -43,6 +43,8 @@ struct memory_group_manager_import_data; * @mgm_free_page: Callback to free physical memory in a group * @mgm_get_import_memory_id: Callback to get the group ID for imported memory * @mgm_update_gpu_pte: Callback to modify a GPU page table entry + * @mgm_pte_to_original_pte: Callback to get the original PTE entry as given + * to mgm_update_gpu_pte * @mgm_vmf_insert_pfn_prot: Callback to map a physical memory page for the CPU */ struct memory_group_manager_ops { @@ -120,7 +122,8 @@ struct memory_group_manager_ops { * This function allows the memory group manager to modify a GPU page * table entry before it is stored by the kbase module (controller * driver). It may set certain bits in the page table entry attributes - * or in the physical address, based on the physical memory group ID. + * or modify the physical address, based on the physical memory group ID + * and/or additional data in struct memory_group_manager_device. * * Return: A modified GPU page table entry to be stored in a page table. */ @@ -128,6 +131,17 @@ struct memory_group_manager_ops { int group_id, int mmu_level, u64 pte); /* + * mgm_pte_to_original_pte - Undo any modification done during mgm_update_gpu_pte() + * + * @mgm_dev: The memory group manager through which the request + * is being made. + * @group_id: A physical memory group ID. The meaning of this is + * defined by the systems integrator. Its valid range is + * 0 .. MEMORY_GROUP_MANAGER_NR_GROUPS-1. + * @mmu_level: The level of the page table entry in @ate. + * @pte: The page table entry to restore the original representation for, + * in LPAE or AArch64 format (depending on the driver's configuration). + * * Undo any modifications done during mgm_update_gpu_pte(). * This function allows getting back the original PTE entry as given * to mgm_update_gpu_pte(). diff --git a/include/linux/version_compat_defs.h b/include/linux/version_compat_defs.h index a8e08742069d..d0a09985c5ca 100644 --- a/include/linux/version_compat_defs.h +++ b/include/linux/version_compat_defs.h @@ -28,4 +28,20 @@ typedef unsigned int __poll_t; #endif +#ifndef EPOLLHUP +#define EPOLLHUP POLLHUP +#endif + +#ifndef EPOLLERR +#define EPOLLERR POLLERR +#endif + +#ifndef EPOLLIN +#define EPOLLIN POLLIN +#endif + +#ifndef EPOLLRDNORM +#define EPOLLRDNORM POLLRDNORM +#endif + #endif /* _VERSION_COMPAT_DEFS_H_ */ diff --git a/include/uapi/gpu/arm/bifrost/csf/mali_base_csf_kernel.h b/include/uapi/gpu/arm/bifrost/csf/mali_base_csf_kernel.h index 3b02350c08bf..d9813c055809 100644 --- a/include/uapi/gpu/arm/bifrost/csf/mali_base_csf_kernel.h +++ b/include/uapi/gpu/arm/bifrost/csf/mali_base_csf_kernel.h @@ -573,6 +573,7 @@ struct base_csf_notification { * is a bitpattern where a set bit indicates that the format is supported. * Before using a texture format, it is recommended that the corresponding * bit be checked. + * @paddings: Padding bytes. * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. 
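/*
 * Illustrative note (hypothetical demo_* struct, not the kbase UAPI): the new
 * "paddings" members above make the alignment hole before a __u64 explicit,
 * so 32-bit and 64-bit userspace agree on the structure layout instead of
 * relying on compiler-inserted padding. A sketch with a compile-time check:
 */
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/build_bug.h>

struct demo_props {
	__u32 first;
	__u32 features[3];	/* 16 bytes so far */
	__u32 counter_size;	/* 20 bytes: the next __u64 needs 4 bytes of pad */
	__u8  paddings[4];	/* state the hole explicitly */
	__u64 available_memory_size;
};

/* The 64-bit field must land on an 8-byte boundary on every ABI. */
static_assert(offsetof(struct demo_props, available_memory_size) % 8 == 0);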
* It is unlikely that a client will be able to allocate all of this memory * for their own purposes, but this at least provides an upper bound on the @@ -590,6 +591,7 @@ struct mali_base_gpu_core_props { __u32 gpu_freq_khz_max; __u32 log2_program_counter_size; __u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS]; + __u8 paddings[4]; __u64 gpu_available_memory_size; }; diff --git a/include/uapi/gpu/arm/bifrost/csf/mali_kbase_csf_errors_dumpfault.h b/include/uapi/gpu/arm/bifrost/csf/mali_kbase_csf_errors_dumpfault.h new file mode 100644 index 000000000000..f49ab0036fc3 --- /dev/null +++ b/include/uapi/gpu/arm/bifrost/csf/mali_kbase_csf_errors_dumpfault.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. + * + * This program is free software and is provided to you under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation, and any use by you of this program is subject to the terms + * of such GNU license. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + */ + +#ifndef _UAPI_KBASE_CSF_ERRORS_DUMPFAULT_H_ +#define _UAPI_KBASE_CSF_ERRORS_DUMPFAULT_H_ + +/** + * enum dumpfault_error_type - Enumeration to define errors to be dumped + * + * @DF_NO_ERROR: No pending error + * @DF_CSG_SUSPEND_TIMEOUT: CSG suspension timeout + * @DF_CSG_TERMINATE_TIMEOUT: CSG group termination timeout + * @DF_CSG_START_TIMEOUT: CSG start timeout + * @DF_CSG_RESUME_TIMEOUT: CSG resume timeout + * @DF_CSG_EP_CFG_TIMEOUT: CSG end point configuration timeout + * @DF_CSG_STATUS_UPDATE_TIMEOUT: CSG status update timeout + * @DF_PROGRESS_TIMER_TIMEOUT: Progress timer timeout + * @DF_FW_INTERNAL_ERROR: Firmware internal error + * @DF_CS_FATAL: CS fatal error + * @DF_CS_FAULT: CS fault error + * @DF_FENCE_WAIT_TIMEOUT: Fence wait timeout + * @DF_PROTECTED_MODE_EXIT_TIMEOUT: P.mode exit timeout + * @DF_PROTECTED_MODE_ENTRY_FAILURE: P.mode entrance failure + * @DF_PING_REQUEST_TIMEOUT: Ping request timeout + * @DF_CORE_DOWNSCALE_REQUEST_TIMEOUT: DCS downscale request timeout + * @DF_TILER_OOM: Tiler Out-of-memory error + * @DF_GPU_PAGE_FAULT: GPU page fault + * @DF_BUS_FAULT: MMU BUS Fault + * @DF_GPU_PROTECTED_FAULT: GPU P.mode fault + * @DF_AS_ACTIVE_STUCK: AS active stuck + * @DF_GPU_SOFT_RESET_FAILURE: GPU soft reset falure + * + * This is used for kbase to notify error type of an event whereby + * user space client will dump relevant debugging information via debugfs. + * @DF_NO_ERROR is used to indicate no pending fault, thus the client will + * be blocked on reading debugfs file till a fault happens. 
+ */ +enum dumpfault_error_type { + DF_NO_ERROR = 0, + DF_CSG_SUSPEND_TIMEOUT, + DF_CSG_TERMINATE_TIMEOUT, + DF_CSG_START_TIMEOUT, + DF_CSG_RESUME_TIMEOUT, + DF_CSG_EP_CFG_TIMEOUT, + DF_CSG_STATUS_UPDATE_TIMEOUT, + DF_PROGRESS_TIMER_TIMEOUT, + DF_FW_INTERNAL_ERROR, + DF_CS_FATAL, + DF_CS_FAULT, + DF_FENCE_WAIT_TIMEOUT, + DF_PROTECTED_MODE_EXIT_TIMEOUT, + DF_PROTECTED_MODE_ENTRY_FAILURE, + DF_PING_REQUEST_TIMEOUT, + DF_CORE_DOWNSCALE_REQUEST_TIMEOUT, + DF_TILER_OOM, + DF_GPU_PAGE_FAULT, + DF_BUS_FAULT, + DF_GPU_PROTECTED_FAULT, + DF_AS_ACTIVE_STUCK, + DF_GPU_SOFT_RESET_FAILURE, +}; + +#endif /* _UAPI_KBASE_CSF_ERRORS_DUMPFAULT_H_ */ diff --git a/include/uapi/gpu/arm/bifrost/csf/mali_kbase_csf_ioctl.h b/include/uapi/gpu/arm/bifrost/csf/mali_kbase_csf_ioctl.h index db7252605f06..d9a1867e13c3 100644 --- a/include/uapi/gpu/arm/bifrost/csf/mali_kbase_csf_ioctl.h +++ b/include/uapi/gpu/arm/bifrost/csf/mali_kbase_csf_ioctl.h @@ -60,10 +60,22 @@ * - Dummy model (no mali) backend will now clear HWC values after each sample * 1.12: * - Added support for incremental rendering flag in CSG create call + * 1.13: + * - Added ioctl to query a register of USER page. + * 1.14: + * - Added support for passing down the buffer descriptor VA in tiler heap init + * 1.15: + * - Enable new sync_wait GE condition + * 1.16: + * - Remove legacy definitions: + * - base_jit_alloc_info_10_2 + * - base_jit_alloc_info_11_5 + * - kbase_ioctl_mem_jit_init_10_2 + * - kbase_ioctl_mem_jit_init_11_5 */ #define BASE_UK_VERSION_MAJOR 1 -#define BASE_UK_VERSION_MINOR 12 +#define BASE_UK_VERSION_MINOR 16 /** * struct kbase_ioctl_version_check - Check version compatibility between @@ -271,9 +283,9 @@ union kbase_ioctl_cs_queue_group_create { __u8 csi_handlers; __u8 padding[2]; /** - * @in.reserved: Reserved + * @in.dvs_buf: buffer for deferred vertex shader */ - __u64 reserved; + __u64 dvs_buf; } in; struct { __u8 group_handle; @@ -361,6 +373,7 @@ struct kbase_ioctl_kcpu_queue_enqueue { * allowed. * @in.group_id: Group ID to be used for physical allocations. * @in.padding: Padding + * @in.buf_desc_va: Buffer descriptor GPU VA for tiler heap reclaims. * @out: Output parameters * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up * for the heap. @@ -376,6 +389,7 @@ union kbase_ioctl_cs_tiler_heap_init { __u16 target_in_flight; __u8 group_id; __u8 padding; + __u64 buf_desc_va; } in; struct { __u64 gpu_heap_va; @@ -386,6 +400,43 @@ union kbase_ioctl_cs_tiler_heap_init { #define KBASE_IOCTL_CS_TILER_HEAP_INIT \ _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init) +/** + * union kbase_ioctl_cs_tiler_heap_init_1_13 - Initialize chunked tiler memory heap, + * earlier version upto 1.13 + * @in: Input parameters + * @in.chunk_size: Size of each chunk. + * @in.initial_chunks: Initial number of chunks that heap will be created with. + * @in.max_chunks: Maximum number of chunks that the heap is allowed to use. + * @in.target_in_flight: Number of render-passes that the driver should attempt to + * keep in flight for which allocation of new chunks is + * allowed. + * @in.group_id: Group ID to be used for physical allocations. + * @in.padding: Padding + * @out: Output parameters + * @out.gpu_heap_va: GPU VA (virtual address) of Heap context that was set up + * for the heap. + * @out.first_chunk_va: GPU VA of the first chunk allocated for the heap, + * actually points to the header of heap chunk and not to + * the low address of free memory in the chunk. 
+ */ +union kbase_ioctl_cs_tiler_heap_init_1_13 { + struct { + __u32 chunk_size; + __u32 initial_chunks; + __u32 max_chunks; + __u16 target_in_flight; + __u8 group_id; + __u8 padding; + } in; + struct { + __u64 gpu_heap_va; + __u64 first_chunk_va; + } out; +}; + +#define KBASE_IOCTL_CS_TILER_HEAP_INIT_1_13 \ + _IOWR(KBASE_IOCTL_TYPE, 48, union kbase_ioctl_cs_tiler_heap_init_1_13) + /** * struct kbase_ioctl_cs_tiler_heap_term - Terminate a chunked tiler heap * instance @@ -487,6 +538,29 @@ union kbase_ioctl_mem_alloc_ex { #define KBASE_IOCTL_MEM_ALLOC_EX _IOWR(KBASE_IOCTL_TYPE, 59, union kbase_ioctl_mem_alloc_ex) +/** + * union kbase_ioctl_read_user_page - Read a register of USER page + * + * @in: Input parameters. + * @in.offset: Register offset in USER page. + * @in.padding: Padding to round up to a multiple of 8 bytes, must be zero. + * @out: Output parameters. + * @out.val_lo: Value of 32bit register or the 1st half of 64bit register to be read. + * @out.val_hi: Value of the 2nd half of 64bit register to be read. + */ +union kbase_ioctl_read_user_page { + struct { + __u32 offset; + __u32 padding; + } in; + struct { + __u32 val_lo; + __u32 val_hi; + } out; +}; + +#define KBASE_IOCTL_READ_USER_PAGE _IOWR(KBASE_IOCTL_TYPE, 60, union kbase_ioctl_read_user_page) + /*************** * test ioctls * ***************/ diff --git a/drivers/base/arm/dma_buf_lock/src/build.bp b/include/uapi/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h similarity index 67% rename from drivers/base/arm/dma_buf_lock/src/build.bp rename to include/uapi/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h index dc49c0feb44d..75ae6a1a5409 100644 --- a/drivers/base/arm/dma_buf_lock/src/build.bp +++ b/include/uapi/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_csf.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2022 ARM Limited. All rights reserved. * * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -19,19 +19,12 @@ * */ -bob_kernel_module { - name: "dma_buf_lock", - defaults: [ - "kernel_defaults" - ], - srcs: [ - "Kbuild", - "dma_buf_lock.c", - "dma_buf_lock.h", - ], - enabled: false, - dma_buf_lock: { - kbuild_options: ["CONFIG_DMA_BUF_LOCK=y"], - enabled: true, - }, -} +#ifndef _UAPI_KBASE_GPU_REGMAP_CSF_H_ +#define _UAPI_KBASE_GPU_REGMAP_CSF_H_ + +/* IPA control registers */ +#define IPA_CONTROL_BASE 0x40000 +#define IPA_CONTROL_REG(r) (IPA_CONTROL_BASE + (r)) +#define STATUS 0x004 /* (RO) Status register */ + +#endif /* _UAPI_KBASE_GPU_REGMAP_CSF_H_ */ diff --git a/include/uapi/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h b/include/uapi/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h index f46638947953..87f849d28c93 100644 --- a/include/uapi/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h +++ b/include/uapi/gpu/arm/bifrost/gpu/backend/mali_kbase_gpu_regmap_jm.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. 
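/*
 * Illustrative note (hypothetical DEMO_* names, not the kbase UAPI):
 * KBASE_IOCTL_CS_TILER_HEAP_INIT and the retained _1_13 variant above share
 * ioctl number 48, yet remain distinct commands because _IOWR() also encodes
 * the argument size, and the newer union grew a buf_desc_va field. A sketch
 * of that property:
 */
#include <linux/ioctl.h>
#include <linux/types.h>
#include <linux/build_bug.h>

struct demo_heap_init_old { __u32 chunk_size; __u32 max_chunks; };
struct demo_heap_init_new { __u32 chunk_size; __u32 max_chunks; __u64 buf_desc_va; };

#define DEMO_IOCTL_TYPE 0x80	/* stand-in for the real ioctl type byte */
#define DEMO_HEAP_INIT_OLD _IOWR(DEMO_IOCTL_TYPE, 48, struct demo_heap_init_old)
#define DEMO_HEAP_INIT_NEW _IOWR(DEMO_IOCTL_TYPE, 48, struct demo_heap_init_new)

/* Same direction, type and number, but different sizes => different commands. */
static_assert(DEMO_HEAP_INIT_OLD != DEMO_HEAP_INIT_NEW);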
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -23,8 +23,24 @@ #define _UAPI_KBASE_GPU_REGMAP_JM_H_ /* GPU control registers */ -#define LATEST_FLUSH 0x038 /* (RO) Flush ID of latest - * clean-and-invalidate operation - */ + +#define LATEST_FLUSH 0x038 /* (RO) Flush ID of latest clean-and-invalidate operation */ + +/* Job control registers */ + +#define JS_HEAD_LO 0x00 /* (RO) Job queue head pointer for job slot n, low word */ +#define JS_HEAD_HI 0x04 /* (RO) Job queue head pointer for job slot n, high word */ +#define JS_TAIL_LO 0x08 /* (RO) Job queue tail pointer for job slot n, low word */ +#define JS_TAIL_HI 0x0C /* (RO) Job queue tail pointer for job slot n, high word */ +#define JS_AFFINITY_LO 0x10 /* (RO) Core affinity mask for job slot n, low word */ +#define JS_AFFINITY_HI 0x14 /* (RO) Core affinity mask for job slot n, high word */ +#define JS_CONFIG 0x18 /* (RO) Configuration settings for job slot n */ + +#define JS_HEAD_NEXT_LO 0x40 /* (RW) Next job queue head pointer for job slot n, low word */ +#define JS_HEAD_NEXT_HI 0x44 /* (RW) Next job queue head pointer for job slot n, high word */ +#define JS_AFFINITY_NEXT_LO 0x50 /* (RW) Next core affinity mask for job slot n, low word */ +#define JS_AFFINITY_NEXT_HI 0x54 /* (RW) Next core affinity mask for job slot n, high word */ +#define JS_CONFIG_NEXT 0x58 /* (RW) Next configuration settings for job slot n */ +#define JS_COMMAND_NEXT 0x60 /* (RW) Next command register for job slot n */ #endif /* _UAPI_KBASE_GPU_REGMAP_JM_H_ */ diff --git a/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_id.h b/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_id.h index 1a99e56b0910..1f34d99830fe 100644 --- a/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_id.h +++ b/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_id.h @@ -126,6 +126,8 @@ #define GPU_ID2_PRODUCT_LODX GPU_ID2_MODEL_MAKE(10, 7) #define GPU_ID2_PRODUCT_TTUX GPU_ID2_MODEL_MAKE(11, 2) #define GPU_ID2_PRODUCT_LTUX GPU_ID2_MODEL_MAKE(11, 3) +#define GPU_ID2_PRODUCT_TTIX GPU_ID2_MODEL_MAKE(12, 0) +#define GPU_ID2_PRODUCT_LTIX GPU_ID2_MODEL_MAKE(12, 1) /** * GPU_ID_MAKE - Helper macro to generate GPU_ID using id, major, minor, status diff --git a/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h b/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h index deca665df030..cdfcf8d3150e 100644 --- a/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h +++ b/include/uapi/gpu/arm/bifrost/gpu/mali_kbase_gpu_regmap.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ /* * - * (C) COPYRIGHT 2019-2021 ARM Limited. All rights reserved. + * (C) COPYRIGHT 2019-2022 ARM Limited. All rights reserved. 
* * This program is free software and is provided to you under the terms of the * GNU General Public License version 2 as published by the Free Software @@ -22,13 +22,70 @@ #ifndef _UAPI_KBASE_GPU_REGMAP_H_ #define _UAPI_KBASE_GPU_REGMAP_H_ -#if !MALI_USE_CSF +#if MALI_USE_CSF +#include "backend/mali_kbase_gpu_regmap_csf.h" +#else #include "backend/mali_kbase_gpu_regmap_jm.h" #endif /* !MALI_USE_CSF */ +/* Begin Register Offsets */ +/* GPU control registers */ + +#define GPU_CONTROL_BASE 0x0000 +#define GPU_CONTROL_REG(r) (GPU_CONTROL_BASE + (r)) + +#define GPU_ID 0x000 /* (RO) GPU and revision identifier */ + +#define SHADER_READY_LO 0x140 /* (RO) Shader core ready bitmap, low word */ +#define SHADER_READY_HI 0x144 /* (RO) Shader core ready bitmap, high word */ + +#define TILER_READY_LO 0x150 /* (RO) Tiler core ready bitmap, low word */ +#define TILER_READY_HI 0x154 /* (RO) Tiler core ready bitmap, high word */ + +#define L2_READY_LO 0x160 /* (RO) Level 2 cache ready bitmap, low word */ +#define L2_READY_HI 0x164 /* (RO) Level 2 cache ready bitmap, high word */ + +#define SHADER_PWRON_LO 0x180 /* (WO) Shader core power on bitmap, low word */ +#define SHADER_PWRON_HI 0x184 /* (WO) Shader core power on bitmap, high word */ + +#define TILER_PWRON_LO 0x190 /* (WO) Tiler core power on bitmap, low word */ +#define TILER_PWRON_HI 0x194 /* (WO) Tiler core power on bitmap, high word */ + +#define L2_PWRON_LO 0x1A0 /* (WO) Level 2 cache power on bitmap, low word */ +#define L2_PWRON_HI 0x1A4 /* (WO) Level 2 cache power on bitmap, high word */ + +/* Job control registers */ + +#define JOB_CONTROL_BASE 0x1000 + +#define JOB_CONTROL_REG(r) (JOB_CONTROL_BASE + (r)) + +#define JOB_IRQ_CLEAR 0x004 /* Interrupt clear register */ +#define JOB_IRQ_MASK 0x008 /* Interrupt mask register */ + /* MMU control registers */ + #define MEMORY_MANAGEMENT_BASE 0x2000 + #define MMU_REG(r) (MEMORY_MANAGEMENT_BASE + (r)) + #define MMU_IRQ_RAWSTAT 0x000 /* (RW) Raw interrupt status register */ +#define MMU_AS0 0x400 /* Configuration registers for address space 0 */ + +/* MMU address space control registers */ + +#define MMU_AS_REG(n, r) (MMU_REG(MMU_AS0 + ((n) << 6)) + (r)) + +#define AS_TRANSTAB_LO 0x00 /* (RW) Translation Table Base Address for address space n, low word */ +#define AS_TRANSTAB_HI 0x04 /* (RW) Translation Table Base Address for address space n, high word */ +#define AS_MEMATTR_LO 0x08 /* (RW) Memory attributes for address space n, low word. */ +#define AS_MEMATTR_HI 0x0C /* (RW) Memory attributes for address space n, high word. */ +#define AS_COMMAND 0x18 /* (WO) MMU command register for address space n */ + +/* (RW) Translation table configuration for address space n, low word */ +#define AS_TRANSCFG_LO 0x30 +/* (RW) Translation table configuration for address space n, high word */ +#define AS_TRANSCFG_HI 0x34 + #endif /* _UAPI_KBASE_GPU_REGMAP_H_ */ diff --git a/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h b/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h index ae43908b9360..1a3098d6cad8 100644 --- a/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h +++ b/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h @@ -1024,6 +1024,7 @@ struct base_dump_cpu_gpu_counters { * is a bitpattern where a set bit indicates that the format is supported. * Before using a texture format, it is recommended that the corresponding * bit be checked. + * @paddings_1: Padding bytes. * @gpu_available_memory_size: Theoretical maximum memory available to the GPU. 
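/*
 * Illustrative sketch (DEMO_* copies of the values above, not new defines for
 * the driver): the MMU register macros compose an address from the block
 * base, a 0x40-byte per-address-space stride ((n) << 6) and the register
 * offset. Worked example for AS_COMMAND of address space 3:
 */
#include <linux/build_bug.h>

#define DEMO_MMU_BASE		0x2000
#define DEMO_MMU_REG(r)		(DEMO_MMU_BASE + (r))
#define DEMO_MMU_AS0		0x400
#define DEMO_MMU_AS_REG(n, r)	(DEMO_MMU_REG(DEMO_MMU_AS0 + ((n) << 6)) + (r))
#define DEMO_AS_COMMAND		0x18

/* 0x2000 + 0x400 + 3 * 0x40 + 0x18 = 0x24D8 */
static_assert(DEMO_MMU_AS_REG(3, DEMO_AS_COMMAND) == 0x24D8);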
diff --git a/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h b/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h
index ae43908b9360..1a3098d6cad8 100644
--- a/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h
+++ b/include/uapi/gpu/arm/bifrost/jm/mali_base_jm_kernel.h
@@ -1024,6 +1024,7 @@ struct base_dump_cpu_gpu_counters {
  * is a bitpattern where a set bit indicates that the format is supported.
  * Before using a texture format, it is recommended that the corresponding
  * bit be checked.
+ * @paddings_1: Padding bytes.
  * @gpu_available_memory_size: Theoretical maximum memory available to the GPU.
  * It is unlikely that a client will be able to allocate all of this memory
  * for their own purposes, but this at least provides an upper bound on the
@@ -1034,6 +1035,7 @@ struct base_dump_cpu_gpu_counters {
  * @num_exec_engines: The number of execution engines. Only valid for tGOX
  * (Bifrost) GPUs, where GPU_HAS_REG_CORE_FEATURES is defined. Otherwise,
  * this is always 0.
+ * @paddings_2: Padding bytes.
  */
 struct mali_base_gpu_core_props {
 	__u32 product_id;
@@ -1044,8 +1046,10 @@ struct mali_base_gpu_core_props {
 	__u32 gpu_freq_khz_max;
 	__u32 log2_program_counter_size;
 	__u32 texture_features[BASE_GPU_NUM_TEXTURE_FEATURES_REGISTERS];
+	__u8 paddings_1[4];
 	__u64 gpu_available_memory_size;
 	__u8 num_exec_engines;
+	__u8 paddings_2[7];
 };
 
 #endif /* _UAPI_BASE_JM_KERNEL_H_ */
diff --git a/include/uapi/gpu/arm/bifrost/jm/mali_kbase_jm_ioctl.h b/include/uapi/gpu/arm/bifrost/jm/mali_kbase_jm_ioctl.h
index 20d931adc9b8..9c7553ff2bd2 100644
--- a/include/uapi/gpu/arm/bifrost/jm/mali_kbase_jm_ioctl.h
+++ b/include/uapi/gpu/arm/bifrost/jm/mali_kbase_jm_ioctl.h
@@ -127,9 +127,15 @@
  * - First release of new HW performance counters interface.
  * 11.35:
  * - Dummy model (no mali) backend will now clear HWC values after each sample
+ * 11.36:
+ * - Remove legacy definitions:
+ *   - base_jit_alloc_info_10_2
+ *   - base_jit_alloc_info_11_5
+ *   - kbase_ioctl_mem_jit_init_10_2
+ *   - kbase_ioctl_mem_jit_init_11_5
  */
 #define BASE_UK_VERSION_MAJOR 11
-#define BASE_UK_VERSION_MINOR 35
+#define BASE_UK_VERSION_MINOR 36
 
 /**
  * struct kbase_ioctl_version_check - Check version compatibility between
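Because the 11.36 bump above removes uAPI symbols rather than adding any, an out-of-tree user-space client may want to confirm the negotiated interface version before assuming the legacy JIT-init ioctls still exist. The sketch below is a hypothetical illustration only: the include paths, the major/minor field names on struct kbase_ioctl_version_check, and the write-back behaviour of KBASE_IOCTL_VERSION_CHECK are assumptions about this uapi tree, not something this patch defines.

#include <stdio.h>
#include <sys/ioctl.h>

#include "mali_kbase_ioctl.h"          /* assumed include path for these uapi headers */
#include "jm/mali_kbase_jm_ioctl.h"

/* Returns non-zero if the legacy *_10_2 / *_11_5 JIT-init ioctls might still
 * be present, i.e. the driver negotiated a version older than 11.36. */
static int legacy_jit_init_possible(int kbase_fd)
{
	struct kbase_ioctl_version_check vc = {
		.major = BASE_UK_VERSION_MAJOR,   /* field names assumed */
		.minor = BASE_UK_VERSION_MINOR,
	};

	if (ioctl(kbase_fd, KBASE_IOCTL_VERSION_CHECK, &vc) < 0) {
		perror("KBASE_IOCTL_VERSION_CHECK");
		return 0;
	}

	/* Assumes the driver writes the negotiated version back into vc. */
	return vc.major == 11 && vc.minor < 36;
}

From 11.36 onward a client should use KBASE_IOCTL_MEM_JIT_INIT unconditionally; a helper like the one above is only useful for deciding whether an old fallback path is worth keeping at all.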
diff --git a/include/uapi/gpu/arm/bifrost/mali_base_kernel.h b/include/uapi/gpu/arm/bifrost/mali_base_kernel.h
index 6adbd81bcc70..e6cac0eb2a1a 100644
--- a/include/uapi/gpu/arm/bifrost/mali_base_kernel.h
+++ b/include/uapi/gpu/arm/bifrost/mali_base_kernel.h
@@ -197,55 +197,6 @@ struct base_mem_aliasing_info {
  */
 #define BASE_JIT_ALLOC_COUNT (255)
 
-/* base_jit_alloc_info in use for kernel driver versions 10.2 to early 11.5
- *
- * jit_version is 1
- *
- * Due to the lack of padding specified, user clients between 32 and 64-bit
- * may have assumed a different size of the struct
- *
- * An array of structures was not supported
- */
-struct base_jit_alloc_info_10_2 {
-	__u64 gpu_alloc_addr;
-	__u64 va_pages;
-	__u64 commit_pages;
-	__u64 extension;
-	__u8 id;
-};
-
-/* base_jit_alloc_info introduced by kernel driver version 11.5, and in use up
- * to 11.19
- *
- * This structure had a number of modifications during and after kernel driver
- * version 11.5, but remains size-compatible throughout its version history, and
- * with earlier variants compatible with future variants by requiring
- * zero-initialization to the unused space in the structure.
- *
- * jit_version is 2
- *
- * Kernel driver version history:
- * 11.5: Initial introduction with 'usage_id' and padding[5]. All padding bytes
- *       must be zero. Kbase minor version was not incremented, so some
- *       versions of 11.5 do not have this change.
- * 11.5: Added 'bin_id' and 'max_allocations', replacing 2 padding bytes (Kbase
- *       minor version not incremented)
- * 11.6: Added 'flags', replacing 1 padding byte
- * 11.10: Arrays of this structure are supported
- */
-struct base_jit_alloc_info_11_5 {
-	__u64 gpu_alloc_addr;
-	__u64 va_pages;
-	__u64 commit_pages;
-	__u64 extension;
-	__u8 id;
-	__u8 bin_id;
-	__u8 max_allocations;
-	__u8 flags;
-	__u8 padding[2];
-	__u16 usage_id;
-};
-
 /**
  * struct base_jit_alloc_info - Structure which describes a JIT allocation
  *                              request.
@@ -275,16 +226,6 @@ struct base_jit_alloc_info_11_5 {
  * @heap_info_gpu_addr: Pointer to an object in GPU memory describing
  *                      the actual usage of the region.
  *
- * jit_version is 3.
- *
- * When modifications are made to this structure, it is still compatible with
- * jit_version 3 when: a) the size is unchanged, and b) new members only
- * replace the padding bytes.
- *
- * Previous jit_version history:
- * jit_version == 1, refer to &base_jit_alloc_info_10_2
- * jit_version == 2, refer to &base_jit_alloc_info_11_5
- *
 * Kbase version history:
 * 11.20: added @heap_info_gpu_addr
 */
diff --git a/include/uapi/gpu/arm/bifrost/mali_kbase_hwcnt_reader.h b/include/uapi/gpu/arm/bifrost/mali_kbase_hwcnt_reader.h
index 42d93ba4c150..962decc10efc 100644
--- a/include/uapi/gpu/arm/bifrost/mali_kbase_hwcnt_reader.h
+++ b/include/uapi/gpu/arm/bifrost/mali_kbase_hwcnt_reader.h
@@ -221,6 +221,7 @@ struct prfcnt_enum_sample_info {
 
 /**
  * struct prfcnt_enum_item - Performance counter enumeration item.
+ * @padding: Padding bytes.
  * @hdr: Header describing the type of item in the list.
  * @u: Structure containing discriptor for enumeration item type.
  * @u.block_counter: Performance counter block descriptor.
@@ -229,6 +230,7 @@ struct prfcnt_enum_sample_info {
  */
 struct prfcnt_enum_item {
 	struct prfcnt_item_header hdr;
+	__u8 padding[4];
 	/** union u - union of block_counter and request */
 	union {
 		struct prfcnt_enum_block_counter block_counter;
@@ -305,6 +307,7 @@ struct prfcnt_request_scope {
 
 /**
  * struct prfcnt_request_item - Performance counter request item.
+ * @padding: Padding bytes.
  * @hdr: Header describing the type of item in the list.
  * @u: Structure containing descriptor for request type.
  * @u.req_mode: Mode request descriptor.
@@ -313,6 +316,7 @@ struct prfcnt_request_scope {
  */
 struct prfcnt_request_item {
 	struct prfcnt_item_header hdr;
+	__u8 padding[4];
 	/** union u - union on req_mode and req_enable */
 	union {
 		struct prfcnt_request_mode req_mode;
@@ -417,6 +421,7 @@ struct prfcnt_block_metadata {
 
 /**
  * struct prfcnt_metadata - Performance counter metadata item.
+ * @padding: Padding bytes.
  * @hdr: Header describing the type of item in the list.
  * @u: Structure containing descriptor for metadata type.
  * @u.sample_md: Counter sample data metadata descriptor.
@@ -425,6 +430,7 @@ struct prfcnt_block_metadata {
  */
 struct prfcnt_metadata {
 	struct prfcnt_item_header hdr;
+	__u8 padding[4];
 	union {
 		struct prfcnt_sample_metadata sample_md;
 		struct prfcnt_clock_metadata clock_md;
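The new padding[4] members above all serve the same purpose: they pin the union that follows the item header to a fixed, 8-byte-aligned offset so that 32-bit and 64-bit clients agree on the layout. The sketch below demonstrates the idea with stand-in types; the 4-byte size assumed for the header mirror is an illustration only, not something shown in this patch.

#include <stddef.h>
#include <stdint.h>

/* Stand-in for prfcnt_item_header; its real contents are not visible in this
 * hunk, so a 4-byte header is assumed purely for illustration. */
struct ex_item_header {
	uint16_t item_type;
	uint16_t item_version;
};

struct ex_item {
	struct ex_item_header hdr;
	uint8_t padding[4];     /* mirrors the new explicit padding */
	union {
		uint64_t word;  /* stands in for the 64-bit members of the real unions */
	} u;
};

/* With the explicit padding, the union starts at offset 8 on every common ABI
 * rather than wherever the compiler's default alignment rules would put it. */
_Static_assert(offsetof(struct ex_item, u) == 8, "union must start at offset 8");

The same reasoning applies to the paddings_1/paddings_2 members added to struct mali_base_gpu_core_props earlier in this series: explicit padding keeps a shared user/kernel structure's size and member offsets identical across ABIs.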
diff --git a/include/uapi/gpu/arm/bifrost/mali_kbase_ioctl.h b/include/uapi/gpu/arm/bifrost/mali_kbase_ioctl.h
index e691aea47b40..63bf48b603ef 100644
--- a/include/uapi/gpu/arm/bifrost/mali_kbase_ioctl.h
+++ b/include/uapi/gpu/arm/bifrost/mali_kbase_ioctl.h
@@ -218,52 +218,6 @@ struct kbase_ioctl_get_ddk_version {
 #define KBASE_IOCTL_GET_DDK_VERSION \
 	_IOW(KBASE_IOCTL_TYPE, 13, struct kbase_ioctl_get_ddk_version)
 
-/**
- * struct kbase_ioctl_mem_jit_init_10_2 - Initialize the just-in-time memory
- *                                        allocator (between kernel driver
- *                                        version 10.2--11.4)
- * @va_pages: Number of VA pages to reserve for JIT
- *
- * Note that depending on the VA size of the application and GPU, the value
- * specified in @va_pages may be ignored.
- *
- * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for
- * backwards compatibility.
- */
-struct kbase_ioctl_mem_jit_init_10_2 {
-	__u64 va_pages;
-};
-
-#define KBASE_IOCTL_MEM_JIT_INIT_10_2 \
-	_IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_10_2)
-
-/**
- * struct kbase_ioctl_mem_jit_init_11_5 - Initialize the just-in-time memory
- *                                        allocator (between kernel driver
- *                                        version 11.5--11.19)
- * @va_pages: Number of VA pages to reserve for JIT
- * @max_allocations: Maximum number of concurrent allocations
- * @trim_level: Level of JIT allocation trimming to perform on free (0 - 100%)
- * @group_id: Group ID to be used for physical allocations
- * @padding: Currently unused, must be zero
- *
- * Note that depending on the VA size of the application and GPU, the value
- * specified in @va_pages may be ignored.
- *
- * New code should use KBASE_IOCTL_MEM_JIT_INIT instead, this is kept for
- * backwards compatibility.
- */
-struct kbase_ioctl_mem_jit_init_11_5 {
-	__u64 va_pages;
-	__u8 max_allocations;
-	__u8 trim_level;
-	__u8 group_id;
-	__u8 padding[5];
-};
-
-#define KBASE_IOCTL_MEM_JIT_INIT_11_5 \
-	_IOW(KBASE_IOCTL_TYPE, 14, struct kbase_ioctl_mem_jit_init_11_5)
-
 /**
  * struct kbase_ioctl_mem_jit_init - Initialize the just-in-time memory
  *                                   allocator
diff --git a/drivers/gpu/arm/bifrost/mali_kbase_mem_profile_debugfs_buf_size.h b/include/uapi/gpu/arm/bifrost/mali_kbase_mem_profile_debugfs_buf_size.h
similarity index 88%
rename from drivers/gpu/arm/bifrost/mali_kbase_mem_profile_debugfs_buf_size.h
rename to include/uapi/gpu/arm/bifrost/mali_kbase_mem_profile_debugfs_buf_size.h
index c2fb3f572f2d..329845005341 100644
--- a/drivers/gpu/arm/bifrost/mali_kbase_mem_profile_debugfs_buf_size.h
+++ b/include/uapi/gpu/arm/bifrost/mali_kbase_mem_profile_debugfs_buf_size.h
@@ -23,8 +23,8 @@
  * DOC: Header file for the size of the buffer to accumulate the histogram report text in
  */
 
-#ifndef _KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_
-#define _KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_
+#ifndef _UAPI_KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_
+#define _UAPI_KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_
 
 /**
  * KBASE_MEM_PROFILE_MAX_BUF_SIZE - The size of the buffer to accumulate the histogram report text
@@ -32,5 +32,4 @@
  */
 #define KBASE_MEM_PROFILE_MAX_BUF_SIZE ((size_t)(64 + ((80 + (56 * 64)) * 54) + 56))
 
-#endif /*_KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_*/
-
+#endif /*_UAPI_KBASE_MEM_PROFILE_DEBUGFS_BUF_SIZE_H_*/
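Since the renamed header now lives under the uapi tree, it may be worth spelling out what KBASE_MEM_PROFILE_MAX_BUF_SIZE actually evaluates to: 64 + ((80 + 56 * 64) * 54) + 56 = 64 + (3664 * 54) + 56 = 197976 bytes, a little over 193 KiB. The compile-time check below restates the arithmetic locally so it stands alone; the EX_* name is a local stand-in, not part of the header.

#include <stddef.h>

/* Local restatement of the macro from the renamed header, kept here only so
 * the check compiles on its own. */
#define EX_MEM_PROFILE_MAX_BUF_SIZE ((size_t)(64 + ((80 + (56 * 64)) * 54) + 56))

_Static_assert(EX_MEM_PROFILE_MAX_BUF_SIZE == 197976,
	       "unexpected mem profile buffer size");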