Mirror of https://github.com/armbian/linux.git (synced 2026-01-06 10:13:00 -08:00)

Merge remote-tracking branch 'lsk/v3.10/topic/big.LITTLE' into linux-linaro-lsk
@@ -1578,6 +1578,18 @@ config HMP_FREQUENCY_INVARIANT_SCALE
	  migration strategy to interact more predictably with CPUFreq's
	  asynchronous compute capacity changes.

config SCHED_HMP_LITTLE_PACKING
	bool "Small task packing for HMP"
	depends on SCHED_HMP
	default n
	help
	  Allows the HMP Scheduler to pack small tasks into CPUs in the
	  smallest HMP domain.
	  Controlled by two sysfs files in sys/kernel/hmp.
	  packing_enable: 1 to enable, 0 to disable packing. Default 1.
	  packing_limit: runqueue load ratio where a RQ is considered
	    to be full. Default is NICE_0_LOAD * 9/8.

config HAVE_ARM_SCU
	bool
	help
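The help text above names two runtime controls under /sys/kernel/hmp. As a rough userspace illustration (not part of this commit; the paths come from the help text, while the helper and the sample values are assumptions), the knobs could be driven like this:

#include <stdio.h>

/* Write a single integer to a sysfs attribute; minimal error handling. */
static int write_sysfs_int(const char *path, int value)
{
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return -1;
    }
    fprintf(f, "%d\n", value);
    return fclose(f);
}

int main(void)
{
    /* 1 = pack small tasks into the littlest CPUs, 0 = spread as usual */
    write_sysfs_int("/sys/kernel/hmp/packing_enable", 1);
    /* runqueue load ratio at which a little CPU is treated as full;
     * 1152 corresponds to the documented default NICE_0_LOAD * 9/8 */
    write_sysfs_int("/sys/kernel/hmp/packing_limit", 1152);
    return 0;
}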
@@ -46,6 +46,9 @@
#include <asm/virt.h>
#include <asm/mach/arch.h>

#define CREATE_TRACE_POINTS
#include <trace/events/arm-ipi.h>

/*
 * as from 2.5, kernels no longer have an init_tasks structure
 * so we need some other way of telling a new secondary core
@@ -619,6 +622,7 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
    if (ipinr < NR_IPI)
        __inc_irq_stat(cpu, ipi_irqs[ipinr]);

    trace_arm_ipi_entry(ipinr);
    switch (ipinr) {
    case IPI_WAKEUP:
        break;
@@ -664,6 +668,7 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
            cpu, ipinr);
        break;
    }
    trace_arm_ipi_exit(ipinr);
    set_irq_regs(old_regs);
}

@@ -41,6 +41,7 @@
#include <linux/slab.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/irqchip/arm-gic.h>
#include <trace/events/arm-ipi.h>

#include <asm/irq.h>
#include <asm/exception.h>
@@ -656,8 +657,10 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq)
    raw_spin_lock_irqsave(&irq_controller_lock, flags);

    /* Convert our logical CPU mask into a physical one. */
    for_each_cpu(cpu, mask)
    for_each_cpu(cpu, mask) {
        trace_arm_ipi_send(irq, cpu);
        map |= gic_cpu_map[cpu];
    }

    /*
     * Ensure that stores to Normal memory are visible to the
include/trace/events/arm-ipi.h (new file, 100 lines)
@@ -0,0 +1,100 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM arm-ipi

#if !defined(_TRACE_ARM_IPI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ARM_IPI_H

#include <linux/tracepoint.h>

#define show_arm_ipi_name(val) \
    __print_symbolic(val, \
        { 0, "IPI_WAKEUP" }, \
        { 1, "IPI_TIMER" }, \
        { 2, "IPI_RESCHEDULE" }, \
        { 3, "IPI_CALL_FUNC" }, \
        { 4, "IPI_CALL_FUNC_SINGLE" }, \
        { 5, "IPI_CPU_STOP" }, \
        { 6, "IPI_COMPLETION" }, \
        { 7, "IPI_CPU_BACKTRACE" })

DECLARE_EVENT_CLASS(arm_ipi,

    TP_PROTO(unsigned int ipi_nr),

    TP_ARGS(ipi_nr),

    TP_STRUCT__entry(
        __field( unsigned int, ipi )
    ),

    TP_fast_assign(
        __entry->ipi = ipi_nr;
    ),

    TP_printk("ipi=%u [action=%s]", __entry->ipi,
        show_arm_ipi_name(__entry->ipi))
);

/**
 * arm_ipi_entry - called in the arm-generic ipi handler immediately before
 *                 entering ipi-type handler
 * @ipi_nr: ipi number
 *
 * When used in combination with the arm_ipi_exit tracepoint
 * we can determine the ipi handler runtime.
 */
DEFINE_EVENT(arm_ipi, arm_ipi_entry,

    TP_PROTO(unsigned int ipi_nr),

    TP_ARGS(ipi_nr)
);

/**
 * arm_ipi_exit - called in the arm-generic ipi handler immediately
 *                after the ipi-type handler returns
 * @ipi_nr: ipi number
 *
 * When used in combination with the arm_ipi_entry tracepoint
 * we can determine the ipi handler runtime.
 */
DEFINE_EVENT(arm_ipi, arm_ipi_exit,

    TP_PROTO(unsigned int ipi_nr),

    TP_ARGS(ipi_nr)
);

/**
 * arm_ipi_send - called as the ipi target mask is built, immediately
 *                before the register is written
 * @ipi_nr: ipi number
 * @dest: cpu to send to
 *
 * When used in combination with the arm_ipi_entry tracepoint
 * we can determine the ipi raise to run latency.
 */
TRACE_EVENT(arm_ipi_send,

    TP_PROTO(unsigned int ipi_nr, int dest),

    TP_ARGS(ipi_nr, dest),

    TP_STRUCT__entry(
        __field( unsigned int, ipi )
        __field( int, dest )
    ),

    TP_fast_assign(
        __entry->ipi = ipi_nr;
        __entry->dest = dest;
    ),

    TP_printk("dest=%d ipi=%u [action=%s]", __entry->dest,
        __entry->ipi, show_arm_ipi_name(__entry->ipi))
);

#endif /* _TRACE_ARM_IPI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
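For readers following the tracepoints above: show_arm_ipi_name() is what turns the raw ipi number into the action=... string, and the entry/exit pair is meant to yield a handler runtime. A small, hypothetical post-processing sketch of both ideas (the event values and timestamps are made up; parsing of real trace output is omitted):

#include <stdio.h>

/* Mirrors the show_arm_ipi_name() table above, indices 0..7. */
static const char *const arm_ipi_names[] = {
    "IPI_WAKEUP", "IPI_TIMER", "IPI_RESCHEDULE", "IPI_CALL_FUNC",
    "IPI_CALL_FUNC_SINGLE", "IPI_CPU_STOP", "IPI_COMPLETION",
    "IPI_CPU_BACKTRACE",
};

struct ipi_event {
    unsigned int ipi;   /* same value as the tracepoint's ipi field */
    double ts;          /* timestamp in seconds from the trace clock */
};

/* Handler runtime = arm_ipi_exit timestamp minus arm_ipi_entry timestamp. */
static double ipi_runtime(struct ipi_event entry, struct ipi_event ipi_exit)
{
    return ipi_exit.ts - entry.ts;
}

int main(void)
{
    struct ipi_event entry = { .ipi = 2, .ts = 10.000100 };
    struct ipi_event ipi_exit = { .ipi = 2, .ts = 10.000135 };

    printf("%s handler ran for %.0f us\n",
           arm_ipi_names[entry.ipi], ipi_runtime(entry, ipi_exit) * 1e6);
    return 0;
}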
@@ -530,6 +530,29 @@ TRACE_EVENT(sched_rq_runnable_load,
        __entry->load)
);

TRACE_EVENT(sched_rq_nr_running,

    TP_PROTO(int cpu, unsigned int nr_running, int nr_iowait),

    TP_ARGS(cpu, nr_running, nr_iowait),

    TP_STRUCT__entry(
        __field(int, cpu)
        __field(unsigned int, nr_running)
        __field(int, nr_iowait)
    ),

    TP_fast_assign(
        __entry->cpu = cpu;
        __entry->nr_running = nr_running;
        __entry->nr_iowait = nr_iowait;
    ),

    TP_printk("cpu=%d nr_running=%u nr_iowait=%d",
        __entry->cpu,
        __entry->nr_running, __entry->nr_iowait)
);

/*
 * Tracepoint for showing tracked task cpu usage ratio [0..1023].
 */
@@ -559,6 +582,10 @@ TRACE_EVENT(sched_task_usage_ratio,
/*
 * Tracepoint for HMP (CONFIG_SCHED_HMP) task migrations.
 */
#define HMP_MIGRATE_WAKEUP 0
#define HMP_MIGRATE_FORCE 1
#define HMP_MIGRATE_OFFLOAD 2
#define HMP_MIGRATE_IDLE_PULL 3
TRACE_EVENT(sched_hmp_migrate,

    TP_PROTO(struct task_struct *tsk, int dest, int force),
@@ -583,6 +610,51 @@ TRACE_EVENT(sched_hmp_migrate,
        __entry->comm, __entry->pid,
        __entry->dest, __entry->force)
);

TRACE_EVENT(sched_hmp_offload_abort,

    TP_PROTO(int cpu, int data, char *label),

    TP_ARGS(cpu,data,label),

    TP_STRUCT__entry(
        __array(char, label, 64)
        __field(int, cpu)
        __field(int, data)
    ),

    TP_fast_assign(
        strncpy(__entry->label, label, 64);
        __entry->cpu = cpu;
        __entry->data = data;
    ),

    TP_printk("cpu=%d data=%d label=%63s",
        __entry->cpu, __entry->data,
        __entry->label)
);

TRACE_EVENT(sched_hmp_offload_succeed,

    TP_PROTO(int cpu, int dest_cpu),

    TP_ARGS(cpu,dest_cpu),

    TP_STRUCT__entry(
        __field(int, cpu)
        __field(int, dest_cpu)
    ),

    TP_fast_assign(
        __entry->cpu = cpu;
        __entry->dest_cpu = dest_cpu;
    ),

    TP_printk("cpu=%d dest=%d",
        __entry->cpu,
        __entry->dest_cpu)
);

#endif /* _TRACE_SCHED_H */

/* This part must be outside protection */
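The new sched_hmp_migrate event reports its migration reason as a bare integer in the force field; the HMP_MIGRATE_* defines above give those integers meaning. A hypothetical post-processing decoder using exactly those values:

#include <stdio.h>

/* Map the force field of sched_hmp_migrate back to the reasons defined above. */
static const char *hmp_migrate_reason(int force)
{
    switch (force) {
    case 0: return "wakeup";    /* HMP_MIGRATE_WAKEUP */
    case 1: return "force";     /* HMP_MIGRATE_FORCE */
    case 2: return "offload";   /* HMP_MIGRATE_OFFLOAD */
    case 3: return "idle-pull"; /* HMP_MIGRATE_IDLE_PULL */
    default: return "unknown";
    }
}

int main(void)
{
    for (int force = 0; force <= 4; force++)
        printf("force=%d -> %s\n", force, hmp_migrate_reason(force));
    return 0;
}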
include/trace/events/smp.h (new file, 91 lines)
@@ -0,0 +1,91 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM smp

#if !defined(_TRACE_SMP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SMP_H

#include <linux/tracepoint.h>
typedef void (*__smp_call_func_t)(void *info);

DECLARE_EVENT_CLASS(smp_call_class,

    TP_PROTO(__smp_call_func_t fnc),

    TP_ARGS(fnc),

    TP_STRUCT__entry(
        __field( void *, func )
    ),

    TP_fast_assign(
        __entry->func = fnc;
    ),

    TP_printk("func=%pf", __entry->func)
);

/**
 * smp_call_func_entry - called in the generic smp-cross-call-handler
 *                       immediately before calling the destination
 *                       function
 * @func: function pointer
 *
 * When used in combination with the smp_call_func_exit tracepoint
 * we can determine the cross-call runtime.
 */
DEFINE_EVENT(smp_call_class, smp_call_func_entry,

    TP_PROTO(__smp_call_func_t fnc),

    TP_ARGS(fnc)
);

/**
 * smp_call_func_exit - called in the generic smp-cross-call-handler
 *                      immediately after the destination function
 *                      returns
 * @func: function pointer
 *
 * When used in combination with the smp_call_func_entry tracepoint
 * we can determine the cross-call runtime.
 */
DEFINE_EVENT(smp_call_class, smp_call_func_exit,

    TP_PROTO(__smp_call_func_t fnc),

    TP_ARGS(fnc)
);

/**
 * smp_call_func_send - called as destination function is set
 *                      in the per-cpu storage
 * @func: function pointer
 * @dest: cpu to send to
 *
 * When used in combination with the smp_cross_call_entry tracepoint
 * we can determine the call-to-run latency.
 */
TRACE_EVENT(smp_call_func_send,

    TP_PROTO(__smp_call_func_t func, int dest),

    TP_ARGS(func, dest),

    TP_STRUCT__entry(
        __field( void *, func )
        __field( int, dest )
    ),

    TP_fast_assign(
        __entry->func = func;
        __entry->dest = dest;
    ),

    TP_printk("dest=%d func=%pf", __entry->dest,
        __entry->func)
);

#endif /* _TRACE_SMP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
@@ -1226,11 +1226,7 @@ struct hmp_global_attr {
    int (*from_sysfs)(int);
};

#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
#define HMP_DATA_SYSFS_MAX 4
#else
#define HMP_DATA_SYSFS_MAX 3
#endif
#define HMP_DATA_SYSFS_MAX 8

struct hmp_data_struct {
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
@@ -1688,6 +1684,7 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
    __update_tg_runnable_avg(&rq->avg, &rq->cfs);
    trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio);
    trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
    trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter);
}

/* Add the load generated by se into cfs_rq's child load-average */
@@ -3664,25 +3661,46 @@ static struct sched_entity *hmp_get_lightest_task(
 * Migration thresholds should be in the range [0..1023]
 * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
 * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
 * The default values (512, 256) offer good responsiveness, but may need
 * tweaking suit particular needs.
 *
 * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
 * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
 * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
 *
 * Small Task Packing:
 * We can choose to fill the littlest CPUs in an HMP system rather than
 * the typical spreading mechanic. This behavior is controllable using
 * two variables.
 * hmp_packing_enabled: runtime control over pack/spread
 * hmp_full_threshold: Consider a CPU with this much unweighted load full
 */
unsigned int hmp_up_threshold = 512;
unsigned int hmp_down_threshold = 256;
unsigned int hmp_up_threshold = 700;
unsigned int hmp_down_threshold = 512;
#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
#endif
unsigned int hmp_next_up_threshold = 4096;
unsigned int hmp_next_down_threshold = 4096;

#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
unsigned int hmp_packing_enabled = 1;
#ifndef CONFIG_ARCH_VEXPRESS_TC2
unsigned int hmp_full_threshold = (NICE_0_LOAD * 9) / 8;
#else
/* TC2 has a sharp consumption curve @ around 800Mhz, so
   we aim to spread the load around that frequency. */
unsigned int hmp_full_threshold = 650; /* 80% of the 800Mhz freq * NICE_0_LOAD */
#endif
#endif
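All of the tunables above share the [0..1023] load_avg_ratio scale described in the comment block. A standalone sketch of how the new defaults (up = 700, down = 512) split that scale; the helper and sample loads are illustrative, not the kernel's decision path:

#include <stdio.h>

#define HMP_UP_THRESHOLD    700 /* min. load to consider a faster cpu */
#define HMP_DOWN_THRESHOLD  512 /* max. load to consider a slower cpu */

/* Classify a task's tracked load on the [0..1023] scale. */
static const char *hmp_candidate(unsigned int load_avg_ratio)
{
    if (load_avg_ratio > HMP_UP_THRESHOLD)
        return "up-migration candidate";
    if (load_avg_ratio < HMP_DOWN_THRESHOLD)
        return "down-migration candidate";
    return "stays in its current domain";
}

int main(void)
{
    unsigned int samples[] = { 100, 512, 650, 900 };

    for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("load=%4u: %s\n", samples[i], hmp_candidate(samples[i]));
    return 0;
}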
static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
        int *min_cpu);
        int *min_cpu, struct cpumask *affinity);

static inline struct hmp_domain *hmp_smallest_domain(void)
{
    return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains);
}

/* Check if cpu is in fastest hmp_domain */
static inline unsigned int hmp_cpu_is_fastest(int cpu)
@@ -3722,22 +3740,23 @@ static inline struct hmp_domain *hmp_faster_domain(int cpu)

/*
 * Selects a cpu in previous (faster) hmp_domain
 * Note that cpumask_any_and() returns the first cpu in the cpumask
 */
static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
        int cpu)
{
    int lowest_cpu=NR_CPUS;
    __always_unused int lowest_ratio = hmp_domain_min_load(hmp_faster_domain(cpu), &lowest_cpu);
    /*
     * If the lowest-loaded CPU in the domain is allowed by the task affinity
     * select that one, otherwise select one which is allowed
     */
    if(lowest_cpu != NR_CPUS && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
        return lowest_cpu;
    __always_unused int lowest_ratio;
    struct hmp_domain *hmp;

    if (hmp_cpu_is_fastest(cpu))
        hmp = hmp_cpu_domain(cpu);
    else
        return cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
                tsk_cpus_allowed(tsk));
        hmp = hmp_faster_domain(cpu);

    lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
            tsk_cpus_allowed(tsk));

    return lowest_cpu;
}

/*
@@ -3756,18 +3775,54 @@ static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
    else
        hmp = hmp_slower_domain(cpu);

    lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu);
    /*
     * If the lowest-loaded CPU in the domain is allowed by the task affinity
     * select that one, otherwise select one which is allowed
     */
    if(lowest_cpu != NR_CPUS && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
        return lowest_cpu;
    else
        return cpumask_any_and(&hmp_slower_domain(cpu)->cpus,
                tsk_cpus_allowed(tsk));
}
    lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
            tsk_cpus_allowed(tsk));

    return lowest_cpu;
}
#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
/*
 * Select the 'best' candidate little CPU to wake up on.
 * Implements a packing strategy which examines CPU in
 * logical CPU order, and selects the first which will
 * have at least 10% capacity available, according to
 * both tracked load of the runqueue and the task.
 */
static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
        int cpu) {
    int tmp_cpu;
    unsigned long estimated_load;
    struct hmp_domain *hmp;
    struct sched_avg *avg;
    struct cpumask allowed_hmp_cpus;

    if(!hmp_packing_enabled ||
            tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100))
        return hmp_select_slower_cpu(tsk, cpu);

    if (hmp_cpu_is_slowest(cpu))
        hmp = hmp_cpu_domain(cpu);
    else
        hmp = hmp_slower_domain(cpu);

    /* respect affinity */
    cpumask_and(&allowed_hmp_cpus, &hmp->cpus,
            tsk_cpus_allowed(tsk));

    for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) {
        avg = &cpu_rq(tmp_cpu)->avg;
        /* estimate new rq load if we add this task */
        estimated_load = avg->load_avg_ratio +
                tsk->se.avg.load_avg_ratio;
        if (estimated_load <= hmp_full_threshold) {
            cpu = tmp_cpu;
            break;
        }
    }
    /* if no match was found, the task uses the initial value */
    return cpu;
}
#endif
static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
{
    /* hack - always use clock from first online CPU */
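The loop in hmp_best_little_cpu() above reduces to one estimate per allowed little CPU: the runqueue's tracked load plus the waking task's load, compared against hmp_full_threshold, with very large tasks skipping packing entirely. A standalone model of that arithmetic (the array-based CPUs, the 1024-valued NICE_0_LOAD and the fallback handling are assumptions for illustration):

#include <stdio.h>

#define NICE_0_LOAD         1024
#define HMP_FULL_THRESHOLD  ((NICE_0_LOAD * 9) / 8) /* non-TC2 default */

/* Pick the first little CPU whose estimated load stays within the
 * packing threshold once the waking task's load is added. */
static int best_little_cpu(const unsigned long *rq_load, int nr_little,
                           unsigned long task_load, int fallback_cpu)
{
    /* tasks above 90% of NICE_0_LOAD are not packed at all */
    if (task_load > (NICE_0_LOAD * 90) / 100)
        return fallback_cpu;

    for (int cpu = 0; cpu < nr_little; cpu++) {
        unsigned long estimated_load = rq_load[cpu] + task_load;

        if (estimated_load <= HMP_FULL_THRESHOLD)
            return cpu;
    }
    return fallback_cpu; /* nothing has room; keep the initial choice */
}

int main(void)
{
    unsigned long little_rq_load[] = { 1100, 900, 200, 0 };

    printf("small task -> cpu %d\n", best_little_cpu(little_rq_load, 4, 150, -1));
    printf("large task -> cpu %d\n", best_little_cpu(little_rq_load, 4, 1000, -1));
    return 0;
}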
@@ -3891,6 +3946,15 @@ static int hmp_freqinvar_from_sysfs(int value)
    return value;
}
#endif
#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
/* packing value must be non-negative */
static int hmp_packing_from_sysfs(int value)
{
    if (value < 0)
        return -1;
    return value;
}
#endif
static void hmp_attr_add(
    const char *name,
    int *value,
@@ -3942,6 +4006,16 @@ static int hmp_attr_init(void)
        &hmp_data.freqinvar_load_scale_enabled,
        NULL,
        hmp_freqinvar_from_sysfs);
#endif
#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
    hmp_attr_add("packing_enable",
        &hmp_packing_enabled,
        NULL,
        hmp_freqinvar_from_sysfs);
    hmp_attr_add("packing_limit",
        &hmp_full_threshold,
        NULL,
        hmp_packing_from_sysfs);
#endif
    hmp_data.attr_group.name = "hmp";
    hmp_data.attr_group.attrs = hmp_data.attributes;
@@ -3951,9 +4025,24 @@ static int hmp_attr_init(void)
}
late_initcall(hmp_attr_init);
#endif /* CONFIG_HMP_VARIABLE_SCALE */

/*
 * return the load of the lowest-loaded CPU in a given HMP domain
 * min_cpu optionally points to an int to receive the CPU.
 * affinity optionally points to a cpumask containing the
 * CPUs to be considered. note:
 *   + min_cpu = NR_CPUS only if no CPUs are in the set of
 *     affinity && hmp_domain cpus
 *   + min_cpu will always otherwise equal one of the CPUs in
 *     the hmp domain
 *   + when more than one CPU has the same load, the one which
 *     is least-recently-disturbed by an HMP migration will be
 *     selected
 *   + if all CPUs are equally loaded or idle and the times are
 *     all the same, the first in the set will be used
 *   + if affinity is not set, cpu_online_mask is used
 */
static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
        int *min_cpu)
        int *min_cpu, struct cpumask *affinity)
{
    int cpu;
    int min_cpu_runnable_temp = NR_CPUS;
@@ -3962,8 +4051,15 @@ static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
    unsigned long min_runnable_load = INT_MAX;
    unsigned long contrib;
    struct sched_avg *avg;
    struct cpumask temp_cpumask;
    /*
     * only look at CPUs allowed if specified,
     * otherwise look at all online CPUs in the
     * right HMP domain
     */
    cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask);

    for_each_cpu_mask(cpu, hmpd->cpus) {
    for_each_cpu_mask(cpu, temp_cpumask) {
        avg = &cpu_rq(cpu)->avg;
        /* used for both up and down migration */
        curr_last_migration = avg->hmp_last_up_migration ?
@@ -4025,27 +4121,36 @@ static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
        return NR_CPUS;

    /* Is there an idle CPU in the current domain */
    min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL);
    if (min_usage == 0)
    min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL);
    if (min_usage == 0) {
        trace_sched_hmp_offload_abort(cpu, min_usage, "load");
        return NR_CPUS;
    }

    /* Is the task alone on the cpu? */
    if (cpu_rq(cpu)->cfs.h_nr_running < 2)
    if (cpu_rq(cpu)->cfs.h_nr_running < 2) {
        trace_sched_hmp_offload_abort(cpu,
            cpu_rq(cpu)->cfs.h_nr_running, "nr_running");
        return NR_CPUS;
    }

    /* Is the task actually starving? */
    /* >=25% ratio running/runnable = starving */
    if (hmp_task_starvation(se) > 768)
    if (hmp_task_starvation(se) > 768) {
        trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se),
            "starvation");
        return NR_CPUS;
    }

    /* Does the slower domain have any idle CPUs? */
    min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu);
    if (min_usage > 0)
        return NR_CPUS;
    min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu,
            tsk_cpus_allowed(task_of(se)));

    if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus))
    if (min_usage == 0) {
        trace_sched_hmp_offload_succeed(cpu, dest_cpu);
        return dest_cpu;
    } else
        trace_sched_hmp_offload_abort(cpu,min_usage,"slowdomain");
    return NR_CPUS;
}
#endif /* CONFIG_SCHED_HMP */
@@ -4077,30 +4182,13 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
#ifdef CONFIG_SCHED_HMP
    /* always put non-kernel forking tasks on a big domain */
    if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
        if(hmp_cpu_is_fastest(prev_cpu)) {
            struct hmp_domain *hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains);
            __always_unused int lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);
            if (new_cpu != NR_CPUS &&
                    cpumask_test_cpu(new_cpu,
                        tsk_cpus_allowed(p))) {
                hmp_next_up_delay(&p->se, new_cpu);
                return new_cpu;
            } else {
                new_cpu = cpumask_any_and(
                    &hmp_faster_domain(cpu)->cpus,
                    tsk_cpus_allowed(p));
                if (new_cpu < nr_cpu_ids) {
                    hmp_next_up_delay(&p->se, new_cpu);
                    return new_cpu;
                }
            }
        } else {
            new_cpu = hmp_select_faster_cpu(p, prev_cpu);
            if (new_cpu != NR_CPUS) {
                hmp_next_up_delay(&p->se, new_cpu);
                return new_cpu;
            }
        new_cpu = hmp_select_faster_cpu(p, prev_cpu);
        if (new_cpu != NR_CPUS) {
            hmp_next_up_delay(&p->se, new_cpu);
            return new_cpu;
        }
        /* failed to perform HMP fork balance, use normal balance */
        new_cpu = cpu;
    }
#endif

@@ -4179,16 +4267,24 @@ unlock:
    rcu_read_unlock();

#ifdef CONFIG_SCHED_HMP
    prev_cpu = task_cpu(p);

    if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
        hmp_next_up_delay(&p->se, new_cpu);
        trace_sched_hmp_migrate(p, new_cpu, 0);
        trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
        return new_cpu;
    }
    if (hmp_down_migration(prev_cpu, &p->se)) {
#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
        new_cpu = hmp_best_little_cpu(p, prev_cpu);
#else
        new_cpu = hmp_select_slower_cpu(p, prev_cpu);
        hmp_next_down_delay(&p->se, new_cpu);
        trace_sched_hmp_migrate(p, new_cpu, 0);
        return new_cpu;
#endif
        if (new_cpu != prev_cpu) {
            hmp_next_down_delay(&p->se, new_cpu);
            trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
            return new_cpu;
        }
    }
    /* Make sure that the task stays in its previous hmp domain */
    if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
@@ -6155,16 +6251,49 @@ static struct {
    unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;

#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
/*
 * Decide if the tasks on the busy CPUs in the
 * littlest domain would benefit from an idle balance
 */
static int hmp_packing_ilb_needed(int cpu)
{
    struct hmp_domain *hmp;
    /* always allow ilb on non-slowest domain */
    if (!hmp_cpu_is_slowest(cpu))
        return 1;

    hmp = hmp_cpu_domain(cpu);
    for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) {
        /* only idle balance if a CPU is loaded over threshold */
        if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold)
            return 1;
    }
    return 0;
}
#endif

static inline int find_new_ilb(int call_cpu)
{
    int ilb = cpumask_first(nohz.idle_cpus_mask);
#ifdef CONFIG_SCHED_HMP
    int ilb_needed = 1;

    /* restrict nohz balancing to occur in the same hmp domain */
    ilb = cpumask_first_and(nohz.idle_cpus_mask,
            &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);

#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
    if (ilb < nr_cpu_ids)
        ilb_needed = hmp_packing_ilb_needed(ilb);
#endif

    if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb))
        return ilb;
#else
    if (ilb < nr_cpu_ids && idle_cpu(ilb))
        return ilb;
#endif

    return nr_cpu_ids;
}
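hmp_packing_ilb_needed() above only lets the littlest domain kick off a nohz idle balance when a CPU it inspects is already loaded past hmp_full_threshold. A simplified userspace model of that decision (it checks every CPU in the domain, whereas the kernel code walks only those in the nohz idle mask; the load values are made up):

#include <stdio.h>

#define HMP_FULL_THRESHOLD  1152    /* (NICE_0_LOAD * 9) / 8 */

/* Idle balance is always allowed outside the littlest domain; inside it,
 * only when some CPU's tracked load exceeds the packing threshold. */
static int packing_ilb_needed(const unsigned long *load_avg_ratio,
                              int nr_cpus, int in_slowest_domain)
{
    if (!in_slowest_domain)
        return 1;

    for (int cpu = 0; cpu < nr_cpus; cpu++)
        if (load_avg_ratio[cpu] > HMP_FULL_THRESHOLD)
            return 1;
    return 0;
}

int main(void)
{
    unsigned long busy[] = { 1300, 100 };
    unsigned long quiet[] = { 400, 100 };

    printf("busy little domain:  ilb %s\n",
           packing_ilb_needed(busy, 2, 1) ? "needed" : "skipped");
    printf("quiet little domain: ilb %s\n",
           packing_ilb_needed(quiet, 2, 1) ? "needed" : "skipped");
    return 0;
}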
@@ -6490,11 +6619,9 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
{
    struct task_struct *p = task_of(se);
    int temp_target_cpu;
    u64 now;

    if (target_cpu)
        *target_cpu = NR_CPUS;

    if (hmp_cpu_is_fastest(cpu))
        return 0;

@@ -6517,13 +6644,12 @@ static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
     * idle CPU or 1023 for any partly-busy one.
     * Be explicit about requirement for an idle CPU.
     */
    if (hmp_domain_min_load(hmp_faster_domain(cpu), target_cpu) != 0)
        return 0;

    if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus,
            tsk_cpus_allowed(p)))
    if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu,
            tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) {
        if(target_cpu)
            *target_cpu = temp_target_cpu;
        return 1;
    }
    return 0;
}

@@ -6533,8 +6659,14 @@ static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
    struct task_struct *p = task_of(se);
    u64 now;

    if (hmp_cpu_is_slowest(cpu))
    if (hmp_cpu_is_slowest(cpu)) {
#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
        if(hmp_packing_enabled)
            return 1;
        else
#endif
        return 0;
    }

#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
    /* Filter by task priority */
@@ -6703,6 +6835,7 @@ static int hmp_active_task_migration_cpu_stop(void *data)
    rcu_read_unlock();
    double_unlock_balance(busiest_rq, target_rq);
out_unlock:
    put_task_struct(p);
    busiest_rq->active_balance = 0;
    raw_spin_unlock_irq(&busiest_rq->lock);
    return 0;
@@ -6776,6 +6909,7 @@ static int hmp_idle_pull_cpu_stop(void *data)
    rcu_read_unlock();
    double_unlock_balance(busiest_rq, target_rq);
out_unlock:
    put_task_struct(p);
    busiest_rq->active_balance = 0;
    raw_spin_unlock_irq(&busiest_rq->lock);
    return 0;
@@ -6821,11 +6955,12 @@ static void hmp_force_up_migration(int this_cpu)
        p = task_of(curr);
        if (hmp_up_migration(cpu, &target_cpu, curr)) {
            if (!target->active_balance) {
                get_task_struct(p);
                target->active_balance = 1;
                target->push_cpu = target_cpu;
                target->migrate_task = p;
                force = 1;
                trace_sched_hmp_migrate(p, target->push_cpu, 1);
                trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
                hmp_next_up_delay(&p->se, target->push_cpu);
            }
        }
@@ -6836,12 +6971,14 @@ static void hmp_force_up_migration(int this_cpu)
         * require extensive book keeping.
         */
        curr = hmp_get_lightest_task(orig, 1);
        p = task_of(curr);
        target->push_cpu = hmp_offload_down(cpu, curr);
        if (target->push_cpu < NR_CPUS) {
            get_task_struct(p);
            target->active_balance = 1;
            target->migrate_task = p;
            force = 1;
            trace_sched_hmp_migrate(p, target->push_cpu, 2);
            trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD);
            hmp_next_down_delay(&p->se, target->push_cpu);
        }
    }
@@ -6916,11 +7053,12 @@ static unsigned int hmp_idle_pull(int this_cpu)
    /* now we have a candidate */
    raw_spin_lock_irqsave(&target->lock, flags);
    if (!target->active_balance && task_rq(p) == target) {
        get_task_struct(p);
        target->active_balance = 1;
        target->push_cpu = this_cpu;
        target->migrate_task = p;
        force = 1;
        trace_sched_hmp_migrate(p, target->push_cpu, 3);
        trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
        hmp_next_up_delay(&p->se, target->push_cpu);
    }
    raw_spin_unlock_irqrestore(&target->lock, flags);
kernel/smp.c (12 lines changed)
@@ -12,6 +12,8 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#define CREATE_TRACE_POINTS
#include <trace/events/smp.h>

#include "smpboot.h"

@@ -159,8 +161,10 @@ void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
     * locking and barrier primitives. Generic code isn't really
     * equipped to do the right thing...
     */
    if (ipi)
    if (ipi) {
        trace_smp_call_func_send(csd->func, cpu);
        arch_send_call_function_single_ipi(cpu);
    }

    if (wait)
        csd_lock_wait(csd);
@@ -197,8 +201,9 @@ void generic_smp_call_function_single_interrupt(void)
         * so save them away before making the call:
         */
        csd_flags = csd->flags;

        trace_smp_call_func_entry(csd->func);
        csd->func(csd->info);
        trace_smp_call_func_exit(csd->func);

        /*
         * Unlocked CSDs are valid through generic_exec_single():
@@ -228,6 +233,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
    int this_cpu;
    int err = 0;

    trace_smp_call_func_send(func, cpu);
    /*
     * prevent preemption and reschedule on another processor,
     * as well as CPU removal
@@ -245,7 +251,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,

    if (cpu == this_cpu) {
        local_irq_save(flags);
        trace_smp_call_func_entry(func);
        func(info);
        trace_smp_call_func_exit(func);
        local_irq_restore(flags);
    } else {
        if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {