From 08169a162f97819d3e5b4a342bb9cf5137787154 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 14 Apr 2023 20:10:26 +0800 Subject: [PATCH 1/9] PM: hibernate: Turn snapshot_test into global variable There is need to check snapshot_test and open block device in different mode, so as to avoid the race condition. No functional changes intended. Suggested-by: Pavankumar Kondeti Signed-off-by: Chen Yu Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 7 ++++++- kernel/power/power.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 793c55a2becb..aa551b093c3f 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -64,6 +64,7 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; bool freezer_test_done; +bool snapshot_test; static const struct platform_hibernation_ops *hibernation_ops; @@ -716,7 +717,6 @@ static int load_image_and_restore(void) */ int hibernate(void) { - bool snapshot_test = false; unsigned int sleep_flags; int error; @@ -744,6 +744,9 @@ int hibernate(void) if (error) goto Exit; + /* protected by system_transition_mutex */ + snapshot_test = false; + lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); @@ -940,6 +943,8 @@ static int software_resume(void) */ mutex_lock_nested(&system_transition_mutex, SINGLE_DEPTH_NESTING); + snapshot_test = false; + if (swsusp_resume_device) goto Check_image; diff --git a/kernel/power/power.h b/kernel/power/power.h index b4f433943209..b83c8d5e188d 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -59,6 +59,7 @@ asmlinkage int swsusp_save(void); /* kernel/power/hibernate.c */ extern bool freezer_test_done; +extern bool snapshot_test; extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); From 5904de0d735bbb3b4afe9375c5b4f9748f882945 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Fri, 14 Apr 2023 20:10:42 +0800 Subject: [PATCH 2/9] PM: hibernate: Do not get block device exclusively in test_resume mode The system refused to do a test_resume because it found that the swap device has already been taken by someone else. Specifically, the swsusp_check()->blkdev_get_by_dev(FMODE_EXCL) is supposed to do this check. Steps to reproduce: dd if=/dev/zero of=/swapfile bs=$(cat /proc/meminfo | awk '/MemTotal/ {print $2}') count=1024 conv=notrunc mkswap /swapfile swapon /swapfile swap-offset /swapfile echo 34816 > /sys/power/resume_offset echo test_resume > /sys/power/disk echo disk > /sys/power/state PM: Using 3 thread(s) for compression PM: Compressing and saving image data (293150 pages)... PM: Image saving progress: 0% PM: Image saving progress: 10% ata1: SATA link up 1.5 Gbps (SStatus 113 SControl 300) ata1.00: configured for UDMA/100 ata2: SATA link down (SStatus 0 SControl 300) ata5: SATA link down (SStatus 0 SControl 300) ata6: SATA link down (SStatus 0 SControl 300) ata3: SATA link down (SStatus 0 SControl 300) ata4: SATA link down (SStatus 0 SControl 300) PM: Image saving progress: 20% PM: Image saving progress: 30% PM: Image saving progress: 40% PM: Image saving progress: 50% pcieport 0000:00:02.5: pciehp: Slot(0-5): No device found PM: Image saving progress: 60% PM: Image saving progress: 70% PM: Image saving progress: 80% PM: Image saving progress: 90% PM: Image saving done PM: hibernation: Wrote 1172600 kbytes in 2.70 seconds (434.29 MB/s) PM: S| PM: hibernation: Basic memory bitmaps freed PM: Image not found (code -16) This is because when using the swapfile as the hibernation storage, the block device where the swapfile is located has already been mounted by the OS distribution(usually mounted as the rootfs). This is not an issue for normal hibernation, because software_resume()->swsusp_check() happens before the block device(rootfs) mount. But it is a problem for the test_resume mode. Because when test_resume happens, the block device has been mounted already. Thus remove the FMODE_EXCL for test_resume mode. This would not be a problem because in test_resume stage, the processes have already been frozen, and the race condition described in Commit 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") is unlikely to happen. Fixes: 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") Reported-by: Yifan Li Suggested-by: Pavankumar Kondeti Tested-by: Pavankumar Kondeti Tested-by: Wendy Wang Signed-off-by: Chen Yu Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 8 ++++++-- kernel/power/swap.c | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aa551b093c3f..30d1274f03f6 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -688,18 +688,22 @@ static int load_image_and_restore(void) { int error; unsigned int flags; + fmode_t mode = FMODE_READ; + + if (snapshot_test) + mode |= FMODE_EXCL; pm_pr_dbg("Loading hibernation image.\n"); lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) { - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(mode); goto Unlock; } error = swsusp_read(&flags); - swsusp_close(FMODE_READ | FMODE_EXCL); + swsusp_close(mode); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 36a1df48280c..92e41ed292ad 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -1518,9 +1518,13 @@ int swsusp_check(void) { int error; void *holder; + fmode_t mode = FMODE_READ; + + if (snapshot_test) + mode |= FMODE_EXCL; hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, - FMODE_READ | FMODE_EXCL, &holder); + mode, &holder); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); @@ -1547,7 +1551,7 @@ int swsusp_check(void) put: if (error) - blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL); + blkdev_put(hib_resume_bdev, mode); else pr_debug("Image signature found, resuming\n"); } else { From 91048ce4227868000bfe624707e42274ef2198aa Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 19 Apr 2023 17:39:41 +0300 Subject: [PATCH 3/9] intel_idle: use pr_info() instead of printk() Substitute 'printk()' with 'pr_info()', because 'intel_idle' already uses 'pr_debug()', so using 'pr_info()' will be more consistent. In addition to this, this patch addresses the following checkpatch.pl warning: WARNING: printk() should include KERN_ facility level Signed-off-by: Artem Bityutskiy Reviewed-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 938c17f25d94..726a361da422 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1895,7 +1895,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) drv->states[drv->state_count] = cpuidle_state_table[cstate]; if ((cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE) || force_irq_on) { - printk("intel_idle: forced intel_idle_irq for state %d\n", cstate); + pr_info("forced intel_idle_irq for state %d\n", cstate); drv->states[drv->state_count].enter = intel_idle_irq; } From a78032e94bf16e9c53238cd83c82095b4a251d4b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 20 Apr 2023 09:47:18 +0300 Subject: [PATCH 4/9] intel_idle: clean up intel_idle_init_cstates_icpu() The intel_idle_init_cstates_icpu() function includes a loop that iterates over every C-state. Inside the loop, the same C-state data is referenced 2 ways: 1. as cpuidle_state_table[cstate] 2. as drv->states[drv->state_count] (but it is a copy of #1, not the same object). Make the code be more consistent and easier to read by using only the 2nd way. So the code structure would be as follows: 1. Use cpuidle_state_table[cstate] 2. Copy cpuidle_state_table[cstate] to drv->states[drv->state_count] 3. Use only drv->states[drv->state_count] from this point. Note, this change introduces a checkpatch.pl warning (too long line), but it will be addressed in the next patch. Signed-off-by: Artem Bityutskiy Reviewed-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 726a361da422..190410fc9ce5 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1894,24 +1894,24 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) /* Structure copy. */ drv->states[drv->state_count] = cpuidle_state_table[cstate]; - if ((cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE) || force_irq_on) { + if ((drv->states[drv->state_count].flags & CPUIDLE_FLAG_IRQ_ENABLE) || force_irq_on) { pr_info("forced intel_idle_irq for state %d\n", cstate); drv->states[drv->state_count].enter = intel_idle_irq; } if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && - cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS) { - WARN_ON_ONCE(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IRQ_ENABLE); + drv->states[drv->state_count].flags & CPUIDLE_FLAG_IBRS) { + WARN_ON_ONCE(drv->states[drv->state_count].flags & CPUIDLE_FLAG_IRQ_ENABLE); drv->states[drv->state_count].enter = intel_idle_ibrs; } - if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_INIT_XSTATE) + if (drv->states[drv->state_count].flags & CPUIDLE_FLAG_INIT_XSTATE) drv->states[drv->state_count].enter = intel_idle_xstate; if ((disabled_states_mask & BIT(drv->state_count)) || ((icpu->use_acpi || force_use_acpi) && intel_idle_off_by_default(mwait_hint) && - !(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_ALWAYS_ENABLE))) + !(drv->states[drv->state_count].flags & CPUIDLE_FLAG_ALWAYS_ENABLE))) drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; if (intel_idle_state_needs_timer_stop(&drv->states[drv->state_count])) From 1abffbd827668b3b44c34b168ccb5ee2866714a8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 19 Apr 2023 17:39:43 +0300 Subject: [PATCH 5/9] intel_idle: further intel_idle_init_cstates_icpu() cleanup Introduce a temporary 'state' variable for referencing the currently processed C-state in the intel_idle_init_cstates_icpu() function. This makes code lines shorter and easier to read. Signed-off-by: Artem Bityutskiy Reviewed-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 190410fc9ce5..73ddb1d8cfcf 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1871,6 +1871,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { + struct cpuidle_state *state; unsigned int mwait_hint; if (intel_idle_max_cstate_reached(cstate)) @@ -1893,29 +1894,30 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) /* Structure copy. */ drv->states[drv->state_count] = cpuidle_state_table[cstate]; + state = &drv->states[drv->state_count]; - if ((drv->states[drv->state_count].flags & CPUIDLE_FLAG_IRQ_ENABLE) || force_irq_on) { + if ((state->flags & CPUIDLE_FLAG_IRQ_ENABLE) || force_irq_on) { pr_info("forced intel_idle_irq for state %d\n", cstate); - drv->states[drv->state_count].enter = intel_idle_irq; + state->enter = intel_idle_irq; } if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && - drv->states[drv->state_count].flags & CPUIDLE_FLAG_IBRS) { - WARN_ON_ONCE(drv->states[drv->state_count].flags & CPUIDLE_FLAG_IRQ_ENABLE); - drv->states[drv->state_count].enter = intel_idle_ibrs; + state->flags & CPUIDLE_FLAG_IBRS) { + WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE); + state->enter = intel_idle_ibrs; } - if (drv->states[drv->state_count].flags & CPUIDLE_FLAG_INIT_XSTATE) - drv->states[drv->state_count].enter = intel_idle_xstate; + if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) + state->enter = intel_idle_xstate; if ((disabled_states_mask & BIT(drv->state_count)) || ((icpu->use_acpi || force_use_acpi) && intel_idle_off_by_default(mwait_hint) && - !(drv->states[drv->state_count].flags & CPUIDLE_FLAG_ALWAYS_ENABLE))) - drv->states[drv->state_count].flags |= CPUIDLE_FLAG_OFF; + !(state->flags & CPUIDLE_FLAG_ALWAYS_ENABLE))) + state->flags |= CPUIDLE_FLAG_OFF; - if (intel_idle_state_needs_timer_stop(&drv->states[drv->state_count])) - drv->states[drv->state_count].flags |= CPUIDLE_FLAG_TIMER_STOP; + if (intel_idle_state_needs_timer_stop(state)) + state->flags |= CPUIDLE_FLAG_TIMER_STOP; drv->state_count++; } From 00433eae1771d355f20c152ddc5976baedb427e2 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 19 Apr 2023 17:39:44 +0300 Subject: [PATCH 6/9] intel_idle: improve C-state flags handling robustness The following C-state flags are currently mutually-exclusive and should not be combined: * IRQ_ENABLE * IBRS * XSTATE There is a warning for the situation when the IRQ_ENABLE flag is combined with the IBRS flag, but no warnings for other combinations. This is inconsistent and prone to errors. Improve the situation by adding warnings for all the unexpected combinations. Add a couple of helpful commentaries too. Signed-off-by: Artem Bityutskiy Reviewed-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 73ddb1d8cfcf..1de36df15d5a 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1896,20 +1896,28 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) drv->states[drv->state_count] = cpuidle_state_table[cstate]; state = &drv->states[drv->state_count]; - if ((state->flags & CPUIDLE_FLAG_IRQ_ENABLE) || force_irq_on) { + if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) { + /* + * Combining with XSTATE with IBRS or IRQ_ENABLE flags + * is not currently supported but this driver. + */ + WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS); + WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE); + state->enter = intel_idle_xstate; + } else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && + state->flags & CPUIDLE_FLAG_IBRS) { + /* + * IBRS mitigation requires that C-states are entered + * with interrupts disabled. + */ + WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE); + state->enter = intel_idle_ibrs; + } else if ((state->flags & CPUIDLE_FLAG_IRQ_ENABLE) || + force_irq_on) { pr_info("forced intel_idle_irq for state %d\n", cstate); state->enter = intel_idle_irq; } - if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) && - state->flags & CPUIDLE_FLAG_IBRS) { - WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE); - state->enter = intel_idle_ibrs; - } - - if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) - state->enter = intel_idle_xstate; - if ((disabled_states_mask & BIT(drv->state_count)) || ((icpu->use_acpi || force_use_acpi) && intel_idle_off_by_default(mwait_hint) && From db1ae0c99950502d10e09d0c57ed1e16cd854b20 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 19 Apr 2023 17:39:45 +0300 Subject: [PATCH 7/9] intel_idle: fix confusing message By default, all non-POLL C-states are entered with interrupts disabled. There are 2 ways to make 'intel_idle' enter C-states with interrupts enabled: 1. Mark the C-state with the CPUIDLE_FLAG_IRQ_ENABLE flag. 2. Use the force_irq_on module parameter. The former is the "proper" way of doing it, it is per-C-state and per-platform. The latter is for debugging purposes only. The problem is that intel_idle prints the "forced intel_idle_irq" message in both cases, even though the former case does not needed this message, because nothing is forced there. This patch addresses the problem. Signed-off-by: Artem Bityutskiy Reviewed-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 1de36df15d5a..bff0d17aeda4 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1912,8 +1912,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) */ WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE); state->enter = intel_idle_ibrs; - } else if ((state->flags & CPUIDLE_FLAG_IRQ_ENABLE) || - force_irq_on) { + } else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) { + state->enter = intel_idle_irq; + } else if (force_irq_on) { pr_info("forced intel_idle_irq for state %d\n", cstate); state->enter = intel_idle_irq; } From 4152379a701ad66b6c4e1c9fc93704d993e49615 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 20 Apr 2023 09:47:21 +0300 Subject: [PATCH 8/9] intel_idle: do not sprinkle module parameter definitions around This is a cleanup which improves code consistency. Move the force_irq_on module parameter variable and definition to the same place where we have variables and definitions for other module parameters. Signed-off-by: Artem Bityutskiy Reviewed-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index bff0d17aeda4..35bd284f7763 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -68,6 +68,7 @@ static struct cpuidle_driver intel_idle_driver = { static int max_cstate = CPUIDLE_STATE_MAX - 1; static unsigned int disabled_states_mask; static unsigned int preferred_states_mask; +static bool force_irq_on __read_mostly; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; @@ -1838,9 +1839,6 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) return true; } -static bool force_irq_on __read_mostly; -module_param(force_irq_on, bool, 0444); - static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) { int cstate; @@ -2157,3 +2155,9 @@ MODULE_PARM_DESC(states_off, "Mask of disabled idle states"); */ module_param_named(preferred_cstates, preferred_states_mask, uint, 0444); MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states"); +/* + * Debugging option that forces the driver to enter all C-states with + * interrupts enabled. Does not apply to C-states with + * 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags. + */ +module_param(force_irq_on, bool, 0444); From bd4468295e7a335319c10affdf594b56f1e6c0a4 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 19 Apr 2023 17:39:47 +0300 Subject: [PATCH 9/9] intel_idle: mark few variables as __read_mostly The intention is to clean up the code and make it look a bit more consistent. Mark all unitialized module parameter variables as __read_mostly, not just one of them. The other parameters are read-mostly too. Signed-off-by: Artem Bityutskiy Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 35bd284f7763..aa2d19db2b1d 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -66,8 +66,8 @@ static struct cpuidle_driver intel_idle_driver = { }; /* intel_idle.max_cstate=0 disables driver */ static int max_cstate = CPUIDLE_STATE_MAX - 1; -static unsigned int disabled_states_mask; -static unsigned int preferred_states_mask; +static unsigned int disabled_states_mask __read_mostly; +static unsigned int preferred_states_mask __read_mostly; static bool force_irq_on __read_mostly; static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;