Patchwork [v7,22/25] ACPI / APEI: Kick the memory_failure() queue for synchronous errors

login
register
mail settings
Submitter James Morse
Date Dec. 3, 2018, 6:06 p.m.
Message ID <20181203180613.228133-23-james.morse@arm.com>
Download mbox | patch
Permalink /patch/671079/
State New
Headers show

Comments

James Morse - Dec. 3, 2018, 6:06 p.m.
memory_failure() offlines or repairs pages of memory that have been
discovered to be corrupt. These may be detected by an external
component, (e.g. the memory controller), and notified via an IRQ.
In this case the work is queued as not all of memory_failure()s work
can happen in IRQ context.

If the error was detected as a result of user-space accessing a
corrupt memory location the CPU may take an abort instead. On arm64
this is a 'synchronous external abort', and on a firmware first
system it is replayed using NOTIFY_SEA.

This notification has NMI like properties, (it can interrupt
IRQ-masked code), so the memory_failure() work is queued. If we
return to user-space before the queued memory_failure() work is
processed, we will take the fault again. This loop may cause platform
firmware to exceed some threshold and reboot when Linux could have
recovered from this error.

If a ghes notification type indicates that it may be triggered again
when we return to user-space, use the task-work and notify-resume
hooks to kick the relevant memory_failure() queue before returning
to user-space.

Signed-off-by: James Morse <james.morse@arm.com>

---
current->mm == &init_mm ? I couldn't find a helper for this.
The intent is not to set TIF flags on kernel threads. What happens
if a kernel-thread takes on of these? Its just one of the many
not-handled-very-well cases we have already, as memory_failure()
puts it: "try to be lucky".

I assume that if NOTIFY_NMI is coming from SMM it must suffer from
this problem too.
---
 drivers/acpi/apei/ghes.c | 65 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 60 insertions(+), 5 deletions(-)
Xie XiuQi - Dec. 5, 2018, 2:02 a.m.
Hi James & Boris,

On 2018/12/4 2:06, James Morse wrote:
> memory_failure() offlines or repairs pages of memory that have been
> discovered to be corrupt. These may be detected by an external
> component, (e.g. the memory controller), and notified via an IRQ.
> In this case the work is queued as not all of memory_failure()s work
> can happen in IRQ context.
> 
> If the error was detected as a result of user-space accessing a
> corrupt memory location the CPU may take an abort instead. On arm64
> this is a 'synchronous external abort', and on a firmware first
> system it is replayed using NOTIFY_SEA.
> 
> This notification has NMI like properties, (it can interrupt
> IRQ-masked code), so the memory_failure() work is queued. If we
> return to user-space before the queued memory_failure() work is
> processed, we will take the fault again. This loop may cause platform
> firmware to exceed some threshold and reboot when Linux could have
> recovered from this error.
> 
> If a ghes notification type indicates that it may be triggered again
> when we return to user-space, use the task-work and notify-resume
> hooks to kick the relevant memory_failure() queue before returning
> to user-space.
> 
> Signed-off-by: James Morse <james.morse@arm.com>
> 
> ---
> current->mm == &init_mm ? I couldn't find a helper for this.
> The intent is not to set TIF flags on kernel threads. What happens
> if a kernel-thread takes on of these? Its just one of the many
> not-handled-very-well cases we have already, as memory_failure()
> puts it: "try to be lucky".
> 
> I assume that if NOTIFY_NMI is coming from SMM it must suffer from
> this problem too.
> ---
>  drivers/acpi/apei/ghes.c | 65 ++++++++++++++++++++++++++++++++++++----
>  1 file changed, 60 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 6cbf9471b2a2..3e7da9243153 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -47,6 +47,7 @@
>  #include <linux/sched/clock.h>
>  #include <linux/uuid.h>
>  #include <linux/ras.h>
> +#include <linux/task_work.h>
>  
>  #include <acpi/actbl1.h>
>  #include <acpi/ghes.h>
> @@ -136,6 +137,26 @@ static atomic_t ghes_estatus_cache_alloced;
>  
>  static int ghes_panic_timeout __read_mostly = 30;
>  
> +static bool ghes_is_synchronous(struct ghes *ghes)
> +{
> +	switch (ghes->generic->notify.type) {
> +	case ACPI_HEST_NOTIFY_NMI:	/* fall through */
> +	case ACPI_HEST_NOTIFY_SEA:
> +		/*
> +		 * These notifications could be repeated if the interrupted
> +		 * instruction is run again. e.g. a read of bad-memory causing
> +		 * a trap to platform firmware.
> +		 */
> +		return true;
> +	default:
> +		/*
> +		 * Other notifications are asynchronous, and not related to the
> +		 * interrupted instruction. e.g. an IRQ.
> +		 */
> +		return false;
> +	}
> +}
> +
>  static void __iomem *ghes_map(u64 pfn, int fixmap_idx)
>  {
>  	phys_addr_t paddr;
> @@ -379,14 +400,33 @@ static void ghes_clear_estatus(struct acpi_hest_generic_status *estatus,
>  				      fixmap_idx);
>  }
>  
> -static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
> +struct ghes_memory_failure_work {
> +	int cpu;
> +	struct callback_head work;
> +};
> +
> +static void ghes_kick_memory_failure(struct callback_head *head)
> +{
> +	struct ghes_memory_failure_work *callback;
> +
> +	callback = container_of(head, struct ghes_memory_failure_work, work);
> +	memory_failure_queue_kick(callback->cpu);
> +	kfree(callback);
> +}
> +
> +static void ghes_handle_memory_failure(struct ghes *ghes,
> +				       struct acpi_hest_generic_data *gdata,
> +				       int sev)
>  {
> -#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
>  	unsigned long pfn;
> -	int flags = -1;
> +	int flags = -1, ret;
> +	struct ghes_memory_failure_work	*callback;
>  	int sec_sev = ghes_severity(gdata->error_severity);
>  	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
>  
> +	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
> +		return;
> +
>  	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
>  		return;
>  
> @@ -407,7 +447,22 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>  
>  	if (flags != -1)
>  		memory_failure_queue(pfn, flags);

We may need to take MF_ACTION_REQUIRED flags for memory_failure() in SEA condition.
And there is no return value check for memory_failure() in memory_failure_work_func(),
I'm not sure whether we need to check the return value.

static void memory_failure_work_func(struct work_struct *work)
{
        struct memory_failure_cpu *mf_cpu;
        struct memory_failure_entry entry = { 0, };
        unsigned long proc_flags;
        int gotten;

        mf_cpu = container_of(work, struct memory_failure_cpu, work);
        for (;;) {
                spin_lock_irqsave(&mf_cpu->lock, proc_flags);
                gotten = kfifo_get(&mf_cpu->fifo, &entry);
                spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
                if (!gotten)
                        break;
                if (entry.flags & MF_SOFT_OFFLINE)
                        soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
                else
                        memory_failure(entry.pfn, entry.flags);
        }
}

If the recovery fails here, we need to take other actions, such as force to send a SIGBUS signal.


> -#endif
> +
> +	/*
> +	 * If the notification indicates that it was the interrupted
> +	 * instruction that caused the error, try to kick the
> +	 * memory_failure() queue before returning to user-space.
> +	 */
> +	if (ghes_is_synchronous(ghes) && current->mm != &init_mm) {
> +		callback = kzalloc(sizeof(*callback), GFP_ATOMIC);
> +		if (!callback)
> +			return;
> +		callback->work.func = ghes_kick_memory_failure;
> +		callback->cpu = smp_processor_id();
> +		ret = task_work_add(current, &callback->work, true);
> +		if (ret)
> +			kfree(callback);
> +	}
>  }
>  
>  /*
> @@ -480,7 +535,7 @@ static void ghes_do_proc(struct ghes *ghes,
>  			ghes_edac_report_mem_error(sev, mem_err);
>  
>  			arch_apei_report_mem_error(sev, mem_err);
> -			ghes_handle_memory_failure(gdata, sev);
> +			ghes_handle_memory_failure(ghes, gdata, sev);
>  		}
>  		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
>  			ghes_handle_aer(gdata);
>
James Morse - Dec. 10, 2018, 7:15 p.m.
Hi Xie XiuQi,

On 05/12/2018 02:02, Xie XiuQi wrote:
> On 2018/12/4 2:06, James Morse wrote:
>> memory_failure() offlines or repairs pages of memory that have been
>> discovered to be corrupt. These may be detected by an external
>> component, (e.g. the memory controller), and notified via an IRQ.
>> In this case the work is queued as not all of memory_failure()s work
>> can happen in IRQ context.
>>
>> If the error was detected as a result of user-space accessing a
>> corrupt memory location the CPU may take an abort instead. On arm64
>> this is a 'synchronous external abort', and on a firmware first
>> system it is replayed using NOTIFY_SEA.
>>
>> This notification has NMI like properties, (it can interrupt
>> IRQ-masked code), so the memory_failure() work is queued. If we
>> return to user-space before the queued memory_failure() work is
>> processed, we will take the fault again. This loop may cause platform
>> firmware to exceed some threshold and reboot when Linux could have
>> recovered from this error.
>>
>> If a ghes notification type indicates that it may be triggered again
>> when we return to user-space, use the task-work and notify-resume
>> hooks to kick the relevant memory_failure() queue before returning


>> @@ -407,7 +447,22 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>>  
>>  	if (flags != -1)
>>  		memory_failure_queue(pfn, flags);
> 
> We may need to take MF_ACTION_REQUIRED flags for memory_failure() in SEA condition.

Hmmm, I'd forgotten about the extra flags. They're only used by x86's
do_machine_check(), which knows more about what is going on. I agree we do know
it should be a 'MF_ACTION_REQUIRED' for Synchronous-external-abort, but I'd
really like all the notifications to behave in the same way as we can't change
which notification firmware uses.
(This ghes_is_synchronous() affects when memory_failure() runs, not what it does.)

What happens if we miss MF_ACTION_REQUIRED? Surely the page still gets unmapped
as its PG_Poisoned, an AO signal may be pending, but if user-space touches the
page it will get an AR signal. Is this just about removing an extra AO signal to
user-space?

If we do need this, I'd like to pick it up from the CPER records, as x86's
NOTIFY_NMI looks like it covers both AO/AR cases. (as does NOTIFY_SDEI). The
Master/Target abort or Invalid-address types in the memory-error-section CPER
records look like the best bet.


> And there is no return value check for memory_failure() in memory_failure_work_func(),
> I'm not sure whether we need to check the return value.

What would we do if it fails? The reasons look fairly broad, -EBUSY can mean
"(page) still referenced by [..] users", 'thp split failed' or 'page already
poisoned'. I don't think the behaviour or return-codes are consistent enough to use.


> If the recovery fails here, we need to take other actions, such as force to send a SIGBUS signal.

We don't do this today. If it fixes some mis-behaviour, and we can key it from
something in the CPER records then I'm all ears!


Thanks,

James
Borislav Petkov - Jan. 21, 2019, 5:58 p.m.
On Mon, Dec 03, 2018 at 06:06:10PM +0000, James Morse wrote:
> memory_failure() offlines or repairs pages of memory that have been
> discovered to be corrupt. These may be detected by an external
> component, (e.g. the memory controller), and notified via an IRQ.
> In this case the work is queued as not all of memory_failure()s work
> can happen in IRQ context.
> 
> If the error was detected as a result of user-space accessing a
> corrupt memory location the CPU may take an abort instead. On arm64
> this is a 'synchronous external abort', and on a firmware first
> system it is replayed using NOTIFY_SEA.
> 
> This notification has NMI like properties, (it can interrupt
> IRQ-masked code), so the memory_failure() work is queued. If we
> return to user-space before the queued memory_failure() work is
> processed, we will take the fault again. This loop may cause platform
> firmware to exceed some threshold and reboot when Linux could have
> recovered from this error.
> 
> If a ghes notification type indicates that it may be triggered again
> when we return to user-space, use the task-work and notify-resume
> hooks to kick the relevant memory_failure() queue before returning
> to user-space.
> 
> Signed-off-by: James Morse <james.morse@arm.com>
> 
> ---
> current->mm == &init_mm ? I couldn't find a helper for this.
> The intent is not to set TIF flags on kernel threads. What happens
> if a kernel-thread takes on of these? Its just one of the many
> not-handled-very-well cases we have already, as memory_failure()
> puts it: "try to be lucky".
> 
> I assume that if NOTIFY_NMI is coming from SMM it must suffer from
> this problem too.

Good question.

I'm guessing all those things should be queued on a normal struct
work_struct queue, no?

Now, memory_failure_queue() does that and can run from IRQ context so
you need only an irq_work which can queue from NMI context. We do it
this way in the MCA code:

We queue in an irq_work in NMI context and work through the items in
process context.

> ---
>  drivers/acpi/apei/ghes.c | 65 ++++++++++++++++++++++++++++++++++++----
>  1 file changed, 60 insertions(+), 5 deletions(-)

...

> @@ -407,7 +447,22 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>  
>  	if (flags != -1)
>  		memory_failure_queue(pfn, flags);
> -#endif
> +
> +	/*
> +	 * If the notification indicates that it was the interrupted
> +	 * instruction that caused the error, try to kick the
> +	 * memory_failure() queue before returning to user-space.
> +	 */
> +	if (ghes_is_synchronous(ghes) && current->mm != &init_mm) {
> +		callback = kzalloc(sizeof(*callback), GFP_ATOMIC);

Can we avoid that GFP_ATOMIC allocation and kfree() in
ghes_kick_memory_failure()?

I mean, that struct ghes_memory_failure_work is small enough and we
already do lockless allocation:

	estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);

so I guess we could add that ghes_memory_failure_work struct to that
estatus_node, hand it into ghes_do_proc() and then free it.

No?
Borislav Petkov - Jan. 22, 2019, 10:51 a.m.
On Mon, Dec 10, 2018 at 07:15:13PM +0000, James Morse wrote:
> What happens if we miss MF_ACTION_REQUIRED?

AFAICU, the logic is to force-send a signal to the user process, i.e.,
force_sig_info() which cannot be ignored. IOW, an "enlightened" process
would know how to do recovery action from a memory error.

VS the action optional thing which you can handle at your leisure.

So the question boils down to what kind of severity do the errors
reported through SEA have? I mean, if the hw would go the trouble to do
the synchronous reporting, then something important must've happened and
it wants us to know about it and handle it.

> Surely the page still gets unmapped as its PG_Poisoned, an AO signal
> may be pending, but if user-space touches the page it will get an AR
> signal. Is this just about removing an extra AO signal to user-space?
>
> If we do need this, I'd like to pick it up from the CPER records, as x86's
> NOTIFY_NMI looks like it covers both AO/AR cases. (as does NOTIFY_SDEI). The
> Master/Target abort or Invalid-address types in the memory-error-section CPER
> records look like the best bet.

Right, and we do all kinds of severity mapping there aka ghes_severity()
so that'll be a good start, methinks.
James Morse - Jan. 23, 2019, 6:37 p.m.
Hi Boris,

On 22/01/2019 10:51, Borislav Petkov wrote:
> On Mon, Dec 10, 2018 at 07:15:13PM +0000, James Morse wrote:
>> What happens if we miss MF_ACTION_REQUIRED?
> 
> AFAICU, the logic is to force-send a signal to the user process, i.e.,
> force_sig_info() which cannot be ignored. IOW, an "enlightened" process
> would know how to do recovery action from a memory error.
> 
> VS the action optional thing which you can handle at your leisure.

> So the question boils down to what kind of severity do the errors
> reported through SEA have? I mean, if the hw would go the trouble to do
> the synchronous reporting, then something important must've happened and
> it wants us to know about it and handle it.

Before v8.2 we assumed these were fatal for the thread, it couldn't make progress.
Since v8.2 we get a value from the CPU, the severity values are, (the flippant
summary is obviously mine!):
* Recoverable: "You're about to step in it, fix it or die"
* Uncontainable: "It was here, but it escaped, we dont know where it went, panic!"
* Restartable/Corrected: "its fine, pretend this didn't happen"

Firmware should duplicate these values into the CPER severity fields.


>> Surely the page still gets unmapped as its PG_Poisoned, an AO signal
>> may be pending, but if user-space touches the page it will get an AR
>> signal. Is this just about removing an extra AO signal to user-space?

If we miss MF_ACTION_REQUIRED, the page still gets unmapped from user-space, and
user-space gets an AO signal. With this patch it takes that signal before it
continues. If it ignores it, the access gets a translation-fault->EHWPOISON->AR
signal from the arch code.

... so missing the flag gives us an extra signal. I'm not convinced this results
in any observable difference.


>> If we do need this, I'd like to pick it up from the CPER records, as x86's
>> NOTIFY_NMI looks like it covers both AO/AR cases. (as does NOTIFY_SDEI). The
>> Master/Target abort or Invalid-address types in the memory-error-section CPER
>> records look like the best bet.
> 
> Right, and we do all kinds of severity mapping there aka ghes_severity()
> so that'll be a good start, methinks.

The options are those 'aborts' in the memory error. These must have been a
result of some request. If we get a CPU error structure as part of the same
block, it may have a cache/bus error structure, which has a precise bit that
tells us whether this is a co-incidence. (but linux doesn't support any of those
structures today)



Thanks,

James
James Morse - Jan. 23, 2019, 6:40 p.m.
Hi Boris,

On 21/01/2019 17:58, Borislav Petkov wrote:
> On Mon, Dec 03, 2018 at 06:06:10PM +0000, James Morse wrote:
>> memory_failure() offlines or repairs pages of memory that have been
>> discovered to be corrupt. These may be detected by an external
>> component, (e.g. the memory controller), and notified via an IRQ.
>> In this case the work is queued as not all of memory_failure()s work
>> can happen in IRQ context.
>>
>> If the error was detected as a result of user-space accessing a
>> corrupt memory location the CPU may take an abort instead. On arm64
>> this is a 'synchronous external abort', and on a firmware first
>> system it is replayed using NOTIFY_SEA.
>>
>> This notification has NMI like properties, (it can interrupt
>> IRQ-masked code), so the memory_failure() work is queued. If we
>> return to user-space before the queued memory_failure() work is
>> processed, we will take the fault again. This loop may cause platform
>> firmware to exceed some threshold and reboot when Linux could have
>> recovered from this error.
>>
>> If a ghes notification type indicates that it may be triggered again
>> when we return to user-space, use the task-work and notify-resume
>> hooks to kick the relevant memory_failure() queue before returning
>> to user-space.

>> ---

>> I assume that if NOTIFY_NMI is coming from SMM it must suffer from
>> this problem too.
> 
> Good question.
> 
> I'm guessing all those things should be queued on a normal struct
> work_struct queue, no?

ghes_notify_nmi() does this today with its:
|	irq_work_queue(&ghes_proc_irq_work);

Once its in IRQ context, the irq_work pokes memory_failure_queue(), which
schedule_work_on()s.

Finally we schedule() in process context, and can unmap the affected memory.


The problem is between each of these steps we might return to user-space and run
the instruction that tripped all this to begin with.


My SMM comment was because the CPU must jump from user-space->SMM, which injects
an NMI into the kernel. The kernel's EIP must point into user-space, so
returning from the NMI without doing the memory_failure() work puts us back the
same position we started in.


> Now, memory_failure_queue() does that and can run from IRQ context so
> you need only an irq_work which can queue from NMI context. We do it
> this way in the MCA code:
> 

(was there something missing here?)

> We queue in an irq_work in NMI context and work through the items in
> process context.

How are you getting from NMI to process context in one go?

This patch causes the IRQ->process transition.
The arch specific bit of this gives the irq work queue a kick if returning from
the NMI would unmask IRQs. This makes it look like we moved from NMI to IRQ
context without returning to user-space.

Once ghes_handle_memory_failure() runs in IRQ context, it task_work_add()s the
call to ghes_kick_memory_failure().

Finally on the way out of the kernel to user-space that task_work runs and the
memory_failure() work happens in process context.

During all this the user-space program counter can point at a poisoned location,
but we don't return there until the memory_failure() work has been done.


>> @@ -407,7 +447,22 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
>>  
>>  	if (flags != -1)
>>  		memory_failure_queue(pfn, flags);
>> -#endif
>> +
>> +	/*
>> +	 * If the notification indicates that it was the interrupted
>> +	 * instruction that caused the error, try to kick the
>> +	 * memory_failure() queue before returning to user-space.
>> +	 */
>> +	if (ghes_is_synchronous(ghes) && current->mm != &init_mm) {
>> +		callback = kzalloc(sizeof(*callback), GFP_ATOMIC);
> 
> Can we avoid that GFP_ATOMIC allocation and kfree() in
> ghes_kick_memory_failure()?
> 
> I mean, that struct ghes_memory_failure_work is small enough and we
> already do lockless allocation:
> 
> 	estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
> 
> so I guess we could add that ghes_memory_failure_work struct to that
> estatus_node, hand it into ghes_do_proc() and then free it.

I forget estatus_node is a linux thing, not an ACPI-spec thing!

Hmmm, ghes_handle_memory_failure() runs for POLLED and irq error sources too,
they don't have an estatus_node. We don't care about this ret_to_user() problem
as they are all asynchronous, this is why we have ghes_is_synchronous()...

It feels like there should be a way to do this, let me have a go...


Thanks,

James
Borislav Petkov - Jan. 31, 2019, 2:04 p.m.
On Wed, Jan 23, 2019 at 06:40:08PM +0000, James Morse wrote:
> My SMM comment was because the CPU must jump from user-space->SMM, which injects
> an NMI into the kernel. The kernel's EIP must point into user-space, so
> returning from the NMI without doing the memory_failure() work puts us back the
> same position we started in.

Yeah, known issue. We dealt with that on x86 at the time:

d4812e169de4 ("x86, mce: Get rid of TIF_MCE_NOTIFY and associated mce tricks")

> > Now, memory_failure_queue() does that and can run from IRQ context so
> > you need only an irq_work which can queue from NMI context. We do it
> > this way in the MCA code:
> > 
> 
> (was there something missing here?)

Whoops. Yeah, I was about to paste this:

void mce_log(struct mce *m)
{
        if (!mce_gen_pool_add(m))
                irq_work_queue(&mce_irq_work);
}

we're basically queueing only into the lockless buffer and kicking the
IRQ work.

> > We queue in an irq_work in NMI context and work through the items in
> > process context.
> 
> How are you getting from NMI to process context in one go?

Well, #MC is basically an NMI context on x86 and when it is done, we
work through the items queued in process context. But see the commit
above too - for really urgent errors we run memory_failure *before* we
return to user.

> This patch causes the IRQ->process transition.
> The arch specific bit of this gives the irq work queue a kick if returning from
> the NMI would unmask IRQs. This makes it look like we moved from NMI to IRQ
> context without returning to user-space.
> 
> Once ghes_handle_memory_failure() runs in IRQ context, it task_work_add()s the
> call to ghes_kick_memory_failure().
> 
> Finally on the way out of the kernel to user-space that task_work runs and the
> memory_failure() work happens in process context.
> 
> During all this the user-space program counter can point at a poisoned location,
> but we don't return there until the memory_failure() work has been done.

Sounds very similar.

Actually, yours is even a bit more elegant. I wonder why we didn't use
task_work_add() then...

Thx.

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 6cbf9471b2a2..3e7da9243153 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -47,6 +47,7 @@ 
 #include <linux/sched/clock.h>
 #include <linux/uuid.h>
 #include <linux/ras.h>
+#include <linux/task_work.h>
 
 #include <acpi/actbl1.h>
 #include <acpi/ghes.h>
@@ -136,6 +137,26 @@  static atomic_t ghes_estatus_cache_alloced;
 
 static int ghes_panic_timeout __read_mostly = 30;
 
+static bool ghes_is_synchronous(struct ghes *ghes)
+{
+	switch (ghes->generic->notify.type) {
+	case ACPI_HEST_NOTIFY_NMI:	/* fall through */
+	case ACPI_HEST_NOTIFY_SEA:
+		/*
+		 * These notifications could be repeated if the interrupted
+		 * instruction is run again. e.g. a read of bad-memory causing
+		 * a trap to platform firmware.
+		 */
+		return true;
+	default:
+		/*
+		 * Other notifications are asynchronous, and not related to the
+		 * interrupted instruction. e.g. an IRQ.
+		 */
+		return false;
+	}
+}
+
 static void __iomem *ghes_map(u64 pfn, int fixmap_idx)
 {
 	phys_addr_t paddr;
@@ -379,14 +400,33 @@  static void ghes_clear_estatus(struct acpi_hest_generic_status *estatus,
 				      fixmap_idx);
 }
 
-static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int sev)
+struct ghes_memory_failure_work {
+	int cpu;
+	struct callback_head work;
+};
+
+static void ghes_kick_memory_failure(struct callback_head *head)
+{
+	struct ghes_memory_failure_work *callback;
+
+	callback = container_of(head, struct ghes_memory_failure_work, work);
+	memory_failure_queue_kick(callback->cpu);
+	kfree(callback);
+}
+
+static void ghes_handle_memory_failure(struct ghes *ghes,
+				       struct acpi_hest_generic_data *gdata,
+				       int sev)
 {
-#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
 	unsigned long pfn;
-	int flags = -1;
+	int flags = -1, ret;
+	struct ghes_memory_failure_work	*callback;
 	int sec_sev = ghes_severity(gdata->error_severity);
 	struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
+	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
+		return;
+
 	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
 		return;
 
@@ -407,7 +447,22 @@  static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int
 
 	if (flags != -1)
 		memory_failure_queue(pfn, flags);
-#endif
+
+	/*
+	 * If the notification indicates that it was the interrupted
+	 * instruction that caused the error, try to kick the
+	 * memory_failure() queue before returning to user-space.
+	 */
+	if (ghes_is_synchronous(ghes) && current->mm != &init_mm) {
+		callback = kzalloc(sizeof(*callback), GFP_ATOMIC);
+		if (!callback)
+			return;
+		callback->work.func = ghes_kick_memory_failure;
+		callback->cpu = smp_processor_id();
+		ret = task_work_add(current, &callback->work, true);
+		if (ret)
+			kfree(callback);
+	}
 }
 
 /*
@@ -480,7 +535,7 @@  static void ghes_do_proc(struct ghes *ghes,
 			ghes_edac_report_mem_error(sev, mem_err);
 
 			arch_apei_report_mem_error(sev, mem_err);
-			ghes_handle_memory_failure(gdata, sev);
+			ghes_handle_memory_failure(ghes, gdata, sev);
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
 			ghes_handle_aer(gdata);