Patchwork [driver-core,v8,2/9] driver core: Establish order of operations for device_add and device_del via bitflag

login
register
mail settings
Submitter Alexander Duyck
Date Dec. 5, 2018, 5:25 p.m.
Message ID <154403072403.11544.10419282512791659652.stgit@ahduyck-desk1.jf.intel.com>
Download mbox | patch
Permalink /patch/673371/
State New
Headers show

Comments

Alexander Duyck - Dec. 5, 2018, 5:25 p.m.
Add an additional bit flag to the device struct named "dead".

This additional flag provides a guarantee that when a device_del is
executed on a given interface an async worker will not attempt to attach
the driver following the earlier device_del call. Previously this
guarantee was not present and could result in the device_del call
attempting to remove a driver from an interface only to have the async
worker attempt to probe the driver later when it finally completes the
asynchronous probe call.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---
 drivers/base/core.c    |   11 +++++++++++
 drivers/base/dd.c      |    8 ++++++--
 include/linux/device.h |    5 +++++
 3 files changed, 22 insertions(+), 2 deletions(-)
Dan Williams - Dec. 10, 2018, 6:58 p.m.
On Wed, Dec 5, 2018 at 9:25 AM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> Add an additional bit flag to the device struct named "dead".
>
> This additional flag provides a guarantee that when a device_del is
> executed on a given interface an async worker will not attempt to attach
> the driver following the earlier device_del call. Previously this
> guarantee was not present and could result in the device_del call
> attempting to remove a driver from an interface only to have the async
> worker attempt to probe the driver later when it finally completes the
> asynchronous probe call.
>
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> ---
>  drivers/base/core.c    |   11 +++++++++++
>  drivers/base/dd.c      |    8 ++++++--
>  include/linux/device.h |    5 +++++
>  3 files changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index f3e6ca4170b4..70358327303b 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -2075,6 +2075,17 @@ void device_del(struct device *dev)
>         struct kobject *glue_dir = NULL;
>         struct class_interface *class_intf;
>
> +       /*
> +        * Hold the device lock and set the "dead" flag to guarantee that
> +        * the update behavior is consistent with the other bitfields near
> +        * it and that we cannot have an asynchronous probe routine trying
> +        * to run while we are tearing out the bus/class/sysfs from
> +        * underneath the device.
> +        */
> +       device_lock(dev);
> +       dev->dead = true;
> +       device_unlock(dev);
> +
>         /* Notify clients of device removal.  This call must come
>          * before dpm_sysfs_remove().
>          */
> diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> index 88713f182086..3bb8c3e0f3da 100644
> --- a/drivers/base/dd.c
> +++ b/drivers/base/dd.c
> @@ -774,6 +774,10 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
>
>         device_lock(dev);
>
> +       /* device is or has been removed from the bus, just bail out */
> +       if (dev->dead)
> +               goto out_unlock;
> +

What do you think about moving this check into
__device_attach_driver() alongside all the other checks? That way we
also get ->dead checking through the __device_attach() path.

...and after that maybe it could be made a common helper
(dev_driver_checks()?) shared between __device_attach_driver() and
__driver_attach() to reduce some duplication.
Alexander Duyck - Dec. 10, 2018, 7:35 p.m.
On Mon, 2018-12-10 at 10:58 -0800, Dan Williams wrote:
> On Wed, Dec 5, 2018 at 9:25 AM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
> > 
> > Add an additional bit flag to the device struct named "dead".
> > 
> > This additional flag provides a guarantee that when a device_del is
> > executed on a given interface an async worker will not attempt to attach
> > the driver following the earlier device_del call. Previously this
> > guarantee was not present and could result in the device_del call
> > attempting to remove a driver from an interface only to have the async
> > worker attempt to probe the driver later when it finally completes the
> > asynchronous probe call.
> > 
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > ---
> >  drivers/base/core.c    |   11 +++++++++++
> >  drivers/base/dd.c      |    8 ++++++--
> >  include/linux/device.h |    5 +++++
> >  3 files changed, 22 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > index f3e6ca4170b4..70358327303b 100644
> > --- a/drivers/base/core.c
> > +++ b/drivers/base/core.c
> > @@ -2075,6 +2075,17 @@ void device_del(struct device *dev)
> >         struct kobject *glue_dir = NULL;
> >         struct class_interface *class_intf;
> > 
> > +       /*
> > +        * Hold the device lock and set the "dead" flag to guarantee that
> > +        * the update behavior is consistent with the other bitfields near
> > +        * it and that we cannot have an asynchronous probe routine trying
> > +        * to run while we are tearing out the bus/class/sysfs from
> > +        * underneath the device.
> > +        */
> > +       device_lock(dev);
> > +       dev->dead = true;
> > +       device_unlock(dev);
> > +
> >         /* Notify clients of device removal.  This call must come
> >          * before dpm_sysfs_remove().
> >          */
> > diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> > index 88713f182086..3bb8c3e0f3da 100644
> > --- a/drivers/base/dd.c
> > +++ b/drivers/base/dd.c
> > @@ -774,6 +774,10 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
> > 
> >         device_lock(dev);
> > 
> > +       /* device is or has been removed from the bus, just bail out */
> > +       if (dev->dead)
> > +               goto out_unlock;
> > +
> 
> What do you think about moving this check into
> __device_attach_driver() alongside all the other checks? That way we
> also get ->dead checking through the __device_attach() path.

I'm not really sure that is the best spot to do that. Part of the
reason being that by placing it where I did we avoid messing with the
runtime power management for the parent if it was already powered off.

If anything I would say we could probably look at pulling the check out
and placing the driver check in __device_attach_async_helper since from
what I can tell the check is actually redundant in the non-async path
anyway since __device_attach already had taken the device lock and
checked dev->driver prior to calling __device_attach_driver.

> ...and after that maybe it could be made a common helper
> (dev_driver_checks()?) shared between __device_attach_driver() and
> __driver_attach() to reduce some duplication.

I'm not sure consolidating it into a function would really be worth the
extra effort. It would essentially just obfuscate the checks and I am
not sure you really save much with:
	if (dev_driver_checks(dev))
vs:
	if (!dev->dead && !dev->driver)

By the time you create the function and replace the few spots that are
making these checks you would end up most likely adding more complexity
to the kernel rather than reducing it any.
Dan Williams - Dec. 10, 2018, 7:43 p.m.
On Mon, Dec 10, 2018 at 11:35 AM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> On Mon, 2018-12-10 at 10:58 -0800, Dan Williams wrote:
> > On Wed, Dec 5, 2018 at 9:25 AM Alexander Duyck
> > <alexander.h.duyck@linux.intel.com> wrote:
> > >
> > > Add an additional bit flag to the device struct named "dead".
> > >
> > > This additional flag provides a guarantee that when a device_del is
> > > executed on a given interface an async worker will not attempt to attach
> > > the driver following the earlier device_del call. Previously this
> > > guarantee was not present and could result in the device_del call
> > > attempting to remove a driver from an interface only to have the async
> > > worker attempt to probe the driver later when it finally completes the
> > > asynchronous probe call.
> > >
> > > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > ---
> > >  drivers/base/core.c    |   11 +++++++++++
> > >  drivers/base/dd.c      |    8 ++++++--
> > >  include/linux/device.h |    5 +++++
> > >  3 files changed, 22 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > > index f3e6ca4170b4..70358327303b 100644
> > > --- a/drivers/base/core.c
> > > +++ b/drivers/base/core.c
> > > @@ -2075,6 +2075,17 @@ void device_del(struct device *dev)
> > >         struct kobject *glue_dir = NULL;
> > >         struct class_interface *class_intf;
> > >
> > > +       /*
> > > +        * Hold the device lock and set the "dead" flag to guarantee that
> > > +        * the update behavior is consistent with the other bitfields near
> > > +        * it and that we cannot have an asynchronous probe routine trying
> > > +        * to run while we are tearing out the bus/class/sysfs from
> > > +        * underneath the device.
> > > +        */
> > > +       device_lock(dev);
> > > +       dev->dead = true;
> > > +       device_unlock(dev);
> > > +
> > >         /* Notify clients of device removal.  This call must come
> > >          * before dpm_sysfs_remove().
> > >          */
> > > diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> > > index 88713f182086..3bb8c3e0f3da 100644
> > > --- a/drivers/base/dd.c
> > > +++ b/drivers/base/dd.c
> > > @@ -774,6 +774,10 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
> > >
> > >         device_lock(dev);
> > >
> > > +       /* device is or has been removed from the bus, just bail out */
> > > +       if (dev->dead)
> > > +               goto out_unlock;
> > > +
> >
> > What do you think about moving this check into
> > __device_attach_driver() alongside all the other checks? That way we
> > also get ->dead checking through the __device_attach() path.
>
> I'm not really sure that is the best spot to do that. Part of the
> reason being that by placing it where I did we avoid messing with the
> runtime power management for the parent if it was already powered off.

...but this already a rare event and the parent shouldn't otherwise be
bothered by a spurious pm_runtime wakeup event.

> If anything I would say we could probably look at pulling the check out
> and placing the driver check in __device_attach_async_helper since from
> what I can tell the check is actually redundant in the non-async path
> anyway since __device_attach already had taken the device lock and
> checked dev->driver prior to calling __device_attach_driver.
>
> > ...and after that maybe it could be made a common helper
> > (dev_driver_checks()?) shared between __device_attach_driver() and
> > __driver_attach() to reduce some duplication.
>
> I'm not sure consolidating it into a function would really be worth the
> extra effort. It would essentially just obfuscate the checks and I am
> not sure you really save much with:
>         if (dev_driver_checks(dev))
> vs:
>         if (!dev->dead && !dev->driver)
>
> By the time you create the function and replace the few spots that are
> making these checks you would end up most likely adding more complexity
> to the kernel rather than reducing it any.

No, I was talking about removing this duplication in
__device_attach_driver() and __driver_attach():

        if (ret == 0) {
                /* no match */
                return 0;
        } else if (ret == -EPROBE_DEFER) {
                dev_dbg(dev, "Device match requests probe deferral\n");
                driver_deferred_probe_add(dev);
        } else if (ret < 0) {
                dev_dbg(dev, "Bus failed to match device: %d", ret);
                return ret;
        } /* ret > 0 means positive match */

...and lead in with a dev->dead check.
Alexander Duyck - Dec. 10, 2018, 8:57 p.m.
On Mon, 2018-12-10 at 11:43 -0800, Dan Williams wrote:
> On Mon, Dec 10, 2018 at 11:35 AM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
> > 
> > On Mon, 2018-12-10 at 10:58 -0800, Dan Williams wrote:
> > > On Wed, Dec 5, 2018 at 9:25 AM Alexander Duyck
> > > <alexander.h.duyck@linux.intel.com> wrote:
> > > > 
> > > > Add an additional bit flag to the device struct named "dead".
> > > > 
> > > > This additional flag provides a guarantee that when a device_del is
> > > > executed on a given interface an async worker will not attempt to attach
> > > > the driver following the earlier device_del call. Previously this
> > > > guarantee was not present and could result in the device_del call
> > > > attempting to remove a driver from an interface only to have the async
> > > > worker attempt to probe the driver later when it finally completes the
> > > > asynchronous probe call.
> > > > 
> > > > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > ---
> > > >  drivers/base/core.c    |   11 +++++++++++
> > > >  drivers/base/dd.c      |    8 ++++++--
> > > >  include/linux/device.h |    5 +++++
> > > >  3 files changed, 22 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > > > index f3e6ca4170b4..70358327303b 100644
> > > > --- a/drivers/base/core.c
> > > > +++ b/drivers/base/core.c
> > > > @@ -2075,6 +2075,17 @@ void device_del(struct device *dev)
> > > >         struct kobject *glue_dir = NULL;
> > > >         struct class_interface *class_intf;
> > > > 
> > > > +       /*
> > > > +        * Hold the device lock and set the "dead" flag to guarantee that
> > > > +        * the update behavior is consistent with the other bitfields near
> > > > +        * it and that we cannot have an asynchronous probe routine trying
> > > > +        * to run while we are tearing out the bus/class/sysfs from
> > > > +        * underneath the device.
> > > > +        */
> > > > +       device_lock(dev);
> > > > +       dev->dead = true;
> > > > +       device_unlock(dev);
> > > > +
> > > >         /* Notify clients of device removal.  This call must come
> > > >          * before dpm_sysfs_remove().
> > > >          */
> > > > diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> > > > index 88713f182086..3bb8c3e0f3da 100644
> > > > --- a/drivers/base/dd.c
> > > > +++ b/drivers/base/dd.c
> > > > @@ -774,6 +774,10 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
> > > > 
> > > >         device_lock(dev);
> > > > 
> > > > +       /* device is or has been removed from the bus, just bail out */
> > > > +       if (dev->dead)
> > > > +               goto out_unlock;
> > > > +
> > > 
> > > What do you think about moving this check into
> > > __device_attach_driver() alongside all the other checks? That way we
> > > also get ->dead checking through the __device_attach() path.
> > 
> > I'm not really sure that is the best spot to do that. Part of the
> > reason being that by placing it where I did we avoid messing with the
> > runtime power management for the parent if it was already powered off.
> 
> ...but this already a rare event and the parent shouldn't otherwise be
> bothered by a spurious pm_runtime wakeup event.
> 
> > If anything I would say we could probably look at pulling the check out
> > and placing the driver check in __device_attach_async_helper since from
> > what I can tell the check is actually redundant in the non-async path
> > anyway since __device_attach already had taken the device lock and
> > checked dev->driver prior to calling __device_attach_driver.
> > 
> > > ...and after that maybe it could be made a common helper
> > > (dev_driver_checks()?) shared between __device_attach_driver() and
> > > __driver_attach() to reduce some duplication.
> > 
> > I'm not sure consolidating it into a function would really be worth the
> > extra effort. It would essentially just obfuscate the checks and I am
> > not sure you really save much with:
> >         if (dev_driver_checks(dev))
> > vs:
> >         if (!dev->dead && !dev->driver)
> > 
> > By the time you create the function and replace the few spots that are
> > making these checks you would end up most likely adding more complexity
> > to the kernel rather than reducing it any.
> 
> No, I was talking about removing this duplication in
> __device_attach_driver() and __driver_attach():
> 
>         if (ret == 0) {
>                 /* no match */
>                 return 0;
>         } else if (ret == -EPROBE_DEFER) {
>                 dev_dbg(dev, "Device match requests probe deferral\n");
>                 driver_deferred_probe_add(dev);

Is this bit of code correct? Seems like there should be a return here
doesn't it?

I just double checked and this is what is in the kernel too.

>         } else if (ret < 0) {
>                 dev_dbg(dev, "Bus failed to match device: %d", ret);
>                 return ret;
>         } /* ret > 0 means positive match */
> 
> ...and lead in with a dev->dead check.

I would think that we would want to check for dev->dead before we even
call driver_match_device. That way we don't have the match function
crawling around a device that is being disassembled. Is that what you
were referring to?

Also the context for the two functions seems to be a bit different. In
the case of __device_attach_driver the device_lock is already held. In
__driver_attach the lock on the device isn't taken until after a match
has been found.
Dan Williams - Dec. 10, 2018, 9:15 p.m.
On Mon, Dec 10, 2018 at 12:58 PM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> On Mon, 2018-12-10 at 11:43 -0800, Dan Williams wrote:
> > On Mon, Dec 10, 2018 at 11:35 AM Alexander Duyck
> > <alexander.h.duyck@linux.intel.com> wrote:
> > >
> > > On Mon, 2018-12-10 at 10:58 -0800, Dan Williams wrote:
> > > > On Wed, Dec 5, 2018 at 9:25 AM Alexander Duyck
> > > > <alexander.h.duyck@linux.intel.com> wrote:
> > > > >
> > > > > Add an additional bit flag to the device struct named "dead".
> > > > >
> > > > > This additional flag provides a guarantee that when a device_del is
> > > > > executed on a given interface an async worker will not attempt to attach
> > > > > the driver following the earlier device_del call. Previously this
> > > > > guarantee was not present and could result in the device_del call
> > > > > attempting to remove a driver from an interface only to have the async
> > > > > worker attempt to probe the driver later when it finally completes the
> > > > > asynchronous probe call.
> > > > >
> > > > > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> > > > > ---
> > > > >  drivers/base/core.c    |   11 +++++++++++
> > > > >  drivers/base/dd.c      |    8 ++++++--
> > > > >  include/linux/device.h |    5 +++++
> > > > >  3 files changed, 22 insertions(+), 2 deletions(-)
> > > > >
> > > > > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > > > > index f3e6ca4170b4..70358327303b 100644
> > > > > --- a/drivers/base/core.c
> > > > > +++ b/drivers/base/core.c
> > > > > @@ -2075,6 +2075,17 @@ void device_del(struct device *dev)
> > > > >         struct kobject *glue_dir = NULL;
> > > > >         struct class_interface *class_intf;
> > > > >
> > > > > +       /*
> > > > > +        * Hold the device lock and set the "dead" flag to guarantee that
> > > > > +        * the update behavior is consistent with the other bitfields near
> > > > > +        * it and that we cannot have an asynchronous probe routine trying
> > > > > +        * to run while we are tearing out the bus/class/sysfs from
> > > > > +        * underneath the device.
> > > > > +        */
> > > > > +       device_lock(dev);
> > > > > +       dev->dead = true;
> > > > > +       device_unlock(dev);
> > > > > +
> > > > >         /* Notify clients of device removal.  This call must come
> > > > >          * before dpm_sysfs_remove().
> > > > >          */
> > > > > diff --git a/drivers/base/dd.c b/drivers/base/dd.c
> > > > > index 88713f182086..3bb8c3e0f3da 100644
> > > > > --- a/drivers/base/dd.c
> > > > > +++ b/drivers/base/dd.c
> > > > > @@ -774,6 +774,10 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
> > > > >
> > > > >         device_lock(dev);
> > > > >
> > > > > +       /* device is or has been removed from the bus, just bail out */
> > > > > +       if (dev->dead)
> > > > > +               goto out_unlock;
> > > > > +
> > > >
> > > > What do you think about moving this check into
> > > > __device_attach_driver() alongside all the other checks? That way we
> > > > also get ->dead checking through the __device_attach() path.
> > >
> > > I'm not really sure that is the best spot to do that. Part of the
> > > reason being that by placing it where I did we avoid messing with the
> > > runtime power management for the parent if it was already powered off.
> >
> > ...but this already a rare event and the parent shouldn't otherwise be
> > bothered by a spurious pm_runtime wakeup event.
> >
> > > If anything I would say we could probably look at pulling the check out
> > > and placing the driver check in __device_attach_async_helper since from
> > > what I can tell the check is actually redundant in the non-async path
> > > anyway since __device_attach already had taken the device lock and
> > > checked dev->driver prior to calling __device_attach_driver.
> > >
> > > > ...and after that maybe it could be made a common helper
> > > > (dev_driver_checks()?) shared between __device_attach_driver() and
> > > > __driver_attach() to reduce some duplication.
> > >
> > > I'm not sure consolidating it into a function would really be worth the
> > > extra effort. It would essentially just obfuscate the checks and I am
> > > not sure you really save much with:
> > >         if (dev_driver_checks(dev))
> > > vs:
> > >         if (!dev->dead && !dev->driver)
> > >
> > > By the time you create the function and replace the few spots that are
> > > making these checks you would end up most likely adding more complexity
> > > to the kernel rather than reducing it any.
> >
> > No, I was talking about removing this duplication in
> > __device_attach_driver() and __driver_attach():
> >
> >         if (ret == 0) {
> >                 /* no match */
> >                 return 0;
> >         } else if (ret == -EPROBE_DEFER) {
> >                 dev_dbg(dev, "Device match requests probe deferral\n");
> >                 driver_deferred_probe_add(dev);
>
> Is this bit of code correct? Seems like there should be a return here
> doesn't it?

It does look odd, but I think it's ok as the driver is expected to
have its probe routine called multiple times and return -EPROBE_DEFER
if it's not ready yet.

> I just double checked and this is what is in the kernel too.

Yeah, I just copy-pasted it, but it might deserve a comment that the
fallthrough / no return is on purpose.

> >         } else if (ret < 0) {
> >                 dev_dbg(dev, "Bus failed to match device: %d", ret);
> >                 return ret;
> >         } /* ret > 0 means positive match */
> >
> > ...and lead in with a dev->dead check.
>
> I would think that we would want to check for dev->dead before we even
> call driver_match_device. That way we don't have the match function
> crawling around a device that is being disassembled. Is that what you
> were referring to?

I wasn't too concerned about optimizing the case where the probe path
loses the race with device_del().

> Also the context for the two functions seems to be a bit different. In
> the case of __device_attach_driver the device_lock is already held. In
> __driver_attach the lock on the device isn't taken until after a match
> has been found.

Yes, I was only pattern matching when looking at the context of where
dev->dead is checked in __driver_attach() and wondering why it was
checked outside of __device_attach_driver()
Dan Williams - Dec. 10, 2018, 9:23 p.m.
On Mon, Dec 10, 2018 at 1:15 PM Dan Williams <dan.j.williams@intel.com> wrote:
>
> On Mon, Dec 10, 2018 at 12:58 PM Alexander Duyck
> <alexander.h.duyck@linux.intel.com> wrote:
[..]
> > Also the context for the two functions seems to be a bit different. In
> > the case of __device_attach_driver the device_lock is already held. In
> > __driver_attach the lock on the device isn't taken until after a match
> > has been found.
>
> Yes, I was only pattern matching when looking at the context of where
> dev->dead is checked in __driver_attach() and wondering why it was
> checked outside of __device_attach_driver()

...and now I realize the bigger point of your concern, we need to
check dev->dead after acquiring the device_lock otherwise the race is
back. We can defer that consolidation, but the larger concern of
making it internal to __device_attach_driver() still stands.
Alexander Duyck - Dec. 10, 2018, 10:24 p.m.
On Mon, 2018-12-10 at 13:23 -0800, Dan Williams wrote:
> On Mon, Dec 10, 2018 at 1:15 PM Dan Williams <dan.j.williams@intel.com> wrote:
> > 
> > On Mon, Dec 10, 2018 at 12:58 PM Alexander Duyck
> > <alexander.h.duyck@linux.intel.com> wrote:
> 
> [..]
> > > Also the context for the two functions seems to be a bit different. In
> > > the case of __device_attach_driver the device_lock is already held. In
> > > __driver_attach the lock on the device isn't taken until after a match
> > > has been found.
> > 
> > Yes, I was only pattern matching when looking at the context of where
> > dev->dead is checked in __driver_attach() and wondering why it was
> > checked outside of __device_attach_driver()
> 
> ...and now I realize the bigger point of your concern, we need to
> check dev->dead after acquiring the device_lock otherwise the race is
> back. We can defer that consolidation, but the larger concern of
> making it internal to __device_attach_driver() still stands.

I'm still not a fan of moving it into __device_attach_driver. I would
much rather pull out the dev->driver check and instead place that in
__device_attach_async_helper.

The __device_attach function as I said took the device_lock and had
already checked dev->driver. So in the non-async path it shouldn't be
possible for dev->driver to ever be set anyway. In addition
__device_attach_driver is called once for each driver on a given bus,
so dropping the test should reduce driver load time since it is one
less test that has to be performed per driver.
Dan Williams - Dec. 10, 2018, 10:41 p.m.
On Mon, Dec 10, 2018 at 2:24 PM Alexander Duyck
<alexander.h.duyck@linux.intel.com> wrote:
>
> On Mon, 2018-12-10 at 13:23 -0800, Dan Williams wrote:
> > On Mon, Dec 10, 2018 at 1:15 PM Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > On Mon, Dec 10, 2018 at 12:58 PM Alexander Duyck
> > > <alexander.h.duyck@linux.intel.com> wrote:
> >
> > [..]
> > > > Also the context for the two functions seems to be a bit different. In
> > > > the case of __device_attach_driver the device_lock is already held. In
> > > > __driver_attach the lock on the device isn't taken until after a match
> > > > has been found.
> > >
> > > Yes, I was only pattern matching when looking at the context of where
> > > dev->dead is checked in __driver_attach() and wondering why it was
> > > checked outside of __device_attach_driver()
> >
> > ...and now I realize the bigger point of your concern, we need to
> > check dev->dead after acquiring the device_lock otherwise the race is
> > back. We can defer that consolidation, but the larger concern of
> > making it internal to __device_attach_driver() still stands.
>
> I'm still not a fan of moving it into __device_attach_driver. I would
> much rather pull out the dev->driver check and instead place that in
> __device_attach_async_helper.
>
> The __device_attach function as I said took the device_lock and had
> already checked dev->driver. So in the non-async path it shouldn't be
> possible for dev->driver to ever be set anyway.

True.

> In addition
> __device_attach_driver is called once for each driver on a given bus,
> so dropping the test should reduce driver load time since it is one
> less test that has to be performed per driver.

Ok. You can add my Reviewed-by.

Patch

diff --git a/drivers/base/core.c b/drivers/base/core.c
index f3e6ca4170b4..70358327303b 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2075,6 +2075,17 @@  void device_del(struct device *dev)
 	struct kobject *glue_dir = NULL;
 	struct class_interface *class_intf;
 
+	/*
+	 * Hold the device lock and set the "dead" flag to guarantee that
+	 * the update behavior is consistent with the other bitfields near
+	 * it and that we cannot have an asynchronous probe routine trying
+	 * to run while we are tearing out the bus/class/sysfs from
+	 * underneath the device.
+	 */
+	device_lock(dev);
+	dev->dead = true;
+	device_unlock(dev);
+
 	/* Notify clients of device removal.  This call must come
 	 * before dpm_sysfs_remove().
 	 */
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index 88713f182086..3bb8c3e0f3da 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -774,6 +774,10 @@  static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
 
 	device_lock(dev);
 
+	/* device is or has been removed from the bus, just bail out */
+	if (dev->dead)
+		goto out_unlock;
+
 	if (dev->parent)
 		pm_runtime_get_sync(dev->parent);
 
@@ -784,7 +788,7 @@  static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
 
 	if (dev->parent)
 		pm_runtime_put(dev->parent);
-
+out_unlock:
 	device_unlock(dev);
 
 	put_device(dev);
@@ -897,7 +901,7 @@  static int __driver_attach(struct device *dev, void *data)
 	if (dev->parent && dev->bus->need_parent_lock)
 		device_lock(dev->parent);
 	device_lock(dev);
-	if (!dev->driver)
+	if (!dev->dead && !dev->driver)
 		driver_probe_device(drv, dev);
 	device_unlock(dev);
 	if (dev->parent && dev->bus->need_parent_lock)
diff --git a/include/linux/device.h b/include/linux/device.h
index 4921a6192f6b..393704e5b602 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -957,6 +957,10 @@  struct dev_links_info {
  *              device.
  * @dma_coherent: this particular device is dma coherent, even if the
  *		architecture supports non-coherent devices.
+ * @dead:	This device is currently either in the process of or has
+ *		been removed from the system. Any asynchronous events
+ *		scheduled for this device should exit without taking any
+ *		action.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -1051,6 +1055,7 @@  struct device {
     defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
 	bool			dma_coherent:1;
 #endif
+	bool			dead:1;
 };
 
 static inline struct device *kobj_to_dev(struct kobject *kobj)