[PATCH -v2 0/6] spin_unlock_wait borkage

[PATCH -v2 0/6] spin_unlock_wait borkage

Peter Zijlstra-5
This version rewrites all spin_unlock_wait() implementations to provide
acquire ordering against the spin_unlock() release we wait on, ensuring we
fully observe the critical section we waited on.

It pulls in the smp_cond_acquire() rewrite because this series introduces a
lot more users of it; all simple spin_unlock_wait() implementations end up
being an smp_cond_load_acquire().

And while this still introduces smp_acquire__after_ctrl_dep(), it keeps its
usage contained to a few sites.
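
For reference, the ordering guarantee the series establishes is roughly the
following (an illustrative sketch, not part of any patch):

    /*
     * CPU 0                           CPU 1
     *
     * spin_lock(&lock);
     * WRITE_ONCE(X, 1);               spin_unlock_wait(&lock);
     * spin_unlock(&lock);             r = READ_ONCE(X);
     *
     * If CPU 1's spin_unlock_wait() observed the critical section (saw the
     * lock held and waited for its release), the ACQUIRE it now provides
     * against CPU 0's RELEASE guarantees r == 1.
     */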

Please consider.. carefully.

[PATCH -v2 5/6] locking: Update spin_unlock_wait users

Peter Zijlstra-5
With the modified semantics of spin_unlock_wait(), a number of explicit
barriers can be removed. Also update the comment for the do_exit() use case,
as it was somewhat stale/obscure.
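
Concretely, these removals drop a barrier that is now implied by the wait
itself; a minimal before/after sketch of the ipc/sem.c site below:

    /* before: spin_unlock_wait() provided only a control dependency */
    spin_unlock_wait(&sem->lock);
    smp_acquire__after_ctrl_dep();

    /* after: spin_unlock_wait() itself provides ACQUIRE ordering */
    spin_unlock_wait(&sem->lock);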

Signed-off-by: Peter Zijlstra (Intel) <[hidden email]>
---
 ipc/sem.c          |    1 -
 kernel/exit.c      |    8 ++++++--
 kernel/task_work.c |    1 -
 3 files changed, 6 insertions(+), 4 deletions(-)

--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -282,7 +282,6 @@ static void sem_wait_array(struct sem_ar
  sem = sma->sem_base + i;
  spin_unlock_wait(&sem->lock);
  }
- smp_acquire__after_ctrl_dep();
 }
 
 /*
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -776,10 +776,14 @@ void do_exit(long code)
 
  exit_signals(tsk);  /* sets PF_EXITING */
  /*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
+ * Ensure that all new tsk->pi_lock acquisitions must observe
+ * PF_EXITING. Serializes against futex.c:attach_to_pi_owner().
  */
  smp_mb();
+ /*
+ * Ensure that we must observe the pi_state in exit_mm() ->
+ * mm_release() -> exit_pi_state_list().
+ */
  raw_spin_unlock_wait(&tsk->pi_lock);
 
  if (unlikely(in_atomic())) {
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -108,7 +108,6 @@ void task_work_run(void)
  * fail, but it can play with *work and other entries.
  */
  raw_spin_unlock_wait(&task->pi_lock);
- smp_mb();
 
  do {
  next = work->next;


[PATCH -v2 3/6] locking: Introduce smp_acquire__after_ctrl_dep

Peter Zijlstra-5
Introduce smp_acquire__after_ctrl_dep(); this construct is not uncommon,
but neither is the lack of this barrier.
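
The open-coded pattern it names looks roughly like this (an illustrative
sketch; ptr and the tested condition are placeholders):

    /* Spin until *ptr becomes non-zero; the branch gives a control
     * dependency, which orders this load against later stores only. */
    while (!READ_ONCE(*ptr))
        cpu_relax();

    /* Upgrade LOAD->STORE (control dependency) to LOAD->{LOAD,STORE},
     * i.e. a load-ACQUIRE. */
    smp_acquire__after_ctrl_dep();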

Signed-off-by: Peter Zijlstra (Intel) <[hidden email]>
---
 include/linux/compiler.h |   15 ++++++++++-----
 ipc/sem.c                |   14 ++------------
 2 files changed, 12 insertions(+), 17 deletions(-)

--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,6 +305,15 @@ static __always_inline void __write_once
 })
 
 /**
+ * smp_acquire__after_ctrl_dep() - Provide ACQUIRE ordering after a control dependency
+ *
+ * A control dependency provides a LOAD->STORE order, the additional RMB
+ * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
+ * aka. (load)-ACQUIRE.
+ */
+#define smp_acquire__after_ctrl_dep() smp_rmb()
+
+/**
  * cmpwait - compare and wait for a variable to change
  * @ptr: pointer to the variable to wait on
  * @val: the value it should change from
@@ -331,10 +340,6 @@ static __always_inline void __write_once
  *
  * Due to C lacking lambda expressions we load the value of *ptr into a
  * pre-named variable @VAL to be used in @cond.
- *
- * The control dependency provides a LOAD->STORE order, the additional RMB
- * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
- * aka. ACQUIRE.
  */
 #ifndef smp_cond_load_acquire
 #define smp_cond_load_acquire(ptr, cond_expr) ({ \
@@ -346,7 +351,7 @@ static __always_inline void __write_once
  break; \
  cmpwait(__PTR, VAL); \
  } \
- smp_rmb(); /* ctrl + rmb := acquire */ \
+ smp_acquire__after_ctrl_dep(); \
  VAL; \
 })
 #endif
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -260,16 +260,6 @@ static void sem_rcu_free(struct rcu_head
 }
 
 /*
- * spin_unlock_wait() and !spin_is_locked() are not memory barriers, they
- * are only control barriers.
- * The code must pair with spin_unlock(&sem->lock) or
- * spin_unlock(&sem_perm.lock), thus just the control barrier is insufficient.
- *
- * smp_rmb() is sufficient, as writes cannot pass the control barrier.
- */
-#define ipc_smp_acquire__after_spin_is_unlocked() smp_rmb()
-
-/*
  * Wait until all currently ongoing simple ops have completed.
  * Caller must own sem_perm.lock.
  * New simple ops cannot start, because simple ops first check
@@ -292,7 +282,7 @@ static void sem_wait_array(struct sem_ar
  sem = sma->sem_base + i;
  spin_unlock_wait(&sem->lock);
  }
- ipc_smp_acquire__after_spin_is_unlocked();
+ smp_acquire__after_ctrl_dep();
 }
 
 /*
@@ -350,7 +340,7 @@ static inline int sem_lock(struct sem_ar
  * complex_count++;
  * spin_unlock(sem_perm.lock);
  */
- ipc_smp_acquire__after_spin_is_unlocked();
+ smp_acquire__after_ctrl_dep();
 
  /*
  * Now repeat the test of complex_count:


[PATCH -v2 2/6] locking: Introduce cmpwait()

Peter Zijlstra-5
Provide the cmpwait() primitive, which 'spin' waits for a variable to
change, and use it to implement smp_cond_load_acquire().

This primitive can be implemented with hardware assist on some
platforms (ARM64, x86).
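
Typical usage is a load/test/wait loop of roughly this shape (an
illustrative sketch; cond() stands for whatever condition the caller is
waiting on):

    int val = READ_ONCE(*ptr);

    while (!cond(val)) {
        cmpwait(ptr, val);      /* wait for *ptr to change from val */
        val = READ_ONCE(*ptr);
    }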

Suggested-by: Will Deacon <[hidden email]>
Signed-off-by: Peter Zijlstra (Intel) <[hidden email]>
---
 include/linux/compiler.h |   19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,6 +305,23 @@ static __always_inline void __write_once
 })
 
 /**
+ * cmpwait - compare and wait for a variable to change
+ * @ptr: pointer to the variable to wait on
+ * @val: the value it should change from
+ *
+ * A simple construct that waits for a variable to change from a known
+ * value; some architectures can do this in hardware.
+ */
+#ifndef cmpwait
+#define cmpwait(ptr, val) do { \
+ typeof (ptr) __ptr = (ptr); \
+ typeof (val) __val = (val); \
+ while (READ_ONCE(*__ptr) == __val) \
+ cpu_relax(); \
+} while (0)
+#endif
+
+/**
  * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
  * @ptr: pointer to the variable to wait on
  * @cond: boolean expression to wait for
@@ -327,7 +344,7 @@ static __always_inline void __write_once
  VAL = READ_ONCE(*__PTR); \
  if (cond_expr) \
  break; \
- cpu_relax(); \
+ cmpwait(__PTR, VAL); \
  } \
  smp_rmb(); /* ctrl + rmb := acquire */ \
  VAL; \


[PATCH -v2 1/6] locking: Replace smp_cond_acquire with smp_cond_load_acquire

Peter Zijlstra-5
This new form allows the use of hardware-assisted waiting.
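
The conversion is mechanical; for example, the try_to_wake_up() site in the
hunks below becomes:

    /* old: the condition references the variable directly */
    smp_cond_acquire(!p->on_cpu);

    /* new: *ptr is loaded into the pre-named variable VAL for use in the
     * condition expression, and the final value loaded is returned */
    smp_cond_load_acquire(&p->on_cpu, !VAL);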

Requested-by: Will Deacon <[hidden email]>
Suggested-by: Linus Torvalds <[hidden email]>
Signed-off-by: Peter Zijlstra (Intel) <[hidden email]>
---
 include/linux/compiler.h   |   25 +++++++++++++++++++------
 kernel/locking/qspinlock.c |   12 ++++++------
 kernel/sched/core.c        |    8 ++++----
 kernel/sched/sched.h       |    2 +-
 kernel/smp.c               |    2 +-
 5 files changed, 31 insertions(+), 18 deletions(-)

--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,21 +305,34 @@ static __always_inline void __write_once
 })
 
 /**
- * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * smp_cond_load_acquire() - (Spin) wait for cond with ACQUIRE ordering
+ * @ptr: pointer to the variable to wait on
  * @cond: boolean expression to wait for
  *
  * Equivalent to using smp_load_acquire() on the condition variable but employs
  * the control dependency of the wait to reduce the barrier on many platforms.
  *
+ * Due to C lacking lambda expressions we load the value of *ptr into a
+ * pre-named variable @VAL to be used in @cond.
+ *
  * The control dependency provides a LOAD->STORE order, the additional RMB
  * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
  * aka. ACQUIRE.
  */
-#define smp_cond_acquire(cond) do { \
- while (!(cond)) \
- cpu_relax(); \
- smp_rmb(); /* ctrl + rmb := acquire */ \
-} while (0)
+#ifndef smp_cond_load_acquire
+#define smp_cond_load_acquire(ptr, cond_expr) ({ \
+ typeof(ptr) __PTR = (ptr); \
+ typeof(*ptr) VAL; \
+ for (;;) { \
+ VAL = READ_ONCE(*__PTR); \
+ if (cond_expr) \
+ break; \
+ cpu_relax(); \
+ } \
+ smp_rmb(); /* ctrl + rmb := acquire */ \
+ VAL; \
+})
+#endif
 
 #endif /* __KERNEL__ */
 
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -358,7 +358,7 @@ void queued_spin_lock_slowpath(struct qs
  * sequentiality; this is because not all clear_pending_set_locked()
  * implementations imply full barriers.
  */
- smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
 
  /*
  * take ownership and clear the pending bit.
@@ -434,7 +434,7 @@ void queued_spin_lock_slowpath(struct qs
  *
  * The PV pv_wait_head_or_lock function, if active, will acquire
  * the lock and return a non-zero value. So we have to skip the
- * smp_cond_acquire() call. As the next PV queue head hasn't been
+ * smp_cond_load_acquire() call. As the next PV queue head hasn't been
  * designated yet, there is no way for the locked value to become
  * _Q_SLOW_VAL. So both the set_locked() and the
  * atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +445,7 @@ void queued_spin_lock_slowpath(struct qs
  if ((val = pv_wait_head_or_lock(lock, node)))
  goto locked;
 
- smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+ val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
 
 locked:
  /*
@@ -465,9 +465,9 @@ void queued_spin_lock_slowpath(struct qs
  break;
  }
  /*
- * The smp_cond_acquire() call above has provided the necessary
- * acquire semantics required for locking. At most two
- * iterations of this loop may be ran.
+ * The smp_cond_load_acquire() call above has provided the
+ * necessary acquire semantics required for locking. At most
+ * two iterations of this loop may be ran.
  */
  old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
  if (old == val)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1843,7 +1843,7 @@ static void ttwu_queue(struct task_struc
  * chain to provide order. Instead we do:
  *
  *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_acquire(!X->on_cpu)
+ *   2) smp_cond_load_acquire(!X->on_cpu)
  *
  * Example:
  *
@@ -1854,7 +1854,7 @@ static void ttwu_queue(struct task_struc
  *   sched-out X
  *   smp_store_release(X->on_cpu, 0);
  *
- *                    smp_cond_acquire(!X->on_cpu);
+ *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
  *                    X->state = WAKING
  *                    set_task_cpu(X,2)
  *
@@ -1880,7 +1880,7 @@ static void ttwu_queue(struct task_struc
  * This means that any means of doing remote wakeups must order the CPU doing
  * the wakeup against the CPU the task is going to end up running on. This,
  * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
  *
  */
 
@@ -1953,7 +1953,7 @@ try_to_wake_up(struct task_struct *p, un
  * This ensures that tasks getting woken will be fully ordered against
  * their previous state and preserve Program Order.
  */
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
 
  p->sched_contributes_to_load = !!task_contributes_to_load(p);
  p->state = TASK_WAKING;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1104,7 +1104,7 @@ static inline void finish_lock_switch(st
  * In particular, the load of prev->state in finish_task_switch() must
  * happen before this.
  *
- * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
  */
  smp_store_release(&prev->on_cpu, 0);
 #endif
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -107,7 +107,7 @@ void __init call_function_init(void)
  */
 static __always_inline void csd_lock_wait(struct call_single_data *csd)
 {
- smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+ smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
 }
 
 static __always_inline void csd_lock(struct call_single_data *csd)


[PATCH -v2 6/6] locking,netfilter: Fix nf_conntrack_lock()

Peter Zijlstra-5
nf_conntrack_lock{,_all}() is broken, as it misses a number of memory
barriers needed to order the whole global vs. local locks scheme.

Even x86 (and other TSO archs) are affected.
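
The scheme being ordered is roughly the following (an illustrative summary
of the hunks below, with the nf_conntrack_ name prefixes abbreviated):

    /*
     * nf_conntrack_all_lock():
     *     spin_lock(&locks_all_lock);
     *     locks_all = true;
     *     smp_mb();         (orders the store vs the waits below)
     *     for each i: spin_unlock_wait(&locks[i]);
     *
     * nf_conntrack_lock():
     *     spin_lock(lock);
     *     while (locks_all) {
     *         spin_unlock(lock);
     *         smp_rmb();    (orders the locks_all load vs the wait below)
     *         spin_unlock_wait(&locks_all_lock);
     *         spin_lock(lock);
     *     }
     */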

Signed-off-by: Peter Zijlstra (Intel) <[hidden email]>
---
 net/netfilter/nf_conntrack_core.c |   18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -83,6 +83,12 @@ void nf_conntrack_lock(spinlock_t *lock)
  spin_lock(lock);
  while (unlikely(nf_conntrack_locks_all)) {
  spin_unlock(lock);
+
+ /* Order the nf_conntrack_locks_all load vs the
+ * spin_unlock_wait() loads below, to ensure that
+ * nf_conntrack_locks_all_lock is indeed held.
+ */
+ smp_rmb(); /* spin_lock(locks_all) */
  spin_unlock_wait(&nf_conntrack_locks_all_lock);
  spin_lock(lock);
  }
@@ -128,6 +134,12 @@ static void nf_conntrack_all_lock(void)
  spin_lock(&nf_conntrack_locks_all_lock);
  nf_conntrack_locks_all = true;
 
+ /* Order the above store against the spin_unlock_wait() loads
+ * below, such that if nf_conntrack_lock() observes locks_all
+ * we must observe lock[] held.
+ */
+ smp_mb(); /* spin_lock(locks_all) */
+
  for (i = 0; i < CONNTRACK_LOCKS; i++) {
  spin_unlock_wait(&nf_conntrack_locks[i]);
  }
@@ -135,7 +147,11 @@ static void nf_conntrack_all_lock(void)
 
 static void nf_conntrack_all_unlock(void)
 {
- nf_conntrack_locks_all = false;
+ /* All prior stores must be complete before we clear locks_all.
+ * Otherwise nf_conntrack_lock() might observe locks_all == false but
+ * not the entire critical section.
+ */
+ smp_store_release(&nf_conntrack_locks_all, false);
  spin_unlock(&nf_conntrack_locks_all_lock);
 }
 


[PATCH -v2 4/6] locking, arch: Update spin_unlock_wait()

Peter Zijlstra-5
This patch updates/fixes all spin_unlock_wait() implementations.

The update is in the semantics: where it previously provided only a control
dependency, we now upgrade to a full load-acquire to match the
store-release from the spin_unlock() we waited on. This ensures that when
spin_unlock_wait() returns, we're guaranteed to observe the full critical
section we waited on.

This fixes a number of spin_unlock_wait() users that (not
unreasonably) rely on this.

I also fixed a number of the ticket lock versions to only wait on the
current lock holder, instead of waiting for a full unlock, as this is
sufficient.

Furthermore, again for the ticket locks, I added an smp_rmb() between the
initial ticket load and the spin loop testing the current value, because I
could not convince myself the address dependency is sufficient, especially
if the loads are of different sizes.

I'm more than happy to remove this smp_rmb() again if people are
certain the address dependency does indeed work as expected.
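
The ticket-lock variants thus end up with the following shape (a sketch
with simplified field names; see the ARM and MIPS hunks below for the real
ones):

    static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
    {
        u16 owner = READ_ONCE(lock->owner);   /* snapshot current holder */

        smp_rmb();       /* see the note above on the two loads */
        for (;;) {
            arch_spinlock_t tmp = READ_ONCE(*lock);

            /* Done once unlocked, or once the holder we snapshotted has
             * moved on; we only wait for the current lock holder. */
            if (tmp.owner == tmp.next || tmp.owner != owner)
                break;

            cpu_relax();
        }
        smp_acquire__after_ctrl_dep();
    }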

Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Cc: [hidden email]
Signed-off-by: Peter Zijlstra (Intel) <[hidden email]>
---
 arch/alpha/include/asm/spinlock.h    |    7 +++++--
 arch/arc/include/asm/spinlock.h      |    7 +++++--
 arch/arm/include/asm/spinlock.h      |   18 ++++++++++++++++--
 arch/blackfin/include/asm/spinlock.h |    3 +--
 arch/hexagon/include/asm/spinlock.h  |    8 ++++++--
 arch/ia64/include/asm/spinlock.h     |    2 ++
 arch/m32r/include/asm/spinlock.h     |    7 +++++--
 arch/metag/include/asm/spinlock.h    |   11 +++++++++--
 arch/mips/include/asm/spinlock.h     |   18 ++++++++++++++++--
 arch/mn10300/include/asm/spinlock.h  |    6 +++++-
 arch/parisc/include/asm/spinlock.h   |    9 +++++++--
 arch/powerpc/include/asm/spinlock.h  |    6 ++++--
 arch/s390/include/asm/spinlock.h     |    3 +--
 arch/sh/include/asm/spinlock.h       |    7 +++++--
 arch/sparc/include/asm/spinlock_32.h |    6 ++++--
 arch/sparc/include/asm/spinlock_64.h |    9 ++++++---
 arch/tile/lib/spinlock_32.c          |    4 ++++
 arch/tile/lib/spinlock_64.c          |    4 ++++
 arch/xtensa/include/asm/spinlock.h   |    7 +++++--
 include/asm-generic/qspinlock.h      |    3 +--
 include/linux/spinlock_up.h          |    9 ++++++---
 21 files changed, 117 insertions(+), 37 deletions(-)

--- a/arch/alpha/include/asm/spinlock.h
+++ b/arch/alpha/include/asm/spinlock.h
@@ -13,8 +13,11 @@
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 #define arch_spin_is_locked(x) ((x)->lock != 0)
-#define arch_spin_unlock_wait(x) \
- do { cpu_relax(); } while ((x)->lock)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 static inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 {
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -15,8 +15,11 @@
 
 #define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
- do { while (arch_spin_is_locked(x)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->slock, !VAL);
+}
 
 #ifdef CONFIG_ARC_HAS_LLSC
 
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -50,8 +50,22 @@ static inline void dsb_sev(void)
  * memory.
  */
 
-#define arch_spin_unlock_wait(lock) \
- do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ u16 owner = READ_ONCE(lock->tickets.owner);
+
+ smp_rmb();
+ for (;;) {
+ arch_spinlock_t tmp = READ_ONCE(*lock);
+
+ if (tmp.tickets.owner == tmp.tickets.next ||
+    tmp.tickets.owner != owner)
+ break;
+
+ wfe();
+ }
+ smp_acquire__after_ctrl_dep();
+}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
--- a/arch/blackfin/include/asm/spinlock.h
+++ b/arch/blackfin/include/asm/spinlock.h
@@ -48,8 +48,7 @@ static inline void arch_spin_unlock(arch
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
- while (arch_spin_is_locked(lock))
- cpu_relax();
+ smp_cond_load_acquire(&lock->lock, !VAL);
 }
 
 static inline int arch_read_can_lock(arch_rwlock_t *rw)
--- a/arch/hexagon/include/asm/spinlock.h
+++ b/arch/hexagon/include/asm/spinlock.h
@@ -176,8 +176,12 @@ static inline unsigned int arch_spin_try
  * SMP spinlocks are intended to allow only a single CPU at the lock
  */
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(lock) \
- do {while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->lock, !VAL);
+}
+
 #define arch_spin_is_locked(x) ((x)->lock != 0)
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -86,6 +86,8 @@ static __always_inline void __ticket_spi
  return;
  cpu_relax();
  }
+
+ smp_acquire__after_ctrl_dep();
 }
 
 static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
--- a/arch/m32r/include/asm/spinlock.h
+++ b/arch/m32r/include/asm/spinlock.h
@@ -27,8 +27,11 @@
 
 #define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
- do { cpu_relax(); } while (arch_spin_is_locked(x))
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->slock, VAL > 0);
+}
 
 /**
  * arch_spin_trylock - Try spin lock and return a result
--- a/arch/metag/include/asm/spinlock.h
+++ b/arch/metag/include/asm/spinlock.h
@@ -7,8 +7,15 @@
 #include <asm/spinlock_lnkget.h>
 #endif
 
-#define arch_spin_unlock_wait(lock) \
- do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+/*
+ * both lock1 and lnkget are test-and-set spinlocks with 0 unlocked and 1
+ * locked.
+ */
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -48,8 +48,22 @@ static inline int arch_spin_value_unlock
 }
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
- while (arch_spin_is_locked(x)) { cpu_relax(); }
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ u16 owner = READ_ONCE(lock->h.serving_now);
+ smp_rmb();
+ for (;;) {
+ arch_spinlock_t tmp = READ_ONCE(*lock);
+
+ if (tmp.h.serving_now == tmp.h.ticket ||
+    tmp.h.serving_now != owner)
+ break;
+
+ cpu_relax();
+ }
+ smp_acquire__after_ctrl_dep();
+}
 
 static inline int arch_spin_is_contended(arch_spinlock_t *lock)
 {
--- a/arch/mn10300/include/asm/spinlock.h
+++ b/arch/mn10300/include/asm/spinlock.h
@@ -23,7 +23,11 @@
  */
 
 #define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) != 0)
-#define arch_spin_unlock_wait(x) do { barrier(); } while (arch_spin_is_locked(x))
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->slock, !VAL);
+}
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -13,8 +13,13 @@ static inline int arch_spin_is_locked(ar
 }
 
 #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
-#define arch_spin_unlock_wait(x) \
- do { cpu_relax(); } while (arch_spin_is_locked(x))
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *x)
+{
+ volatile unsigned int *a = __ldcw_align(x);
+
+ smp_cond_load_acquire(a, VAL);
+}
 
 static inline void arch_spin_lock_flags(arch_spinlock_t *x,
  unsigned long flags)
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -165,8 +165,10 @@ static inline void arch_spin_unlock(arch
 #ifdef CONFIG_PPC64
 extern void arch_spin_unlock_wait(arch_spinlock_t *lock);
 #else
-#define arch_spin_unlock_wait(lock) \
- do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->slock, !VAL);
+}
 #endif
 
 /*
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -95,8 +95,7 @@ static inline void arch_spin_unlock(arch
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
- while (arch_spin_is_locked(lock))
- arch_spin_relax(lock);
+ smp_cond_load_acquire(&lock->lock, !VAL);
 }
 
 /*
--- a/arch/sh/include/asm/spinlock.h
+++ b/arch/sh/include/asm/spinlock.h
@@ -25,8 +25,11 @@
 
 #define arch_spin_is_locked(x) ((x)->lock <= 0)
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
-#define arch_spin_unlock_wait(x) \
- do { while (arch_spin_is_locked(x)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->lock, VAL > 0);
+}
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -13,8 +13,10 @@
 
 #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0)
 
-#define arch_spin_unlock_wait(lock) \
- do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -8,6 +8,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/processor.h>
+
 /* To get debugging spinlocks which detect and catch
  * deadlock situations, set CONFIG_DEBUG_SPINLOCK
  * and rebuild your kernel.
@@ -23,9 +25,10 @@
 
 #define arch_spin_is_locked(lp) ((lp)->lock != 0)
 
-#define arch_spin_unlock_wait(lp) \
- do { rmb(); \
- } while((lp)->lock)
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->lock, !VAL);
+}
 
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
--- a/arch/tile/lib/spinlock_32.c
+++ b/arch/tile/lib/spinlock_32.c
@@ -72,10 +72,14 @@ void arch_spin_unlock_wait(arch_spinlock
  if (next == curr)
  return;
 
+ smp_rmb();
+
  /* Wait until the current locker has released the lock. */
  do {
  delay_backoff(iterations++);
  } while (READ_ONCE(lock->current_ticket) == curr);
+
+ smp_acquire__after_ctrl_dep();
 }
 EXPORT_SYMBOL(arch_spin_unlock_wait);
 
--- a/arch/tile/lib/spinlock_64.c
+++ b/arch/tile/lib/spinlock_64.c
@@ -72,10 +72,14 @@ void arch_spin_unlock_wait(arch_spinlock
  if (arch_spin_next(val) == curr)
  return;
 
+ smp_rmb();
+
  /* Wait until the current locker has released the lock. */
  do {
  delay_backoff(iterations++);
  } while (arch_spin_current(READ_ONCE(lock->lock)) == curr);
+
+ smp_acquire__after_ctrl_dep();
 }
 EXPORT_SYMBOL(arch_spin_unlock_wait);
 
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -29,8 +29,11 @@
  */
 
 #define arch_spin_is_locked(x) ((x)->slock != 0)
-#define arch_spin_unlock_wait(lock) \
- do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0)
+
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->slock, !VAL);
+}
 
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -133,8 +133,7 @@ static inline void queued_spin_unlock_wa
 {
  /* See queued_spin_is_locked() */
  smp_mb();
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
+ smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
 }
 
 #ifndef virt_spin_lock
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -25,6 +25,11 @@
 #ifdef CONFIG_DEBUG_SPINLOCK
 #define arch_spin_is_locked(x) ((x)->slock == 0)
 
+static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
+{
+ smp_cond_load_acquire(&lock->slock, VAL);
+}
+
 static inline void arch_spin_lock(arch_spinlock_t *lock)
 {
  lock->slock = 0;
@@ -67,6 +72,7 @@ static inline void arch_spin_unlock(arch
 
 #else /* DEBUG_SPINLOCK */
 #define arch_spin_is_locked(lock) ((void)(lock), 0)
+#define arch_spin_unlock_wait(lock) do { barrier(); (void)(lock); } while (0)
 /* for sched/core.c and kernel_lock.c: */
 # define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0)
 # define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0)
@@ -79,7 +85,4 @@ static inline void arch_spin_unlock(arch
 #define arch_read_can_lock(lock) (((void)(lock), 1))
 #define arch_write_can_lock(lock) (((void)(lock), 1))
 
-#define arch_spin_unlock_wait(lock) \
- do { cpu_relax(); } while (arch_spin_is_locked(lock))
-
 #endif /* __LINUX_SPINLOCK_UP_H */