[PATCH RT 1/6] kernel: softirq: unlock with irqs on

classic Classic list List threaded Threaded
41 messages Options
123
Reply | Threaded
Open this post in threaded view
|

[PATCH RT 1/6] kernel: softirq: unlock with irqs on

Sebastian Andrzej Siewior-4
We unlock the lock while the interrupts are off. This isn't a problem
now but will become one because the migrate_disable() + enable calls are
not symmetrical with regard to the status of interrupts.

Signed-off-by: Sebastian Andrzej Siewior <[hidden email]>
---
 kernel/softirq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index d1e999e74d23..2ca63cc1469e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -563,8 +563,10 @@ static void do_current_softirqs(void)
  do_single_softirq(i);
  }
  softirq_clr_runner(i);
- unlock_softirq(i);
  WARN_ON(current->softirq_nestcnt != 1);
+ local_irq_enable();
+ unlock_softirq(i);
+ local_irq_disable();
  }
 }
 
--
2.7.0

Reply | Threaded
Open this post in threaded view
|

[PATCH RT 4/6] rt/locking: Reenable migration across schedule

Sebastian Andrzej Siewior-4
From: Thomas Gleixner <[hidden email]>

We currently disable migration across lock acquisition. That includes the part
where we block on the lock and schedule out. We cannot disable migration after
taking the lock as that would cause a possible lock inversion.

But we can be smart and enable migration when we block and schedule out. That
allows the scheduler to place the task freely at least if this is the first
migrate disable level. For nested locking this does not help at all.

Signed-off-by: Thomas Gleixner <[hidden email]>
Signed-off-by: Sebastian Andrzej Siewior <[hidden email]>
---
 kernel/locking/rtmutex.c | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 913aa40f3b5e..66971005cc12 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -924,14 +924,19 @@ static int __try_to_take_rt_mutex(struct rt_mutex *lock,
  * preemptible spin_lock functions:
  */
 static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
- void  (*slowfn)(struct rt_mutex *lock))
+ void  (*slowfn)(struct rt_mutex *lock,
+ bool mg_off),
+ bool do_mig_dis)
 {
  might_sleep_no_state_check();
 
+ if (do_mig_dis)
+ migrate_disable();
+
  if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
  rt_mutex_deadlock_account_lock(lock, current);
  else
- slowfn(lock);
+ slowfn(lock, do_mig_dis);
 }
 
 static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
@@ -989,7 +994,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
  * We store the current state under p->pi_lock in p->saved_state and
  * the try_to_wake_up() code handles this accordingly.
  */
-static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
+static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock,
+    bool mg_off)
 {
  struct task_struct *lock_owner, *self = current;
  struct rt_mutex_waiter waiter, *top_waiter;
@@ -1033,8 +1039,13 @@ static void  noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock)
 
  debug_rt_mutex_print_deadlock(&waiter);
 
- if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
+ if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
+ if (mg_off)
+ migrate_enable();
  schedule();
+ if (mg_off)
+ migrate_disable();
+ }
 
  raw_spin_lock_irqsave(&lock->wait_lock, flags);
 
@@ -1105,38 +1116,35 @@ static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
 
 void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
 {
- rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, false);
  spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 }
 EXPORT_SYMBOL(rt_spin_lock__no_mg);
 
 void __lockfunc rt_spin_lock(spinlock_t *lock)
 {
- migrate_disable();
- rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
  spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 }
 EXPORT_SYMBOL(rt_spin_lock);
 
 void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
 {
- migrate_disable();
- rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, true);
 }
 EXPORT_SYMBOL(__rt_spin_lock);
 
 void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
 {
- rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock, false);
 }
 EXPORT_SYMBOL(__rt_spin_lock__no_mg);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
 {
- migrate_disable();
- rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
  spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock, true);
 }
 EXPORT_SYMBOL(rt_spin_lock_nested);
 #endif
--
2.7.0

Reply | Threaded
Open this post in threaded view
|

[PATCH RT 6/6] rcu: disable more spots of rcu_bh

Sebastian Andrzej Siewior-4
In reply to this post by Sebastian Andrzej Siewior-4
We don't use rcu_bh on -RT but we still fork a thread for it and keep it
as a flavour. No more.

Signed-off-by: Sebastian Andrzej Siewior <[hidden email]>
---
 kernel/rcu/tree.c | 6 ++++++
 kernel/rcu/tree.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5359091fecaa..64098d35de19 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -454,11 +454,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched);
 /*
  * Return the number of RCU BH batches started thus far for debug & stats.
  */
+#ifndef CONFIG_PREEMPT_RT_FULL
 unsigned long rcu_batches_started_bh(void)
 {
  return rcu_bh_state.gpnum;
 }
 EXPORT_SYMBOL_GPL(rcu_batches_started_bh);
+#endif
 
 /*
  * Return the number of RCU batches completed thus far for debug & stats.
@@ -563,9 +565,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags,
  case RCU_FLAVOR:
  rsp = rcu_state_p;
  break;
+#ifndef CONFIG_PREEMPT_RT_FULL
  case RCU_BH_FLAVOR:
  rsp = &rcu_bh_state;
  break;
+#endif
  case RCU_SCHED_FLAVOR:
  rsp = &rcu_sched_state;
  break;
@@ -4695,7 +4699,9 @@ void __init rcu_init(void)
 
  rcu_bootup_announce();
  rcu_init_geometry();
+#ifndef CONFIG_PREEMPT_RT_FULL
  rcu_init_one(&rcu_bh_state, &rcu_bh_data);
+#endif
  rcu_init_one(&rcu_sched_state, &rcu_sched_data);
  if (dump_tree)
  rcu_dump_rcu_node_tree(&rcu_sched_state);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 588509d94bbd..2ba8f6c2e81e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -557,7 +557,9 @@ extern struct list_head rcu_struct_flavors;
  */
 extern struct rcu_state rcu_sched_state;
 
+#ifndef CONFIG_PREEMPT_RT_FULL
 extern struct rcu_state rcu_bh_state;
+#endif
 
 #ifdef CONFIG_PREEMPT_RCU
 extern struct rcu_state rcu_preempt_state;
--
2.7.0

Reply | Threaded
Open this post in threaded view
|

[PATCH RT 3/6] rtmutex: push down migrate_disable() into rt_spin_lock()

Sebastian Andrzej Siewior-4
In reply to this post by Sebastian Andrzej Siewior-4
No point in having the migrate disable/enable invocations in all the
macro/inlines. That's just more code for no win as we do a function
call anyway. Move it to the core code and save quite some text size.

    text      data        bss        dec        filename
11034127   3676912   14901248   29612287  vmlinux.before
10990437   3676848   14901248   29568533  vmlinux.after

~-40KiB

Signed-off-by: Sebastian Andrzej Siewior <[hidden email]>
---
 include/linux/locallock.h   |  6 +++---
 include/linux/spinlock_rt.h | 25 +++++++-----------------
 kernel/cpu.c                |  4 ++--
 kernel/locking/lglock.c     |  2 +-
 kernel/locking/rt.c         |  2 --
 kernel/locking/rtmutex.c    | 46 +++++++++++++++++++++++++++++++++++++++++----
 6 files changed, 55 insertions(+), 30 deletions(-)

diff --git a/include/linux/locallock.h b/include/linux/locallock.h
index 339ba00adb9a..6fe5928fc2ab 100644
--- a/include/linux/locallock.h
+++ b/include/linux/locallock.h
@@ -43,9 +43,9 @@ struct local_irq_lock {
  * for CONFIG_PREEMPT_BASE map to the normal spin_* calls.
  */
 #ifdef CONFIG_PREEMPT_RT_FULL
-# define spin_lock_local(lock) rt_spin_lock(lock)
-# define spin_trylock_local(lock) rt_spin_trylock(lock)
-# define spin_unlock_local(lock) rt_spin_unlock(lock)
+# define spin_lock_local(lock) rt_spin_lock__no_mg(lock)
+# define spin_trylock_local(lock) rt_spin_trylock__no_mg(lock)
+# define spin_unlock_local(lock) rt_spin_unlock__no_mg(lock)
 #else
 # define spin_lock_local(lock) spin_lock(lock)
 # define spin_trylock_local(lock) spin_trylock(lock)
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
index f757096b230c..3b2825537531 100644
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -18,6 +18,10 @@ do { \
  __rt_spin_lock_init(slock, #slock, &__key); \
 } while (0)
 
+void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock);
+void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock);
+int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock);
+
 extern void __lockfunc rt_spin_lock(spinlock_t *lock);
 extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
 extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
@@ -32,20 +36,16 @@ extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
  * lockdep-less calls, for derived types like rwlock:
  * (for trylock they can use rt_mutex_trylock() directly.
  */
+extern void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock);
 extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock);
 extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock);
 extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
 
-#define spin_lock(lock) \
- do { \
- migrate_disable(); \
- rt_spin_lock(lock); \
- } while (0)
+#define spin_lock(lock) rt_spin_lock(lock)
 
 #define spin_lock_bh(lock) \
  do { \
  local_bh_disable(); \
- migrate_disable(); \
  rt_spin_lock(lock); \
  } while (0)
 
@@ -56,24 +56,19 @@ extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
 #define spin_trylock(lock) \
 ({ \
  int __locked; \
- migrate_disable(); \
  __locked = spin_do_trylock(lock); \
- if (!__locked) \
- migrate_enable(); \
  __locked; \
 })
 
 #ifdef CONFIG_LOCKDEP
 # define spin_lock_nested(lock, subclass) \
  do { \
- migrate_disable(); \
  rt_spin_lock_nested(lock, subclass); \
  } while (0)
 
 #define spin_lock_bh_nested(lock, subclass) \
  do { \
  local_bh_disable(); \
- migrate_disable(); \
  rt_spin_lock_nested(lock, subclass); \
  } while (0)
 
@@ -81,7 +76,6 @@ extern int __lockfunc __rt_spin_trylock(struct rt_mutex *lock);
  do { \
  typecheck(unsigned long, flags); \
  flags = 0; \
- migrate_disable(); \
  rt_spin_lock_nested(lock, subclass); \
  } while (0)
 #else
@@ -117,16 +111,11 @@ static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
 /* FIXME: we need rt_spin_lock_nest_lock */
 #define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
 
-#define spin_unlock(lock) \
- do { \
- rt_spin_unlock(lock); \
- migrate_enable(); \
- } while (0)
+#define spin_unlock(lock) rt_spin_unlock(lock)
 
 #define spin_unlock_bh(lock) \
  do { \
  rt_spin_unlock(lock); \
- migrate_enable(); \
  local_bh_enable(); \
  } while (0)
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 51aafc259546..8edd3c716092 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -127,8 +127,8 @@ struct hotplug_pcp {
 };
 
 #ifdef CONFIG_PREEMPT_RT_FULL
-# define hotplug_lock(hp) rt_spin_lock(&(hp)->lock)
-# define hotplug_unlock(hp) rt_spin_unlock(&(hp)->lock)
+# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
+# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
 #else
 # define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
 # define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
index d8be4fcc14f8..51bfe64e7d16 100644
--- a/kernel/locking/lglock.c
+++ b/kernel/locking/lglock.c
@@ -10,7 +10,7 @@
 # define lg_do_unlock(l) arch_spin_unlock(l)
 #else
 # define lg_lock_ptr struct rt_mutex
-# define lg_do_lock(l) __rt_spin_lock(l)
+# define lg_do_lock(l) __rt_spin_lock__no_mg(l)
 # define lg_do_unlock(l) __rt_spin_unlock(l)
 #endif
 /*
diff --git a/kernel/locking/rt.c b/kernel/locking/rt.c
index 1bbbcad5e360..d4ab61c1848b 100644
--- a/kernel/locking/rt.c
+++ b/kernel/locking/rt.c
@@ -235,7 +235,6 @@ EXPORT_SYMBOL(rt_read_trylock);
 void __lockfunc rt_write_lock(rwlock_t *rwlock)
 {
  rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
- migrate_disable();
  __rt_spin_lock(&rwlock->lock);
 }
 EXPORT_SYMBOL(rt_write_lock);
@@ -249,7 +248,6 @@ void __lockfunc rt_read_lock(rwlock_t *rwlock)
  * recursive read locks succeed when current owns the lock
  */
  if (rt_mutex_owner(lock) != current) {
- migrate_disable();
  rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_);
  __rt_spin_lock(lock);
  }
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a6f5326e4056..913aa40f3b5e 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1103,8 +1103,16 @@ static void  noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
  rt_mutex_adjust_prio(current);
 }
 
+void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
+{
+ rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
+ spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
+}
+EXPORT_SYMBOL(rt_spin_lock__no_mg);
+
 void __lockfunc rt_spin_lock(spinlock_t *lock)
 {
+ migrate_disable();
  rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
  spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
 }
@@ -1112,24 +1120,41 @@ EXPORT_SYMBOL(rt_spin_lock);
 
 void __lockfunc __rt_spin_lock(struct rt_mutex *lock)
 {
+ migrate_disable();
  rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
 }
 EXPORT_SYMBOL(__rt_spin_lock);
 
+void __lockfunc __rt_spin_lock__no_mg(struct rt_mutex *lock)
+{
+ rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock);
+}
+EXPORT_SYMBOL(__rt_spin_lock__no_mg);
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass)
 {
+ migrate_disable();
  rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock);
  spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
 }
 EXPORT_SYMBOL(rt_spin_lock_nested);
 #endif
 
+void __lockfunc rt_spin_unlock__no_mg(spinlock_t *lock)
+{
+ /* NOTE: we always pass in '1' for nested, for simplicity */
+ spin_release(&lock->dep_map, 1, _RET_IP_);
+ rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+}
+EXPORT_SYMBOL(rt_spin_unlock__no_mg);
+
 void __lockfunc rt_spin_unlock(spinlock_t *lock)
 {
  /* NOTE: we always pass in '1' for nested, for simplicity */
  spin_release(&lock->dep_map, 1, _RET_IP_);
  rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock);
+ migrate_enable();
 }
 EXPORT_SYMBOL(rt_spin_unlock);
 
@@ -1156,14 +1181,29 @@ int __lockfunc __rt_spin_trylock(struct rt_mutex *lock)
  return rt_mutex_trylock(lock);
 }
 
-int __lockfunc rt_spin_trylock(spinlock_t *lock)
+int __lockfunc rt_spin_trylock__no_mg(spinlock_t *lock)
 {
- int ret = rt_mutex_trylock(&lock->lock);
+ int ret;
 
+ ret = rt_mutex_trylock(&lock->lock);
  if (ret)
  spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
  return ret;
 }
+EXPORT_SYMBOL(rt_spin_trylock__no_mg);
+
+int __lockfunc rt_spin_trylock(spinlock_t *lock)
+{
+ int ret;
+
+ migrate_disable();
+ ret = rt_mutex_trylock(&lock->lock);
+ if (ret)
+ spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
+ else
+ migrate_enable();
+ return ret;
+}
 EXPORT_SYMBOL(rt_spin_trylock);
 
 int __lockfunc rt_spin_trylock_bh(spinlock_t *lock)
@@ -1200,12 +1240,10 @@ int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
  /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */
  if (atomic_add_unless(atomic, -1, 1))
  return 0;
- migrate_disable();
  rt_spin_lock(lock);
  if (atomic_dec_and_test(atomic))
  return 1;
  rt_spin_unlock(lock);
- migrate_enable();
  return 0;
 }
 EXPORT_SYMBOL(atomic_dec_and_spin_lock);
--
2.7.0

Reply | Threaded
Open this post in threaded view
|

[PATCH RT 5/6] kernel/stop_machine: partly revert "stop_machine: Use raw spinlocks"

Sebastian Andrzej Siewior-4
In reply to this post by Sebastian Andrzej Siewior-4
With completion using swait and so rawlocks we don't need this anymore.
Further, bisect thinks this patch is responsible for:

|BUG: unable to handle kernel NULL pointer dereference at           (null)
|IP: [<ffffffff81082123>] sched_cpu_active+0x53/0x70
|PGD 0
|Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
|Dumping ftrace buffer:
|   (ftrace buffer empty)
|Modules linked in:
|CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.4.1+ #330
|Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Debian-1.8.2-1 04/01/2014
|task: ffff88013ae64b00 ti: ffff88013ae74000 task.ti: ffff88013ae74000
|RIP: 0010:[<ffffffff81082123>]  [<ffffffff81082123>] sched_cpu_active+0x53/0x70
|RSP: 0000:ffff88013ae77eb8  EFLAGS: 00010082
|RAX: 0000000000000001 RBX: ffffffff81c2cf20 RCX: 0000001050fb52fb
|RDX: 0000001050fb52fb RSI: 000000105117ca1e RDI: 00000000001c7723
|RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001
|R10: 0000000000000000 R11: 0000000000000001 R12: 00000000ffffffff
|R13: ffffffff81c2cee0 R14: 0000000000000000 R15: 0000000000000001
|FS:  0000000000000000(0000) GS:ffff88013b200000(0000) knlGS:0000000000000000
|CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
|CR2: 0000000000000000 CR3: 0000000001c09000 CR4: 00000000000006e0
|Stack:
| ffffffff810c446d ffff88013ae77f00 ffffffff8107d8dd 000000000000000a
| 0000000000000001 0000000000000000 0000000000000000 0000000000000000
| 0000000000000000 ffff88013ae77f10 ffffffff8107d90e ffff88013ae77f20
|Call Trace:
| [<ffffffff810c446d>] ? debug_lockdep_rcu_enabled+0x1d/0x20
| [<ffffffff8107d8dd>] ? notifier_call_chain+0x5d/0x80
| [<ffffffff8107d90e>] ? __raw_notifier_call_chain+0xe/0x10
| [<ffffffff810598a3>] ? cpu_notify+0x23/0x40
| [<ffffffff8105a7b8>] ? notify_cpu_starting+0x28/0x30

during hotplug. The rawlocks need to remain however.

Signed-off-by: Sebastian Andrzej Siewior <[hidden email]>
---
 kernel/stop_machine.c | 40 ++++++++--------------------------------
 1 file changed, 8 insertions(+), 32 deletions(-)

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2c5acc882bad..f84d3b45cda7 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -30,7 +30,7 @@ struct cpu_stop_done {
  atomic_t nr_todo; /* nr left to execute */
  bool executed; /* actually executed? */
  int ret; /* collected return value */
- struct task_struct *waiter; /* woken when nr_todo reaches 0 */
+ struct completion completion; /* fired if nr_todo reaches 0 */
 };
 
 /* the actual stopper, one per every possible cpu, enabled on online cpus */
@@ -59,7 +59,7 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
 {
  memset(done, 0, sizeof(*done));
  atomic_set(&done->nr_todo, nr_todo);
- done->waiter = current;
+ init_completion(&done->completion);
 }
 
 /* signal completion unless @done is NULL */
@@ -68,10 +68,8 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
  if (done) {
  if (executed)
  done->executed = true;
- if (atomic_dec_and_test(&done->nr_todo)) {
- wake_up_process(done->waiter);
- done->waiter = NULL;
- }
+ if (atomic_dec_and_test(&done->nr_todo))
+ complete(&done->completion);
  }
 }
 
@@ -96,22 +94,6 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
  raw_spin_unlock_irqrestore(&stopper->lock, flags);
 }
 
-static void wait_for_stop_done(struct cpu_stop_done *done)
-{
- set_current_state(TASK_UNINTERRUPTIBLE);
- while (atomic_read(&done->nr_todo)) {
- schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
- }
- /*
- * We need to wait until cpu_stop_signal_done() has cleared
- * done->waiter.
- */
- while (done->waiter)
- cpu_relax();
- set_current_state(TASK_RUNNING);
-}
-
 /**
  * stop_one_cpu - stop a cpu
  * @cpu: cpu to stop
@@ -143,7 +125,7 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 
  cpu_stop_init_done(&done, 1);
  cpu_stop_queue_work(cpu, &work);
- wait_for_stop_done(&done);
+ wait_for_completion(&done.completion);
  return done.executed ? done.ret : -ENOENT;
 }
 
@@ -302,7 +284,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
 
  preempt_enable_nort();
 
- wait_for_stop_done(&done);
+ wait_for_completion(&done.completion);
 
  return done.executed ? done.ret : -ENOENT;
 }
@@ -364,7 +346,7 @@ static int __stop_cpus(const struct cpumask *cpumask,
 
  cpu_stop_init_done(&done, cpumask_weight(cpumask));
  queue_stop_cpus_work(cpumask, fn, arg, &done, false);
- wait_for_stop_done(&done);
+ wait_for_completion(&done.completion);
  return done.executed ? done.ret : -ENOENT;
 }
 
@@ -495,13 +477,7 @@ static void cpu_stopper_thread(unsigned int cpu)
   kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
   ksym_buf), arg);
 
- /*
- * Make sure that the wakeup and setting done->waiter
- * to NULL is atomic.
- */
- local_irq_disable();
  cpu_stop_signal_done(done, true);
- local_irq_enable();
  goto repeat;
  }
 }
@@ -663,7 +639,7 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
  ret = multi_cpu_stop(&msdata);
 
  /* Busy wait for completion. */
- while (atomic_read(&done.nr_todo))
+ while (!completion_done(&done.completion))
  cpu_relax();
 
  mutex_unlock(&stop_cpus_mutex);
--
2.7.0

Reply | Threaded
Open this post in threaded view
|

[PATCH RT 2/6] kernel: migrate_disable() do fastpath in atomic & irqs-off

Sebastian Andrzej Siewior-4
In reply to this post by Sebastian Andrzej Siewior-4
With interrupts off it makes no sense to do the long path since we can't
leave the CPU anyway. Also we might end up in a recursion with lockdep.

Signed-off-by: Sebastian Andrzej Siewior <[hidden email]>
---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1217926b500d..40c1e29416c0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3157,7 +3157,7 @@ void migrate_disable(void)
 {
  struct task_struct *p = current;
 
- if (in_atomic()) {
+ if (in_atomic() || irqs_disabled()) {
 #ifdef CONFIG_SCHED_DEBUG
  p->migrate_disable_atomic++;
 #endif
@@ -3188,7 +3188,7 @@ void migrate_enable(void)
 {
  struct task_struct *p = current;
 
- if (in_atomic()) {
+ if (in_atomic() || irqs_disabled()) {
 #ifdef CONFIG_SCHED_DEBUG
  p->migrate_disable_atomic--;
 #endif
--
2.7.0

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

Mike Galbraith-5
In reply to this post by Sebastian Andrzej Siewior-4
On Sat, 2016-02-13 at 00:02 +0100, Sebastian Andrzej Siewior wrote:
> From: Thomas Gleixner <[hidden email]>
>
> We currently disable migration across lock acquisition. That includes the part
> where we block on the lock and schedule out. We cannot disable migration after
> taking the lock as that would cause a possible lock inversion.
>
> But we can be smart and enable migration when we block and schedule out. That
> allows the scheduler to place the task freely at least if this is the first
> migrate disable level. For nested locking this does not help at all.

I met a problem while testing shiny new hotplug machinery.

rt/locking: Fix rt_spin_lock_slowlock() vs hotplug migrate_disable() bug

migrate_disable() -> pin_current_cpu() -> hotplug_lock() leads to..
        BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
..so let's call migrate_disable() after we acquire the lock instead.

Fixes: e24b142cfb4a rt/locking: Reenable migration accross schedule
Signed-off-by: Mike Galbraith <[hidden email]>
---
 kernel/locking/rtmutex.c |   15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1011,7 +1011,7 @@ static void  noinline __sched rt_spin_lo
  struct task_struct *lock_owner, *self = current;
  struct rt_mutex_waiter waiter, *top_waiter;
  unsigned long flags;
- int ret;
+ bool mg_disable = false;
 
  rt_mutex_init_waiter(&waiter, true);
 
@@ -1035,8 +1035,7 @@ static void  noinline __sched rt_spin_lo
  __set_current_state_no_track(TASK_UNINTERRUPTIBLE);
  raw_spin_unlock(&self->pi_lock);
 
- ret = task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK);
- BUG_ON(ret);
+ BUG_ON(task_blocks_on_rt_mutex(lock, &waiter, self, RT_MUTEX_MIN_CHAINWALK));
 
  for (;;) {
  /* Try to acquire the lock again. */
@@ -1051,11 +1050,12 @@ static void  noinline __sched rt_spin_lo
  debug_rt_mutex_print_deadlock(&waiter);
 
  if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
- if (mg_off)
+ if (mg_off && self->migrate_disable == 1) {
+ mg_off = false;
+ mg_disable = true;
  migrate_enable();
+ }
  schedule();
- if (mg_off)
- migrate_disable();
  }
 
  raw_spin_lock_irqsave(&lock->wait_lock, flags);
@@ -1088,6 +1088,9 @@ static void  noinline __sched rt_spin_lo
 
  raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
+ if (mg_disable)
+ migrate_disable();
+
  debug_rt_mutex_free_waiter(&waiter);
 }
 
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

Mike Galbraith-5
On Sun, 2016-03-20 at 09:43 +0100, Mike Galbraith wrote:

> On Sat, 2016-02-13 at 00:02 +0100, Sebastian Andrzej Siewior wrote:
> > From: Thomas Gleixner <[hidden email]>
> >
> > We currently disable migration across lock acquisition. That includes the part
> > where we block on the lock and schedule out. We cannot disable migration after
> > taking the lock as that would cause a possible lock inversion.
> >
> > But we can be smart and enable migration when we block and schedule out. That
> > allows the scheduler to place the task freely at least if this is the first
> > migrate disable level. For nested locking this does not help at all.
>
> I met a problem while testing shiny new hotplug machinery.
>
> rt/locking: Fix rt_spin_lock_slowlock() vs hotplug migrate_disable() bug
>
> migrate_disable() -> pin_current_cpu() -> hotplug_lock() leads to..
> > BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
> ..so let's call migrate_disable() after we acquire the lock instead.

Well crap, that wasn't very clever.  A little voice kept nagging me, and
yesterday I realized what it was grumbling about, namely that doing
migrate_disable() after lock acquisition will resurrect a hotplug
deadlock that we fixed up a while back.

On the bright side, with the busted migrate enable business reverted,
plus one dinky change from me [1], master-rt.today has completed 100
iterations of Steven's hotplug stress script along side endless
futexstress, and is happily doing another 900 as I write this, so the
next -rt should finally be hotplug deadlock free.

Thomas's state machinery seems to work wonders.  'course this being
hotplug, the other shoe will likely apply itself to my backside soon.

        -Mike

1. nest module_mutex inside hotplug_lock to prevent bloody systemd
-udevd from blocking in migrate_disable() while holding kernfs_mutex
during module load, putting a quick end to hotplug stress testing.
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

Thomas Gleixner
On Thu, 24 Mar 2016, Mike Galbraith wrote:

> On Sun, 2016-03-20 at 09:43 +0100, Mike Galbraith wrote:
> > On Sat, 2016-02-13 at 00:02 +0100, Sebastian Andrzej Siewior wrote:
> > > From: Thomas Gleixner <[hidden email]>
> > >
> > > We currently disable migration across lock acquisition. That includes the part
> > > where we block on the lock and schedule out. We cannot disable migration after
> > > taking the lock as that would cause a possible lock inversion.
> > >
> > > But we can be smart and enable migration when we block and schedule out. That
> > > allows the scheduler to place the task freely at least if this is the first
> > > migrate disable level. For nested locking this does not help at all.
> >
> > I met a problem while testing shiny new hotplug machinery.
> >
> > rt/locking: Fix rt_spin_lock_slowlock() vs hotplug migrate_disable() bug
> >
> > migrate_disable() -> pin_current_cpu() -> hotplug_lock() leads to..
> > > BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on));
> > ..so let's call migrate_disable() after we acquire the lock instead.
>
> Well crap, that wasn't very clever  A little voice kept nagging me, and
> yesterday I realized what it was grumbling about, namely that doing
> migrate_disable() after lock acquisition will resurrect a hotplug
> deadlock that we fixed up a while back.

Glad you found out yourself. Telling you that was on my todo list ....
 
> On the bright side, with the busted migrate enable business reverted,
> plus one dinky change from me [1], master-rt.today has completed 100
> iterations of Steven's hotplug stress script along side endless
> futexstress, and is happily doing another 900 as I write this, so the
> next -rt should finally be hotplug deadlock free.
>
> Thomas's state machinery seems to work wonders.  'course this being
> hotplug, the other shoe will likely apply itself to my backside soon.

That's a given :)

I really wonder what makes the change. The only thing which comes to my mind
is the enforcement of running the online and down_prepare callbacks on the
plugged cpu instead of doing it wherever the scheduler decides to run it.
 
> 1. nest module_mutex inside hotplug_lock to prevent bloody systemd
> -udevd from blocking in migrate_disable() while holding kernfs_mutex
> during module load, putting a quick end to hotplug stress testing.
 
Did I miss a patch here or is that still in your pile?

Thanks,

        tglx
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

Mike Galbraith-5
On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:

>  
> > On the bright side, with the busted migrate enable business reverted,
> > plus one dinky change from me [1], master-rt.today has completed 100
> > iterations of Steven's hotplug stress script along side endless
> > futexstress, and is happily doing another 900 as I write this, so the
> > next -rt should finally be hotplug deadlock free.
> >
> > Thomas's state machinery seems to work wonders.  'course this being
> > hotplug, the other shoe will likely apply itself to my backside soon.
>
> That's a given :)

blk-mq applied it shortly after I was satisfied enough to poke xmit.

> I really wonder what makes the change. The only thing which comes to my mind
> is the enforcement of running the online and down_prepare callbacks on the
> plugged cpu instead of doing it wherever the scheduler decides to run it.

No idea, but it certainly seems.. well, markedly less brick like.
 
> > 1. nest module_mutex inside hotplug_lock to prevent bloody systemd
> > -udevd from blocking in migrate_disable() while holding kernfs_mutex
> > during module load, putting a quick end to hotplug stress testing.
>  
> Did I miss a patch here or is that still in your pile?

You didn't miss it, it wasn't tested enough to consider sending.. and
now I'm starting down the familiar "next" path again.  Oh dear.

        -Mike
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration across schedule

Mike Galbraith-5
On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:

> On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> >  
> > > On the bright side, with the busted migrate enable business reverted,
> > > plus one dinky change from me [1], master-rt.today has completed 100
> > > iterations of Steven's hotplug stress script along side endless
> > > futexstress, and is happily doing another 900 as I write this, so the
> > > next -rt should finally be hotplug deadlock free.
> > >
> > > Thomas's state machinery seems to work wonders.  'course this being
> > > hotplug, the other shoe will likely apply itself to my backside soon.
> >
> > That's a given :)
>
> blk-mq applied it shortly after I was satisfied enough to poke xmit.

The other shoe is that notifiers can depend upon RCU grace periods, so
when pin_current_cpu() snags rcu_sched, the hotplug game is over.

blk_mq_queue_reinit_notify:
        /*
         * We need to freeze and reinit all existing queues.  Freezing
         * involves synchronous wait for an RCU grace period and doing it
         * one by one may take a long time.  Start freezing all queues in
         * one swoop and then wait for the completions so that freezing can
         * take place in parallel.
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_start(q);
        list_for_each_entry(q, &all_q_list, all_q_node) {
                blk_mq_freeze_queue_wait(q);

Hohum (sharpens rock), next.

        -Mike
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Thomas Gleixner
On Fri, 25 Mar 2016, Mike Galbraith wrote:

> On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > >  
> > > > On the bright side, with the busted migrate enable business reverted,
> > > > plus one dinky change from me [1], master-rt.today has completed 100
> > > > iterations of Steven's hotplug stress script along side endless
> > > > futexstress, and is happily doing another 900 as I write this, so the
> > > > next -rt should finally be hotplug deadlock free.
> > > >
> > > > Thomas's state machinery seems to work wonders.  'course this being
> > > > hotplug, the other shoe will likely apply itself to my backside soon.
> > >
> > > That's a given :)
> >
> > blk-mq applied it shortly after I was satisfied enough to poke xmit.
>
> The other shoe is that notifiers can depend upon RCU grace periods, so
> when pin_current_cpu() snags rcu_sched, the hotplug game is over.
>
> blk_mq_queue_reinit_notify:
>         /*
>          * We need to freeze and reinit all existing queues.  Freezing
>          * involves synchronous wait for an RCU grace period and doing it
>          * one by one may take a long time.  Start freezing all queues in
>          * one swoop and then wait for the completions so that freezing can
>          * take place in parallel.
>          */
>         list_for_each_entry(q, &all_q_list, all_q_node)
>                 blk_mq_freeze_queue_start(q);
>         list_for_each_entry(q, &all_q_list, all_q_node) {
>                 blk_mq_freeze_queue_wait(q);

Yeah, I stumbled over that already when analysing all the hotplug notifier
sites. That's definitely a horrible one.
 
> Hohum (sharpens rock), next.

/me recommends frozen sharks

Thanks,

       tglx
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Mike Galbraith-5
On Fri, 2016-03-25 at 09:52 +0100, Thomas Gleixner wrote:

> On Fri, 25 Mar 2016, Mike Galbraith wrote:
> > On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > > >  
> > > > > On the bright side, with the busted migrate enable business reverted,
> > > > > plus one dinky change from me [1], master-rt.today has completed 100
> > > > > iterations of Steven's hotplug stress script along side endless
> > > > > futexstress, and is happily doing another 900 as I write this, so the
> > > > > next -rt should finally be hotplug deadlock free.
> > > > >
> > > > > Thomas's state machinery seems to work wonders.  'course this being
> > > > > hotplug, the other shoe will likely apply itself to my backside soon.
> > > >
> > > > That's a given :)
> > >
> > > blk-mq applied it shortly after I was satisfied enough to poke xmit.
> >
> > The other shoe is that notifiers can depend upon RCU grace periods, so
> > when pin_current_cpu() snags rcu_sched, the hotplug game is over.
> >
> > blk_mq_queue_reinit_notify:
> >         /*
> >          * We need to freeze and reinit all existing queues.  Freezing
> >          * involves synchronous wait for an RCU grace period and doing it
> >          * one by one may take a long time.  Start freezing all queues in
> >          * one swoop and then wait for the completions so that freezing can
> >          * take place in parallel.
> >          */
> >         list_for_each_entry(q, &all_q_list, all_q_node)
> >                 blk_mq_freeze_queue_start(q);
> >         list_for_each_entry(q, &all_q_list, all_q_node) {
> >                 blk_mq_freeze_queue_wait(q);
>
> Yeah, I stumbled over that already when analysing all the hotplug notifier
> sites. That's definitely a horrible one.
>  
> > Hohum (sharpens rock), next.
>
> /me recommends frozen sharks

With the sharp rock below and the one I'll follow up with, master-rt on
my DL980 just passed 3 hours of endless hotplug stress concurrent with
endless tbench 8, stockfish and futextest.  It has never survived this
long with this load by a long shot.

hotplug/rt: Do not let pin_current_cpu() block RCU grace periods

Notifiers may depend upon grace periods continuing to advance
as blk_mq_queue_reinit_notify() below.

crash> bt ffff8803aee76400
PID: 1113   TASK: ffff8803aee76400  CPU: 0   COMMAND: "stress-cpu-hotp"
 #0 [ffff880396fe7ad8] __schedule at ffffffff816b7142
 #1 [ffff880396fe7b28] schedule at ffffffff816b797b
 #2 [ffff880396fe7b48] blk_mq_freeze_queue_wait at ffffffff8135c5ac
 #3 [ffff880396fe7b80] blk_mq_queue_reinit_notify at ffffffff8135f819
 #4 [ffff880396fe7b98] notifier_call_chain at ffffffff8109b8ed
 #5 [ffff880396fe7bd8] __raw_notifier_call_chain at ffffffff8109b91e
 #6 [ffff880396fe7be8] __cpu_notify at ffffffff81072825
 #7 [ffff880396fe7bf8] cpu_notify_nofail at ffffffff81072b15
 #8 [ffff880396fe7c08] notify_dead at ffffffff81072d06
 #9 [ffff880396fe7c38] cpuhp_invoke_callback at ffffffff81073718
#10 [ffff880396fe7c78] cpuhp_down_callbacks at ffffffff81073a70
#11 [ffff880396fe7cb8] _cpu_down at ffffffff816afc71
#12 [ffff880396fe7d38] do_cpu_down at ffffffff8107435c
#13 [ffff880396fe7d60] cpu_down at ffffffff81074390
#14 [ffff880396fe7d70] cpu_subsys_offline at ffffffff814cd854
#15 [ffff880396fe7d80] device_offline at ffffffff814c7cda
#16 [ffff880396fe7da8] online_store at ffffffff814c7dd0
#17 [ffff880396fe7dd0] dev_attr_store at ffffffff814c4fc8
#18 [ffff880396fe7de0] sysfs_kf_write at ffffffff812cfbe4
#19 [ffff880396fe7e08] kernfs_fop_write at ffffffff812cf172
#20 [ffff880396fe7e50] __vfs_write at ffffffff81241428
#21 [ffff880396fe7ed0] vfs_write at ffffffff81242535
#22 [ffff880396fe7f10] sys_write at ffffffff812438f9
#23 [ffff880396fe7f50] entry_SYSCALL_64_fastpath at ffffffff816bb4bc
    RIP: 00007fafd918acd0  RSP: 00007ffd2ca956e8  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 000000000226a770  RCX: 00007fafd918acd0
    RDX: 0000000000000002  RSI: 00007fafd9cb9000  RDI: 0000000000000001
    RBP: 00007ffd2ca95700   R8: 000000000000000a   R9: 00007fafd9cb3700
    R10: 00000000ffffffff  R11: 0000000000000246  R12: 0000000000000007
    R13: 0000000000000001  R14: 0000000000000009  R15: 000000000000000a
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b

blk_mq_queue_reinit_notify:
        /*
         * We need to freeze and reinit all existing queues.  Freezing
         * involves synchronous wait for an RCU grace period and doing it
         * one by one may take a long time.  Start freezing all queues in
         * one swoop and then wait for the completions so that freezing can
         * take place in parallel.
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_start(q);
        list_for_each_entry(q, &all_q_list, all_q_node) {
                blk_mq_freeze_queue_wait(q);

crash> bt ffff880176cc9900
PID: 17     TASK: ffff880176cc9900  CPU: 0   COMMAND: "rcu_sched"
 #0 [ffff880176cd7ab8] __schedule at ffffffff816b7142
 #1 [ffff880176cd7b08] schedule at ffffffff816b797b
 #2 [ffff880176cd7b28] rt_spin_lock_slowlock at ffffffff816b974d
 #3 [ffff880176cd7bc8] rt_spin_lock_fastlock at ffffffff811b0f3c
 #4 [ffff880176cd7be8] rt_spin_lock__no_mg at ffffffff816bac1b
 #5 [ffff880176cd7c08] pin_current_cpu at ffffffff8107406a
 #6 [ffff880176cd7c50] migrate_disable at ffffffff810a0e9e
 #7 [ffff880176cd7c70] rt_spin_lock at ffffffff816bad69
 #8 [ffff880176cd7c90] lock_timer_base at ffffffff810fc5e8
 #9 [ffff880176cd7cc8] try_to_del_timer_sync at ffffffff810fe290
#10 [ffff880176cd7cf0] del_timer_sync at ffffffff810fe381
#11 [ffff880176cd7d58] schedule_timeout at ffffffff816b9e4b
#12 [ffff880176cd7df0] rcu_gp_kthread at ffffffff810f52b4
#13 [ffff880176cd7e70] kthread at ffffffff8109a02f
#14 [ffff880176cd7f50] ret_from_fork at ffffffff816bb6f2

Game Over.

Signed-off-by: Mike Galbraith <[hidden email]>
---
 include/linux/sched.h |    1 +
 kernel/cpu.c          |    2 +-
 kernel/rcu/tree.c     |    3 +++
 3 files changed, 5 insertions(+), 1 deletion(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1492,6 +1492,7 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
  unsigned brk_randomized:1;
 #endif
+ unsigned sched_is_rcu:1; /* RT: is a critical RCU thread */
 
  unsigned long atomic_flags; /* Flags needing atomic access. */
 
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -156,7 +156,7 @@ void pin_current_cpu(void)
  hp = this_cpu_ptr(&hotplug_pcp);
 
  if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
-    hp->unplug == current) {
+    hp->unplug == current || current->sched_is_rcu) {
  hp->refcount++;
  return;
  }
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2100,6 +2100,9 @@ static int __noreturn rcu_gp_kthread(voi
  struct rcu_state *rsp = arg;
  struct rcu_node *rnp = rcu_get_root(rsp);
 
+ /* RT: pin_current_cpu() MUST NOT block RCU grace periods. */
+ current->sched_is_rcu = 1;
+
  rcu_bind_gp_kthread();
  for (;;) {
 
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Mike Galbraith-5
Rock #1..

hotplug/rt: Nest module_mutex inside cpu_hotplug.lock

PID: 11107  TASK: ffff8803b12b9900  CPU: 4   COMMAND: "stress-cpu-hotp"                                                                                                                                                                      
 #0 [ffff88038b34f9b8] __schedule at ffffffff816b7132
 #1 [ffff88038b34fa08] schedule at ffffffff816b796b
 #2 [ffff88038b34fa28] rt_mutex_slowlock at ffffffff816b93ee
 #3 [ffff88038b34fac8] rt_mutex_fastlock at ffffffff811b0e9d
 #4 [ffff88038b34faf0] rt_mutex_lock at ffffffff816b95c8
 #5 [ffff88038b34fb08] _mutex_lock at ffffffff816baf59
 #6 [ffff88038b34fb28] kernfs_find_and_get_ns at ffffffff812cd573
 #7 [ffff88038b34fb50] sysfs_remove_group at ffffffff812d100a
 #8 [ffff88038b34fb78] thermal_throttle_cpu_callback at ffffffff81036ab9
 #9 [ffff88038b34fb98] notifier_call_chain at ffffffff8109b8dd
#10 [ffff88038b34fbd8] __raw_notifier_call_chain at ffffffff8109b90e
#11 [ffff88038b34fbe8] __cpu_notify at ffffffff81072825
#12 [ffff88038b34fbf8] cpu_notify_nofail at ffffffff81072b15
#13 [ffff88038b34fc08] notify_dead at ffffffff81072d06
#14 [ffff88038b34fc38] cpuhp_invoke_callback at ffffffff81073718
#15 [ffff88038b34fc78] cpuhp_down_callbacks at ffffffff81073a70
#16 [ffff88038b34fcb8] _cpu_down at ffffffff816afc61
#17 [ffff88038b34fd38] do_cpu_down at ffffffff8107434c
#18 [ffff88038b34fd60] cpu_down at ffffffff81074380
#19 [ffff88038b34fd70] cpu_subsys_offline at ffffffff814cd844
#20 [ffff88038b34fd80] device_offline at ffffffff814c7cca
#21 [ffff88038b34fda8] online_store at ffffffff814c7dc0
#22 [ffff88038b34fdd0] dev_attr_store at ffffffff814c4fb8
#23 [ffff88038b34fde0] sysfs_kf_write at ffffffff812cfbd4
#24 [ffff88038b34fe08] kernfs_fop_write at ffffffff812cf162
#25 [ffff88038b34fe50] __vfs_write at ffffffff81241418
#26 [ffff88038b34fed0] vfs_write at ffffffff81242525
#27 [ffff88038b34ff10] sys_write at ffffffff812438e9
#28 [ffff88038b34ff50] entry_SYSCALL_64_fastpath at ffffffff816bb4fc
    RIP: 00007f05f3d69cd0  RSP: 00007ffdfc934468  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000001908770  RCX: 00007f05f3d69cd0
    RDX: 0000000000000002  RSI: 00007f05f4898000  RDI: 0000000000000001
    RBP: 00007ffdfc934480   R8: 000000000000000a   R9: 00007f05f4892700
    R10: 00000000ffffffff  R11: 0000000000000246  R12: 0000000000000007
    R13: 0000000000000001  R14: 0000000000000009  R15: 000000000000000a
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b

stress-cpu-hotp blocks on kernfs_mutex, held by systemd-udevd..

crash> bt ffff8803b12bcb00
PID: 11130  TASK: ffff8803b12bcb00  CPU: 6   COMMAND: "systemd-udevd"
 #0 [ffff88038b327a18] __schedule at ffffffff816b7132
 #1 [ffff88038b327a68] schedule at ffffffff816b796b
 #2 [ffff88038b327a88] rt_spin_lock_slowlock at ffffffff816b9750
 #3 [ffff88038b327b30] rt_spin_lock_fastlock at ffffffff811b0f2c
 #4 [ffff88038b327b50] rt_spin_lock__no_mg at ffffffff816bac7b
 #5 [ffff88038b327b70] pin_current_cpu at ffffffff8107406a
 #6 [ffff88038b327bb8] migrate_disable at ffffffff810a0e8e
 #7 [ffff88038b327bd8] rt_spin_lock at ffffffff816badc9
 #8 [ffff88038b327bf8] ida_simple_remove at ffffffff8138765c
 #9 [ffff88038b327c18] kernfs_put at ffffffff812ccc58
#10 [ffff88038b327c60] __kernfs_remove at ffffffff812cd15c
#11 [ffff88038b327cc0] kernfs_remove_by_name_ns at ffffffff812ce2f3
#12 [ffff88038b327ce8] sysfs_remove_link at ffffffff812d05e9
#13 [ffff88038b327cf8] free_module at ffffffff8111c8f2
#14 [ffff88038b327d30] do_init_module at ffffffff811b157f
#15 [ffff88038b327d58] load_module at ffffffff8111f11b
#16 [ffff88038b327e98] SYSC_finit_module at ffffffff8111faf9
#17 [ffff88038b327f40] sys_finit_module at ffffffff8111fb3e
#18 [ffff88038b327f50] entry_SYSCALL_64_fastpath at ffffffff816bb4fc
    RIP: 00007f75d9925f79  RSP: 00007ffd1c040ed8  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000001d368e0  RCX: 00007f75d9925f79
    RDX: 0000000000000000  RSI: 00007f75da0233c1  RDI: 0000000000000008
    RBP: 0000000000000008   R8: 0000000000000000   R9: 0000000001d39c82
    R10: 0000000000000008  R11: 0000000000000246  R12: 00007ffd1c03ff00
    R13: 00007ffd1c03fee0  R14: 0000000000000005  R15: 000000000aba9500
    ORIG_RAX: 0000000000000139  CS: 0033  SS: 002b

..which stress-cpu-hotp has blocked via pin_current_cpu().  Game Over.

Signed-off-by: Mike Galbraith <[hidden email]>
---
 kernel/cpu.c |    9 +++++++++
 1 file changed, 9 insertions(+)

--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -23,6 +23,9 @@
 #include <linux/tick.h>
 #include <linux/irq.h>
 #include <trace/events/power.h>
+#ifdef CONFIG_PREEMPT_RT_BASE
+#include <linux/module.h>
+#endif
 
 #include "smpboot.h"
 
@@ -442,10 +445,16 @@ void cpu_hotplug_begin(void)
  schedule();
  }
  finish_wait(&cpu_hotplug.wq, &wait);
+#ifdef CONFIG_PREEMPT_RT_BASE
+ mutex_lock(&module_mutex);
+#endif
 }
 
 void cpu_hotplug_done(void)
 {
+#ifdef CONFIG_PREEMPT_RT_BASE
+ mutex_unlock(&module_mutex);
+#endif
  cpu_hotplug.active_writer = NULL;
  mutex_unlock(&cpu_hotplug.lock);
  cpuhp_lock_release();
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Mike Galbraith-5
In reply to this post by Mike Galbraith-5
On Fri, 2016-03-25 at 10:13 +0100, Mike Galbraith wrote:

> On Fri, 2016-03-25 at 09:52 +0100, Thomas Gleixner wrote:
> > On Fri, 25 Mar 2016, Mike Galbraith wrote:
> > > On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > > > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > > > >  
> > > > > > On the bright side, with the busted migrate enable business reverted,
> > > > > > plus one dinky change from me [1], master-rt.today has completed 100
> > > > > > iterations of Steven's hotplug stress script along side endless
> > > > > > futexstress, and is happily doing another 900 as I write this, so the
> > > > > > next -rt should finally be hotplug deadlock free.
> > > > > >
> > > > > > Thomas's state machinery seems to work wonders.  'course this being
> > > > > > hotplug, the other shoe will likely apply itself to my backside soon.
> > > > >
> > > > > That's a given :)
> > > >
> > > > blk-mq applied it shortly after I was satisfied enough to poke xmit.
> > >
> > > The other shoe is that notifiers can depend upon RCU grace periods, so
> > > when pin_current_cpu() snags rcu_sched, the hotplug game is over.
> > >
> > > blk_mq_queue_reinit_notify:
> > >         /*
> > >          * We need to freeze and reinit all existing queues.  Freezing
> > >          * involves synchronous wait for an RCU grace period and doing it
> > >          * one by one may take a long time.  Start freezing all queues in
> > >          * one swoop and then wait for the completions so that freezing can
> > >          * take place in parallel.
> > >          */
> > >         list_for_each_entry(q, &all_q_list, all_q_node)
> > >                 blk_mq_freeze_queue_start(q);
> > >         list_for_each_entry(q, &all_q_list, all_q_node) {
> > >                 blk_mq_freeze_queue_wait(q);
> >
> > Yeah, I stumbled over that already when analysing all the hotplug notifier
> > sites. That's definitely a horrible one.
> >  
> > > Hohum (sharpens rock), next.
> >
> > /me recommends frozen sharks
>
> With the sharp rock below and the one I'll follow up with, master-rt on
> my DL980 just passed 3 hours of endless hotplug stress concurrent with
> endless tbench 8, stockfish and futextest.  It has never survived this
> long with this load by a long shot.

I knew it was unlikely to surrender that quickly.  Oh well, on the
bright side it seems to be running low on deadlocks.

        Happy Easter,

        -Mike

(bite me beast, 666 indeed)

[26666.886077] ------------[ cut here ]------------
[26666.886078] kernel BUG at kernel/sched/core.c:1717!
[26666.886081] invalid opcode: 0000 [#1] PREEMPT SMP
[26666.886094] Dumping ftrace buffer:
[26666.886112]    (ftrace buffer empty)
[26666.886137] Modules linked in: autofs4 edd af_packet cpufreq_conservative cpufreq_ondemand cpufreq_userspace cpufreq_powersave fuse loop md_mod dm_mod vhost_net macvtap macvlan vhost tun ipmi_ssif kvm_intel kvm joydev hid_generic sr_m
od cdrom sg shpchp netxen_nic hpwdt hpilo ipmi_si ipmi_msghandler irqbypass bnx2 iTCO_wdt iTCO_vendor_support gpio_ich pcc_cpufreq fjes i7core_edac edac_core lpc_ich pcspkr 8250_fintek ehci_pci acpi_cpufreq acpi_power_meter button ext4 m
bcache jbd2 crc16 usbhid uhci_hcd ehci_hcd sd_mod usbcore usb_common thermal processor scsi_dh_hp_sw scsi_dh_emc scsi_dh_rdac scsi_dh_alua ata_generic ata_piix libata hpsa scsi_transport_sas cciss scsi_mod
[26666.886140] CPU: 2 PID: 41 Comm: migration/2 Not tainted 4.6.0-rt11 #69
[26666.886140] Hardware name: Hewlett-Packard ProLiant DL980 G7, BIOS P66 07/07/2010
[26666.886142] task: ffff88017e34e580 ti: ffff88017e394000 task.ti: ffff88017e394000
[26666.886149] RIP: 0010:[<ffffffff810a6f5c>]  [<ffffffff810a6f5c>] select_fallback_rq+0x19c/0x1d0
[26666.886149] RSP: 0018:ffff88017e397d28  EFLAGS: 00010046
[26666.886150] RAX: 0000000000000100 RBX: ffff88017e668348 RCX: 0000000000000003
[26666.886151] RDX: 0000000000000100 RSI: 0000000000000100 RDI: ffffffff81811420
[26666.886152] RBP: ffff88017e668000 R08: 0000000000000003 R09: 0000000000000000
[26666.886153] R10: ffff8802772b3ec0 R11: 0000000000000001 R12: 0000000000000002
[26666.886153] R13: 0000000000000002 R14: ffff88017e398000 R15: ffff88017e668000
[26666.886155] FS:  0000000000000000(0000) GS:ffff880276680000(0000) knlGS:0000000000000000
[26666.886156] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[26666.886156] CR2: 0000000000695c5c CR3: 0000000271419000 CR4: 00000000000006e0
[26666.886157] Stack:
[26666.886159]  ffff880276696900 ffff880276696900 ffff88017e668808 0000000000016900
[26666.886160]  ffffffff810a88f9 ffff88017e398000 ffff88017e398000 0000000000000046
[26666.886161]  ffff88017e34e580 00000000fffffff7 ffffffff81c5be90 0000000000000000
[26666.886162] Call Trace:
[26666.886166]  [<ffffffff810a88f9>] ? migration_call+0x1b9/0x3b0
[26666.886168]  [<ffffffff8109d724>] ? notifier_call_chain+0x44/0x70
[26666.886171]  [<ffffffff8107d430>] ? notify_online+0x20/0x20
[26666.886172]  [<ffffffff8107d381>] ? __cpu_notify+0x31/0x50
[26666.886173]  [<ffffffff8107d448>] ? notify_dying+0x18/0x20
[26666.886175]  [<ffffffff8107dfbf>] ? cpuhp_invoke_callback+0x3f/0x150
[26666.886178]  [<ffffffff8111ed01>] ? cpu_stop_should_run+0x11/0x50
[26666.886180]  [<ffffffff8107e6a2>] ? take_cpu_down+0x52/0x80
[26666.886181]  [<ffffffff8111ee7a>] ? multi_cpu_stop+0x9a/0xc0
[26666.886182]  [<ffffffff8111ede0>] ? cpu_stop_queue_work+0x80/0x80
[26666.886184]  [<ffffffff8111f078>] ? cpu_stopper_thread+0x88/0x120
[26666.886186]  [<ffffffff8109fede>] ? smpboot_thread_fn+0x14e/0x270
[26666.886188]  [<ffffffff8109fd90>] ? smpboot_update_cpumask_percpu_thread+0x130/0x130
[26666.886192]  [<ffffffff8109c68d>] ? kthread+0xbd/0xe0
[26666.886196]  [<ffffffff816097c2>] ? ret_from_fork+0x22/0x40
[26666.886198]  [<ffffffff8109c5d0>] ? kthread_worker_fn+0x160/0x160
[26666.886211] Code: 06 00 00 44 89 e9 48 c7 c7 f8 72 9f 81 31 c0 e8 fd 0a 0e 00 e9 32 ff ff ff 41 83 fc 01 74 21 72 0c 41 83 fc 02 0f 85 33 ff ff ff <0f> 0b 48 89 ef 41 bc 01 00 00 00 e8 54 5c 07 00 e9 1e ff ff ff
[26666.886212] RIP  [<ffffffff810a6f5c>] select_fallback_rq+0x19c/0x1d0
[26666.886213]  RSP <ffff88017e397d28>
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Mike Galbraith-5
On Fri, 2016-03-25 at 17:24 +0100, Mike Galbraith wrote:

> On Fri, 2016-03-25 at 10:13 +0100, Mike Galbraith wrote:
> > On Fri, 2016-03-25 at 09:52 +0100, Thomas Gleixner wrote:
> > > On Fri, 25 Mar 2016, Mike Galbraith wrote:
> > > > On Thu, 2016-03-24 at 12:06 +0100, Mike Galbraith wrote:
> > > > > On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:
> > > > > >  
> > > > > > > On the bright side, with the busted migrate enable business reverted,
> > > > > > > plus one dinky change from me [1], master-rt.today has completed 100
> > > > > > > iterations of Steven's hotplug stress script along side endless
> > > > > > > futexstress, and is happily doing another 900 as I write this, so the
> > > > > > > next -rt should finally be hotplug deadlock free.
> > > > > > >
> > > > > > > Thomas's state machinery seems to work wonders.  'course this being
> > > > > > > hotplug, the other shoe will likely apply itself to my backside soon.
> > > > > >
> > > > > > That's a given :)
> > > > >
> > > > > blk-mq applied it shortly after I was satisfied enough to poke xmit.
> > > >
> > > > The other shoe is that notifiers can depend upon RCU grace periods, so
> > > > when pin_current_cpu() snags rcu_sched, the hotplug game is over.
> > > >
> > > > blk_mq_queue_reinit_notify:
> > > >         /*
> > > >          * We need to freeze and reinit all existing queues.  Freezing
> > > >          * involves synchronous wait for an RCU grace period and doing it
> > > >          * one by one may take a long time.  Start freezing all queues in
> > > >          * one swoop and then wait for the completions so that freezing can
> > > >          * take place in parallel.
> > > >          */
> > > >         list_for_each_entry(q, &all_q_list, all_q_node)
> > > >                 blk_mq_freeze_queue_start(q);
> > > >         list_for_each_entry(q, &all_q_list, all_q_node) {
> > > >                 blk_mq_freeze_queue_wait(q);
> > >
> > > Yeah, I stumbled over that already when analysing all the hotplug notifier
> > > sites. That's definitely a horrible one.
> > >  
> > > > Hohum (sharpens rock), next.
> > >
> > > /me recommends frozen sharks
> >
> > With the sharp rock below and the one I'll follow up with, master-rt on
> > my DL980 just passed 3 hours of endless hotplug stress concurrent with
> > endless tbench 8, stockfish and futextest.  It has never survived this
> > long with this load by a long shot.
>
> I knew it was unlikely to surrender that quickly.  Oh well, on the
> bright side it seems to be running low on deadlocks.

The immunize rcu_sched rock did that btw.  Having accidentally whacked
the dump, I got to reproduce (took 30.03 hours) so I could analyze it.

Hohum, notifier woes definitely require somewhat sharper rocks.

I could make rcu_sched dodge the migration thread, but think I'll apply
frozen shark to blk-mq instead.

        -Mike

(a clever person would wait for Sir Thomas, remaining blissfully
ignorant of the gory dragon slaying details, but whatever, premature
testing and rt mole whacking may turn up something interesting, ya
never know)
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Mike Galbraith-5
In reply to this post by Thomas Gleixner
On Thu, 2016-03-24 at 11:44 +0100, Thomas Gleixner wrote:

> I really wonder what makes the change. The only thing which comes to my mind
> is the enforcement of running the online and down_prepare callbacks on the
> plugged cpu instead of doing it wherever the scheduler decides to run it.

It seems it's not the state machinery making a difference after all,
the only two deadlocks encountered in oodles of beating seem to boil
down to the grab_lock business being a pistol aimed at our own toes.

1. kernfs_mutex taken during hotplug: We don't pin across mutex
acquisition, so anyone grabbing it and then calling migrate_disable()
while grab_lock is set renders us dead.  Pin across acquisition of that
specific mutex fixes that specific grab_lock instigated deadlock.

2. notifier dependency upon RCU GP threads:  Telling same to always do
migrate_me() or hotplug can bloody well wait fixes that specific
grab_lock instigated deadlock.

With those two little hacks, all of my boxen including DL980 just keep
on chugging away in 4.[456]-rt, showing zero inclination to identify
any more hotplug bandits.

What I like much better than 1 + 2 is their sum, which would generate
minus signs, my favorite thing in patches, and fix the two above and
anything that resembles them in any way...

3. nuke irksome grab_lock: make everybody always try to get the hell
outta Dodge or hotplug can bloody well wait.

I haven't yet flogged my 64 core box doing that, but my local boxen
seem to be saying we don't really really need the grab_lock business.

Are my boxen fibbing, is that very attractive looking door #3 a trap?

        -Mike

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Sebastian Andrzej Siewior-4
* Mike Galbraith | 2016-03-31 08:31:43 [+0200]:

>3. nuke irksome grab_lock: make everybody always try to get the hell
>outta Dodge or hotplug can bloody well wait.
>
>I haven't yet flogged my 64 core box doing that, but my local boxen
>seem to be saying we don't really really need the grab_lock business.
>
>Are my boxen fibbing, is that very attractive looking door #3 a trap?

By the time I improved hotplug I played with this. I had a few ideas but
it didn't fly in the end. Today however I ended up with this:

--

Subject: [PATCH] kernel: hotplug: migrate to another CPU if the current is
 going away

If CPU X is going down then every task running on that CPU must go away.
This is achieved by setting hp->grab_lock to force the task to block
on hp->lock and schedule away in migrate_disable() before the task is
pinned to the CPU.
The task blocks on the lock until the CPU is down. If the task holds any
resources (locks) that are required by one of the CPU notifiers then we
will deadlock here.
One potential deadlock is the blk_mq CPU notifier (blk_mq_queue_reinit_notify())
because it waits for the RCU grace period to advance/complete,
which is stuck on hp->lock and won't make any progress.
Mike identified another candidate "thermal_throttle_cpu_callback() ->
kernfs_find_and_get_ns()" which blocks on kernfs_mutex that is held
by udev via load_module() and blocks on hp->lock.

So instead of getting tasks off the CPU by blocking on a lock I attempt
to get off the CPU by masking it out from the CPU mask. The ->mig_away
flag is used primarily to avoid a blk_flush_plug_list() invocation via
schedule() and a deadlock on cpumask_lock() (because the first spinlock
will invoke migrate_disable()).

Signed-off-by: Sebastian Andrzej Siewior <[hidden email]>
---
 include/linux/sched.h |  2 +-
 kernel/cpu.c          | 37 +++++++++++++++++++++++++++++++++++--
 kernel/sched/core.c   |  3 +++
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 58c5ec8c3742..d0ba00b9aff4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1492,7 +1492,7 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
  unsigned brk_randomized:1;
 #endif
-
+ unsigned mig_away :1;
  unsigned long atomic_flags; /* Flags needing atomic access. */
 
  struct restart_block restart_block;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8edd3c716092..3a1ee02ba3ab 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -30,6 +30,10 @@
 /* Serializes the updates to cpu_online_mask, cpu_present_mask */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
+static DEFINE_SPINLOCK(cpumask_lock);
+static cpumask_var_t mig_cpumask;
+static cpumask_var_t mig_cpumask_org;
+
 /*
  * The following two APIs (cpu_maps_update_begin/done) must be used when
  * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
@@ -120,6 +124,8 @@ struct hotplug_pcp {
  * state.
  */
  spinlock_t lock;
+ cpumask_var_t cpumask;
+ cpumask_var_t cpumask_org;
 #else
  struct mutex mutex;
 #endif
@@ -158,9 +164,30 @@ void pin_current_cpu(void)
  return;
  }
  if (hp->grab_lock) {
+ int cpu;
+
+ cpu = smp_processor_id();
  preempt_enable();
- hotplug_lock(hp);
- hotplug_unlock(hp);
+ if (cpu != raw_smp_processor_id())
+ goto retry;
+
+ current->mig_away = 1;
+ rt_spin_lock__no_mg(&cpumask_lock);
+
+ /* DOWN */
+ cpumask_copy(mig_cpumask_org, tsk_cpus_allowed(current));
+ cpumask_andnot(mig_cpumask, cpu_online_mask, cpumask_of(cpu));
+ set_cpus_allowed_ptr(current, mig_cpumask);
+
+ if (cpu == raw_smp_processor_id()) {
+ /* BAD */
+ hotplug_lock(hp);
+ hotplug_unlock(hp);
+ }
+ set_cpus_allowed_ptr(current, mig_cpumask_org);
+ current->mig_away = 0;
+ rt_spin_unlock__no_mg(&cpumask_lock);
+
  } else {
  preempt_enable();
  /*
@@ -800,7 +827,13 @@ static struct notifier_block smpboot_thread_notifier = {
 
 void smpboot_thread_init(void)
 {
+ bool ok;
+
  register_cpu_notifier(&smpboot_thread_notifier);
+ ok = alloc_cpumask_var(&mig_cpumask, GFP_KERNEL);
+ BUG_ON(!ok);
+ ok = alloc_cpumask_var(&mig_cpumask_org, GFP_KERNEL);
+ BUG_ON(!ok);
 }
 
 /* Requires cpu_add_remove_lock to be held */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94827a59301e..ab06be452eb7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3369,6 +3369,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
 {
  if (!tsk->state)
  return;
+
+ if (tsk->mig_away)
+ return;
  /*
  * If a worker went to sleep, notify and ask workqueue whether
  * it wants to wake up a task to maintain concurrency.
--
2.8.0.rc3

Sebastian
Reply | Threaded
Open this post in threaded view
|

Re: [PATCH RT 4/6] rt/locking: Reenable migration accross schedule

Mike Galbraith-5
On Fri, 2016-04-01 at 23:11 +0200, Sebastian Andrzej Siewior wrote:

> * Mike Galbraith | 2016-03-31 08:31:43 [+0200]:
>
> > 3. nuke irksome grab_lock: make everybody always try to get the hell
> > outta Dodge or hotplug can bloody well wait.
> >
> > I haven't yet flogged my 64 core box doing that, but my local boxen
> > seem to be saying we don't really really need the grab_lock business.
> >
> > Are my boxen fibbing, is that very attractive looking door #3 a trap?
>
> By the time I improved hotplug I played with this. I had a few ideas but
> it didn't fly in the end. Today however I ended up with this:

Yeah, but that fails the duct tape test too.  Mine is below, and is the
extra sticky variety ;-)  With busted 0299 patch reverted and those two
applied, my DL980 took a beating for ~36 hours before I aborted it.. ie
hotplug road seemingly has no more -rt specific potholes.

If that lock dies, we can unpin when entering lock slow path and pin
again post acquisition with no ABBA worries as well, and not only does
existing hotplug work heaping truckloads better, -rt can perhaps help
spot trouble as the rewrite proceeds.

Current state is more broken than ever.. if that's possible.

        -Mike

hotplug/rt: Do not let pin_current_cpu() block RCU grace periods

Notifiers may depend upon grace periods continuing to advance
as blk_mq_queue_reinit_notify() below.

crash> bt ffff8803aee76400
PID: 1113   TASK: ffff8803aee76400  CPU: 0   COMMAND: "stress-cpu-hotp"
 #0 [ffff880396fe7ad8] __schedule at ffffffff816b7142
 #1 [ffff880396fe7b28] schedule at ffffffff816b797b
 #2 [ffff880396fe7b48] blk_mq_freeze_queue_wait at ffffffff8135c5ac
 #3 [ffff880396fe7b80] blk_mq_queue_reinit_notify at ffffffff8135f819
 #4 [ffff880396fe7b98] notifier_call_chain at ffffffff8109b8ed
 #5 [ffff880396fe7bd8] __raw_notifier_call_chain at ffffffff8109b91e
 #6 [ffff880396fe7be8] __cpu_notify at ffffffff81072825
 #7 [ffff880396fe7bf8] cpu_notify_nofail at ffffffff81072b15
 #8 [ffff880396fe7c08] notify_dead at ffffffff81072d06
 #9 [ffff880396fe7c38] cpuhp_invoke_callback at ffffffff81073718
#10 [ffff880396fe7c78] cpuhp_down_callbacks at ffffffff81073a70
#11 [ffff880396fe7cb8] _cpu_down at ffffffff816afc71
#12 [ffff880396fe7d38] do_cpu_down at ffffffff8107435c
#13 [ffff880396fe7d60] cpu_down at ffffffff81074390
#14 [ffff880396fe7d70] cpu_subsys_offline at ffffffff814cd854
#15 [ffff880396fe7d80] device_offline at ffffffff814c7cda
#16 [ffff880396fe7da8] online_store at ffffffff814c7dd0
#17 [ffff880396fe7dd0] dev_attr_store at ffffffff814c4fc8
#18 [ffff880396fe7de0] sysfs_kf_write at ffffffff812cfbe4
#19 [ffff880396fe7e08] kernfs_fop_write at ffffffff812cf172
#20 [ffff880396fe7e50] __vfs_write at ffffffff81241428
#21 [ffff880396fe7ed0] vfs_write at ffffffff81242535
#22 [ffff880396fe7f10] sys_write at ffffffff812438f9
#23 [ffff880396fe7f50] entry_SYSCALL_64_fastpath at ffffffff816bb4bc
    RIP: 00007fafd918acd0  RSP: 00007ffd2ca956e8  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 000000000226a770  RCX: 00007fafd918acd0
    RDX: 0000000000000002  RSI: 00007fafd9cb9000  RDI: 0000000000000001
    RBP: 00007ffd2ca95700   R8: 000000000000000a   R9: 00007fafd9cb3700
    R10: 00000000ffffffff  R11: 0000000000000246  R12: 0000000000000007
    R13: 0000000000000001  R14: 0000000000000009  R15: 000000000000000a
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b

blk_mq_queue_reinit_notify:
        /*
         * We need to freeze and reinit all existing queues.  Freezing
         * involves synchronous wait for an RCU grace period and doing it
         * one by one may take a long time.  Start freezing all queues in
         * one swoop and then wait for the completions so that freezing can
         * take place in parallel.
         */
        list_for_each_entry(q, &all_q_list, all_q_node)
                blk_mq_freeze_queue_start(q);
        list_for_each_entry(q, &all_q_list, all_q_node) {
                blk_mq_freeze_queue_wait(q);

crash> bt ffff880176cc9900
PID: 17     TASK: ffff880176cc9900  CPU: 0   COMMAND: "rcu_sched"
 #0 [ffff880176cd7ab8] __schedule at ffffffff816b7142
 #1 [ffff880176cd7b08] schedule at ffffffff816b797b
 #2 [ffff880176cd7b28] rt_spin_lock_slowlock at ffffffff816b974d
 #3 [ffff880176cd7bc8] rt_spin_lock_fastlock at ffffffff811b0f3c
 #4 [ffff880176cd7be8] rt_spin_lock__no_mg at ffffffff816bac1b
 #5 [ffff880176cd7c08] pin_current_cpu at ffffffff8107406a
 #6 [ffff880176cd7c50] migrate_disable at ffffffff810a0e9e
 #7 [ffff880176cd7c70] rt_spin_lock at ffffffff816bad69
 #8 [ffff880176cd7c90] lock_timer_base at ffffffff810fc5e8
 #9 [ffff880176cd7cc8] try_to_del_timer_sync at ffffffff810fe290
#10 [ffff880176cd7cf0] del_timer_sync at ffffffff810fe381
#11 [ffff880176cd7d58] schedule_timeout at ffffffff816b9e4b
#12 [ffff880176cd7df0] rcu_gp_kthread at ffffffff810f52b4
#13 [ffff880176cd7e70] kthread at ffffffff8109a02f
#14 [ffff880176cd7f50] ret_from_fork at ffffffff816bb6f2

Game Over.

Signed-off-by: Mike Galbraith <[hidden email]>
---
 include/linux/sched.h |    1 +
 kernel/cpu.c          |    7 ++++---
 kernel/rcu/tree.c     |    3 +++
 3 files changed, 8 insertions(+), 3 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1492,6 +1492,7 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
  unsigned brk_randomized:1;
 #endif
+ unsigned sched_is_rcu:1; /* RT: is a critical RCU thread */
 
  unsigned long atomic_flags; /* Flags needing atomic access. */
 
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -147,17 +147,18 @@ static DEFINE_PER_CPU(struct hotplug_pcp
 void pin_current_cpu(void)
 {
  struct hotplug_pcp *hp;
+ struct task_struct *p = current;
  int force = 0;
 
 retry:
  hp = this_cpu_ptr(&hotplug_pcp);
 
  if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
-    hp->unplug == current) {
+    hp->unplug == p) {
  hp->refcount++;
  return;
  }
- if (hp->grab_lock) {
+ if (hp->grab_lock && !p->sched_is_rcu) {
  preempt_enable();
  hotplug_lock(hp);
  hotplug_unlock(hp);
@@ -169,7 +170,7 @@ void pin_current_cpu(void)
  if (!migrate_me()) {
  preempt_disable();
  hp = this_cpu_ptr(&hotplug_pcp);
- if (!hp->grab_lock) {
+ if (!hp->grab_lock || (p->sched_is_rcu && p->state)) {
  /*
  * Just let it continue it's already pinned
  * or about to sleep.
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2100,6 +2100,9 @@ static int __noreturn rcu_gp_kthread(voi
  struct rcu_state *rsp = arg;
  struct rcu_node *rnp = rcu_get_root(rsp);
 
+ /* RT: pin_current_cpu() MUST NOT block RCU grace periods. */
+ current->sched_is_rcu = 1;
+
  rcu_bind_gp_kthread();
  for (;;) {
 

hotplug/rt/kernfs: Put kernfs_mutex under migrate_disable()/migrate_enable()

Hotplug path takes kernfs_mutex, so let pin_current_cpu() snag callers
before they can deadlock us.  Below, systemd-udevd was snagged while
acquiring a spinlock, while already holding kernfs_mutex.

PID: 11107  TASK: ffff8803b12b9900  CPU: 4   COMMAND: "stress-cpu-hotp"                                                                                                                                                                      
 #0 [ffff88038b34f9b8] __schedule at ffffffff816b7132
 #1 [ffff88038b34fa08] schedule at ffffffff816b796b
 #2 [ffff88038b34fa28] rt_mutex_slowlock at ffffffff816b93ee
 #3 [ffff88038b34fac8] rt_mutex_fastlock at ffffffff811b0e9d
 #4 [ffff88038b34faf0] rt_mutex_lock at ffffffff816b95c8
 #5 [ffff88038b34fb08] _mutex_lock at ffffffff816baf59
 #6 [ffff88038b34fb28] kernfs_find_and_get_ns at ffffffff812cd573
 #7 [ffff88038b34fb50] sysfs_remove_group at ffffffff812d100a
 #8 [ffff88038b34fb78] thermal_throttle_cpu_callback at ffffffff81036ab9
 #9 [ffff88038b34fb98] notifier_call_chain at ffffffff8109b8dd
#10 [ffff88038b34fbd8] __raw_notifier_call_chain at ffffffff8109b90e
#11 [ffff88038b34fbe8] __cpu_notify at ffffffff81072825
#12 [ffff88038b34fbf8] cpu_notify_nofail at ffffffff81072b15
#13 [ffff88038b34fc08] notify_dead at ffffffff81072d06
#14 [ffff88038b34fc38] cpuhp_invoke_callback at ffffffff81073718
#15 [ffff88038b34fc78] cpuhp_down_callbacks at ffffffff81073a70
#16 [ffff88038b34fcb8] _cpu_down at ffffffff816afc61
#17 [ffff88038b34fd38] do_cpu_down at ffffffff8107434c
#18 [ffff88038b34fd60] cpu_down at ffffffff81074380
#19 [ffff88038b34fd70] cpu_subsys_offline at ffffffff814cd844
#20 [ffff88038b34fd80] device_offline at ffffffff814c7cca
#21 [ffff88038b34fda8] online_store at ffffffff814c7dc0
#22 [ffff88038b34fdd0] dev_attr_store at ffffffff814c4fb8
#23 [ffff88038b34fde0] sysfs_kf_write at ffffffff812cfbd4
#24 [ffff88038b34fe08] kernfs_fop_write at ffffffff812cf162
#25 [ffff88038b34fe50] __vfs_write at ffffffff81241418
#26 [ffff88038b34fed0] vfs_write at ffffffff81242525
#27 [ffff88038b34ff10] sys_write at ffffffff812438e9
#28 [ffff88038b34ff50] entry_SYSCALL_64_fastpath at ffffffff816bb4fc
    RIP: 00007f05f3d69cd0  RSP: 00007ffdfc934468  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000001908770  RCX: 00007f05f3d69cd0
    RDX: 0000000000000002  RSI: 00007f05f4898000  RDI: 0000000000000001
    RBP: 00007ffdfc934480   R8: 000000000000000a   R9: 00007f05f4892700
    R10: 00000000ffffffff  R11: 0000000000000246  R12: 0000000000000007
    R13: 0000000000000001  R14: 0000000000000009  R15: 000000000000000a
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b

stress-cpu-hotp blocks on kernfs_mutex, held by systemd-udevd..

crash> bt ffff8803b12bcb00
PID: 11130  TASK: ffff8803b12bcb00  CPU: 6   COMMAND: "systemd-udevd"
 #0 [ffff88038b327a18] __schedule at ffffffff816b7132
 #1 [ffff88038b327a68] schedule at ffffffff816b796b
 #2 [ffff88038b327a88] rt_spin_lock_slowlock at ffffffff816b9750
 #3 [ffff88038b327b30] rt_spin_lock_fastlock at ffffffff811b0f2c
 #4 [ffff88038b327b50] rt_spin_lock__no_mg at ffffffff816bac7b
 #5 [ffff88038b327b70] pin_current_cpu at ffffffff8107406a
 #6 [ffff88038b327bb8] migrate_disable at ffffffff810a0e8e
 #7 [ffff88038b327bd8] rt_spin_lock at ffffffff816badc9
 #8 [ffff88038b327bf8] ida_simple_remove at ffffffff8138765c
 #9 [ffff88038b327c18] kernfs_put at ffffffff812ccc58
#10 [ffff88038b327c60] __kernfs_remove at ffffffff812cd15c
#11 [ffff88038b327cc0] kernfs_remove_by_name_ns at ffffffff812ce2f3
#12 [ffff88038b327ce8] sysfs_remove_link at ffffffff812d05e9
#13 [ffff88038b327cf8] free_module at ffffffff8111c8f2
#14 [ffff88038b327d30] do_init_module at ffffffff811b157f
#15 [ffff88038b327d58] load_module at ffffffff8111f11b
#16 [ffff88038b327e98] SYSC_finit_module at ffffffff8111faf9
#17 [ffff88038b327f40] sys_finit_module at ffffffff8111fb3e
#18 [ffff88038b327f50] entry_SYSCALL_64_fastpath at ffffffff816bb4fc
    RIP: 00007f75d9925f79  RSP: 00007ffd1c040ed8  RFLAGS: 00000246
    RAX: ffffffffffffffda  RBX: 0000000001d368e0  RCX: 00007f75d9925f79
    RDX: 0000000000000000  RSI: 00007f75da0233c1  RDI: 0000000000000008
    RBP: 0000000000000008   R8: 0000000000000000   R9: 0000000001d39c82
    R10: 0000000000000008  R11: 0000000000000246  R12: 00007ffd1c03ff00
    R13: 00007ffd1c03fee0  R14: 0000000000000005  R15: 000000000aba9500
    ORIG_RAX: 0000000000000139  CS: 0033  SS: 002b

..which stress-cpu-hotp has blocked via pin_current_cpu().  Game Over.

Signed-off-by: Mike Galbraith <[hidden email]>
---
 fs/kernfs/dir.c             |   56 ++++++++++++++++++++++----------------------
 fs/kernfs/file.c            |    4 +--
 fs/kernfs/inode.c           |   20 +++++++--------
 fs/kernfs/kernfs-internal.h |   15 +++++++++++
 fs/kernfs/mount.c           |   16 ++++++------
 fs/kernfs/symlink.c         |    4 +--
 6 files changed, 65 insertions(+), 50 deletions(-)

--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -372,7 +372,7 @@ static void kernfs_drain(struct kernfs_n
  lockdep_assert_held(&kernfs_mutex);
  WARN_ON_ONCE(kernfs_active(kn));
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  if (kernfs_lockdep(kn)) {
  rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
@@ -391,7 +391,7 @@ static void kernfs_drain(struct kernfs_n
 
  kernfs_unmap_bin_file(kn);
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 }
 
 /**
@@ -471,7 +471,7 @@ static int kernfs_dop_revalidate(struct
  goto out_bad_unlocked;
 
  kn = dentry->d_fsdata;
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  /* The kernfs node has been deactivated */
  if (!kernfs_active(kn))
@@ -490,10 +490,10 @@ static int kernfs_dop_revalidate(struct
     kernfs_info(dentry->d_sb)->ns != kn->ns)
  goto out_bad;
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return 1;
 out_bad:
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 out_bad_unlocked:
  return 0;
 }
@@ -603,7 +603,7 @@ int kernfs_add_one(struct kernfs_node *k
  bool has_ns;
  int ret;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  ret = -EINVAL;
  has_ns = kernfs_ns_enabled(parent);
@@ -634,7 +634,7 @@ int kernfs_add_one(struct kernfs_node *k
  ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
  }
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  /*
  * Activate the new node unless CREATE_DEACTIVATED is requested.
@@ -648,7 +648,7 @@ int kernfs_add_one(struct kernfs_node *k
  return 0;
 
 out_unlock:
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return ret;
 }
 
@@ -709,10 +709,10 @@ struct kernfs_node *kernfs_find_and_get_
 {
  struct kernfs_node *kn;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  kn = kernfs_find_ns(parent, name, ns);
  kernfs_get(kn);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  return kn;
 }
@@ -851,7 +851,7 @@ static struct dentry *kernfs_iop_lookup(
  struct inode *inode;
  const void *ns = NULL;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  if (kernfs_ns_enabled(parent))
  ns = kernfs_info(dir->i_sb)->ns;
@@ -876,7 +876,7 @@ static struct dentry *kernfs_iop_lookup(
  /* instantiate and hash dentry */
  ret = d_splice_alias(inode, dentry);
  out_unlock:
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return ret;
 }
 
@@ -1030,7 +1030,7 @@ void kernfs_activate(struct kernfs_node
 {
  struct kernfs_node *pos;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  pos = NULL;
  while ((pos = kernfs_next_descendant_post(pos, kn))) {
@@ -1044,7 +1044,7 @@ void kernfs_activate(struct kernfs_node
  pos->flags |= KERNFS_ACTIVATED;
  }
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 }
 
 static void __kernfs_remove(struct kernfs_node *kn)
@@ -1121,9 +1121,9 @@ static void __kernfs_remove(struct kernf
  */
 void kernfs_remove(struct kernfs_node *kn)
 {
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  __kernfs_remove(kn);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 }
 
 /**
@@ -1210,7 +1210,7 @@ bool kernfs_remove_self(struct kernfs_no
 {
  bool ret;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  kernfs_break_active_protection(kn);
 
  /*
@@ -1238,9 +1238,9 @@ bool kernfs_remove_self(struct kernfs_no
     atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
  break;
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  schedule();
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  }
  finish_wait(waitq, &wait);
  WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
@@ -1253,7 +1253,7 @@ bool kernfs_remove_self(struct kernfs_no
  */
  kernfs_unbreak_active_protection(kn);
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return ret;
 }
 
@@ -1277,13 +1277,13 @@ int kernfs_remove_by_name_ns(struct kern
  return -ENOENT;
  }
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  kn = kernfs_find_ns(parent, name, ns);
  if (kn)
  __kernfs_remove(kn);
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  if (kn)
  return 0;
@@ -1309,7 +1309,7 @@ int kernfs_rename_ns(struct kernfs_node
  if (!kn->parent)
  return -EINVAL;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  error = -ENOENT;
  if (!kernfs_active(kn) || !kernfs_active(new_parent) ||
@@ -1363,7 +1363,7 @@ int kernfs_rename_ns(struct kernfs_node
 
  error = 0;
  out:
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return error;
 }
 
@@ -1438,7 +1438,7 @@ static int kernfs_fop_readdir(struct fil
 
  if (!dir_emit_dots(file, ctx))
  return 0;
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  if (kernfs_ns_enabled(parent))
  ns = kernfs_info(dentry->d_sb)->ns;
@@ -1455,12 +1455,12 @@ static int kernfs_fop_readdir(struct fil
  file->private_data = pos;
  kernfs_get(pos);
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  if (!dir_emit(ctx, name, len, ino, type))
  return 0;
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  }
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  file->private_data = NULL;
  ctx->pos = INT_MAX;
  return 0;
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -830,7 +830,7 @@ static void kernfs_notify_workfn(struct
  spin_unlock_irq(&kernfs_open_node_lock);
 
  /* kick fsnotify */
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
 
  list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
  struct inode *inode;
@@ -851,7 +851,7 @@ static void kernfs_notify_workfn(struct
  iput(inode);
  }
 
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  kernfs_put(kn);
  goto repeat;
 }
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -103,9 +103,9 @@ int kernfs_setattr(struct kernfs_node *k
 {
  int ret;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  ret = __kernfs_setattr(kn, iattr);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return ret;
 }
 
@@ -118,7 +118,7 @@ int kernfs_iop_setattr(struct dentry *de
  if (!kn)
  return -EINVAL;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  error = inode_change_ok(inode, iattr);
  if (error)
  goto out;
@@ -131,7 +131,7 @@ int kernfs_iop_setattr(struct dentry *de
  setattr_copy(inode, iattr);
 
 out:
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return error;
 }
 
@@ -181,9 +181,9 @@ int kernfs_iop_setxattr(struct dentry *d
  if (error)
  return error;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  if (secdata)
  security_release_secctx(secdata, secdata_len);
@@ -273,9 +273,9 @@ int kernfs_iop_getattr(struct vfsmount *
  struct kernfs_node *kn = dentry->d_fsdata;
  struct inode *inode = d_inode(dentry);
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  kernfs_refresh_inode(kn, inode);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  generic_fillattr(inode, stat);
  return 0;
@@ -364,9 +364,9 @@ int kernfs_iop_permission(struct inode *
 
  kn = inode->i_private;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  kernfs_refresh_inode(kn, inode);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  return generic_permission(inode, mask);
 }
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -115,4 +115,19 @@ void kernfs_unmap_bin_file(struct kernfs
  */
 extern const struct inode_operations kernfs_symlink_iops;
 
+static inline void kernfs_mutex_lock(struct mutex *lock)
+{
+#ifdef CONFIG_PREEMPT_RT_FULL
+ migrate_disable();
+#endif
+ mutex_lock(lock);
+}
+
+static inline void kernfs_mutex_unlock(struct mutex *lock)
+{
+ mutex_unlock(lock);
+#ifdef CONFIG_PREEMPT_RT_FULL
+ migrate_enable();
+#endif
+}
 #endif /* __KERNFS_INTERNAL_H */
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -76,9 +76,9 @@ static int kernfs_fill_super(struct supe
  sb->s_time_gran = 1;
 
  /* get root inode, initialize and unlock it */
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  inode = kernfs_get_inode(sb, info->root->kn);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  if (!inode) {
  pr_debug("kernfs: could not get root inode\n");
  return -ENOMEM;
@@ -177,9 +177,9 @@ struct dentry *kernfs_mount_ns(struct fi
  }
  sb->s_flags |= MS_ACTIVE;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  list_add(&info->node, &root->supers);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  }
 
  return dget(sb->s_root);
@@ -198,9 +198,9 @@ void kernfs_kill_sb(struct super_block *
  struct kernfs_super_info *info = kernfs_info(sb);
  struct kernfs_node *root_kn = sb->s_root->d_fsdata;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  list_del(&info->node);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  /*
  * Remove the superblock from fs_supers/s_instances
@@ -228,7 +228,7 @@ struct super_block *kernfs_pin_sb(struct
  struct kernfs_super_info *info;
  struct super_block *sb = NULL;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  list_for_each_entry(info, &root->supers, node) {
  if (info->ns == ns) {
  sb = info->sb;
@@ -237,7 +237,7 @@ struct super_block *kernfs_pin_sb(struct
  break;
  }
  }
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
  return sb;
 }
 
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -105,9 +105,9 @@ static int kernfs_getlink(struct dentry
  struct kernfs_node *target = kn->symlink.target_kn;
  int error;
 
- mutex_lock(&kernfs_mutex);
+ kernfs_mutex_lock(&kernfs_mutex);
  error = kernfs_get_target_path(parent, target, path);
- mutex_unlock(&kernfs_mutex);
+ kernfs_mutex_unlock(&kernfs_mutex);
 
  return error;
 }
Reply | Threaded
Open this post in threaded view
|

[rfc patch 0/2] Kill hotplug_lock()/hotplug_unlock()

Mike Galbraith-5
This is my stab at whacking $subject without adding any duct tape.  My
desktop box has run a thousand iterations of Steven's hotplug stress
script accompanied by stockfish, futextest, hackbench and tbench to
give hotplug lots of opportunity to prove this broken.  So far, it has
failed.  64 core DL980 is nowhere near as quick at iteration increment,
but has been happily chugging along all the while.

Patches are against 4.6-rc2-rt13.  Barring any frozen projectiles
lobbed by either humans or hotplug, I'll see what a backport to 4.4-rt
would look like.  This one has lots of minus signs.

patch1:
 kernel/cpu.c |  267 ++++++++++-------------------------------------------------
 1 file changed, 47 insertions(+), 220 deletions(-)
patch2:
 kernel/locking/rtmutex.c |   37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)
123