[ANNOUNCE] 4.4.7-rt16

classic Classic list List threaded Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[ANNOUNCE] 4.4.7-rt16

Sebastian Andrzej Siewior
Dear RT folks!

I'm pleased to announce the v4.4.7-rt16 patch set.
Changes since v4.4.7-rt15:

- picked up a few fixes from upstream for panic() re-entrance from NMI.
  On -RT we have the same problem even without NMI, namely the soft/hard
  watchdog triggering panic().

- Don't take the port->lock on oops_in_progress. We had a trylock but
  that trylock does not work if invoked with IRQs off (like from the
  panic() caller). I am not very happy about this but if we keep it
  that way it would make sense to make a similar change for the other
  UART drivers…

- Rik van Riel and Clark Williams pointed out that a change made by
  Frederic Weisbecker in v4.5 could be backported and then we could
  remove some locking around vtime handling.

Known issues:
  - CPU hotplug got a little better but can deadlock.

The delta patch against 4.4.7-rt15 is appended below and can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.4/incr/patch-4.4.7-rt15-rt16.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.4.7-rt16

The RT patch against 4.4.7 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.4/patch-4.4.7-rt16.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.4/patches-4.4.7-rt16.tar.xz

Sebastian

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 697f90db0e37..424aec4a4c71 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
 #endif
 
  if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
 
  pr_emerg("Dazed and confused, but trying to continue\n");
 
@@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
  reason, smp_processor_id());
  show_regs(regs);
 
- if (panic_on_io_nmi)
- panic("NMI IOCK error: Not continuing");
+ if (panic_on_io_nmi) {
+ nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+ /*
+ * If we end up here, it means we have received an NMI while
+ * processing panic(). Simply return without delaying and
+ * re-enabling NMIs.
+ */
+ return;
+ }
 
  /* Re-enable the IOCK line, wait for a few seconds */
  reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
@@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 
  pr_emerg("Do you have a strange power saving mode enabled?\n");
  if (unknown_nmi_panic || panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ nmi_panic(regs, "NMI: Not continuing");
 
  pr_emerg("Dazed and confused, but trying to continue\n");
 }
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f660d63f40fe..8384207adde2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -726,6 +726,7 @@ static int crashing_cpu;
 static nmi_shootdown_cb shootdown_callback;
 
 static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
 
 static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 {
@@ -788,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 
  smp_send_nmi_allbutself();
 
+ /* Kick CPUs looping in NMI context. */
+ WRITE_ONCE(crash_ipi_issued, 1);
+
  msecs = 1000; /* Wait at most a second for the other cpus to stop */
  while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
  mdelay(1);
@@ -796,6 +800,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 
  /* Leave the nmi callback set */
 }
+
+/* Override the weak function in kernel/panic.c */
+void nmi_panic_self_stop(struct pt_regs *regs)
+{
+ while (1) {
+ /*
+ * Wait for the crash dumping IPI to be issued, and then
+ * call its callback directly.
+ */
+ if (READ_ONCE(crash_ipi_issued))
+ crash_nmi_callback(0, regs); /* Don't return */
+
+ cpu_relax();
+ }
+}
+
 #else /* !CONFIG_SMP */
 void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 {
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 91b831a1cc1c..a0b9e854672c 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2844,9 +2844,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
 
  serial8250_rpm_get(up);
 
- if (port->sysrq)
+ if (port->sysrq || oops_in_progress)
  locked = 0;
- else if (oops_in_progress || in_kdb_printk())
+ else if (in_kdb_printk())
  locked = spin_trylock_irqsave(&port->lock, flags);
  else
  spin_lock_irqsave(&port->lock, flags);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d8ec0f202eee..60fadde71a44 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -156,8 +156,7 @@ extern struct task_group root_task_group;
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk) \
- .vtime_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.vtime_lock), \
- .vtime_seq = SEQCNT_ZERO(tsk.vtime_seq), \
+ .vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount), \
  .vtime_snap = 0, \
  .vtime_snap_whence = VTIME_SYS,
 #else
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index c44e33a49a08..c84b10d6527d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -259,6 +259,7 @@ extern long (*panic_blink)(int state);
 __printf(1, 2)
 void panic(const char *fmt, ...)
  __noreturn __cold;
+void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
@@ -450,6 +451,14 @@ extern int sysctl_panic_on_stackoverflow;
 extern bool crash_kexec_post_notifiers;
 
 /*
+ * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
+ * holds a CPU number which is executing panic() currently. A value of
+ * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
+ */
+extern atomic_t panic_cpu;
+#define PANIC_CPU_INVALID -1
+
+/*
  * Only to be used by arch init code. If the user over-wrote the default
  * CONFIG_PANIC_TIMEOUT, honor it.
  */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 58c5ec8c3742..f9a0f2b540f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1539,12 +1539,14 @@ struct task_struct {
  cputime_t gtime;
  struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- raw_spinlock_t vtime_lock;
- seqcount_t vtime_seq;
+ seqcount_t vtime_seqcount;
  unsigned long long vtime_snap;
  enum {
- VTIME_SLEEPING = 0,
+ /* Task is sleeping or running in a CPU with VTIME inactive */
+ VTIME_INACTIVE = 0,
+ /* Task runs in userspace in a CPU with VTIME active */
  VTIME_USER,
+ /* Task runs in kernelspace in a CPU with VTIME active */
  VTIME_SYS,
  } vtime_snap_whence;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 46c1e8342ad8..4e93b4ea33f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1379,10 +1379,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- raw_spin_lock_init(&p->vtime_lock);
- seqcount_init(&p->vtime_seq);
+ seqcount_init(&p->vtime_seqcount);
  p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_SLEEPING;
+ p->vtime_snap_whence = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/panic.c b/kernel/panic.c
index 50d4ae2e7d1b..3535f802953a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
  cpu_relax();
 }
 
+/*
+ * Stop ourselves in NMI context if another CPU has already panicked. Arch code
+ * may override this to prepare for crash dumping, e.g. save regs info.
+ */
+void __weak nmi_panic_self_stop(struct pt_regs *regs)
+{
+ panic_smp_self_stop();
+}
+
+atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
+/*
+ * A variant of panic() called from NMI context. We return if we've already
+ * panicked on this CPU. If another CPU already panicked, loop in
+ * nmi_panic_self_stop() which can provide architecture dependent code such
+ * as saving register state for crash dump.
+ */
+void nmi_panic(struct pt_regs *regs, const char *msg)
+{
+ int old_cpu, cpu;
+
+ cpu = raw_smp_processor_id();
+ old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+
+ if (old_cpu == PANIC_CPU_INVALID)
+ panic("%s", msg);
+ else if (old_cpu != cpu)
+ nmi_panic_self_stop(regs);
+}
+EXPORT_SYMBOL(nmi_panic);
+
 /**
  * panic - halt the system
  * @fmt: The text string to print
@@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
  */
 void panic(const char *fmt, ...)
 {
- static DEFINE_SPINLOCK(panic_lock);
  static char buf[1024];
  va_list args;
  long i, i_next = 0;
  int state = 0;
+ int old_cpu, this_cpu;
 
  /*
  * Disable local interrupts. This will prevent panic_smp_self_stop
  * from deadlocking the first cpu that invokes the panic, since
  * there is nothing to prevent an interrupt handler (that runs
- * after the panic_lock is acquired) from invoking panic again.
+ * after setting panic_cpu) from invoking panic() again.
  */
  local_irq_disable();
 
@@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
  * multiple parallel invocations of panic, all other CPUs either
  * stop themself or will wait until they are stopped by the 1st CPU
  * with smp_send_stop().
+ *
+ * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
+ * comes here, so go ahead.
+ * `old_cpu == this_cpu' means we came from nmi_panic() which sets
+ * panic_cpu to this CPU.  In this case, this is also the 1st CPU.
  */
- if (!spin_trylock(&panic_lock))
+ this_cpu = raw_smp_processor_id();
+ old_cpu  = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
+
+ if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
  panic_smp_self_stop();
 
  console_verbose();
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index c45f4b026230..4611b1c1cb12 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -680,7 +680,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
  unsigned long long delta = vtime_delta(tsk);
 
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+ WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
  tsk->vtime_snap += delta;
 
  /* CHECKME: always safe to convert nsecs to cputime? */
@@ -696,45 +696,37 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
  __vtime_account_system(tsk);
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
  __vtime_account_system(tsk);
  if (context_tracking_in_user())
  tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_account_user(struct task_struct *tsk)
 {
  cputime_t delta_cpu;
 
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
  delta_cpu = get_vtime_delta(tsk);
  tsk->vtime_snap_whence = VTIME_SYS;
  account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
  __vtime_account_system(tsk);
  tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
@@ -746,23 +738,19 @@ void vtime_guest_enter(struct task_struct *tsk)
  * synchronization against the reader (task_gtime())
  * that can thus safely catch up with a tickless delta.
  */
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
  __vtime_account_system(tsk);
  current->flags |= PF_VCPU;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
- raw_spin_lock(&tsk->vtime_lock);
- write_seqcount_begin(&tsk->vtime_seq);
+ write_seqcount_begin(&tsk->vtime_seqcount);
  __vtime_account_system(tsk);
  current->flags &= ~PF_VCPU;
- write_seqcount_end(&tsk->vtime_seq);
- raw_spin_unlock(&tsk->vtime_lock);
+ write_seqcount_end(&tsk->vtime_seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
@@ -775,30 +763,26 @@ void vtime_account_idle(struct task_struct *tsk)
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
- raw_spin_lock(&prev->vtime_lock);
- write_seqcount_begin(&prev->vtime_seq);
- prev->vtime_snap_whence = VTIME_SLEEPING;
- write_seqcount_end(&prev->vtime_seq);
- raw_spin_unlock(&prev->vtime_lock);
+ write_seqcount_begin(&prev->vtime_seqcount);
+ prev->vtime_snap_whence = VTIME_INACTIVE;
+ write_seqcount_end(&prev->vtime_seqcount);
 
- raw_spin_lock(&current->vtime_lock);
- write_seqcount_begin(&current->vtime_seq);
+ write_seqcount_begin(&current->vtime_seqcount);
  current->vtime_snap_whence = VTIME_SYS;
  current->vtime_snap = sched_clock_cpu(smp_processor_id());
- write_seqcount_end(&current->vtime_seq);
- raw_spin_unlock(&current->vtime_lock);
+ write_seqcount_end(&current->vtime_seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
  unsigned long flags;
 
- raw_spin_lock_irqsave(&t->vtime_lock, flags);
- write_seqcount_begin(&t->vtime_seq);
+ local_irq_save(flags);
+ write_seqcount_begin(&t->vtime_seqcount);
  t->vtime_snap_whence = VTIME_SYS;
  t->vtime_snap = sched_clock_cpu(cpu);
- write_seqcount_end(&t->vtime_seq);
- raw_spin_unlock_irqrestore(&t->vtime_lock, flags);
+ write_seqcount_end(&t->vtime_seqcount);
+ local_irq_restore(flags);
 }
 
 cputime_t task_gtime(struct task_struct *t)
@@ -810,13 +794,13 @@ cputime_t task_gtime(struct task_struct *t)
  return t->gtime;
 
  do {
- seq = read_seqcount_begin(&t->vtime_seq);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
 
  gtime = t->gtime;
  if (t->flags & PF_VCPU)
  gtime += vtime_delta(t);
 
- } while (read_seqcount_retry(&t->vtime_seq, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 
  return gtime;
 }
@@ -839,7 +823,7 @@ fetch_task_cputime(struct task_struct *t,
  *udelta = 0;
  *sdelta = 0;
 
- seq = read_seqcount_begin(&t->vtime_seq);
+ seq = read_seqcount_begin(&t->vtime_seqcount);
 
  if (u_dst)
  *u_dst = *u_src;
@@ -847,7 +831,7 @@ fetch_task_cputime(struct task_struct *t,
  *s_dst = *s_src;
 
  /* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_SLEEPING ||
+ if (t->vtime_snap_whence == VTIME_INACTIVE ||
     is_idle_task(t))
  continue;
 
@@ -863,7 +847,7 @@ fetch_task_cputime(struct task_struct *t,
  if (t->vtime_snap_whence == VTIME_SYS)
  *sdelta = delta;
  }
- } while (read_seqcount_retry(&t->vtime_seq, seq));
+ } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 }
 
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d974121159ca..47d143740774 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -361,7 +361,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
 
  raw_spin_unlock(&watchdog_output_lock);
  if (hardlockup_panic)
- panic("Hard LOCKUP");
+ nmi_panic(regs, "Hard LOCKUP");
 
  __this_cpu_write(hard_watchdog_warn, true);
  return;
diff --git a/localversion-rt b/localversion-rt
index 18777ec0c27d..1199ebade17b 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt15
+-rt16