diff --git a/MAINTAINERS b/MAINTAINERS index f66488dfdbc9..3b035584272f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8621,7 +8621,7 @@ M: Mathieu Desnoyers M: "Paul E. McKenney" L: linux-kernel@vger.kernel.org S: Supported -F: kernel/membarrier.c +F: kernel/sched/membarrier.c F: include/uapi/linux/membarrier.h MEMORY MANAGEMENT diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h index a40b9fc0c6c3..718ac0b64adf 100644 --- a/arch/alpha/include/asm/spinlock.h +++ b/arch/alpha/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_spin_is_locked(x) ((x)->lock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_spin_value_unlocked(arch_spinlock_t lock) { return lock.lock == 0; diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h index 233d5ffe6ec7..a325e6a36523 100644 --- a/arch/arc/include/asm/spinlock.h +++ b/arch/arc/include/asm/spinlock.h @@ -16,11 +16,6 @@ #define arch_spin_is_locked(x) ((x)->slock != __ARCH_SPIN_LOCK_UNLOCKED__) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #ifdef CONFIG_ARC_HAS_LLSC static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h index 4bec45442072..c030143c18c6 100644 --- a/arch/arm/include/asm/spinlock.h +++ b/arch/arm/include/asm/spinlock.h @@ -52,22 +52,6 @@ static inline void dsb_sev(void) * memory. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u16 owner = READ_ONCE(lock->tickets.owner); - - for (;;) { - arch_spinlock_t tmp = READ_ONCE(*lock); - - if (tmp.tickets.owner == tmp.tickets.next || - tmp.tickets.owner != owner) - break; - - wfe(); - } - smp_acquire__after_ctrl_dep(); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index cae331d553f8..f445bd7f2b9f 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,58 +26,6 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - unsigned int tmp; - arch_spinlock_t lockval; - u32 owner; - - /* - * Ensure prior spin_lock operations to other locks have completed - * on this CPU before we test whether "lock" is locked. - */ - smp_mb(); - owner = READ_ONCE(lock->owner) << 16; - - asm volatile( -" sevl\n" -"1: wfe\n" -"2: ldaxr %w0, %2\n" - /* Is the lock free? */ -" eor %w1, %w0, %w0, ror #16\n" -" cbz %w1, 3f\n" - /* Lock taken -- has there been a subsequent unlock->lock transition? */ -" eor %w1, %w3, %w0, lsl #16\n" -" cbz %w1, 1b\n" - /* - * The owner has been updated, so there was an unlock->lock - * transition that we missed. That means we can rely on the - * store-release of the unlock operation paired with the - * load-acquire of the lock operation to publish any of our - * previous stores to the new lock owner and therefore don't - * need to bother with the writeback below. 
- */ -" b 4f\n" -"3:\n" - /* - * Serialise against any concurrent lockers by writing back the - * unlocked lock value - */ - ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ -" stxr %w1, %w0, %2\n" - __nops(2), - /* LSE atomics */ -" mov %w1, %w0\n" -" cas %w0, %w0, %2\n" -" eor %w1, %w1, %w0\n") - /* Somebody else wrote to the lock, GOTO 10 and reload the value */ -" cbnz %w1, 2b\n" -"4:" - : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) - : "r" (owner) - : "memory"); -} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) @@ -176,7 +124,11 @@ static inline int arch_spin_value_unlocked(arch_spinlock_t lock) static inline int arch_spin_is_locked(arch_spinlock_t *lock) { - smp_mb(); /* See arch_spin_unlock_wait */ + /* + * Ensure prior spin_lock operations to other locks have completed + * on this CPU before we test whether "lock" is locked. + */ + smp_mb(); /* ^^^ */ return !arch_spin_value_unlocked(READ_ONCE(*lock)); } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 659ae8094ed5..c8f7d98d8cb9 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* * Complete any pending TLB or cache maintenance on this CPU in case * the thread migrates to a different CPU. + * This full barrier is also required by the membarrier system + * call. */ dsb(ish); diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h index c58f4a83ed6f..f6431439d15d 100644 --- a/arch/blackfin/include/asm/spinlock.h +++ b/arch/blackfin/include/asm/spinlock.h @@ -48,11 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __raw_spin_unlock_asm(&lock->lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline int arch_read_can_lock(arch_rwlock_t *rw) { return __raw_uncached_fetch_asm(&rw->lock) > 0; diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c index 0188c933b155..15af5768c403 100644 --- a/arch/blackfin/kernel/module.c +++ b/arch/blackfin/kernel/module.c @@ -4,8 +4,6 @@ * Licensed under the GPL-2 or later */ -#define pr_fmt(fmt) "module %s: " fmt, mod->name - #include #include #include @@ -16,6 +14,11 @@ #include #include +#define mod_err(mod, fmt, ...) \ + pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__) +#define mod_debug(mod, fmt, ...) 
\ + pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__) + /* Transfer the section to the L1 memory */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_inst_sram_alloc(s->sh_size); mod->arch.text_l1 = dest; if (dest == NULL) { - pr_err("L1 inst memory allocation failed\n"); + mod_err(mod, "L1 inst memory allocation failed\n"); return -1; } dma_memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_alloc(s->sh_size); mod->arch.data_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_sram_zalloc(s->sh_size); mod->arch.bss_a_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } @@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.data_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l1_data_B_sram_alloc(s->sh_size); mod->arch.bss_b_l1 = dest; if (dest == NULL) { - pr_err("L1 data memory allocation failed\n"); + mod_err(mod, "L1 data memory allocation failed\n"); return -1; } memset(dest, 0, s->sh_size); @@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.text_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_alloc(s->sh_size); mod->arch.data_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } memcpy(dest, (void *)s->sh_addr, s->sh_size); @@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, dest = l2_sram_zalloc(s->sh_size); mod->arch.bss_l2 = dest; if (dest == NULL) { - pr_err("L2 SRAM allocation failed\n"); + mod_err(mod, "L2 SRAM allocation failed\n"); return -1; } @@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, Elf32_Sym *sym; unsigned long location, value, size; - pr_debug("applying relocate section %u to %u\n", - relsec, sechdrs[relsec].sh_info); + mod_debug(mod, "applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ @@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, #ifdef CONFIG_SMP if (location >= COREB_L1_DATA_A_START) { - pr_err("cannot relocate in L1: %u (SMP kernel)\n", + mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } #endif - pr_debug("location is %lx, value is %lx type is %d\n", - location, value, ELF32_R_TYPE(rel[i].r_info)); + mod_debug(mod, "location is %lx, value is %lx type is %d\n", + location, value, ELF32_R_TYPE(rel[i].r_info)); 
switch (ELF32_R_TYPE(rel[i].r_info)) { @@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, case R_BFIN_PCREL12_JUMP: case R_BFIN_PCREL12_JUMP_S: case R_BFIN_PCREL10: - pr_err("unsupported relocation: %u (no -mlong-calls?)\n", + mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; default: - pr_err("unknown relocation: %u\n", + mod_err(mod, "unknown relocation: %u\n", ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab, isram_memcpy((void *)location, &value, size); break; default: - pr_err("invalid relocation for %#lx\n", location); + mod_err(mod, "invalid relocation for %#lx\n", location); return -ENOEXEC; } } diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h index a1c55788c5d6..53a8d5885887 100644 --- a/arch/hexagon/include/asm/spinlock.h +++ b/arch/hexagon/include/asm/spinlock.h @@ -179,11 +179,6 @@ static inline unsigned int arch_spin_trylock(arch_spinlock_t *lock) */ #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_is_locked(x) ((x)->lock != 0) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index ca9e76149a4a..df2c121164b8 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -76,22 +76,6 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) ACCESS_ONCE(*p) = (tmp + 2) & ~1; } -static __always_inline void __ticket_spin_unlock_wait(arch_spinlock_t *lock) -{ - int *p = (int *)&lock->lock, ticket; - - ia64_invala(); - - for (;;) { - asm volatile ("ld4.c.nc %0=[%1]" : "=r"(ticket) : "r"(p) : "memory"); - if (!(((ticket >> TICKET_SHIFT) ^ ticket) & TICKET_MASK)) - return; - cpu_relax(); - } - - smp_acquire__after_ctrl_dep(); -} - static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { long tmp = ACCESS_ONCE(lock->lock); @@ -143,11 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, arch_spin_lock(lock); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - __ticket_spin_unlock_wait(lock); -} - #define arch_read_can_lock(rw) (*(volatile int *)(rw) >= 0) #define arch_write_can_lock(rw) (*(volatile int *)(rw) == 0) diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h index 323c7fc953cd..a56825592b90 100644 --- a/arch/m32r/include/asm/spinlock.h +++ b/arch/m32r/include/asm/spinlock.h @@ -30,11 +30,6 @@ #define arch_spin_is_locked(x) (*(volatile int *)(&(x)->slock) <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, VAL > 0); -} - /** * arch_spin_trylock - Try spin lock and return a result * @lock: Pointer to the lock variable diff --git a/arch/metag/include/asm/spinlock.h b/arch/metag/include/asm/spinlock.h index c0c7a22be1ae..ddf7fe5708a6 100644 --- a/arch/metag/include/asm/spinlock.h +++ b/arch/metag/include/asm/spinlock.h @@ -15,11 +15,6 @@ * locked. 
*/ -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h index 9c7b8f7942d8..fe413b41df6c 100644 --- a/arch/mn10300/include/asm/spinlock.h +++ b/arch/mn10300/include/asm/spinlock.h @@ -26,11 +26,6 @@ #define arch_spin_is_locked(x) (*(volatile signed char *)(&(x)->slock) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - static inline void arch_spin_unlock(arch_spinlock_t *lock) { asm volatile( diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h index e32936cd7f10..55bfe4affca3 100644 --- a/arch/parisc/include/asm/spinlock.h +++ b/arch/parisc/include/asm/spinlock.h @@ -14,13 +14,6 @@ static inline int arch_spin_is_locked(arch_spinlock_t *x) #define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *x) -{ - volatile unsigned int *a = __ldcw_align(x); - - smp_cond_load_acquire(a, VAL); -} - static inline void arch_spin_lock_flags(arch_spinlock_t *x, unsigned long flags) { diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 8c1b913de6d7..d256e448ea49 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -170,39 +170,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->slock = 0; } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - arch_spinlock_t lock_val; - - smp_mb(); - - /* - * Atomically load and store back the lock value (unchanged). This - * ensures that our observation of the lock value is ordered with - * respect to other lock operations. - */ - __asm__ __volatile__( -"1: " PPC_LWARX(%0, 0, %2, 0) "\n" -" stwcx. %0, 0, %2\n" -" bne- 1b\n" - : "=&r" (lock_val), "+m" (*lock) - : "r" (lock) - : "cr0", "xer"); - - if (arch_spin_value_unlocked(lock_val)) - goto out; - - while (lock->slock) { - HMT_low(); - if (SHARED_PROCESSOR) - __spin_yield(lock); - } - HMT_medium(); - -out: - smp_mb(); -} - /* * Read-write spinlocks, allowing multiple readers * but only one writer. diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index f7838ecd83c6..217ee5210c32 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -98,13 +98,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp) : "cc", "memory"); } -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - while (arch_spin_is_locked(lock)) - arch_spin_relax(lock); - smp_acquire__after_ctrl_dep(); -} - /* * Read-write spinlocks, allowing multiple readers * but only one writer. 
diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h index c46e8cc7b515..5ed7dbbd94ff 100644 --- a/arch/sh/include/asm/spinlock-cas.h +++ b/arch/sh/include/asm/spinlock-cas.h @@ -29,11 +29,6 @@ static inline unsigned __sl_cas(volatile unsigned *p, unsigned old, unsigned new #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { while (!__sl_cas(&lock->lock, 1, 0)); diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index cec78143fa83..f77263aae760 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -21,11 +21,6 @@ #define arch_spin_is_locked(x) ((x)->lock <= 0) #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, VAL > 0); -} - /* * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h index 8011e79f59c9..67345b2dc408 100644 --- a/arch/sparc/include/asm/spinlock_32.h +++ b/arch/sparc/include/asm/spinlock_32.h @@ -14,11 +14,6 @@ #define arch_spin_is_locked(lock) (*((volatile unsigned char *)(lock)) != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->lock, !VAL); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { __asm__ __volatile__( diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h index b14b1ba5bf9c..cba8ba9b8da6 100644 --- a/arch/tile/include/asm/spinlock_32.h +++ b/arch/tile/include/asm/spinlock_32.h @@ -64,8 +64,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) lock->current_ticket = old_ticket + TICKET_QUANTUM; } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - /* * Read-write spinlocks, allowing multiple readers * but only one writer. diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h index b9718fb4e74a..9a2c2d605752 100644 --- a/arch/tile/include/asm/spinlock_64.h +++ b/arch/tile/include/asm/spinlock_64.h @@ -58,8 +58,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) __insn_fetchadd4(&lock->lock, 1U << __ARCH_SPIN_CURRENT_SHIFT); } -void arch_spin_unlock_wait(arch_spinlock_t *lock); - void arch_spin_lock_slow(arch_spinlock_t *lock, u32 val); /* Grab the "next" ticket number and bump it atomically. diff --git a/arch/tile/lib/spinlock_32.c b/arch/tile/lib/spinlock_32.c index 076c6cc43113..db9333f2447c 100644 --- a/arch/tile/lib/spinlock_32.c +++ b/arch/tile/lib/spinlock_32.c @@ -62,29 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - int curr = READ_ONCE(lock->current_ticket); - int next = READ_ONCE(lock->next_ticket); - - /* Return immediately if unlocked. */ - if (next == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (READ_ONCE(lock->current_ticket) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. 
- */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); - /* * The low byte is always reserved to be the marker for a "tns" operation * since the low bit is set to "1" by a tns. The next seven bits are diff --git a/arch/tile/lib/spinlock_64.c b/arch/tile/lib/spinlock_64.c index a4b5b2cbce93..de414c22892f 100644 --- a/arch/tile/lib/spinlock_64.c +++ b/arch/tile/lib/spinlock_64.c @@ -62,28 +62,6 @@ int arch_spin_trylock(arch_spinlock_t *lock) } EXPORT_SYMBOL(arch_spin_trylock); -void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - u32 iterations = 0; - u32 val = READ_ONCE(lock->lock); - u32 curr = arch_spin_current(val); - - /* Return immediately if unlocked. */ - if (arch_spin_next(val) == curr) - return; - - /* Wait until the current locker has released the lock. */ - do { - delay_backoff(iterations++); - } while (arch_spin_current(READ_ONCE(lock->lock)) == curr); - - /* - * The TILE architecture doesn't do read speculation; therefore - * a control dependency guarantees a LOAD->{LOAD,STORE} order. - */ - barrier(); -} -EXPORT_SYMBOL(arch_spin_unlock_wait); /* * If the read lock fails due to a writer, we retry periodically diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h index a36221cf6363..3bb49681ee24 100644 --- a/arch/xtensa/include/asm/spinlock.h +++ b/arch/xtensa/include/asm/spinlock.h @@ -33,11 +33,6 @@ #define arch_spin_is_locked(x) ((x)->slock != 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, !VAL); -} - #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) static inline void arch_spin_lock(arch_spinlock_t *lock) diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index b70bcf6d2914..b325db27eb8c 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -645,12 +645,11 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, * completions are honored. A scmd is determined to have * timed out iff its associated qc is active and not failed. */ + spin_lock_irqsave(ap->lock, flags); if (ap->ops->error_handler) { struct scsi_cmnd *scmd, *tmp; int nr_timedout = 0; - spin_lock_irqsave(ap->lock, flags); - /* This must occur under the ap->lock as we don't want a polled recovery to race the real interrupt handler @@ -700,12 +699,11 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, if (nr_timedout) __ata_port_freeze(ap); - spin_unlock_irqrestore(ap->lock, flags); /* initialize eh_tries */ ap->eh_tries = ATA_EH_MAX_TRIES; - } else - spin_unlock_wait(ap->lock); + } + spin_unlock_irqrestore(ap->lock, flags); } EXPORT_SYMBOL(ata_scsi_cmd_error_handler); diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 9f0681bf1e87..66260777d644 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -21,17 +21,6 @@ #include -/** - * queued_spin_unlock_wait - wait until the _current_ lock holder releases the lock - * @lock : Pointer to queued spinlock structure - * - * There is a very slight possibility of live-lock if the lockers keep coming - * and the waiter is just unfortunate enough to not see any unlock state. - */ -#ifndef queued_spin_unlock_wait -extern void queued_spin_unlock_wait(struct qspinlock *lock); -#endif - /** * queued_spin_is_locked - is the spinlock locked? 
* @lock: Pointer to queued spinlock structure @@ -41,8 +30,6 @@ extern void queued_spin_unlock_wait(struct qspinlock *lock); static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { /* - * See queued_spin_unlock_wait(). - * * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL * isn't immediately observable. */ @@ -135,6 +122,5 @@ static __always_inline bool virt_spin_lock(struct qspinlock *lock) #define arch_spin_trylock(l) queued_spin_trylock(l) #define arch_spin_unlock(l) queued_spin_unlock(l) #define arch_spin_lock_flags(l, f) queued_spin_lock(l) -#define arch_spin_unlock_wait(l) queued_spin_unlock_wait(l) #endif /* __ASM_GENERIC_QSPINLOCK_H */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index a2f6707e9fc0..0e849715e5be 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -125,18 +125,12 @@ extern struct group_info init_groups; #define INIT_IDS #endif -#ifdef CONFIG_PREEMPT_RCU -#define INIT_TASK_RCU_TREE_PREEMPT() \ - .rcu_blocked_node = NULL, -#else -#define INIT_TASK_RCU_TREE_PREEMPT(tsk) -#endif #ifdef CONFIG_PREEMPT_RCU #define INIT_TASK_RCU_PREEMPT(tsk) \ .rcu_read_lock_nesting = 0, \ .rcu_read_unlock_special.s = 0, \ .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ - INIT_TASK_RCU_TREE_PREEMPT() + .rcu_blocked_node = NULL, #else #define INIT_TASK_RCU_PREEMPT(tsk) #endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f816fc72b51e..96f1baf62ab8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -58,8 +58,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); void call_rcu_bh(struct rcu_head *head, rcu_callback_t func); void call_rcu_sched(struct rcu_head *head, rcu_callback_t func); void synchronize_sched(void); -void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); -void synchronize_rcu_tasks(void); void rcu_barrier_tasks(void); #ifdef CONFIG_PREEMPT_RCU @@ -105,11 +103,13 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); +extern int rcu_scheduler_active __read_mostly; void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); void rcu_report_dead(unsigned int cpu); void rcu_cpu_starting(unsigned int cpu); +void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); @@ -164,8 +164,6 @@ static inline void rcu_init_nohz(void) { } * macro rather than an inline function to avoid #include hell. 
*/ #ifdef CONFIG_TASKS_RCU -#define TASKS_RCU(x) x -extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch_lite(t) \ do { \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ @@ -176,10 +174,17 @@ extern struct srcu_struct tasks_rcu_exit_srcu; rcu_all_qs(); \ rcu_note_voluntary_context_switch_lite(t); \ } while (0) +void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); +void synchronize_rcu_tasks(void); +void exit_tasks_rcu_start(void); +void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU */ -#define TASKS_RCU(x) do { } while (0) #define rcu_note_voluntary_context_switch_lite(t) do { } while (0) #define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define call_rcu_tasks call_rcu_sched +#define synchronize_rcu_tasks synchronize_sched +static inline void exit_tasks_rcu_start(void) { } +static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5becbbccb998..b3dbf9502fd0 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -116,13 +116,11 @@ static inline void rcu_irq_exit_irqson(void) { } static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } static inline void exit_rcu(void) { } - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -extern int rcu_scheduler_active __read_mostly; +#ifdef CONFIG_SRCU void rcu_scheduler_starting(void); -#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ +#else /* #ifndef CONFIG_SRCU */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ +#endif /* #else #ifndef CONFIG_SRCU */ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_is_watching(void) { return true; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..e4c38809a09e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -589,9 +589,10 @@ struct task_struct { #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; - bool rcu_tasks_holdout; - struct list_head rcu_tasks_holdout_list; + u8 rcu_tasks_holdout; + u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; + struct list_head rcu_tasks_holdout_list; #endif /* #ifdef CONFIG_TASKS_RCU */ struct sched_info sched_info; diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index d9510e8522d4..ef018a6e4985 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -130,12 +130,6 @@ do { \ #define smp_mb__before_spinlock() smp_wmb() #endif -/** - * raw_spin_unlock_wait - wait until the spinlock gets unlocked - * @lock: the spinlock in question. - */ -#define raw_spin_unlock_wait(lock) arch_spin_unlock_wait(&(lock)->raw_lock) - #ifdef CONFIG_DEBUG_SPINLOCK extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); #define do_raw_spin_lock_flags(lock, flags) do_raw_spin_lock(lock) @@ -369,31 +363,6 @@ static __always_inline int spin_trylock_irq(spinlock_t *lock) raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) -/** - * spin_unlock_wait - Interpose between successive critical sections - * @lock: the spinlock whose critical sections are to be interposed. - * - * Semantically this is equivalent to a spin_lock() immediately - * followed by a spin_unlock(). 
However, most architectures have - * more efficient implementations in which the spin_unlock_wait() - * cannot block concurrent lock acquisition, and in some cases - * where spin_unlock_wait() does not write to the lock variable. - * Nevertheless, spin_unlock_wait() can have high overhead, so if - * you feel the need to use it, please check to see if there is - * a better way to get your job done. - * - * The ordering guarantees provided by spin_unlock_wait() are: - * - * 1. All accesses preceding the spin_unlock_wait() happen before - * any accesses in later critical sections for this same lock. - * 2. All accesses following the spin_unlock_wait() happen after - * any accesses in earlier critical sections for this same lock. - */ -static __always_inline void spin_unlock_wait(spinlock_t *lock) -{ - raw_spin_unlock_wait(&lock->rlock); -} - static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h index 0d9848de677d..612fb530af41 100644 --- a/include/linux/spinlock_up.h +++ b/include/linux/spinlock_up.h @@ -26,11 +26,6 @@ #ifdef CONFIG_DEBUG_SPINLOCK #define arch_spin_is_locked(x) ((x)->slock == 0) -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - smp_cond_load_acquire(&lock->slock, VAL); -} - static inline void arch_spin_lock(arch_spinlock_t *lock) { lock->slock = 0; @@ -73,7 +68,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) #else /* DEBUG_SPINLOCK */ #define arch_spin_is_locked(lock) ((void)(lock), 0) -#define arch_spin_unlock_wait(lock) do { barrier(); (void)(lock); } while (0) /* for sched/core.c and kernel_lock.c: */ # define arch_spin_lock(lock) do { barrier(); (void)(lock); } while (0) # define arch_spin_lock_flags(lock, flags) do { barrier(); (void)(lock); } while (0) diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index cfbfc540cafc..261471f407a5 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -87,4 +87,17 @@ static inline void srcu_barrier(struct srcu_struct *sp) synchronize_srcu(sp); } +/* Defined here to avoid size increase for non-torture kernels. 
*/ +static inline void srcu_torture_stats_print(struct srcu_struct *sp, + char *tt, char *tf) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx) & 0x1; + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", + tt, tf, idx, + READ_ONCE(sp->srcu_lock_nesting[!idx]), + READ_ONCE(sp->srcu_lock_nesting[idx])); +} + #endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42973f787e7e..a949f4f9e4d7 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -104,8 +104,6 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 -void process_srcu(struct work_struct *work); - #define __SRCU_STRUCT_INIT(name) \ { \ .sda = &name##_srcu_data, \ @@ -141,5 +139,6 @@ void process_srcu(struct work_struct *work); void synchronize_srcu_expedited(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf); #endif diff --git a/include/linux/swait.h b/include/linux/swait.h index c1f9c62a8a50..4a4e180d0a35 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -169,4 +169,59 @@ do { \ __ret; \ }) +#define __swait_event_idle(wq, condition) \ + (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) + +/** + * swait_event_idle - wait without system load contribution + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * + * The process is put to sleep (TASK_IDLE) until the @condition evaluates to + * true. The @condition is checked each time the waitqueue @wq is woken up. + * + * This function is mostly used when a kthread or workqueue waits for some + * condition and doesn't want to contribute to system load. Signals are + * ignored. + */ +#define swait_event_idle(wq, condition) \ +do { \ + if (condition) \ + break; \ + __swait_event_idle(wq, condition); \ +} while (0) + +#define __swait_event_idle_timeout(wq, condition, timeout) \ + ___swait_event(wq, ___wait_cond_timeout(condition), \ + TASK_IDLE, timeout, \ + __ret = schedule_timeout(__ret)) + +/** + * swait_event_idle_timeout - wait up to timeout without load contribution + * @wq: the waitqueue to wait on + * @condition: a C expression for the event to wait for + * @timeout: timeout at which we'll give up in jiffies + * + * The process is put to sleep (TASK_IDLE) until the @condition evaluates to + * true. The @condition is checked each time the waitqueue @wq is woken up. + * + * This function is mostly used when a kthread or workqueue waits for some + * condition and doesn't want to contribute to system load. Signals are + * ignored. + * + * Returns: + * 0 if the @condition evaluated to %false after the @timeout elapsed, + * 1 if the @condition evaluated to %true after the @timeout elapsed, + * or the remaining jiffies (at least 1) if the @condition evaluated + * to %true before the @timeout elapsed. + */ +#define swait_event_idle_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!___wait_cond_timeout(condition)) \ + __ret = __swait_event_idle_timeout(wq, \ + condition, timeout); \ + __ret; \ +}) + #endif /* _LINUX_SWAIT_H */ diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 91dc089d65b7..e91ae1f2290d 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -703,6 +703,7 @@ TRACE_EVENT(rcu_batch_end, * at the beginning and end of the read, respectively. Note that the * callback address can be NULL. 
*/ +#define RCUTORTURENAME_LEN 8 TRACE_EVENT(rcu_torture_read, TP_PROTO(const char *rcutorturename, struct rcu_head *rhp, @@ -711,7 +712,7 @@ TRACE_EVENT(rcu_torture_read, TP_ARGS(rcutorturename, rhp, secs, c_old, c), TP_STRUCT__entry( - __field(const char *, rcutorturename) + __field(char, rcutorturename[RCUTORTURENAME_LEN]) __field(struct rcu_head *, rhp) __field(unsigned long, secs) __field(unsigned long, c_old) @@ -719,7 +720,9 @@ TRACE_EVENT(rcu_torture_read, ), TP_fast_assign( - __entry->rcutorturename = rcutorturename; + strncpy(__entry->rcutorturename, rcutorturename, + RCUTORTURENAME_LEN); + __entry->rcutorturename[RCUTORTURENAME_LEN - 1] = 0; __entry->rhp = rhp; __entry->secs = secs; __entry->c_old = c_old; diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h index e0b108bd2624..6d47b3249d8a 100644 --- a/include/uapi/linux/membarrier.h +++ b/include/uapi/linux/membarrier.h @@ -40,14 +40,33 @@ * (non-running threads are de facto in such a * state). This covers threads from all processes * running on the system. This command returns 0. + * @MEMBARRIER_CMD_PRIVATE_EXPEDITED: + * Execute a memory barrier on each running + * thread belonging to the same process as the current + * thread. Upon return from system call, the + * caller thread is ensured that all its running + * threads siblings have passed through a state + * where all memory accesses to user-space + * addresses match program order between entry + * to and return from the system call + * (non-running threads are de facto in such a + * state). This only covers threads from the + * same processes as the caller thread. This + * command returns 0. The "expedited" commands + * complete faster than the non-expedited ones, + * they never block, but have the downside of + * causing extra overhead. * * Command to be passed to the membarrier system call. The commands need to * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to * the value 0. */ enum membarrier_cmd { - MEMBARRIER_CMD_QUERY = 0, - MEMBARRIER_CMD_SHARED = (1 << 0), + MEMBARRIER_CMD_QUERY = 0, + MEMBARRIER_CMD_SHARED = (1 << 0), + /* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */ + /* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */ + MEMBARRIER_CMD_PRIVATE_EXPEDITED = (1 << 3), }; #endif /* _UAPI_LINUX_MEMBARRIER_H */ diff --git a/ipc/sem.c b/ipc/sem.c index 9e70cd7a17da..2570830e29fc 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2091,7 +2091,8 @@ void exit_sem(struct task_struct *tsk) * possibility where we exit while freeary() didn't * finish unlocking sem_undo_list. 
*/ - spin_unlock_wait(&ulp->lock); + spin_lock(&ulp->lock); + spin_unlock(&ulp->lock); rcu_read_unlock(); break; } diff --git a/kernel/Makefile b/kernel/Makefile index 4cb8e8b23c6e..9c323a6daa46 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_HAS_IOMEM) += memremap.o diff --git a/kernel/cpu.c b/kernel/cpu.c index eee033134262..bfbd649ccdc8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -650,6 +650,7 @@ static int takedown_cpu(unsigned int cpu) __cpu_die(cpu); tick_cleanup_dead_cpu(cpu); + rcutree_migrate_callbacks(cpu); return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index c5548faa9f37..f9ef3ecc78c1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -764,7 +764,6 @@ void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; - TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); kcov_task_exit(tsk); @@ -819,7 +818,8 @@ void __noreturn do_exit(long code) * Ensure that we must observe the pi_state in exit_mm() -> * mm_release() -> exit_pi_state_list(). */ - raw_spin_unlock_wait(&tsk->pi_lock); + raw_spin_lock_irq(&tsk->pi_lock); + raw_spin_unlock_irq(&tsk->pi_lock); if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", @@ -881,9 +881,7 @@ void __noreturn do_exit(long code) */ flush_ptrace_hw_breakpoint(tsk); - TASKS_RCU(preempt_disable()); - TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); - TASKS_RCU(preempt_enable()); + exit_tasks_rcu_start(); exit_notify(tsk, group_dead); proc_exit_connector(tsk); mpol_put_task_policy(tsk); @@ -918,7 +916,7 @@ void __noreturn do_exit(long code) if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); - TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); + exit_tasks_rcu_finish(); do_task_dead(); } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fd24153e8a48..294294c71ba4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath #endif -/* - * Various notes on spin_is_locked() and spin_unlock_wait(), which are - * 'interesting' functions: - * - * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE - * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, - * PPC). Also qspinlock has a similar issue per construction, the setting of - * the locked byte can be unordered acquiring the lock proper. - * - * This gets to be 'interesting' in the following cases, where the /should/s - * end up false because of this issue. - * - * - * CASE 1: - * - * So the spin_is_locked() correctness issue comes from something like: - * - * CPU0 CPU1 - * - * global_lock(); local_lock(i) - * spin_lock(&G) spin_lock(&L[i]) - * for (i) if (!spin_is_locked(&G)) { - * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); - * return; - * } - * // deal with fail - * - * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such - * that there is exclusion between the two critical sections. 
- * - * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from - * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) - * /should/ be constrained by the ACQUIRE from spin_lock(&G). - * - * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. - * - * - * CASE 2: - * - * For spin_unlock_wait() there is a second correctness issue, namely: - * - * CPU0 CPU1 - * - * flag = set; - * smp_mb(); spin_lock(&l) - * spin_unlock_wait(&l); if (!flag) - * // add to lockless list - * spin_unlock(&l); - * // iterate lockless list - * - * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 - * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE - * semantics etc..) - * - * Where flag /should/ be ordered against the locked store of l. - */ - -/* - * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before - * issuing an _unordered_ store to set _Q_LOCKED_VAL. - * - * This means that the store can be delayed, but no later than the - * store-release from the unlock. This means that simply observing - * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. - * - * There are two paths that can issue the unordered store: - * - * (1) clear_pending_set_locked(): *,1,0 -> *,0,1 - * - * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0 - * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1 - * - * However, in both cases we have other !0 state we've set before to queue - * ourseves: - * - * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our - * load is constrained by that ACQUIRE to not pass before that, and thus must - * observe the store. - * - * For (2) we have a more intersting scenario. We enqueue ourselves using - * xchg_tail(), which ends up being a RELEASE. This in itself is not - * sufficient, however that is followed by an smp_cond_acquire() on the same - * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and - * guarantees we must observe that store. - * - * Therefore both cases have other !0 state that is observable before the - * unordered locked byte store comes through. This means we can use that to - * wait for the lock store, and then wait for an unlock. - */ -#ifndef queued_spin_unlock_wait -void queued_spin_unlock_wait(struct qspinlock *lock) -{ - u32 val; - - for (;;) { - val = atomic_read(&lock->val); - - if (!val) /* not locked, we're done */ - goto done; - - if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ - break; - - /* not locked, but pending, wait until we observe the lock */ - cpu_relax(); - } - - /* any unlock is good */ - while (atomic_read(&lock->val) & _Q_LOCKED_MASK) - cpu_relax(); - -done: - smp_acquire__after_ctrl_dep(); -} -EXPORT_SYMBOL(queued_spin_unlock_wait); -#endif - #endif /* _GEN_PV_LOCK_SLOWPATH */ /** diff --git a/kernel/membarrier.c b/kernel/membarrier.c deleted file mode 100644 index 9f9284f37f8d..000000000000 --- a/kernel/membarrier.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2010, 2015 Mathieu Desnoyers - * - * membarrier system call - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - */ - -#include -#include -#include - -/* - * Bitmask made from a "or" of all commands within enum membarrier_cmd, - * except MEMBARRIER_CMD_QUERY. - */ -#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED) - -/** - * sys_membarrier - issue memory barriers on a set of threads - * @cmd: Takes command values defined in enum membarrier_cmd. - * @flags: Currently needs to be 0. For future extensions. - * - * If this system call is not implemented, -ENOSYS is returned. If the - * command specified does not exist, or if the command argument is invalid, - * this system call returns -EINVAL. For a given command, with flags argument - * set to 0, this system call is guaranteed to always return the same value - * until reboot. - * - * All memory accesses performed in program order from each targeted thread - * is guaranteed to be ordered with respect to sys_membarrier(). If we use - * the semantic "barrier()" to represent a compiler barrier forcing memory - * accesses to be performed in program order across the barrier, and - * smp_mb() to represent explicit memory barriers forcing full memory - * ordering across the barrier, we have the following ordering table for - * each pair of barrier(), sys_membarrier() and smp_mb(): - * - * The pair ordering is detailed as (O: ordered, X: not ordered): - * - * barrier() smp_mb() sys_membarrier() - * barrier() X X O - * smp_mb() X O O - * sys_membarrier() O O O - */ -SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) -{ - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ - if (tick_nohz_full_enabled()) - return -ENOSYS; - if (unlikely(flags)) - return -EINVAL; - switch (cmd) { - case MEMBARRIER_CMD_QUERY: - return MEMBARRIER_CMD_BITMASK; - case MEMBARRIER_CMD_SHARED: - if (num_online_cpus() > 1) - synchronize_sched(); - return 0; - default: - return -EINVAL; - } -} diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index be90c945063f..9210379c0353 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -69,8 +69,7 @@ config TREE_SRCU This option selects the full-fledged version of SRCU. config TASKS_RCU - bool - default n + def_bool PREEMPT select SRCU help This option enables a task-based RCU implementation that uses diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 808b8c85f626..e4b43fef89f5 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -356,22 +356,10 @@ do { \ #ifdef CONFIG_TINY_RCU /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ -static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ -{ - return true; -} -static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ -{ - return false; -} - -static inline void rcu_expedite_gp(void) -{ -} - -static inline void rcu_unexpedite_gp(void) -{ -} +static inline bool rcu_gp_is_normal(void) { return true; } +static inline bool rcu_gp_is_expedited(void) { return false; } +static inline void rcu_expedite_gp(void) { } +static inline void rcu_unexpedite_gp(void) { } #else /* #ifdef CONFIG_TINY_RCU */ bool rcu_gp_is_normal(void); /* Internal RCU use. */ bool rcu_gp_is_expedited(void); /* Internal RCU use. 
*/ @@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, *gpnum = 0; *completed = 0; } -static inline void rcutorture_record_test_transition(void) -{ -} -static inline void rcutorture_record_progress(unsigned long vernum) -{ -} +static inline void rcutorture_record_test_transition(void) { } +static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, #endif #ifdef CONFIG_TINY_RCU - -/* - * Return the number of grace periods started. - */ -static inline unsigned long rcu_batches_started(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods started. - */ -static inline unsigned long rcu_batches_started_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods started. - */ -static inline unsigned long rcu_batches_started_sched(void) -{ - return 0; -} - -/* - * Return the number of grace periods completed. - */ -static inline unsigned long rcu_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of bottom-half grace periods completed. - */ -static inline unsigned long rcu_batches_completed_bh(void) -{ - return 0; -} - -/* - * Return the number of sched grace periods completed. - */ -static inline unsigned long rcu_batches_completed_sched(void) -{ - return 0; -} - -/* - * Return the number of expedited grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed(void) -{ - return 0; -} - -/* - * Return the number of expedited sched grace periods completed. - */ -static inline unsigned long rcu_exp_batches_completed_sched(void) -{ - return 0; -} - -static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) -{ - return 0; -} - -static inline void rcu_force_quiescent_state(void) -{ -} - -static inline void rcu_bh_force_quiescent_state(void) -{ -} - -static inline void rcu_sched_force_quiescent_state(void) -{ -} - -static inline void show_rcu_gp_kthreads(void) -{ -} - +static inline unsigned long rcu_batches_started(void) { return 0; } +static inline unsigned long rcu_batches_started_bh(void) { return 0; } +static inline unsigned long rcu_batches_started_sched(void) { return 0; } +static inline unsigned long rcu_batches_completed(void) { return 0; } +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } +static inline unsigned long rcu_batches_completed_sched(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed(void) { return 0; } +static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; } +static inline unsigned long +srcu_batches_completed(struct srcu_struct *sp) { return 0; } +static inline void rcu_force_quiescent_state(void) { } +static inline void rcu_bh_force_quiescent_state(void) { } +static inline void rcu_sched_force_quiescent_state(void) { } +static inline void show_rcu_gp_kthreads(void) { } #else /* #ifdef CONFIG_TINY_RCU */ extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2b62a38b080f..7649fcd2c4c7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -35,24 +35,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->len_lazy = 0; } -/* - * Debug function to actually count the number of callbacks. - * If the number exceeds the limit specified, return -1. 
- */ -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) -{ - int cnt = 0; - struct rcu_head **rhpp = &rclp->head; - - for (;;) { - if (!*rhpp) - return cnt; - if (++cnt > lim) - return -1; - rhpp = &(*rhpp)->next; - } -} - /* * Dequeue the oldest rcu_head structure from the specified callback * list. This function assumes that the callback is non-lazy, but @@ -102,17 +84,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) rsclp->tails[RCU_NEXT_TAIL] = NULL; } -/* - * Is the specified segment of the specified rcu_segcblist structure - * empty of callbacks? - */ -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) -{ - if (seg == RCU_DONE_TAIL) - return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; - return rsclp->tails[seg - 1] == rsclp->tails[seg]; -} - /* * Does the specified rcu_segcblist structure contain callbacks that * are ready to be invoked? @@ -133,50 +104,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); } -/* - * Dequeue and return the first ready-to-invoke callback. If there - * are no ready-to-invoke callbacks, return NULL. Disables interrupts - * to avoid interference. Does not protect from interference from other - * CPUs or tasks. - */ -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - int i; - struct rcu_head *rhp; - - local_irq_save(flags); - if (!rcu_segcblist_ready_cbs(rsclp)) { - local_irq_restore(flags); - return NULL; - } - rhp = rsclp->head; - BUG_ON(!rhp); - rsclp->head = rhp->next; - for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { - if (rsclp->tails[i] != &rhp->next) - break; - rsclp->tails[i] = &rsclp->head; - } - smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ - WRITE_ONCE(rsclp->len, rsclp->len - 1); - local_irq_restore(flags); - return rhp; -} - -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) -{ - unsigned long flags; - - local_irq_save(flags); - rsclp->len_lazy--; - local_irq_restore(flags); -} - /* * Return a pointer to the first callback in the specified rcu_segcblist * structure. This is useful for diagnostics. @@ -202,17 +129,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) return NULL; } -/* - * Does the specified rcu_segcblist structure contain callbacks that - * have not yet been processed beyond having been posted, that is, - * does it contain callbacks in its last segment? - */ -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_is_enabled(rsclp) && - !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); -} - /* * Enqueue the specified callback onto the specified rcu_segcblist * structure, updating accounting as needed. Note that the ->len @@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, return true; return false; } + +/* + * Merge the source rcu_segcblist structure into the destination + * rcu_segcblist structure, then initialize the source. Any pending + * callbacks from the source get to start over. It is best to + * advance and accelerate both the destination and the source + * before merging. 
+ */ +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp) +{ + struct rcu_cblist donecbs; + struct rcu_cblist pendcbs; + + rcu_cblist_init(&donecbs); + rcu_cblist_init(&pendcbs); + rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); + rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); + rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 6e36e36478cd..581c12b63544 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) rclp->len_lazy--; } -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) -{ - return rclp->head; -} - -/* - * Interim function to return rcu_cblist head pointer. Longer term, the - * rcu_cblist will be used more pervasively, removing the need for this - * function. - */ -static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) -{ - WARN_ON_ONCE(!rclp->head); - return rclp->tail; -} - void rcu_cblist_init(struct rcu_cblist *rclp); -long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); /* @@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); -struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); -void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); -bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp, bool lazy); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, @@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, unsigned long seq); +void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, + struct rcu_segcblist *src_rsclp); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cc18110b612..1f87a02c3399 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks perf testing. 
*/ @@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = { .name = "tasks" }; -#define RCUPERF_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUPERF_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * If performance tests complete, wait for shutdown to commence. */ @@ -658,7 +643,7 @@ rcu_perf_init(void) int firsterr = 0; static struct rcu_perf_ops *perf_ops[] = { &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, - RCUPERF_TASKS_OPS + &tasks_ops, }; if (!torture_init_begin(perf_type, verbose, &perf_runnable)) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b8f7f8ce8575..45f2ffbc1e78 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); static u64 notrace rcu_trace_clock_local(void) { u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + + (void)do_div(ts, NSEC_PER_USEC); return ts; } #else /* #ifdef CONFIG_RCU_TRACE */ @@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .fqs = NULL, .stats = NULL, .irq_capable = 1, - .name = "rcu_busted" + .name = "busted" }; /* @@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp) delay = torture_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) + if (!delay && in_task()) schedule_timeout_interruptible(longdelay); else rcu_read_delay(rrsp); @@ -561,44 +562,7 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int __maybe_unused cpu; - int idx; - -#ifdef CONFIG_TREE_SRCU - idx = srcu_ctlp->srcu_idx & 0x1; - pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - unsigned long l0, l1; - unsigned long u0, u1; - long c0, c1; - struct srcu_data *counts; - - counts = per_cpu_ptr(srcu_ctlp->sda, cpu); - u0 = counts->srcu_unlock_count[!idx]; - u1 = counts->srcu_unlock_count[idx]; - - /* - * Make sure that a lock is always counted if the corresponding - * unlock is counted. - */ - smp_rmb(); - - l0 = counts->srcu_lock_count[!idx]; - l1 = counts->srcu_lock_count[idx]; - - c0 = l0 - u0; - c1 = l1 - u1; - pr_cont(" %d(%ld,%ld)", cpu, c0, c1); - } - pr_cont("\n"); -#elif defined(CONFIG_TINY_SRCU) - idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; - pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", - torture_type, TORTURE_FLAG, idx, - READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), - READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); -#endif + srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG); } static void srcu_torture_synchronize_expedited(void) @@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcu" }; @@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .irq_capable = 1, .name = "srcud" }; @@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -#ifdef CONFIG_TASKS_RCU - /* * Definitions for RCU-tasks torture testing. 
*/ @@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = { .name = "tasks" }; -#define RCUTORTURE_TASKS_OPS &tasks_ops, - static bool __maybe_unused torturing_tasks(void) { return cur_ops == &tasks_ops; } -#else /* #ifdef CONFIG_TASKS_RCU */ - -#define RCUTORTURE_TASKS_OPS - -static bool __maybe_unused torturing_tasks(void) -{ - return false; -} - -#endif /* #else #ifdef CONFIG_TASKS_RCU */ - /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg) return 0; } +static void rcu_torture_timer_cb(struct rcu_head *rhp) +{ + kfree(rhp); +} + /* * RCU torture reader from timer handler. Dereferences rcu_torture_current, * incrementing the corresponding element of the pipeline array. The @@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused) __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); + + /* Test call_rcu() invocation from interrupt handler. */ + if (cur_ops->call) { + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT); + + if (rhp) + cur_ops->call(rhp, rcu_torture_timer_cb); + } } /* @@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void) srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); - pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n", rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags, - wtp == NULL ? ~0UL : wtp->state); + wtp == NULL ? ~0UL : wtp->state, + wtp == NULL ? -1 : (int)task_cpu(wtp)); show_rcu_gp_kthreads(); rcu_ftrace_dump(DUMP_ALL); } @@ -1749,7 +1714,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, - &sched_ops, RCUTORTURE_TASKS_OPS + &sched_ops, &tasks_ops, }; if (!torture_init_begin(torture_type, verbose, &torture_runnable)) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 1a1c1047d2ed..76ac5f50b2c7 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -33,6 +33,8 @@ #include "rcu_segcblist.h" #include "rcu.h" +int rcu_scheduler_active __read_mostly; + static int init_srcu_struct_fields(struct srcu_struct *sp) { sp->srcu_lock_nesting[0] = 0; @@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp) destroy_rcu_head_on_stack(&rs.head); } EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* Lockdep diagnostics. */ +void __init rcu_scheduler_starting(void) +{ + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; +} diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index d0ca524bf042..729a8706751d 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444); static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); +static void process_srcu(struct work_struct *work); /* * Initialize SRCU combining tree. Note that statically allocated @@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); wait_for_completion(&rcu.completion); destroy_rcu_head_on_stack(&rcu.head); + + /* + * Make sure that later code is ordered after the SRCU grace + * period. 
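+	 * For example, a caller that frees an SRCU-protected structure
+	 * immediately after synchronize_srcu() returns relies on this
+	 * barrier to order that free after the end of the grace period.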
This pairs with the raw_spin_lock_irq_rcu_node() + * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed + * because the current CPU might have been totally uninvolved with + * (and thus unordered against) that grace period. + */ + smp_mb(); } /** @@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* * This is the work-queue function that handles SRCU grace periods. */ -void process_srcu(struct work_struct *work) +static void process_srcu(struct work_struct *work) { struct srcu_struct *sp; @@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work) srcu_advance_state(sp); srcu_reschedule(sp, srcu_get_delay(sp)); } -EXPORT_SYMBOL_GPL(process_srcu); void srcutorture_get_gp_data(enum rcutorture_type test_type, struct srcu_struct *sp, int *flags, @@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type, } EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); +void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf) +{ + int cpu; + int idx; + unsigned long s0 = 0, s1 = 0; + + idx = sp->srcu_idx & 0x1; + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx); + for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; + long c0, c1; + struct srcu_data *counts; + + counts = per_cpu_ptr(sp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; + pr_cont(" %d(%ld,%ld)", cpu, c0, c1); + s0 += c0; + s1 += c1; + } + pr_cont(" T(%ld,%ld)\n", s0, s1); +} +EXPORT_SYMBOL_GPL(srcu_torture_stats_print); + static int __init srcu_bootup_announce(void) { pr_info("Hierarchical SRCU implementation.\n"); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f8488965250f..a64eee0db39e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .curtail = &rcu_bh_ctrlblk.rcucblist, }; -#include "tiny_plugin.h" - void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h deleted file mode 100644 index f0a01b2a3062..000000000000 --- a/kernel/rcu/tiny_plugin.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition - * Internal non-public definitions that provide either classic - * or preemptible semantics. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, you can access it online at - * http://www.gnu.org/licenses/gpl-2.0.html. - * - * Copyright (c) 2010 Linaro - * - * Author: Paul E. McKenney - */ - -#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) -#include - -int rcu_scheduler_active __read_mostly; -EXPORT_SYMBOL_GPL(rcu_scheduler_active); - -/* - * During boot, we forgive RCU lockdep issues. 
After this function is - * invoked, we start taking RCU lockdep issues seriously. Note that unlike - * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE - * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. - * The reason for this is that Tiny RCU does not need kthreads, so does - * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. Unless SRCU is in the mix. - */ -void __init rcu_scheduler_starting(void) -{ - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) - ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING; -} - -#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 51d4c3acf32d..84fe96641b2e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \ .gp_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ .completed = 0UL - 300UL, \ - .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ - .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \ - .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ @@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - unsigned long flags; - - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!"); rcu_eqs_enter(false); - local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL /** @@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { - rcu_eqs_enter(1); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!"); + rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ @@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user) if (oldval & DYNTICK_TASK_NEST_MASK) { rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; } else { + __this_cpu_inc(disable_rcu_irq_enter); rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_eqs_exit_common(oldval, user); + __this_cpu_dec(disable_rcu_irq_enter); } } @@ -979,7 +975,6 @@ void rcu_idle_exit(void) rcu_eqs_exit(false); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(rcu_idle_exit); #ifdef CONFIG_NO_HZ_FULL /** @@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) { - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, rsp->gp_flags, gp_state_getname(rsp->gp_state), rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + rsp->gp_kthread ? rsp->gp_kthread->state : ~0, + rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1); if (rsp->gp_kthread) { sched_show_task(rsp->gp_kthread); wake_up_process(rsp->gp_kthread); @@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp) } /* - * Helper function for wait_event_interruptible_timeout() wakeup - * at force-quiescent-state time. + * Helper function for swait_event_idle() wakeup at force-quiescent-state + * time. 
*/ static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) { @@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - swait_event_interruptible(rsp->gp_wq, - READ_ONCE(rsp->gp_flags) & - RCU_GP_FLAG_INIT); + swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & + RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) @@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = swait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_idle_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 && + rcu_preempt_blocked_readers_cgp(rnp)); rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, @@ -2562,85 +2559,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) rcu_report_qs_rdp(rdp->cpu, rsp, rdp); } -/* - * Send the specified CPU's RCU callbacks to the orphanage. The - * specified CPU must be offline, and the caller must hold the - * ->orphan_lock. - */ -static void -rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, - struct rcu_node *rnp, struct rcu_data *rdp) -{ - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs do not have orphanable callbacks. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) - return; - - /* - * Orphan the callbacks. First adjust the counts. This is safe - * because _rcu_barrier() excludes CPU-hotplug operations, so it - * cannot be running now. Thus no memory barrier is required. - */ - rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); - rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); - - /* - * Next, move those callbacks still needing a grace period to - * the orphanage, where some other CPU will pick them up. - * Some of the callbacks might have gone partway through a grace - * period, but that is too bad. They get to start over because we - * cannot assume that grace periods are synchronized across CPUs. - */ - rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - - /* - * Then move the ready-to-invoke callbacks to the orphanage, - * where some other CPU will pick them up. These will not be - * required to pass though another grace period: They are done. - */ - rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - - /* Finally, disallow further callbacks on this CPU. */ - rcu_segcblist_disable(&rdp->cblist); -} - -/* - * Adopt the RCU callbacks from the specified rcu_state structure's - * orphanage. The caller must hold the ->orphan_lock. - */ -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) -{ - struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - - lockdep_assert_held(&rsp->orphan_lock); - - /* No-CBs CPUs are handled specially. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || - rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) - return; - - /* Do the accounting first. 
*/ - rdp->n_cbs_adopted += rsp->orphan_done.len; - if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) - rcu_idle_count_callbacks_posted(); - rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); - - /* - * We do not need a memory barrier here because the only way we - * can get here if there is an rcu_barrier() in flight is if - * we are the task doing the rcu_barrier(). - */ - - /* First adopt the ready-to-invoke callbacks, then the done ones. */ - rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); - WARN_ON_ONCE(rsp->orphan_done.head); - rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); - WARN_ON_ONCE(rsp->orphan_pend.head); - WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != - !rcu_segcblist_n_cbs(&rdp->cblist)); -} - /* * Trace the fact that this CPU is going offline. */ @@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) /* * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup, - * including orphaning the outgoing CPU's RCU callbacks, and also - * adopting them. There can only be one CPU hotplug operation at a time, - * so no other CPU can be attempting to update rcu_cpu_kthread_task. + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - unsigned long flags; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. 
*/ - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); - rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); - rcu_adopt_orphan_cbs(rsp, flags); - raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - - WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || - !rcu_segcblist_empty(&rdp->cblist), - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", - cpu, rcu_segcblist_n_cbs(&rdp->cblist), - rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("LastCB"), -1, + rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence); } } @@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence); rdp->barrier_head.func = rcu_barrier_callback; debug_rcu_head_queue(&rdp->barrier_head); if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { atomic_inc(&rsp->barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); - _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1, + rsp->barrier_sequence); } } @@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp) struct rcu_data *rdp; unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, s); + _rcu_barrier_trace(rsp, TPS("Begin"), -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); /* Did someone else do our work for us? */ if (rcu_seq_done(&rsp->barrier_sequence, s)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1, + rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; @@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp) /* Mark the start of the barrier operation. 
*/ rcu_seq_start(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp) rdp = per_cpu_ptr(rsp->rda, cpu); if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { - _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu, rsp->barrier_sequence); } else { - _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu, rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); @@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp) rcu_barrier_callback, rsp, cpu, 0); } } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { - _rcu_barrier_trace(rsp, "OnlineQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { - _rcu_barrier_trace(rsp, "OnlineNQ", cpu, + _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu, rsp->barrier_sequence); } } @@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp) wait_for_completion(&rsp->barrier_completion); /* Mark the end of the barrier operation. */ - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence); rcu_seq_end(&rsp->barrier_sequence); /* Other rcu_barrier() invocations can now safely proceed. */ @@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - if (!rdp->beenonline) - WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); rdp->beenonline = true; /* We have now been online. */ rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; @@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu) { unsigned long flags; unsigned long mask; + int nbits; + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp; struct rcu_state *rsp; @@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->qsmaskinitnext |= mask; + oldmask = rnp->expmaskinitnext; rnp->expmaskinitnext |= mask; + oldmask ^= rnp->expmaskinitnext; + nbits = bitmap_weight(&oldmask, BITS_PER_LONG); + /* Allow lockless access for expedited grace periods. */ + smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */ raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } #ifdef CONFIG_HOTPLUG_CPU @@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu) for_each_rcu_flavor(rsp) rcu_cleanup_dying_idle_cpu(cpu, rsp); } + +/* Migrate the dead CPU's callbacks to the current CPU. */ +static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *my_rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist)) + return; /* No callbacks to migrate. */ + + local_irq_save(flags); + my_rdp = this_cpu_ptr(rsp->rda); + if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) { + local_irq_restore(flags); + return; + } + raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. 
*/ + rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */ + rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */ + rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist); + WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != + !rcu_segcblist_n_cbs(&my_rdp->cblist)); + raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); +} + +/* + * The outgoing CPU has just passed through the dying-idle state, + * and we are being invoked from the CPU that was IPIed to continue the + * offline operation. We need to migrate the outgoing CPU's callbacks. + */ +void rcutree_migrate_callbacks(int cpu) +{ + struct rcu_state *rsp; + + for_each_rcu_flavor(rsp) + rcu_migrate_callbacks(cpu, rsp); +} #endif /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9af0f31d6847..8e1f285f0a70 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -219,8 +219,6 @@ struct rcu_data { /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ - unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ - unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -268,7 +266,9 @@ struct rcu_data { struct rcu_head **nocb_follower_tail; struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ + struct timer_list nocb_timer; /* Enforce finite deferral. */ /* The following fields are used by the leader, hence own cacheline. */ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; @@ -350,15 +350,6 @@ struct rcu_state { /* End of fields guarded by root rcu_node's lock. */ - raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; - /* Protect following fields. */ - struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ - /* need a grace period. */ - struct rcu_cblist orphan_done; /* Orphaned callbacks that */ - /* are ready to invoke. */ - /* (Contains counts.) */ - /* End of fields guarded by orphan_lock. */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. 
*/ @@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); -static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index dd21ca47e4b4..46d61b597731 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) unsigned long flags; unsigned long mask; unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); + int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */ struct rcu_node *rnp; struct rcu_node *rnp_up; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 908b309d60d7..55bde94b9572 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) struct task_struct *t = current; lockdep_assert_held(&rnp->lock); + WARN_ON_ONCE(rdp->mynode != rnp); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); /* * Decide where to queue the newly blocked task. In theory, @@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; + WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != + !(rnp->qsmask & rdp->grpmask)); + WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != + !(rnp->expmask & rdp->grpmask)); raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* @@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t) rnp = t->rcu_blocked_node; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ WARN_ON_ONCE(rnp != t->rcu_blocked_node); + WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t) if (&t->rcu_node_entry == rnp->exp_tasks) rnp->exp_tasks = np; if (IS_ENABLED(CONFIG_RCU_BOOST)) { - if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; } /* @@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) { + struct task_struct *t; + RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); - if (rcu_preempt_has_tasks(rnp)) + if (rcu_preempt_has_tasks(rnp)) { rnp->gp_tasks = rnp->blkd_tasks.next; + t = container_of(rnp->gp_tasks, struct task_struct, + rcu_node_entry); + trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"), + rnp->gpnum, t->pid); + } WARN_ON_ONCE(rnp->qsmask); } @@ -1788,22 +1802,61 @@ bool rcu_is_nocb_cpu(int cpu) } /* - * Kick the leader kthread for this NOCB group. + * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock + * and this function releases it. 
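+ * When a wakeup is needed, ->nocb_leader_sleep is cleared and ->nocb_timer
+ * is cancelled under that lock, which is then dropped before swake_up().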
*/ -static void wake_nocb_leader(struct rcu_data *rdp, bool force) +static void __wake_nocb_leader(struct rcu_data *rdp, bool force, + unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_data *rdp_leader = rdp->nocb_leader; - if (!READ_ONCE(rdp_leader->nocb_kthread)) + lockdep_assert_held(&rdp->nocb_lock); + if (!READ_ONCE(rdp_leader->nocb_kthread)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; - if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { + } + if (rdp_leader->nocb_leader_sleep || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); + del_timer(&rdp->nocb_timer); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ swake_up(&rdp_leader->nocb_wq); + } else { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); } } +/* + * Kick the leader kthread for this NOCB group, but caller has not + * acquired locks. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + __wake_nocb_leader(rdp, force, flags); +} + +/* + * Arrange to wake the leader kthread for this NOCB group at some + * future time when it is safe to do so. + */ +static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype, + const char *reason) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) + mod_timer(&rdp->nocb_timer, jiffies + 1); + WRITE_ONCE(rdp->nocb_defer_wakeup, waketype); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason); + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); +} + /* * Does the specified CPU need an RCU callback for the specified flavor * of rcu_barrier()? @@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeEmptyIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeEmptyIsDeferred")); } rdp->qlen_last_fqs_check = 0; } else if (len > rdp->qlen_last_fqs_check + qhimark) { @@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); - /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ - smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - TPS("WakeOvfIsDeferred")); + wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE, + TPS("WakeOvfIsDeferred")); } rdp->qlen_last_fqs_check = LONG_MAX / 2; } else { @@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is * not a no-CBs CPU. */ -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->orphan_done.len; - long qll = rsp->orphan_done.len_lazy; - - /* If this is not a no-CBs CPU, tell the caller to do it the old way. 
*/ + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!"); if (!rcu_is_nocb_cpu(smp_processor_id())) - return false; - - /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_done.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), - rcu_cblist_tail(&rsp->orphan_done), - ql, qll, flags); - } - if (rsp->orphan_pend.head) { - __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), - rcu_cblist_tail(&rsp->orphan_pend), - ql, qll, flags); - } - rcu_cblist_init(&rsp->orphan_done); - rcu_cblist_init(&rsp->orphan_pend); + return false; /* Not NOCBs CPU, caller must migrate CBs. */ + __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist), + rcu_segcblist_tail(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags); + rcu_segcblist_init(&rdp->cblist); + rcu_segcblist_disable(&rdp->cblist); return true; } @@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) static void nocb_leader_wait(struct rcu_data *my_rdp) { bool firsttime = true; + unsigned long flags; bool gotcbs; struct rcu_data *rdp; struct rcu_head **tail; @@ -2039,13 +2076,17 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); - /* Memory barrier handled by smp_mb() calls below and repoll. */ + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); + my_rdp->nocb_leader_sleep = true; + WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + del_timer(&my_rdp->nocb_timer); + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); } else if (firsttime) { firsttime = false; /* Don't drown trace log with "Poll"! */ - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll")); } /* @@ -2054,7 +2095,7 @@ wait_again: * nocb_gp_head, where they await a grace period. */ gotcbs = false; - smp_mb(); /* wakeup before ->nocb_head reads. */ + smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); if (!rdp->nocb_gp_head) @@ -2066,56 +2107,41 @@ wait_again: gotcbs = true; } - /* - * If there were no callbacks, sleep a bit, rescan after a - * memory barrier, and go retry. - */ + /* No callbacks? Sleep a bit if polling, and go retry. */ if (unlikely(!gotcbs)) { - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); - - /* Rescan in case we were a victim of memory ordering. */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan. */ - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) - if (READ_ONCE(rdp->nocb_head)) { - /* Found CB, so short-circuit next wait. */ - my_rdp->nocb_leader_sleep = false; - break; - } + if (rcu_nocb_poll) { + schedule_timeout_interruptible(1); + } else { + trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, + TPS("WokeEmpty")); + } goto wait_again; } /* Wait for one grace period. */ rcu_nocb_wait_gp(my_rdp); - /* - * We left ->nocb_leader_sleep unset to reduce cache thrashing. - * We set it now, but recheck for new callbacks while - * traversing our follower list. 
- */ - my_rdp->nocb_leader_sleep = true; - smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */ - /* Each pass through the following loop wakes a follower, if needed. */ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { - if (READ_ONCE(rdp->nocb_head)) + if (!rcu_nocb_poll && + READ_ONCE(rdp->nocb_head) && + READ_ONCE(my_rdp->nocb_leader_sleep)) { + raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ + raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags); + } if (!rdp->nocb_gp_head) continue; /* No CBs, so no need to wake follower. */ /* Append callbacks to follower's "done" list. */ - tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = rdp->nocb_gp_tail; *tail = rdp->nocb_gp_head; - smp_mb__after_atomic(); /* Store *tail before wakeup. */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { - /* - * List was empty, wake up the follower. - * Memory barriers supplied by atomic_long_add(). - */ + /* List was empty, so wake up the follower. */ swake_up(&rdp->nocb_wq); } } @@ -2131,28 +2157,16 @@ wait_again: */ static void nocb_follower_wait(struct rcu_data *rdp) { - bool firsttime = true; - for (;;) { - if (!rcu_nocb_poll) { - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "FollowerSleep"); - swait_event_interruptible(rdp->nocb_wq, - READ_ONCE(rdp->nocb_follower_head)); - } else if (firsttime) { - /* Don't drown trace log with "Poll"! */ - firsttime = false; - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); - } + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); + swait_event_interruptible(rdp->nocb_wq, + READ_ONCE(rdp->nocb_follower_head)); if (smp_load_acquire(&rdp->nocb_follower_head)) { /* ^^^ Ensure CB invocation follows _head test. */ return; } - if (!rcu_nocb_poll) - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, - "WokeEmpty"); WARN_ON(signal_pending(current)); - schedule_timeout_interruptible(1); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty")); } } @@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) static int rcu_nocb_kthread(void *arg) { int c, cl; + unsigned long flags; struct rcu_head *list; struct rcu_head *next; struct rcu_head **tail; @@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg) nocb_follower_wait(rdp); /* Pull the ready-to-invoke callbacks onto local list. */ - list = READ_ONCE(rdp->nocb_follower_head); + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + list = rdp->nocb_follower_head; + rdp->nocb_follower_head = NULL; + tail = rdp->nocb_follower_tail; + rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); BUG_ON(!list); - trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); - WRITE_ONCE(rdp->nocb_follower_head, NULL); - tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); + trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty")); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, @@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). 
*/ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { + unsigned long flags; int ndw; - if (!rcu_nocb_need_deferred_wakeup(rdp)) + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + if (!rcu_nocb_need_deferred_wakeup(rdp)) { + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return; + } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); + __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); } +/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ +static void do_nocb_deferred_wakeup_timer(unsigned long x) +{ + do_nocb_deferred_wakeup_common((struct rcu_data *)x); +} + +/* + * Do a deferred wakeup of rcu_nocb_kthread() from fastpath. + * This means we do an inexact common-case check. Note that if + * we miss, ->nocb_timer will eventually clean things up. + */ +static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +{ + if (rcu_nocb_need_deferred_wakeup(rdp)) + do_nocb_deferred_wakeup_common(rdp); +} + void __init rcu_init_nohz(void) { int cpu; @@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) rdp->nocb_tail = &rdp->nocb_head; init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; + raw_spin_lock_init(&rdp->nocb_lock); + setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, + (unsigned long)rdp); } /* @@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, return false; } -static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, +static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp, struct rcu_data *rdp, unsigned long flags) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 00e77c470017..5033b66d2753 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq); static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); /* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_SRCU(tasks_rcu_exit_srcu); +DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ #define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) @@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void) mutex_unlock(&rcu_tasks_kthread_mutex); } +/* Do the srcu_read_lock() for the above synchronize_srcu(). */ +void exit_tasks_rcu_start(void) +{ + preempt_disable(); + current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + preempt_enable(); +} + +/* Do the srcu_read_unlock() for the above synchronize_srcu(). 
*/ +void exit_tasks_rcu_finish(void) +{ + preempt_disable(); + __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx); + preempt_enable(); +} + #endif /* #ifdef CONFIG_TASKS_RCU */ #ifndef CONFIG_TINY_RCU diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 53f0164ed362..78f54932ea1d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +obj-$(CONFIG_MEMBARRIER) += membarrier.o diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 13fc5ae9bf2f..c9524d2d9316 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -300,6 +300,8 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; + if (!READ_ONCE(x->done)) return false; @@ -307,14 +309,9 @@ bool completion_done(struct completion *x) * If ->done, we need to wait for complete() to release ->wait.lock * otherwise we can end up freeing the completion before complete() * is done referencing it. - * - * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders - * the loads of ->done and ->wait.lock such that we cannot observe - * the lock before complete() acquires it while observing the ->done - * after it's acquired the lock. */ - smp_rmb(); - spin_unlock_wait(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); + spin_unlock_irqrestore(&x->wait.lock, flags); return true; } EXPORT_SYMBOL(completion_done); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..3f29c6a89d80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -951,8 +951,13 @@ struct migration_arg { static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p, int dest_cpu) { - if (unlikely(!cpu_active(dest_cpu))) - return rq; + if (p->flags & PF_KTHREAD) { + if (unlikely(!cpu_online(dest_cpu))) + return rq; + } else { + if (unlikely(!cpu_active(dest_cpu))) + return rq; + } /* Affinity changed (again). */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) @@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + /* + * The membarrier system call requires a full memory barrier + * after storing to rq->curr, before going back to user-space. + * + * TODO: This smp_mb__after_unlock_lock can go away if PPC end + * up adding a full barrier to switch_mm(), or we should figure + * out if a smp_mb__after_unlock_lock is really the proper API + * to use. + */ + smp_mb__after_unlock_lock(); finish_lock_switch(rq, prev); finish_arch_post_lock_switch(); @@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt) if (likely(prev != next)) { rq->nr_switches++; rq->curr = next; + /* + * The membarrier system call requires each architecture + * to have a full memory barrier after updating + * rq->curr, before returning to user-space. For TSO + * (e.g. x86), the architecture must provide its own + * barrier in switch_mm(). For weakly ordered machines + * for which spin_unlock() acts as a full memory + * barrier, finish_lock_switch() in common code takes + * care of this barrier. 
For weakly ordered machines for + * which spin_unlock() acts as a RELEASE barrier (only + * arm64 and PowerPC), arm64 has a full barrier in + * switch_to(), and PowerPC has + * smp_mb__after_unlock_lock() before + * finish_lock_switch(). + */ ++*switch_count; trace_sched_switch(preempt, prev, next); @@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void) * To avoid it, we have to wait for releasing tsk->pi_lock which * is held by try_to_wake_up() */ - smp_mb(); - raw_spin_unlock_wait(¤t->pi_lock); + raw_spin_lock_irq(¤t->pi_lock); + raw_spin_unlock_irq(¤t->pi_lock); /* Causes final put_task_struct in finish_task_switch(): */ __set_current_state(TASK_DEAD); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c new file mode 100644 index 000000000000..a92fddc22747 --- /dev/null +++ b/kernel/sched/membarrier.c @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2010-2017 Mathieu Desnoyers + * + * membarrier system call + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include + +#include "sched.h" /* for cpu_rq(). */ + +/* + * Bitmask made from a "or" of all commands within enum membarrier_cmd, + * except MEMBARRIER_CMD_QUERY. + */ +#define MEMBARRIER_CMD_BITMASK \ + (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED) + +static void ipi_mb(void *info) +{ + smp_mb(); /* IPIs should be serializing but paranoid. */ +} + +static void membarrier_private_expedited(void) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (num_online_cpus() == 1) + return; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm == current->mm) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ +} + +/** + * sys_membarrier - issue memory barriers on a set of threads + * @cmd: Takes command values defined in enum membarrier_cmd. 
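+ *       (e.g. MEMBARRIER_CMD_QUERY, MEMBARRIER_CMD_SHARED, or
+ *       MEMBARRIER_CMD_PRIVATE_EXPEDITED, as handled below).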
+ * @flags: Currently needs to be 0. For future extensions. + * + * If this system call is not implemented, -ENOSYS is returned. If the + * command specified does not exist, not available on the running + * kernel, or if the command argument is invalid, this system call + * returns -EINVAL. For a given command, with flags argument set to 0, + * this system call is guaranteed to always return the same value until + * reboot. + * + * All memory accesses performed in program order from each targeted thread + * is guaranteed to be ordered with respect to sys_membarrier(). If we use + * the semantic "barrier()" to represent a compiler barrier forcing memory + * accesses to be performed in program order across the barrier, and + * smp_mb() to represent explicit memory barriers forcing full memory + * ordering across the barrier, we have the following ordering table for + * each pair of barrier(), sys_membarrier() and smp_mb(): + * + * The pair ordering is detailed as (O: ordered, X: not ordered): + * + * barrier() smp_mb() sys_membarrier() + * barrier() X X O + * smp_mb() X O O + * sys_membarrier() O O O + */ +SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) +{ + if (unlikely(flags)) + return -EINVAL; + switch (cmd) { + case MEMBARRIER_CMD_QUERY: + { + int cmd_mask = MEMBARRIER_CMD_BITMASK; + + if (tick_nohz_full_enabled()) + cmd_mask &= ~MEMBARRIER_CMD_SHARED; + return cmd_mask; + } + case MEMBARRIER_CMD_SHARED: + /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + if (tick_nohz_full_enabled()) + return -EINVAL; + if (num_online_cpus() > 1) + synchronize_sched(); + return 0; + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + membarrier_private_expedited(); + return 0; + default: + return -EINVAL; + } +} diff --git a/kernel/task_work.c b/kernel/task_work.c index d513051fcca2..836a72a66fba 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -96,20 +96,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + raw_spin_lock_irq(&task->pi_lock); do { work = READ_ONCE(task->task_works); head = !work && (task->flags & PF_EXITING) ? &work_exited : NULL; } while (cmpxchg(&task->task_works, work, head) != work); + raw_spin_unlock_irq(&task->pi_lock); if (!work) break; - /* - * Synchronize with task_work_cancel(). It can't remove - * the first entry == work, cmpxchg(task_works) should - * fail, but it can play with *work and other entries. - */ - raw_spin_unlock_wait(&task->pi_lock); do { next = work->next; diff --git a/kernel/torture.c b/kernel/torture.c index 55de96529287..637e172835d8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, torture_type, cpu); (*n_offl_successes)++; delta = jiffies - starttime; - sum_offl += delta; + *sum_offl += delta; if (*min_offl < 0) { *min_offl = delta; *max_offl = delta; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9979f46c81dc..51390febd5e3 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -96,19 +96,26 @@ static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) { + /* 1) Acquire the lock */ spin_lock(lock); - while (unlikely(nf_conntrack_locks_all)) { - spin_unlock(lock); - /* - * Order the 'nf_conntrack_locks_all' load vs. 
the - * spin_unlock_wait() loads below, to ensure - * that 'nf_conntrack_locks_all_lock' is indeed held: - */ - smp_rmb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ - spin_unlock_wait(&nf_conntrack_locks_all_lock); - spin_lock(lock); - } + /* 2) read nf_conntrack_locks_all, with ACQUIRE semantics + * It pairs with the smp_store_release() in nf_conntrack_all_unlock() + */ + if (likely(smp_load_acquire(&nf_conntrack_locks_all) == false)) + return; + + /* fast path failed, unlock */ + spin_unlock(lock); + + /* Slow path 1) get global lock */ + spin_lock(&nf_conntrack_locks_all_lock); + + /* Slow path 2) get the lock we want */ + spin_lock(lock); + + /* Slow path 3) release the global lock */ + spin_unlock(&nf_conntrack_locks_all_lock); } EXPORT_SYMBOL_GPL(nf_conntrack_lock); @@ -149,28 +156,27 @@ static void nf_conntrack_all_lock(void) int i; spin_lock(&nf_conntrack_locks_all_lock); + nf_conntrack_locks_all = true; - /* - * Order the above store of 'nf_conntrack_locks_all' against - * the spin_unlock_wait() loads below, such that if - * nf_conntrack_lock() observes 'nf_conntrack_locks_all' - * we must observe nf_conntrack_locks[] held: - */ - smp_mb(); /* spin_lock(&nf_conntrack_locks_all_lock) */ - for (i = 0; i < CONNTRACK_LOCKS; i++) { - spin_unlock_wait(&nf_conntrack_locks[i]); + spin_lock(&nf_conntrack_locks[i]); + + /* This spin_unlock provides the "release" to ensure that + * nf_conntrack_locks_all==true is visible to everyone that + * acquired spin_lock(&nf_conntrack_locks[]). + */ + spin_unlock(&nf_conntrack_locks[i]); } } static void nf_conntrack_all_unlock(void) { - /* - * All prior stores must be complete before we clear + /* All prior stores must be complete before we clear * 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock() * might observe the false value but not the entire - * critical section: + * critical section. + * It pairs with the smp_load_acquire() in nf_conntrack_lock() */ smp_store_release(&nf_conntrack_locks_all, false); spin_unlock(&nf_conntrack_locks_all_lock); diff --git a/tools/testing/selftests/rcutorture/bin/config_override.sh b/tools/testing/selftests/rcutorture/bin/config_override.sh new file mode 100755 index 000000000000..49fa51726ce3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/config_override.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# +# config_override.sh base override +# +# Combines base and override, removing any Kconfig options from base +# that conflict with any in override, concatenating what remains and +# sending the result to standard output. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, you can access it online at +# http://www.gnu.org/licenses/gpl-2.0.html. +# +# Copyright (C) IBM Corporation, 2017 +# +# Authors: Paul E. McKenney + +base=$1 +if test -r $base +then + : +else + echo Base file $base unreadable!!! + exit 1 +fi + +override=$2 +if test -r $override +then + : +else + echo Override file $override unreadable!!! 
+ exit 1 +fi + +T=/tmp/config_override.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +sed < $override -e 's/^/grep -v "/' -e 's/=.*$/="/' | + awk ' + { + if (last) + print last " |"; + last = $0; + } + END { + if (last) + print last; + }' > $T/script +sh $T/script < $base +cat $override diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 1426a9b97494..07a13779eece 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -66,8 +66,33 @@ configfrag_boot_params () { # configfrag_boot_cpus bootparam-string config-fragment-file config-cpus # -# Decreases number of CPUs based on any maxcpus= boot parameters specified. +# Decreases number of CPUs based on any nr_cpus= boot parameters specified. configfrag_boot_cpus () { + local bootargs="`configfrag_boot_params "$1" "$2"`" + local nr_cpus + if echo "${bootargs}" | grep -q 'nr_cpus=[0-9]' + then + nr_cpus="`echo "${bootargs}" | sed -e 's/^.*nr_cpus=\([0-9]*\).*$/\1/'`" + if test "$3" -gt "$nr_cpus" + then + echo $nr_cpus + else + echo $3 + fi + else + echo $3 + fi +} + +# configfrag_boot_maxcpus bootparam-string config-fragment-file config-cpus +# +# Decreases number of CPUs based on any maxcpus= boot parameters specified. +# This allows tests where additional CPUs come online later during the +# test run. However, the torture parameters will be set based on the +# number of CPUs initially present, so the scripting should schedule +# test runs based on the maxcpus= boot parameter controlling the initial +# number of CPUs instead of on the ultimate number of CPUs. +configfrag_boot_maxcpus () { local bootargs="`configfrag_boot_params "$1" "$2"`" local maxcpus if echo "${bootargs}" | grep -q 'maxcpus=[0-9]' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh index c29f2ec0bf9f..46752c164676 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh @@ -2,7 +2,7 @@ # # Build a kvm-ready Linux kernel from the tree in the current directory. # -# Usage: kvm-build.sh config-template build-dir more-configs +# Usage: kvm-build.sh config-template build-dir # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -34,24 +34,17 @@ then echo "kvm-build.sh :$builddir: Not a writable directory, cannot build into it" exit 1 fi -moreconfigs=${3} -if test -z "$moreconfigs" -o ! -r "$moreconfigs" -then - echo "kvm-build.sh :$moreconfigs: Not a readable file" - exit 1 -fi T=/tmp/test-linux.sh.$$ trap 'rm -rf $T' 0 mkdir $T -grep -v 'CONFIG_[A-Z]*_TORTURE_TEST=' < ${config_template} > $T/config +cp ${config_template} $T/config cat << ___EOF___ >> $T/config CONFIG_INITRAMFS_SOURCE="$TORTURE_INITRD" CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_CONSOLE=y ___EOF___ -cat $moreconfigs >> $T/config configinit.sh $T/config O=$builddir retval=$? diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 93eede4e8fbe..0af36a721b9c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -40,7 +40,7 @@ T=/tmp/kvm-test-1-run.sh.$$ trap 'rm -rf $T' 0 -touch $T +mkdir $T . $KVM/bin/functions.sh . 
$CONFIGFRAG/ver_functions.sh @@ -60,37 +60,33 @@ then echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it" exit 1 fi -cp $config_template $resdir/ConfigFragment echo ' ---' `date`: Starting build echo ' ---' Kconfig fragment at: $config_template >> $resdir/log +touch $resdir/ConfigFragment.input $resdir/ConfigFragment if test -r "$config_dir/CFcommon" then - cat < $config_dir/CFcommon >> $T + echo " --- $config_dir/CFcommon" >> $resdir/ConfigFragment.input + cat < $config_dir/CFcommon >> $resdir/ConfigFragment.input + config_override.sh $config_dir/CFcommon $config_template > $T/Kc1 + grep '#CHECK#' $config_dir/CFcommon >> $resdir/ConfigFragment +else + cp $config_template $T/Kc1 fi -# Optimizations below this point -# CONFIG_USB=n -# CONFIG_SECURITY=n -# CONFIG_NFS_FS=n -# CONFIG_SOUND=n -# CONFIG_INPUT_JOYSTICK=n -# CONFIG_INPUT_TABLET=n -# CONFIG_INPUT_TOUCHSCREEN=n -# CONFIG_INPUT_MISC=n -# CONFIG_INPUT_MOUSE=n -# # CONFIG_NET=n # disables console access, so accept the slower build. -# CONFIG_SCSI=n -# CONFIG_ATA=n -# CONFIG_FAT_FS=n -# CONFIG_MSDOS_FS=n -# CONFIG_VFAT_FS=n -# CONFIG_ISO9660_FS=n -# CONFIG_QUOTA=n -# CONFIG_HID=n -# CONFIG_CRYPTO=n -# CONFIG_PCCARD=n -# CONFIG_PCMCIA=n -# CONFIG_CARDBUS=n -# CONFIG_YENTA=n +echo " --- $config_template" >> $resdir/ConfigFragment.input +cat $config_template >> $resdir/ConfigFragment.input +grep '#CHECK#' $config_template >> $resdir/ConfigFragment +if test -n "$TORTURE_KCONFIG_ARG" +then + echo $TORTURE_KCONFIG_ARG | tr -s " " "\012" > $T/cmdline + echo " --- --kconfig argument" >> $resdir/ConfigFragment.input + cat $T/cmdline >> $resdir/ConfigFragment.input + config_override.sh $T/Kc1 $T/cmdline > $T/Kc2 + # Note that "#CHECK#" is not permitted on commandline. +else + cp $T/Kc1 $T/Kc2 +fi +cat $T/Kc2 >> $resdir/ConfigFragment + base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'` if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux then @@ -100,7 +96,9 @@ then KERNEL=$base_resdir/${BOOT_IMAGE##*/} # use the last component of ${BOOT_IMAGE} ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh ln -s $base_resdir/.config $resdir # for kvm-recheck.sh -elif kvm-build.sh $config_template $builddir $T + # Arch-independent indicator + touch $resdir/builtkernel +elif kvm-build.sh $T/Kc2 $builddir then # Had to build a kernel for this test. QEMU="`identify_qemu $builddir/vmlinux`" @@ -112,6 +110,8 @@ then then cp $builddir/$BOOT_IMAGE $resdir KERNEL=$resdir/${BOOT_IMAGE##*/} + # Arch-independent indicator + touch $resdir/builtkernel else echo No identifiable boot image, not running KVM, see $resdir. echo Do the torture scripts know about your architecture? @@ -149,7 +149,7 @@ fi # Generate -smp qemu argument. 
qemu_args="-enable-kvm -nographic $qemu_args" -cpu_count=`configNR_CPUS.sh $config_template` +cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment` cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"` vcpus=`identify_qemu_vcpus` if test $cpu_count -gt $vcpus diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 50091de3a911..b55895fb10ed 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -41,6 +41,7 @@ PATH=${KVM}/bin:$PATH; export PATH TORTURE_DEFCONFIG=defconfig TORTURE_BOOT_IMAGE="" TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD +TORTURE_KCONFIG_ARG="" TORTURE_KMAKE_ARG="" TORTURE_SHUTDOWN_GRACE=180 TORTURE_SUITE=rcu @@ -65,6 +66,7 @@ usage () { echo " --duration minutes" echo " --interactive" echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]" + echo " --kconfig Kconfig-options" echo " --kmake-arg kernel-make-arguments" echo " --mac nn:nn:nn:nn:nn:nn" echo " --no-initrd" @@ -129,6 +131,11 @@ do jitter="$2" shift ;; + --kconfig) + checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$' + TORTURE_KCONFIG_ARG="$2" + shift + ;; --kmake-arg) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" @@ -205,6 +212,7 @@ do then cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1` cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"` + cpu_count=`configfrag_boot_maxcpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"` for ((cur_rep=0;cur_rep<$config_reps;cur_rep++)) do echo $CF1 $cpu_count >> $T/cfgcpu @@ -275,6 +283,7 @@ TORTURE_BOOT_IMAGE="$TORTURE_BOOT_IMAGE"; export TORTURE_BOOT_IMAGE TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG TORTURE_INITRD="$TORTURE_INITRD"; export TORTURE_INITRD +TORTURE_KCONFIG_ARG="$TORTURE_KCONFIG_ARG"; export TORTURE_KCONFIG_ARG TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE @@ -324,6 +333,7 @@ function dump(first, pastlast, batchnum) { print "echo ----Start batch " batchnum ": `date`"; print "echo ----Start batch " batchnum ": `date` >> " rd "/log"; + print "needqemurun=" jn=1 for (j = first; j < pastlast; j++) { builddir=KVM "/b" jn @@ -359,10 +369,11 @@ function dump(first, pastlast, batchnum) for (j = 1; j < jn; j++) { builddir=KVM "/b" j print "rm -f " builddir ".ready" - print "if test -z \"$TORTURE_BUILDONLY\"" + print "if test -f \"" rd cfr[j] "/builtkernel\"" print "then" - print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date`"; - print "\techo ----", cfr[j], cpusr[j] ovf ": Starting kernel. `date` >> " rd "/log"; + print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date`"; + print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. 
`date` >> " rd "/log"; + print "\tneedqemurun=1" print "fi" } njitter = 0; @@ -377,13 +388,22 @@ function dump(first, pastlast, batchnum) njitter = 0; print "echo Build-only run, so suppressing jitter >> " rd "/log" } - for (j = 0; j < njitter; j++) - print "jitter.sh " j " " dur " " ja[2] " " ja[3] "&" - print "wait" - print "if test -z \"$TORTURE_BUILDONLY\"" + if (TORTURE_BUILDONLY) { + print "needqemurun=" + } + print "if test -n \"$needqemurun\"" print "then" + print "\techo ---- Starting kernels. `date`"; + print "\techo ---- Starting kernels. `date` >> " rd "/log"; + for (j = 0; j < njitter; j++) + print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" + print "\twait" print "\techo ---- All kernel runs complete. `date`"; print "\techo ---- All kernel runs complete. `date` >> " rd "/log"; + print "else" + print "\twait" + print "\techo ---- No kernel runs. `date`"; + print "\techo ---- No kernel runs. `date` >> " rd "/log"; print "fi" for (j = 1; j < jn; j++) { builddir=KVM "/b" j diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot index 6804f9dcfc1b..be7728db42fd 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot @@ -1 +1 @@ -rcutorture.torture_type=rcu_busted +rcutorture.torture_type=busted diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot deleted file mode 100644 index 84a7d51b7481..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-C.boot +++ /dev/null @@ -1 +0,0 @@ -rcutorture.torture_type=srcud diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u index 6bc24e99862f..c15ada821e45 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u @@ -4,6 +4,7 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_TINY_SRCU=y CONFIG_RCU_TRACE=n -CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_PREEMPT_COUNT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot index 1d14e1383016..9f3a4d28e508 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot @@ -1,4 +1,4 @@ -rcutorture.torture_type=rcu_bh maxcpus=8 +rcutorture.torture_type=rcu_bh maxcpus=8 nr_cpus=43 rcutree.gp_preinit_delay=3 rcutree.gp_init_delay=3 rcutree.gp_cleanup_delay=3 diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 9ad3f89c8dc7..af6fca03602f 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -69,11 +69,11 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE CONFIG_PREEMPT_RCU CONFIG_TREE_RCU CONFIG_TINY_RCU +CONFIG_TASKS_RCU These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP. CONFIG_SRCU -CONFIG_TASKS_RCU Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable.