diff --git a/arch/Kconfig b/arch/Kconfig index 8a8ea7110de8..8f3638674e05 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -87,7 +87,6 @@ config KPROBES_ON_FTRACE config UPROBES def_bool n - select PERCPU_RWSEM help Uprobes is the user-space counterpart to kprobes: they enable instrumentation applications (such as 'perf probe') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f5021fcb154e..a8ab8f5ef38e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1638,9 +1638,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1679,9 +1677,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); diff --git a/fs/super.c b/fs/super.c index b61372354f2b..767b1e10f6ad 100644 --- a/fs/super.c +++ b/fs/super.c @@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink, return total_objects; } +static void destroy_super_work(struct work_struct *work) +{ + struct super_block *s = container_of(work, struct super_block, + destroy_work); + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_free_rwsem(&s->s_writers.rw_sem[i]); + kfree(s); +} + +static void destroy_super_rcu(struct rcu_head *head) +{ + struct super_block *s = container_of(head, struct super_block, rcu); + INIT_WORK(&s->destroy_work, destroy_super_work); + schedule_work(&s->destroy_work); +} + /** * destroy_super - frees a superblock * @s: superblock to free @@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink, */ static void destroy_super(struct super_block *s) { - int i; list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - for (i = 0; i < SB_FREEZE_LEVELS; i++) - percpu_counter_destroy(&s->s_writers.counter[i]); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); - kfree_rcu(s, rcu); + call_rcu(&s->rcu, destroy_super_rcu); } /** @@ -178,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0, - GFP_KERNEL) < 0) + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], + sb_writers_name[i], + &type->s_writers_key[i])) goto fail; - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], - &type->s_writers_key[i], 0); } - init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; @@ -1146,72 +1159,46 @@ out: */ void __sb_end_write(struct super_block *sb, int level) { - percpu_counter_dec(&sb->s_writers.counter[level-1]); - /* - * Make sure s_writers are updated before we wake up waiters in - * freeze_super(). - */ - smp_mb(); - if (waitqueue_active(&sb->s_writers.wait)) - wake_up(&sb->s_writers.wait); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); + percpu_up_read(sb->s_writers.rw_sem + level-1); } EXPORT_SYMBOL(__sb_end_write); -#ifdef CONFIG_LOCKDEP -/* - * We want lockdep to tell us about possible deadlocks with freezing but - * it's it bit tricky to properly instrument it. Getting a freeze protection - * works as getting a read lock but there are subtle problems. XFS for example - * gets freeze protection on internal level twice in some cases, which is OK - * only because we already hold a freeze protection also on higher level. Due - * to these cases we have to tell lockdep we are doing trylock when we - * already hold a freeze protection for a higher freeze level. - */ -static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, - unsigned long ip) -{ - int i; - - if (!trylock) { - for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { - trylock = true; - break; - } - } - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); -} -#endif - /* * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. */ int __sb_start_write(struct super_block *sb, int level, bool wait) { -retry: - if (unlikely(sb->s_writers.frozen >= level)) { - if (!wait) - return 0; - wait_event(sb->s_writers.wait_unfrozen, - sb->s_writers.frozen < level); - } + bool force_trylock = false; + int ret = 1; #ifdef CONFIG_LOCKDEP - acquire_freeze_lock(sb, level, !wait, _RET_IP_); -#endif - percpu_counter_inc(&sb->s_writers.counter[level-1]); /* - * Make sure counter is updated before we check for frozen. - * freeze_super() first sets frozen and then checks the counter. + * We want lockdep to tell us about possible deadlocks with freezing + * but it's it bit tricky to properly instrument it. Getting a freeze + * protection works as getting a read lock but there are subtle + * problems. XFS for example gets freeze protection on internal level + * twice in some cases, which is OK only because we already hold a + * freeze protection also on higher level. Due to these cases we have + * to use wait == F (trylock mode) which must not fail. */ - smp_mb(); - if (unlikely(sb->s_writers.frozen >= level)) { - __sb_end_write(sb, level); - goto retry; + if (wait) { + int i; + + for (i = 0; i < level - 1; i++) + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { + force_trylock = true; + break; + } } - return 1; +#endif + if (wait && !force_trylock) + percpu_down_read(sb->s_writers.rw_sem + level-1); + else + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); + + WARN_ON(force_trylock & !ret); + return ret; } EXPORT_SYMBOL(__sb_start_write); @@ -1221,37 +1208,33 @@ EXPORT_SYMBOL(__sb_start_write); * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file - * system. Caller of this function should make sure there can be no new writers - * of type @level before calling this function. Otherwise this function can - * livelock. + * system. */ static void sb_wait_write(struct super_block *sb, int level) { - s64 writers; - + percpu_down_write(sb->s_writers.rw_sem + level-1); /* - * We just cycle-through lockdep here so that it does not complain - * about returning with lock to userspace + * We are going to return to userspace and forget about this lock, the + * ownership goes to the caller of thaw_super() which does unlock. + * + * FIXME: we should do this before return from freeze_super() after we + * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() + * should re-acquire these locks before s_op->unfreeze_fs(sb). However + * this leads to lockdep false-positives, so currently we do the early + * release right after acquire. */ - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); +} - do { - DEFINE_WAIT(wait); +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; - /* - * We use a barrier in prepare_to_wait() to separate setting - * of frozen and checking of the counter - */ - prepare_to_wait(&sb->s_writers.wait, &wait, - TASK_UNINTERRUPTIBLE); + for (level = 0; level < SB_FREEZE_LEVELS; ++level) + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); - if (writers) - schedule(); - - finish_wait(&sb->s_writers.wait, &wait); - } while (writers); + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_up_write(sb->s_writers.rw_sem + level); } /** @@ -1310,20 +1293,14 @@ int freeze_super(struct super_block *sb) return 0; } - /* From now on, no new normal writers can start */ sb->s_writers.frozen = SB_FREEZE_WRITE; - smp_wmb(); - /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); - sb_wait_write(sb, SB_FREEZE_WRITE); + down_write(&sb->s_umount); /* Now we go and block page faults... */ - down_write(&sb->s_umount); sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; - smp_wmb(); - sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ @@ -1331,7 +1308,6 @@ int freeze_super(struct super_block *sb) /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; - smp_wmb(); sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { @@ -1340,7 +1316,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1372,8 +1348,10 @@ int thaw_super(struct super_block *sb) return -EINVAL; } - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & MS_RDONLY) { + sb->s_writers.frozen = SB_UNFROZEN; goto out; + } if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -1385,12 +1363,11 @@ int thaw_super(struct super_block *sb) } } -out: sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); +out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); - return 0; } EXPORT_SYMBOL(thaw_super); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3859f5e27a4d..9bbb3507376a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc( * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ - rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -171,8 +170,7 @@ xfs_setfilesize_ioend( * Similarly for freeze protection. */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f7..ce356f66cc2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1,7 +1,6 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H - #include #include #include @@ -30,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -1274,16 +1275,9 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - /* Counters for counting writers at each level */ - struct percpu_counter counter[SB_FREEZE_LEVELS]; - wait_queue_head_t wait; /* queue for waiting for - writers / faults to finish */ - int frozen; /* Is sb frozen? */ - wait_queue_head_t wait_unfrozen; /* queue for waiting for - sb to be thawed */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map lock_map[SB_FREEZE_LEVELS]; -#endif + int frozen; /* Is sb frozen? */ + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { @@ -1375,7 +1369,7 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; - + struct work_struct destroy_work; /* * Indicates how deep in a filesystem stack this SB is */ @@ -1391,6 +1385,11 @@ extern struct timespec current_fs_time(struct super_block *sb); void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); +#define __sb_writers_acquired(sb, lev) \ + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 3e88c9a7d57f..834c4e52cb2d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -16,6 +16,7 @@ struct percpu_rw_semaphore { }; extern void percpu_down_read(struct percpu_rw_semaphore *); +extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); extern void percpu_up_read(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); @@ -31,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); __percpu_init_rwsem(brw, #brw, &rwsem_key); \ }) + +#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) + +static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_release(&sem->rw_sem.dep_map, 1, ip); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + if (!read) + sem->rw_sem.owner = NULL; +#endif +} + +static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); +} + #endif diff --git a/init/Kconfig b/init/Kconfig index af09b4fb43d2..288c0122c2a5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -925,7 +925,6 @@ config NUMA_BALANCING_DEFAULT_ENABLED menuconfig CGROUPS bool "Control Group support" select KERNFS - select PERCPU_RWSEM help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..4c6a97e1a849 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,6 +25,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..f32567254867 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) __up_read(&brw->rw_sem); } +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} + void percpu_up_read(struct percpu_rw_semaphore *brw) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); diff --git a/lib/Kconfig b/lib/Kconfig index 3a2ef67db6c7..f6aa03dc1576 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -53,9 +53,6 @@ config GENERIC_IO config STMP_DEVICE bool -config PERCPU_RWSEM - bool - config ARCH_USE_CMPXCHG_LOCKREF bool