From a523841ee4e506fa1f05ff3a85b1e6d8176a3d4d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 1 Oct 2015 15:36:48 -0700 Subject: [PATCH 01/12] arch/x86/include/asm/efi.h: fix build failure With KMEMCHECK=y, KASAN=n: arch/x86/platform/efi/efi.c:673:3: error: implicit declaration of function `memcpy' [-Werror=implicit-function-declaration] arch/x86/platform/efi/efi_64.c:139:2: error: implicit declaration of function `memcpy' [-Werror=implicit-function-declaration] arch/x86/include/asm/desc.h:121:2: error: implicit declaration of function `memcpy' [-Werror=implicit-function-declaration] Don't #undef memcpy if KASAN=n. Fixes: 769a8089c1fd ("x86, efi, kasan: #undef memset/memcpy/memmove per arch") Signed-off-by: Andrey Ryabinin Reported-by: Ingo Molnar Reported-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/efi.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ab5f1d447ef9..ae68be92f755 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -86,6 +86,7 @@ extern u64 asmlinkage efi_call(void *fp, ...); extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, u32 type, u64 attribute); +#ifdef CONFIG_KASAN /* * CONFIG_KASAN may redefine memset to __memset. __memset function is present * only in kernel binary. Since the EFI stub linked into a separate binary it @@ -95,6 +96,7 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, #undef memcpy #undef memset #undef memmove +#endif #endif /* CONFIG_X86_32 */ From 9ff42d10c3b3e26d9555878f31b9a2e5c24efa57 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 1 Oct 2015 15:36:51 -0700 Subject: [PATCH 02/12] userfaultfd: remove kernel header include from uapi header As include/uapi/linux/userfaultfd.h is a user visible header file, it should not include kernel-exclusive header files. So trying to build the userfaultfd test program from the selftests directory fails, since it contains a reference to linux/compiler.h. As it turns out, that header is not really needed there, so we can simply remove it to fix that issue. Signed-off-by: Andre Przywara Cc: Andrea Arcangeli Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index df0e09bb7dd5..9057d7af3ae1 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,8 +11,6 @@ #include -#include - #define UFFD_API ((__u64)0xAA) /* * After implementing the respective features it will become: From 03a2d2a3eafe4015412cf4e9675ca0e2d9204074 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 1 Oct 2015 15:36:54 -0700 Subject: [PATCH 03/12] mm/slab: fix unexpected index mapping result of kmalloc_size(INDEX_NODE+1) Commit description is copied from the original post of this bug: http://comments.gmane.org/gmane.linux.kernel.mm/135349 Kernels after v3.9 use kmalloc_size(INDEX_NODE + 1) to get the next larger cache size than the size index INDEX_NODE mapping. In kernels 3.9 and earlier we used malloc_sizes[INDEX_L3 + 1].cs_size. However, sometimes we can't get the right output we expected via kmalloc_size(INDEX_NODE + 1), causing a BUG(). The mapping table in the latest kernel is like: index = {0, 1, 2 , 3, 4, 5, 6, n} size = {0, 96, 192, 8, 16, 32, 64, 2^n} The mapping table before 3.10 is like this: index = {0 , 1 , 2, 3, 4 , 5 , 6, n} size = {32, 64, 96, 128, 192, 256, 512, 2^(n+3)} The problem on my mips64 machine is as follows: (1) When configured DEBUG_SLAB && DEBUG_PAGEALLOC && DEBUG_LOCK_ALLOC && DEBUG_SPINLOCK, the sizeof(struct kmem_cache_node) will be "150", and the macro INDEX_NODE turns out to be "2": #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) (2) Then the result of kmalloc_size(INDEX_NODE + 1) is 8. (3) Then "if(size >= kmalloc_size(INDEX_NODE + 1)" will lead to "size = PAGE_SIZE". (4) Then "if ((size >= (PAGE_SIZE >> 3))" test will be satisfied and "flags |= CFLGS_OFF_SLAB" will be covered. (5) if (flags & CFLGS_OFF_SLAB)" test will be satisfied and will go to "cachep->slabp_cache = kmalloc_slab(slab_size, 0u)", and the result here may be NULL while kernel bootup. (6) Finally,"BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));" causes the BUG info as the following shows (may be only mips64 has this problem): This patch fixes the problem of kmalloc_size(INDEX_NODE + 1) and removes the BUG by adding 'size >= 256' check to guarantee that all necessary small sized slabs are initialized regardless sequence of slab size in mapping table. Fixes: e33660165c90 ("slab: Use common kmalloc_index/kmalloc_size...") Signed-off-by: Joonsoo Kim Reported-by: Liuhailong Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index c77ebe6cc87c..4fcc5dd8d5a6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2190,9 +2190,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size >= kmalloc_size(INDEX_NODE + 1) - && cachep->object_size > cache_line_size() - && ALIGN(size, cachep->align) < PAGE_SIZE) { + /* + * To activate debug pagealloc, off-slab management is necessary + * requirement. In early phase of initialization, small sized slab + * doesn't get initialized so it would not be possible. So, we need + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. + */ + if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && + size >= 256 && cachep->object_size > cache_line_size() && + ALIGN(size, cachep->align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); size = PAGE_SIZE; } From 2f84a8990ebbe235c59716896e017c6b2ca1200f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 1 Oct 2015 15:36:57 -0700 Subject: [PATCH 04/12] mm: hugetlbfs: skip shared VMAs when unmapping private pages to satisfy a fault SunDong reported the following on https://bugzilla.kernel.org/show_bug.cgi?id=103841 I think I find a linux bug, I have the test cases is constructed. I can stable recurring problems in fedora22(4.0.4) kernel version, arch for x86_64. I construct transparent huge page, when the parent and child process with MAP_SHARE, MAP_PRIVATE way to access the same huge page area, it has the opportunity to lead to huge page copy on write failure, and then it will munmap the child corresponding mmap area, but then the child mmap area with VM_MAYSHARE attributes, child process munmap this area can trigger VM_BUG_ON in set_vma_resv_flags functions (vma - > vm_flags & VM_MAYSHARE). There were a number of problems with the report (e.g. it's hugetlbfs that triggers this, not transparent huge pages) but it was fundamentally correct in that a VM_BUG_ON in set_vma_resv_flags() can be triggered that looks like this vma ffff8804651fd0d0 start 00007fc474e00000 end 00007fc475e00000 next ffff8804651fd018 prev ffff8804651fd188 mm ffff88046b1b1800 prot 8000000000000027 anon_vma (null) vm_ops ffffffff8182a7a0 pgoff 0 file ffff88106bdb9800 private_data (null) flags: 0x84400fb(read|write|shared|mayread|maywrite|mayexec|mayshare|dontexpand|hugetlb) ------------ kernel BUG at mm/hugetlb.c:462! SMP Modules linked in: xt_pkttype xt_LOG xt_limit [..] CPU: 38 PID: 26839 Comm: map Not tainted 4.0.4-default #1 Hardware name: Dell Inc. PowerEdge R810/0TT6JF, BIOS 2.7.4 04/26/2012 set_vma_resv_flags+0x2d/0x30 The VM_BUG_ON is correct because private and shared mappings have different reservation accounting but the warning clearly shows that the VMA is shared. When a private COW fails to allocate a new page then only the process that created the VMA gets the page -- all the children unmap the page. If the children access that data in the future then they get killed. The problem is that the same file is mapped shared and private. During the COW, the allocation fails, the VMAs are traversed to unmap the other private pages but a shared VMA is found and the bug is triggered. This patch identifies such VMAs and skips them. Signed-off-by: Mel Gorman Reported-by: SunDong Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: David Rientjes Reviewed-by: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 999fb0aef8f1..9cc773483624 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3201,6 +3201,14 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, if (iter_vma == vma) continue; + /* + * Shared VMAs have their own reserves and do not affect + * MAP_PRIVATE accounting but it is possible that a shared + * VMA is using the same page so check and skip such VMAs. + */ + if (iter_vma->vm_flags & VM_MAYSHARE) + continue; + /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these From 8346c416d17bf5b4ea1508662959bb62e73fd6a5 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 1 Oct 2015 15:36:59 -0700 Subject: [PATCH 05/12] dax: fix NULL pointer in __dax_pmd_fault() Commit 46c043ede471 ("mm: take i_mmap_lock in unmap_mapping_range() for DAX") moved some code in __dax_pmd_fault() that was responsible for zeroing newly allocated PMD pages. The new location didn't properly set up 'kaddr', so when run this code resulted in a NULL pointer BUG. Fix this by getting the correct 'kaddr' via bdev_direct_access(). Signed-off-by: Ross Zwisler Reported-by: Dan Williams Reviewed-by: Dan Williams Cc: Alexander Viro Cc: Matthew Wilcox Cc: "Kirill A. Shutemov" Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 7ae6df7ea1d2..bcfb14bfc1e4 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -569,8 +569,20 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) goto fallback; + sector = bh.b_blocknr << (blkbits - 9); + if (buffer_unwritten(&bh) || buffer_new(&bh)) { int i; + + length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, + bh.b_size); + if (length < 0) { + result = VM_FAULT_SIGBUS; + goto out; + } + if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + goto fallback; + for (i = 0; i < PTRS_PER_PMD; i++) clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); wmb_pmem(); @@ -623,7 +635,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { - sector = bh.b_blocknr << (blkbits - 9); length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, bh.b_size); if (length < 0) { From 0610c25daa3e76e38ad5a8fae683a89ff9f71798 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Oct 2015 15:37:02 -0700 Subject: [PATCH 06/12] memcg: fix dirty page migration The problem starts with a file backed dirty page which is charged to a memcg. Then page migration is used to move oldpage to newpage. Migration: - copies the oldpage's data to newpage - clears oldpage.PG_dirty - sets newpage.PG_dirty - uncharges oldpage from memcg - charges newpage to memcg Clearing oldpage.PG_dirty decrements the charged memcg's dirty page count. However, because newpage is not yet charged, setting newpage.PG_dirty does not increment the memcg's dirty page count. After migration completes newpage.PG_dirty is eventually cleared, often in account_page_cleaned(). At this time newpage is charged to a memcg so the memcg's dirty page count is decremented which causes underflow because the count was not previously incremented by migration. This underflow causes balance_dirty_pages() to see a very large unsigned number of dirty memcg pages which leads to aggressive throttling of buffered writes by processes in non root memcg. This issue: - can harm performance of non root memcg buffered writes. - can report too small (even negative) values in memory.stat[(total_)dirty] counters of all memcg, including the root. To avoid polluting migrate.c with #ifdef CONFIG_MEMCG checks, introduce page_memcg() and set_page_memcg() helpers. Test: 0) setup and enter limited memcg mkdir /sys/fs/cgroup/test echo 1G > /sys/fs/cgroup/test/memory.limit_in_bytes echo $$ > /sys/fs/cgroup/test/cgroup.procs 1) buffered writes baseline dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k sync grep ^dirty /sys/fs/cgroup/test/memory.stat 2) buffered writes with compaction antagonist to induce migration yes 1 > /proc/sys/vm/compact_memory & rm -rf /data/tmp/foo dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k kill % sync grep ^dirty /sys/fs/cgroup/test/memory.stat 3) buffered writes without antagonist, should match baseline rm -rf /data/tmp/foo dd if=/dev/zero of=/data/tmp/foo bs=1M count=1k sync grep ^dirty /sys/fs/cgroup/test/memory.stat (speed, dirty residue) unpatched patched 1) 841 MB/s 0 dirty pages 886 MB/s 0 dirty pages 2) 611 MB/s -33427456 dirty pages 793 MB/s 0 dirty pages 3) 114 MB/s -33427456 dirty pages 891 MB/s 0 dirty pages Notice that unpatched baseline performance (1) fell after migration (3): 841 -> 114 MB/s. In the patched kernel, post migration performance matches baseline. Fixes: c4843a7593a9 ("memcg: add per cgroup dirty page accounting") Signed-off-by: Greg Thelen Reported-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +++++++++++++++++++++ mm/migrate.c | 12 +++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 91c08f6f0dc9..80001de019ba 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -905,6 +905,27 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } +#ifdef CONFIG_MEMCG +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return page->mem_cgroup; +} + +static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) +{ + page->mem_cgroup = memcg; +} +#else +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg) +{ +} +#endif + /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/mm/migrate.c b/mm/migrate.c index 7452a00bbb50..842ecd7aaf7f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -740,6 +740,15 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (PageSwapBacked(page)) SetPageSwapBacked(newpage); + /* + * Indirectly called below, migrate_page_copy() copies PG_dirty and thus + * needs newpage's memcg set to transfer memcg dirty page accounting. + * So perform memcg migration in two steps: + * 1. set newpage->mem_cgroup (here) + * 2. clear page->mem_cgroup (below) + */ + set_page_memcg(newpage, page_memcg(page)); + mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page, mode); @@ -756,9 +765,10 @@ static int move_to_new_page(struct page *newpage, struct page *page, rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc != MIGRATEPAGE_SUCCESS) { + set_page_memcg(newpage, NULL); newpage->mapping = NULL; } else { - mem_cgroup_migrate(page, newpage, false); + set_page_memcg(page, NULL); if (page_was_mapped) remove_migration_ptes(page, newpage); page->mapping = NULL; From 484ebb3b8c8b27dd2171696462a3116edb9ff801 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Oct 2015 15:37:05 -0700 Subject: [PATCH 07/12] memcg: make mem_cgroup_read_stat() unsigned mem_cgroup_read_stat() returns a page count by summing per cpu page counters. The summing is racy wrt. updates, so a transient negative sum is possible. Callers don't want negative values: - mem_cgroup_wb_stats() doesn't want negative nr_dirty or nr_writeback. This could confuse dirty throttling. - oom reports and memory.stat shouldn't show confusing negative usage. - tree_usage() already avoids negatives. Avoid returning negative page counts from mem_cgroup_read_stat() and convert it to unsigned. [akpm@linux-foundation.org: fix old typo while we're in there] Signed-off-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ddaeba34e09..03cc0a742ff1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -644,12 +644,14 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) } /* + * Return page count for single (non recursive) @memcg. + * * Implementation Note: reading percpu statistics for memcg. * * Both of vmstat[] and percpu_counter has threshold and do periodic * synchronization to implement "quick" read. There are trade-off between * reading cost and precision of value. Then, we may have a chance to implement - * a periodic synchronizion of counter in memcg's counter. + * a periodic synchronization of counter in memcg's counter. * * But this _read() function is used for user interface now. The user accounts * memory usage by memory cgroup and he _always_ requires exact value because @@ -659,17 +661,24 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * * If there are kernel internal actions which can make use of some not-exact * value, and reading all cpu value can be performance bottleneck in some - * common workload, threashold and synchonization as vmstat[] should be + * common workload, threshold and synchronization as vmstat[] should be * implemented. */ -static long mem_cgroup_read_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static unsigned long +mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { long val = 0; int cpu; + /* Per-cpu values can be negative, use a signed accumulator */ for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->count[idx], cpu); + /* + * Summing races with updates, so val may be negative. Avoid exposing + * transient negative values. + */ + if (val < 0) + val = 0; return val; } @@ -1254,7 +1263,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; - pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], + pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], K(mem_cgroup_read_stat(iter, i))); } @@ -2819,14 +2828,11 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; - long val = 0; + unsigned long val = 0; - /* Per-cpu values can be negative, use a signed accumulator */ for_each_mem_cgroup_tree(iter, memcg) val += mem_cgroup_read_stat(iter, idx); - if (val < 0) /* race ? */ - val = 0; return val; } @@ -3169,7 +3175,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; - seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], + seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); } @@ -3194,13 +3200,13 @@ static int memcg_stat_show(struct seq_file *m, void *v) (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - long long val = 0; + unsigned long long val = 0; if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; for_each_mem_cgroup_tree(mi, memcg) val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; - seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); + seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); } for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { From 09a59a9d57a9d6f49510c93304d6e105deb83b93 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 1 Oct 2015 15:37:08 -0700 Subject: [PATCH 08/12] drivers/input/joystick/Kconfig: zhenhua.c needs BITREVERSE It uses bitrev8(), so it must ensure that lib/bitrev.o gets included in vmlinux. Cc: Fengguang Wu Cc: yalin wang Cc: Dmitry Torokhov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/input/joystick/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/joystick/Kconfig b/drivers/input/joystick/Kconfig index 56eb471b5576..4215b5382092 100644 --- a/drivers/input/joystick/Kconfig +++ b/drivers/input/joystick/Kconfig @@ -196,6 +196,7 @@ config JOYSTICK_TWIDJOY config JOYSTICK_ZHENHUA tristate "5-byte Zhenhua RC transmitter" select SERIO + select BITREVERSE help Say Y here if you have a Zhen Hua PPM-4CH transmitter which is supplied with a ready to fly micro electric indoor helicopters From 54aea4542980a3ed580426a81c5af799df4d610d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 1 Oct 2015 15:37:11 -0700 Subject: [PATCH 09/12] kprobes: use _do_fork() in samples to make them work again Commit 3033f14ab78c ("clone: support passing tls argument via C rather than pt_regs magic") introduced _do_fork() that allowed to pass @tls parameter. The old do_fork() is defined only for architectures that are not ready to use this way and do not define HAVE_COPY_THREAD_TLS. Let's use _do_fork() in the kprobe examples to make them work again on all architectures. Signed-off-by: Petr Mladek Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Thiago Macieira Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- samples/kprobes/jprobe_example.c | 14 +++++++------- samples/kprobes/kprobe_example.c | 6 +++--- samples/kprobes/kretprobe_example.c | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c index 9119ac6a8270..c285a3b8a9f1 100644 --- a/samples/kprobes/jprobe_example.c +++ b/samples/kprobes/jprobe_example.c @@ -1,13 +1,13 @@ /* * Here's a sample kernel module showing the use of jprobes to dump - * the arguments of do_fork(). + * the arguments of _do_fork(). * * For more information on theory of operation of jprobes, see * Documentation/kprobes.txt * * Build and insert the kernel module as done in the kprobe example. * You will see the trace data in /var/log/messages and on the - * console whenever do_fork() is invoked to create a new process. + * console whenever _do_fork() is invoked to create a new process. * (Some messages may be suppressed if syslogd is configured to * eliminate duplicate messages.) */ @@ -17,13 +17,13 @@ #include /* - * Jumper probe for do_fork. + * Jumper probe for _do_fork. * Mirror principle enables access to arguments of the probed routine * from the probe handler. */ -/* Proxy routine having the same arguments as actual do_fork() routine */ -static long jdo_fork(unsigned long clone_flags, unsigned long stack_start, +/* Proxy routine having the same arguments as actual _do_fork() routine */ +static long j_do_fork(unsigned long clone_flags, unsigned long stack_start, unsigned long stack_size, int __user *parent_tidptr, int __user *child_tidptr) { @@ -36,9 +36,9 @@ static long jdo_fork(unsigned long clone_flags, unsigned long stack_start, } static struct jprobe my_jprobe = { - .entry = jdo_fork, + .entry = j_do_fork, .kp = { - .symbol_name = "do_fork", + .symbol_name = "_do_fork", }, }; diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index 366db1a9fb65..727eb21c9c56 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -1,13 +1,13 @@ /* * NOTE: This example is works on x86 and powerpc. * Here's a sample kernel module showing the use of kprobes to dump a - * stack trace and selected registers when do_fork() is called. + * stack trace and selected registers when _do_fork() is called. * * For more information on theory of operation of kprobes, see * Documentation/kprobes.txt * * You will see the trace data in /var/log/messages and on the console - * whenever do_fork() is invoked to create a new process. + * whenever _do_fork() is invoked to create a new process. */ #include @@ -16,7 +16,7 @@ /* For each probe you need to allocate a kprobe structure */ static struct kprobe kp = { - .symbol_name = "do_fork", + .symbol_name = "_do_fork", }; /* kprobe pre_handler: called just before the probed instruction is executed */ diff --git a/samples/kprobes/kretprobe_example.c b/samples/kprobes/kretprobe_example.c index 1041b6731598..ebb1d1aed547 100644 --- a/samples/kprobes/kretprobe_example.c +++ b/samples/kprobes/kretprobe_example.c @@ -7,7 +7,7 @@ * * usage: insmod kretprobe_example.ko func= * - * If no func_name is specified, do_fork is instrumented + * If no func_name is specified, _do_fork is instrumented * * For more information on theory of operation of kretprobes, see * Documentation/kprobes.txt @@ -25,7 +25,7 @@ #include #include -static char func_name[NAME_MAX] = "do_fork"; +static char func_name[NAME_MAX] = "_do_fork"; module_param_string(func, func_name, NAME_MAX, S_IRUGO); MODULE_PARM_DESC(func, "Function to kretprobe; this module will report the" " function's execution time"); From ef510194cefe0cd369ef73419cd65b0a5bb4fb5b Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 1 Oct 2015 15:37:13 -0700 Subject: [PATCH 10/12] memcg: remove pcp_counter_lock Commit 733a572e66d2 ("memcg: make mem_cgroup_read_{stat|event}() iterate possible cpus instead of online") removed the last use of the per memcg pcp_counter_lock but forgot to remove the variable. Kill the vestigial variable. Signed-off-by: Greg Thelen Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - mm/memcontrol.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad800e62cb7a..6452ff4c463f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -242,7 +242,6 @@ struct mem_cgroup { * percpu counter. */ struct mem_cgroup_stat_cpu __percpu *stat; - spinlock_t pcp_counter_lock; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct cg_proto tcp_mem; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03cc0a742ff1..1fedbde68f59 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4185,7 +4185,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg_wb_domain_init(memcg, GFP_KERNEL)) goto out_free_stat; - spin_lock_init(&memcg->pcp_counter_lock); return memcg; out_free_stat: From 44241628bb207ec211bebd156aaf69470d90c209 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 1 Oct 2015 15:37:16 -0700 Subject: [PATCH 11/12] thermal: avoid division by zero in power allocator During boot I get a div by zero Oops regression starting in v4.3-rc3. Signed-off-by: Andrea Arcangeli Reviewed-by: Javi Merino Cc: Zhang Rui Cc: Eduardo Valentin Cc: Daniel Kurtz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/thermal/power_allocator.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/thermal/power_allocator.c b/drivers/thermal/power_allocator.c index 7ff96270c933..e570ff084add 100644 --- a/drivers/thermal/power_allocator.c +++ b/drivers/thermal/power_allocator.c @@ -144,6 +144,16 @@ static void estimate_pid_constants(struct thermal_zone_device *tz, switch_on_temp = 0; temperature_threshold = control_temp - switch_on_temp; + /* + * estimate_pid_constants() tries to find appropriate default + * values for thermal zones that don't provide them. If a + * system integrator has configured a thermal zone with two + * passive trip points at the same temperature, that person + * hasn't put any effort to set up the thermal zone properly + * so just give up. + */ + if (!temperature_threshold) + return; if (!tz->tzp->k_po || force) tz->tzp->k_po = int_to_frac(sustainable_power) / From 676bd99178cd962ed24ffdad222b7069d330a969 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 1 Oct 2015 15:37:19 -0700 Subject: [PATCH 12/12] dmapool: fix overflow condition in pool_find_page() If a DMA pool lies at the very top of the dma_addr_t range (as may happen with an IOMMU involved), the calculated end address of the pool wraps around to zero, and page lookup always fails. Tweak the relevant calculation to be overflow-proof. Signed-off-by: Robin Murphy Cc: Arnd Bergmann Cc: Marek Szyprowski Cc: Sumit Semwal Cc: Sakari Ailus Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index 71a8998cd03a..312a716fa14c 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -394,7 +394,7 @@ static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; - if (dma < (page->dma + pool->allocation)) + if ((dma - page->dma) < pool->allocation) return page; } return NULL;