25e813ddc6
-----BEGIN PGP SIGNATURE----- iQIzBAABCAAdFiEEZH8oZUiU471FcZm+ONu9yGCSaT4FAmK22iMACgkQONu9yGCS aT5HJQ/6ApRt37oalJ6HFGpWaGChmbs9ttGCNxXMyUSLgSzfVqXEZBT4S5Nyjhz0 D6rxpFMHrQUWfoyEG7CEo53dBTeG6g3/NKah4godguxUqEmbKAy9rGYKLL9VTdo/ nH5mBXJJaMKlGX105R94Aq2BCKVeycNpcqWWTZrZepqCL1mFqGh0VhgU8wCeTi5f wmRuMh58WiWgdBOHTMYseUB8YsLEeDC1qZsQ/aD4tg3FaTK6KVSuervz++M4WzeK QG2JnFLJ3Sl/lPDMNhHEYK7PmHhYwBDonT36QP6Lr7yuOSd37fBrEufWI+9T1ULA lHtsLMPBiIGumOKRqUIKAH2etqeGaWrd4I311XkWI42vEMqM8ZBlVBFnRbC8VoPF irrzmYJ1LS0Fp3+0cdZBKmBa2mztgh+aWpsVfQk4jvwxafit5LIntENOHbbVeFSm LIlts+uB6nY3sPr8GOeKFbFyxDEeMR06GS1emzDKkFRi83dOLYcVbnrn6k8lY5j8 Utd/DONlhLfybvxw4bJi1ovc+kqUe66h9w7sxVQv0z5F5xiU6ur8VTJjTUr6NgXK MpkcvoaF8WXnoeHdFl7s+c1Q3O5q9HwTIpxa7HcT44euHj9ngq4QbXECF/xNJeok /iBtOjnpSjtobqet8QmZtOYdIqtQbTGpS2TjhqQSLzbLg7Q+sJA= =/rv5 -----END PGP SIGNATURE----- Merge 4.19.249 into android-4.19-stable Changes in 4.19.249 9p: missing chunk of "fs/9p: Don't update file type when updating file attributes" drivers/char/random.c: constify poolinfo_table drivers/char/random.c: remove unused stuct poolinfo::poolbits drivers/char/random.c: make primary_crng static random: only read from /dev/random after its pool has received 128 bits random: move rand_initialize() earlier random: document get_random_int() family latent_entropy: avoid build error when plugin cflags are not set random: fix soft lockup when trying to read from an uninitialized blocking pool random: Support freezable kthreads in add_hwgenerator_randomness() fdt: add support for rng-seed random: Use wait_event_freezable() in add_hwgenerator_randomness() char/random: Add a newline at the end of the file Revert "hwrng: core - Freeze khwrng thread during suspend" crypto: blake2s - generic C library implementation and selftest lib/crypto: blake2s: move hmac construction into wireguard lib/crypto: sha1: re-roll loops to reduce code size random: Don't wake crng_init_wait when crng_init == 1 random: Add a urandom_read_nowait() for random APIs that don't warn 
random: add GRND_INSECURE to return best-effort non-cryptographic bytes random: ignore GRND_RANDOM in getentropy(2) random: make /dev/random be almost like /dev/urandom char/random: silence a lockdep splat with printk() random: fix crash on multiple early calls to add_bootloader_randomness() random: remove the blocking pool random: delete code to pull data into pools random: remove kernel.random.read_wakeup_threshold random: remove unnecessary unlikely() random: convert to ENTROPY_BITS for better code readability random: Add and use pr_fmt() random: fix typo in add_timer_randomness() random: remove some dead code of poolinfo random: split primary/secondary crng init paths random: avoid warnings for !CONFIG_NUMA builds x86: Remove arch_has_random, arch_has_random_seed powerpc: Remove arch_has_random, arch_has_random_seed s390: Remove arch_has_random, arch_has_random_seed linux/random.h: Remove arch_has_random, arch_has_random_seed linux/random.h: Use false with bool linux/random.h: Mark CONFIG_ARCH_RANDOM functions __must_check powerpc: Use bool in archrandom.h random: add arch_get_random_*long_early() random: avoid arch_get_random_seed_long() when collecting IRQ randomness random: remove dead code left over from blocking pool MAINTAINERS: co-maintain random.c crypto: blake2s - include <linux/bug.h> instead of <asm/bug.h> crypto: blake2s - adjust include guard naming random: document add_hwgenerator_randomness() with other input functions random: remove unused irq_flags argument from add_interrupt_randomness() random: use BLAKE2s instead of SHA1 in extraction random: do not sign extend bytes for rotation when mixing random: do not re-init if crng_reseed completes before primary init random: mix bootloader randomness into pool random: harmonize "crng init done" messages random: use IS_ENABLED(CONFIG_NUMA) instead of ifdefs random: initialize ChaCha20 constants with correct endianness random: early initialization of ChaCha constants random: avoid superfluous call to 
RDRAND in CRNG extraction random: don't reset crng_init_cnt on urandom_read() random: fix typo in comments random: cleanup poolinfo abstraction random: cleanup integer types random: remove incomplete last_data logic random: remove unused extract_entropy() reserved argument random: rather than entropy_store abstraction, use global random: remove unused OUTPUT_POOL constants random: de-duplicate INPUT_POOL constants random: prepend remaining pool constants with POOL_ random: cleanup fractional entropy shift constants random: access input_pool_data directly rather than through pointer random: simplify arithmetic function flow in account() random: continually use hwgenerator randomness random: access primary_pool directly rather than through pointer random: only call crng_finalize_init() for primary_crng random: use computational hash for entropy extraction random: simplify entropy debiting random: use linear min-entropy accumulation crediting random: always wake up entropy writers after extraction random: make credit_entropy_bits() always safe random: remove use_input_pool parameter from crng_reseed() random: remove batched entropy locking random: fix locking in crng_fast_load() random: use RDSEED instead of RDRAND in entropy extraction random: inline leaves of rand_initialize() random: ensure early RDSEED goes through mixer on init random: do not xor RDRAND when writing into /dev/random random: absorb fast pool into input pool after fast load random: use hash function for crng_slow_load() random: remove outdated INT_MAX >> 6 check in urandom_read() random: zero buffer after reading entropy from userspace random: tie batched entropy generation to base_crng generation random: remove ifdef'd out interrupt bench random: remove unused tracepoints random: add proper SPDX header random: deobfuscate irq u32/u64 contributions random: introduce drain_entropy() helper to declutter crng_reseed() random: remove useless header comment random: remove whitespace and reorder includes 
random: group initialization wait functions random: group entropy extraction functions random: group entropy collection functions random: group userspace read/write functions random: group sysctl functions random: rewrite header introductory comment random: defer fast pool mixing to worker random: do not take pool spinlock at boot random: unify early init crng load accounting random: check for crng_init == 0 in add_device_randomness() random: pull add_hwgenerator_randomness() declaration into random.h random: clear fast pool, crng, and batches in cpuhp bring up random: round-robin registers as ulong, not u32 random: only wake up writers after zap if threshold was passed random: cleanup UUID handling random: unify cycles_t and jiffies usage and types random: do crng pre-init loading in worker rather than irq random: give sysctl_random_min_urandom_seed a more sensible value random: don't let 644 read-only sysctls be written to random: replace custom notifier chain with standard one random: use SipHash as interrupt entropy accumulator random: make consistent usage of crng_ready() random: reseed more often immediately after booting random: check for signal and try earlier when generating entropy random: skip fast_init if hwrng provides large chunk of entropy random: treat bootloader trust toggle the same way as cpu trust toggle random: re-add removed comment about get_random_{u32,u64} reseeding random: mix build-time latent entropy into pool at init random: do not split fast init input in add_hwgenerator_randomness() random: do not allow user to keep crng key around on stack random: check for signal_pending() outside of need_resched() check random: check for signals every PAGE_SIZE chunk of /dev/[u]random random: make random_get_entropy() return an unsigned long random: document crng_fast_key_erasure() destination possibility random: fix sysctl documentation nits init: call time_init() before rand_initialize() ia64: define get_cycles macro for arch-override s390: 
define get_cycles macro for arch-override parisc: define get_cycles macro for arch-override alpha: define get_cycles macro for arch-override powerpc: define get_cycles macro for arch-override timekeeping: Add raw clock fallback for random_get_entropy() m68k: use fallback for random_get_entropy() instead of zero mips: use fallback for random_get_entropy() instead of just c0 random arm: use fallback for random_get_entropy() instead of zero nios2: use fallback for random_get_entropy() instead of zero x86/tsc: Use fallback for random_get_entropy() instead of zero um: use fallback for random_get_entropy() instead of zero sparc: use fallback for random_get_entropy() instead of zero xtensa: use fallback for random_get_entropy() instead of zero random: insist on random_get_entropy() existing in order to simplify random: do not use batches when !crng_ready() random: do not pretend to handle premature next security model random: order timer entropy functions below interrupt functions random: do not use input pool from hard IRQs random: help compiler out with fast_mix() by using simpler arguments siphash: use one source of truth for siphash permutations random: use symbolic constants for crng_init states random: avoid initializing twice in credit race random: remove ratelimiting for in-kernel unseeded randomness random: use proper jiffies comparison macro random: handle latent entropy and command line from random_init() random: credit architectural init the exact amount random: use static branch for crng_ready() random: remove extern from functions in header random: use proper return types on get_random_{int,long}_wait() random: move initialization functions out of hot pages random: move randomize_page() into mm where it belongs random: convert to using fops->write_iter() random: wire up fops->splice_{read,write}_iter() random: check for signals after page of pool writes Revert "random: use static branch for crng_ready()" crypto: drbg - add FIPS 140-2 CTRNG for noise source 
crypto: drbg - always seeded with SP800-90B compliant noise source crypto: drbg - prepare for more fine-grained tracking of seeding state crypto: drbg - track whether DRBG was seeded with !rng_is_initialized() crypto: drbg - move dynamic ->reseed_threshold adjustments to __drbg_seed() crypto: drbg - always try to free Jitter RNG instance crypto: drbg - make reseeding from get_random_bytes() synchronous random: avoid checking crng_ready() twice in random_init() random: mark bootloader randomness code as __init random: account for arch randomness in bits powerpc/kasan: Silence KASAN warnings in __get_wchan() ASoC: cs42l52: Fix TLV scales for mixer controls ASoC: cs53l30: Correct number of volume levels on SX controls ASoC: cs42l52: Correct TLV for Bypass Volume ASoC: cs42l56: Correct typo in minimum level for SX volume controls ata: libata-core: fix NULL pointer deref in ata_host_alloc_pinfo() ASoC: wm8962: Fix suspend while playing music ASoC: es8328: Fix event generation for deemphasis control ASoC: wm_adsp: Fix event generation for wm_adsp_fw_put() scsi: vmw_pvscsi: Expand vcpuHint to 16 bits scsi: lpfc: Fix port stuck in bypassed state after LIP in PT2PT topology scsi: ipr: Fix missing/incorrect resource cleanup in error case scsi: pmcraid: Fix missing resource cleanup in error case virtio-mmio: fix missing put_device() when vm_cmdline_parent registration failed nfc: nfcmrvl: Fix memory leak in nfcmrvl_play_deferred ipv6: Fix signed integer overflow in l2tp_ip6_sendmsg net: ethernet: mtk_eth_soc: fix misuse of mem alloc interface netdev[napi]_alloc_frag random: credit cpu and bootloader seeds by default pNFS: Don't keep retrying if the server replied NFS4ERR_LAYOUTUNAVAILABLE i40e: Fix adding ADQ filter to TC0 i40e: Fix call trace in setup_tx_descriptors tty: goldfish: Fix free_irq() on remove misc: atmel-ssc: Fix IRQ check in ssc_probe mlxsw: spectrum_cnt: Reorder counter pools net: bgmac: Fix an erroneous kfree() in bgmac_remove() arm64: ftrace: fix branch 
range checks certs/blacklist_hashes.c: fix const confusion in certs blacklist faddr2line: Fix overlapping text section failures, the sequel irqchip/gic/realview: Fix refcount leak in realview_gic_of_init irqchip/gic-v3: Fix refcount leak in gic_populate_ppi_partitions comedi: vmk80xx: fix expression for tx buffer size USB: serial: option: add support for Cinterion MV31 with new baseline USB: serial: io_ti: add Agilent E5805A support usb: dwc2: Fix memory leak in dwc2_hcd_init usb: gadget: lpc32xx_udc: Fix refcount leak in lpc32xx_udc_probe serial: 8250: Store to lsr_save_flags after lsr read ext4: fix bug_on ext4_mb_use_inode_pa ext4: make variable "count" signed ext4: add reserved GDT blocks check virtio-pci: Remove wrong address verification in vp_del_vqs() net: openvswitch: fix misuse of the cached connection on tuple changes net: openvswitch: fix leak of nested actions RISC-V: fix barrier() use in <vdso/processor.h> powerpc/mm: Switch obsolete dssall to .long s390/mm: use non-quiescing sske for KVM switch to keyed guest usb: gadget: u_ether: fix regression in setting fixed MAC address xprtrdma: fix incorrect header size calculations tcp: add some entropy in __inet_hash_connect() tcp: use different parts of the port_offset for index and offset tcp: add small random increments to the source port tcp: dynamically allocate the perturb table used by source ports tcp: increase source port perturb table to 2^16 tcp: drop the hash_32() part from the index calculation Revert "hwmon: Make chip parameter for with_info API mandatory" Linux 4.19.249 Merge resolution notes: - Dropped the changes that added an LTS-specific backport of the blake2s library, since this branch already has a newer version of the blake2s library. - Added CHACHA20_KEY_SIZE and CHACHA20_BLOCK_SIZE constants to chacha.h, to minimize changes from the 4.19 LTS version of random.c - Retain a fix to the rng-seed support in drivers/of/fdt.c that this branch and 4.19.250 have, but 4.19.249 doesn't have. 
Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: If9d9e3168f0976f61ae1ab9b36c063558a7f6ebf
840 lines
21 KiB
C
840 lines
21 KiB
C
#include <linux/mm.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/string.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/export.h>
|
|
#include <linux/err.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <linux/sched/task_stack.h>
|
|
#include <linux/security.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/swapops.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/userfaultfd_k.h>
|
|
#include <linux/random.h>
|
|
|
|
#include <asm/sections.h>
|
|
#include <linux/uaccess.h>
|
|
|
|
#include "internal.h"
|
|
|
|
static inline int is_kernel_rodata(unsigned long addr)
|
|
{
|
|
return addr >= (unsigned long)__start_rodata &&
|
|
addr < (unsigned long)__end_rodata;
|
|
}
|
|
|
|
/**
 * kfree_const - free memory unless it lives in .rodata
 * @x: pointer to the memory
 *
 * kfree() is called only when @x is not inside the .rodata section;
 * for a .rodata pointer nothing is done.
 */
void kfree_const(const void *x)
{
	if (is_kernel_rodata((unsigned long)x))
		return;

	kfree(x);
}
EXPORT_SYMBOL(kfree_const);
|
|
|
|
/**
|
|
* kstrdup - allocate space for and copy an existing string
|
|
* @s: the string to duplicate
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*/
|
|
char *kstrdup(const char *s, gfp_t gfp)
|
|
{
|
|
size_t len;
|
|
char *buf;
|
|
|
|
if (!s)
|
|
return NULL;
|
|
|
|
len = strlen(s) + 1;
|
|
buf = kmalloc_track_caller(len, gfp);
|
|
if (buf)
|
|
memcpy(buf, s, len);
|
|
return buf;
|
|
}
|
|
EXPORT_SYMBOL(kstrdup);
|
|
|
|
/**
|
|
* kstrdup_const - conditionally duplicate an existing const string
|
|
* @s: the string to duplicate
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*
|
|
* Function returns source string if it is in .rodata section otherwise it
|
|
* fallbacks to kstrdup.
|
|
* Strings allocated by kstrdup_const should be freed by kfree_const.
|
|
*/
|
|
const char *kstrdup_const(const char *s, gfp_t gfp)
|
|
{
|
|
if (is_kernel_rodata((unsigned long)s))
|
|
return s;
|
|
|
|
return kstrdup(s, gfp);
|
|
}
|
|
EXPORT_SYMBOL(kstrdup_const);
|
|
|
|
/**
|
|
* kstrndup - allocate space for and copy an existing string
|
|
* @s: the string to duplicate
|
|
* @max: read at most @max chars from @s
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*
|
|
* Note: Use kmemdup_nul() instead if the size is known exactly.
|
|
*/
|
|
char *kstrndup(const char *s, size_t max, gfp_t gfp)
|
|
{
|
|
size_t len;
|
|
char *buf;
|
|
|
|
if (!s)
|
|
return NULL;
|
|
|
|
len = strnlen(s, max);
|
|
buf = kmalloc_track_caller(len+1, gfp);
|
|
if (buf) {
|
|
memcpy(buf, s, len);
|
|
buf[len] = '\0';
|
|
}
|
|
return buf;
|
|
}
|
|
EXPORT_SYMBOL(kstrndup);
|
|
|
|
/**
|
|
* kmemdup - duplicate region of memory
|
|
*
|
|
* @src: memory region to duplicate
|
|
* @len: memory region length
|
|
* @gfp: GFP mask to use
|
|
*/
|
|
void *kmemdup(const void *src, size_t len, gfp_t gfp)
|
|
{
|
|
void *p;
|
|
|
|
p = kmalloc_track_caller(len, gfp);
|
|
if (p)
|
|
memcpy(p, src, len);
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(kmemdup);
|
|
|
|
/**
|
|
* kmemdup_nul - Create a NUL-terminated string from unterminated data
|
|
* @s: The data to stringify
|
|
* @len: The size of the data
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*/
|
|
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
|
|
{
|
|
char *buf;
|
|
|
|
if (!s)
|
|
return NULL;
|
|
|
|
buf = kmalloc_track_caller(len + 1, gfp);
|
|
if (buf) {
|
|
memcpy(buf, s, len);
|
|
buf[len] = '\0';
|
|
}
|
|
return buf;
|
|
}
|
|
EXPORT_SYMBOL(kmemdup_nul);
|
|
|
|
/**
|
|
* memdup_user - duplicate memory region from user space
|
|
*
|
|
* @src: source address in user space
|
|
* @len: number of bytes to copy
|
|
*
|
|
* Returns an ERR_PTR() on failure. Result is physically
|
|
* contiguous, to be freed by kfree().
|
|
*/
|
|
void *memdup_user(const void __user *src, size_t len)
|
|
{
|
|
void *p;
|
|
|
|
p = kmalloc_track_caller(len, GFP_USER);
|
|
if (!p)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
kfree(p);
|
|
return ERR_PTR(-EFAULT);
|
|
}
|
|
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(memdup_user);
|
|
|
|
/**
|
|
* vmemdup_user - duplicate memory region from user space
|
|
*
|
|
* @src: source address in user space
|
|
* @len: number of bytes to copy
|
|
*
|
|
* Returns an ERR_PTR() on failure. Result may be not
|
|
* physically contiguous. Use kvfree() to free.
|
|
*/
|
|
void *vmemdup_user(const void __user *src, size_t len)
|
|
{
|
|
void *p;
|
|
|
|
p = kvmalloc(len, GFP_USER);
|
|
if (!p)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
kvfree(p);
|
|
return ERR_PTR(-EFAULT);
|
|
}
|
|
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(vmemdup_user);
|
|
|
|
/**
|
|
* strndup_user - duplicate an existing string from user space
|
|
* @s: The string to duplicate
|
|
* @n: Maximum number of bytes to copy, including the trailing NUL.
|
|
*/
|
|
char *strndup_user(const char __user *s, long n)
|
|
{
|
|
char *p;
|
|
long length;
|
|
|
|
length = strnlen_user(s, n);
|
|
|
|
if (!length)
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
if (length > n)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
p = memdup_user(s, length);
|
|
|
|
if (IS_ERR(p))
|
|
return p;
|
|
|
|
p[length - 1] = '\0';
|
|
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(strndup_user);
|
|
|
|
/**
|
|
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
|
|
*
|
|
* @src: source address in user space
|
|
* @len: number of bytes to copy
|
|
*
|
|
* Returns an ERR_PTR() on failure.
|
|
*/
|
|
void *memdup_user_nul(const void __user *src, size_t len)
|
|
{
|
|
char *p;
|
|
|
|
/*
|
|
* Always use GFP_KERNEL, since copy_from_user() can sleep and
|
|
* cause pagefault, which makes it pointless to use GFP_NOFS
|
|
* or GFP_ATOMIC.
|
|
*/
|
|
p = kmalloc_track_caller(len + 1, GFP_KERNEL);
|
|
if (!p)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
kfree(p);
|
|
return ERR_PTR(-EFAULT);
|
|
}
|
|
p[len] = '\0';
|
|
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(memdup_user_nul);
|
|
|
|
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
struct vm_area_struct *prev, struct rb_node *rb_parent)
|
|
{
|
|
struct vm_area_struct *next;
|
|
|
|
vma->vm_prev = prev;
|
|
if (prev) {
|
|
next = prev->vm_next;
|
|
prev->vm_next = vma;
|
|
} else {
|
|
mm->mmap = vma;
|
|
if (rb_parent)
|
|
next = rb_entry(rb_parent,
|
|
struct vm_area_struct, vm_rb);
|
|
else
|
|
next = NULL;
|
|
}
|
|
vma->vm_next = next;
|
|
if (next)
|
|
next->vm_prev = vma;
|
|
}
|
|
|
|
/* Check if the vma is being used as a stack by this task */
|
|
int vma_is_stack_for_current(struct vm_area_struct *vma)
|
|
{
|
|
struct task_struct * __maybe_unused t = current;
|
|
|
|
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
|
|
}
|
|
|
|
/**
|
|
* randomize_page - Generate a random, page aligned address
|
|
* @start: The smallest acceptable address the caller will take.
|
|
* @range: The size of the area, starting at @start, within which the
|
|
* random address must fall.
|
|
*
|
|
* If @start + @range would overflow, @range is capped.
|
|
*
|
|
* NOTE: Historical use of randomize_range, which this replaces, presumed that
|
|
* @start was already page aligned. We now align it regardless.
|
|
*
|
|
* Return: A page aligned address within [start, start + range). On error,
|
|
* @start is returned.
|
|
*/
|
|
unsigned long randomize_page(unsigned long start, unsigned long range)
|
|
{
|
|
if (!PAGE_ALIGNED(start)) {
|
|
range -= PAGE_ALIGN(start) - start;
|
|
start = PAGE_ALIGN(start);
|
|
}
|
|
|
|
if (start > ULONG_MAX - range)
|
|
range = ULONG_MAX - start;
|
|
|
|
range >>= PAGE_SHIFT;
|
|
|
|
if (range == 0)
|
|
return start;
|
|
|
|
return start + (get_random_long() % range << PAGE_SHIFT);
|
|
}
|
|
|
|
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Default layout, built when the architecture does not define
 * HAVE_ARCH_PICK_MMAP_LAYOUT: mappings start at TASK_UNMAPPED_BASE and
 * are placed by arch_get_unmapped_area().  @rlim_stack is not used.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
	mm->get_unmapped_area = arch_get_unmapped_area;
	mm->mmap_base = TASK_UNMAPPED_BASE;
}
#endif
|
|
|
|
/*
|
|
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
|
|
* back to the regular GUP.
|
|
* Note a difference with get_user_pages_fast: this always returns the
|
|
* number of pages pinned, 0 if no pages were pinned.
|
|
* If the architecture does not support this function, simply return with no
|
|
* pages pinned.
|
|
*/
|
|
int __weak __get_user_pages_fast(unsigned long start,
|
|
int nr_pages, int write, struct page **pages)
|
|
{
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
|
|
|
|
/**
|
|
* get_user_pages_fast() - pin user pages in memory
|
|
* @start: starting user address
|
|
* @nr_pages: number of pages from start to pin
|
|
* @write: whether pages will be written to
|
|
* @pages: array that receives pointers to the pages pinned.
|
|
* Should be at least nr_pages long.
|
|
*
|
|
* Returns number of pages pinned. This may be fewer than the number
|
|
* requested. If nr_pages is 0 or negative, returns 0. If no pages
|
|
* were pinned, returns -errno.
|
|
*
|
|
* get_user_pages_fast provides equivalent functionality to get_user_pages,
|
|
* operating on current and current->mm, with force=0 and vma=NULL. However
|
|
* unlike get_user_pages, it must be called without mmap_sem held.
|
|
*
|
|
* get_user_pages_fast may take mmap_sem and page table locks, so no
|
|
* assumptions can be made about lack of locking. get_user_pages_fast is to be
|
|
* implemented in a way that is advantageous (vs get_user_pages()) when the
|
|
* user memory area is already faulted in and present in ptes. However if the
|
|
* pages have to be faulted in, it may turn out to be slightly slower so
|
|
* callers need to carefully consider what to use. On many architectures,
|
|
* get_user_pages_fast simply falls back to get_user_pages.
|
|
*/
|
|
int __weak get_user_pages_fast(unsigned long start,
|
|
int nr_pages, int write, struct page **pages)
|
|
{
|
|
return get_user_pages_unlocked(start, nr_pages, pages,
|
|
write ? FOLL_WRITE : 0);
|
|
}
|
|
EXPORT_SYMBOL_GPL(get_user_pages_fast);
|
|
|
|
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
|
|
unsigned long len, unsigned long prot,
|
|
unsigned long flag, unsigned long pgoff)
|
|
{
|
|
unsigned long ret;
|
|
struct mm_struct *mm = current->mm;
|
|
unsigned long populate;
|
|
LIST_HEAD(uf);
|
|
|
|
ret = security_mmap_file(file, prot, flag);
|
|
if (!ret) {
|
|
if (down_write_killable(&mm->mmap_sem))
|
|
return -EINTR;
|
|
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
|
|
&populate, &uf);
|
|
up_write(&mm->mmap_sem);
|
|
userfaultfd_unmap_complete(mm, &uf);
|
|
if (populate)
|
|
mm_populate(ret, populate);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
unsigned long vm_mmap(struct file *file, unsigned long addr,
|
|
unsigned long len, unsigned long prot,
|
|
unsigned long flag, unsigned long offset)
|
|
{
|
|
if (unlikely(offset + PAGE_ALIGN(len) < offset))
|
|
return -EINVAL;
|
|
if (unlikely(offset_in_page(offset)))
|
|
return -EINVAL;
|
|
|
|
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
|
}
|
|
EXPORT_SYMBOL(vm_mmap);
|
|
|
|
/**
|
|
* kvmalloc_node - attempt to allocate physically contiguous memory, but upon
|
|
* failure, fall back to non-contiguous (vmalloc) allocation.
|
|
* @size: size of the request.
|
|
* @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
|
|
* @node: numa node to allocate from
|
|
*
|
|
* Uses kmalloc to get the memory but if the allocation fails then falls back
|
|
* to the vmalloc allocator. Use kvfree for freeing the memory.
|
|
*
|
|
* Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
|
|
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
|
|
* preferable to the vmalloc fallback, due to visible performance drawbacks.
|
|
*
|
|
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
|
|
* fall back to vmalloc.
|
|
*/
|
|
void *kvmalloc_node(size_t size, gfp_t flags, int node)
|
|
{
|
|
gfp_t kmalloc_flags = flags;
|
|
void *ret;
|
|
|
|
/*
|
|
* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
|
|
* so the given set of flags has to be compatible.
|
|
*/
|
|
if ((flags & GFP_KERNEL) != GFP_KERNEL)
|
|
return kmalloc_node(size, flags, node);
|
|
|
|
/*
|
|
* We want to attempt a large physically contiguous block first because
|
|
* it is less likely to fragment multiple larger blocks and therefore
|
|
* contribute to a long term fragmentation less than vmalloc fallback.
|
|
* However make sure that larger requests are not too disruptive - no
|
|
* OOM killer and no allocation failure warnings as we have a fallback.
|
|
*/
|
|
if (size > PAGE_SIZE) {
|
|
kmalloc_flags |= __GFP_NOWARN;
|
|
|
|
if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
|
|
kmalloc_flags |= __GFP_NORETRY;
|
|
}
|
|
|
|
ret = kmalloc_node(size, kmalloc_flags, node);
|
|
|
|
/*
|
|
* It doesn't really make sense to fallback to vmalloc for sub page
|
|
* requests
|
|
*/
|
|
if (ret || size <= PAGE_SIZE)
|
|
return ret;
|
|
|
|
return __vmalloc_node_flags_caller(size, node, flags,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(kvmalloc_node);
|
|
|
|
/**
 * kvfree() - Free memory.
 * @addr: Pointer to allocated memory.
 *
 * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc().
 * The right release function is picked with is_vmalloc_addr().
 * It is slightly more efficient to use kfree() or vfree() if you are certain
 * that you know which one to use.
 *
 * Context: Any context except NMI.
 */
void kvfree(const void *addr)
{
	if (!is_vmalloc_addr(addr)) {
		kfree(addr);
		return;
	}

	vfree(addr);
}
EXPORT_SYMBOL(kvfree);
|
|
|
|
/**
 * kvfree_sensitive - Free a data object containing sensitive information.
 * @addr: address of the data object to be freed.
 * @len: length of the data object.
 *
 * The contents of a kvmalloc'ed object holding sensitive data are wiped
 * with memzero_explicit(), which the compiler will not optimize away,
 * before the object is passed to kvfree().  Nothing is done when @addr
 * satisfies ZERO_OR_NULL_PTR().
 */
void kvfree_sensitive(const void *addr, size_t len)
{
	if (unlikely(ZERO_OR_NULL_PTR(addr)))
		return;

	memzero_explicit((void *)addr, len);
	kvfree(addr);
}
EXPORT_SYMBOL(kvfree_sensitive);
|
|
|
|
static inline void *__page_rmapping(struct page *page)
|
|
{
|
|
unsigned long mapping;
|
|
|
|
mapping = (unsigned long)page->mapping;
|
|
mapping &= ~PAGE_MAPPING_FLAGS;
|
|
|
|
return (void *)mapping;
|
|
}
|
|
|
|
/*
 * Neutral page->mapping pointer to address_space or anon_vma or other,
 * taken from the compound head of @page.
 */
void *page_rmapping(struct page *page)
{
	return __page_rmapping(compound_head(page));
}
|
|
|
|
/*
|
|
* Return true if this page is mapped into pagetables.
|
|
* For compound page it returns true if any subpage of compound page is mapped.
|
|
*/
|
|
bool page_mapped(struct page *page)
|
|
{
|
|
int i;
|
|
|
|
if (likely(!PageCompound(page)))
|
|
return atomic_read(&page->_mapcount) >= 0;
|
|
page = compound_head(page);
|
|
if (atomic_read(compound_mapcount_ptr(page)) >= 0)
|
|
return true;
|
|
if (PageHuge(page))
|
|
return false;
|
|
for (i = 0; i < (1 << compound_order(page)); i++) {
|
|
if (atomic_read(&page[i]._mapcount) >= 0)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL(page_mapped);
|
|
|
|
struct anon_vma *page_anon_vma(struct page *page)
|
|
{
|
|
unsigned long mapping;
|
|
|
|
page = compound_head(page);
|
|
mapping = (unsigned long)page->mapping;
|
|
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
|
|
return NULL;
|
|
return __page_rmapping(page);
|
|
}
|
|
|
|
struct address_space *page_mapping(struct page *page)
|
|
{
|
|
struct address_space *mapping;
|
|
|
|
page = compound_head(page);
|
|
|
|
/* This happens if someone calls flush_dcache_page on slab page */
|
|
if (unlikely(PageSlab(page)))
|
|
return NULL;
|
|
|
|
if (unlikely(PageSwapCache(page))) {
|
|
swp_entry_t entry;
|
|
|
|
entry.val = page_private(page);
|
|
return swap_address_space(entry);
|
|
}
|
|
|
|
mapping = page->mapping;
|
|
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
|
|
return NULL;
|
|
|
|
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
|
|
}
|
|
EXPORT_SYMBOL(page_mapping);
|
|
|
|
/*
 * For file cache pages, return the address_space, otherwise return NULL.
 * Swap cache pages are filtered out before page_mapping() is consulted.
 */
struct address_space *page_mapping_file(struct page *page)
{
	if (likely(!PageSwapCache(page)))
		return page_mapping(page);

	return NULL;
}
|
|
|
|
/* Slow path of page_mapcount() for compound pages */
|
|
int __page_mapcount(struct page *page)
|
|
{
|
|
int ret;
|
|
|
|
ret = atomic_read(&page->_mapcount) + 1;
|
|
/*
|
|
* For file THP page->_mapcount contains total number of mapping
|
|
* of the page: no need to look into compound_mapcount.
|
|
*/
|
|
if (!PageAnon(page) && !PageHuge(page))
|
|
return ret;
|
|
page = compound_head(page);
|
|
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
|
|
if (PageDoubleMap(page))
|
|
ret--;
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__page_mapcount);
|
|
|
|
/* Overcommit policy; defaults to OVERCOMMIT_GUESS. */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
/* Percentage of RAM used by vm_commit_limit() when no kbytes value is set. */
int sysctl_overcommit_ratio __read_mostly = 50;
/* Commit limit in kbytes; when non-zero it takes precedence over the ratio. */
unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
|
|
|
|
int overcommit_ratio_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
{
|
|
int ret;
|
|
|
|
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
|
if (ret == 0 && write)
|
|
sysctl_overcommit_kbytes = 0;
|
|
return ret;
|
|
}
|
|
|
|
int overcommit_kbytes_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
{
|
|
int ret;
|
|
|
|
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
|
if (ret == 0 && write)
|
|
sysctl_overcommit_ratio = 0;
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
|
|
*/
|
|
unsigned long vm_commit_limit(void)
|
|
{
|
|
unsigned long allowed;
|
|
|
|
if (sysctl_overcommit_kbytes)
|
|
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
|
|
else
|
|
allowed = ((totalram_pages - hugetlb_total_pages())
|
|
* sysctl_overcommit_ratio / 100);
|
|
allowed += total_swap_pages;
|
|
|
|
return allowed;
|
|
}
|
|
|
|
/*
 * Make sure vm_committed_as is in one cacheline and does not share that
 * cacheline with other variables. It can be updated by several CPUs
 * frequently.
 */
|
|
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
|
|
|
|
/*
|
|
* The global memory commitment made in the system can be a metric
|
|
* that can be used to drive ballooning decisions when Linux is hosted
|
|
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
|
|
* balancing memory across competing virtual machines that are hosted.
|
|
* Several metrics drive this policy engine including the guest reported
|
|
* memory commitment.
|
|
*/
|
|
unsigned long vm_memory_committed(void)
|
|
{
|
|
return percpu_counter_read_positive(&vm_committed_as);
|
|
}
|
|
EXPORT_SYMBOL_GPL(vm_memory_committed);
|
|
|
|
/*
|
|
* Check that a process has enough memory to allocate a new virtual
|
|
* mapping. 0 means there is enough memory for the allocation to
|
|
* succeed and -ENOMEM implies there is not.
|
|
*
|
|
* We currently support three overcommit policies, which are set via the
|
|
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
|
|
*
|
|
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
|
|
* Additional code 2002 Jul 20 by Robert Love.
|
|
*
|
|
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
|
|
*
|
|
* Note this is a helper function intended to be used by LSMs which
|
|
* wish to use this logic.
|
|
*/
|
|
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
|
|
{
|
|
long free, allowed, reserve;
|
|
|
|
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
|
|
-(s64)vm_committed_as_batch * num_online_cpus(),
|
|
"memory commitment underflow");
|
|
|
|
vm_acct_memory(pages);
|
|
|
|
/*
|
|
* Sometimes we want to use more memory than we have
|
|
*/
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
|
|
return 0;
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
|
|
free = global_zone_page_state(NR_FREE_PAGES);
|
|
free += global_node_page_state(NR_FILE_PAGES);
|
|
|
|
/*
|
|
* shmem pages shouldn't be counted as free in this
|
|
* case, they can't be purged, only swapped out, and
|
|
* that won't affect the overall amount of available
|
|
* memory in the system.
|
|
*/
|
|
free -= global_node_page_state(NR_SHMEM);
|
|
|
|
free += get_nr_swap_pages();
|
|
|
|
/*
|
|
* Any slabs which are created with the
|
|
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
|
|
* which are reclaimable, under pressure. The dentry
|
|
* cache and most inode caches should fall into this
|
|
*/
|
|
free += global_node_page_state(NR_SLAB_RECLAIMABLE);
|
|
|
|
/*
|
|
* Part of the kernel memory, which can be released
|
|
* under memory pressure.
|
|
*/
|
|
free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
|
|
|
|
/*
|
|
* Leave reserved pages. The pages are not for anonymous pages.
|
|
*/
|
|
if (free <= totalreserve_pages)
|
|
goto error;
|
|
else
|
|
free -= totalreserve_pages;
|
|
|
|
/*
|
|
* Reserve some for root
|
|
*/
|
|
if (!cap_sys_admin)
|
|
free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
if (free > pages)
|
|
return 0;
|
|
|
|
goto error;
|
|
}
|
|
|
|
allowed = vm_commit_limit();
|
|
/*
|
|
* Reserve some for root
|
|
*/
|
|
if (!cap_sys_admin)
|
|
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
/*
|
|
* Don't let a single process grow so big a user can't recover
|
|
*/
|
|
if (mm) {
|
|
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
allowed -= min_t(long, mm->total_vm / 32, reserve);
|
|
}
|
|
|
|
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
|
|
return 0;
|
|
error:
|
|
vm_unacct_memory(pages);
|
|
|
|
return -ENOMEM;
|
|
}
|
|
|
|
/**
|
|
* get_cmdline() - copy the cmdline value to a buffer.
|
|
* @task: the task whose cmdline value to copy.
|
|
* @buffer: the buffer to copy to.
|
|
* @buflen: the length of the buffer. Larger cmdline values are truncated
|
|
* to this length.
|
|
* Returns the size of the cmdline field copied. Note that the copy does
|
|
* not guarantee an ending NULL byte.
|
|
*/
|
|
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
|
{
|
|
int res = 0;
|
|
unsigned int len;
|
|
struct mm_struct *mm = get_task_mm(task);
|
|
unsigned long arg_start, arg_end, env_start, env_end;
|
|
if (!mm)
|
|
goto out;
|
|
if (!mm->arg_end)
|
|
goto out_mm; /* Shh! No looking before we're done */
|
|
|
|
down_read(&mm->mmap_sem);
|
|
arg_start = mm->arg_start;
|
|
arg_end = mm->arg_end;
|
|
env_start = mm->env_start;
|
|
env_end = mm->env_end;
|
|
up_read(&mm->mmap_sem);
|
|
|
|
len = arg_end - arg_start;
|
|
|
|
if (len > buflen)
|
|
len = buflen;
|
|
|
|
res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
|
|
|
|
/*
|
|
* If the nul at the end of args has been overwritten, then
|
|
* assume application is using setproctitle(3).
|
|
*/
|
|
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
|
|
len = strnlen(buffer, res);
|
|
if (len < res) {
|
|
res = len;
|
|
} else {
|
|
len = env_end - env_start;
|
|
if (len > buflen - res)
|
|
len = buflen - res;
|
|
res += access_process_vm(task, env_start,
|
|
buffer+res, len,
|
|
FOLL_FORCE);
|
|
res = strnlen(buffer, res);
|
|
}
|
|
}
|
|
out_mm:
|
|
mmput(mm);
|
|
out:
|
|
return res;
|
|
}
|