Changes in 4.9.320 9p: missing chunk of "fs/9p: Don't update file type when updating file attributes" random: remove stale maybe_reseed_primary_crng random: remove stale urandom_init_wait random: remove variable limit random: fix comment for unused random_min_urandom_seed random: convert get_random_int/long into get_random_u32/u64 random: move random_min_urandom_seed into CONFIG_SYSCTL ifdef block random: invalidate batched entropy after crng init random: silence compiler warnings and fix race random: add wait_for_random_bytes() API random: add get_random_{bytes,u32,u64,int,long,once}_wait family random: warn when kernel uses unseeded randomness random: do not ignore early device randomness random: suppress spammy warnings about unseeded randomness random: reorder READ_ONCE() in get_random_uXX random: fix warning message on ia64 and parisc random: use a different mixing algorithm for add_device_randomness() random: set up the NUMA crng instances after the CRNG is fully initialized random: fix possible sleeping allocation from irq context random: rate limit unseeded randomness warnings random: add a spinlock_t to struct batched_entropy char/random: silence a lockdep splat with printk() Revert "char/random: silence a lockdep splat with printk()" random: always use batched entropy for get_random_u{32,64} random: fix data race on crng_node_pool crypto: chacha20 - Fix keystream alignment for chacha20_block() random: always fill buffer in get_random_bytes_wait random: optimize add_interrupt_randomness drivers/char/random.c: remove unused dont_count_entropy random: Fix whitespace pre random-bytes work random: Return nbytes filled from hw RNG random: add a config option to trust the CPU's hwrng random: remove preempt disabled region random: Make crng state queryable random: make CPU trust a boot parameter drivers/char/random.c: constify poolinfo_table drivers/char/random.c: remove unused stuct poolinfo::poolbits drivers/char/random.c: make primary_crng static random: only read from /dev/random after its pool has received 128 bits random: move rand_initialize() earlier random: document get_random_int() family latent_entropy: avoid build error when plugin cflags are not set random: fix soft lockup when trying to read from an uninitialized blocking pool random: Support freezable kthreads in add_hwgenerator_randomness() fdt: add support for rng-seed random: Use wait_event_freezable() in add_hwgenerator_randomness() char/random: Add a newline at the end of the file Revert "hwrng: core - Freeze khwrng thread during suspend" crypto: Deduplicate le32_to_cpu_array() and cpu_to_le32_array() crypto: blake2s - generic C library implementation and selftest lib/crypto: blake2s: move hmac construction into wireguard lib/crypto: sha1: re-roll loops to reduce code size random: Don't wake crng_init_wait when crng_init == 1 random: Add a urandom_read_nowait() for random APIs that don't warn random: add GRND_INSECURE to return best-effort non-cryptographic bytes random: ignore GRND_RANDOM in getentropy(2) random: make /dev/random be almost like /dev/urandom random: fix crash on multiple early calls to add_bootloader_randomness() random: remove the blocking pool random: delete code to pull data into pools random: remove kernel.random.read_wakeup_threshold random: remove unnecessary unlikely() random: convert to ENTROPY_BITS for better code readability random: Add and use pr_fmt() random: fix typo in add_timer_randomness() random: remove some dead code of poolinfo random: split primary/secondary crng init paths random: avoid warnings for !CONFIG_NUMA builds x86: Remove arch_has_random, arch_has_random_seed powerpc: Remove arch_has_random, arch_has_random_seed linux/random.h: Remove arch_has_random, arch_has_random_seed linux/random.h: Use false with bool linux/random.h: Mark CONFIG_ARCH_RANDOM functions __must_check powerpc: Use bool in archrandom.h random: add arch_get_random_*long_early() random: avoid arch_get_random_seed_long() when collecting IRQ randomness random: remove dead code left over from blocking pool MAINTAINERS: co-maintain random.c crypto: blake2s - include <linux/bug.h> instead of <asm/bug.h> crypto: blake2s - adjust include guard naming random: document add_hwgenerator_randomness() with other input functions random: remove unused irq_flags argument from add_interrupt_randomness() random: use BLAKE2s instead of SHA1 in extraction random: do not sign extend bytes for rotation when mixing random: do not re-init if crng_reseed completes before primary init random: mix bootloader randomness into pool random: harmonize "crng init done" messages random: use IS_ENABLED(CONFIG_NUMA) instead of ifdefs random: initialize ChaCha20 constants with correct endianness random: early initialization of ChaCha constants random: avoid superfluous call to RDRAND in CRNG extraction random: don't reset crng_init_cnt on urandom_read() random: fix typo in comments random: cleanup poolinfo abstraction crypto: chacha20 - Fix chacha20_block() keystream alignment (again) random: cleanup integer types random: remove incomplete last_data logic random: remove unused extract_entropy() reserved argument random: try to actively add entropy rather than passively wait for it random: rather than entropy_store abstraction, use global random: remove unused OUTPUT_POOL constants random: de-duplicate INPUT_POOL constants random: prepend remaining pool constants with POOL_ random: cleanup fractional entropy shift constants random: access input_pool_data directly rather than through pointer random: simplify arithmetic function flow in account() random: continually use hwgenerator randomness random: access primary_pool directly rather than through pointer random: only call crng_finalize_init() for primary_crng random: use computational hash for entropy extraction random: simplify entropy debiting random: use linear min-entropy accumulation crediting random: always wake up entropy writers after extraction random: make credit_entropy_bits() always safe random: remove use_input_pool parameter from crng_reseed() random: remove batched entropy locking random: fix locking in crng_fast_load() random: use RDSEED instead of RDRAND in entropy extraction random: inline leaves of rand_initialize() random: ensure early RDSEED goes through mixer on init random: do not xor RDRAND when writing into /dev/random random: absorb fast pool into input pool after fast load random: use hash function for crng_slow_load() random: remove outdated INT_MAX >> 6 check in urandom_read() random: zero buffer after reading entropy from userspace random: tie batched entropy generation to base_crng generation random: remove ifdef'd out interrupt bench random: remove unused tracepoints random: add proper SPDX header random: deobfuscate irq u32/u64 contributions random: introduce drain_entropy() helper to declutter crng_reseed() random: remove useless header comment random: remove whitespace and reorder includes random: group initialization wait functions random: group entropy extraction functions random: group entropy collection functions random: group userspace read/write functions random: group sysctl functions random: rewrite header introductory comment workqueue: make workqueue available early during boot random: defer fast pool mixing to worker random: do not take pool spinlock at boot random: unify early init crng load accounting random: check for crng_init == 0 in add_device_randomness() hwrng: core - do not use multiple blank lines hwrng: core - rewrite better comparison to NULL hwrng: core - Rewrite the header hwrng: core - Move hwrng miscdev minor number to include/linux/miscdevice.h hwrng: core - remove unused PFX macro hwrng: use rng source with best quality hwrng: remember rng chosen by user random: pull add_hwgenerator_randomness() declaration into random.h random: clear fast pool, crng, and batches in cpuhp bring up random: round-robin registers as ulong, not u32 random: only wake up writers after zap if threshold was passed random: cleanup UUID handling random: unify cycles_t and jiffies usage and types random: do crng pre-init loading in worker rather than irq random: give sysctl_random_min_urandom_seed a more sensible value random: don't let 644 read-only sysctls be written to random: replace custom notifier chain with standard one random: use SipHash as interrupt entropy accumulator random: make consistent usage of crng_ready() random: reseed more often immediately after booting random: check for signal and try earlier when generating entropy random: skip fast_init if hwrng provides large chunk of entropy random: treat bootloader trust toggle the same way as cpu trust toggle random: re-add removed comment about get_random_{u32,u64} reseeding random: mix build-time latent entropy into pool at init random: do not split fast init input in add_hwgenerator_randomness() random: do not allow user to keep crng key around on stack random: check for signal_pending() outside of need_resched() check random: check for signals every PAGE_SIZE chunk of /dev/[u]random random: make random_get_entropy() return an unsigned long random: document crng_fast_key_erasure() destination possibility random: fix sysctl documentation nits init: call time_init() before rand_initialize() ia64: define get_cycles macro for arch-override s390: define get_cycles macro for arch-override parisc: define get_cycles macro for arch-override alpha: define get_cycles macro for arch-override powerpc: define get_cycles macro for arch-override timekeeping: Add raw clock fallback for random_get_entropy() m68k: use fallback for random_get_entropy() instead of zero mips: use fallback for random_get_entropy() instead of just c0 random arm: use fallback for random_get_entropy() instead of zero nios2: use fallback for random_get_entropy() instead of zero x86/tsc: Use fallback for random_get_entropy() instead of zero um: use fallback for random_get_entropy() instead of zero sparc: use fallback for random_get_entropy() instead of zero xtensa: use fallback for random_get_entropy() instead of zero uapi: rename ext2_swab() to swab() and share globally in swab.h random: insist on random_get_entropy() existing in order to simplify random: do not use batches when !crng_ready() random: do not pretend to handle premature next security model random: order timer entropy functions below interrupt functions random: do not use input pool from hard IRQs random: help compiler out with fast_mix() by using simpler arguments siphash: use one source of truth for siphash permutations random: use symbolic constants for crng_init states random: avoid initializing twice in credit race random: remove ratelimiting for in-kernel unseeded randomness random: use proper jiffies comparison macro random: handle latent entropy and command line from random_init() random: credit architectural init the exact amount random: use static branch for crng_ready() random: remove extern from functions in header random: use proper return types on get_random_{int,long}_wait() random: move initialization functions out of hot pages random: move randomize_page() into mm where it belongs random: convert to using fops->write_iter() random: wire up fops->splice_{read,write}_iter() random: check for signals after page of pool writes Revert "random: use static branch for crng_ready()" crypto: drbg - add FIPS 140-2 CTRNG for noise source crypto: drbg - always seeded with SP800-90B compliant noise source crypto: drbg - prepare for more fine-grained tracking of seeding state crypto: drbg - track whether DRBG was seeded with !rng_is_initialized() crypto: drbg - move dynamic ->reseed_threshold adjustments to __drbg_seed() crypto: drbg - always try to free Jitter RNG instance crypto: drbg - make reseeding from get_random_bytes() synchronous random: avoid checking crng_ready() twice in random_init() random: mark bootloader randomness code as __init random: account for arch randomness in bits ASoC: cs42l52: Fix TLV scales for mixer controls ASoC: cs53l30: Correct number of volume levels on SX controls ASoC: cs42l52: Correct TLV for Bypass Volume ASoC: cs42l56: Correct typo in minimum level for SX volume controls ata: libata-core: fix NULL pointer deref in ata_host_alloc_pinfo() ASoC: wm8962: Fix suspend while playing music scsi: vmw_pvscsi: Expand vcpuHint to 16 bits scsi: lpfc: Fix port stuck in bypassed state after LIP in PT2PT topology virtio-mmio: fix missing put_device() when vm_cmdline_parent registration failed nfc: nfcmrvl: Fix memory leak in nfcmrvl_play_deferred ipv6: Fix signed integer overflow in l2tp_ip6_sendmsg net: ethernet: mtk_eth_soc: fix misuse of mem alloc interface netdev[napi]_alloc_frag random: credit cpu and bootloader seeds by default pNFS: Don't keep retrying if the server replied NFS4ERR_LAYOUTUNAVAILABLE misc: atmel-ssc: Fix IRQ check in ssc_probe irqchip/gic/realview: Fix refcount leak in realview_gic_of_init irqchip/gic-v3: Iterate over possible CPUs by for_each_possible_cpu() comedi: vmk80xx: fix expression for tx buffer size USB: serial: option: add support for Cinterion MV31 with new baseline USB: serial: io_ti: add Agilent E5805A support usb: gadget: lpc32xx_udc: Fix refcount leak in lpc32xx_udc_probe serial: 8250: Store to lsr_save_flags after lsr read ext4: fix bug_on ext4_mb_use_inode_pa ext4: make variable "count" signed ext4: add reserved GDT blocks check l2tp: don't use inet_shutdown on ppp session destroy l2tp: fix race in pppol2tp_release with session object destroy s390/mm: use non-quiescing sske for KVM switch to keyed guest xprtrdma: fix incorrect header size calculations swiotlb: fix info leak with DMA_FROM_DEVICE Reinstate some of "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"" fuse: fix pipe buffer lifetime for direct_io tcp: change source port randomizarion at connect() time tcp: add some entropy in __inet_hash_connect() secure_seq: use the 64 bits of the siphash for port offset calculation tcp: use different parts of the port_offset for index and offset tcp: add small random increments to the source port tcp: dynamically allocate the perturb table used by source ports tcp: increase source port perturb table to 2^16 tcp: drop the hash_32() part from the index calculation Linux 4.9.320 Conflicts: crypto/chacha20_generic.c drivers/char/random.c drivers/of/fdt.c include/crypto/chacha20.h lib/chacha20.c Merge resolution notes: - Added CHACHA20_KEY_SIZE and CHACHA20_BLOCK_SIZE constants to chacha.h, to minimize changes from the 4.9.320 version of random.c - Updated lib/vsprintf.c for "random: replace custom notifier chain with standard one". Change-Id: Ia7a12d8883b808f88bbe807d6150552bb084f6b3 Signed-off-by: Eric Biggers <ebiggers@google.com>
711 lines
17 KiB
C
711 lines
17 KiB
C
#include <linux/mm.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/string.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/export.h>
|
|
#include <linux/err.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/security.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/swapops.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/random.h>
|
|
|
|
#include <asm/sections.h>
|
|
#include <asm/uaccess.h>
|
|
|
|
#include "internal.h"
|
|
|
|
static inline int is_kernel_rodata(unsigned long addr)
|
|
{
|
|
return addr >= (unsigned long)__start_rodata &&
|
|
addr < (unsigned long)__end_rodata;
|
|
}
|
|
|
|
/**
|
|
* kfree_const - conditionally free memory
|
|
* @x: pointer to the memory
|
|
*
|
|
* Function calls kfree only if @x is not in .rodata section.
|
|
*/
|
|
void kfree_const(const void *x)
|
|
{
|
|
if (!is_kernel_rodata((unsigned long)x))
|
|
kfree(x);
|
|
}
|
|
EXPORT_SYMBOL(kfree_const);
|
|
|
|
/**
|
|
* kstrdup - allocate space for and copy an existing string
|
|
* @s: the string to duplicate
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*/
|
|
char *kstrdup(const char *s, gfp_t gfp)
|
|
{
|
|
size_t len;
|
|
char *buf;
|
|
|
|
if (!s)
|
|
return NULL;
|
|
|
|
len = strlen(s) + 1;
|
|
buf = kmalloc_track_caller(len, gfp);
|
|
if (buf)
|
|
memcpy(buf, s, len);
|
|
return buf;
|
|
}
|
|
EXPORT_SYMBOL(kstrdup);
|
|
|
|
/**
|
|
* kstrdup_const - conditionally duplicate an existing const string
|
|
* @s: the string to duplicate
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*
|
|
* Function returns source string if it is in .rodata section otherwise it
|
|
* fallbacks to kstrdup.
|
|
* Strings allocated by kstrdup_const should be freed by kfree_const.
|
|
*/
|
|
const char *kstrdup_const(const char *s, gfp_t gfp)
|
|
{
|
|
if (is_kernel_rodata((unsigned long)s))
|
|
return s;
|
|
|
|
return kstrdup(s, gfp);
|
|
}
|
|
EXPORT_SYMBOL(kstrdup_const);
|
|
|
|
/**
|
|
* kstrndup - allocate space for and copy an existing string
|
|
* @s: the string to duplicate
|
|
* @max: read at most @max chars from @s
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*
|
|
* Note: Use kmemdup_nul() instead if the size is known exactly.
|
|
*/
|
|
char *kstrndup(const char *s, size_t max, gfp_t gfp)
|
|
{
|
|
size_t len;
|
|
char *buf;
|
|
|
|
if (!s)
|
|
return NULL;
|
|
|
|
len = strnlen(s, max);
|
|
buf = kmalloc_track_caller(len+1, gfp);
|
|
if (buf) {
|
|
memcpy(buf, s, len);
|
|
buf[len] = '\0';
|
|
}
|
|
return buf;
|
|
}
|
|
EXPORT_SYMBOL(kstrndup);
|
|
|
|
/**
|
|
* kmemdup - duplicate region of memory
|
|
*
|
|
* @src: memory region to duplicate
|
|
* @len: memory region length
|
|
* @gfp: GFP mask to use
|
|
*/
|
|
void *kmemdup(const void *src, size_t len, gfp_t gfp)
|
|
{
|
|
void *p;
|
|
|
|
p = kmalloc_track_caller(len, gfp);
|
|
if (p)
|
|
memcpy(p, src, len);
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(kmemdup);
|
|
|
|
/**
|
|
* kmemdup_nul - Create a NUL-terminated string from unterminated data
|
|
* @s: The data to stringify
|
|
* @len: The size of the data
|
|
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
|
|
*/
|
|
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
|
|
{
|
|
char *buf;
|
|
|
|
if (!s)
|
|
return NULL;
|
|
|
|
buf = kmalloc_track_caller(len + 1, gfp);
|
|
if (buf) {
|
|
memcpy(buf, s, len);
|
|
buf[len] = '\0';
|
|
}
|
|
return buf;
|
|
}
|
|
EXPORT_SYMBOL(kmemdup_nul);
|
|
|
|
/**
|
|
* memdup_user - duplicate memory region from user space
|
|
*
|
|
* @src: source address in user space
|
|
* @len: number of bytes to copy
|
|
*
|
|
* Returns an ERR_PTR() on failure.
|
|
*/
|
|
void *memdup_user(const void __user *src, size_t len)
|
|
{
|
|
void *p;
|
|
|
|
/*
|
|
* Always use GFP_KERNEL, since copy_from_user() can sleep and
|
|
* cause pagefault, which makes it pointless to use GFP_NOFS
|
|
* or GFP_ATOMIC.
|
|
*/
|
|
p = kmalloc_track_caller(len, GFP_KERNEL);
|
|
if (!p)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
kfree(p);
|
|
return ERR_PTR(-EFAULT);
|
|
}
|
|
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(memdup_user);
|
|
|
|
/*
|
|
* strndup_user - duplicate an existing string from user space
|
|
* @s: The string to duplicate
|
|
* @n: Maximum number of bytes to copy, including the trailing NUL.
|
|
*/
|
|
char *strndup_user(const char __user *s, long n)
|
|
{
|
|
char *p;
|
|
long length;
|
|
|
|
length = strnlen_user(s, n);
|
|
|
|
if (!length)
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
if (length > n)
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
p = memdup_user(s, length);
|
|
|
|
if (IS_ERR(p))
|
|
return p;
|
|
|
|
p[length - 1] = '\0';
|
|
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(strndup_user);
|
|
|
|
/**
|
|
* memdup_user_nul - duplicate memory region from user space and NUL-terminate
|
|
*
|
|
* @src: source address in user space
|
|
* @len: number of bytes to copy
|
|
*
|
|
* Returns an ERR_PTR() on failure.
|
|
*/
|
|
void *memdup_user_nul(const void __user *src, size_t len)
|
|
{
|
|
char *p;
|
|
|
|
/*
|
|
* Always use GFP_KERNEL, since copy_from_user() can sleep and
|
|
* cause pagefault, which makes it pointless to use GFP_NOFS
|
|
* or GFP_ATOMIC.
|
|
*/
|
|
p = kmalloc_track_caller(len + 1, GFP_KERNEL);
|
|
if (!p)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
if (copy_from_user(p, src, len)) {
|
|
kfree(p);
|
|
return ERR_PTR(-EFAULT);
|
|
}
|
|
p[len] = '\0';
|
|
|
|
return p;
|
|
}
|
|
EXPORT_SYMBOL(memdup_user_nul);
|
|
|
|
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
|
|
struct vm_area_struct *prev, struct rb_node *rb_parent)
|
|
{
|
|
struct vm_area_struct *next;
|
|
|
|
vma->vm_prev = prev;
|
|
if (prev) {
|
|
next = prev->vm_next;
|
|
prev->vm_next = vma;
|
|
} else {
|
|
mm->mmap = vma;
|
|
if (rb_parent)
|
|
next = rb_entry(rb_parent,
|
|
struct vm_area_struct, vm_rb);
|
|
else
|
|
next = NULL;
|
|
}
|
|
vma->vm_next = next;
|
|
if (next)
|
|
next->vm_prev = vma;
|
|
}
|
|
|
|
/* Check if the vma is being used as a stack by this task */
|
|
int vma_is_stack_for_current(struct vm_area_struct *vma)
|
|
{
|
|
struct task_struct * __maybe_unused t = current;
|
|
|
|
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
|
|
}
|
|
|
|
/**
|
|
* randomize_page - Generate a random, page aligned address
|
|
* @start: The smallest acceptable address the caller will take.
|
|
* @range: The size of the area, starting at @start, within which the
|
|
* random address must fall.
|
|
*
|
|
* If @start + @range would overflow, @range is capped.
|
|
*
|
|
* NOTE: Historical use of randomize_range, which this replaces, presumed that
|
|
* @start was already page aligned. We now align it regardless.
|
|
*
|
|
* Return: A page aligned address within [start, start + range). On error,
|
|
* @start is returned.
|
|
*/
|
|
unsigned long randomize_page(unsigned long start, unsigned long range)
|
|
{
|
|
if (!PAGE_ALIGNED(start)) {
|
|
range -= PAGE_ALIGN(start) - start;
|
|
start = PAGE_ALIGN(start);
|
|
}
|
|
|
|
if (start > ULONG_MAX - range)
|
|
range = ULONG_MAX - start;
|
|
|
|
range >>= PAGE_SHIFT;
|
|
|
|
if (range == 0)
|
|
return start;
|
|
|
|
return start + (get_random_long() % range << PAGE_SHIFT);
|
|
}
|
|
|
|
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
|
|
void arch_pick_mmap_layout(struct mm_struct *mm)
|
|
{
|
|
mm->mmap_base = TASK_UNMAPPED_BASE;
|
|
mm->get_unmapped_area = arch_get_unmapped_area;
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
|
|
* back to the regular GUP.
|
|
* If the architecture not support this function, simply return with no
|
|
* page pinned
|
|
*/
|
|
int __weak __get_user_pages_fast(unsigned long start,
|
|
int nr_pages, int write, struct page **pages)
|
|
{
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__get_user_pages_fast);
|
|
|
|
/**
|
|
* get_user_pages_fast() - pin user pages in memory
|
|
* @start: starting user address
|
|
* @nr_pages: number of pages from start to pin
|
|
* @write: whether pages will be written to
|
|
* @pages: array that receives pointers to the pages pinned.
|
|
* Should be at least nr_pages long.
|
|
*
|
|
* Returns number of pages pinned. This may be fewer than the number
|
|
* requested. If nr_pages is 0 or negative, returns 0. If no pages
|
|
* were pinned, returns -errno.
|
|
*
|
|
* get_user_pages_fast provides equivalent functionality to get_user_pages,
|
|
* operating on current and current->mm, with force=0 and vma=NULL. However
|
|
* unlike get_user_pages, it must be called without mmap_sem held.
|
|
*
|
|
* get_user_pages_fast may take mmap_sem and page table locks, so no
|
|
* assumptions can be made about lack of locking. get_user_pages_fast is to be
|
|
* implemented in a way that is advantageous (vs get_user_pages()) when the
|
|
* user memory area is already faulted in and present in ptes. However if the
|
|
* pages have to be faulted in, it may turn out to be slightly slower so
|
|
* callers need to carefully consider what to use. On many architectures,
|
|
* get_user_pages_fast simply falls back to get_user_pages.
|
|
*/
|
|
int __weak get_user_pages_fast(unsigned long start,
|
|
int nr_pages, int write, struct page **pages)
|
|
{
|
|
return get_user_pages_unlocked(start, nr_pages, pages,
|
|
write ? FOLL_WRITE : 0);
|
|
}
|
|
EXPORT_SYMBOL_GPL(get_user_pages_fast);
|
|
|
|
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
|
|
unsigned long len, unsigned long prot,
|
|
unsigned long flag, unsigned long pgoff)
|
|
{
|
|
unsigned long ret;
|
|
struct mm_struct *mm = current->mm;
|
|
unsigned long populate;
|
|
|
|
ret = security_mmap_file(file, prot, flag);
|
|
if (!ret) {
|
|
if (down_write_killable(&mm->mmap_sem))
|
|
return -EINTR;
|
|
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
|
|
&populate);
|
|
up_write(&mm->mmap_sem);
|
|
if (populate)
|
|
mm_populate(ret, populate);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
unsigned long vm_mmap(struct file *file, unsigned long addr,
|
|
unsigned long len, unsigned long prot,
|
|
unsigned long flag, unsigned long offset)
|
|
{
|
|
if (unlikely(offset + PAGE_ALIGN(len) < offset))
|
|
return -EINVAL;
|
|
if (unlikely(offset_in_page(offset)))
|
|
return -EINVAL;
|
|
|
|
return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
|
|
}
|
|
EXPORT_SYMBOL(vm_mmap);
|
|
|
|
void kvfree(const void *addr)
|
|
{
|
|
if (is_vmalloc_addr(addr))
|
|
vfree(addr);
|
|
else
|
|
kfree(addr);
|
|
}
|
|
EXPORT_SYMBOL(kvfree);
|
|
|
|
static inline void *__page_rmapping(struct page *page)
|
|
{
|
|
unsigned long mapping;
|
|
|
|
mapping = (unsigned long)page->mapping;
|
|
mapping &= ~PAGE_MAPPING_FLAGS;
|
|
|
|
return (void *)mapping;
|
|
}
|
|
|
|
/* Neutral page->mapping pointer to address_space or anon_vma or other */
|
|
void *page_rmapping(struct page *page)
|
|
{
|
|
page = compound_head(page);
|
|
return __page_rmapping(page);
|
|
}
|
|
|
|
/*
|
|
* Return true if this page is mapped into pagetables.
|
|
* For compound page it returns true if any subpage of compound page is mapped.
|
|
*/
|
|
bool page_mapped(struct page *page)
|
|
{
|
|
int i;
|
|
|
|
if (likely(!PageCompound(page)))
|
|
return atomic_read(&page->_mapcount) >= 0;
|
|
page = compound_head(page);
|
|
if (atomic_read(compound_mapcount_ptr(page)) >= 0)
|
|
return true;
|
|
if (PageHuge(page))
|
|
return false;
|
|
for (i = 0; i < (1 << compound_order(page)); i++) {
|
|
if (atomic_read(&page[i]._mapcount) >= 0)
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
EXPORT_SYMBOL(page_mapped);
|
|
|
|
struct anon_vma *page_anon_vma(struct page *page)
|
|
{
|
|
unsigned long mapping;
|
|
|
|
page = compound_head(page);
|
|
mapping = (unsigned long)page->mapping;
|
|
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
|
|
return NULL;
|
|
return __page_rmapping(page);
|
|
}
|
|
|
|
struct address_space *page_mapping(struct page *page)
|
|
{
|
|
struct address_space *mapping;
|
|
|
|
page = compound_head(page);
|
|
|
|
/* This happens if someone calls flush_dcache_page on slab page */
|
|
if (unlikely(PageSlab(page)))
|
|
return NULL;
|
|
|
|
if (unlikely(PageSwapCache(page))) {
|
|
swp_entry_t entry;
|
|
|
|
entry.val = page_private(page);
|
|
return swap_address_space(entry);
|
|
}
|
|
|
|
mapping = page->mapping;
|
|
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
|
|
return NULL;
|
|
|
|
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
|
|
}
|
|
EXPORT_SYMBOL(page_mapping);
|
|
|
|
/* Slow path of page_mapcount() for compound pages */
|
|
int __page_mapcount(struct page *page)
|
|
{
|
|
int ret;
|
|
|
|
ret = atomic_read(&page->_mapcount) + 1;
|
|
/*
|
|
* For file THP page->_mapcount contains total number of mapping
|
|
* of the page: no need to look into compound_mapcount.
|
|
*/
|
|
if (!PageAnon(page) && !PageHuge(page))
|
|
return ret;
|
|
page = compound_head(page);
|
|
ret += atomic_read(compound_mapcount_ptr(page)) + 1;
|
|
if (PageDoubleMap(page))
|
|
ret--;
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL_GPL(__page_mapcount);
|
|
|
|
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
|
|
int sysctl_overcommit_ratio __read_mostly = 50;
|
|
unsigned long sysctl_overcommit_kbytes __read_mostly;
|
|
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
|
|
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
|
|
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
|
|
|
|
int overcommit_ratio_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
{
|
|
int ret;
|
|
|
|
ret = proc_dointvec(table, write, buffer, lenp, ppos);
|
|
if (ret == 0 && write)
|
|
sysctl_overcommit_kbytes = 0;
|
|
return ret;
|
|
}
|
|
|
|
int overcommit_kbytes_handler(struct ctl_table *table, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
{
|
|
int ret;
|
|
|
|
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
|
|
if (ret == 0 && write)
|
|
sysctl_overcommit_ratio = 0;
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
|
|
*/
|
|
unsigned long vm_commit_limit(void)
|
|
{
|
|
unsigned long allowed;
|
|
|
|
if (sysctl_overcommit_kbytes)
|
|
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
|
|
else
|
|
allowed = ((totalram_pages - hugetlb_total_pages())
|
|
* sysctl_overcommit_ratio / 100);
|
|
allowed += total_swap_pages;
|
|
|
|
return allowed;
|
|
}
|
|
|
|
/*
|
|
* Make sure vm_committed_as in one cacheline and not cacheline shared with
|
|
* other variables. It can be updated by several CPUs frequently.
|
|
*/
|
|
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
|
|
|
|
/*
|
|
* The global memory commitment made in the system can be a metric
|
|
* that can be used to drive ballooning decisions when Linux is hosted
|
|
* as a guest. On Hyper-V, the host implements a policy engine for dynamically
|
|
* balancing memory across competing virtual machines that are hosted.
|
|
* Several metrics drive this policy engine including the guest reported
|
|
* memory commitment.
|
|
*/
|
|
unsigned long vm_memory_committed(void)
|
|
{
|
|
return percpu_counter_read_positive(&vm_committed_as);
|
|
}
|
|
EXPORT_SYMBOL_GPL(vm_memory_committed);
|
|
|
|
/*
|
|
* Check that a process has enough memory to allocate a new virtual
|
|
* mapping. 0 means there is enough memory for the allocation to
|
|
* succeed and -ENOMEM implies there is not.
|
|
*
|
|
* We currently support three overcommit policies, which are set via the
|
|
* vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
|
|
*
|
|
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
|
|
* Additional code 2002 Jul 20 by Robert Love.
|
|
*
|
|
* cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
|
|
*
|
|
* Note this is a helper function intended to be used by LSMs which
|
|
* wish to use this logic.
|
|
*/
|
|
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
|
|
{
|
|
long free, allowed, reserve;
|
|
|
|
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
|
|
-(s64)vm_committed_as_batch * num_online_cpus(),
|
|
"memory commitment underflow");
|
|
|
|
vm_acct_memory(pages);
|
|
|
|
/*
|
|
* Sometimes we want to use more memory than we have
|
|
*/
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
|
|
return 0;
|
|
|
|
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
|
|
free = global_page_state(NR_FREE_PAGES);
|
|
free += global_node_page_state(NR_FILE_PAGES);
|
|
|
|
/*
|
|
* shmem pages shouldn't be counted as free in this
|
|
* case, they can't be purged, only swapped out, and
|
|
* that won't affect the overall amount of available
|
|
* memory in the system.
|
|
*/
|
|
free -= global_node_page_state(NR_SHMEM);
|
|
|
|
free += get_nr_swap_pages();
|
|
|
|
/*
|
|
* Any slabs which are created with the
|
|
* SLAB_RECLAIM_ACCOUNT flag claim to have contents
|
|
* which are reclaimable, under pressure. The dentry
|
|
* cache and most inode caches should fall into this
|
|
*/
|
|
free += global_page_state(NR_SLAB_RECLAIMABLE);
|
|
|
|
/*
|
|
* Leave reserved pages. The pages are not for anonymous pages.
|
|
*/
|
|
if (free <= totalreserve_pages)
|
|
goto error;
|
|
else
|
|
free -= totalreserve_pages;
|
|
|
|
/*
|
|
* Reserve some for root
|
|
*/
|
|
if (!cap_sys_admin)
|
|
free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
if (free > pages)
|
|
return 0;
|
|
|
|
goto error;
|
|
}
|
|
|
|
allowed = vm_commit_limit();
|
|
/*
|
|
* Reserve some for root
|
|
*/
|
|
if (!cap_sys_admin)
|
|
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
|
|
/*
|
|
* Don't let a single process grow so big a user can't recover
|
|
*/
|
|
if (mm) {
|
|
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
|
|
allowed -= min_t(long, mm->total_vm / 32, reserve);
|
|
}
|
|
|
|
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
|
|
return 0;
|
|
error:
|
|
vm_unacct_memory(pages);
|
|
|
|
return -ENOMEM;
|
|
}
|
|
|
|
/**
|
|
* get_cmdline() - copy the cmdline value to a buffer.
|
|
* @task: the task whose cmdline value to copy.
|
|
* @buffer: the buffer to copy to.
|
|
* @buflen: the length of the buffer. Larger cmdline values are truncated
|
|
* to this length.
|
|
* Returns the size of the cmdline field copied. Note that the copy does
|
|
* not guarantee an ending NULL byte.
|
|
*/
|
|
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
|
|
{
|
|
int res = 0;
|
|
unsigned int len;
|
|
struct mm_struct *mm = get_task_mm(task);
|
|
unsigned long arg_start, arg_end, env_start, env_end;
|
|
if (!mm)
|
|
goto out;
|
|
if (!mm->arg_end)
|
|
goto out_mm; /* Shh! No looking before we're done */
|
|
|
|
down_read(&mm->mmap_sem);
|
|
arg_start = mm->arg_start;
|
|
arg_end = mm->arg_end;
|
|
env_start = mm->env_start;
|
|
env_end = mm->env_end;
|
|
up_read(&mm->mmap_sem);
|
|
|
|
len = arg_end - arg_start;
|
|
|
|
if (len > buflen)
|
|
len = buflen;
|
|
|
|
res = access_process_vm(task, arg_start, buffer, len, FOLL_FORCE);
|
|
|
|
/*
|
|
* If the nul at the end of args has been overwritten, then
|
|
* assume application is using setproctitle(3).
|
|
*/
|
|
if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
|
|
len = strnlen(buffer, res);
|
|
if (len < res) {
|
|
res = len;
|
|
} else {
|
|
len = env_end - env_start;
|
|
if (len > buflen - res)
|
|
len = buflen - res;
|
|
res += access_process_vm(task, env_start,
|
|
buffer+res, len,
|
|
FOLL_FORCE);
|
|
res = strnlen(buffer, res);
|
|
}
|
|
}
|
|
out_mm:
|
|
mmput(mm);
|
|
out:
|
|
return res;
|
|
}
|