Documentation
arch
block
certs
crypto
drivers
firmware
fs
include
init
ipc
kernel
lib
mm
kasan
Kconfig
Kconfig.debug
Makefile
backing-dev.c
balloon_compaction.c
bootmem.c
cleancache.c
cma.c
cma.h
cma_debug.c
compaction.c
debug.c
debug_page_ref.c
dmapool.c
early_ioremap.c
fadvise.c
failslab.c
filemap.c
frame_vector.c
frontswap.c
gup.c
highmem.c
huge_memory.c
hugetlb.c
hugetlb_cgroup.c
hwpoison-inject.c
init-mm.c
internal.h
interval_tree.c
khugepaged.c
kmemcheck.c
kmemleak-test.c
kmemleak.c
ksm.c
list_lru.c
maccess.c
madvise.c
memblock.c
memcontrol.c
memory-failure.c
memory.c
memory_hotplug.c
mempolicy.c
mempool.c
memtest.c
migrate.c
mincore.c
mlock.c
mm_init.c
mmap.c
mmu_context.c
mmu_notifier.c
mmzone.c
mprotect.c
mremap.c
msync.c
nobootmem.c
nommu.c
oom_kill.c
page-writeback.c
page_alloc.c
page_counter.c
page_ext.c
page_idle.c
page_io.c
page_isolation.c
page_owner.c
page_poison.c
pagewalk.c
percpu-km.c
percpu-vm.c
percpu.c
pgtable-generic.c
process_vm_access.c
quicklist.c
readahead.c
rmap.c
shmem.c
slab.c
slab.h
slab_common.c
slob.c
slub.c
sparse-vmemmap.c
sparse.c
swap.c
swap_cgroup.c
swap_state.c
swapfile.c
truncate.c
usercopy.c
userfaultfd.c
util.c
vmacache.c
vmalloc.c
vmpressure.c
vmscan.c
vmstat.c
workingset.c
z3fold.c
zbud.c
zpool.c
zsmalloc.c
zswap.c
ndm
net
samples
scripts
security
sound
tools
usr
virt
.cocciconfig
.gitignore
.mailmap
COPYING
CREDITS
Kbuild
Kconfig
MAINTAINERS
Makefile
README
REPORTING-BUGS
build.config.aarch64
build.config.common
build.config.cuttlefish.aarch64
build.config.cuttlefish.x86_64
build.config.goldfish.arm
build.config.goldfish.arm64
build.config.goldfish.mips
build.config.goldfish.mips64
build.config.goldfish.x86
build.config.goldfish.x86_64
build.config.x86_64
localversion-ndm
verity_dev_keys.x509
Changes in 4.9.291 binder: use euid from cred instead of using task binder: use cred instead of task for selinux checks xhci: Fix USB 3.1 enumeration issues by increasing roothub power-on-good delay Input: elantench - fix misreporting trackpoint coordinates Input: i8042 - Add quirk for Fujitsu Lifebook T725 libata: fix read log timeout value ocfs2: fix data corruption on truncate mmc: dw_mmc: Dont wait for DRTO on Write RSP error parisc: Fix ptrace check on syscall return media: ite-cir: IR receiver stop working after receive overflow ALSA: ua101: fix division by zero at probe ALSA: 6fire: fix control and bulk message timeouts ALSA: line6: fix control and interrupt message timeouts ALSA: synth: missing check for possible NULL after the call to kstrdup ALSA: timer: Fix use-after-free problem ALSA: timer: Unconditionally unlink slave instances, too x86/irq: Ensure PI wakeup handler is unregistered before module unload sfc: Don't use netif_info before net_device setup hyperv/vmbus: include linux/bitops.h mmc: winbond: don't build on M68K bpf: Prevent increasing bpf_jit_limit above max xen/netfront: stop tx queues during live migration spi: spl022: fix Microwire full duplex mode watchdog: Fix OMAP watchdog early handling vmxnet3: do not stop tx queues after netif_device_detach() btrfs: fix lost error handling when replaying directory deletes hwmon: (pmbus/lm25066) Add offset coefficients regulator: s5m8767: do not use reset value as DVS voltage if GPIO DVS is disabled regulator: dt-bindings: samsung,s5m8767: correct s5m8767,pmic-buck-default-dvs-idx property EDAC/sb_edac: Fix top-of-high-memory value for Broadwell/Haswell mwifiex: fix division by zero in fw download path ath6kl: fix division by zero in send path ath6kl: fix control-message timeout PCI: Mark Atheros QCA6174 to avoid bus reset rtl8187: fix control-message timeouts evm: mark evm_fixmode as __ro_after_init wcn36xx: Fix HT40 capability for 2Ghz band mwifiex: Read a PCI register after writing the TX ring 
write pointer wcn36xx: handle connection loss indication RDMA/qedr: Fix NULL deref for query_qp on the GSI QP signal: Remove the bogus sigkill_pending in ptrace_stop signal/mips: Update (_save|_restore)_fp_context to fail with -EFAULT power: supply: max17042_battery: Prevent int underflow in set_soc_threshold power: supply: max17042_battery: use VFSOC for capacity when no rsns powerpc/85xx: Fix oops when mpc85xx_smp_guts_ids node cannot be found serial: core: Fix initializing and restoring termios speed ALSA: mixer: oss: Fix racy access to slots ALSA: mixer: fix deadlock in snd_mixer_oss_set_volume PCI: aardvark: Read all 16-bits from PCIE_MSI_PAYLOAD_REG quota: check block number when reading the block in quota file quota: correct error number in free_dqentry() iio: dac: ad5446: Fix ad5622_write() return value USB: serial: keyspan: fix memleak on probe errors USB: iowarrior: fix control-message timeouts Bluetooth: sco: Fix lock_sock() blockage by memcpy_from_msg() Bluetooth: fix use-after-free error in lock_sock_nested() platform/x86: wmi: do not fail if disabling fails MIPS: lantiq: dma: add small delay after reset MIPS: lantiq: dma: reset correct number of channel locking/lockdep: Avoid RCU-induced noinstr fail smackfs: Fix use-after-free in netlbl_catmap_walk() x86: Increase exception stack sizes media: mt9p031: Fix corrupted frame after restarting stream media: netup_unidvb: handle interrupt properly according to the firmware media: uvcvideo: Set capability in s_param media: s5p-mfc: fix possible null-pointer dereference in s5p_mfc_probe() media: mceusb: return without resubmitting URB in case of -EPROTO error. 
ia64: don't do IA64_CMPXCHG_DEBUG without CONFIG_PRINTK ACPICA: Avoid evaluating methods too early during system resume media: usb: dvd-usb: fix uninit-value bug in dibusb_read_eeprom_byte() tracefs: Have tracefs directories not set OTH permission bits by default ath: dfs_pattern_detector: Fix possible null-pointer dereference in channel_detector_create() ACPI: battery: Accept charges over the design capacity as full memstick: r592: Fix a UAF bug when removing the driver lib/xz: Avoid overlapping memcpy() with invalid input with in-place decompression lib/xz: Validate the value before assigning it to an enum variable tracing/cfi: Fix cmp_entries_* functions signature mismatch mwl8k: Fix use-after-free in mwl8k_fw_state_machine() PM: hibernate: Get block device exclusively in swsusp_check() iwlwifi: mvm: disable RX-diversity in powersave smackfs: use __GFP_NOFAIL for smk_cipso_doi() ARM: clang: Do not rely on lr register for stacktrace ARM: 9136/1: ARMv7-M uses BE-8, not BE-32 spi: bcm-qspi: Fix missing clk_disable_unprepare() on error in bcm_qspi_probe() parisc: fix warning in flush_tlb_all parisc/kgdb: add kgdb_roundup() to make kgdb work with idle polling cgroup: Make rebind_subsystems() disable v2 controllers all at once media: dvb-usb: fix ununit-value in az6027_rc_query media: mtk-vpu: Fix a resource leak in the error handling path of 'mtk_vpu_probe()' media: si470x: Avoid card name truncation cpuidle: Fix kobject memory leaks in error paths ath9k: Fix potential interrupt storm on queue reset crypto: qat - detect PFVF collision after ACK crypto: qat - disregard spurious PFVF interrupts b43legacy: fix a lower bounds test b43: fix a lower bounds test memstick: avoid out-of-range warning memstick: jmb38x_ms: use appropriate free function in jmb38x_ms_alloc_host() hwmon: Fix possible memleak in __hwmon_device_register() ath10k: fix max antenna gain unit drm/msm: uninitialized variable in msm_gem_import() net: stream: don't purge sk_error_queue in 
sk_stream_kill_queues() mmc: mxs-mmc: disable regulator on error and in the remove function platform/x86: thinkpad_acpi: Fix bitwise vs. logical warning mwifiex: Send DELBA requests according to spec phy: micrel: ksz8041nl: do not use power down mode smackfs: use netlbl_cfg_cipsov4_del() for deleting cipso_v4_doi s390/gmap: don't unconditionally call pte_unmap_unlock() in __gmap_zap() irq: mips: avoid nested irq_enter() samples/kretprobes: Fix return value if register_kretprobe() failed libertas_tf: Fix possible memory leak in probe and disconnect libertas: Fix possible memory leak in probe and disconnect crypto: pcrypt - Delay write to padata->info RDMA/rxe: Fix wrong port_cap_flags ARM: s3c: irq-s3c24xx: Fix return value check for s3c24xx_init_intc() scsi: dc395: Fix error case unwinding MIPS: loongson64: make CPU_LOONGSON64 depends on MIPS_FP_SUPPORT JFS: fix memleak in jfs_mount arm: dts: omap3-gta04a4: accelerometer irq fix soc/tegra: Fix an error handling path in tegra_powergate_power_up() memory: fsl_ifc: fix leak of irq and nand_irq in fsl_ifc_ctrl_probe video: fbdev: chipsfb: use memset_io() instead of memset() serial: 8250_dw: Drop wrong use of ACPI_PTR() usb: gadget: hid: fix error code in do_config() power: supply: rt5033_battery: Change voltage values to µV scsi: csiostor: Uninitialized data in csio_ln_vnp_read_cbfn() RDMA/mlx4: Return missed an error if device doesn't support steering serial: xilinx_uartps: Fix race condition causing stuck TX power: supply: bq27xxx: Fix kernel crash on IRQ handler register error pnfs/flexfiles: Fix misplaced barrier in nfs4_ff_layout_prepare_ds drm/plane-helper: fix uninitialized variable reference PCI: aardvark: Don't spam about PIO Response Status fs: orangefs: fix error return code of orangefs_revalidate_lookup() mtd: spi-nor: hisi-sfc: Remove excessive clk_disable_unprepare() dmaengine: at_xdmac: fix AT_XDMAC_CC_PERID() macro auxdisplay: img-ascii-lcd: Fix lock-up when displaying empty string netfilter: 
nfnetlink_queue: fix OOB when mac header was cleared dmaengine: dmaengine_desc_callback_valid(): Check for `callback_result` m68k: set a default value for MEMORY_RESERVE watchdog: f71808e_wdt: fix inaccurate report in WDIOC_GETTIMEOUT scsi: qla2xxx: Turn off target reset during issue_lip i2c: xlr: Fix a resource leak in the error handling path of 'xlr_i2c_probe()' xen-pciback: Fix return in pm_ctrl_init() net: davinci_emac: Fix interrupt pacing disable ACPI: PMIC: Fix intel_pmic_regs_handler() read accesses bonding: Fix a use-after-free problem when bond_sysfs_slave_add() failed mm/zsmalloc.c: close race window between zs_pool_dec_isolated() and zs_unregister_migration() llc: fix out-of-bound array index in llc_sk_dev_hash() nfc: pn533: Fix double free when pn533_fill_fragment_skbs() fails vsock: prevent unnecessary refcnt inc for nonblocking connect USB: chipidea: fix interrupt deadlock ARM: 9156/1: drop cc-option fallbacks for architecture selection powerpc/bpf: Validate branch ranges powerpc/bpf: Fix BPF_SUB when imm == 0x80000000 mm, oom: pagefault_out_of_memory: don't force global OOM for dying tasks mm, oom: do not trigger out_of_memory from the #PF PCI: Add PCI_EXP_DEVCTL_PAYLOAD_* macros net: mdio-mux: fix unbalanced put_device parisc/entry: fix trace test in syscall exit path PCI/MSI: Destroy sysfs before freeing entries scsi: lpfc: Fix list_add() corruption in lpfc_drain_txq() usb: musb: tusb6010: check return value after calling platform_get_resource() scsi: advansys: Fix kernel pointer leak ARM: dts: omap: fix gpmc,mux-add-data type usb: host: ohci-tmio: check return value after calling platform_get_resource() tty: tty_buffer: Fix the softlockup issue in flush_to_ldisc MIPS: sni: Fix the build scsi: target: Fix ordered tag handling scsi: target: Fix alua_tg_pt_gps_count tracking powerpc/5200: dts: fix memory node unit name ALSA: gus: fix null pointer dereference on pointer block powerpc/dcr: Use cmplwi instead of 3-argument cmpli sh: check return code 
of request_irq maple: fix wrong return value of maple_bus_init(). sh: fix kconfig unmet dependency warning for FRAME_POINTER sh: define __BIG_ENDIAN for math-emu mips: BCM63XX: ensure that CPU_SUPPORTS_32BIT_KERNEL is set sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain() net: bnx2x: fix variable dereferenced before check iavf: Fix for the false positive ASQ/ARQ errors while issuing VF reset mips: bcm63xx: add support for clk_get_parent() platform/x86: hp_accel: Fix an error handling path in 'lis3lv02d_probe()' NFC: reorganize the functions in nci_request NFC: reorder the logic in nfc_{un,}register_device perf/x86/intel/uncore: Fix filter_tid mask for CHA events on Skylake Server perf/x86/intel/uncore: Fix IIO event constraints for Skylake Server tun: fix bonding active backup with arp monitoring hexagon: export raw I/O routines for modules mm: kmemleak: slob: respect SLAB_NOLEAKTRACE flag btrfs: fix memory ordering between normal and ordered work functions parisc/sticon: fix reverse colors cfg80211: call cfg80211_stop_ap when switch from P2P_GO type drm/udl: fix control-message timeout drm/amdgpu: fix set scaling mode Full/Full aspect/Center not works on vga and dvi connectors batman-adv: Keep fragments equally sized batman-adv: Fix own OGM check in aggregated OGMs batman-adv: mcast: fix duplicate mcast packets in BLA backbone from LAN batman-adv: mcast: fix duplicate mcast packets from BLA backbone to mesh batman-adv: Consider fragmentation for needed_headroom batman-adv: Reserve needed_*room for fragments batman-adv: Don't always reallocate the fragmentation skb head ASoC: DAPM: Cover regression by kctl change notification fix usb: max-3421: Use driver data instead of maintaining a list of bound devices soc/tegra: pmc: Fix imbalanced clock disabling in error code path Linux 4.9.291 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: I23d798c10aebab1e51add60ccb34a8b289d49a4d
1100 lines
30 KiB
C
1100 lines
30 KiB
C
/*
|
|
* linux/mm/oom_kill.c
|
|
*
|
|
* Copyright (C) 1998,2000 Rik van Riel
|
|
* Thanks go out to Claus Fischer for some serious inspiration and
|
|
* for goading me into coding this file...
|
|
* Copyright (C) 2010 Google, Inc.
|
|
* Rewritten by David Rientjes
|
|
*
|
|
* The routines in this file are used to kill a process when
|
|
* we're seriously out of memory. This gets called from __alloc_pages()
|
|
* in mm/page_alloc.c when we really run out of memory.
|
|
*
|
|
* Since we won't call these routines often (on a well-configured
|
|
* machine) this file will double as a 'coding guide' and a signpost
|
|
* for newbie kernel hackers. It features several pointers to major
|
|
* kernel subsystems and hints as to where to find out what things do.
|
|
*/
|
|
|
|
#include <linux/oom.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/err.h>
|
|
#include <linux/gfp.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/swap.h>
|
|
#include <linux/timex.h>
|
|
#include <linux/jiffies.h>
|
|
#include <linux/cpuset.h>
|
|
#include <linux/export.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/memcontrol.h>
|
|
#include <linux/mempolicy.h>
|
|
#include <linux/security.h>
|
|
#include <linux/ptrace.h>
|
|
#include <linux/freezer.h>
|
|
#include <linux/ftrace.h>
|
|
#include <linux/ratelimit.h>
|
|
#include <linux/kthread.h>
|
|
#include <linux/init.h>
|
|
#include <linux/mmu_notifier.h>
|
|
|
|
#include <asm/tlb.h>
|
|
#include "internal.h"
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/oom.h>
|
|
|
|
/*
 * OOM tunables.  Within this part of the file only sysctl_oom_dump_tasks is
 * read: dump_header() uses it to decide whether dump_tasks() is called.
 * NOTE(review): the other two are presumably consumed further down the file
 * and set through sysctl -- confirm against the rest of the source.
 */
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;

/*
 * Taken by __oom_reap_task_mm() for the duration of a reap attempt;
 * mark_oom_victim() documents that it must be called with this lock held.
 */
DEFINE_MUTEX(oom_lock);
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/**
|
|
* has_intersects_mems_allowed() - check task eligiblity for kill
|
|
* @start: task struct of which task to consider
|
|
* @mask: nodemask passed to page allocator for mempolicy ooms
|
|
*
|
|
* Task eligibility is determined by whether or not a candidate task, @tsk,
|
|
* shares the same mempolicy nodes as current if it is bound by such a policy
|
|
* and whether or not it has the same set of allowed cpuset nodes.
|
|
*/
|
|
static bool has_intersects_mems_allowed(struct task_struct *start,
|
|
const nodemask_t *mask)
|
|
{
|
|
struct task_struct *tsk;
|
|
bool ret = false;
|
|
|
|
rcu_read_lock();
|
|
for_each_thread(start, tsk) {
|
|
if (mask) {
|
|
/*
|
|
* If this is a mempolicy constrained oom, tsk's
|
|
* cpuset is irrelevant. Only return true if its
|
|
* mempolicy intersects current, otherwise it may be
|
|
* needlessly killed.
|
|
*/
|
|
ret = mempolicy_nodemask_intersects(tsk, mask);
|
|
} else {
|
|
/*
|
|
* This is not a mempolicy constrained oom, so only
|
|
* check the mems of tsk's cpuset.
|
|
*/
|
|
ret = cpuset_mems_allowed_intersects(current, tsk);
|
|
}
|
|
if (ret)
|
|
break;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
#else
|
|
static bool has_intersects_mems_allowed(struct task_struct *tsk,
|
|
const nodemask_t *mask)
|
|
{
|
|
return true;
|
|
}
|
|
#endif /* CONFIG_NUMA */
|
|
|
|
/*
|
|
* The process p may have detached its own ->mm while exiting or through
|
|
* use_mm(), but one or more of its subthreads may still have a valid
|
|
* pointer. Return p, or any of its subthreads with a valid ->mm, with
|
|
* task_lock() held.
|
|
*/
|
|
struct task_struct *find_lock_task_mm(struct task_struct *p)
|
|
{
|
|
struct task_struct *t;
|
|
|
|
rcu_read_lock();
|
|
|
|
for_each_thread(p, t) {
|
|
task_lock(t);
|
|
if (likely(t->mm))
|
|
goto found;
|
|
task_unlock(t);
|
|
}
|
|
t = NULL;
|
|
found:
|
|
rcu_read_unlock();
|
|
|
|
return t;
|
|
}
|
|
|
|
/*
|
|
* order == -1 means the oom kill is required by sysrq, otherwise only
|
|
* for display purposes.
|
|
*/
|
|
static inline bool is_sysrq_oom(struct oom_control *oc)
|
|
{
|
|
return oc->order == -1;
|
|
}
|
|
|
|
static inline bool is_memcg_oom(struct oom_control *oc)
|
|
{
|
|
return oc->memcg != NULL;
|
|
}
|
|
|
|
/* return true if the task is not adequate as candidate victim task. */
|
|
static bool oom_unkillable_task(struct task_struct *p,
|
|
struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
|
{
|
|
if (is_global_init(p))
|
|
return true;
|
|
if (p->flags & PF_KTHREAD)
|
|
return true;
|
|
|
|
/* When mem_cgroup_out_of_memory() and p is not member of the group */
|
|
if (memcg && !task_in_mem_cgroup(p, memcg))
|
|
return true;
|
|
|
|
/* p may not have freeable memory in nodemask */
|
|
if (!has_intersects_mems_allowed(p, nodemask))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/**
|
|
* oom_badness - heuristic function to determine which candidate task to kill
|
|
* @p: task struct of which task we should calculate
|
|
* @totalpages: total present RAM allowed for page allocation
|
|
*
|
|
* The heuristic for determining which task to kill is made to be as simple and
|
|
* predictable as possible. The goal is to return the highest value for the
|
|
* task consuming the most memory to avoid subsequent oom failures.
|
|
*/
|
|
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
|
|
const nodemask_t *nodemask, unsigned long totalpages)
|
|
{
|
|
long points;
|
|
long adj;
|
|
|
|
if (oom_unkillable_task(p, memcg, nodemask))
|
|
return 0;
|
|
|
|
p = find_lock_task_mm(p);
|
|
if (!p)
|
|
return 0;
|
|
|
|
/*
|
|
* Do not even consider tasks which are explicitly marked oom
|
|
* unkillable or have been already oom reaped or the are in
|
|
* the middle of vfork
|
|
*/
|
|
adj = (long)p->signal->oom_score_adj;
|
|
if (adj == OOM_SCORE_ADJ_MIN ||
|
|
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
|
|
in_vfork(p)) {
|
|
task_unlock(p);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* The baseline for the badness score is the proportion of RAM that each
|
|
* task's rss, pagetable and swap space use.
|
|
*/
|
|
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
|
|
atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
|
|
task_unlock(p);
|
|
|
|
/*
|
|
* Root processes get 3% bonus, just like the __vm_enough_memory()
|
|
* implementation used by LSMs.
|
|
*/
|
|
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
|
|
points -= (points * 3) / 100;
|
|
|
|
/* Normalize to oom_score_adj units */
|
|
adj *= totalpages / 1000;
|
|
points += adj;
|
|
|
|
/*
|
|
* Never return 0 for an eligible task regardless of the root bonus and
|
|
* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
|
|
*/
|
|
return points > 0 ? points : 1;
|
|
}
|
|
|
|
/* Kind of restriction in effect for a failed allocation, see constrained_alloc(). */
enum oom_constraint {
	CONSTRAINT_NONE = 0,		/* no specific constraint identified */
	CONSTRAINT_CPUSET = 1,		/* a zone was refused by the cpuset */
	CONSTRAINT_MEMORY_POLICY = 2,	/* nodemask does not cover all memory nodes */
	CONSTRAINT_MEMCG = 3,		/* memory cgroup oom */
};
|
|
|
|
/*
|
|
* Determine the type of allocation constraint.
|
|
*/
|
|
static enum oom_constraint constrained_alloc(struct oom_control *oc)
|
|
{
|
|
struct zone *zone;
|
|
struct zoneref *z;
|
|
enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
|
|
bool cpuset_limited = false;
|
|
int nid;
|
|
|
|
if (is_memcg_oom(oc)) {
|
|
oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1;
|
|
return CONSTRAINT_MEMCG;
|
|
}
|
|
|
|
/* Default to all available memory */
|
|
oc->totalpages = totalram_pages + total_swap_pages;
|
|
|
|
if (!IS_ENABLED(CONFIG_NUMA))
|
|
return CONSTRAINT_NONE;
|
|
|
|
if (!oc->zonelist)
|
|
return CONSTRAINT_NONE;
|
|
/*
|
|
* Reach here only when __GFP_NOFAIL is used. So, we should avoid
|
|
* to kill current.We have to random task kill in this case.
|
|
* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
|
|
*/
|
|
if (oc->gfp_mask & __GFP_THISNODE)
|
|
return CONSTRAINT_NONE;
|
|
|
|
/*
|
|
* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
|
|
* the page allocator means a mempolicy is in effect. Cpuset policy
|
|
* is enforced in get_page_from_freelist().
|
|
*/
|
|
if (oc->nodemask &&
|
|
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
|
|
oc->totalpages = total_swap_pages;
|
|
for_each_node_mask(nid, *oc->nodemask)
|
|
oc->totalpages += node_spanned_pages(nid);
|
|
return CONSTRAINT_MEMORY_POLICY;
|
|
}
|
|
|
|
/* Check this allocation failure is caused by cpuset's wall function */
|
|
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
|
|
high_zoneidx, oc->nodemask)
|
|
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
|
|
cpuset_limited = true;
|
|
|
|
if (cpuset_limited) {
|
|
oc->totalpages = total_swap_pages;
|
|
for_each_node_mask(nid, cpuset_current_mems_allowed)
|
|
oc->totalpages += node_spanned_pages(nid);
|
|
return CONSTRAINT_CPUSET;
|
|
}
|
|
return CONSTRAINT_NONE;
|
|
}
|
|
|
|
static int oom_evaluate_task(struct task_struct *task, void *arg)
|
|
{
|
|
struct oom_control *oc = arg;
|
|
unsigned long points;
|
|
|
|
if (oom_unkillable_task(task, NULL, oc->nodemask))
|
|
goto next;
|
|
|
|
/*
|
|
* This task already has access to memory reserves and is being killed.
|
|
* Don't allow any other task to have access to the reserves unless
|
|
* the task has MMF_OOM_SKIP because chances that it would release
|
|
* any memory is quite low.
|
|
*/
|
|
if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
|
|
if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
|
|
goto next;
|
|
goto abort;
|
|
}
|
|
|
|
/*
|
|
* If task is allocating a lot of memory and has been marked to be
|
|
* killed first if it triggers an oom, then select it.
|
|
*/
|
|
if (oom_task_origin(task)) {
|
|
points = ULONG_MAX;
|
|
goto select;
|
|
}
|
|
|
|
points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
|
|
if (!points || points < oc->chosen_points)
|
|
goto next;
|
|
|
|
/* Prefer thread group leaders for display purposes */
|
|
if (points == oc->chosen_points && thread_group_leader(oc->chosen))
|
|
goto next;
|
|
select:
|
|
if (oc->chosen)
|
|
put_task_struct(oc->chosen);
|
|
get_task_struct(task);
|
|
oc->chosen = task;
|
|
oc->chosen_points = points;
|
|
next:
|
|
return 0;
|
|
abort:
|
|
if (oc->chosen)
|
|
put_task_struct(oc->chosen);
|
|
oc->chosen = (void *)-1UL;
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Simple selection loop. We choose the process with the highest number of
|
|
* 'points'. In case scan was aborted, oc->chosen is set to -1.
|
|
*/
|
|
static void select_bad_process(struct oom_control *oc)
|
|
{
|
|
if (is_memcg_oom(oc))
|
|
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
|
|
else {
|
|
struct task_struct *p;
|
|
|
|
rcu_read_lock();
|
|
for_each_process(p)
|
|
if (oom_evaluate_task(p, oc))
|
|
break;
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
|
|
}
|
|
|
|
/**
|
|
* dump_tasks - dump current memory state of all system tasks
|
|
* @memcg: current's memory controller, if constrained
|
|
* @nodemask: nodemask passed to page allocator for mempolicy ooms
|
|
*
|
|
* Dumps the current memory state of all eligible tasks. Tasks not in the same
|
|
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
|
|
* are not shown.
|
|
* State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
|
|
* swapents, oom_score_adj value, and name.
|
|
*/
|
|
static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
|
|
{
|
|
struct task_struct *p;
|
|
struct task_struct *task;
|
|
|
|
pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (oom_unkillable_task(p, memcg, nodemask))
|
|
continue;
|
|
|
|
task = find_lock_task_mm(p);
|
|
if (!task) {
|
|
/*
|
|
* This is a kthread or all of p's threads have already
|
|
* detached their mm's. There's no need to report
|
|
* them; they can't be oom killed anyway.
|
|
*/
|
|
continue;
|
|
}
|
|
|
|
pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
|
|
task->pid, from_kuid(&init_user_ns, task_uid(task)),
|
|
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
|
|
atomic_long_read(&task->mm->nr_ptes),
|
|
mm_nr_pmds(task->mm),
|
|
get_mm_counter(task->mm, MM_SWAPENTS),
|
|
task->signal->oom_score_adj, task->comm);
|
|
task_unlock(task);
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
static void dump_header(struct oom_control *oc, struct task_struct *p)
|
|
{
|
|
nodemask_t *nm = (oc->nodemask) ? oc->nodemask : &cpuset_current_mems_allowed;
|
|
|
|
pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
|
|
current->comm, oc->gfp_mask, &oc->gfp_mask,
|
|
nodemask_pr_args(nm), oc->order,
|
|
current->signal->oom_score_adj);
|
|
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
|
|
pr_warn("COMPACTION is disabled!!!\n");
|
|
|
|
cpuset_print_current_mems_allowed();
|
|
dump_stack();
|
|
if (oc->memcg)
|
|
mem_cgroup_print_oom_info(oc->memcg, p);
|
|
else
|
|
show_mem(SHOW_MEM_FILTER_NODES);
|
|
if (sysctl_oom_dump_tasks)
|
|
dump_tasks(oc->memcg, oc->nodemask);
|
|
}
|
|
|
|
/*
|
|
* Number of OOM victims in flight
|
|
*/
|
|
static atomic_t oom_victims = ATOMIC_INIT(0);
|
|
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
|
|
|
|
static bool oom_killer_disabled __read_mostly;
|
|
|
|
#define K(x) ((x) << (PAGE_SHIFT-10))
|
|
|
|
/*
|
|
* task->mm can be NULL if the task is the exited group leader. So to
|
|
* determine whether the task is using a particular mm, we examine all the
|
|
* task's threads: if one of those is using this mm then this task was also
|
|
* using it.
|
|
*/
|
|
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
|
|
{
|
|
struct task_struct *t;
|
|
|
|
for_each_thread(p, t) {
|
|
struct mm_struct *t_mm = READ_ONCE(t->mm);
|
|
if (t_mm)
|
|
return t_mm == mm;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_MMU
|
|
/*
 * State of the OOM reaper, a kernel thread that tries to reap the memory
 * used by an OOM victim (if that is possible) so the OOM killer can move on.
 *
 * oom_reaper_th   - the thread itself, NULL if it could not be started
 * oom_reaper_wait - where the thread sleeps until work is queued
 * oom_reaper_list - singly linked list of queued tasks, chained through
 *                   task_struct::oom_reaper_list
 * oom_reaper_lock - protects updates of oom_reaper_list
 */
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);
|
|
|
|
static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
|
|
{
|
|
struct mmu_gather tlb;
|
|
struct vm_area_struct *vma;
|
|
struct zap_details details = {.check_swap_entries = true,
|
|
.ignore_dirty = true};
|
|
bool ret = true;
|
|
|
|
/*
|
|
* We have to make sure to not race with the victim exit path
|
|
* and cause premature new oom victim selection:
|
|
* __oom_reap_task_mm exit_mm
|
|
* mmget_not_zero
|
|
* mmput
|
|
* atomic_dec_and_test
|
|
* exit_oom_victim
|
|
* [...]
|
|
* out_of_memory
|
|
* select_bad_process
|
|
* # no TIF_MEMDIE task selects new victim
|
|
* unmap_page_range # frees some memory
|
|
*/
|
|
mutex_lock(&oom_lock);
|
|
|
|
if (!down_read_trylock(&mm->mmap_sem)) {
|
|
ret = false;
|
|
goto unlock_oom;
|
|
}
|
|
|
|
/*
|
|
* If the mm has notifiers then we would need to invalidate them around
|
|
* unmap_page_range and that is risky because notifiers can sleep and
|
|
* what they do is basically undeterministic. So let's have a short
|
|
* sleep to give the oom victim some more time.
|
|
* TODO: we really want to get rid of this ugly hack and make sure that
|
|
* notifiers cannot block for unbounded amount of time and add
|
|
* mmu_notifier_invalidate_range_{start,end} around unmap_page_range
|
|
*/
|
|
if (mm_has_notifiers(mm)) {
|
|
up_read(&mm->mmap_sem);
|
|
schedule_timeout_idle(HZ);
|
|
goto unlock_oom;
|
|
}
|
|
|
|
/*
|
|
* increase mm_users only after we know we will reap something so
|
|
* that the mmput_async is called only when we have reaped something
|
|
* and delayed __mmput doesn't matter that much
|
|
*/
|
|
if (!mmget_not_zero(mm)) {
|
|
up_read(&mm->mmap_sem);
|
|
goto unlock_oom;
|
|
}
|
|
|
|
/*
|
|
* Tell all users of get_user/copy_from_user etc... that the content
|
|
* is no longer stable. No barriers really needed because unmapping
|
|
* should imply barriers already and the reader would hit a page fault
|
|
* if it stumbled over a reaped memory.
|
|
*/
|
|
set_bit(MMF_UNSTABLE, &mm->flags);
|
|
|
|
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
|
|
if (is_vm_hugetlb_page(vma))
|
|
continue;
|
|
|
|
/*
|
|
* mlocked VMAs require explicit munlocking before unmap.
|
|
* Let's keep it simple here and skip such VMAs.
|
|
*/
|
|
if (vma->vm_flags & VM_LOCKED)
|
|
continue;
|
|
|
|
/*
|
|
* Only anonymous pages have a good chance to be dropped
|
|
* without additional steps which we cannot afford as we
|
|
* are OOM already.
|
|
*
|
|
* We do not even care about fs backed pages because all
|
|
* which are reclaimable have already been reclaimed and
|
|
* we do not want to block exit_mmap by keeping mm ref
|
|
* count elevated without a good reason.
|
|
*/
|
|
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
|
|
tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
|
|
unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
|
|
&details);
|
|
tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
|
|
}
|
|
}
|
|
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
|
|
task_pid_nr(tsk), tsk->comm,
|
|
K(get_mm_counter(mm, MM_ANONPAGES)),
|
|
K(get_mm_counter(mm, MM_FILEPAGES)),
|
|
K(get_mm_counter(mm, MM_SHMEMPAGES)));
|
|
up_read(&mm->mmap_sem);
|
|
|
|
/*
|
|
* Drop our reference but make sure the mmput slow path is called from a
|
|
* different context because we shouldn't risk we get stuck there and
|
|
* put the oom_reaper out of the way.
|
|
*/
|
|
mmput_async(mm);
|
|
unlock_oom:
|
|
mutex_unlock(&oom_lock);
|
|
return ret;
|
|
}
|
|
|
|
#define MAX_OOM_REAP_RETRIES 10
|
|
static void oom_reap_task(struct task_struct *tsk)
|
|
{
|
|
int attempts = 0;
|
|
struct mm_struct *mm = tsk->signal->oom_mm;
|
|
|
|
/* Retry the down_read_trylock(mmap_sem) a few times */
|
|
while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
|
|
schedule_timeout_idle(HZ/10);
|
|
|
|
if (attempts <= MAX_OOM_REAP_RETRIES)
|
|
goto done;
|
|
|
|
|
|
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
|
|
task_pid_nr(tsk), tsk->comm);
|
|
debug_show_all_locks();
|
|
|
|
done:
|
|
tsk->oom_reaper_list = NULL;
|
|
|
|
/*
|
|
* Hide this mm from OOM killer because it has been either reaped or
|
|
* somebody can't call up_write(mmap_sem).
|
|
*/
|
|
set_bit(MMF_OOM_SKIP, &mm->flags);
|
|
|
|
/* Drop a reference taken by wake_oom_reaper */
|
|
put_task_struct(tsk);
|
|
}
|
|
|
|
static int oom_reaper(void *unused)
|
|
{
|
|
while (true) {
|
|
struct task_struct *tsk = NULL;
|
|
|
|
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
|
|
spin_lock(&oom_reaper_lock);
|
|
if (oom_reaper_list != NULL) {
|
|
tsk = oom_reaper_list;
|
|
oom_reaper_list = tsk->oom_reaper_list;
|
|
}
|
|
spin_unlock(&oom_reaper_lock);
|
|
|
|
if (tsk)
|
|
oom_reap_task(tsk);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void wake_oom_reaper(struct task_struct *tsk)
|
|
{
|
|
if (!oom_reaper_th)
|
|
return;
|
|
|
|
/* mm is already queued? */
|
|
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
|
|
return;
|
|
|
|
get_task_struct(tsk);
|
|
|
|
spin_lock(&oom_reaper_lock);
|
|
tsk->oom_reaper_list = oom_reaper_list;
|
|
oom_reaper_list = tsk;
|
|
spin_unlock(&oom_reaper_lock);
|
|
wake_up(&oom_reaper_wait);
|
|
}
|
|
|
|
static int __init oom_init(void)
|
|
{
|
|
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
|
|
if (IS_ERR(oom_reaper_th)) {
|
|
pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
|
|
PTR_ERR(oom_reaper_th));
|
|
oom_reaper_th = NULL;
|
|
}
|
|
return 0;
|
|
}
|
|
subsys_initcall(oom_init)
|
|
#else
|
|
/*
 * Fallback for the #else branch of the surrounding conditional (closed by
 * "#endif CONFIG_MMU" below): no reaper is available in this
 * configuration, so queueing a victim does nothing.
 */
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
|
|
#endif /* CONFIG_MMU */
|
|
|
|
/**
|
|
* mark_oom_victim - mark the given task as OOM victim
|
|
* @tsk: task to mark
|
|
*
|
|
* Has to be called with oom_lock held and never after
|
|
* oom has been disabled already.
|
|
*
|
|
* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
|
|
* under task_lock or operate on the current).
|
|
*/
|
|
static void mark_oom_victim(struct task_struct *tsk)
|
|
{
|
|
struct mm_struct *mm = tsk->mm;
|
|
|
|
WARN_ON(oom_killer_disabled);
|
|
/* OOM killer might race with memcg OOM */
|
|
if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
|
|
return;
|
|
|
|
/* oom_mm is bound to the signal struct life time. */
|
|
if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
|
|
atomic_inc(&tsk->signal->oom_mm->mm_count);
|
|
|
|
/*
|
|
* Make sure that the task is woken up from uninterruptible sleep
|
|
* if it is frozen because OOM killer wouldn't be able to free
|
|
* any memory and livelock. freezing_slow_path will tell the freezer
|
|
* that TIF_MEMDIE tasks should be ignored.
|
|
*/
|
|
__thaw_task(tsk);
|
|
atomic_inc(&oom_victims);
|
|
}
|
|
|
|
/**
|
|
* exit_oom_victim - note the exit of an OOM victim
|
|
*/
|
|
void exit_oom_victim(void)
|
|
{
|
|
clear_thread_flag(TIF_MEMDIE);
|
|
|
|
if (!atomic_dec_return(&oom_victims))
|
|
wake_up_all(&oom_victims_wait);
|
|
}
|
|
|
|
/**
|
|
* oom_killer_enable - enable OOM killer
|
|
*/
|
|
void oom_killer_enable(void)
|
|
{
|
|
oom_killer_disabled = false;
|
|
}
|
|
|
|
/**
|
|
* oom_killer_disable - disable OOM killer
|
|
* @timeout: maximum timeout to wait for oom victims in jiffies
|
|
*
|
|
* Forces all page allocations to fail rather than trigger OOM killer.
|
|
* Will block and wait until all OOM victims are killed or the given
|
|
* timeout expires.
|
|
*
|
|
* The function cannot be called when there are runnable user tasks because
|
|
* the userspace would see unexpected allocation failures as a result. Any
|
|
* new usage of this function should be consulted with MM people.
|
|
*
|
|
* Returns true if successful and false if the OOM killer cannot be
|
|
* disabled.
|
|
*/
|
|
bool oom_killer_disable(signed long timeout)
|
|
{
|
|
signed long ret;
|
|
|
|
/*
|
|
* Make sure to not race with an ongoing OOM killer. Check that the
|
|
* current is not killed (possibly due to sharing the victim's memory).
|
|
*/
|
|
if (mutex_lock_killable(&oom_lock))
|
|
return false;
|
|
oom_killer_disabled = true;
|
|
mutex_unlock(&oom_lock);
|
|
|
|
ret = wait_event_interruptible_timeout(oom_victims_wait,
|
|
!atomic_read(&oom_victims), timeout);
|
|
if (ret <= 0) {
|
|
oom_killer_enable();
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
static inline bool __task_will_free_mem(struct task_struct *task)
|
|
{
|
|
struct signal_struct *sig = task->signal;
|
|
|
|
/*
|
|
* A coredumping process may sleep for an extended period in exit_mm(),
|
|
* so the oom killer cannot assume that the process will promptly exit
|
|
* and release memory.
|
|
*/
|
|
if (sig->flags & SIGNAL_GROUP_COREDUMP)
|
|
return false;
|
|
|
|
if (sig->flags & SIGNAL_GROUP_EXIT)
|
|
return true;
|
|
|
|
if (thread_group_empty(task) && (task->flags & PF_EXITING))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Checks whether the given task is dying or exiting and likely to
|
|
* release its address space. This means that all threads and processes
|
|
* sharing the same mm have to be killed or exiting.
|
|
* Caller has to make sure that task->mm is stable (hold task_lock or
|
|
* it operates on the current).
|
|
*/
|
|
static bool task_will_free_mem(struct task_struct *task)
|
|
{
|
|
struct mm_struct *mm = task->mm;
|
|
struct task_struct *p;
|
|
bool ret = true;
|
|
|
|
/*
|
|
* Skip tasks without mm because it might have passed its exit_mm and
|
|
* exit_oom_victim. oom_reaper could have rescued that but do not rely
|
|
* on that for now. We can consider find_lock_task_mm in future.
|
|
*/
|
|
if (!mm)
|
|
return false;
|
|
|
|
if (!__task_will_free_mem(task))
|
|
return false;
|
|
|
|
/*
|
|
* This task has already been drained by the oom reaper so there are
|
|
* only small chances it will free some more
|
|
*/
|
|
if (test_bit(MMF_OOM_SKIP, &mm->flags))
|
|
return false;
|
|
|
|
if (atomic_read(&mm->mm_users) <= 1)
|
|
return true;
|
|
|
|
/*
|
|
* Make sure that all tasks which share the mm with the given tasks
|
|
* are dying as well to make sure that a) nobody pins its mm and
|
|
* b) the task is also reapable by the oom reaper.
|
|
*/
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (!process_shares_mm(p, mm))
|
|
continue;
|
|
if (same_thread_group(task, p))
|
|
continue;
|
|
ret = __task_will_free_mem(p);
|
|
if (!ret)
|
|
break;
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
return ret;
|
|
}
|
|
|
|
static void oom_kill_process(struct oom_control *oc, const char *message)
|
|
{
|
|
struct task_struct *p = oc->chosen;
|
|
unsigned int points = oc->chosen_points;
|
|
struct task_struct *victim = p;
|
|
struct task_struct *child;
|
|
struct task_struct *t;
|
|
struct mm_struct *mm;
|
|
unsigned int victim_points = 0;
|
|
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
|
DEFAULT_RATELIMIT_BURST);
|
|
bool can_oom_reap = true;
|
|
|
|
/*
|
|
* If the task is already exiting, don't alarm the sysadmin or kill
|
|
* its children or threads, just set TIF_MEMDIE so it can die quickly
|
|
*/
|
|
task_lock(p);
|
|
if (task_will_free_mem(p)) {
|
|
mark_oom_victim(p);
|
|
wake_oom_reaper(p);
|
|
task_unlock(p);
|
|
put_task_struct(p);
|
|
return;
|
|
}
|
|
task_unlock(p);
|
|
|
|
if (__ratelimit(&oom_rs))
|
|
dump_header(oc, p);
|
|
|
|
pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
|
|
message, task_pid_nr(p), p->comm, points);
|
|
|
|
/*
|
|
* If any of p's children has a different mm and is eligible for kill,
|
|
* the one with the highest oom_badness() score is sacrificed for its
|
|
* parent. This attempts to lose the minimal amount of work done while
|
|
* still freeing memory.
|
|
*/
|
|
read_lock(&tasklist_lock);
|
|
|
|
/*
|
|
* The task 'p' might have already exited before reaching here. The
|
|
* put_task_struct() will free task_struct 'p' while the loop still try
|
|
* to access the field of 'p', so, get an extra reference.
|
|
*/
|
|
get_task_struct(p);
|
|
for_each_thread(p, t) {
|
|
list_for_each_entry(child, &t->children, sibling) {
|
|
unsigned int child_points;
|
|
|
|
if (process_shares_mm(child, p->mm))
|
|
continue;
|
|
/*
|
|
* oom_badness() returns 0 if the thread is unkillable
|
|
*/
|
|
child_points = oom_badness(child,
|
|
oc->memcg, oc->nodemask, oc->totalpages);
|
|
if (child_points > victim_points) {
|
|
put_task_struct(victim);
|
|
victim = child;
|
|
victim_points = child_points;
|
|
get_task_struct(victim);
|
|
}
|
|
}
|
|
}
|
|
put_task_struct(p);
|
|
read_unlock(&tasklist_lock);
|
|
|
|
p = find_lock_task_mm(victim);
|
|
if (!p) {
|
|
put_task_struct(victim);
|
|
return;
|
|
} else if (victim != p) {
|
|
get_task_struct(p);
|
|
put_task_struct(victim);
|
|
victim = p;
|
|
}
|
|
|
|
/* Get a reference to safely compare mm after task_unlock(victim) */
|
|
mm = victim->mm;
|
|
atomic_inc(&mm->mm_count);
|
|
/*
|
|
* We should send SIGKILL before setting TIF_MEMDIE in order to prevent
|
|
* the OOM victim from depleting the memory reserves from the user
|
|
* space under its control.
|
|
*/
|
|
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
|
|
mark_oom_victim(victim);
|
|
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
|
|
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
|
|
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
|
|
K(get_mm_counter(victim->mm, MM_FILEPAGES)),
|
|
K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
|
|
task_unlock(victim);
|
|
|
|
/*
|
|
* Kill all user processes sharing victim->mm in other thread groups, if
|
|
* any. They don't get access to memory reserves, though, to avoid
|
|
* depletion of all memory. This prevents mm->mmap_sem livelock when an
|
|
* oom killed thread cannot exit because it requires the semaphore and
|
|
* its contended by another thread trying to allocate memory itself.
|
|
* That thread will now get access to memory reserves since it has a
|
|
* pending fatal signal.
|
|
*/
|
|
rcu_read_lock();
|
|
for_each_process(p) {
|
|
if (!process_shares_mm(p, mm))
|
|
continue;
|
|
if (same_thread_group(p, victim))
|
|
continue;
|
|
if (is_global_init(p)) {
|
|
can_oom_reap = false;
|
|
set_bit(MMF_OOM_SKIP, &mm->flags);
|
|
pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
|
|
task_pid_nr(victim), victim->comm,
|
|
task_pid_nr(p), p->comm);
|
|
continue;
|
|
}
|
|
/*
|
|
* No use_mm() user needs to read from the userspace so we are
|
|
* ok to reap it.
|
|
*/
|
|
if (unlikely(p->flags & PF_KTHREAD))
|
|
continue;
|
|
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
if (can_oom_reap)
|
|
wake_oom_reaper(victim);
|
|
|
|
mmdrop(mm);
|
|
put_task_struct(victim);
|
|
}
|
|
#undef K
|
|
|
|
/*
|
|
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
|
|
*/
|
|
static void check_panic_on_oom(struct oom_control *oc,
|
|
enum oom_constraint constraint)
|
|
{
|
|
if (likely(!sysctl_panic_on_oom))
|
|
return;
|
|
if (sysctl_panic_on_oom != 2) {
|
|
/*
|
|
* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
|
|
* does not panic for cpuset, mempolicy, or memcg allocation
|
|
* failures.
|
|
*/
|
|
if (constraint != CONSTRAINT_NONE)
|
|
return;
|
|
}
|
|
/* Do not panic for oom kills triggered by sysrq */
|
|
if (is_sysrq_oom(oc))
|
|
return;
|
|
dump_header(oc, NULL);
|
|
panic("Out of memory: %s panic_on_oom is enabled\n",
|
|
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
|
|
}
|
|
|
|
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
|
|
|
|
int register_oom_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_register(&oom_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(register_oom_notifier);
|
|
|
|
int unregister_oom_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_unregister(&oom_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
|
|
|
|
/**
|
|
* out_of_memory - kill the "best" process when we run out of memory
|
|
* @oc: pointer to struct oom_control
|
|
*
|
|
* If we run out of memory, we have the choice between either
|
|
* killing a random task (bad), letting the system crash (worse)
|
|
* OR try to be smart about which process to kill. Note that we
|
|
* don't have to be perfect here, we just have to be good.
|
|
*/
|
|
bool out_of_memory(struct oom_control *oc)
|
|
{
|
|
unsigned long freed = 0;
|
|
enum oom_constraint constraint = CONSTRAINT_NONE;
|
|
|
|
if (oom_killer_disabled)
|
|
return false;
|
|
|
|
if (!is_memcg_oom(oc)) {
|
|
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
|
if (freed > 0)
|
|
/* Got some memory back in the last second. */
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* If current has a pending SIGKILL or is exiting, then automatically
|
|
* select it. The goal is to allow it to allocate so that it may
|
|
* quickly exit and free its memory.
|
|
*/
|
|
if (task_will_free_mem(current)) {
|
|
mark_oom_victim(current);
|
|
wake_oom_reaper(current);
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* The OOM killer does not compensate for IO-less reclaim.
|
|
* pagefault_out_of_memory lost its gfp context so we have to
|
|
* make sure exclude 0 mask - all other users should have at least
|
|
* ___GFP_DIRECT_RECLAIM to get here.
|
|
*/
|
|
if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
|
|
return true;
|
|
|
|
/*
|
|
* Check if there were limitations on the allocation (only relevant for
|
|
* NUMA and memcg) that may require different handling.
|
|
*/
|
|
constraint = constrained_alloc(oc);
|
|
if (constraint != CONSTRAINT_MEMORY_POLICY)
|
|
oc->nodemask = NULL;
|
|
check_panic_on_oom(oc, constraint);
|
|
|
|
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
|
|
current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
|
|
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
|
|
get_task_struct(current);
|
|
oc->chosen = current;
|
|
oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
|
|
return true;
|
|
}
|
|
|
|
select_bad_process(oc);
|
|
/* Found nothing?!?! Either we hang forever, or we panic. */
|
|
if (!oc->chosen && !is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
|
|
dump_header(oc, NULL);
|
|
panic("Out of memory and no killable processes...\n");
|
|
}
|
|
if (oc->chosen && oc->chosen != (void *)-1UL) {
|
|
oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
|
|
"Memory cgroup out of memory");
|
|
/*
|
|
* Give the killed process a good chance to exit before trying
|
|
* to allocate memory again.
|
|
*/
|
|
schedule_timeout_killable(1);
|
|
}
|
|
return !!oc->chosen;
|
|
}
|
|
|
|
/*
|
|
* The pagefault handler calls here because some allocation has failed. We have
|
|
* to take care of the memcg OOM here because this is the only safe context without
|
|
* any locks held but let the oom killer triggered from the allocation context care
|
|
* about the global OOM.
|
|
*/
|
|
void pagefault_out_of_memory(void)
|
|
{
|
|
static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
|
|
DEFAULT_RATELIMIT_BURST);
|
|
|
|
if (mem_cgroup_oom_synchronize(true))
|
|
return;
|
|
|
|
if (fatal_signal_pending(current))
|
|
return;
|
|
|
|
if (__ratelimit(&pfoom_rs))
|
|
pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
|
|
}
|