1
0
Files
kernel-49/arch/x86/mm/gup.c
Greg Kroah-Hartman fee50d6688 Merge 4.9.298 into android-4.9-q
Changes in 4.9.298
	Bluetooth: bfusb: fix division by zero in send path
	USB: core: Fix bug in resuming hub's handling of wakeup requests
	USB: Fix "slab-out-of-bounds Write" bug in usb_hcd_poll_rh_status
	mfd: intel-lpss: Fix too early PM enablement in the ACPI ->probe()
	can: gs_usb: fix use of uninitialized variable, detach device on reception of invalid USB data
	can: gs_usb: gs_can_start_xmit(): zero-initialize hf->{flags,reserved}
	random: fix data race on crng_node_pool
	random: fix data race on crng init time
	staging: wlan-ng: Avoid bitwise vs logical OR warning in hfa384x_usb_throttlefn()
	drm/i915: Avoid bitwise vs logical OR warning in snb_wm_latency_quirk()
	media: uvcvideo: fix division by zero at stream start
	rtlwifi: rtl8192cu: Fix WARNING when calling local_irq_restore() with interrupts enabled
	HID: uhid: Fix worker destroying device without any protection
	HID: wacom: Avoid using stale array indicies to read contact count
	nfc: llcp: fix NULL error pointer dereference on sendmsg() after failed bind()
	rtc: cmos: take rtc_lock while reading from CMOS
	media: flexcop-usb: fix control-message timeouts
	media: mceusb: fix control-message timeouts
	media: em28xx: fix control-message timeouts
	media: cpia2: fix control-message timeouts
	media: s2255: fix control-message timeouts
	media: dib0700: fix undefined behavior in tuner shutdown
	media: redrat3: fix control-message timeouts
	media: pvrusb2: fix control-message timeouts
	media: stk1160: fix control-message timeouts
	can: softing_cs: softingcs_probe(): fix memleak on registration failure
	PCI: Add function 1 DMA alias quirk for Marvell 88SE9125 SATA controller
	shmem: fix a race between shmem_unused_huge_shrink and shmem_evict_inode
	Bluetooth: cmtp: fix possible panic when cmtp_init_sockets() fails
	wcn36xx: Indicate beacon not connection loss on MISSED_BEACON_IND
	Bluetooth: stop proccessing malicious adv data
	media: dmxdev: fix UAF when dvb_register_device() fails
	crypto: qce - fix uaf on qce_ahash_register_one
	tty: serial: atmel: Check return code of dmaengine_submit()
	tty: serial: atmel: Call dma_async_issue_pending()
	netfilter: bridge: add support for pppoe filtering
	arm64: dts: qcom: msm8916: fix MMC controller aliases
	drm/amdgpu: Fix a NULL pointer dereference in amdgpu_connector_lcd_native_mode()
	drm/radeon/radeon_kms: Fix a NULL pointer dereference in radeon_driver_open_kms()
	serial: amba-pl011: do not request memory region twice
	floppy: Fix hang in watchdog when disk is ejected
	media: dib8000: Fix a memleak in dib8000_init()
	media: saa7146: mxb: Fix a NULL pointer dereference in mxb_attach()
	media: si2157: Fix "warm" tuner state detection
	media: msi001: fix possible null-ptr-deref in msi001_probe()
	usb: ftdi-elan: fix memory leak on device disconnect
	pcmcia: rsrc_nonstatic: Fix a NULL pointer dereference in __nonstatic_find_io_region()
	pcmcia: rsrc_nonstatic: Fix a NULL pointer dereference in nonstatic_find_mem_region()
	ppp: ensure minimum packet size in ppp_write()
	fsl/fman: Check for null pointer after calling devm_ioremap
	spi: spi-meson-spifc: Add missing pm_runtime_disable() in meson_spifc_probe
	can: softing: softing_startstop(): fix set but not used variable warning
	can: xilinx_can: xcan_probe(): check for error irq
	pcmcia: fix setting of kthread task states
	net: mcs7830: handle usb read errors properly
	ext4: avoid trim error on fs with small groups
	ALSA: jack: Add missing rwsem around snd_ctl_remove() calls
	ALSA: PCM: Add missing rwsem around snd_ctl_remove() calls
	ALSA: hda: Add missing rwsem around snd_ctl_remove() calls
	RDMA/hns: Validate the pkey index
	powerpc/prom_init: Fix improper check of prom_getprop()
	ALSA: oss: fix compile error when OSS_DEBUG is enabled
	char/mwave: Adjust io port register size
	scsi: ufs: Fix race conditions related to driver data
	RDMA/core: Let ib_find_gid() continue search even after empty entry
	dmaengine: pxa/mmp: stop referencing config->slave_id
	ASoC: samsung: idma: Check of ioremap return value
	misc: lattice-ecp3-config: Fix task hung when firmware load failed
	mips: lantiq: add support for clk_set_parent()
	mips: bcm63xx: add support for clk_set_parent()
	RDMA/cxgb4: Set queue pair state when being queried
	Bluetooth: Fix debugfs entry leak in hci_register_dev()
	fs: dlm: filter user dlm messages for kernel locks
	ar5523: Fix null-ptr-deref with unexpected WDCMSG_TARGET_START reply
	usb: gadget: f_fs: Use stream_open() for endpoint files
	HID: apple: Do not reset quirks when the Fn key is not found
	media: b2c2: Add missing check in flexcop_pci_isr:
	gpiolib: acpi: Do not set the IRQ type if the IRQ is already in use
	HSI: core: Fix return freed object in hsi_new_client
	mwifiex: Fix skb_over_panic in mwifiex_usb_recv()
	floppy: Add max size check for user space request
	media: saa7146: hexium_orion: Fix a NULL pointer dereference in hexium_attach()
	media: m920x: don't use stack on USB reads
	iwlwifi: mvm: synchronize with FW after multicast commands
	ath10k: Fix tx hanging
	net: bonding: debug: avoid printing debug logs when bond is not notifying peers
	media: igorplugusb: receiver overflow should be reported
	media: saa7146: hexium_gemini: Fix a NULL pointer dereference in hexium_attach()
	usb: hub: Add delay for SuperSpeed hub resume to let links transit to U0
	ath9k: Fix out-of-bound memcpy in ath9k_hif_usb_rx_stream
	um: registers: Rename function names to avoid conflicts and build problems
	jffs2: GC deadlock reading a page that is used in jffs2_write_begin()
	ACPICA: Utilities: Avoid deleting the same object twice in a row
	ACPICA: Executer: Fix the REFCLASS_REFOF case in acpi_ex_opcode_1A_0T_1R()
	btrfs: remove BUG_ON() in find_parent_nodes()
	btrfs: remove BUG_ON(!eie) in find_parent_nodes
	net: mdio: Demote probed message to debug print
	dm btree: add a defensive bounds check to insert_at()
	dm space map common: add bounds check to sm_ll_lookup_bitmap()
	serial: pl010: Drop CR register reset on set_termios
	serial: core: Keep mctrl register state and cached copy in sync
	parisc: Avoid calling faulthandler_disabled() twice
	powerpc/6xx: add missing of_node_put
	powerpc/powernv: add missing of_node_put
	powerpc/cell: add missing of_node_put
	powerpc/btext: add missing of_node_put
	i2c: i801: Don't silently correct invalid transfer size
	powerpc/smp: Move setup_profiling_timer() under CONFIG_PROFILING
	i2c: mpc: Correct I2C reset procedure
	w1: Misuse of get_user()/put_user() reported by sparse
	ALSA: seq: Set upper limit of processed events
	i2c: designware-pci: Fix to change data types of hcnt and lcnt parameters
	MIPS: Octeon: Fix build errors using clang
	scsi: sr: Don't use GFP_DMA
	ASoC: mediatek: mt8173: fix device_node leak
	power: bq25890: Enable continuous conversion for ADC at charging
	ubifs: Error path in ubifs_remount_rw() seems to wrongly free write buffers
	iwlwifi: mvm: Increase the scan timeout guard to 30 seconds
	ext4: set csum seed in tmp inode while migrating to extents
	ext4: Fix BUG_ON in ext4_bread when write quota data
	ext4: don't use the orphan list when migrating an inode
	fuse: fix bad inode
	fuse: fix live lock in fuse_iget()
	drm/radeon: fix error handling in radeon_driver_open_kms
	RDMA/hns: Modify the mapping attribute of doorbell to device
	RDMA/rxe: Fix a typo in opcode name
	powerpc/fsl/dts: Enable WA for erratum A-009885 on fman3l MDIO buses
	net/fsl: xgmac_mdio: Fix incorrect iounmap when removing module
	parisc: pdc_stable: Fix memory leak in pdcs_register_pathentries
	af_unix: annote lockless accesses to unix_tot_inflight & gc_in_progress
	net: axienet: Wait for PhyRstCmplt after core reset
	net: axienet: fix number of TX ring slots for available check
	netns: add schedule point in ops_exit_list()
	libcxgb: Don't accidentally set RTO_ONLINK in cxgb_find_route()
	dmaengine: at_xdmac: Don't start transactions at tx_submit level
	dmaengine: at_xdmac: Print debug message after realeasing the lock
	dmaengine: at_xdmac: Fix lld view setting
	dmaengine: at_xdmac: Fix at_xdmac_lld struct definition
	net_sched: restore "mpu xxx" handling
	bcmgenet: add WOL IRQ check
	scripts/dtc: dtx_diff: remove broken example from help text
	lib82596: Fix IRQ check in sni_82596_probe
	Revert "gup: document and work around "COW can break either way" issue"
	gup: document and work around "COW can break either way" issue
	drm/ttm/nouveau: don't call tt destroy callback on alloc failure.
	gianfar: simplify FCS handling and fix memory leak
	gianfar: fix jumbo packets+napi+rx overrun crash
	cipso,calipso: resolve a number of problems with the DOI refcounts
	rbtree: cache leftmost node internally
	lib/timerqueue: Rely on rbtree semantics for next timer
	mm: add follow_pte_pmd()
	KVM: do not assume PTE is writable after follow_pfn
	KVM: Use kvm_pfn_t for local PFN variable in hva_to_pfn_remapped()
	KVM: do not allow mapping valid but non-reference-counted pages
	Linux 4.9.298

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ifcea82a702a0906d9090c89785363c2d5423f652
2022-01-31 17:10:19 +03:00

497 lines
12 KiB
C

/*
* Lockless get_user_pages_fast for x86
*
* Copyright (C) 2008 Nick Piggin
* Copyright (C) 2008 Novell Inc.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/memremap.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
return READ_ONCE(*ptep);
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
* any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a pte will only either go from not
* present to present, or present to not present or both -- it will not
* switch to a completely different present page without a TLB flush in
* between; something that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees l iff pte_high
* sees h. We load pte_high *after* loading pte_low, which ensures we
* don't see an older value of pte_high. *Then* we recheck pte_low,
* which ensures that we haven't picked up a changed pte high. We might
* have got rubbish values from pte_low and pte_high, but we are
* guaranteed that pte_low will not have the present bit set *unless*
* it is 'l'. And get_user_pages_fast only operates on present ptes, so
* we're safe.
*
* gup_get_pte should not be used or copied outside gup.c without being
* very careful -- it does not atomically load the pte or anything that
* is likely to be useful for you.
*/
pte_t pte;
retry:
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
if (unlikely(pte.pte_low != ptep->pte_low))
goto retry;
return pte;
#endif
}
static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
ClearPageReferenced(page);
put_page(page);
}
}
/*
* 'pteval' can come from a pte, pmd or pud. We only check
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
* same value on all 3 types.
*/
static inline int pte_allows_gup(unsigned long pteval, int write)
{
unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
if (write)
need_pte_bits |= _PAGE_RW;
if ((pteval & need_pte_bits) != need_pte_bits)
return 0;
/* Check memory protection keys permissions. */
if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
return 0;
return 1;
}
/*
* Return the compund head page with ref appropriately incremented,
* or NULL if that failed.
*/
static inline struct page *try_get_compound_head(struct page *page, int refs)
{
struct page *head = compound_head(page);
if (WARN_ON_ONCE(page_ref_count(head) < 0))
return NULL;
if (unlikely(!page_cache_add_speculative(head, refs)))
return NULL;
return head;
}
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
*/
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr;
pte_t *ptep;
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
struct page *head, *page;
/* Similar to the PMD case, NUMA hinting must take slow path */
if (pte_protnone(pte)) {
pte_unmap(ptep);
return 0;
}
if (!pte_allows_gup(pte_val(pte), write)) {
pte_unmap(ptep);
return 0;
}
if (pte_devmap(pte)) {
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
pte_unmap(ptep);
return 0;
}
} else if (pte_special(pte)) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
head = try_get_compound_head(page, 1);
if (!head) {
put_dev_pagemap(pgmap);
pte_unmap(ptep);
return 0;
}
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
put_page(head);
put_dev_pagemap(pgmap);
pte_unmap(ptep);
return 0;
}
put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
} while (ptep++, addr += PAGE_SIZE, addr != end);
pte_unmap(ptep - 1);
return 1;
}
static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON_PAGE(page != compound_head(page), page);
VM_BUG_ON_PAGE(page_count(page) == 0, page);
page_ref_add(page, nr);
SetPageReferenced(page);
}
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
int nr_start = *nr;
unsigned long pfn = pmd_pfn(pmd);
struct dev_pagemap *pgmap = NULL;
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
do {
struct page *page = pfn_to_page(pfn);
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
if (unlikely(!try_get_page(page))) {
put_dev_pagemap(pgmap);
return 0;
}
SetPageReferenced(page);
pages[*nr] = page;
put_dev_pagemap(pgmap);
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
return 1;
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
if (!pte_allows_gup(pmd_val(pmd), write))
return 0;
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
if (pmd_devmap(pmd))
return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
/* hugepages are never "special" */
VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
refs = 0;
head = pmd_page(pmd);
if (WARN_ON_ONCE(page_ref_count(head) <= 0))
return 0;
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
get_head_page_multiple(head, refs);
return 1;
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
pmdp = pmd_offset(&pud, addr);
do {
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
if (pmd_none(pmd))
return 0;
if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
} else {
if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
}
} while (pmdp++, addr = next, addr != end);
return 1;
}
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
if (!pte_allows_gup(pud_val(pud), write))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
refs = 0;
head = pud_page(pud);
if (WARN_ON_ONCE(page_ref_count(head) <= 0))
return 0;
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
get_head_page_multiple(head, refs);
return 1;
}
static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
pudp = pud_offset(&pgd, addr);
do {
pud_t pud = *pudp;
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
if (unlikely(pud_large(pud))) {
if (!gup_huge_pud(pud, addr, next, write, pages, nr))
return 0;
} else {
if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
}
} while (pudp++, addr = next, addr != end);
return 1;
}
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
unsigned long flags;
pgd_t *pgdp;
int nr = 0;
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
(void __user *)start, len)))
return 0;
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
* important workloads (eg. DB2), and whether limiting the batch size
* will decrease performance.
*
* It seems like we're in the clear for the moment. Direct-IO is
* the main guy that batches up lots of get_user_pages, and even
* they are limited to 64-at-a-time which is not so many.
*/
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
*
* So long as we atomically load page table pointers versus teardown
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the the page and take a ref on it.
*/
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
return nr;
}
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @write: whether pages will be written to
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_sem.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
pgd_t *pgdp;
int nr = 0;
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (end < start)
goto slow_irqon;
#ifdef CONFIG_X86_64
if (end >> __VIRTUAL_MASK_SHIFT)
goto slow_irqon;
#endif
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
* important workloads (eg. DB2), and whether limiting the batch size
* will decrease performance.
*
* It seems like we're in the clear for the moment. Direct-IO is
* the main guy that batches up lots of get_user_pages, and even
* they are limited to 64-at-a-time which is not so many.
*/
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
*
* So long as we atomically load page table pointers versus teardown
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the the page and take a ref on it.
*/
local_irq_disable();
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
/*
* The FAST_GUP case requires FOLL_WRITE even for pure reads,
* because get_user_pages() may need to cause an early COW in
* order to avoid confusing the normal COW routines. So only
* targets that are already writable are safe to do by just
* looking at the page tables.
*/
if (!gup_pud_range(pgd, addr, next, 1, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;
{
int ret;
slow:
local_irq_enable();
slow_irqon:
/* Try to get the remaining pages with get_user_pages */
start += nr << PAGE_SHIFT;
pages += nr;
ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT,
pages, write ? FOLL_WRITE : 0);
/* Have to be a bit careful with return values */
if (nr > 0) {
if (ret < 0)
ret = nr;
else
ret += nr;
}
return ret;
}
}