android_kernel_motorola_sm6225/mm/mempolicy.c
Michael Bestas 1b59618ce4
Merge tag 'ASB-2023-09-05_4.19-stable' of https://android.googlesource.com/kernel/common into android13-4.19-kona
https://source.android.com/docs/security/bulletin/2023-09-01

* tag 'ASB-2023-09-05_4.19-stable' of https://android.googlesource.com/kernel/common:
  Linux 4.19.294
  Revert "ARM: ep93xx: fix missing-prototype warnings"
  Revert "MIPS: Alchemy: fix dbdma2"
  Linux 4.19.293
  dma-buf/sw_sync: Avoid recursive lock during fence signal
  clk: Fix undefined reference to `clk_rate_exclusive_{get,put}'
  scsi: core: raid_class: Remove raid_component_add()
  scsi: snic: Fix double free in snic_tgt_create()
  irqchip/mips-gic: Don't touch vl_map if a local interrupt is not routable
  rtnetlink: Reject negative ifindexes in RTM_NEWLINK
  netfilter: nf_queue: fix socket leak
  sched/rt: pick_next_rt_entity(): check list_entry
  mmc: block: Fix in_flight[issue_type] value error
  x86/fpu: Set X86_FEATURE_OSXSAVE feature after enabling OSXSAVE in CR4
  PCI: acpiphp: Use pci_assign_unassigned_bridge_resources() only for non-root bus
  media: vcodec: Fix potential array out-of-bounds in encoder queue_setup
  lib/clz_ctz.c: Fix __clzdi2() and __ctzdi2() for 32-bit kernels
  batman-adv: Fix batadv_v_ogm_aggr_send memory leak
  batman-adv: Fix TT global entry leak when client roamed back
  batman-adv: Do not get eth header before batadv_check_management_packet
  batman-adv: Don't increase MTU when set by user
  batman-adv: Trigger events for auto adjusted MTU
  nfsd: Fix race to FREE_STATEID and cl_revoked
  ibmveth: Use dcbf rather than dcbfl
  ipvs: fix racy memcpy in proc_do_sync_threshold
  ipvs: Improve robustness to the ipvs sysctl
  bonding: fix macvlan over alb bond support
  net: remove bond_slave_has_mac_rcu()
  net/sched: fix a qdisc modification with ambiguous command request
  igb: Avoid starting unnecessary workqueues
  dccp: annotate data-races in dccp_poll()
  sock: annotate data-races around prot->memory_pressure
  tracing: Fix memleak due to race between current_tracer and trace
  drm/amd/display: check TG is non-null before checking if enabled
  drm/amd/display: do not wait for mpc idle if tg is disabled
  regmap: Account for register length in SMBus I/O limits
  dm integrity: reduce vmalloc space footprint on 32-bit architectures
  dm integrity: increase RECALC_SECTORS to improve recalculate speed
  powerpc: Fail build if using recordmcount with binutils v2.37
  powerpc: remove leftover code of old GCC version checks
  powerpc/32: add stack protector support
  fbdev: fix potential OOB read in fast_imageblit()
  fbdev: Fix sys_imageblit() for arbitrary image widths
  fbdev: Improve performance of sys_imageblit()
  tty: serial: fsl_lpuart: add earlycon for imx8ulp platform
  Revert "tty: serial: fsl_lpuart: drop earlycon entry for i.MX8QXP"
  MIPS: cpu-features: Use boot_cpu_type for CPU type based features
  MIPS: cpu-features: Enable octeon_cache by cpu_type
  fs: dlm: fix mismatch of plock results from userspace
  fs: dlm: use dlm_plock_info for do_unlock_close
  fs: dlm: change plock interrupted message to debug again
  fs: dlm: add pid to debug log
  dlm: replace usage of found with dedicated list iterator variable
  dlm: improve plock logging if interrupted
  PCI: acpiphp: Reassign resources on bridge if necessary
  net: phy: broadcom: stub c45 read/write for 54810
  net: xfrm: Amend XFRMA_SEC_CTX nla_policy structure
  net: fix the RTO timer retransmitting skb every 1ms if linear option is enabled
  virtio-net: set queues after driver_ok
  af_unix: Fix null-ptr-deref in unix_stream_sendpage().
  netfilter: set default timeout to 3 secs for sctp shutdown send and recv state
  test_firmware: prevent race conditions by a correct implementation of locking
  mmc: wbsd: fix double mmc_free_host() in wbsd_init()
  cifs: Release folio lock on fscache read hit.
  ALSA: usb-audio: Add support for Mythware XA001AU capture and playback interfaces.
  serial: 8250: Fix oops for port->pm on uart_change_pm()
  ASoC: meson: axg-tdm-formatter: fix channel slot allocation
  ASoC: rt5665: add missed regulator_bulk_disable
  net: do not allow gso_size to be set to GSO_BY_FRAGS
  sock: Fix misuse of sk_under_memory_pressure()
  i40e: fix misleading debug logs
  team: Fix incorrect deletion of ETH_P_8021AD protocol vid from slaves
  netfilter: nft_dynset: disallow object maps
  selftests: mirror_gre_changes: Tighten up the TTL test match
  xfrm: add NULL check in xfrm_update_ae_params
  ip_vti: fix potential slab-use-after-free in decode_session6
  ip6_vti: fix slab-use-after-free in decode_session6
  xfrm: fix slab-use-after-free in decode_session6
  xfrm: interface: rename xfrm_interface.c to xfrm_interface_core.c
  net: af_key: fix sadb_x_filter validation
  net: xfrm: Fix xfrm_address_filter OOB read
  btrfs: fix BUG_ON condition in btrfs_cancel_balance
  powerpc/rtas_flash: allow user copy to flash block cache objects
  fbdev: mmp: fix value check in mmphw_probe()
  virtio-mmio: don't break lifecycle of vm_dev
  virtio-mmio: Use to_virtio_mmio_device() to simply code
  virtio-mmio: convert to devm_platform_ioremap_resource
  nfsd: Remove incorrect check in nfsd4_validate_stateid
  nfsd4: kill warnings on testing stateids with mismatched clientids
  block: fix signed int overflow in Amiga partition support
  mmc: sunxi: fix deferred probing
  mmc: bcm2835: fix deferred probing
  mmc: Remove dev_err() usage after platform_get_irq()
  mmc: tmio: move tmio_mmc_set_clock() to platform hook
  mmc: tmio: replace tmio_mmc_clk_stop() calls with tmio_mmc_set_clock()
  mmc: meson-gx: remove redundant mmc_request_done() call from irq context
  mmc: meson-gx: remove useless lock
  USB: dwc3: qcom: fix NULL-deref on suspend
  usb: dwc3: qcom: Add helper functions to enable,disable wake irqs
  irqchip/mips-gic: Use raw spinlock for gic_lock
  irqchip/mips-gic: Get rid of the reliance on irq_cpu_online()
  x86/topology: Fix erroneous smp_num_siblings on Intel Hybrid platforms
  powerpc/64s/radix: Fix soft dirty tracking
  powerpc: Move page table dump files in a dedicated subdirectory
  powerpc/mm: dump block address translation on book3s/32
  powerpc/mm: dump segment registers on book3s/32
  powerpc/mm: Move pgtable_t into platform headers
  powerpc/mm: move platform specific mmu-xxx.h in platform directories
  iio: addac: stx104: Fix race condition when converting analog-to-digital
  iio: addac: stx104: Fix race condition for stx104_write_raw()
  iio: adc: stx104: Implement and utilize register structures
  iio: adc: stx104: Utilize iomap interface
  iio: add addac subdirectory
  IMA: allow/fix UML builds
  drm/amdgpu: Fix potential fence use-after-free v2
  Bluetooth: L2CAP: Fix use-after-free
  pcmcia: rsrc_nonstatic: Fix memory leak in nonstatic_release_resource_db()
  gfs2: Fix possible data races in gfs2_show_options()
  media: platform: mediatek: vpu: fix NULL ptr dereference
  media: v4l2-mem2mem: add lock to protect parameter num_rdy
  FS: JFS: Check for read-only mounted filesystem in txBegin
  FS: JFS: Fix null-ptr-deref Read in txBegin
  MIPS: dec: prom: Address -Warray-bounds warning
  fs: jfs: Fix UBSAN: array-index-out-of-bounds in dbAllocDmapLev
  udf: Fix uninitialized array access for some pathnames
  HID: add quirk for 03f0:464a HP Elite Presenter Mouse
  quota: fix warning in dqgrab()
  quota: Properly disable quotas when add_dquot_ref() fails
  ALSA: emu10k1: roll up loops in DSP setup code for Audigy
  drm/radeon: Fix integer overflow in radeon_cs_parser_init
  selftests: forwarding: tc_flower: Relax success criterion
  lib/mpi: Eliminate unused umul_ppmm definitions for MIPS
  Revert "posix-timers: Ensure timer ID search-loop limit is valid"
  UPSTREAM: media: usb: siano: Fix warning due to null work_func_t function pointer
  UPSTREAM: Bluetooth: L2CAP: Fix use-after-free in l2cap_sock_ready_cb
  UPSTREAM: net/sched: cls_route: No longer copy tcf_result on update to avoid use-after-free
  UPSTREAM: net/sched: cls_u32: No longer copy tcf_result on update to avoid use-after-free
  Linux 4.19.292
  sch_netem: fix issues in netem_change() vs get_dist_table()
  alpha: remove __init annotation from exported page_is_ram()
  scsi: core: Fix possible memory leak if device_add() fails
  scsi: snic: Fix possible memory leak if device_add() fails
  scsi: 53c700: Check that command slot is not NULL
  scsi: storvsc: Fix handling of virtual Fibre Channel timeouts
  scsi: core: Fix legacy /proc parsing buffer overflow
  netfilter: nf_tables: report use refcount overflow
  netfilter: nf_tables: bogus EBUSY when deleting flowtable after flush
  btrfs: don't stop integrity writeback too early
  ibmvnic: Handle DMA unmapping of login buffs in release functions
  wifi: cfg80211: fix sband iftype data lookup for AP_VLAN
  IB/hfi1: Fix possible panic during hotplug remove
  drivers: net: prevent tun_build_skb() to exceed the packet size limit
  dccp: fix data-race around dp->dccps_mss_cache
  bonding: Fix incorrect deletion of ETH_P_8021AD protocol vid from slaves
  net/packet: annotate data-races around tp->status
  mISDN: Update parameter type of dsp_cmx_send()
  drm/nouveau/disp: Revert a NULL check inside nouveau_connector_get_modes
  x86: Move gds_ucode_mitigated() declaration to header
  x86/mm: Fix VDSO and VVAR placement on 5-level paging machines
  x86/cpu/amd: Enable Zenbleed fix for AMD Custom APU 0405
  usb: dwc3: Properly handle processing of pending events
  usb-storage: alauda: Fix uninit-value in alauda_check_media()
  binder: fix memory leak in binder_init()
  iio: cros_ec: Fix the allocation size for cros_ec_command
  nilfs2: fix use-after-free of nilfs_root in dirtying inodes via iput
  radix tree test suite: fix incorrect allocation size for pthreads
  drm/nouveau/gr: enable memory loads on helper invocation on all channels
  dmaengine: pl330: Return DMA_PAUSED when transaction is paused
  ipv6: adjust ndisc_is_useropt() to also return true for PIO
  mmc: moxart: read scr register without changing byte order
  sparc: fix up arch_cpu_finalize_init() build breakage.
  UPSTREAM: net/sched: cls_fw: Fix improper refcount update leads to use-after-free
  Linux 4.19.291
  drm/edid: fix objtool warning in drm_cvt_modes()
  arm64: dts: stratix10: fix incorrect I2C property for SCL signal
  drivers core: Use sysfs_emit and sysfs_emit_at for show(device *...) functions
  ARM: dts: nxp/imx6sll: fix wrong property name in usbphy node
  ARM: dts: imx6sll: fixup of operating points
  ARM: dts: imx: add usb alias
  ARM: dts: imx6sll: Make ssi node name same as other platforms
  PM: sleep: wakeirq: fix wake irq arming
  PM / wakeirq: support enabling wake-up irq after runtime_suspend called
  powerpc/mm/altmap: Fix altmap boundary check
  mtd: rawnand: omap_elm: Fix incorrect type in assignment
  test_firmware: return ENOMEM instead of ENOSPC on failed memory allocation
  test_firmware: fix a memory leak with reqs buffer
  ext2: Drop fragment support
  net: usbnet: Fix WARNING in usbnet_start_xmit/usb_submit_urb
  Bluetooth: L2CAP: Fix use-after-free in l2cap_sock_ready_cb
  fs/sysv: Null check to prevent null-ptr-deref bug
  USB: zaurus: Add ID for A-300/B-500/C-700
  libceph: fix potential hang in ceph_osdc_notify()
  scsi: zfcp: Defer fc_rport blocking until after ADISC response
  tcp_metrics: fix data-race in tcpm_suck_dst() vs fastopen
  tcp_metrics: annotate data-races around tm->tcpm_net
  tcp_metrics: annotate data-races around tm->tcpm_vals[]
  tcp_metrics: annotate data-races around tm->tcpm_lock
  tcp_metrics: annotate data-races around tm->tcpm_stamp
  tcp_metrics: fix addr_same() helper
  ip6mr: Fix skb_under_panic in ip6mr_cache_report()
  net/sched: cls_route: No longer copy tcf_result on update to avoid use-after-free
  net/sched: cls_u32: No longer copy tcf_result on update to avoid use-after-free
  net: add missing data-race annotation for sk_ll_usec
  net: add missing data-race annotations around sk->sk_peek_off
  net: sched: cls_u32: Fix match key mis-addressing
  perf test uprobe_from_different_cu: Skip if there is no gcc
  net/mlx5e: fix return value check in mlx5e_ipsec_remove_trailer()
  KVM: s390: fix sthyi error handling
  word-at-a-time: use the same return type for has_zero regardless of endianness
  loop: Select I/O scheduler 'none' from inside add_disk()
  perf: Fix function pointer case
  net/sched: cls_u32: Fix reference counter leak leading to overflow
  ASoC: cs42l51: fix driver to properly autoload with automatic module loading
  net/sched: sch_qfq: account for stab overhead in qfq_enqueue
  net/sched: cls_fw: Fix improper refcount update leads to use-after-free
  drm/client: Fix memory leak in drm_client_target_cloned
  dm cache policy smq: ensure IO doesn't prevent cleaner policy progress
  ASoC: wm8904: Fill the cache for WM8904_ADC_TEST_0 register
  s390/dasd: fix hanging device after quiesce/resume
  virtio-net: fix race between set queues and probe
  serial: 8250_dw: Preserve original value of DLF register
  serial: 8250_dw: split Synopsys DesignWare 8250 common functions
  irq-bcm6345-l1: Do not assume a fixed block to cpu mapping
  tpm_tis: Explicitly check for error code
  btrfs: check for commit error at btrfs_attach_transaction_barrier()
  hwmon: (nct7802) Fix for temp6 (PECI1) processed even if PECI1 disabled
  staging: ks7010: potential buffer overflow in ks_wlan_set_encode_ext()
  Documentation: security-bugs.rst: clarify CVE handling
  Documentation: security-bugs.rst: update preferences when dealing with the linux-distros group
  usb: xhci-mtk: set the dma max_seg_size
  USB: quirks: add quirk for Focusrite Scarlett
  usb: ohci-at91: Fix the unhandle interrupt when resume
  usb: dwc3: don't reset device side if dwc3 was configured as host-only
  usb: dwc3: pci: skip BYT GPIO lookup table for hardwired phy
  Revert "usb: dwc3: core: Enable AutoRetry feature in the controller"
  can: gs_usb: gs_can_close(): add missing set of CAN state to CAN_STATE_STOPPED
  USB: serial: simple: sort driver entries
  USB: serial: simple: add Kaufmann RKS+CAN VCP
  USB: serial: option: add Quectel EC200A module support
  USB: serial: option: support Quectel EM060K_128
  tracing: Fix warning in trace_buffered_event_disable()
  ring-buffer: Fix wrong stat of cpu_buffer->read
  ata: pata_ns87415: mark ns87560_tf_read static
  dm raid: fix missing reconfig_mutex unlock in raid_ctr() error paths
  block: Fix a source code comment in include/uapi/linux/blkzoned.h
  ASoC: fsl_spdif: Silence output on stop
  drm/msm: Fix IS_ERR_OR_NULL() vs NULL check in a5xx_submit_in_rb()
  RDMA/mlx4: Make check for invalid flags stricter
  benet: fix return value check in be_lancer_xmit_workarounds()
  net/sched: mqprio: Add length check for TCA_MQPRIO_{MAX/MIN}_RATE64
  net/sched: mqprio: add extack to mqprio_parse_nlattr()
  net/sched: mqprio: refactor nlattr parsing to a separate function
  platform/x86: msi-laptop: Fix rfkill out-of-sync on MSI Wind U100
  team: reset team's flags when down link is P2P device
  bonding: reset bond's flags when down link is P2P device
  tcp: Reduce chance of collisions in inet6_hashfn().
  ipv6 addrconf: fix bug where deleting a mngtmpaddr can create a new temporary address
  ethernet: atheros: fix return value check in atl1e_tso_csum()
  phy: hisilicon: Fix an out of bounds check in hisi_inno_phy_probe()
  i40e: Fix an NULL vs IS_ERR() bug for debugfs_create_dir()
  ext4: fix to check return value of freeze_bdev() in ext4_shutdown()
  scsi: qla2xxx: Array index may go out of bound
  scsi: qla2xxx: Fix inconsistent format argument type in qla_os.c
  ftrace: Fix possible warning on checking all pages used in ftrace_process_locs()
  ftrace: Store the order of pages allocated in ftrace_page
  ftrace: Check if pages were allocated before calling free_pages()
  ftrace: Add information on number of page groups allocated
  fs: dlm: interrupt posix locks only when process is killed
  dlm: rearrange async condition return
  dlm: cleanup plock_op vs plock_xop
  PCI/ASPM: Avoid link retraining race
  PCI/ASPM: Factor out pcie_wait_for_retrain()
  PCI/ASPM: Return 0 or -ETIMEDOUT from pcie_retrain_link()
  PCI: Rework pcie_retrain_link() wait loop
  ext4: Fix reusing stale buffer heads from last failed mounting
  ext4: rename journal_dev to s_journal_dev inside ext4_sb_info
  btrfs: fix extent buffer leak after tree mod log failure at split_node()
  bcache: Fix __bch_btree_node_alloc to make the failure behavior consistent
  bcache: remove 'int n' from parameter list of bch_bucket_alloc_set()
  bcache: use MAX_CACHES_PER_SET instead of magic number 8 in __bch_bucket_alloc_set
  gpio: tps68470: Make tps68470_gpio_output() always set the initial value
  tracing/histograms: Return an error if we fail to add histogram to hist_vars list
  tcp: annotate data-races around fastopenq.max_qlen
  tcp: annotate data-races around tp->notsent_lowat
  tcp: annotate data-races around rskq_defer_accept
  tcp: annotate data-races around tp->linger2
  net: Replace the limit of TCP_LINGER2 with TCP_FIN_TIMEOUT_MAX
  netfilter: nf_tables: can't schedule in nft_chain_validate
  netfilter: nf_tables: fix spurious set element insertion failure
  llc: Don't drop packet from non-root netns.
  fbdev: au1200fb: Fix missing IRQ check in au1200fb_drv_probe
  Revert "tcp: avoid the lookup process failing to get sk in ehash table"
  net:ipv6: check return value of pskb_trim()
  net: ethernet: ti: cpsw_ale: Fix cpsw_ale_get_field()/cpsw_ale_set_field()
  pinctrl: amd: Use amd_pinconf_set() for all config options
  fbdev: imxfb: warn about invalid left/right margin
  spi: bcm63xx: fix max prepend length
  igb: Fix igb_down hung on surprise removal
  wifi: iwlwifi: mvm: avoid baid size integer overflow
  wifi: wext-core: Fix -Wstringop-overflow warning in ioctl_standard_iw_point()
  bpf: Address KCSAN report on bpf_lru_list
  sched/fair: Don't balance task to its current running CPU
  posix-timers: Ensure timer ID search-loop limit is valid
  md/raid10: prevent soft lockup while flush writes
  md: fix data corruption for raid456 when reshape restart while grow up
  nbd: Add the maximum limit of allocated index in nbd_dev_add
  debugobjects: Recheck debug_objects_enabled before reporting
  ext4: correct inline offset when handling xattrs in inode body
  can: bcm: Fix UAF in bcm_proc_show()
  fuse: revalidate: don't invalidate if interrupted
  perf probe: Add test for regression introduced by switch to die_get_decl_file()
  tracing/histograms: Add histograms to hist_vars if they have referenced variables
  drm/atomic: Fix potential use-after-free in nonblocking commits
  scsi: qla2xxx: Pointer may be dereferenced
  scsi: qla2xxx: Check valid rport returned by fc_bsg_to_rport()
  scsi: qla2xxx: Fix potential NULL pointer dereference
  scsi: qla2xxx: Wait for io return on terminate rport
  xtensa: ISS: fix call to split_if_spec
  ring-buffer: Fix deadloop issue on reading trace_pipe
  tty: serial: samsung_tty: Fix a memory leak in s3c24xx_serial_getclk() when iterating clk
  tty: serial: samsung_tty: Fix a memory leak in s3c24xx_serial_getclk() in case of error
  Revert "8250: add support for ASIX devices with a FIFO bug"
  meson saradc: fix clock divider mask length
  ceph: don't let check_caps skip sending responses for revoke msgs
  hwrng: imx-rngc - fix the timeout for init and self check
  serial: atmel: don't enable IRQs prematurely
  fs: dlm: return positive pid value for F_GETLK
  md/raid0: add discard support for the 'original' layout
  misc: pci_endpoint_test: Re-init completion for every test
  misc: pci_endpoint_test: Free IRQs before removing the device
  PCI: rockchip: Use u32 variable to access 32-bit registers
  PCI: rockchip: Fix legacy IRQ generation for RK3399 PCIe endpoint core
  PCI: rockchip: Add poll and timeout to wait for PHY PLLs to be locked
  PCI: rockchip: Write PCI Device ID to correct register
  PCI: rockchip: Assert PCI Configuration Enable bit after probe
  PCI: qcom: Disable write access to read only registers for IP v2.3.3
  PCI: Add function 1 DMA alias quirk for Marvell 88SE9235
  PCI/PM: Avoid putting EloPOS E2/S2/H2 PCIe Ports in D3cold
  jfs: jfs_dmap: Validate db_l2nbperpage while mounting
  ext4: only update i_reserved_data_blocks on successful block allocation
  ext4: fix wrong unit use in ext4_mb_clear_bb
  perf intel-pt: Fix CYC timestamps after standalone CBR
  SUNRPC: Fix UAF in svc_tcp_listen_data_ready()
  net: bcmgenet: Ensure MDIO unregistration has clocks enabled
  tpm: tpm_vtpm_proxy: fix a race condition in /dev/vtpmx creation
  pinctrl: amd: Only use special debounce behavior for GPIO 0
  pinctrl: amd: Detect internal GPIO0 debounce handling
  pinctrl: amd: Fix mistake in handling clearing pins at startup
  net/sched: make psched_mtu() RTNL-less safe
  wifi: airo: avoid uninitialized warning in airo_get_rate()
  ipv6/addrconf: fix a potential refcount underflow for idev
  NTB: ntb_tool: Add check for devm_kcalloc
  NTB: ntb_transport: fix possible memory leak while device_register() fails
  ntb: intel: Fix error handling in intel_ntb_pci_driver_init()
  NTB: amd: Fix error handling in amd_ntb_pci_driver_init()
  ntb: idt: Fix error handling in idt_pci_driver_init()
  udp6: fix udp6_ehashfn() typo
  icmp6: Fix null-ptr-deref of ip6_null_entry->rt6i_idev in icmp6_dev().
  vrf: Increment Icmp6InMsgs on the original netdev
  net: mvneta: fix txq_map in case of txq_number==1
  workqueue: clean up WORK_* constant types, clarify masking
  net: lan743x: Don't sleep in atomic context
  netfilter: nf_tables: prevent OOB access in nft_byteorder_eval
  netfilter: conntrack: Avoid nf_ct_helper_hash uses after free
  netfilter: nf_tables: fix scheduling-while-atomic splat
  netfilter: nf_tables: unbind non-anonymous set if rule construction fails
  netfilter: nf_tables: reject unbound anonymous set before commit phase
  netfilter: nf_tables: add NFT_TRANS_PREPARE_ERROR to deal with bound set/chain
  netfilter: nf_tables: incorrect error path handling with NFT_MSG_NEWRULE
  netfilter: nf_tables: use net_generic infra for transaction data
  netfilter: add helper function to set up the nfnetlink header and use it
  netfilter: nftables: add helper function to set the base sequence number
  netfilter: nf_tables: add rescheduling points during loop detection walks
  netfilter: nf_tables: fix nat hook table deletion
  spi: spi-fsl-spi: allow changing bits_per_word while CS is still active
  spi: spi-fsl-spi: relax message sanity checking a little
  spi: spi-fsl-spi: remove always-true conditional in fsl_spi_do_one_msg
  ARM: orion5x: fix d2net gpio initialization
  btrfs: fix race when deleting quota root from the dirty cow roots list
  jffs2: reduce stack usage in jffs2_build_xattr_subsystem()
  integrity: Fix possible multiple allocation in integrity_inode_get()
  bcache: Remove unnecessary NULL point check in node allocations
  mmc: core: disable TRIM on Micron MTFC4GACAJCN-1M
  mmc: core: disable TRIM on Kingston EMMC04G-M627
  NFSD: add encoding of op_recall flag for write delegation
  ALSA: jack: Fix mutex call in snd_jack_report()
  i2c: xiic: Don't try to handle more interrupt events after error
  i2c: xiic: Defer xiic_wakeup() and __xiic_start_xfer() in xiic_process()
  sh: dma: Fix DMA channel offset calculation
  net/sched: act_pedit: Add size check for TCA_PEDIT_PARMS_EX
  tcp: annotate data races in __tcp_oow_rate_limited()
  net: bridge: keep ports without IFF_UNICAST_FLT in BR_PROMISC mode
  powerpc: allow PPC_EARLY_DEBUG_CPM only when SERIAL_CPM=y
  f2fs: fix error path handling in truncate_dnode()
  mailbox: ti-msgmgr: Fill non-message tx data fields with 0x0
  spi: bcm-qspi: return error if neither hif_mspi nor mspi is available
  Add MODULE_FIRMWARE() for FIRMWARE_TG357766.
  sctp: fix potential deadlock on &net->sctp.addr_wq_lock
  rtc: st-lpc: Release some resources in st_rtc_probe() in case of error
  mfd: stmpe: Only disable the regulators if they are enabled
  mfd: intel-lpss: Add missing check for platform_get_resource
  KVM: s390: fix KVM_S390_GET_CMMA_BITS for GFNs in memslot holes
  mfd: rt5033: Drop rt5033-battery sub-device
  usb: phy: phy-tahvo: fix memory leak in tahvo_usb_probe()
  extcon: Fix kernel doc of property capability fields to avoid warnings
  extcon: Fix kernel doc of property fields to avoid warnings
  media: usb: siano: Fix warning due to null work_func_t function pointer
  media: videodev2.h: Fix struct v4l2_input tuner index comment
  media: usb: Check az6007_read() return value
  sh: j2: Use ioremap() to translate device tree address into kernel memory
  w1: fix loop in w1_fini()
  block: change all __u32 annotations to __be32 in affs_hardblocks.h
  USB: serial: option: add LARA-R6 01B PIDs
  ARC: define ASM_NL and __ALIGN(_STR) outside #ifdef __ASSEMBLY__ guard
  ARCv2: entry: rewrite to enable use of double load/stores LDD/STD
  ARCv2: entry: avoid a branch
  ARCv2: entry: push out the Z flag unclobber from common EXCEPTION_PROLOGUE
  ARCv2: entry: comments about hardware auto-save on taken interrupts
  modpost: fix section mismatch message for R_ARM_{PC24,CALL,JUMP24}
  modpost: fix section mismatch message for R_ARM_ABS32
  crypto: nx - fix build warnings when DEBUG_FS is not enabled
  hwrng: virtio - Fix race on data_avail and actual data
  hwrng: virtio - always add a pending request
  hwrng: virtio - don't waste entropy
  hwrng: virtio - don't wait on cleanup
  hwrng: virtio - add an internal buffer
  pinctrl: at91-pio4: check return value of devm_kasprintf()
  perf dwarf-aux: Fix off-by-one in die_get_varname()
  pinctrl: cherryview: Return correct value if pin in push-pull mode
  PCI: Add pci_clear_master() stub for non-CONFIG_PCI
  scsi: 3w-xxxx: Add error handling for initialization failure in tw_probe()
  ALSA: ac97: Fix possible NULL dereference in snd_ac97_mixer
  drm/radeon: fix possible division-by-zero errors
  fbdev: omapfb: lcd_mipid: Fix an error handling path in mipid_spi_probe()
  arm64: dts: renesas: ulcb-kf: Remove flow control for SCIF1
  IB/hfi1: Fix sdma.h tx->num_descs off-by-one errors
  soc/fsl/qe: fix usb.c build errors
  ASoC: es8316: Increment max value for ALC Capture Target Volume control
  ARM: ep93xx: fix missing-prototype warnings
  drm/panel: simple: fix active size for Ampire AM-480272H3TMQW-T01H
  Input: adxl34x - do not hardcode interrupt trigger type
  ARM: dts: BCM5301X: Drop "clock-names" from the SPI node
  Input: drv260x - sleep between polling GO bit
  radeon: avoid double free in ci_dpm_init()
  netlink: Add __sock_i_ino() for __netlink_diag_dump().
  ipvlan: Fix return value of ipvlan_queue_xmit()
  netfilter: nf_conntrack_sip: fix the ct_sip_parse_numerical_param() return value.
  lib/ts_bm: reset initial match offset for every block of text
  gtp: Fix use-after-free in __gtp_encap_destroy().
  netlink: do not hard code device address lenth in fdb dumps
  netlink: fix potential deadlock in netlink_set_err()
  wifi: ath9k: convert msecs to jiffies where needed
  wifi: ath9k: Fix possible stall on ath9k_txq_list_has_key()
  memstick r592: make memstick_debug_get_tpc_name() static
  kexec: fix a memory leak in crash_shrink_memory()
  watchdog/perf: more properly prevent false positives with turbo modes
  watchdog/perf: define dummy watchdog_update_hrtimer_threshold() on correct config
  wifi: rsi: Do not set MMC_PM_KEEP_POWER in shutdown
  wifi: ath9k: don't allow to overwrite ENDPOINT0 attributes
  wifi: ray_cs: Fix an error handling path in ray_probe()
  wifi: ray_cs: Drop useless status variable in parse_addr()
  wifi: ray_cs: Utilize strnlen() in parse_addr()
  wifi: wl3501_cs: Fix an error handling path in wl3501_probe()
  wl3501_cs: use eth_hw_addr_set()
  net: create netdev->dev_addr assignment helpers
  wl3501_cs: Fix misspelling and provide missing documentation
  wl3501_cs: Remove unnecessary NULL check
  wl3501_cs: Fix a bunch of formatting issues related to function docs
  wifi: atmel: Fix an error handling path in atmel_probe()
  wifi: orinoco: Fix an error handling path in orinoco_cs_probe()
  wifi: orinoco: Fix an error handling path in spectrum_cs_probe()
  nfc: llcp: fix possible use of uninitialized variable in nfc_llcp_send_connect()
  nfc: constify several pointers to u8, char and sk_buff
  wifi: mwifiex: Fix the size of a memory allocation in mwifiex_ret_802_11_scan()
  samples/bpf: Fix buffer overflow in tcp_basertt
  wifi: ath9k: avoid referencing uninit memory in ath9k_wmi_ctrl_rx
  wifi: ath9k: fix AR9003 mac hardware hang check register offset calculation
  evm: Complete description of evm_inode_setattr()
  ARM: 9303/1: kprobes: avoid missing-declaration warnings
  PM: domains: fix integer overflow issues in genpd_parse_state()
  clocksource/drivers/cadence-ttc: Fix memory leak in ttc_timer_probe
  clocksource/drivers/cadence-ttc: Use ttc driver as platform driver
  clocksource/drivers: Unify the names to timer-* format
  irqchip/jcore-aic: Fix missing allocation of IRQ descriptors
  irqchip/jcore-aic: Kill use of irq_create_strict_mappings()
  md/raid10: fix io loss while replacement replace rdev
  md/raid10: fix wrong setting of max_corr_read_errors
  md/raid10: fix overflow of md/safe_mode_delay
  md/raid10: check slab-out-of-bounds in md_bitmap_get_counter
  treewide: Remove uninitialized_var() usage
  drm/amdgpu: Validate VM ioctl flags.
  scripts/tags.sh: Resolve gtags empty index generation
  drm/edid: Fix uninitialized variable in drm_cvt_modes()
  fbdev: imsttfb: Fix use after free bug in imsttfb_probe
  video: imsttfb: check for ioremap() failures
  x86/smp: Use dedicated cache-line for mwait_play_dead()
  gfs2: Don't deref jdesc in evict
  Linux 4.19.290
  x86: fix backwards merge of GDS/SRSO bit
  xen/netback: Fix buffer overrun triggered by unusual packet
  Documentation/x86: Fix backwards on/off logic about YMM support
  x86/xen: Fix secondary processors' FPU initialization
  KVM: Add GDS_NO support to KVM
  x86/speculation: Add Kconfig option for GDS
  x86/speculation: Add force option to GDS mitigation
  x86/speculation: Add Gather Data Sampling mitigation
  x86/fpu: Move FPU initialization into arch_cpu_finalize_init()
  x86/fpu: Mark init functions __init
  x86/fpu: Remove cpuinfo argument from init functions
  init, x86: Move mem_encrypt_init() into arch_cpu_finalize_init()
  init: Invoke arch_cpu_finalize_init() earlier
  init: Remove check_bugs() leftovers
  um/cpu: Switch to arch_cpu_finalize_init()
  sparc/cpu: Switch to arch_cpu_finalize_init()
  sh/cpu: Switch to arch_cpu_finalize_init()
  mips/cpu: Switch to arch_cpu_finalize_init()
  m68k/cpu: Switch to arch_cpu_finalize_init()
  ia64/cpu: Switch to arch_cpu_finalize_init()
  ARM: cpu: Switch to arch_cpu_finalize_init()
  x86/cpu: Switch to arch_cpu_finalize_init()
  init: Provide arch_cpu_finalize_init()

 Conflicts:
	drivers/mmc/core/block.c
	drivers/mmc/host/sdhci-msm.c
	drivers/usb/dwc3/core.c
	drivers/usb/dwc3/gadget.c

Change-Id: Id2f4d5c8067f8e5eda39c0eaa5e59d54a394b4c7
2023-09-19 18:11:03 +03:00

3001 lines
75 KiB
C

/*
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
* Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
*
* Support four policies per VMA and per process:
*
* The VMA policy has priority over the process policy for a page fault.
*
* interleave Allocate memory interleaved over a set of nodes,
* with normal fallback if it fails.
* For VMA based allocations this interleaves based on the
* offset into the backing object or offset into the mapping
* for anonymous memory. For process policy a process counter
* is used.
*
* bind Only allocate memory on a specific set of nodes,
* no fallback.
* FIXME: memory is allocated starting with the first node
* to the last. It would be better if bind would truly restrict
* the allocation to memory nodes instead
*
* preferred Try a specific node first before normal fallback.
* As a special case NUMA_NO_NODE here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
*
* default Allocate on the local node first, or when on a VMA
* use the process policy. This is what Linux always did
* in a NUMA aware kernel and still does by, ahem, default.
*
* The process policy is applied for most non interrupt memory allocations
* in that process' context. Interrupts ignore the policies and always
* try to allocate on the local CPU. The VMA policy is only applied for memory
* allocations for a VMA in the VM.
*
* Currently there are a few corner cases in swapping where the policy
* is not applied, but the majority should be handled. When process policy
* is used it is not remembered over swap outs/swap ins.
*
* Only the highest zone in the zone hierarchy gets policied. Allocations
* requesting a lower zone just use default policy. This implies that
* on systems with highmem, kernel lowmem allocations don't get policied.
* Same with GFP_DMA allocations.
*
* For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
* all users and remembered even when nobody has memory mapped.
*/
/* Notebook:
fix mmap readahead to honour policy and enable policy for any page cache
object
statistics for bigpages
global policy for page cache? currently it uses process policy. Requires
first item above.
handle mremap for shared memory (currently ignored for the policy)
grows down?
make bind policy root only? It can trigger oom much faster and the
kernel is not always grateful with that.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
#include <asm/tlbflush.h>
#include <linux/uaccess.h>
#include "internal.h"
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
/* Slab cache backing every struct mempolicy allocated by mpol_new(). */
static struct kmem_cache *policy_cache;
/* NOTE(review): presumably the slab cache for shared-policy tree nodes - confirm at its users. */
static struct kmem_cache *sn_cache;
/* Highest zone. A specific allocation for a zone below that is not
policied. */
enum zone_type policy_zone = 0;
/*
* run-time system-wide default policy => local allocation
*/
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_PREFERRED,
.flags = MPOL_F_LOCAL,
};
/*
* Per-node policies, indexed by node id. get_task_policy() hands out the
* entry for the current node to tasks that have no policy of their own, and
* treats an entry whose mode is still zero as not yet initialised.
*/
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
 * Return the policy that applies to task @p.
 *
 * The task's own policy wins when it has one. Otherwise the per-node
 * preferred policy of the current node is used, provided it has been
 * initialised; failing that, the system-wide default policy is returned.
 * Never returns NULL.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
	struct mempolicy *task_pol = p->mempolicy;
	struct mempolicy *node_pol;
	int nid;

	if (task_pol)
		return task_pol;

	nid = numa_node_id();
	if (nid == NUMA_NO_NODE)
		return &default_policy;

	/* preferred_node_policy is not initialised early in boot */
	node_pol = &preferred_node_policy[nid];
	if (node_pol->mode)
		return node_pol;

	return &default_policy;
}
/*
* Per-mode operations, indexed by policy mode. This is only the tentative
* declaration; the table is filled in further down, after the handlers it
* refers to have been defined.
*/
static const struct mempolicy_operations {
/* Install @nodes (or local allocation when @nodes is NULL) into a new policy. */
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/* Adjust an existing policy to a changed set of allowed nodes. */
void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];
/*
 * Non-zero when @pol carries any of the user-visible mode flags
 * (MPOL_MODE_FLAGS), in which case the nodemask passed by the user is
 * kept in pol->w.user_nodemask.
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
	int user_mode_flags = pol->flags & MPOL_MODE_FLAGS;

	return user_mode_flags;
}
/*
 * Compute in @ret the nodemask obtained by interpreting @orig relative to
 * @rel: @orig is first folded to the number of nodes set in @rel, then
 * mapped onto the nodes of @rel.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t folded;
	int nr_rel = nodes_weight(*rel);

	nodes_fold(folded, *orig, nr_rel);
	nodes_onto(*ret, folded, *rel);
}
/*
 * ->create handler for MPOL_INTERLEAVE: store @nodes in the policy.
 * Returns 0 on success, -EINVAL when @nodes is empty.
 */
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes_empty(*nodes)) {
		pol->v.nodes = *nodes;
		return 0;
	}

	return -EINVAL;
}
/*
 * ->create handler for MPOL_PREFERRED.
 *
 * A NULL @nodes selects local allocation (MPOL_F_LOCAL); otherwise the
 * first node of @nodes becomes the preferred node. Returns 0 on success,
 * -EINVAL when a nodemask was given but it is empty.
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes) {
		/* local allocation */
		pol->flags |= MPOL_F_LOCAL;
		return 0;
	}

	/* no allowed nodes */
	if (nodes_empty(*nodes))
		return -EINVAL;

	pol->v.preferred_node = first_node(*nodes);
	return 0;
}
/*
 * ->create handler for MPOL_BIND: store @nodes in the policy.
 * Returns 0 on success, -EINVAL when @nodes is empty.
 */
static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes_empty(*nodes)) {
		pol->v.nodes = *nodes;
		return 0;
	}

	return -EINVAL;
}
/*
 * mpol_set_nodemask - finish a policy created by mpol_new()
 * @pol:   policy to complete; NULL stands for MPOL_DEFAULT and is a no-op
 * @nodes: nodemask requested by the caller (must not be NULL)
 * @nsc:   scratch space for the two intermediate masks
 *
 * mpol_new() has already validated @nodes against the mode and flags, so
 * the only special case left is MPOL_PREFERRED with an empty mask, which
 * means explicit local allocation. In every other case the requested mask
 * is either remapped relative to, or intersected with, the nodes that are
 * both allowed by the current cpuset and have memory, and the result is
 * handed to the mode's ->create handler.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy. May also be called holding the mmap_semaphore for write.
 *
 * Returns 0 on success or the error reported by ->create.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
			     const nodemask_t *nodes,
			     struct nodemask_scratch *nsc)
{
	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (!pol)
		return 0;

	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);

	/* explicit local allocation */
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		return mpol_ops[pol->mode].create(pol, NULL);

	if (pol->flags & MPOL_F_RELATIVE_NODES)
		mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
	else
		nodes_and(nsc->mask2, *nodes, nsc->mask1);

	/* Remember what is needed to rebind the policy later on. */
	if (mpol_store_user_nodemask(pol))
		pol->w.user_nodemask = *nodes;
	else
		pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;

	return mpol_ops[pol->mode].create(pol, &nsc->mask2);
}
/*
 * mpol_new - allocate a policy and validate mode/flags/nodes
 * @mode:  policy mode (MPOL_*), without mode flags
 * @flags: mode flags
 * @nodes: nodemask requested by the caller
 *
 * Only performs checks and simple initialisation; the caller must invoke
 * mpol_set_nodemask() afterwards to install the nodes.
 *
 * Returns NULL for MPOL_DEFAULT, a new policy with one reference on
 * success, or ERR_PTR(-EINVAL / -ENOMEM) on failure. MPOL_LOCAL is stored
 * as MPOL_PREFERRED.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	const bool node_flags =
		flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	switch (mode) {
	case MPOL_PREFERRED:
		if (nodes_empty(*nodes) && node_flags)
			return ERR_PTR(-EINVAL);
		break;
	case MPOL_LOCAL:
		if (!nodes_empty(*nodes) || node_flags)
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
		break;
	default:
		if (nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		break;
	}

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);

	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}
/*
 * Slow path of a mpol destructor: drop one reference and free the policy
 * back to policy_cache once the last reference is gone.
 */
void __mpol_put(struct mempolicy *p)
{
	if (atomic_dec_and_test(&p->refcnt))
		kmem_cache_free(policy_cache, p);
}
/* ->rebind handler for MPOL_DEFAULT: intentionally does nothing. */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}
/*
 * ->rebind handler for the nodemask based modes (interleave and bind).
 *
 * With MPOL_F_STATIC_NODES the user's mask is intersected with @nodes, with
 * MPOL_F_RELATIVE_NODES it is re-evaluated relative to @nodes, and
 * otherwise the current nodes are remapped from the previously recorded
 * cpuset mask onto @nodes (which then becomes the recorded mask). Should
 * the result be empty, @nodes itself is used.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
	nodemask_t rebound;

	if (pol->flags & MPOL_F_STATIC_NODES) {
		nodes_and(rebound, pol->w.user_nodemask, *nodes);
	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
		mpol_relative_nodemask(&rebound, &pol->w.user_nodemask, nodes);
	} else {
		nodes_remap(rebound, pol->v.nodes, pol->w.cpuset_mems_allowed,
			    *nodes);
		pol->w.cpuset_mems_allowed = *nodes;
	}

	if (nodes_empty(rebound))
		pol->v.nodes = *nodes;
	else
		pol->v.nodes = rebound;
}
/*
 * ->rebind handler for MPOL_PREFERRED.
 *
 * - MPOL_F_STATIC_NODES: keep the user's first node when it is still in
 *   @nodes, otherwise fall back to local allocation.
 * - MPOL_F_RELATIVE_NODES: re-evaluate the user's mask relative to @nodes
 *   and prefer its first node.
 * - no user flags and not local: remap the preferred node from the
 *   previously recorded cpuset mask onto @nodes and record @nodes.
 * - local allocation without user flags: nothing to do.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
				  const nodemask_t *nodes)
{
	if (pol->flags & MPOL_F_STATIC_NODES) {
		int wanted = first_node(pol->w.user_nodemask);

		if (!node_isset(wanted, *nodes)) {
			pol->flags |= MPOL_F_LOCAL;
			return;
		}
		pol->v.preferred_node = wanted;
		pol->flags &= ~MPOL_F_LOCAL;
		return;
	}

	if (pol->flags & MPOL_F_RELATIVE_NODES) {
		nodemask_t relative;

		mpol_relative_nodemask(&relative, &pol->w.user_nodemask, nodes);
		pol->v.preferred_node = first_node(relative);
		return;
	}

	if (pol->flags & MPOL_F_LOCAL)
		return;

	pol->v.preferred_node = node_remap(pol->v.preferred_node,
					   pol->w.cpuset_mems_allowed,
					   *nodes);
	pol->w.cpuset_mems_allowed = *nodes;
}
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
* Per-vma policies are protected by mmap_sem. Allocations using per-task
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol || pol->mode == MPOL_LOCAL)
return;
if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
mpol_ops[pol->mode].rebind(pol, newmask);
}
/*
 * Rebind the task policy of @tsk to the nodes in @new. Thin wrapper
 * around mpol_rebind_policy() that only needs the task pointer.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
	struct mempolicy *task_pol = tsk->mempolicy;

	mpol_rebind_policy(task_pol, new);
}
/*
 * Rebind the policy of every vma in @mm to the nodes in @new.
 *
 * Each vma is bracketed with vm_write_begin()/vm_write_end() while its
 * policy is being changed.
 *
 * Call holding a reference to mm. Takes mm->mmap_sem during call.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
	struct vm_area_struct *cur;

	down_write(&mm->mmap_sem);

	cur = mm->mmap;
	while (cur) {
		vm_write_begin(cur);
		mpol_rebind_policy(cur->vm_policy, new);
		vm_write_end(cur);
		cur = cur->vm_next;
	}

	up_write(&mm->mmap_sem);
}
/*
* Operations table indexed by policy mode. MPOL_DEFAULT has no ->create
* handler (a default policy is represented by a NULL pointer and is never
* created); interleave and bind share the nodemask based rebind handler.
*/
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
[MPOL_DEFAULT] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
.create = mpol_new_interleave,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
.create = mpol_new_preferred,
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
.create = mpol_new_bind,
.rebind = mpol_rebind_nodemask,
},
};
/* Defined further down; has a migration and a !CONFIG_MIGRATION variant. */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
/* Private state handed to the queue_pages_*() page-table walk callbacks. */
struct queue_pages {
/* list that pages selected for migration are added to */
struct list_head *pagelist;
/* MPOL_MF_* flags, including the internal ones defined in this file */
unsigned long flags;
/* nodes that pages are checked against (see queue_pages_required()) */
nodemask_t *nmask;
/* vma seen by the previous test_walk call, used for the contiguity check */
struct vm_area_struct *prev;
};
/*
 * Decide whether @page has to be looked at by the queueing code.
 *
 * Without MPOL_MF_INVERT this is true when the page's node is in
 * qp->nmask; with MPOL_MF_INVERT set in qp->flags it is true when the
 * page's node is NOT in qp->nmask.
 */
static inline bool queue_pages_required(struct page *page,
					struct queue_pages *qp)
{
	bool want_in_mask = !(qp->flags & MPOL_MF_INVERT);

	return node_isset(page_to_nid(page), *qp->nmask) == want_in_mask;
}
/*
* Handle a huge pmd on behalf of queue_pages_pte_range(). @ptl is the lock
* the caller obtained from pmd_trans_huge_lock(); it is released on every
* path through this function.
*
* queue_pages_pmd() has four possible return values:
* 0 - pages are placed on the right node or queued successfully.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 2 - THP was split.
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
* existing page was already on a node that does not follow the
* policy.
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
int ret = 0;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
ret = -EIO;
goto unlock;
}
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
/* The lock is dropped before the split, so skip the unlock label. */
spin_unlock(ptl);
__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
ret = 2;
goto out;
}
/* Page does not match the node filter: nothing to queue, ret stays 0. */
if (!queue_pages_required(page, qp))
goto unlock;
flags = qp->flags;
/* go to thp migration */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
migrate_page_add(page, qp->pagelist, flags)) {
ret = 1;
goto unlock;
}
} else
ret = -EIO;
unlock:
spin_unlock(ptl);
out:
return ret;
}
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*
* This is the pmd_entry callback of the walk set up in queue_pages_range().
* A huge pmd is delegated to queue_pages_pmd(); only when that reports a
* split (2) does the walk continue at pte level.
*
* queue_pages_pte_range() has three possible return values:
* 0 - pages are placed on the right node or queued successfully.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* -EIO - only MPOL_MF_STRICT was specified and an existing page was already
* on a node that does not follow the policy.
*/
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
int ret;
bool has_unmovable = false;
pte_t *pte, *mapped_pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
if (ret != 2)
return ret;
}
/* THP was split, fall through to pte walk */
if (pmd_trans_unstable(pmd))
return 0;
/* mapped_pte keeps the start of the mapping for the final unmap. */
mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
/*
* vm_normal_page() filters out zero pages, but there might
* still be PageReserved pages to skip, perhaps in a VDSO.
*/
if (PageReserved(page))
continue;
if (!queue_pages_required(page, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
/* MPOL_MF_STRICT must be specified if we get here */
if (!vma_migratable(vma)) {
has_unmovable = true;
break;
}
/*
* Do not abort immediately since there may be
* temporary off LRU pages in the range. Still
* need migrate other LRU pages.
*/
if (migrate_page_add(page, qp->pagelist, flags))
has_unmovable = true;
} else
/* Misplaced page without a move flag: stop early, addr != end below. */
break;
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
if (has_unmovable)
return 1;
/* An early break leaves addr short of end, which is reported as -EIO. */
return addr != end ? -EIO : 0;
}
/*
* hugetlb_entry callback of the walk set up in queue_pages_range().
*
* Under the huge pte lock, a present huge page that matches the node filter
* is isolated onto qp->pagelist when MPOL_MF_MOVE_ALL is set, or when
* MPOL_MF_MOVE is set and the page is mapped exactly once and its pmd is
* not shared. Always returns 0. Without CONFIG_HUGETLB_PAGE reaching this
* function is a BUG().
*/
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
struct page *page;
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
if (!queue_pages_required(page, qp))
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
!hugetlb_pmd_shared(pte)))
isolate_huge_page(page, qp->pagelist);
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
return 0;
}
#ifdef CONFIG_NUMA_BALANCING
/*
* This is used to mark a range of virtual addresses to be inaccessible.
* These are later cleared by a NUMA hinting fault. Depending on these
* faults, pages may be migrated for better NUMA placement.
*
* This is assuming that NUMA faults are handled using PROT_NONE. If
* an architecture makes a different choice, it will need further
* changes to the core.
*
* Returns the value reported by change_protection(), which is also added
* to the NUMA_PTE_UPDATES event counter when non-zero.
*/
unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
/* NOTE(review): change_protection()'s result is held in an int here - confirm no truncation concern. */
int nr_updated;
nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
return nr_updated;
}
#else
/* Without CONFIG_NUMA_BALANCING nothing is marked and 0 is returned. */
static unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
{
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* test_walk callback of the walk set up in queue_pages_range(); called with
* walk->vma set to the vma under consideration.
*
* Returns -EFAULT when the range has a hole and MPOL_MF_DISCONTIG_OK is not
* set, 0 when the pages of this vma should be queued, and 1 otherwise.
* NOTE(review): presumably a return of 1 makes the page walker skip this
* vma - confirm against walk_page_range().
*/
static int queue_pages_test_walk(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
struct queue_pages *qp = walk->private;
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
/*
* Need check MPOL_MF_STRICT to return -EIO if possible
* regardless of vma_migratable
*/
if (!vma_migratable(vma) &&
!(flags & MPOL_MF_STRICT))
return 1;
/* Clamp [start, endvma) to the part of the range inside this vma. */
if (endvma > end)
endvma = end;
if (vma->vm_start > start)
start = vma->vm_start;
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
/* Last vma ends before the requested end: hole at the tail. */
if (!vma->vm_next && vma->vm_end < end)
return -EFAULT;
/* Gap between the previously visited vma and this one. */
if (qp->prev && qp->prev->vm_end < vma->vm_start)
return -EFAULT;
}
qp->prev = vma;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (!is_vm_hugetlb_page(vma) &&
(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
/* queue pages from current vma */
if (flags & MPOL_MF_VALID)
return 0;
return 1;
}
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * Pages found in [@start, @end) of @mm whose node matches the filter
 * described by @nodes and @flags are isolated and queued on @pagelist.
 * The filter state travels to the walk callbacks through a struct
 * queue_pages hung off the walk's private pointer.
 *
 * queue_pages_range() has three possible return values:
 * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
 *     specified.
 * 0 - queue pages successfully or no misplaced page.
 * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
 *         memory range specified by nodemask and maxnode points outside
 *         your accessible address space (-EFAULT)
 */
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
		  nodemask_t *nodes, unsigned long flags,
		  struct list_head *pagelist)
{
	struct queue_pages filter = {
		.prev = NULL,
		.nmask = nodes,
		.flags = flags,
		.pagelist = pagelist,
	};
	struct mm_walk walk = {
		.mm = mm,
		.private = &filter,
		.test_walk = queue_pages_test_walk,
		.pmd_entry = queue_pages_pte_range,
		.hugetlb_entry = queue_pages_hugetlb,
	};

	return walk_page_range(start, end, &walk);
}
/*
* Apply policy to a single VMA
* This must be called with the mmap_sem held for writing.
*
* The vma gets its own duplicate of @pol (the caller keeps its reference).
* If the vma has a set_policy operation it is called first and may veto
* the change. Returns 0 on success, otherwise the error from mpol_dup() or
* from set_policy; on failure the vma's policy is left untouched.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
{
int err;
struct mempolicy *old;
struct mempolicy *new;
pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
new = mpol_dup(pol);
if (IS_ERR(new))
return PTR_ERR(new);
vm_write_begin(vma);
if (vma->vm_ops && vma->vm_ops->set_policy) {
err = vma->vm_ops->set_policy(vma, new);
if (err)
goto err_out;
}
old = vma->vm_policy;
/*
* The speculative page fault handler accesses this field without
* holding the mmap_sem.
*/
WRITE_ONCE(vma->vm_policy, new);
vm_write_end(vma);
/* Drop the reference the vma held on its previous policy. */
mpol_put(old);
return 0;
err_out:
vm_write_end(vma);
/* The duplicate was never installed, so release it again. */
mpol_put(new);
return err;
}
/*
* Step 2: apply policy to a range and do splits.
*
* Installs @new_pol on every vma overlapping [@start, @end) of @mm. Each vma
* is first offered to vma_merge(); if no merge is possible it is split at
* the range boundaries so that only the covered part changes policy.
* Returns -EFAULT when @start is not inside a vma, otherwise 0 or the first
* error from split_vma()/vma_replace_policy().
*/
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
pgoff_t pgoff;
unsigned long vmstart;
unsigned long vmend;
vma = find_vma(mm, start);
if (!vma || vma->vm_start > start)
return -EFAULT;
prev = vma->vm_prev;
/* Range starts inside the first vma, so that vma is its own predecessor. */
if (start > vma->vm_start)
prev = vma;
for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
/* Already has an equal policy: nothing to do for this vma. */
if (mpol_equal(vma_policy(vma), new_pol))
continue;
/* Page offset corresponding to vmstart within this vma. */
pgoff = vma->vm_pgoff +
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
vma_get_anon_name(vma));
if (prev) {
/* Merged: continue with the merged vma and just set its policy. */
vma = prev;
goto replace;
}
if (vma->vm_start != vmstart) {
err = split_vma(vma->vm_mm, vma, vmstart, 1);
if (err)
goto out;
}
if (vma->vm_end != vmend) {
err = split_vma(vma->vm_mm, vma, vmend, 0);
if (err)
goto out;
}
replace:
err = vma_replace_policy(vma, new_pol);
if (err)
goto out;
}
out:
return err;
}
/*
* Set the process memory policy
*
* Creates a policy from @mode/@flags/@nodes and installs it as the policy
* of the current task, dropping the reference on the previous one. For
* MPOL_DEFAULT the new policy is NULL. Returns 0 on success, -ENOMEM when
* the scratch nodemasks cannot be allocated, or the error from mpol_new()
* or mpol_set_nodemask().
*/
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
NODEMASK_SCRATCH(scratch);
int ret;
if (!scratch)
return -ENOMEM;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto out;
}
/* mpol_set_nodemask() and the policy switch need the task's alloc_lock. */
task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
mpol_put(new);
goto out;
}
old = current->mempolicy;
current->mempolicy = new;
/* Reset the interleave cursor for a fresh interleave policy. */
if (new && new->mode == MPOL_INTERLEAVE)
current->il_prev = MAX_NUMNODES-1;
task_unlock(current);
mpol_put(old);
ret = 0;
out:
NODEMASK_SCRATCH_FREE(scratch);
return ret;
}
/*
 * Fill @nodes with the nodemask to report for policy @p in a
 * get_mempolicy() query.
 *
 * The default policy and a local MPOL_PREFERRED policy yield an empty
 * mask, a non-local MPOL_PREFERRED policy yields just its preferred node,
 * and bind/interleave policies yield their node set. Any other mode is a
 * BUG().
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
	nodes_clear(*nodes);
	if (p == &default_policy)
		return;

	switch (p->mode) {
	case MPOL_PREFERRED:
		/* local allocation reports an empty node mask */
		if (p->flags & MPOL_F_LOCAL)
			break;
		node_set(p->v.preferred_node, *nodes);
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		*nodes = p->v.nodes;
		break;
	default:
		BUG();
	}
}
static int lookup_node(unsigned long addr)
{
struct page *p;
int err;
err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
}
return err;
}
/*
* Retrieve NUMA policy
*
* @policy: receives the policy mode (with user mode flags), or a node id
* when MPOL_F_NODE is set
* @nmask: receives the reported nodemask
* @addr: address to query when MPOL_F_ADDR is set, must be 0 otherwise
* @flags: MPOL_F_NODE, MPOL_F_ADDR and/or MPOL_F_MEMS_ALLOWED
*
* Returns 0 on success, -EINVAL for invalid flag/argument combinations,
* -EFAULT when @addr is not inside a vma, or the error from lookup_node().
*/
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
/* MPOL_F_MEMS_ALLOWED is exclusive: just report the task's allowed nodes. */
if (flags & MPOL_F_MEMS_ALLOWED) {
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
task_lock(current);
*nmask = cpuset_current_mems_allowed;
task_unlock(current);
return 0;
}
if (flags & MPOL_F_ADDR) {
/*
* Do NOT fall back to task policy if the
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
/* mmap_sem stays held for read until the out label (vma != NULL). */
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
pol = vma->vm_ops->get_policy(vma, addr);
else
pol = vma->vm_policy;
} else if (addr)
return -EINVAL;
if (!pol)
pol = &default_policy; /* indicates default behavior */
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
/* Report the node of the page currently backing addr. */
err = lookup_node(addr);
if (err < 0)
goto out;
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
/* Report the node the next interleaved allocation would use. */
*policy = next_node_in(current->il_prev, pol->v.nodes);
} else {
err = -EINVAL;
goto out;
}
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
/*
* Internal mempolicy flags must be masked off before exposing
* the policy to userspace.
*/
*policy |= (pol->flags & MPOL_MODE_FLAGS);
}
err = 0;
if (nmask) {
if (mpol_store_user_nodemask(pol)) {
*nmask = pol->w.user_nodemask;
} else {
task_lock(current);
get_policy_nodemask(pol, nmask);
task_unlock(current);
}
}
out:
mpol_cond_put(pol);
if (vma)
up_read(&current->mm->mmap_sem);
return err;
}
#ifdef CONFIG_MIGRATION
/*
 * Queue @page (or the head of the compound page it belongs to; thp tail
 * pages can be passed) on @pagelist for migration.
 *
 * A page mapped more than once is only queued with MPOL_MF_MOVE_ALL.
 * Returns 0 when the page was queued or deliberately skipped, and -EIO
 * when it could not be isolated and MPOL_MF_STRICT was requested.
 */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
			    unsigned long flags)
{
	struct page *head = compound_head(page);

	/*
	 * Avoid migrating a page that is shared with others.
	 */
	if (!(flags & MPOL_MF_MOVE_ALL) && page_mapcount(head) != 1)
		return 0;

	if (!isolate_lru_page(head)) {
		list_add_tail(&head->lru, pagelist);
		mod_node_page_state(page_pgdat(head),
			NR_ISOLATED_ANON + page_is_file_cache(head),
			hpage_nr_pages(head));
		return 0;
	}

	/*
	 * Non-movable page may reach here. And, there may be
	 * temporary off LRU pages or non-LRU movable pages.
	 * Treat them as unmovable pages since they can't be
	 * isolated, so they can't be moved at the moment. It
	 * should return -EIO for this case too.
	 */
	if (flags & MPOL_MF_STRICT)
		return -EIO;

	return 0;
}
/*
 * Page allocation callback for NUMA node migration: allocate a
 * replacement for @page on node @node, matching the kind of page being
 * migrated (hugetlb page, transparent huge page or ordinary page).
 * Returns the new page, or NULL when a transparent huge page could not be
 * allocated.
 */
struct page *alloc_new_node_page(struct page *page, unsigned long node)
{
	struct page *thp;

	if (PageHuge(page))
		return alloc_huge_page_node(page_hstate(compound_head(page)),
					    node);

	if (!PageTransHuge(page))
		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
						__GFP_THISNODE, 0);

	thp = alloc_pages_node(node,
			       (GFP_TRANSHUGE | __GFP_THISNODE),
			       HPAGE_PMD_ORDER);
	if (thp)
		prep_transhuge_page(thp);
	return thp;
}
/*
 * Migrate pages from one node to a target node.
 *
 * All pages of @mm residing on @source are isolated and then migrated to
 * @dest. Returns error or the number of pages not migrated.
 */
static int migrate_to_node(struct mm_struct *mm, int source, int dest,
			   int flags)
{
	nodemask_t source_mask;
	LIST_HEAD(pagelist);
	int err;

	nodes_clear(source_mask);
	node_set(source, source_mask);

	/*
	 * This does not "check" the range but isolates all pages that
	 * need migration. Between passing in the full user address
	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
	 */
	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &source_mask,
			  flags | MPOL_MF_DISCONTIG_OK, &pagelist);

	if (list_empty(&pagelist))
		return 0;

	err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
			    MIGRATE_SYNC, MR_SYSCALL);
	/* Whatever was left on the list goes back where it came from. */
	if (err)
		putback_movable_pages(&pagelist);

	return err;
}
/*
* Move pages between the two nodesets so as to preserve the physical
* layout as much as possible.
*
* Holds mm->mmap_sem for read while source/destination pairs are picked
* and migrated one at a time with migrate_to_node().
*
* Returns the number of pages that could not be moved, or a negative errno
* from migrate_prep() or migrate_to_node().
*/
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
int busy = 0;
int err;
nodemask_t tmp;
err = migrate_prep();
if (err)
return err;
down_read(&mm->mmap_sem);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
* bit in 'tmp', and return that <source, dest> pair for migration.
* The pair of nodemasks 'to' and 'from' define the map.
*
* If no pair of bits is found that way, fallback to picking some
* pair of 'source' and 'dest' bits that are not the same. If the
* 'source' and 'dest' bits are the same, this represents a node
* that will be migrating to itself, so no pages need move.
*
* If no bits are left in 'tmp', or if all remaining bits left
* in 'tmp' correspond to the same bit in 'to', return false
* (nothing left to migrate).
*
* This lets us pick a pair of nodes to migrate between, such that
* if possible the dest node is not already occupied by some other
* source node, minimizing the risk of overloading the memory on a
* node that would happen if we migrated incoming memory to a node
* before migrating outgoing memory source that same node.
*
* A single scan of tmp is sufficient. As we go, we remember the
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out then, with that pair.
* Otherwise when we finish scanning from_tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
*/
tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
int source = NUMA_NO_NODE;
int dest = 0;
for_each_node_mask(s, tmp) {
/*
* do_migrate_pages() tries to maintain the relative
* node relationship of the pages established between
* threads and memory areas.
*
* However if the number of source nodes is not equal to
* the number of destination nodes we can not preserve
* this node relative relationship. In that case, skip
* copying memory from a node that is in the destination
* mask.
*
* Example: [2,3,4] -> [3,4,5] moves everything.
* [0-7] - > [3,4,5] moves only 0,1,2,6,7.
*/
if ((nodes_weight(*from) != nodes_weight(*to)) &&
(node_isset(s, *to)))
continue;
d = node_remap(s, *from, *to);
if (s == d)
continue;
source = s; /* Node moved. Memorize */
dest = d;
/* dest not in remaining from nodes? */
if (!node_isset(dest, tmp))
break;
}
/* No node left that would actually move: done. */
if (source == NUMA_NO_NODE)
break;
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
/* Positive values count pages that stayed behind; accumulate them. */
if (err > 0)
busy += err;
if (err < 0)
break;
}
up_read(&mm->mmap_sem);
if (err < 0)
return err;
return busy;
}
/*
 * Allocate a new page for page migration based on vma policy.
 * Start by assuming the page is mapped by the same vma as contains @start.
 * Search forward from there, if not. N.B., this assumes that the
 * list of pages handed to migrate_pages()--which is how we get here--
 * is in virtual address order.
 *
 * Returns the newly allocated page, or NULL when a transparent huge page
 * could not be allocated.
 */
static struct page *new_page(struct page *page, unsigned long start)
{
	struct vm_area_struct *vma;
	/*
	 * Stays 0 when no vma maps the page; the allocators below are then
	 * called with vma == NULL and must not see an indeterminate address.
	 */
	unsigned long address = 0;

	vma = find_vma(current->mm, start);
	while (vma) {
		address = page_address_in_vma(page, vma);
		if (address != -EFAULT)
			break;
		vma = vma->vm_next;
	}

	if (PageHuge(page)) {
		return alloc_huge_page_vma(page_hstate(compound_head(page)),
				vma, address);
	} else if (PageTransHuge(page)) {
		struct page *thp;

		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
					 HPAGE_PMD_ORDER);
		if (!thp)
			return NULL;
		prep_transhuge_page(thp);
		return thp;
	}
	/*
	 * if !vma, alloc_page_vma() will use task or system default policy
	 */
	return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL,
			vma, address);
}
#else
/* !CONFIG_MIGRATION stub: pages can never be queued, always -EIO. */
static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
return -EIO;
}
/* !CONFIG_MIGRATION stub: migration is not available, always -ENOSYS. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to, int flags)
{
return -ENOSYS;
}
/* !CONFIG_MIGRATION stub: never allocates a replacement page. */
static struct page *new_page(struct page *page, unsigned long start)
{
return NULL;
}
#endif
/*
 * do_mbind - set the memory policy for an address range of the current mm
 * @start:      page aligned start address
 * @len:        length of the range; rounded up to a page multiple
 * @mode:       policy mode without mode flags
 * @mode_flags: mode flags
 * @nmask:      nodes the policy refers to
 * @flags:      MPOL_MF_* flags
 *
 * Creates the new policy, collects the pages of the range that sit on the
 * wrong nodes, installs the policy on the vmas and finally migrates the
 * collected pages.
 *
 * Returns 0 on success (including an empty range), -EINVAL for bad
 * arguments, -EPERM for MPOL_MF_MOVE_ALL without CAP_SYS_NICE, -ENOMEM
 * when scratch space cannot be allocated, -EIO when pages are misplaced or
 * could not be moved under the strictness rules, or the error from any of
 * the helper steps.
 */
static long do_mbind(unsigned long start, unsigned long len,
		     unsigned short mode, unsigned short mode_flags,
		     nodemask_t *nmask, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct mempolicy *new;
	unsigned long end;
	int err;
	int ret;
	LIST_HEAD(pagelist);

	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	if (mode == MPOL_DEFAULT)
		flags &= ~MPOL_MF_STRICT;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	new = mpol_new(mode, mode_flags, nmask);
	if (IS_ERR(new))
		return PTR_ERR(new);

	/*
	 * mpol_new() returns NULL for MPOL_DEFAULT, so only touch the flags
	 * of a policy that actually exists.
	 */
	if (new && (flags & MPOL_MF_LAZY))
		new->flags |= MPOL_F_MOF;

	/*
	 * If we are using the default policy then operation
	 * on discontinuous address spaces is okay after all
	 */
	if (!new)
		flags |= MPOL_MF_DISCONTIG_OK;

	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
		 start, start + len, mode, mode_flags,
		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);

	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
		err = migrate_prep();
		if (err)
			goto mpol_out;
	}
	{
		NODEMASK_SCRATCH(scratch);
		if (scratch) {
			/* On success mmap_sem stays held until the end. */
			down_write(&mm->mmap_sem);
			task_lock(current);
			err = mpol_set_nodemask(new, nmask, scratch);
			task_unlock(current);
			if (err)
				up_write(&mm->mmap_sem);
		} else
			err = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
	}
	if (err)
		goto mpol_out;

	/* Collect the pages that are NOT on the requested nodes. */
	ret = queue_pages_range(mm, start, end, nmask,
			  flags | MPOL_MF_INVERT, &pagelist);

	if (ret < 0) {
		err = ret;
		goto up_out;
	}

	err = mbind_range(mm, start, end, new);

	if (!err) {
		int nr_failed = 0;

		if (!list_empty(&pagelist)) {
			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
			nr_failed = migrate_pages(&pagelist, new_page, NULL,
				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
			if (nr_failed)
				putback_movable_pages(&pagelist);
		}

		/* ret > 0: queue_pages_range() saw an unmovable page. */
		if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
			err = -EIO;
	} else {
up_out:
		if (!list_empty(&pagelist))
			putback_movable_pages(&pagelist);
	}

	up_write(&mm->mmap_sem);
mpol_out:
	mpol_put(new);
	return err;
}
/*
* User space interface with variable sized bitmaps for nodelists.
*/
/*
* Copy a node mask from user space.
*
* @nodes: kernel nodemask to fill; always cleared first
* @nmask: user bitmap, may be NULL
* @maxnode: size of the user bitmap as passed by the caller; it is
* decremented before use, so maxnode - 1 bits are consumed
*
* Returns 0 on success (also when there is nothing to copy), -EINVAL when
* the bitmap is too large or has bits set beyond MAX_NUMNODES, and -EFAULT
* when user memory cannot be read.
*/
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long maxnode)
{
unsigned long k;
unsigned long t;
unsigned long nlongs;
unsigned long endmask;
/* A maxnode of 0 wraps around here and is rejected by the size check below. */
--maxnode;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
return -EINVAL;
nlongs = BITS_TO_LONGS(maxnode);
/* endmask selects the valid bits of the last user-supplied long. */
if ((maxnode % BITS_PER_LONG) == 0)
endmask = ~0UL;
else
endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
/*
* When the user specified more nodes than supported just check
* if the non supported part is all zero.
*
* If maxnode have more longs than MAX_NUMNODES, check
* the bits in that area first. And then go through to
* check the rest bits which equal or bigger than MAX_NUMNODES.
* Otherwise, just check bits [MAX_NUMNODES, maxnode).
*/
if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
if (get_user(t, nmask + k))
return -EFAULT;
if (k == nlongs - 1) {
if (t & endmask)
return -EINVAL;
} else if (t)
return -EINVAL;
}
/* From here on only the longs that fit a kernel nodemask matter. */
nlongs = BITS_TO_LONGS(MAX_NUMNODES);
endmask = ~0UL;
}
if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
/* Bits at or above MAX_NUMNODES within the last kernel long must be clear. */
unsigned long valid_mask = endmask;
valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
if (get_user(t, nmask + nlongs - 1))
return -EFAULT;
if (t & valid_mask)
return -EINVAL;
}
if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
return -EFAULT;
/* Drop any bits the user set past maxnode in the last long. */
nodes_addr(*nodes)[nlongs-1] &= endmask;
return 0;
}
/*
 * Copy a kernel node mask to user space.
 *
 * The size of the user buffer is derived from @maxnode, rounded up to a
 * multiple of 64 bits. When that is larger than the kernel's nodemask
 * (sized by nr_node_ids), the excess part of the user buffer is zeroed
 * and only the kernel-sized part is copied.
 *
 * Returns 0 on success, -EINVAL when the user buffer would exceed a page,
 * and -EFAULT when user memory cannot be written.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
			      nodemask_t *nodes)
{
	unsigned long user_bytes = ALIGN(maxnode-1, 64) / 8;
	unsigned int kernel_bytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

	if (user_bytes > kernel_bytes) {
		if (user_bytes > PAGE_SIZE)
			return -EINVAL;
		if (clear_user((char __user *)mask + kernel_bytes,
			       user_bytes - kernel_bytes))
			return -EFAULT;
		user_bytes = kernel_bytes;
	}

	if (copy_to_user(mask, nodes_addr(*nodes), user_bytes))
		return -EFAULT;

	return 0;
}
/*
 * Common implementation of the mbind() system call.
 *
 * Splits @mode into the policy mode and its mode flags, validates both,
 * fetches the user nodemask and hands everything to do_mbind(). Returns
 * -EINVAL for an unknown mode or when both MPOL_F_STATIC_NODES and
 * MPOL_F_RELATIVE_NODES are requested, the error from get_nodes(), or the
 * result of do_mbind().
 */
static long kernel_mbind(unsigned long start, unsigned long len,
			 unsigned long mode, const unsigned long __user *nmask,
			 unsigned long maxnode, unsigned int flags)
{
	const unsigned short exclusive =
		MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES;
	unsigned short mode_flags = mode & MPOL_MODE_FLAGS;
	nodemask_t nodes;
	int err;

	start = untagged_addr(start);
	mode &= ~MPOL_MODE_FLAGS;

	if (mode >= MPOL_MAX)
		return -EINVAL;
	/* static and relative node semantics cannot be combined */
	if ((mode_flags & exclusive) == exclusive)
		return -EINVAL;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
}
/* mbind() system call entry point; all work is done by kernel_mbind(). */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
{
return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}
/*
 * Set the process memory policy.
 *
 * Common implementation of set_mempolicy(2) for the native and compat entry
 * points.  The MPOL_MODE_FLAGS bits are peeled off @mode, both parts are
 * validated, the user nodemask is read with get_nodes() and the result is
 * applied through do_set_mempolicy().
 *
 * Returns -EINVAL for a mode >= MPOL_MAX or when MPOL_F_STATIC_NODES and
 * MPOL_F_RELATIVE_NODES are both set, the get_nodes() error if the nodemask
 * cannot be read, otherwise whatever do_set_mempolicy() returns.
 */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				 unsigned long maxnode)
{
	unsigned short flags;
	nodemask_t nodes;
	int err;

	/* Separate the flag bits from the policy mode proper. */
	flags = mode & MPOL_MODE_FLAGS;
	mode &= ~MPOL_MODE_FLAGS;

	/* The unsigned compare also rejects negative modes. */
	if ((unsigned int)mode >= MPOL_MAX)
		return -EINVAL;

	/* Static and relative nodemask semantics cannot be combined. */
	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
		return -EINVAL;

	err = get_nodes(&nodes, nmask, maxnode);
	if (err)
		return err;

	return do_set_mempolicy(mode, flags, &nodes);
}
/*
* set_mempolicy(2) system call entry point.  Passes its arguments unchanged
* to kernel_set_mempolicy() and returns its result.
*/
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
unsigned long, maxnode)
{
return kernel_set_mempolicy(mode, nmask, maxnode);
}
/*
* Common implementation of migrate_pages(2), shared by the native and compat
* entry points.
*
* Asks do_migrate_pages() to move pages of the task identified by @pid (the
* calling task when @pid is 0) from the nodes in @old_nodes to the nodes in
* @new_nodes.  Both masks are user-space bitmaps of @maxnode bits parsed by
* get_nodes().
*
* Returns the result of do_migrate_pages(), or a negative errno:
*   -ENOMEM  the scratch nodemasks could not be allocated
*   -ESRCH   no task matches @pid
*   -EPERM   the ptrace-style access check failed, or the destination nodes
*            are not a subset of the target's cpuset and the caller lacks
*            CAP_SYS_NICE
*   -EINVAL  no destination node is left after filtering, or the task has
*            no mm
* plus any error returned by get_nodes() or security_task_movememory().
*/
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *old_nodes,
const unsigned long __user *new_nodes)
{
struct mm_struct *mm = NULL;
struct task_struct *task;
nodemask_t task_nodes;
int err;
nodemask_t *old;
nodemask_t *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
return -ENOMEM;
/* Both parsed masks live in the scratch area rather than in locals. */
old = &scratch->mask1;
new = &scratch->mask2;
err = get_nodes(old, old_nodes, maxnode);
if (err)
goto out;
err = get_nodes(new, new_nodes, maxnode);
if (err)
goto out;
/* Find the target task; the lookup is done under rcu_read_lock(). */
rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
rcu_read_unlock();
err = -ESRCH;
goto out;
}
/* Pin the task; every exit from here on must drop this reference. */
get_task_struct(task);
/* Default error for the "no usable destination node" exits below. */
err = -EINVAL;
/*
* Check if this process has the right to modify the specified process.
* Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out_put;
}
rcu_read_unlock();
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out_put;
}
/* Restrict the destination to nodes the caller's own cpuset allows... */
task_nodes = cpuset_mems_allowed(current);
nodes_and(*new, *new, task_nodes);
if (nodes_empty(*new))
goto out_put;
/* ...and to nodes in the N_MEMORY state. */
nodes_and(*new, *new, node_states[N_MEMORY]);
if (nodes_empty(*new))
goto out_put;
err = security_task_movememory(task);
if (err)
goto out_put;
/* Swap the task reference for an mm reference. */
mm = get_task_mm(task);
put_task_struct(task);
if (!mm) {
err = -EINVAL;
goto out;
}
/* With CAP_SYS_NICE, MPOL_MF_MOVE_ALL is requested instead of MPOL_MF_MOVE. */
err = do_migrate_pages(mm, old, new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
mmput(mm);
out:
NODEMASK_SCRATCH_FREE(scratch);
return err;
/* Error exit for paths that still hold the task reference. */
out_put:
put_task_struct(task);
goto out;
}
/*
* migrate_pages(2) system call entry point.  Passes its arguments unchanged
* to kernel_migrate_pages() and returns its result.
*/
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, old_nodes,
const unsigned long __user *, new_nodes)
{
return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}
/*
 * Retrieve NUMA policy.
 *
 * Common implementation of get_mempolicy(2) for the native and compat entry
 * points.  The policy value is stored through @policy and the nodemask
 * through @nmask; either pointer may be NULL to skip that output.
 *
 * Returns 0 on success, -EINVAL if a nodemask is requested but @maxnode is
 * smaller than nr_node_ids, -EFAULT if @policy cannot be written, or the
 * error from do_get_mempolicy() / copy_nodes_to_user().
 */
static int kernel_get_mempolicy(int __user *policy,
				unsigned long __user *nmask,
				unsigned long maxnode,
				unsigned long addr,
				unsigned long flags)
{
	nodemask_t nodes;
	int pval;
	int err;

	addr = untagged_addr(addr);

	/* A requested mask must have room for every possible node id. */
	if (nmask && maxnode < nr_node_ids)
		return -EINVAL;

	err = do_get_mempolicy(&pval, &nodes, addr, flags);
	if (err)
		return err;

	if (policy && put_user(pval, policy))
		return -EFAULT;

	if (!nmask)
		return err;

	return copy_nodes_to_user(nmask, maxnode, &nodes);
}
/*
* get_mempolicy(2) system call entry point.  Passes its arguments unchanged
* to kernel_get_mempolicy() and returns its result.
*/
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long __user *, nmask, unsigned long, maxnode,
unsigned long, addr, unsigned long, flags)
{
return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for get_mempolicy(2).
 *
 * The native implementation writes the nodemask as an array of unsigned
 * long, so a temporary native-layout buffer is obtained from
 * compat_alloc_user_space(), filled by kernel_get_mempolicy(), read back
 * into @bm and finally written to the caller's buffer in compat layout with
 * compat_put_bitmap().
 *
 * Returns 0 on success, the error from kernel_get_mempolicy(), or -EFAULT if
 * any of the user-memory accesses needed for the conversion fails.
 */
COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
		       compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode,
		       compat_ulong_t, addr, compat_ulong_t, flags)
{
	long err;
	unsigned long __user *nm = NULL;
	unsigned long nr_bits, alloc_size;
	DECLARE_BITMAP(bm, MAX_NUMNODES);

	/* Never ask for more bits than the kernel can have nodes. */
	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;

	if (nmask)
		nm = compat_alloc_user_space(alloc_size);

	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);

	if (!err && nmask) {
		unsigned long copy_size;

		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
		/*
		 * copy_from_user() and clear_user() report failure as the
		 * number of bytes left untouched, not as an errno.  Map every
		 * failure to -EFAULT so that a byte count (or a count OR-ed
		 * with an errno) is never returned to user space.  All three
		 * steps are still attempted, as before.
		 */
		if (copy_from_user(bm, nm, copy_size))
			err = -EFAULT;
		/* ensure entire bitmap is zeroed */
		if (clear_user(nmask, ALIGN(maxnode-1, 8) / 8))
			err = -EFAULT;
		if (compat_put_bitmap(nmask, bm, nr_bits))
			err = -EFAULT;
	}

	return err;
}
/*
 * Compat entry point for set_mempolicy(2).
 *
 * When a nodemask is supplied it is read in compat layout with
 * compat_get_bitmap(), re-written in native unsigned long layout into a
 * buffer from compat_alloc_user_space(), and that buffer is passed on to
 * kernel_set_mempolicy().  The bit count is clamped to MAX_NUMNODES.
 *
 * Returns -EFAULT if either user-memory access fails, otherwise the result
 * of kernel_set_mempolicy().
 */
COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode)
{
	DECLARE_BITMAP(native_bits, MAX_NUMNODES);
	unsigned long __user *native_mask = NULL;
	unsigned long nbits, nbytes;

	nbits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	/* Size of the native bitmap, rounded up to whole longs. */
	nbytes = ALIGN(nbits, BITS_PER_LONG) / 8;

	if (nmask) {
		if (compat_get_bitmap(native_bits, nmask, nbits))
			return -EFAULT;

		native_mask = compat_alloc_user_space(nbytes);
		if (copy_to_user(native_mask, native_bits, nbytes))
			return -EFAULT;
	}

	return kernel_set_mempolicy(mode, native_mask, nbits+1);
}
/*
 * Compat entry point for mbind(2).
 *
 * When a nodemask is supplied it is read in compat layout with
 * compat_get_bitmap(), re-written in native unsigned long layout into a
 * buffer from compat_alloc_user_space(), and that buffer is passed on to
 * kernel_mbind().  The bit count is clamped to MAX_NUMNODES.
 *
 * Returns -EFAULT if either user-memory access fails, otherwise the result
 * of kernel_mbind().
 */
COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
		       compat_ulong_t, maxnode, compat_ulong_t, flags)
{
	unsigned long __user *native_mask = NULL;
	unsigned long nbits, nbytes;
	nodemask_t native_bits;

	nbits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
	/* Size of the native bitmap, rounded up to whole longs. */
	nbytes = ALIGN(nbits, BITS_PER_LONG) / 8;

	if (nmask) {
		if (compat_get_bitmap(nodes_addr(native_bits), nmask, nbits))
			return -EFAULT;

		native_mask = compat_alloc_user_space(nbytes);
		if (copy_to_user(native_mask, nodes_addr(native_bits), nbytes))
			return -EFAULT;
	}

	return kernel_mbind(start, len, mode, native_mask, nbits+1, flags);
}
/*
* Compat entry point for migrate_pages(2).
*
* Each supplied compat-layout bitmap is read into tmp_mask with
* compat_get_bitmap() and re-written in native unsigned long layout into
* space obtained from compat_alloc_user_space(); the native copies are then
* passed to kernel_migrate_pages().  The bit count is clamped to
* MAX_NUMNODES.
*
* Returns -EFAULT if any user-memory access fails, otherwise the result of
* kernel_migrate_pages().
*/
COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
compat_ulong_t, maxnode,
const compat_ulong_t __user *, old_nodes,
const compat_ulong_t __user *, new_nodes)
{
unsigned long __user *old = NULL;
unsigned long __user *new = NULL;
nodemask_t tmp_mask;
unsigned long nr_bits;
unsigned long size;
nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
/* Bytes per native bitmap, rounded up to whole longs. */
size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (old_nodes) {
if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
return -EFAULT;
/*
* When both masks are given, a single allocation of twice the size is
* made and the new mask is placed directly after the old one.
*/
old = compat_alloc_user_space(new_nodes ? size * 2 : size);
if (new_nodes)
new = old + size / sizeof(unsigned long);
if (copy_to_user(old, nodes_addr(tmp_mask), size))
return -EFAULT;
}
if (new_nodes) {
/* tmp_mask is reused; the old mask has already been written out. */
if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
return -EFAULT;
/* Only the new mask was supplied: it needs an allocation of its own. */
if (new == NULL)
new = compat_alloc_user_space(size);
if (copy_to_user(new, nodes_addr(tmp_mask), size))
return -EFAULT;
}
return kernel_migrate_pages(pid, nr_bits + 1, old, new);
}
#endif /* CONFIG_COMPAT */
/*
 * Return the policy attached to @vma for @addr, or NULL when @vma is NULL
 * or has no policy of its own.  A ->get_policy() vm_op, when present, takes
 * precedence over vma->vm_policy.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
				   unsigned long addr)
{
	struct mempolicy *policy;

	if (!vma)
		return NULL;

	if (vma->vm_ops && vma->vm_ops->get_policy)
		return vma->vm_ops->get_policy(vma, addr);

	/*
	 * This could be called without holding the mmap_sem in the
	 * speculative page fault handler's path, so the pointer is loaded
	 * exactly once.
	 */
	policy = READ_ONCE(vma->vm_policy);

	/*
	 * shmem_alloc_page() passes MPOL_F_SHARED policy with
	 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
	 * count on these policies which will be dropped by
	 * mpol_cond_put() later
	 */
	if (policy && mpol_needs_cond_ref(policy))
		mpol_get(policy);

	return policy;
}
/*
 * get_vma_policy(@vma, @addr)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 *
 * Returns the effective policy for a VMA at the specified address: the
 * VMA's own policy if __get_vma_policy() finds one, otherwise whatever
 * get_task_policy(current) yields (current->mempolicy or the system
 * default policy).
 *
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
					unsigned long addr)
{
	struct mempolicy *vma_pol = __get_vma_policy(vma, addr);

	if (vma_pol)
		return vma_pol;

	return get_task_policy(current);
}
/*
 * Report whether the policy in effect for @vma has MPOL_F_MOF set.
 *
 * A ->get_policy() vm_op, when present, is queried at vma->vm_start and the
 * policy it returns is released again with mpol_cond_put().  Otherwise
 * vma->vm_policy is used, falling back to get_task_policy(current).
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
	struct mempolicy *pol;

	if (vma->vm_ops && vma->vm_ops->get_policy) {
		bool mof;

		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
		mof = pol && (pol->flags & MPOL_F_MOF);
		mpol_cond_put(pol);

		return mof;
	}

	pol = vma->vm_policy;
	if (!pol)
		pol = get_task_policy(current);

	return pol->flags & MPOL_F_MOF;
}
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
* if policy->v.nodes has movable memory only,
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
* policy->v.nodes is intersect with node_states[N_MEMORY].
* so if the following test faile, it implies
* policy->v.nodes has movable memory only.
*/
if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
}
/*
 * Return a nodemask representing a mempolicy for filtering nodes for
 * page allocation, or NULL when no filtering applies.
 *
 * Only MPOL_BIND policies yield a mask, and only when the zone selected by
 * @gfp passes apply_policy_zone() and the policy's nodes pass
 * cpuset_nodemask_valid_mems_allowed().
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
	if (likely(policy->mode != MPOL_BIND))
		return NULL;

	/* Lower zones don't get a nodemask applied for MPOL_BIND */
	if (!apply_policy_zone(policy, gfp_zone(gfp)))
		return NULL;

	if (!cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
		return NULL;

	return &policy->v.nodes;
}
/*
 * Return the node id preferred by the given mempolicy, or the given id.
 *
 * Only an MPOL_PREFERRED policy without MPOL_F_LOCAL overrides @nd; every
 * other policy returns @nd unchanged.
 */
static int policy_node(gfp_t gfp, struct mempolicy *policy,
		       int nd)
{
	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
		return policy->v.preferred_node;

	/*
	 * __GFP_THISNODE shouldn't even be used with the bind policy
	 * because we might easily break the expectation to stay on the
	 * requested node and not break the policy.
	 */
	WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));

	return nd;
}
/*
 * Do dynamic interleaving for a process: pick the node that follows
 * current->il_prev in the policy's nodemask and, if the result is a valid
 * node id (< MAX_NUMNODES), record it as the new il_prev.
 */
static unsigned interleave_nodes(struct mempolicy *policy)
{
	struct task_struct *tsk = current;
	unsigned nid = next_node_in(tsk->il_prev, policy->v.nodes);

	if (nid < MAX_NUMNODES)
		tsk->il_prev = nid;

	return nid;
}
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
*/
unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
int node = numa_mem_id();
if (in_interrupt())
return node;
policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL)
return node;
switch (policy->mode) {
case MPOL_PREFERRED:
/*
* handled MPOL_F_LOCAL above
*/
return policy->v.preferred_node;
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
case MPOL_BIND: {
struct zoneref *z;
/*
* Follow bind policy behavior and start allocation at the
* first node.
*/
struct zonelist *zonelist;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes);
return z->zone ? zone_to_nid(z->zone) : node;
}
default:
BUG();
}
}
/*
 * Do static interleaving for a VMA with known offset @n.  Returns the n'th
 * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
 * number of present nodes.  With an empty nodemask the current node
 * (numa_node_id()) is returned instead.
 */
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
	unsigned int count = nodes_weight(pol->v.nodes);
	unsigned int steps;
	int nid;

	if (count == 0)
		return numa_node_id();

	/* Walk (n mod count) nodes forward from the first set node. */
	nid = first_node(pol->v.nodes);
	for (steps = (unsigned int)n % count; steps > 0; steps--)
		nid = next_node(nid, pol->v.nodes);

	return nid;
}
/*
 * Determine a node number for interleave.
 *
 * With a VMA, the node is chosen statically from the page offset of @addr
 * (in units of 1 << @shift bytes); without one, the per-task dynamic
 * interleave counter is used.
 */
static inline unsigned interleave_nid(struct mempolicy *pol,
		 struct vm_area_struct *vma, unsigned long addr, int shift)
{
	unsigned long index;

	if (!vma)
		return interleave_nodes(pol);

	/*
	 * for small pages, there is no difference between
	 * shift and PAGE_SHIFT, so the bit-shift is safe.
	 * for huge pages, since vm_pgoff is in units of small
	 * pages, we need to shift off the always 0 bits to get
	 * a useful offset.
	 */
	BUG_ON(shift < PAGE_SHIFT);
	index = (vma->vm_pgoff >> (shift - PAGE_SHIFT)) +
		((addr - vma->vm_start) >> shift);

	return offset_il_node(pol, index);
}
#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
 *
 * Returns a nid suitable for a huge page allocation and a pointer
 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
 * @nodemask for filtering the zonelist; otherwise *@nodemask is NULL.
 *
 * Must be protected by read_mems_allowed_begin()
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
	      struct mempolicy **mpol, nodemask_t **nodemask)
{
	struct mempolicy *pol = get_vma_policy(vma, addr);
	int nid;

	*mpol = pol;
	*nodemask = NULL;	/* assume !MPOL_BIND */

	if (unlikely(pol->mode == MPOL_INTERLEAVE))
		return interleave_nid(pol, vma, addr,
				      huge_page_shift(hstate_vma(vma)));

	nid = policy_node(gfp_flags, pol, numa_node_id());
	if (pol->mode == MPOL_BIND)
		*nodemask = &pol->v.nodes;

	return nid;
}
/*
 * init_nodemask_of_mempolicy
 *
 * If the current task's mempolicy is "default" [NULL], return 'false'
 * to indicate default policy.  Otherwise, extract the policy nodemask
 * for 'bind' or 'interleave' policy into the argument nodemask, or
 * initialize the argument nodemask to contain the single node for
 * 'preferred' or 'local' policy and return 'true' to indicate presence
 * of non-default mempolicy.  A NULL @mask also yields 'false'.
 *
 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
 * mempolicy is only ever changed by the task itself.
 *
 * N.B., it is the caller's responsibility to free a returned nodemask.
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
	struct mempolicy *pol;
	int nid;

	if (!mask || !current->mempolicy)
		return false;

	task_lock(current);
	pol = current->mempolicy;

	switch (pol->mode) {
	case MPOL_PREFERRED:
		/* 'local' is MPOL_PREFERRED with MPOL_F_LOCAL set. */
		nid = (pol->flags & MPOL_F_LOCAL) ? numa_node_id()
						  : pol->v.preferred_node;
		init_nodemask_of_node(mask, nid);
		break;

	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		*mask = pol->v.nodes;
		break;

	default:
		BUG();
	}

	task_unlock(current);

	return true;
}
#endif
/*
 * mempolicy_nodemask_intersects
 *
 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
 * policy, always return true since it may allocate elsewhere on fallback.
 * A NULL @mask also yields 'true'.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_nodemask_intersects(struct task_struct *tsk,
				   const nodemask_t *mask)
{
	struct mempolicy *pol;
	bool intersects = true;

	if (!mask)
		return true;

	task_lock(tsk);
	pol = tsk->mempolicy;
	if (pol) {
		switch (pol->mode) {
		case MPOL_PREFERRED:
			/*
			 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred
			 * nodes to allocate from, they may fallback to other
			 * nodes when oom.  Thus, it's possible for tsk to
			 * have allocated memory from nodes in mask.
			 */
			break;
		case MPOL_BIND:
		case MPOL_INTERLEAVE:
			intersects = nodes_intersects(pol->v.nodes, *mask);
			break;
		default:
			BUG();
		}
	}
	task_unlock(tsk);

	return intersects;
}
/*
 * Allocate a page in interleaved policy.
 * Own path because it needs to do special accounting: when NUMA statistics
 * are enabled and the page really came from @nid, NUMA_INTERLEAVE_HIT is
 * incremented for the page's zone.
 */
static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
					  unsigned nid)
{
	struct page *page = __alloc_pages(gfp, order, nid);

	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
	if (!static_branch_likely(&vm_numa_stat_key))
		return page;

	/* Nothing to account for a failed or off-node allocation. */
	if (!page || page_to_nid(page) != nid)
		return page;

	preempt_disable();
	__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
	preempt_enable();

	return page;
}
/**
* alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
* %GFP_KERNEL kernel allocations,
* %GFP_HIGHMEM highmem/user allocations,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
* @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
* @node: Which node to prefer for allocation (modulo policy).
* @hugepage: for hugepages try only the preferred node if possible
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
* When VMA is not NULL caller must hold down_read on the mmap_sem of the
* mm_struct of the VMA to prevent it from going away. Should be used for
* all allocations for pages that will be mapped into user space. Returns
* NULL when no page can be allocated.
*
* The policy obtained from get_vma_policy() is released with
* mpol_cond_put() on every path through this function.
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
int preferred_nid;
nodemask_t *nmask;
pol = get_vma_policy(vma, addr);
/* Interleave: the node is picked up front, so the policy can go first. */
if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
goto out;
}
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
int hpage_node = node;
/*
* For hugepage allocation and non-interleave policy which
* allows the current node (or other explicitly preferred
* node) we only try to allocate from the current/preferred
* node and don't fall back to other nodes, as the cost of
* remote accesses would likely offset THP benefits.
*
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
if (pol->mode == MPOL_PREFERRED &&
!(pol->flags & MPOL_F_LOCAL))
hpage_node = pol->v.preferred_node;
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
/*
* We cannot invoke reclaim if __GFP_THISNODE
* is set. Invoking reclaim with
* __GFP_THISNODE set, would cause THP
* allocations to trigger heavy swapping
* despite there may be tons of free memory
* (including potentially plenty of THP
* already available in the buddy) on all the
* other NUMA nodes.
*
* At most we could invoke compaction when
* __GFP_THISNODE is set (but we would need to
* refrain from invoking reclaim even if
* compaction returned COMPACT_SKIPPED because
* there wasn't enough memory to succeed
* compaction). For now just avoid
* __GFP_THISNODE instead of limiting the
* allocation path to a strict and single
* compaction invocation.
*
* Supposedly if direct reclaim was enabled by
* the caller, the app prefers THP regardless
* of the node it comes from so this would be
* more desirable behavior than only
* providing THP originated from the local
* node in such case.
*/
if (!(gfp & __GFP_DIRECT_RECLAIM))
gfp |= __GFP_THISNODE;
page = __alloc_pages_node(hpage_node, gfp, order);
goto out;
}
}
/* Standard path: allocate with the policy's nodemask and preferred node. */
nmask = policy_nodemask(gfp, pol);
preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mpol_cond_put(pol);
out:
return page;
}
/**
 * 	alloc_pages_current - Allocate pages.
 *
 *	@gfp:
 *		%GFP_USER   user allocation,
 *      	%GFP_KERNEL kernel allocation,
 *      	%GFP_HIGHMEM highmem allocation,
 *      	%GFP_FS     don't call back into a file system.
 *      	%GFP_ATOMIC don't sleep.
 *	@order: Power of two of allocation size in pages. 0 is a single page.
 *
 *	Allocate a page from the kernel page pool.  When not in interrupt
 *	context and __GFP_THISNODE is not set, the current process' NUMA
 *	policy is applied; otherwise the system default policy is used.
 *	Returns NULL when no page can be allocated.
 */
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
	struct mempolicy *pol = &default_policy;

	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
		pol = get_task_policy(current);

	/*
	 * No reference counting needed for current->mempolicy
	 * nor system default_policy
	 */
	if (pol->mode == MPOL_INTERLEAVE)
		return alloc_page_interleave(gfp, order, interleave_nodes(pol));

	return __alloc_pages_nodemask(gfp, order,
				      policy_node(gfp, pol, numa_node_id()),
				      policy_nodemask(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
/*
 * Give @dst its own copy of @src's VMA policy.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * mpol_dup(), in which case @dst->vm_policy is left untouched.
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
	struct mempolicy *copy = mpol_dup(vma_policy(src));

	if (IS_ERR(copy))
		return PTR_ERR(copy);

	dst->vm_policy = copy;
	return 0;
}
/*
* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
* rebinds the mempolicy its copying by calling mpol_rebind_policy()
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*
* current's mempolicy may be rebinded by the other task(the task that changes
* cpuset's mems), so we needn't do rebind work for current task.
*/
/* Slow path of a mempolicy duplicate */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!new)
return ERR_PTR(-ENOMEM);
/* task's mempolicy is protected by alloc_lock */
if (old == current->mempolicy) {
task_lock(current);
*new = *old;
task_unlock(current);
} else
*new = *old;
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
mpol_rebind_policy(new, &mems);
}
atomic_set(&new->refcnt, 1);
return new;
}
/*
 * Slow path of a mempolicy comparison.
 *
 * Two policies are equal when mode and flags match, the user nodemasks
 * match (only compared when mpol_store_user_nodemask() says one is kept),
 * and the mode-specific payload matches.  A NULL argument never compares
 * equal.
 */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
	if (!a || !b || a->mode != b->mode || a->flags != b->flags)
		return false;

	if (mpol_store_user_nodemask(a) &&
	    !nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
		return false;

	switch (a->mode) {
	case MPOL_BIND:
		/* Fall through */
	case MPOL_INTERLEAVE:
		return !!nodes_equal(a->v.nodes, b->v.nodes);

	case MPOL_PREFERRED:
		/* a's ->flags is the same as b's */
		if (a->flags & MPOL_F_LOCAL)
			return true;
		return a->v.preferred_node == b->v.preferred_node;

	default:
		BUG();
		return false;
	}
}
/*
* Shared memory backing store policy support.
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
* They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
/*
 * Look up the first element intersecting [start, end).  Caller holds
 * sp->lock for reading or for writing.
 *
 * Returns NULL if no element intersects the range.
 */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
	struct rb_node *hit = sp->root.rb_node;
	struct rb_node *before;

	/* Descend until some node overlapping the range is found. */
	while (hit) {
		struct sp_node *cur = rb_entry(hit, struct sp_node, nd);

		if (start >= cur->end)
			hit = hit->rb_right;
		else if (end <= cur->start)
			hit = hit->rb_left;
		else
			break;
	}
	if (!hit)
		return NULL;

	/* Step back while the predecessor still reaches past @start. */
	for (before = rb_prev(hit); before; before = rb_prev(hit)) {
		if (rb_entry(before, struct sp_node, nd)->end <= start)
			break;
		hit = before;
	}

	return rb_entry(hit, struct sp_node, nd);
}
/*
 * Insert a new shared policy into the tree.  Caller holds sp->lock for
 * writing.
 *
 * BUG()s if @new neither starts before nor ends after a node met on the
 * way down.
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
	struct rb_node **link = &sp->root.rb_node;
	struct rb_node *parent = NULL;

	while (*link) {
		struct sp_node *cur;

		parent = *link;
		cur = rb_entry(parent, struct sp_node, nd);

		if (new->start < cur->start)
			link = &parent->rb_left;
		else if (new->end > cur->end)
			link = &parent->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->nd, parent, link);
	rb_insert_color(&new->nd, &sp->root);
	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
		 new->policy ? new->policy->mode : 0);
}
/*
 * Find shared policy intersecting idx.
 *
 * Returns the policy covering @idx with an extra reference taken via
 * mpol_get(), or NULL when the tree is empty or no node covers @idx.
 */
struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
{
	struct mempolicy *found = NULL;
	struct sp_node *node;

	/* Empty tree: nothing to look up, and no need to take the lock. */
	if (!sp->root.rb_node)
		return NULL;

	read_lock(&sp->lock);
	node = sp_lookup(sp, idx, idx+1);
	if (node) {
		found = node->policy;
		mpol_get(found);
	}
	read_unlock(&sp->lock);

	return found;
}
/*
 * Release a shared-policy node: drop its reference on the policy, then
 * return the node itself to sn_cache.
 */
static void sp_free(struct sp_node *n)
{
	struct mempolicy *pol = n->policy;

	mpol_put(pol);
	kmem_cache_free(sn_cache, n);
}
/**
* mpol_misplaced - check whether current page node is valid in policy
*
* @page: page to be checked
* @vma: vm area where page mapped
* @addr: virtual address where page mapped
*
* Lookup current policy node id for vma,addr and "compare to" page's
* node id.
*
* Returns:
* -1 - not misplaced, page is in the right node
* node - node id where the page should be
*
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*
* Policies without MPOL_F_MOF are never reported as misplaced. The policy
* reference obtained from get_vma_policy() is dropped with mpol_cond_put()
* before returning.
*/
int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol;
struct zoneref *z;
int curnid = page_to_nid(page);
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
int polnid = -1;
int ret = -1;
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
switch (pol->mode) {
case MPOL_INTERLEAVE:
/* Same offset computation as interleave_nid() with shift == PAGE_SHIFT. */
pgoff = vma->vm_pgoff;
pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
polnid = offset_il_node(pol, pgoff);
break;
case MPOL_PREFERRED:
if (pol->flags & MPOL_F_LOCAL)
polnid = numa_node_id();
else
polnid = pol->v.preferred_node;
break;
case MPOL_BIND:
/*
* allows binding to multiple nodes.
* use current page if in policy nodemask,
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*
* NOTE(review): z->zone is passed to zone_to_nid() without a NULL
* check, whereas mempolicy_slab_node() tests z->zone after the same
* first_zones_zonelist() call. Confirm the "no allowed nodes" case
* described above cannot yield a NULL zone here.
*/
if (node_isset(curnid, pol->v.nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->v.nodes);
polnid = zone_to_nid(z->zone);
break;
default:
BUG();
}
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
/* This overrides the node chosen by the switch above. */
polnid = thisnid;
if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
goto out;
}
if (curnid != polnid)
ret = polnid;
out:
mpol_cond_put(pol);
return ret;
}
/*
 * Drop the (possibly final) reference to task->mempolicy.  It needs to be
 * dropped after task->mempolicy is set to NULL so that any allocation done as
 * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
 * policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
	struct mempolicy *detached;

	/* Detach the policy from the task under task_lock()... */
	task_lock(task);
	detached = task->mempolicy;
	task->mempolicy = NULL;
	task_unlock(task);

	/* ...and only then release the reference. */
	mpol_put(detached);
}
/*
 * Unlink @n from @sp's rb-tree and free it with sp_free(), which also drops
 * the node's policy reference.  The callers in this file hold sp->lock for
 * writing.
 */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
	/* Range is printed as "start-end", matching the message in sp_insert(). */
	pr_debug("deleting %lx-%lx\n", n->start, n->end);
	rb_erase(&n->nd, &sp->root);
	sp_free(n);
}
/*
 * Fill in the payload of a shared-policy node: the policy @pol and the
 * range [@start, @end) it covers.  The rb-tree linkage is not touched.
 */
static void sp_node_init(struct sp_node *node, unsigned long start,
			 unsigned long end, struct mempolicy *pol)
{
	node->policy = pol;
	node->start = start;
	node->end = end;
}
/*
 * Allocate a shared-policy node covering [@start, @end) that owns a private
 * duplicate of @pol, marked MPOL_F_SHARED.
 *
 * Returns the new node, or NULL if either the node or the policy duplicate
 * cannot be allocated.
 */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				struct mempolicy *pol)
{
	struct sp_node *node = kmem_cache_alloc(sn_cache, GFP_KERNEL);
	struct mempolicy *copy;

	if (!node)
		return NULL;

	copy = mpol_dup(pol);
	if (IS_ERR(copy)) {
		kmem_cache_free(sn_cache, node);
		return NULL;
	}

	copy->flags |= MPOL_F_SHARED;
	sp_node_init(node, start, end, copy);

	return node;
}
/*
* Replace a policy range.
*
* Removes or trims every existing node overlapping [@start, @end) and then
* inserts @new, if it is non-NULL, all under sp->lock held for writing.
*
* An existing node that begins before @start and ends after @end has to be
* split in two, which needs a spare sp_node and mempolicy. Those are not
* allocated up front: when the split case is first hit, the lock is dropped,
* both objects are allocated with GFP_KERNEL, and the whole operation is
* restarted from the lookup.
*
* Returns 0 on success or -ENOMEM if a spare object could not be allocated.
* Spares that end up unused are released before returning.
*/
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n;
struct sp_node *n_new = NULL;
struct mempolicy *mpol_new = NULL;
int ret = 0;
restart:
write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
/* Fetch the successor first: n may be deleted below. */
struct rb_node *next = rb_next(&n->nd);
if (n->start >= start) {
/* Old node starts inside the range: drop it or trim its head. */
if (n->end <= end)
sp_delete(sp, n);
else
n->start = end;
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
if (!n_new)
goto alloc_new;
/* Tail piece [end, n->end) gets its own copy of the policy. */
*mpol_new = *n->policy;
atomic_set(&mpol_new->refcnt, 1);
sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
sp_insert(sp, n_new);
/* Ownership moved into the tree; keep err_out from freeing them. */
n_new = NULL;
mpol_new = NULL;
break;
} else
n->end = start;
}
if (!next)
break;
n = rb_entry(next, struct sp_node, nd);
}
if (new)
sp_insert(sp, new);
write_unlock(&sp->lock);
ret = 0;
err_out:
/* Release spares that were allocated but never used. */
if (mpol_new)
mpol_put(mpol_new);
if (n_new)
kmem_cache_free(sn_cache, n_new);
return ret;
alloc_new:
/* Drop the lock before allocating with GFP_KERNEL. */
write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
goto err_out;
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
/**
* mpol_shared_policy_init - initialize shared policy for inode
* @sp: pointer to inode shared policy
* @mpol: struct mempolicy to install
*
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
* This is called at get_inode() calls and we can use GFP_KERNEL.
*
* Every failure along the way is silent: the function returns void and
* simply leaves the tree empty, which stands for the default policy.
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
rwlock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
struct mempolicy *new;
NODEMASK_SCRATCH(scratch);
if (!scratch)
goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(new))
goto free_scratch; /* no valid nodemask intersection */
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
if (ret)
goto put_new;
/*
* Create pseudo-vma that contains just the policy.
* NOTE(review): only vm_end is set here, yet mpol_set_shared_policy()
* reads vm_pgoff and vma_pages(&pvma); this presumably relies on
* vma_init() zeroing the structure - confirm.
*/
vma_init(&pvma, NULL);
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
put_new:
mpol_put(new); /* drop initial ref */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
/*
 * Install @npol as the shared policy for the page-offset range covered by
 * @vma, i.e. [vm_pgoff, vm_pgoff + vma_pages(vma)).  A NULL @npol only
 * removes whatever policies currently cover that range.
 *
 * Returns 0 on success, -ENOMEM if the new node cannot be allocated, or the
 * error from shared_policy_replace(), in which case the new node is freed.
 */
int mpol_set_shared_policy(struct shared_policy *info,
			struct vm_area_struct *vma, struct mempolicy *npol)
{
	struct sp_node *new = NULL;
	unsigned long sz = vma_pages(vma);
	int err;

	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
		 vma->vm_pgoff,
		 sz, npol ? npol->mode : -1,
		 npol ? npol->flags : -1,
		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);

	if (npol) {
		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
		if (!new)
			return -ENOMEM;
	}

	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
	if (!err)
		return 0;

	/* The node never made it into the tree; release it here. */
	if (new)
		sp_free(new);
	return err;
}
/*
 * Free a backing policy store on inode delete.
 *
 * Every node in the tree is removed and freed under sp->lock held for
 * writing.  An empty tree is left alone without taking the lock.
 */
void mpol_free_shared_policy(struct shared_policy *p)
{
	struct rb_node *cursor;

	if (!p->root.rb_node)
		return;

	write_lock(&p->lock);
	cursor = rb_first(&p->root);
	while (cursor) {
		struct sp_node *victim = rb_entry(cursor, struct sp_node, nd);

		/* Advance before the node is erased and freed. */
		cursor = rb_next(cursor);
		sp_delete(p, victim);
	}
	write_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
/* Set by setup_numabalancing(): 1 forces enable, -1 forces disable, 0 unset. */
static int __initdata numabalancing_override;

/*
 * Apply the boot-time automatic NUMA balancing setting.  An explicit
 * numa_balancing= override always wins; without one, the build-time default
 * is applied (and logged) only when more than one node is online.
 */
static void __init check_numabalancing_enable(void)
{
	const bool enable_by_default =
		IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override) {
		set_numabalancing_state(numabalancing_override == 1);
		return;
	}

	if (num_online_nodes() > 1) {
		pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
			enable_by_default ? "Enabling" : "Disabling");
		set_numabalancing_state(enable_by_default);
	}
}
/*
 * Handler for the "numa_balancing=" boot parameter.  Accepts exactly
 * "enable" or "disable" and records the choice in numabalancing_override.
 *
 * Returns 1 if the value was recognised; otherwise logs a warning and
 * returns 0.
 */
static int __init setup_numabalancing(char *str)
{
	if (str) {
		if (!strcmp(str, "enable")) {
			numabalancing_override = 1;
			return 1;
		}
		if (!strcmp(str, "disable")) {
			numabalancing_override = -1;
			return 1;
		}
	}

	pr_warn("Unable to parse numa_balancing=\n");
	return 0;
}
__setup("numa_balancing=", setup_numabalancing);
#else
/* No-op stand-in used when CONFIG_NUMA_BALANCING is not enabled. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */
/*
* Boot-time initialisation of the mempolicy subsystem: creates the slab
* caches for policies and shared-policy nodes, fills in the per-node
* preferred_node_policy[] table, installs an interleave policy for system
* init and finally applies the NUMA balancing setting.
*
* assumes fs == KERNEL_DS
*/
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
unsigned long largest = 0;
int nid, prefer = 0;
/* SLAB_PANIC: cache creation failure panics rather than returning NULL. */
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
/* One MPOL_PREFERRED policy per node, flagged MPOL_F_MOF | MPOL_F_MORON. */
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.v = { .preferred_node = nid, },
};
}
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
if (largest < total_pages) {
largest = total_pages;
prefer = nid;
}
/* Interleave this node? */
if ((total_pages << PAGE_SHIFT) >= (16 << 20))
node_set(nid, interleave_nodes);
}
/* All too small, use the largest */
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
/* Failure is only logged; initialisation carries on regardless. */
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
/*
 * Reset policy of current process to default by installing MPOL_DEFAULT
 * through do_set_mempolicy(); the call's return value is not checked.
 */
void numa_default_policy(void)
{
do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}
/*
* Parse and format mempolicy from/to strings
*/
/*
 * Mode names indexed by MPOL_* value: mpol_parse_str() matches its input
 * against these strings and mpol_to_str() prints them.
 *
 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
 */
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
[MPOL_LOCAL] = "local",
};
#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * @str is split in place while parsing (the '=' and ':' separators are
 * temporarily overwritten with NUL) and is put back together before
 * returning, on both the success and the failure path.
 *
 * For the "default" mode *@mpol is set to NULL on success.
 *
 * On success, returns 0, else 1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
	struct mempolicy *pol = NULL;
	unsigned short mode;
	unsigned short mode_flags = 0;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int ret = 1;

	/* Cut the flags off first so the mode string is NUL-terminated */
	if (flags)
		*flags++ = '\0';

	if (!nodelist) {
		nodes_clear(nodes);
	} else {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		/* The list must parse and only name nodes that have memory */
		if (nodelist_parse(nodelist, nodes) ||
		    !nodes_subset(nodes, node_states[N_MEMORY]))
			goto out;
	}

	/* Look the mode name up in policy_modes[] */
	mode = 0;
	while (mode < MPOL_MAX && strcmp(str, policy_modes[mode]) != 0)
		mode++;
	if (mode >= MPOL_MAX)
		goto out;

	switch (mode) {
	case MPOL_DEFAULT:
		/*
		 * Insist on an empty nodelist; nothing else is parsed and
		 * no policy is allocated for this mode.
		 */
		if (!nodelist)
			ret = 0;
		goto out;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist; mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist
		 */
		if (!nodelist)
			goto out;
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_MEMORY];
		break;
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only, although later
		 * we use first_node(nodes) to grab a single node, so here
		 * nodelist (or nodes) cannot be empty.
		 */
		if (nodelist) {
			const char *cursor = nodelist;

			while (isdigit(*cursor))
				cursor++;
			if (*cursor || nodes_empty(nodes))
				goto out;
		}
		break;
	}

	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (strcmp(flags, "static") == 0)
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (strcmp(flags, "relative") == 0)
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;
	}

	pol = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(pol))
		goto out;

	/*
	 * Save nodes for mpol_to_str() to show the tmpfs mount options
	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodelist)
			pol->v.preferred_node = first_node(nodes);
		else
			pol->flags |= MPOL_F_LOCAL;
	} else {
		pol->v.nodes = nodes;
	}

	/*
	 * Save nodes for contextualization: this will be used to "clone"
	 * the mempolicy in a specific context [cpuset] at a later time.
	 */
	pol->w.user_nodemask = nodes;

	ret = 0;

out:
	/* Restore string for error message */
	if (nodelist)
		*(nodelist - 1) = ':';
	if (flags)
		*(flags - 1) = '=';
	if (!ret)
		*mpol = pol;
	return ret;
}
#endif /* CONFIG_TMPFS */
/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
 * longest flag, "relative", and to display at least a few node ids.
 *
 * A NULL @pol, the default policy, and any policy carrying MPOL_F_MORON are
 * all printed as "default".
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
	char *p = buffer;
	nodemask_t nodes = NODE_MASK_NONE;
	unsigned short mode = MPOL_DEFAULT;
	unsigned short flags = 0;

	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
		mode = pol->mode;
		flags = pol->flags;
	}

	switch (mode) {
	case MPOL_DEFAULT:
		break;
	case MPOL_PREFERRED:
		/* MPOL_PREFERRED with MPOL_F_LOCAL is shown as "local" */
		if (flags & MPOL_F_LOCAL)
			mode = MPOL_LOCAL;
		else
			node_set(pol->v.preferred_node, nodes);
		break;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:
		nodes = pol->v.nodes;
		break;
	default:
		WARN_ON_ONCE(1);
		snprintf(p, maxlen, "unknown");
		return;
	}

	/*
	 * Use scnprintf() for every piece that advances @p: it returns the
	 * number of characters actually stored, so @p never moves past the
	 * end of @buffer and the remaining-space computation below cannot
	 * go negative when the output is truncated.
	 */
	p += scnprintf(p, maxlen, "%s", policy_modes[mode]);

	if (flags & MPOL_MODE_FLAGS) {
		p += scnprintf(p, buffer + maxlen - p, "=");

		/*
		 * Currently, the only defined flags are mutually exclusive
		 */
		if (flags & MPOL_F_STATIC_NODES)
			p += scnprintf(p, buffer + maxlen - p, "static");
		else if (flags & MPOL_F_RELATIVE_NODES)
			p += scnprintf(p, buffer + maxlen - p, "relative");
	}

	if (!nodes_empty(nodes))
		scnprintf(p, buffer + maxlen - p, ":%*pbl",
			  nodemask_pr_args(&nodes));
}