c2ec4b3fbc
* refs/heads/tmp-5da1114: Revert crypto changes from android-4.19.79-95 Revert "UPSTREAM: PM / wakeup updates" Revert "ANDROID: of: property: Enable of_devlink by default" Revert "UPSTREAM: dt-bindings: arm: coresight: Add support for coresight-loses-context-with-cpu" UPSTREAM: net: usbnet: Fix -Wcast-function-type UPSTREAM: USB: dummy-hcd: use usb_urb_dir_in instead of usb_pipein UPSTREAM: USB: dummy-hcd: increase max number of devices to 32 ANDROID: tty: serdev: Fix broken serial console input ANDROID: update kernel ABI (perf_event changes) BACKPORT: perf_event: Add support for LSM and SELinux checks UPSTREAM: iommu: Allow io-pgtable to be used outside of drivers/iommu/ ANDROID: update abi for 4.19.94 release ANDROID: update abi due to revert Revert "BACKPORT: perf_event: Add support for LSM and SELinux checks" UPSTREAM: selinux: sidtab reverse lookup hash table UPSTREAM: selinux: avoid atomic_t usage in sidtab UPSTREAM: selinux: check sidtab limit before adding a new entry UPSTREAM: selinux: fix context string corruption in convert_context() UPSTREAM: selinux: overhaul sidtab to fix bug and improve performance UPSTREAM: selinux: refactor mls_context_to_sid() and make it stricter UPSTREAM: selinux: use separate table for initial SID lookup UPSTREAM: selinux: make "selinux_policycap_names[]" const char * UPSTREAM: selinux: refactor sidtab conversion ANDROID: Update ABI representation ANDROID: GKI: clk: Don't disable unused clocks with sync state support ANDROID: GKI: clk: Add support for clock providers with sync state ANDROID: GKI: driver core: Add dev_has_sync_state() ANDROID: update kernel ABI representation BACKPORT: perf_event: Add support for LSM and SELinux checks ANDROID: update ABI representation UPSTREAM: exit: panic before exit_mm() on global init exit ANDROID: serdev: Fix platform device support ANDROID: Kconfig.gki: Add Hidden SPRD DRM configs ANDROID: gki_defconfig: Disable TRANSPARENT_HUGEPAGE ANDROID: gki_defconfig: Enable CONFIG_GNSS_CMDLINE_SERIAL ANDROID: gnss: Add command line test driver ANDROID: serdev: add platform device support ANDROID: gki_defconfig: enable ARM64_SW_TTBR0_PAN ANDROID: gki_defconfig: Set BINFMT_MISC as =m UPSTREAM: binder: fix incorrect calculation for num_valid ABI: Update ABI after f2fs merge ANDROID: add initial ABI whitelist for android-4.19 ANDROID: staging: android: ion: Fix build when CONFIG_ION_SYSTEM_HEAP=n ANDROID: staging: android: ion: Expose total heap and pool sizes via sysfs ANDROID: Update ABI representation due to vmstat counter changes UPSTREAM: include/linux/slab.h: fix sparse warning in kmalloc_type() UPSTREAM: mm, slab: shorten kmalloc cache names for large sizes UPSTREAM: mm, proc: add KReclaimable to /proc/meminfo UPSTREAM: mm: rename and change semantics of nr_indirectly_reclaimable_bytes UPSTREAM: dcache: allocate external names from reclaimable kmalloc caches UPSTREAM: mm, slab/slub: introduce kmalloc-reclaimable caches UPSTREAM: mm, slab: combine kmalloc_caches and kmalloc_dma_caches ANDROID: abi update for 4.19.89 ANDROID: update abi_gki_aarch64.xml for LTO, CFI, and SCS ANDROID: gki_defconfig: enable LTO, CFI, and SCS ANDROID: update abi_gki_aarch64.xml for CONFIG_GNSS ANDROID: cuttlefish_defconfig: Enable CONFIG_GNSS UPSTREAM: arm64: Validate tagged addresses in access_ok() called from kernel threads ANDROID: mm: Throttle rss_stat tracepoint UPSTREAM: mm: slub: really fix slab walking for init_on_free ANDROID: update abi_gki_aarch64.xml for nf change ANDROID: kbuild: limit LTO inlining ANDROID: kbuild: merge module sections with LTO ANDROID: netfilter: nf_nat: remove static from nf_nat_ipv4_fn UPSTREAM: drm/client: remove the exporting of drm_client_close ANDROID: f2fs: fix possible merge of unencrypted with encrypted I/O UPSTREAM: binder: Add binder_proc logging to binderfs UPSTREAM: binder: Make transaction_log available in binderfs UPSTREAM: binder: Add stats, state and transactions files UPSTREAM: binder: add a mount option to show global stats UPSTREAM: binder: Validate the default binderfs device names. UPSTREAM: binder: Add default binder devices through binderfs when configured UPSTREAM: binder: fix CONFIG_ANDROID_BINDER_DEVICES UPSTREAM: android: binder: use kstrdup instead of open-coding it UPSTREAM: binderfs: remove separate device_initcall() UPSTREAM: binderfs: respect limit on binder control creation UPSTREAM: binderfs: switch from d_add() to d_instantiate() UPSTREAM: binderfs: drop lock in binderfs_binder_ctl_create UPSTREAM: binderfs: kill_litter_super() before cleanup UPSTREAM: binderfs: rework binderfs_binder_device_create() UPSTREAM: binderfs: rework binderfs_fill_super() UPSTREAM: binderfs: prevent renaming the control dentry UPSTREAM: binderfs: remove outdated comment UPSTREAM: binderfs: fix error return code in binderfs_fill_super() UPSTREAM: binderfs: handle !CONFIG_IPC_NS builds UPSTREAM: binderfs: reserve devices for initial mount UPSTREAM: binderfs: rename header to binderfs.h UPSTREAM: binderfs: implement "max" mount option UPSTREAM: binderfs: make each binderfs mount a new instance UPSTREAM: binderfs: remove wrong kern_mount() call UPSTREAM: binder: implement binderfs UPSTREAM: binder: remove BINDER_DEBUG_ENTRY() ANDROID: Don't base allmodconfig on gki_defconfig ANDROID: Disable UNWINDER_ORC for allmodconfig ANDROID: update abi_gki_aarch64.xml for 4.19.87 BACKPORT: ARM: 8905/1: Emit __gnu_mcount_nc when using Clang 10.0.0 or newer ANDROID: update abi_gki_aarch64.xml ANDROID: gki_defconfig: =m's applied for virtio configs in arm64 UPSTREAM: of: property: Add device link support for interrupt-parent, dmas and -gpio(s) UPSTREAM: of: property: Add device link support for "iommu-map" UPSTREAM: of: property: Fix the semantics of of_is_ancestor_of() UPSTREAM: i2c: of: Populate fwnode in of_i2c_get_board_info() UPSTREAM: driver core: Clarify documentation for fwnode_operations.add_links() UPSTREAM: dt-bindings: arm: coresight: Add support for coresight-loses-context-with-cpu BACKPORT: coresight: etm4x: Save/restore state across CPU low power states ANDROID: Update ABI representation ANDROID: gki_defconfig: IIO=y f2fs: stop GC when the victim becomes fully valid f2fs: expose main_blkaddr in sysfs f2fs: choose hardlimit when softlimit is larger than hardlimit in f2fs_statfs_project() f2fs: Fix deadlock in f2fs_gc() context during atomic files handling f2fs: show f2fs instance in printk_ratelimited f2fs: fix potential overflow f2fs: fix to update dir's i_pino during cross_rename f2fs: support aligned pinned file f2fs: avoid kernel panic on corruption test f2fs: fix wrong description in document f2fs: cache global IPU bio f2fs: fix to avoid memory leakage in f2fs_listxattr f2fs: check total_segments from devices in raw_super f2fs: update multi-dev metadata in resize_fs f2fs: mark recovery flag correctly in read_raw_super_block() f2fs: fix to update time in lazytime mode vfs: don't allow writes to swap files mm: set S_SWAPFILE on blockdev swap devices BACKPORT: ARM: 8900/1: UNWINDER_FRAME_POINTER implementation for Clang ANDROID: update abi_gki_aarch64.xml for 4.19.87 ANDROID: gki_defconfig: FW_CACHE to no FROMGIT: firmware_class: make firmware caching configurable FROMLIST: arm64: implement Shadow Call Stack FROMLIST: arm64: disable SCS for hypervisor code BACKPORT: FROMLIST: arm64: vdso: disable Shadow Call Stack FROMLIST: arm64: efi: restore x18 if it was corrupted FROMLIST: arm64: preserve x18 when CPU is suspended FROMLIST: arm64: reserve x18 from general allocation with SCS FROMLIST: arm64: disable function graph tracing with SCS FROMLIST: scs: add support for stack usage debugging FROMLIST: scs: add accounting FROMLIST: add support for Clang's Shadow Call Stack (SCS) FROMLIST: arm64: kernel: avoid x18 in __cpu_soft_restart FROMLIST: arm64: kvm: stop treating register x18 as caller save FROMLIST: arm64/lib: copy_page: avoid x18 register in assembler code FROMLIST: arm64: mm: avoid x18 in idmap_kpti_install_ng_mappings ANDROID: use non-canonical CFI jump tables ANDROID: arm64: add __nocfi to __apply_alternatives ANDROID: arm64: add __pa_function ANDROID: arm64: allow ThinLTO to be selected ANDROID: soc/tegra: disable ARCH_TEGRA_210_SOC with LTO FROMLIST: arm64: fix alternatives with LLVM's integrated assembler ANDROID: irqchip/gic-v3: rename gic_of_init to work around a ThinLTO+CFI bug ANDROID: init: ensure initcall ordering with LTO Revert "ANDROID: init: ensure initcall ordering with LTO" ANDROID: add support for ThinLTO ANDROID: clang: update to 10.0.1 ANDROID: gki_defconfig: enable CONFIG_REGULATOR_FIXED_VOLTAGE ANDROID: gki_defconfig: removed CONFIG_PM_WAKELOCKS ANDROID: gki_defconfig: enable CONFIG_IKHEADERS as m FROMGIT: pinctrl: devicetree: Avoid taking direct reference to device name string ANDROID: update abi_gki_aarch64.xml for 4.19.86 update ANDROID: Update ABI representation ANDROID: gki_defconfig: disable FUNCTION_TRACER ANDROID: Update the ABI representation ANDROID: update ABI representation ANDROID: add unstripped modules to the distribution FROMLIST: vsprintf: Inline call to ptr_to_hashval UPSTREAM: rss_stat: Add support to detect RSS updates of external mm UPSTREAM: mm: emit tracepoint when RSS changes FROMGIT: driver core: Allow device link operations inside sync_state() ANDROID: uid_sys_stats: avoid double accounting of dying threads ANDROID: scsi: ufs-qcom: Enable BROKEN_CRYPTO quirk flag ANDROID: scsi: ufs-hisi: Enable BROKEN_CRYPTO quirk flag ANDROID: scsi: ufs: Add quirk bit for controllers that don't play well with inline crypto ANDROID: scsi: ufs: UFS init should not require inline crypto ANDROID: scsi: ufs: UFS crypto variant operations API ANDROID: gki_defconfig: enable inline encryption BACKPORT: FROMLIST: ext4: add inline encryption support BACKPORT: FROMLIST: f2fs: add inline encryption support BACKPORT: FROMLIST: fscrypt: add inline encryption support BACKPORT: FROMLIST: scsi: ufs: Add inline encryption support to UFS BACKPORT: FROMLIST: scsi: ufs: UFS crypto API BACKPORT: FROMLIST: scsi: ufs: UFS driver v2.1 spec crypto additions BACKPORT: FROMLIST: block: blk-crypto for Inline Encryption ANDROID: block: Fix bio_crypt_should_process WARN_ON BACKPORT: FROMLIST: block: Add encryption context to struct bio BACKPORT: FROMLIST: block: Keyslot Manager for Inline Encryption FROMLIST: f2fs: add support for IV_INO_LBLK_64 encryption policies FROMLIST: ext4: add support for IV_INO_LBLK_64 encryption policies BACKPORT: FROMLIST: fscrypt: add support for IV_INO_LBLK_64 policies FROMLIST: fscrypt: zeroize fscrypt_info before freeing FROMLIST: fscrypt: remove struct fscrypt_ctx BACKPORT: FROMLIST: fscrypt: invoke crypto API for ESSIV handling ANDROID: build kernels with llvm-nm and llvm-objcopy ANDROID: Fix allmodconfig build with CC=clang UPSTREAM: mm/page_poison: expose page_poisoning_enabled to kernel modules FROMGIT: of: property: Add device link support for iommus, mboxes and io-channels FROMGIT: of: property: Make it easy to add device links from DT properties FROMGIT: of: property: Minor style clean up of of_link_to_phandle() Revert "ANDROID: of/property: Add device link support for iommus" ANDROID: Add allmodconfig build.configs for x86_64 and aarch64 ANDROID: fix allmodconfig build ANDROID: nf: IDLETIMER: Fix possible use before initialization in idletimer_resume BACKPORT: coresight: funnel: Support static funnel BACKPORT:FROMGIT: coresight: replicator: Fix missing spin_lock_init() BACKPORT:FROMGIT: coresight: funnel: Fix missing spin_lock_init() BACKPORT:FROMGIT: coresight: Serialize enabling/disabling a link device. UPSTREAM: coresight: tmc-etr: Add barrier packets when moving offset forward UPSTREAM: coresight: tmc-etr: Decouple buffer sync and barrier packet insertion UPSTREAM: coresight: tmc: Make memory width mask computation into a function UPSTREAM: coresight: tmc-etr: Fix perf_data check UPSTREAM: coresight: tmc-etr: Fix updating buffer in not-snapshot mode. UPSTREAM: coresight: tmc-etr: Check if non-secure access is enabled UPSTREAM: coresight: tmc-etr: Handle memory errors BACKPORT: coresight: etr_buf: Consolidate refcount initialization UPSTREAM: coresight: Fix DEBUG_LOCKS_WARN_ON for uninitialized attribute UPSTREAM: coresight: Use coresight device names for sinks in PMU attribute UPSTREAM: coresight: tmc-etr: alloc_perf_buf: Do not call smp_processor_id from preemptible UPSTREAM: coresight: tmc-etr: Do not call smp_processor_id() from preemptible UPSTREAM: coresight: perf: Don't set the truncated flag in snapshot mode UPSTREAM: coresight: tmc-etf: Fix snapshot mode update function UPSTREAM: coresight: tmc-etr: Properly set AUX buffer head in snapshot mode UPSTREAM: coresight: tmc-etr: Add support for CPU-wide trace scenarios UPSTREAM: coresight: tmc-etr: Allocate and free ETR memory buffers for CPU-wide scenarios UPSTREAM: coresight: tmc-etr: Introduce the notion of IDR to ETR devices UPSTREAM: coresight: tmc-etr: Introduce the notion of reference counting to ETR devices UPSTREAM: coresight: tmc-etr: Introduce the notion of process ID to ETR devices UPSTREAM: coresight: tmc-etr: Create per-thread buffer allocation function UPSTREAM: coresight: tmc-etr: Refactor function tmc_etr_setup_perf_buf() UPSTREAM: coresight: Communicate perf event to sink buffer allocation functions UPSTREAM: coresight: perf: Refactor function free_event_data() UPSTREAM: coresight: perf: Clean up function etm_setup_aux() UPSTREAM: coresight: Properly address concurrency in sink::update() functions UPSTREAM: coresight: Properly address errors in sink::disable() functions UPSTREAM: coresight: Move reference counting inside sink drivers UPSTREAM: coresight: Adding return code to sink::disable() operation UPSTREAM: coresight: etm4x: Configure tracers to emit timestamps UPSTREAM: coresight: etm4x: Skip selector pair 0 UPSTREAM: coresight: etm4x: Add kernel configuration for CONTEXTID UPSTREAM: coresight: pmu: Adding ITRACE property to cs_etm PMU UPSTREAM: coresight: tmc: Cleanup power management UPSTREAM: coresight: Fix freeing up the coresight connections UPSTREAM: coresight: tmc: Report DMA setup failures UPSTREAM: coresight: catu: fix clang build warning UPSTREAM: perf/core: Fix the address filtering fix UPSTREAM: perf, pt, coresight: Fix address filters for vmas with non-zero offset UPSTREAM: perf: Copy parent's address filter offsets on clone UPSTREAM: coresight: Use event attributes for sink selection UPSTREAM: coresight: perf: Add "sinks" group to PMU directory UPSTREAM: coresight: etb10: Add support for CLAIM tag UPSTREAM: coreisght: tmc: Claim device before use UPSTREAM: coresight: dynamic-replicator: Claim device for use UPSTREAM: coresight: funnel: Claim devices before use UPSTREAM: coresight: etmx: Claim devices before use UPSTREAM: coresight: Add support for CLAIM tag protocol UPSTREAM: coresight: dynamic-replicator: Handle multiple connections UPSTREAM: coresight: etb10: Handle errors enabling the device UPSTREAM: coresight: etm3: Add support for handling errors UPSTREAM: coresight: etm4x: Add support for handling errors UPSTREAM: coresight: tmc-etb/etf: Prepare to handle errors enabling UPSTREAM: coresight: tmc-etr: Handle errors enabling CATU UPSTREAM: coresight: tmc-etr: Refactor for handling errors UPSTREAM: coresight: Handle failures in enabling a trace path UPSTREAM: coresight: tmc: Fix byte-address alignment for RRP UPSTREAM: coresight: etm4x: Configure EL2 exception level when kernel is running in HYP UPSTREAM: coresight: etb10: Splitting function etb_enable() UPSTREAM: coresight: etb10: Refactor etb_drvdata::mode handling UPSTREAM: coresight: etm-perf: Add support for ETR backend UPSTREAM: coresight: perf: Remove set_buffer call back UPSTREAM: coresight: perf: Add helper to retrieve sink configuration UPSTREAM: coresight: perf: Remove reset_buffer call back for sinks UPSTREAM: coresight: Convert driver messages to dev_dbg UPSTREAM: coresight: tmc-etr: Relax collection of trace from sysfs mode UPSTREAM: coresight: tmc-etr: Handle driver mode specific ETR buffers UPSTREAM: coresight: perf: Disable trace path upon source error UPSTREAM: coresight: perf: Allow tracing on hotplugged CPUs UPSTREAM: coresight: perf: Avoid unncessary CPU hotplug read lock UPSTREAM: coresight: perf: Fix per cpu path management UPSTREAM: coresight: Fix handling of sinks UPSTREAM: coresight: Use ERR_CAST instead of ERR_PTR UPSTREAM: coresight: Fix remote endpoint parsing UPSTREAM: coresight: platform: Fix leaking device reference UPSTREAM: coresight: platform: Fix refcounting for graph nodes UPSTREAM: coresight: platform: Refactor graph endpoint parsing UPSTREAM: coresight: Document error handling in coresight_register ANDROID: regression introduced override_creds=off ANDROID: overlayfs: internal getxattr operations without sepolicy checking ANDROID: overlayfs: add __get xattr method ANDROID: Add optional __get xattr method paired to __vfs_getxattr UPSTREAM: scsi: ufs: override auto suspend tunables for ufs UPSTREAM: scsi: core: allow auto suspend override by low-level driver FROMGIT: of: property: Skip adding device links to suppliers that aren't devices ANDROID: gki_defconfig: enable CONFIG_KEYBOARD_GPIO UPSTREAM: dm bufio: introduce a global cache replacement UPSTREAM: dm bufio: remove old-style buffer cleanup UPSTREAM: dm bufio: introduce a global queue UPSTREAM: dm bufio: refactor adjust_total_allocated UPSTREAM: dm bufio: call adjust_total_allocated from __link_buffer and __unlink_buffer ANDROID: dummy_cpufreq: Implement get() ANDROID: gki_defconfig: enable CONFIG_CPUSETS ANDROID: virtio: virtio_input: Set the amount of multitouch slots in virtio input rtlwifi: Fix potential overflow on P2P code ANDROID: cpufreq: create dummy cpufreq driver ANDROID: Allow DRM_IOCTL_MODE_*_DUMB for render clients. Cuttlefish Wifi: Add data ops in virt_wifi driver for scan data simulation ANDROID: of: property: Enable of_devlink by default ANDROID: of: property: Make sure child dependencies don't block probing of parent ANDROID: driver core: Allow fwnode_operations.add_links to differentiate errors ANDROID: driver core: Allow a device to wait on optional suppliers ANDROID: driver core: Add device link support for SYNC_STATE_ONLY flag FROMGIT: docs: driver-model: Add documentation for sync_state FROMGIT: driver: core: Improve documentation for fwnode_operations.add_links() FROMGIT: of: property: Minor code formatting/style clean ups ANDROID: of/property: Add device link support for iommus ANDROID: move up spin_unlock_bh() ahead of remove_proc_entry() BACKPORT: arm64: tags: Preserve tags for addresses translated via TTBR1 UPSTREAM: arm64: memory: Implement __tag_set() as common function UPSTREAM: arm64/mm: fix variable 'tag' set but not used UPSTREAM: arm64: avoid clang warning about self-assignment ANDROID: sdcardfs: evict dentries on fscrypt key removal ANDROID: fscrypt: add key removal notifier chain ANDROID: refactor build.config files to remove duplication ANDROID: Move from clang r353983c to r365631c ANDROID: gki_defconfig: remove PWRSEQ_EMMC and PWRSEQ_SIMPLE ANDROID: unconditionally compile sig_ok in struct module ANDROID: gki_defconfig: enable fs-verity UPSTREAM: mm: vmalloc: show number of vmalloc pages in /proc/meminfo BACKPORT: PM/sleep: Expose suspend stats in sysfs UPSTREAM: power: supply: Init device wakeup after device_add() UPSTREAM: PM / wakeup: Unexport wakeup_source_sysfs_{add,remove}() UPSTREAM: PM / wakeup: Register wakeup class kobj after device is added UPSTREAM: PM / wakeup: Fix sysfs registration error path UPSTREAM: PM / wakeup: Show wakeup sources stats in sysfs UPSTREAM: PM / wakeup: Use wakeup_source_register() in wakelock.c UPSTREAM: PM / wakeup: Drop wakeup_source_init(), wakeup_source_prepare() UPSTREAM: PM / wakeup: Drop wakeup_source_drop() UPSTREAM: PM / core: Add support to skip power management in device/driver model gki_defconfig: Enable CONFIG_DM_SNAPSHOT ANDROID: gki_defconfig: enable accelerated AES and SHA-256 ANDROID: fix overflow in /proc/uid_cputime/remove_uid_range ANDROID: kasan: fix has_attribute check on older GCC versions ANDROID: gki_defconfig: enable CONFIG_PARAVIRT and CONFIG_HYPERVISOR_GUEST ANDROID: gki_defconfig: enable CONFIG_NLS_* ANDROID: gki_defconfig: Enable BPF_JIT and BPF_JIT_ALWAYS_ON FROMGIT: of: property: Create device links for all child-supplier depencencies FROMGIT: of/platform: Pause/resume sync state during init and of_platform_populate() BACKPORT: FROMGIT: driver core: Add sync_state driver/bus callback BACKPORT: FROMGIT: of: property: Add functional dependency link from DT bindings FROMGIT: driver core: Add support for linking devices during device addition FROMGIT: driver core: Add fwnode_to_dev() to look up device from fwnode UPSTREAM: mm: untag user pointers in mmap/munmap/mremap/brk UPSTREAM: vfio/type1: untag user pointers in vaddr_get_pfn UPSTREAM: tee/shm: untag user pointers in tee_shm_register UPSTREAM: media/v4l2-core: untag user pointers in videobuf_dma_contig_user_get UPSTREAM: drm/radeon: untag user pointers in radeon_gem_userptr_ioctl BACKPORT: drm/amdgpu: untag user pointers UPSTREAM: userfaultfd: untag user pointers UPSTREAM: fs/namespace: untag user pointers in copy_mount_options UPSTREAM: mm: untag user pointers in get_vaddr_frames UPSTREAM: mm: untag user pointers in mm/gup.c UPSTREAM: mm: untag user pointers passed to memory syscalls BACKPORT: lib: untag user pointers in strn*_user UPSTREAM: arm64: Fix reference to docs for ARM64_TAGGED_ADDR_ABI UPSTREAM: selftests, arm64: add kernel headers path for tags_test BACKPORT: arm64: Relax Documentation/arm64/tagged-pointers.rst UPSTREAM: arm64: Define Documentation/arm64/tagged-address-abi.rst UPSTREAM: arm64: Change the tagged_addr sysctl control semantics to only prevent the opt-in UPSTREAM: arm64: Tighten the PR_{SET, GET}_TAGGED_ADDR_CTRL prctl() unused arguments UPSTREAM: selftests, arm64: fix uninitialized symbol in tags_test.c UPSTREAM: arm64: mm: Really fix sparse warning in untagged_addr() UPSTREAM: selftests, arm64: add a selftest for passing tagged pointers to kernel BACKPORT: arm64: Introduce prctl() options to control the tagged user addresses ABI UPSTREAM: arm64: untag user pointers in access_ok and __uaccess_mask_ptr UPSTREAM: uaccess: add noop untagged_addr definition BACKPORT: block: annotate refault stalls from IO submission f2fs: add a condition to detect overflow in f2fs_ioc_gc_range() f2fs: fix to add missing F2FS_IO_ALIGNED() condition f2fs: fix to fallback to buffered IO in IO aligned mode f2fs: fix to handle error path correctly in f2fs_map_blocks f2fs: fix extent corrupotion during directIO in LFS mode f2fs: check all the data segments against all node ones f2fs: Add a small clarification to CONFIG_FS_F2FS_FS_SECURITY f2fs: fix inode rwsem regression f2fs: fix to avoid accessing uninitialized field of inode page in is_alive() f2fs: avoid infinite GC loop due to stale atomic files f2fs: Fix indefinite loop in f2fs_gc() f2fs: convert inline_data in prior to i_size_write f2fs: fix error path of f2fs_convert_inline_page() f2fs: add missing documents of reserve_root/resuid/resgid f2fs: fix flushing node pages when checkpoint is disabled f2fs: enhance f2fs_is_checkpoint_ready()'s readability f2fs: clean up __bio_alloc()'s parameter f2fs: fix wrong error injection path in inc_valid_block_count() f2fs: fix to writeout dirty inode during node flush f2fs: optimize case-insensitive lookups f2fs: introduce f2fs_match_name() for cleanup f2fs: Fix indefinite loop in f2fs_gc() f2fs: allocate memory in batch in build_sit_info() f2fs: support FS_IOC_{GET,SET}FSLABEL f2fs: fix to avoid data corruption by forbidding SSR overwrite f2fs: Fix build error while CONFIG_NLS=m Revert "f2fs: avoid out-of-range memory access" f2fs: cleanup the code in build_sit_entries. f2fs: fix wrong available node count calculation f2fs: remove duplicate code in f2fs_file_write_iter f2fs: fix to migrate blocks correctly during defragment f2fs: use wrapped f2fs_cp_error() f2fs: fix to use more generic EOPNOTSUPP f2fs: use wrapped IS_SWAPFILE() f2fs: Support case-insensitive file name lookups f2fs: include charset encoding information in the superblock fs: Reserve flag for casefolding f2fs: fix to avoid call kvfree under spinlock fs: f2fs: Remove unnecessary checks of SM_I(sbi) in update_general_status() f2fs: disallow direct IO in atomic write f2fs: fix to handle quota_{on,off} correctly f2fs: fix to detect cp error in f2fs_setxattr() f2fs: fix to spread f2fs_is_checkpoint_ready() f2fs: support fiemap() for directory inode f2fs: fix to avoid discard command leak f2fs: fix to avoid tagging SBI_QUOTA_NEED_REPAIR incorrectly f2fs: fix to drop meta/node pages during umount f2fs: disallow switching io_bits option during remount f2fs: fix panic of IO alignment feature f2fs: introduce {page,io}_is_mergeable() for readability f2fs: fix livelock in swapfile writes f2fs: add fs-verity support ext4: update on-disk format documentation for fs-verity ext4: add fs-verity read support ext4: add basic fs-verity support fs-verity: support builtin file signatures fs-verity: add SHA-512 support fs-verity: implement FS_IOC_MEASURE_VERITY ioctl fs-verity: implement FS_IOC_ENABLE_VERITY ioctl fs-verity: add data verification hooks for ->readpages() fs-verity: add the hook for file ->setattr() fs-verity: add the hook for file ->open() fs-verity: add inode and superblock fields fs-verity: add Kconfig and the helper functions for hashing fs: uapi: define verity bit for FS_IOC_GETFLAGS fs-verity: add UAPI header fs-verity: add MAINTAINERS file entry fs-verity: add a documentation file ext4: fix kernel oops caused by spurious casefold flag ext4: fix coverity warning on error path of filename setup ext4: optimize case-insensitive lookups ext4: fix dcache lookup of !casefolded directories unicode: update to Unicode 12.1.0 final unicode: add missing check for an error return from utf8lookup() ext4: export /sys/fs/ext4/feature/casefold if Unicode support is present unicode: refactor the rule for regenerating utf8data.h ext4: Support case-insensitive file name lookups ext4: include charset encoding information in the superblock unicode: update unicode database unicode version 12.1.0 unicode: introduce test module for normalized utf8 implementation unicode: implement higher level API for string handling unicode: reduce the size of utf8data[] unicode: introduce code for UTF-8 normalization unicode: introduce UTF-8 character database ext4 crypto: fix to check feature status before get policy fscrypt: document the new ioctls and policy version ubifs: wire up new fscrypt ioctls f2fs: wire up new fscrypt ioctls ext4: wire up new fscrypt ioctls fscrypt: require that key be added when setting a v2 encryption policy fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS ioctl fscrypt: allow unprivileged users to add/remove keys for v2 policies fscrypt: v2 encryption policy support fscrypt: add an HKDF-SHA512 implementation fscrypt: add FS_IOC_GET_ENCRYPTION_KEY_STATUS ioctl fscrypt: add FS_IOC_REMOVE_ENCRYPTION_KEY ioctl fscrypt: add FS_IOC_ADD_ENCRYPTION_KEY ioctl fscrypt: rename keyinfo.c to keysetup.c fscrypt: move v1 policy key setup to keysetup_v1.c fscrypt: refactor key setup code in preparation for v2 policies fscrypt: rename fscrypt_master_key to fscrypt_direct_key fscrypt: add ->ci_inode to fscrypt_info fscrypt: use FSCRYPT_* definitions, not FS_* fscrypt: use FSCRYPT_ prefix for uapi constants fs, fscrypt: move uapi definitions to new header <linux/fscrypt.h> fscrypt: use ENOPKG when crypto API support missing fscrypt: improve warnings for missing crypto API support fscrypt: improve warning messages for unsupported encryption contexts fscrypt: make fscrypt_msg() take inode instead of super_block fscrypt: clean up base64 encoding/decoding fscrypt: remove loadable module related code Updated following files to fix build errors: drivers/gpu/msm/kgsl_pool.c drivers/hwtracing/coresight/coresight-dummy.c drivers/iommu/dma-mapping-fast.c drivers/iommu/io-pgtable-fast.c drivers/iommu/io-pgtable-msm-secure.c kernel/taskstats.c mm/vmalloc.c security/selinux/ss/sidtab.h Conflicts: arch/arm/Makefile arch/arm64/Kconfig arch/x86/include/asm/syscall_wrapper.h build.config.common drivers/clk/clk.c drivers/hwtracing/coresight/coresight-etm-perf.c drivers/hwtracing/coresight/coresight-funnel.c drivers/hwtracing/coresight/coresight-tmc-etf.c drivers/hwtracing/coresight/coresight-tmc-etr.c drivers/hwtracing/coresight/coresight-tmc.c drivers/hwtracing/coresight/coresight-tmc.h drivers/hwtracing/coresight/coresight.c drivers/hwtracing/coresight/of_coresight.c drivers/iommu/arm-smmu.c drivers/iommu/io-pgtable-arm.c drivers/iommu/io-pgtable.c drivers/scsi/scsi_sysfs.c drivers/scsi/sd.c drivers/scsi/ufs/ufshcd.c drivers/scsi/ufs/ufshcd.h drivers/staging/android/ion/ion.c drivers/staging/android/ion/ion.h drivers/staging/android/ion/ion_page_pool.c fs/ext4/readpage.c fs/f2fs/data.c fs/f2fs/f2fs.h fs/f2fs/file.c fs/f2fs/segment.c fs/f2fs/super.c include/linux/clk-provider.h include/linux/compiler_types.h include/linux/coresight.h include/linux/mmzone.h include/scsi/scsi_device.h include/trace/events/kmem.h kernel/events/core.c kernel/sched/core.c mm/vmstat.c Change-Id: I2eca52b08b484f2b5c30437671cab8cb0195b8d6 Signed-off-by: Ivaylo Georgiev <irgeorgiev@codeaurora.org>
3498 lines
87 KiB
C
3498 lines
87 KiB
C
/*
|
|
* linux/mm/vmalloc.c
|
|
*
|
|
* Copyright (C) 1993 Linus Torvalds
|
|
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
|
|
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
|
|
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
|
|
* Numa awareness, Christoph Lameter, SGI, June 2005
|
|
*/
|
|
|
|
#include <linux/vmalloc.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/module.h>
|
|
#include <linux/highmem.h>
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/interrupt.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/seq_file.h>
|
|
#include <linux/debugobjects.h>
|
|
#include <linux/kallsyms.h>
|
|
#include <linux/list.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/rbtree.h>
|
|
#include <linux/radix-tree.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/pfn.h>
|
|
#include <linux/kmemleak.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/llist.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/rbtree_augmented.h>
|
|
|
|
#include <linux/uaccess.h>
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/shmparam.h>
|
|
|
|
#include "internal.h"
|
|
|
|
struct vfree_deferred {
|
|
struct llist_head list;
|
|
struct work_struct wq;
|
|
};
|
|
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
|
|
|
|
static void __vunmap(const void *, int);
|
|
|
|
static void free_work(struct work_struct *w)
|
|
{
|
|
struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
|
|
struct llist_node *t, *llnode;
|
|
|
|
llist_for_each_safe(llnode, t, llist_del_all(&p->list))
|
|
__vunmap((void *)llnode, 1);
|
|
}
|
|
|
|
/*** Page table manipulation functions ***/
|
|
|
|
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
|
|
{
|
|
pte_t *pte;
|
|
|
|
pte = pte_offset_kernel(pmd, addr);
|
|
do {
|
|
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
|
|
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
|
|
} while (pte++, addr += PAGE_SIZE, addr != end);
|
|
}
|
|
|
|
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
|
|
{
|
|
pmd_t *pmd;
|
|
unsigned long next;
|
|
|
|
pmd = pmd_offset(pud, addr);
|
|
do {
|
|
next = pmd_addr_end(addr, end);
|
|
if (pmd_clear_huge(pmd))
|
|
continue;
|
|
if (pmd_none_or_clear_bad(pmd))
|
|
continue;
|
|
vunmap_pte_range(pmd, addr, next);
|
|
} while (pmd++, addr = next, addr != end);
|
|
}
|
|
|
|
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
|
|
{
|
|
pud_t *pud;
|
|
unsigned long next;
|
|
|
|
pud = pud_offset(p4d, addr);
|
|
do {
|
|
next = pud_addr_end(addr, end);
|
|
if (pud_clear_huge(pud))
|
|
continue;
|
|
if (pud_none_or_clear_bad(pud))
|
|
continue;
|
|
vunmap_pmd_range(pud, addr, next);
|
|
} while (pud++, addr = next, addr != end);
|
|
}
|
|
|
|
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
|
|
{
|
|
p4d_t *p4d;
|
|
unsigned long next;
|
|
|
|
p4d = p4d_offset(pgd, addr);
|
|
do {
|
|
next = p4d_addr_end(addr, end);
|
|
if (p4d_clear_huge(p4d))
|
|
continue;
|
|
if (p4d_none_or_clear_bad(p4d))
|
|
continue;
|
|
vunmap_pud_range(p4d, addr, next);
|
|
} while (p4d++, addr = next, addr != end);
|
|
}
|
|
|
|
static void vunmap_page_range(unsigned long addr, unsigned long end)
|
|
{
|
|
pgd_t *pgd;
|
|
unsigned long next;
|
|
|
|
BUG_ON(addr >= end);
|
|
pgd = pgd_offset_k(addr);
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
if (pgd_none_or_clear_bad(pgd))
|
|
continue;
|
|
vunmap_p4d_range(pgd, addr, next);
|
|
} while (pgd++, addr = next, addr != end);
|
|
}
|
|
|
|
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
|
|
{
|
|
pte_t *pte;
|
|
|
|
/*
|
|
* nr is a running index into the array which helps higher level
|
|
* callers keep track of where we're up to.
|
|
*/
|
|
|
|
pte = pte_alloc_kernel(pmd, addr);
|
|
if (!pte)
|
|
return -ENOMEM;
|
|
do {
|
|
struct page *page = pages[*nr];
|
|
|
|
if (WARN_ON(!pte_none(*pte)))
|
|
return -EBUSY;
|
|
if (WARN_ON(!page))
|
|
return -ENOMEM;
|
|
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
|
|
(*nr)++;
|
|
} while (pte++, addr += PAGE_SIZE, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
|
|
{
|
|
pmd_t *pmd;
|
|
unsigned long next;
|
|
|
|
pmd = pmd_alloc(&init_mm, pud, addr);
|
|
if (!pmd)
|
|
return -ENOMEM;
|
|
do {
|
|
next = pmd_addr_end(addr, end);
|
|
if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
|
|
return -ENOMEM;
|
|
} while (pmd++, addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
|
|
{
|
|
pud_t *pud;
|
|
unsigned long next;
|
|
|
|
pud = pud_alloc(&init_mm, p4d, addr);
|
|
if (!pud)
|
|
return -ENOMEM;
|
|
do {
|
|
next = pud_addr_end(addr, end);
|
|
if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
|
|
return -ENOMEM;
|
|
} while (pud++, addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
|
|
unsigned long end, pgprot_t prot, struct page **pages, int *nr)
|
|
{
|
|
p4d_t *p4d;
|
|
unsigned long next;
|
|
|
|
p4d = p4d_alloc(&init_mm, pgd, addr);
|
|
if (!p4d)
|
|
return -ENOMEM;
|
|
do {
|
|
next = p4d_addr_end(addr, end);
|
|
if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
|
|
return -ENOMEM;
|
|
} while (p4d++, addr = next, addr != end);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
|
|
* will have pfns corresponding to the "pages" array.
|
|
*
|
|
* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
|
|
*/
|
|
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
|
|
pgprot_t prot, struct page **pages)
|
|
{
|
|
pgd_t *pgd;
|
|
unsigned long next;
|
|
unsigned long addr = start;
|
|
int err = 0;
|
|
int nr = 0;
|
|
|
|
BUG_ON(addr >= end);
|
|
pgd = pgd_offset_k(addr);
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
|
|
if (err)
|
|
return err;
|
|
} while (pgd++, addr = next, addr != end);
|
|
|
|
return nr;
|
|
}
|
|
|
|
static int vmap_page_range(unsigned long start, unsigned long end,
|
|
pgprot_t prot, struct page **pages)
|
|
{
|
|
int ret;
|
|
|
|
ret = vmap_page_range_noflush(start, end, prot, pages);
|
|
flush_cache_vmap(start, end);
|
|
return ret;
|
|
}
|
|
|
|
int is_vmalloc_or_module_addr(const void *x)
|
|
{
|
|
/*
|
|
* ARM, x86-64 and sparc64 put modules in a special place,
|
|
* and fall back on vmalloc() if that fails. Others
|
|
* just put it in the vmalloc space.
|
|
*/
|
|
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
|
|
unsigned long addr = (unsigned long)x;
|
|
if (addr >= MODULES_VADDR && addr < MODULES_END)
|
|
return 1;
|
|
#endif
|
|
return is_vmalloc_addr(x);
|
|
}
|
|
|
|
/*
|
|
* Walk a vmap address to the struct page it maps.
|
|
*/
|
|
struct page *vmalloc_to_page(const void *vmalloc_addr)
|
|
{
|
|
unsigned long addr = (unsigned long) vmalloc_addr;
|
|
struct page *page = NULL;
|
|
pgd_t *pgd = pgd_offset_k(addr);
|
|
p4d_t *p4d;
|
|
pud_t *pud;
|
|
pmd_t *pmd;
|
|
pte_t *ptep, pte;
|
|
|
|
/*
|
|
* XXX we might need to change this if we add VIRTUAL_BUG_ON for
|
|
* architectures that do not vmalloc module space
|
|
*/
|
|
VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
|
|
|
|
if (pgd_none(*pgd))
|
|
return NULL;
|
|
p4d = p4d_offset(pgd, addr);
|
|
if (p4d_none(*p4d))
|
|
return NULL;
|
|
pud = pud_offset(p4d, addr);
|
|
|
|
/*
|
|
* Don't dereference bad PUD or PMD (below) entries. This will also
|
|
* identify huge mappings, which we may encounter on architectures
|
|
* that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
|
|
* identified as vmalloc addresses by is_vmalloc_addr(), but are
|
|
* not [unambiguously] associated with a struct page, so there is
|
|
* no correct value to return for them.
|
|
*/
|
|
WARN_ON_ONCE(pud_bad(*pud));
|
|
if (pud_none(*pud) || pud_bad(*pud))
|
|
return NULL;
|
|
pmd = pmd_offset(pud, addr);
|
|
WARN_ON_ONCE(pmd_bad(*pmd));
|
|
if (pmd_none(*pmd) || pmd_bad(*pmd))
|
|
return NULL;
|
|
|
|
ptep = pte_offset_map(pmd, addr);
|
|
pte = *ptep;
|
|
if (pte_present(pte))
|
|
page = pte_page(pte);
|
|
pte_unmap(ptep);
|
|
return page;
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_to_page);
|
|
|
|
/*
|
|
* Map a vmalloc()-space virtual address to the physical page frame number.
|
|
*/
|
|
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
|
|
{
|
|
return page_to_pfn(vmalloc_to_page(vmalloc_addr));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_to_pfn);
|
|
|
|
|
|
/*** Global kva allocator ***/
|
|
|
|
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
|
|
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
|
|
|
|
#define VM_LAZY_FREE 0x02
|
|
#define VM_VM_AREA 0x04
|
|
|
|
static DEFINE_SPINLOCK(vmap_area_lock);
|
|
/* Export for kexec only */
|
|
LIST_HEAD(vmap_area_list);
|
|
static LLIST_HEAD(vmap_purge_list);
|
|
static struct rb_root vmap_area_root = RB_ROOT;
|
|
static bool vmap_initialized __read_mostly;
|
|
|
|
/*
|
|
* This kmem_cache is used for vmap_area objects. Instead of
|
|
* allocating from slab we reuse an object from this cache to
|
|
* make things faster. Especially in "no edge" splitting of
|
|
* free block.
|
|
*/
|
|
static struct kmem_cache *vmap_area_cachep;
|
|
|
|
/*
|
|
* This linked list is used in pair with free_vmap_area_root.
|
|
* It gives O(1) access to prev/next to perform fast coalescing.
|
|
*/
|
|
static LIST_HEAD(free_vmap_area_list);
|
|
|
|
/*
|
|
* This augment red-black tree represents the free vmap space.
|
|
* All vmap_area objects in this tree are sorted by va->va_start
|
|
* address. It is used for allocation and merging when a vmap
|
|
* object is released.
|
|
*
|
|
* Each vmap_area node contains a maximum available free block
|
|
* of its sub-tree, right or left. Therefore it is possible to
|
|
* find a lowest match of free area.
|
|
*/
|
|
static struct rb_root free_vmap_area_root = RB_ROOT;
|
|
|
|
static __always_inline unsigned long
|
|
va_size(struct vmap_area *va)
|
|
{
|
|
return (va->va_end - va->va_start);
|
|
}
|
|
|
|
static __always_inline unsigned long
|
|
get_subtree_max_size(struct rb_node *node)
|
|
{
|
|
struct vmap_area *va;
|
|
|
|
va = rb_entry_safe(node, struct vmap_area, rb_node);
|
|
return va ? va->subtree_max_size : 0;
|
|
}
|
|
|
|
/*
|
|
* Gets called when remove the node and rotate.
|
|
*/
|
|
static __always_inline unsigned long
|
|
compute_subtree_max_size(struct vmap_area *va)
|
|
{
|
|
return max3(va_size(va),
|
|
get_subtree_max_size(va->rb_node.rb_left),
|
|
get_subtree_max_size(va->rb_node.rb_right));
|
|
}
|
|
|
|
RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
|
|
struct vmap_area, rb_node, unsigned long, subtree_max_size,
|
|
compute_subtree_max_size)
|
|
|
|
static void purge_vmap_area_lazy(void);
|
|
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
|
|
static unsigned long lazy_max_pages(void);
|
|
|
|
static atomic_long_t nr_vmalloc_pages;
|
|
|
|
unsigned long vmalloc_nr_pages(void)
|
|
{
|
|
return atomic_long_read(&nr_vmalloc_pages);
|
|
}
|
|
|
|
#ifdef CONFIG_ENABLE_VMALLOC_SAVING
|
|
#define POSSIBLE_VMALLOC_START PAGE_OFFSET
|
|
|
|
#define VMALLOC_BITMAP_SIZE ((VMALLOC_END - PAGE_OFFSET) >> \
|
|
PAGE_SHIFT)
|
|
#define VMALLOC_TO_BIT(addr) ((addr - PAGE_OFFSET) >> PAGE_SHIFT)
|
|
#define BIT_TO_VMALLOC(i) (PAGE_OFFSET + i * PAGE_SIZE)
|
|
|
|
unsigned long total_vmalloc_size;
|
|
unsigned long vmalloc_reserved;
|
|
|
|
DECLARE_BITMAP(possible_areas, VMALLOC_BITMAP_SIZE);
|
|
|
|
void mark_vmalloc_reserved_area(void *x, unsigned long size)
|
|
{
|
|
unsigned long addr = (unsigned long)x;
|
|
|
|
bitmap_set(possible_areas, VMALLOC_TO_BIT(addr), size >> PAGE_SHIFT);
|
|
vmalloc_reserved += size;
|
|
}
|
|
|
|
int is_vmalloc_addr(const void *x)
|
|
{
|
|
unsigned long addr = (unsigned long)x;
|
|
|
|
if (addr < POSSIBLE_VMALLOC_START || addr >= VMALLOC_END)
|
|
return 0;
|
|
|
|
if (test_bit(VMALLOC_TO_BIT(addr), possible_areas))
|
|
return 0;
|
|
|
|
return 1;
|
|
}
|
|
|
|
static void calc_total_vmalloc_size(void)
|
|
{
|
|
total_vmalloc_size = VMALLOC_END - POSSIBLE_VMALLOC_START -
|
|
vmalloc_reserved;
|
|
}
|
|
#else
|
|
int is_vmalloc_addr(const void *x)
|
|
{
|
|
unsigned long addr = (unsigned long)x;
|
|
|
|
return addr >= VMALLOC_START && addr < VMALLOC_END;
|
|
}
|
|
|
|
static void calc_total_vmalloc_size(void) { }
|
|
#endif
|
|
EXPORT_SYMBOL(is_vmalloc_addr);
|
|
|
|
static atomic_long_t nr_vmalloc_pages;
|
|
|
|
static struct vmap_area *__find_vmap_area(unsigned long addr)
|
|
{
|
|
struct rb_node *n = vmap_area_root.rb_node;
|
|
|
|
while (n) {
|
|
struct vmap_area *va;
|
|
|
|
va = rb_entry(n, struct vmap_area, rb_node);
|
|
if (addr < va->va_start)
|
|
n = n->rb_left;
|
|
else if (addr >= va->va_end)
|
|
n = n->rb_right;
|
|
else
|
|
return va;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* This function returns back addresses of parent node
|
|
* and its left or right link for further processing.
|
|
*/
|
|
static __always_inline struct rb_node **
|
|
find_va_links(struct vmap_area *va,
|
|
struct rb_root *root, struct rb_node *from,
|
|
struct rb_node **parent)
|
|
{
|
|
struct vmap_area *tmp_va;
|
|
struct rb_node **link;
|
|
|
|
if (root) {
|
|
link = &root->rb_node;
|
|
if (unlikely(!*link)) {
|
|
*parent = NULL;
|
|
return link;
|
|
}
|
|
} else {
|
|
link = &from;
|
|
}
|
|
|
|
/*
|
|
* Go to the bottom of the tree. When we hit the last point
|
|
* we end up with parent rb_node and correct direction, i name
|
|
* it link, where the new va->rb_node will be attached to.
|
|
*/
|
|
do {
|
|
tmp_va = rb_entry(*link, struct vmap_area, rb_node);
|
|
|
|
/*
|
|
* During the traversal we also do some sanity check.
|
|
* Trigger the BUG() if there are sides(left/right)
|
|
* or full overlaps.
|
|
*/
|
|
if (va->va_start < tmp_va->va_end &&
|
|
va->va_end <= tmp_va->va_start)
|
|
link = &(*link)->rb_left;
|
|
else if (va->va_end > tmp_va->va_start &&
|
|
va->va_start >= tmp_va->va_end)
|
|
link = &(*link)->rb_right;
|
|
else
|
|
BUG();
|
|
} while (*link);
|
|
|
|
*parent = &tmp_va->rb_node;
|
|
return link;
|
|
}
|
|
|
|
static __always_inline struct list_head *
|
|
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
|
|
{
|
|
struct list_head *list;
|
|
|
|
if (unlikely(!parent))
|
|
/*
|
|
* The red-black tree where we try to find VA neighbors
|
|
* before merging or inserting is empty, i.e. it means
|
|
* there is no free vmap space. Normally it does not
|
|
* happen but we handle this case anyway.
|
|
*/
|
|
return NULL;
|
|
|
|
list = &rb_entry(parent, struct vmap_area, rb_node)->list;
|
|
return (&parent->rb_right == link ? list->next : list);
|
|
}
|
|
|
|
static __always_inline void
|
|
link_va(struct vmap_area *va, struct rb_root *root,
|
|
struct rb_node *parent, struct rb_node **link, struct list_head *head)
|
|
{
|
|
/*
|
|
* VA is still not in the list, but we can
|
|
* identify its future previous list_head node.
|
|
*/
|
|
if (likely(parent)) {
|
|
head = &rb_entry(parent, struct vmap_area, rb_node)->list;
|
|
if (&parent->rb_right != link)
|
|
head = head->prev;
|
|
}
|
|
|
|
/* Insert to the rb-tree */
|
|
rb_link_node(&va->rb_node, parent, link);
|
|
if (root == &free_vmap_area_root) {
|
|
/*
|
|
* Some explanation here. Just perform simple insertion
|
|
* to the tree. We do not set va->subtree_max_size to
|
|
* its current size before calling rb_insert_augmented().
|
|
* It is because of we populate the tree from the bottom
|
|
* to parent levels when the node _is_ in the tree.
|
|
*
|
|
* Therefore we set subtree_max_size to zero after insertion,
|
|
* to let __augment_tree_propagate_from() puts everything to
|
|
* the correct order later on.
|
|
*/
|
|
rb_insert_augmented(&va->rb_node,
|
|
root, &free_vmap_area_rb_augment_cb);
|
|
va->subtree_max_size = 0;
|
|
} else {
|
|
rb_insert_color(&va->rb_node, root);
|
|
}
|
|
|
|
/* Address-sort this list */
|
|
list_add(&va->list, head);
|
|
}
|
|
|
|
static __always_inline void
|
|
unlink_va(struct vmap_area *va, struct rb_root *root)
|
|
{
|
|
/*
|
|
* During merging a VA node can be empty, therefore
|
|
* not linked with the tree nor list. Just check it.
|
|
*/
|
|
if (!RB_EMPTY_NODE(&va->rb_node)) {
|
|
if (root == &free_vmap_area_root)
|
|
rb_erase_augmented(&va->rb_node,
|
|
root, &free_vmap_area_rb_augment_cb);
|
|
else
|
|
rb_erase(&va->rb_node, root);
|
|
|
|
list_del(&va->list);
|
|
RB_CLEAR_NODE(&va->rb_node);
|
|
}
|
|
}
|
|
|
|
#if DEBUG_AUGMENT_PROPAGATE_CHECK
|
|
static void
|
|
augment_tree_propagate_check(struct rb_node *n)
|
|
{
|
|
struct vmap_area *va;
|
|
struct rb_node *node;
|
|
unsigned long size;
|
|
bool found = false;
|
|
|
|
if (n == NULL)
|
|
return;
|
|
|
|
va = rb_entry(n, struct vmap_area, rb_node);
|
|
size = va->subtree_max_size;
|
|
node = n;
|
|
|
|
while (node) {
|
|
va = rb_entry(node, struct vmap_area, rb_node);
|
|
|
|
if (get_subtree_max_size(node->rb_left) == size) {
|
|
node = node->rb_left;
|
|
} else {
|
|
if (va_size(va) == size) {
|
|
found = true;
|
|
break;
|
|
}
|
|
|
|
node = node->rb_right;
|
|
}
|
|
}
|
|
|
|
if (!found) {
|
|
va = rb_entry(n, struct vmap_area, rb_node);
|
|
pr_emerg("tree is corrupted: %lu, %lu\n",
|
|
va_size(va), va->subtree_max_size);
|
|
}
|
|
|
|
augment_tree_propagate_check(n->rb_left);
|
|
augment_tree_propagate_check(n->rb_right);
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* This function populates subtree_max_size from bottom to upper
|
|
* levels starting from VA point. The propagation must be done
|
|
* when VA size is modified by changing its va_start/va_end. Or
|
|
* in case of newly inserting of VA to the tree.
|
|
*
|
|
* It means that __augment_tree_propagate_from() must be called:
|
|
* - After VA has been inserted to the tree(free path);
|
|
* - After VA has been shrunk(allocation path);
|
|
* - After VA has been increased(merging path).
|
|
*
|
|
* Please note that, it does not mean that upper parent nodes
|
|
* and their subtree_max_size are recalculated all the time up
|
|
* to the root node.
|
|
*
|
|
* 4--8
|
|
* /\
|
|
* / \
|
|
* / \
|
|
* 2--2 8--8
|
|
*
|
|
* For example if we modify the node 4, shrinking it to 2, then
|
|
* no any modification is required. If we shrink the node 2 to 1
|
|
* its subtree_max_size is updated only, and set to 1. If we shrink
|
|
* the node 8 to 6, then its subtree_max_size is set to 6 and parent
|
|
* node becomes 4--6.
|
|
*/
|
|
static __always_inline void
|
|
augment_tree_propagate_from(struct vmap_area *va)
|
|
{
|
|
struct rb_node *node = &va->rb_node;
|
|
unsigned long new_va_sub_max_size;
|
|
|
|
while (node) {
|
|
va = rb_entry(node, struct vmap_area, rb_node);
|
|
new_va_sub_max_size = compute_subtree_max_size(va);
|
|
|
|
/*
|
|
* If the newly calculated maximum available size of the
|
|
* subtree is equal to the current one, then it means that
|
|
* the tree is propagated correctly. So we have to stop at
|
|
* this point to save cycles.
|
|
*/
|
|
if (va->subtree_max_size == new_va_sub_max_size)
|
|
break;
|
|
|
|
va->subtree_max_size = new_va_sub_max_size;
|
|
node = rb_parent(&va->rb_node);
|
|
}
|
|
|
|
#if DEBUG_AUGMENT_PROPAGATE_CHECK
|
|
augment_tree_propagate_check(free_vmap_area_root.rb_node);
|
|
#endif
|
|
}
|
|
|
|
static void
|
|
insert_vmap_area(struct vmap_area *va,
|
|
struct rb_root *root, struct list_head *head)
|
|
{
|
|
struct rb_node **link;
|
|
struct rb_node *parent;
|
|
|
|
link = find_va_links(va, root, NULL, &parent);
|
|
link_va(va, root, parent, link, head);
|
|
}
|
|
|
|
static void
|
|
insert_vmap_area_augment(struct vmap_area *va,
|
|
struct rb_node *from, struct rb_root *root,
|
|
struct list_head *head)
|
|
{
|
|
struct rb_node **link;
|
|
struct rb_node *parent;
|
|
|
|
if (from)
|
|
link = find_va_links(va, NULL, from, &parent);
|
|
else
|
|
link = find_va_links(va, root, NULL, &parent);
|
|
|
|
link_va(va, root, parent, link, head);
|
|
augment_tree_propagate_from(va);
|
|
}
|
|
|
|
/*
|
|
* Merge de-allocated chunk of VA memory with previous
|
|
* and next free blocks. If coalesce is not done a new
|
|
* free area is inserted. If VA has been merged, it is
|
|
* freed.
|
|
*/
|
|
static __always_inline void
|
|
merge_or_add_vmap_area(struct vmap_area *va,
|
|
struct rb_root *root, struct list_head *head)
|
|
{
|
|
struct vmap_area *sibling;
|
|
struct list_head *next;
|
|
struct rb_node **link;
|
|
struct rb_node *parent;
|
|
bool merged = false;
|
|
|
|
/*
|
|
* Find a place in the tree where VA potentially will be
|
|
* inserted, unless it is merged with its sibling/siblings.
|
|
*/
|
|
link = find_va_links(va, root, NULL, &parent);
|
|
|
|
/*
|
|
* Get next node of VA to check if merging can be done.
|
|
*/
|
|
next = get_va_next_sibling(parent, link);
|
|
if (unlikely(next == NULL))
|
|
goto insert;
|
|
|
|
/*
|
|
* start end
|
|
* | |
|
|
* |<------VA------>|<-----Next----->|
|
|
* | |
|
|
* start end
|
|
*/
|
|
if (next != head) {
|
|
sibling = list_entry(next, struct vmap_area, list);
|
|
if (sibling->va_start == va->va_end) {
|
|
sibling->va_start = va->va_start;
|
|
|
|
/* Check and update the tree if needed. */
|
|
augment_tree_propagate_from(sibling);
|
|
|
|
/* Remove this VA, it has been merged. */
|
|
unlink_va(va, root);
|
|
|
|
/* Free vmap_area object. */
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
|
|
/* Point to the new merged area. */
|
|
va = sibling;
|
|
merged = true;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* start end
|
|
* | |
|
|
* |<-----Prev----->|<------VA------>|
|
|
* | |
|
|
* start end
|
|
*/
|
|
if (next->prev != head) {
|
|
sibling = list_entry(next->prev, struct vmap_area, list);
|
|
if (sibling->va_end == va->va_start) {
|
|
sibling->va_end = va->va_end;
|
|
|
|
/* Check and update the tree if needed. */
|
|
augment_tree_propagate_from(sibling);
|
|
|
|
/* Remove this VA, it has been merged. */
|
|
unlink_va(va, root);
|
|
|
|
/* Free vmap_area object. */
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
|
|
return;
|
|
}
|
|
}
|
|
|
|
insert:
|
|
if (!merged) {
|
|
link_va(va, root, parent, link, head);
|
|
augment_tree_propagate_from(va);
|
|
}
|
|
}
|
|
|
|
static __always_inline bool
|
|
is_within_this_va(struct vmap_area *va, unsigned long size,
|
|
unsigned long align, unsigned long vstart)
|
|
{
|
|
unsigned long nva_start_addr;
|
|
|
|
if (va->va_start > vstart)
|
|
nva_start_addr = ALIGN(va->va_start, align);
|
|
else
|
|
nva_start_addr = ALIGN(vstart, align);
|
|
|
|
/* Can be overflowed due to big size or alignment. */
|
|
if (nva_start_addr + size < nva_start_addr ||
|
|
nva_start_addr < vstart)
|
|
return false;
|
|
|
|
return (nva_start_addr + size <= va->va_end);
|
|
}
|
|
|
|
/*
|
|
* Find the first free block(lowest start address) in the tree,
|
|
* that will accomplish the request corresponding to passing
|
|
* parameters.
|
|
*/
|
|
static __always_inline struct vmap_area *
|
|
find_vmap_lowest_match(unsigned long size,
|
|
unsigned long align, unsigned long vstart)
|
|
{
|
|
struct vmap_area *va;
|
|
struct rb_node *node;
|
|
unsigned long length;
|
|
|
|
/* Start from the root. */
|
|
node = free_vmap_area_root.rb_node;
|
|
|
|
/* Adjust the search size for alignment overhead. */
|
|
length = size + align - 1;
|
|
|
|
while (node) {
|
|
va = rb_entry(node, struct vmap_area, rb_node);
|
|
|
|
if (get_subtree_max_size(node->rb_left) >= length &&
|
|
vstart < va->va_start) {
|
|
node = node->rb_left;
|
|
} else {
|
|
if (is_within_this_va(va, size, align, vstart))
|
|
return va;
|
|
|
|
/*
|
|
* Does not make sense to go deeper towards the right
|
|
* sub-tree if it does not have a free block that is
|
|
* equal or bigger to the requested search length.
|
|
*/
|
|
if (get_subtree_max_size(node->rb_right) >= length) {
|
|
node = node->rb_right;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* OK. We roll back and find the fist right sub-tree,
|
|
* that will satisfy the search criteria. It can happen
|
|
* only once due to "vstart" restriction.
|
|
*/
|
|
while ((node = rb_parent(node))) {
|
|
va = rb_entry(node, struct vmap_area, rb_node);
|
|
if (is_within_this_va(va, size, align, vstart))
|
|
return va;
|
|
|
|
if (get_subtree_max_size(node->rb_right) >= length &&
|
|
vstart <= va->va_start) {
|
|
node = node->rb_right;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
|
|
#include <linux/random.h>
|
|
|
|
static struct vmap_area *
|
|
find_vmap_lowest_linear_match(unsigned long size,
|
|
unsigned long align, unsigned long vstart)
|
|
{
|
|
struct vmap_area *va;
|
|
|
|
list_for_each_entry(va, &free_vmap_area_list, list) {
|
|
if (!is_within_this_va(va, size, align, vstart))
|
|
continue;
|
|
|
|
return va;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static void
|
|
find_vmap_lowest_match_check(unsigned long size)
|
|
{
|
|
struct vmap_area *va_1, *va_2;
|
|
unsigned long vstart;
|
|
unsigned int rnd;
|
|
|
|
get_random_bytes(&rnd, sizeof(rnd));
|
|
vstart = VMALLOC_START + rnd;
|
|
|
|
va_1 = find_vmap_lowest_match(size, 1, vstart);
|
|
va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
|
|
|
|
if (va_1 != va_2)
|
|
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
|
|
va_1, va_2, vstart);
|
|
}
|
|
#endif
|
|
|
|
enum fit_type {
|
|
NOTHING_FIT = 0,
|
|
FL_FIT_TYPE = 1, /* full fit */
|
|
LE_FIT_TYPE = 2, /* left edge fit */
|
|
RE_FIT_TYPE = 3, /* right edge fit */
|
|
NE_FIT_TYPE = 4 /* no edge fit */
|
|
};
|
|
|
|
static __always_inline enum fit_type
|
|
classify_va_fit_type(struct vmap_area *va,
|
|
unsigned long nva_start_addr, unsigned long size)
|
|
{
|
|
enum fit_type type;
|
|
|
|
/* Check if it is within VA. */
|
|
if (nva_start_addr < va->va_start ||
|
|
nva_start_addr + size > va->va_end)
|
|
return NOTHING_FIT;
|
|
|
|
/* Now classify. */
|
|
if (va->va_start == nva_start_addr) {
|
|
if (va->va_end == nva_start_addr + size)
|
|
type = FL_FIT_TYPE;
|
|
else
|
|
type = LE_FIT_TYPE;
|
|
} else if (va->va_end == nva_start_addr + size) {
|
|
type = RE_FIT_TYPE;
|
|
} else {
|
|
type = NE_FIT_TYPE;
|
|
}
|
|
|
|
return type;
|
|
}
|
|
|
|
static __always_inline int
|
|
adjust_va_to_fit_type(struct vmap_area *va,
|
|
unsigned long nva_start_addr, unsigned long size,
|
|
enum fit_type type)
|
|
{
|
|
struct vmap_area *lva = NULL;
|
|
|
|
if (type == FL_FIT_TYPE) {
|
|
/*
|
|
* No need to split VA, it fully fits.
|
|
*
|
|
* | |
|
|
* V NVA V
|
|
* |---------------|
|
|
*/
|
|
unlink_va(va, &free_vmap_area_root);
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
} else if (type == LE_FIT_TYPE) {
|
|
/*
|
|
* Split left edge of fit VA.
|
|
*
|
|
* | |
|
|
* V NVA V R
|
|
* |-------|-------|
|
|
*/
|
|
va->va_start += size;
|
|
} else if (type == RE_FIT_TYPE) {
|
|
/*
|
|
* Split right edge of fit VA.
|
|
*
|
|
* | |
|
|
* L V NVA V
|
|
* |-------|-------|
|
|
*/
|
|
va->va_end = nva_start_addr;
|
|
} else if (type == NE_FIT_TYPE) {
|
|
/*
|
|
* Split no edge of fit VA.
|
|
*
|
|
* | |
|
|
* L V NVA V R
|
|
* |---|-------|---|
|
|
*/
|
|
lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
|
|
if (unlikely(!lva))
|
|
return -1;
|
|
|
|
/*
|
|
* Build the remainder.
|
|
*/
|
|
lva->va_start = va->va_start;
|
|
lva->va_end = nva_start_addr;
|
|
|
|
/*
|
|
* Shrink this VA to remaining size.
|
|
*/
|
|
va->va_start = nva_start_addr + size;
|
|
} else {
|
|
return -1;
|
|
}
|
|
|
|
if (type != FL_FIT_TYPE) {
|
|
augment_tree_propagate_from(va);
|
|
|
|
if (lva) /* type == NE_FIT_TYPE */
|
|
insert_vmap_area_augment(lva, &va->rb_node,
|
|
&free_vmap_area_root, &free_vmap_area_list);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Returns a start address of the newly allocated area, if success.
|
|
* Otherwise a vend is returned that indicates failure.
|
|
*/
|
|
static __always_inline unsigned long
|
|
__alloc_vmap_area(unsigned long size, unsigned long align,
|
|
unsigned long vstart, unsigned long vend, int node)
|
|
{
|
|
unsigned long nva_start_addr;
|
|
struct vmap_area *va;
|
|
enum fit_type type;
|
|
int ret;
|
|
|
|
va = find_vmap_lowest_match(size, align, vstart);
|
|
if (unlikely(!va))
|
|
return vend;
|
|
|
|
if (va->va_start > vstart)
|
|
nva_start_addr = ALIGN(va->va_start, align);
|
|
else
|
|
nva_start_addr = ALIGN(vstart, align);
|
|
|
|
/* Check the "vend" restriction. */
|
|
if (nva_start_addr + size > vend)
|
|
return vend;
|
|
|
|
/* Classify what we have found. */
|
|
type = classify_va_fit_type(va, nva_start_addr, size);
|
|
if (WARN_ON_ONCE(type == NOTHING_FIT))
|
|
return vend;
|
|
|
|
/* Update the free vmap_area. */
|
|
ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
|
|
if (ret)
|
|
return vend;
|
|
|
|
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
|
|
find_vmap_lowest_match_check(size);
|
|
#endif
|
|
|
|
return nva_start_addr;
|
|
}
|
|
|
|
/*
|
|
* Allocate a region of KVA of the specified size and alignment, within the
|
|
* vstart and vend.
|
|
*/
|
|
static struct vmap_area *alloc_vmap_area(unsigned long size,
|
|
unsigned long align,
|
|
unsigned long vstart, unsigned long vend,
|
|
int node, gfp_t gfp_mask)
|
|
{
|
|
struct vmap_area *va;
|
|
unsigned long addr;
|
|
int purged = 0;
|
|
|
|
BUG_ON(!size);
|
|
BUG_ON(offset_in_page(size));
|
|
BUG_ON(!is_power_of_2(align));
|
|
|
|
if (unlikely(!vmap_initialized))
|
|
return ERR_PTR(-EBUSY);
|
|
|
|
might_sleep();
|
|
|
|
va = kmem_cache_alloc_node(vmap_area_cachep,
|
|
gfp_mask & GFP_RECLAIM_MASK, node);
|
|
if (unlikely(!va))
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
/*
|
|
* Only scan the relevant parts containing pointers to other objects
|
|
* to avoid false negatives.
|
|
*/
|
|
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
|
|
|
|
retry:
|
|
spin_lock(&vmap_area_lock);
|
|
|
|
/*
|
|
* If an allocation fails, the "vend" address is
|
|
* returned. Therefore trigger the overflow path.
|
|
*/
|
|
addr = __alloc_vmap_area(size, align, vstart, vend, node);
|
|
if (unlikely(addr == vend))
|
|
goto overflow;
|
|
|
|
va->va_start = addr;
|
|
va->va_end = addr + size;
|
|
va->flags = 0;
|
|
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
|
|
|
|
spin_unlock(&vmap_area_lock);
|
|
|
|
BUG_ON(!IS_ALIGNED(va->va_start, align));
|
|
BUG_ON(va->va_start < vstart);
|
|
BUG_ON(va->va_end > vend);
|
|
|
|
return va;
|
|
|
|
overflow:
|
|
spin_unlock(&vmap_area_lock);
|
|
if (!purged) {
|
|
purge_vmap_area_lazy();
|
|
purged = 1;
|
|
goto retry;
|
|
}
|
|
|
|
if (gfpflags_allow_blocking(gfp_mask)) {
|
|
unsigned long freed = 0;
|
|
blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
|
|
if (freed > 0) {
|
|
purged = 0;
|
|
goto retry;
|
|
}
|
|
}
|
|
|
|
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
|
|
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
|
|
size);
|
|
|
|
kmem_cache_free(vmap_area_cachep, va);
|
|
return ERR_PTR(-EBUSY);
|
|
}
|
|
|
|
int register_vmap_purge_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_register(&vmap_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
|
|
|
|
int unregister_vmap_purge_notifier(struct notifier_block *nb)
|
|
{
|
|
return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
|
|
}
|
|
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
|
|
|
|
static void __free_vmap_area(struct vmap_area *va)
|
|
{
|
|
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
|
|
|
|
/*
|
|
* Remove from the busy tree/list.
|
|
*/
|
|
unlink_va(va, &vmap_area_root);
|
|
|
|
/*
|
|
* Merge VA with its neighbors, otherwise just add it.
|
|
*/
|
|
merge_or_add_vmap_area(va,
|
|
&free_vmap_area_root, &free_vmap_area_list);
|
|
}
|
|
|
|
/*
|
|
* Free a region of KVA allocated by alloc_vmap_area
|
|
*/
|
|
static void free_vmap_area(struct vmap_area *va)
|
|
{
|
|
spin_lock(&vmap_area_lock);
|
|
__free_vmap_area(va);
|
|
spin_unlock(&vmap_area_lock);
|
|
}
|
|
|
|
/*
|
|
* Clear the pagetable entries of a given vmap_area
|
|
*/
|
|
static void unmap_vmap_area(struct vmap_area *va)
|
|
{
|
|
vunmap_page_range(va->va_start, va->va_end);
|
|
}
|
|
|
|
/*
|
|
* lazy_max_pages is the maximum amount of virtual address space we gather up
|
|
* before attempting to purge with a TLB flush.
|
|
*
|
|
* There is a tradeoff here: a larger number will cover more kernel page tables
|
|
* and take slightly longer to purge, but it will linearly reduce the number of
|
|
* global TLB flushes that must be performed. It would seem natural to scale
|
|
* this number up linearly with the number of CPUs (because vmapping activity
|
|
* could also scale linearly with the number of CPUs), however it is likely
|
|
* that in practice, workloads might be constrained in other ways that mean
|
|
* vmap activity will not scale linearly with CPUs. Also, I want to be
|
|
* conservative and not introduce a big latency on huge systems, so go with
|
|
* a less aggressive log scale. It will still be an improvement over the old
|
|
* code, and it will be simple to change the scale factor if we find that it
|
|
* becomes a problem on bigger systems.
|
|
*/
|
|
static unsigned long lazy_max_pages(void)
|
|
{
|
|
unsigned int log;
|
|
|
|
log = fls(num_online_cpus());
|
|
|
|
return log * (1UL * CONFIG_VMAP_LAZY_PURGING_FACTOR *
|
|
1024 * 1024 / PAGE_SIZE);
|
|
}
|
|
|
|
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
|
|
|
|
/*
|
|
* Serialize vmap purging. There is no actual criticial section protected
|
|
* by this look, but we want to avoid concurrent calls for performance
|
|
* reasons and to make the pcpu_get_vm_areas more deterministic.
|
|
*/
|
|
static DEFINE_MUTEX(vmap_purge_lock);
|
|
|
|
/* for per-CPU blocks */
|
|
static void purge_fragmented_blocks_allcpus(void);
|
|
|
|
/*
|
|
* called before a call to iounmap() if the caller wants vm_area_struct's
|
|
* immediately freed.
|
|
*/
|
|
void set_iounmap_nonlazy(void)
|
|
{
|
|
atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
|
|
}
|
|
|
|
/*
|
|
* Purges all lazily-freed vmap areas.
|
|
*/
|
|
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
|
|
{
|
|
unsigned long resched_threshold;
|
|
struct llist_node *valist;
|
|
struct vmap_area *va;
|
|
struct vmap_area *n_va;
|
|
|
|
lockdep_assert_held(&vmap_purge_lock);
|
|
|
|
valist = llist_del_all(&vmap_purge_list);
|
|
if (unlikely(valist == NULL))
|
|
return false;
|
|
|
|
/*
|
|
* TODO: to calculate a flush range without looping.
|
|
* The list can be up to lazy_max_pages() elements.
|
|
*/
|
|
llist_for_each_entry(va, valist, purge_list) {
|
|
if (va->va_start < start)
|
|
start = va->va_start;
|
|
if (va->va_end > end)
|
|
end = va->va_end;
|
|
}
|
|
|
|
flush_tlb_kernel_range(start, end);
|
|
resched_threshold = lazy_max_pages() << 1;
|
|
|
|
spin_lock(&vmap_area_lock);
|
|
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
|
|
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
|
|
|
|
__free_vmap_area(va);
|
|
atomic_long_sub(nr, &vmap_lazy_nr);
|
|
|
|
if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
|
|
cond_resched_lock(&vmap_area_lock);
|
|
}
|
|
spin_unlock(&vmap_area_lock);
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Kick off a purge of the outstanding lazy areas. Don't bother if somebody
|
|
* is already purging.
|
|
*/
|
|
static void try_purge_vmap_area_lazy(void)
|
|
{
|
|
if (mutex_trylock(&vmap_purge_lock)) {
|
|
__purge_vmap_area_lazy(ULONG_MAX, 0);
|
|
mutex_unlock(&vmap_purge_lock);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Kick off a purge of the outstanding lazy areas.
|
|
*/
|
|
static void purge_vmap_area_lazy(void)
|
|
{
|
|
mutex_lock(&vmap_purge_lock);
|
|
purge_fragmented_blocks_allcpus();
|
|
__purge_vmap_area_lazy(ULONG_MAX, 0);
|
|
mutex_unlock(&vmap_purge_lock);
|
|
}
|
|
|
|
/*
|
|
* Free a vmap area, caller ensuring that the area has been unmapped
|
|
* and flush_cache_vunmap had been called for the correct range
|
|
* previously.
|
|
*/
|
|
static void free_vmap_area_noflush(struct vmap_area *va)
|
|
{
|
|
unsigned long nr_lazy;
|
|
|
|
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
|
|
PAGE_SHIFT, &vmap_lazy_nr);
|
|
|
|
/* After this point, we may free va at any time */
|
|
llist_add(&va->purge_list, &vmap_purge_list);
|
|
|
|
if (unlikely(nr_lazy > lazy_max_pages()))
|
|
try_purge_vmap_area_lazy();
|
|
}
|
|
|
|
/*
|
|
* Free and unmap a vmap area
|
|
*/
|
|
static void free_unmap_vmap_area(struct vmap_area *va)
|
|
{
|
|
flush_cache_vunmap(va->va_start, va->va_end);
|
|
unmap_vmap_area(va);
|
|
if (debug_pagealloc_enabled())
|
|
flush_tlb_kernel_range(va->va_start, va->va_end);
|
|
|
|
free_vmap_area_noflush(va);
|
|
}
|
|
|
|
static struct vmap_area *find_vmap_area(unsigned long addr)
|
|
{
|
|
struct vmap_area *va;
|
|
|
|
spin_lock(&vmap_area_lock);
|
|
va = __find_vmap_area(addr);
|
|
spin_unlock(&vmap_area_lock);
|
|
|
|
return va;
|
|
}
|
|
|
|
/*** Per cpu kva allocator ***/
|
|
|
|
/*
|
|
* vmap space is limited especially on 32 bit architectures. Ensure there is
|
|
* room for at least 16 percpu vmap blocks per CPU.
|
|
*/
|
|
/*
|
|
* If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
|
|
* to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
|
|
* instead (we just need a rough idea)
|
|
*/
|
|
#if BITS_PER_LONG == 32
|
|
#define VMALLOC_SPACE (128UL*1024*1024)
|
|
#else
|
|
#define VMALLOC_SPACE (128UL*1024*1024*1024)
|
|
#endif
|
|
|
|
#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
|
|
#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
|
|
#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
|
|
#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
|
|
#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
|
|
#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
|
|
#define VMAP_BBMAP_BITS \
|
|
VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
|
|
VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
|
|
VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
|
|
|
|
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
|
|
|
|
struct vmap_block_queue {
|
|
spinlock_t lock;
|
|
struct list_head free;
|
|
};
|
|
|
|
struct vmap_block {
|
|
spinlock_t lock;
|
|
struct vmap_area *va;
|
|
unsigned long free, dirty;
|
|
unsigned long dirty_min, dirty_max; /*< dirty range */
|
|
struct list_head free_list;
|
|
struct rcu_head rcu_head;
|
|
struct list_head purge;
|
|
};
|
|
|
|
/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
|
|
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
|
|
|
|
/*
|
|
* Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
|
|
* in the free path. Could get rid of this if we change the API to return a
|
|
* "cookie" from alloc, to be passed to free. But no big deal yet.
|
|
*/
|
|
static DEFINE_SPINLOCK(vmap_block_tree_lock);
|
|
static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
|
|
|
|
/*
|
|
* We should probably have a fallback mechanism to allocate virtual memory
|
|
* out of partially filled vmap blocks. However vmap block sizing should be
|
|
* fairly reasonable according to the vmalloc size, so it shouldn't be a
|
|
* big problem.
|
|
*/
|
|
|
|
static unsigned long addr_to_vb_idx(unsigned long addr)
|
|
{
|
|
addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
|
|
addr /= VMAP_BLOCK_SIZE;
|
|
return addr;
|
|
}
|
|
|
|
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
|
|
{
|
|
unsigned long addr;
|
|
|
|
addr = va_start + (pages_off << PAGE_SHIFT);
|
|
BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
|
|
return (void *)addr;
|
|
}
|
|
|
|
/**
|
|
* new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
|
|
* block. Of course pages number can't exceed VMAP_BBMAP_BITS
|
|
* @order: how many 2^order pages should be occupied in newly allocated block
|
|
* @gfp_mask: flags for the page level allocator
|
|
*
|
|
* Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
|
|
*/
|
|
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
|
|
{
|
|
struct vmap_block_queue *vbq;
|
|
struct vmap_block *vb;
|
|
struct vmap_area *va;
|
|
unsigned long vb_idx;
|
|
int node, err;
|
|
void *vaddr;
|
|
|
|
node = numa_node_id();
|
|
|
|
vb = kmalloc_node(sizeof(struct vmap_block),
|
|
gfp_mask & GFP_RECLAIM_MASK, node);
|
|
if (unlikely(!vb))
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
|
|
VMALLOC_START, VMALLOC_END,
|
|
node, gfp_mask);
|
|
if (IS_ERR(va)) {
|
|
kfree(vb);
|
|
return ERR_CAST(va);
|
|
}
|
|
|
|
err = radix_tree_preload(gfp_mask);
|
|
if (unlikely(err)) {
|
|
kfree(vb);
|
|
free_vmap_area(va);
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
vaddr = vmap_block_vaddr(va->va_start, 0);
|
|
spin_lock_init(&vb->lock);
|
|
vb->va = va;
|
|
/* At least something should be left free */
|
|
BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
|
|
vb->free = VMAP_BBMAP_BITS - (1UL << order);
|
|
vb->dirty = 0;
|
|
vb->dirty_min = VMAP_BBMAP_BITS;
|
|
vb->dirty_max = 0;
|
|
INIT_LIST_HEAD(&vb->free_list);
|
|
|
|
vb_idx = addr_to_vb_idx(va->va_start);
|
|
spin_lock(&vmap_block_tree_lock);
|
|
err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
|
|
spin_unlock(&vmap_block_tree_lock);
|
|
BUG_ON(err);
|
|
radix_tree_preload_end();
|
|
|
|
vbq = &get_cpu_var(vmap_block_queue);
|
|
spin_lock(&vbq->lock);
|
|
list_add_tail_rcu(&vb->free_list, &vbq->free);
|
|
spin_unlock(&vbq->lock);
|
|
put_cpu_var(vmap_block_queue);
|
|
|
|
return vaddr;
|
|
}
|
|
|
|
static void free_vmap_block(struct vmap_block *vb)
|
|
{
|
|
struct vmap_block *tmp;
|
|
unsigned long vb_idx;
|
|
|
|
vb_idx = addr_to_vb_idx(vb->va->va_start);
|
|
spin_lock(&vmap_block_tree_lock);
|
|
tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
|
|
spin_unlock(&vmap_block_tree_lock);
|
|
BUG_ON(tmp != vb);
|
|
|
|
free_vmap_area_noflush(vb->va);
|
|
kfree_rcu(vb, rcu_head);
|
|
}
|
|
|
|
static void purge_fragmented_blocks(int cpu)
|
|
{
|
|
LIST_HEAD(purge);
|
|
struct vmap_block *vb;
|
|
struct vmap_block *n_vb;
|
|
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
|
|
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
|
|
|
|
if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
|
|
continue;
|
|
|
|
spin_lock(&vb->lock);
|
|
if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
|
|
vb->free = 0; /* prevent further allocs after releasing lock */
|
|
vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
|
|
vb->dirty_min = 0;
|
|
vb->dirty_max = VMAP_BBMAP_BITS;
|
|
spin_lock(&vbq->lock);
|
|
list_del_rcu(&vb->free_list);
|
|
spin_unlock(&vbq->lock);
|
|
spin_unlock(&vb->lock);
|
|
list_add_tail(&vb->purge, &purge);
|
|
} else
|
|
spin_unlock(&vb->lock);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
list_for_each_entry_safe(vb, n_vb, &purge, purge) {
|
|
list_del(&vb->purge);
|
|
free_vmap_block(vb);
|
|
}
|
|
}
|
|
|
|
static void purge_fragmented_blocks_allcpus(void)
|
|
{
|
|
int cpu;
|
|
|
|
for_each_possible_cpu(cpu)
|
|
purge_fragmented_blocks(cpu);
|
|
}
|
|
|
|
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
|
|
{
|
|
struct vmap_block_queue *vbq;
|
|
struct vmap_block *vb;
|
|
void *vaddr = NULL;
|
|
unsigned int order;
|
|
|
|
BUG_ON(offset_in_page(size));
|
|
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
|
|
if (WARN_ON(size == 0)) {
|
|
/*
|
|
* Allocating 0 bytes isn't what caller wants since
|
|
* get_order(0) returns funny result. Just warn and terminate
|
|
* early.
|
|
*/
|
|
return NULL;
|
|
}
|
|
order = get_order(size);
|
|
|
|
rcu_read_lock();
|
|
vbq = &get_cpu_var(vmap_block_queue);
|
|
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
|
|
unsigned long pages_off;
|
|
|
|
spin_lock(&vb->lock);
|
|
if (vb->free < (1UL << order)) {
|
|
spin_unlock(&vb->lock);
|
|
continue;
|
|
}
|
|
|
|
pages_off = VMAP_BBMAP_BITS - vb->free;
|
|
vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
|
|
vb->free -= 1UL << order;
|
|
if (vb->free == 0) {
|
|
spin_lock(&vbq->lock);
|
|
list_del_rcu(&vb->free_list);
|
|
spin_unlock(&vbq->lock);
|
|
}
|
|
|
|
spin_unlock(&vb->lock);
|
|
break;
|
|
}
|
|
|
|
put_cpu_var(vmap_block_queue);
|
|
rcu_read_unlock();
|
|
|
|
/* Allocate new block if nothing was found */
|
|
if (!vaddr)
|
|
vaddr = new_vmap_block(order, gfp_mask);
|
|
|
|
return vaddr;
|
|
}
|
|
|
|
static void vb_free(const void *addr, unsigned long size)
|
|
{
|
|
unsigned long offset;
|
|
unsigned long vb_idx;
|
|
unsigned int order;
|
|
struct vmap_block *vb;
|
|
|
|
BUG_ON(offset_in_page(size));
|
|
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
|
|
|
|
flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
|
|
|
|
order = get_order(size);
|
|
|
|
offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
|
|
offset >>= PAGE_SHIFT;
|
|
|
|
vb_idx = addr_to_vb_idx((unsigned long)addr);
|
|
rcu_read_lock();
|
|
vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
|
|
rcu_read_unlock();
|
|
BUG_ON(!vb);
|
|
|
|
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
|
|
|
|
if (debug_pagealloc_enabled())
|
|
flush_tlb_kernel_range((unsigned long)addr,
|
|
(unsigned long)addr + size);
|
|
|
|
spin_lock(&vb->lock);
|
|
|
|
/* Expand dirty range */
|
|
vb->dirty_min = min(vb->dirty_min, offset);
|
|
vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
|
|
|
|
vb->dirty += 1UL << order;
|
|
if (vb->dirty == VMAP_BBMAP_BITS) {
|
|
BUG_ON(vb->free);
|
|
spin_unlock(&vb->lock);
|
|
free_vmap_block(vb);
|
|
} else
|
|
spin_unlock(&vb->lock);
|
|
}
|
|
|
|
/**
|
|
* vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
|
|
*
|
|
* The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
|
|
* to amortize TLB flushing overheads. What this means is that any page you
|
|
* have now, may, in a former life, have been mapped into kernel virtual
|
|
* address by the vmap layer and so there might be some CPUs with TLB entries
|
|
* still referencing that page (additional to the regular 1:1 kernel mapping).
|
|
*
|
|
* vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
|
|
* be sure that none of the pages we have control over will have any aliases
|
|
* from the vmap layer.
|
|
*/
|
|
void vm_unmap_aliases(void)
|
|
{
|
|
unsigned long start = ULONG_MAX, end = 0;
|
|
int cpu;
|
|
int flush = 0;
|
|
|
|
if (unlikely(!vmap_initialized))
|
|
return;
|
|
|
|
might_sleep();
|
|
|
|
for_each_possible_cpu(cpu) {
|
|
struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
|
|
struct vmap_block *vb;
|
|
|
|
rcu_read_lock();
|
|
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
|
|
spin_lock(&vb->lock);
|
|
if (vb->dirty) {
|
|
unsigned long va_start = vb->va->va_start;
|
|
unsigned long s, e;
|
|
|
|
s = va_start + (vb->dirty_min << PAGE_SHIFT);
|
|
e = va_start + (vb->dirty_max << PAGE_SHIFT);
|
|
|
|
start = min(s, start);
|
|
end = max(e, end);
|
|
|
|
flush = 1;
|
|
}
|
|
spin_unlock(&vb->lock);
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
mutex_lock(&vmap_purge_lock);
|
|
purge_fragmented_blocks_allcpus();
|
|
if (!__purge_vmap_area_lazy(start, end) && flush)
|
|
flush_tlb_kernel_range(start, end);
|
|
mutex_unlock(&vmap_purge_lock);
|
|
}
|
|
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
|
|
|
|
/**
|
|
* vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
|
|
* @mem: the pointer returned by vm_map_ram
|
|
* @count: the count passed to that vm_map_ram call (cannot unmap partial)
|
|
*/
|
|
void vm_unmap_ram(const void *mem, unsigned int count)
|
|
{
|
|
unsigned long size = (unsigned long)count << PAGE_SHIFT;
|
|
unsigned long addr = (unsigned long)mem;
|
|
struct vmap_area *va;
|
|
|
|
might_sleep();
|
|
BUG_ON(!addr);
|
|
BUG_ON(addr < VMALLOC_START);
|
|
BUG_ON(addr > VMALLOC_END);
|
|
BUG_ON(!PAGE_ALIGNED(addr));
|
|
|
|
if (likely(count <= VMAP_MAX_ALLOC)) {
|
|
debug_check_no_locks_freed(mem, size);
|
|
vb_free(mem, size);
|
|
return;
|
|
}
|
|
|
|
va = find_vmap_area(addr);
|
|
BUG_ON(!va);
|
|
debug_check_no_locks_freed((void *)va->va_start,
|
|
(va->va_end - va->va_start));
|
|
free_unmap_vmap_area(va);
|
|
}
|
|
EXPORT_SYMBOL(vm_unmap_ram);
|
|
|
|
/**
|
|
* vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
|
|
* @pages: an array of pointers to the pages to be mapped
|
|
* @count: number of pages
|
|
* @node: prefer to allocate data structures on this node
|
|
* @prot: memory protection to use. PAGE_KERNEL for regular RAM
|
|
*
|
|
* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
|
|
* faster than vmap so it's good. But if you mix long-life and short-life
|
|
* objects with vm_map_ram(), it could consume lots of address space through
|
|
* fragmentation (especially on a 32bit machine). You could see failures in
|
|
* the end. Please use this function for short-lived objects.
|
|
*
|
|
* Returns: a pointer to the address that has been mapped, or %NULL on failure
|
|
*/
|
|
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
|
|
{
|
|
unsigned long size = (unsigned long)count << PAGE_SHIFT;
|
|
unsigned long addr;
|
|
void *mem;
|
|
|
|
if (likely(count <= VMAP_MAX_ALLOC)) {
|
|
mem = vb_alloc(size, GFP_KERNEL);
|
|
if (IS_ERR(mem))
|
|
return NULL;
|
|
addr = (unsigned long)mem;
|
|
} else {
|
|
struct vmap_area *va;
|
|
va = alloc_vmap_area(size, PAGE_SIZE,
|
|
VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
|
|
if (IS_ERR(va))
|
|
return NULL;
|
|
|
|
addr = va->va_start;
|
|
mem = (void *)addr;
|
|
}
|
|
if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
|
|
vm_unmap_ram(mem, count);
|
|
return NULL;
|
|
}
|
|
return mem;
|
|
}
|
|
EXPORT_SYMBOL(vm_map_ram);
|
|
|
|
static struct vm_struct *vmlist __initdata;
|
|
|
|
/**
|
|
* vm_area_check_early - check if vmap area is already mapped
|
|
* @vm: vm_struct to be checked
|
|
*
|
|
* This function is used to check if the vmap area has been
|
|
* mapped already. @vm->addr, @vm->size and @vm->flags should
|
|
* contain proper values.
|
|
*
|
|
*/
|
|
int __init vm_area_check_early(struct vm_struct *vm)
|
|
{
|
|
struct vm_struct *tmp, **p;
|
|
|
|
BUG_ON(vmap_initialized);
|
|
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
|
|
if (tmp->addr >= vm->addr) {
|
|
if (tmp->addr < vm->addr + vm->size)
|
|
return 1;
|
|
} else {
|
|
if (tmp->addr + tmp->size > vm->addr)
|
|
return 1;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* vm_area_add_early - add vmap area early during boot
|
|
* @vm: vm_struct to add
|
|
*
|
|
* This function is used to add fixed kernel vm area to vmlist before
|
|
* vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
|
|
* should contain proper values and the other fields should be zero.
|
|
*
|
|
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
|
|
*/
|
|
void __init vm_area_add_early(struct vm_struct *vm)
|
|
{
|
|
struct vm_struct *tmp, **p;
|
|
|
|
BUG_ON(vmap_initialized);
|
|
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
|
|
if (tmp->addr >= vm->addr) {
|
|
BUG_ON(tmp->addr < vm->addr + vm->size);
|
|
break;
|
|
} else
|
|
BUG_ON(tmp->addr + tmp->size > vm->addr);
|
|
}
|
|
vm->next = *p;
|
|
*p = vm;
|
|
}
|
|
|
|
/**
|
|
* vm_area_register_early - register vmap area early during boot
|
|
* @vm: vm_struct to register
|
|
* @align: requested alignment
|
|
*
|
|
* This function is used to register kernel vm area before
|
|
* vmalloc_init() is called. @vm->size and @vm->flags should contain
|
|
* proper values on entry and other fields should be zero. On return,
|
|
* vm->addr contains the allocated address.
|
|
*
|
|
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
|
|
*/
|
|
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
|
|
{
|
|
static size_t vm_init_off __initdata;
|
|
unsigned long addr;
|
|
|
|
addr = ALIGN(VMALLOC_START + vm_init_off, align);
|
|
vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
|
|
|
|
vm->addr = (void *)addr;
|
|
|
|
vm_area_add_early(vm);
|
|
}
|
|
|
|
static void vmap_init_free_space(void)
|
|
{
|
|
unsigned long vmap_start = 1;
|
|
const unsigned long vmap_end = ULONG_MAX;
|
|
struct vmap_area *busy, *free;
|
|
|
|
/*
|
|
* B F B B B F
|
|
* -|-----|.....|-----|-----|-----|.....|-
|
|
* | The KVA space |
|
|
* |<--------------------------------->|
|
|
*/
|
|
list_for_each_entry(busy, &vmap_area_list, list) {
|
|
if (busy->va_start - vmap_start > 0) {
|
|
free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
|
|
if (!WARN_ON_ONCE(!free)) {
|
|
free->va_start = vmap_start;
|
|
free->va_end = busy->va_start;
|
|
|
|
insert_vmap_area_augment(free, NULL,
|
|
&free_vmap_area_root,
|
|
&free_vmap_area_list);
|
|
}
|
|
}
|
|
|
|
vmap_start = busy->va_end;
|
|
}
|
|
|
|
if (vmap_end - vmap_start > 0) {
|
|
free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
|
|
if (!WARN_ON_ONCE(!free)) {
|
|
free->va_start = vmap_start;
|
|
free->va_end = vmap_end;
|
|
|
|
insert_vmap_area_augment(free, NULL,
|
|
&free_vmap_area_root,
|
|
&free_vmap_area_list);
|
|
}
|
|
}
|
|
}
|
|
|
|
void __init vmalloc_init(void)
|
|
{
|
|
struct vmap_area *va;
|
|
struct vm_struct *tmp;
|
|
int i;
|
|
|
|
/*
|
|
* Create the cache for vmap_area objects.
|
|
*/
|
|
vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
|
|
|
|
for_each_possible_cpu(i) {
|
|
struct vmap_block_queue *vbq;
|
|
struct vfree_deferred *p;
|
|
|
|
vbq = &per_cpu(vmap_block_queue, i);
|
|
spin_lock_init(&vbq->lock);
|
|
INIT_LIST_HEAD(&vbq->free);
|
|
p = &per_cpu(vfree_deferred, i);
|
|
init_llist_head(&p->list);
|
|
INIT_WORK(&p->wq, free_work);
|
|
}
|
|
|
|
/* Import existing vmlist entries. */
|
|
for (tmp = vmlist; tmp; tmp = tmp->next) {
|
|
va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
|
|
if (WARN_ON_ONCE(!va))
|
|
continue;
|
|
|
|
va->flags = VM_VM_AREA;
|
|
va->va_start = (unsigned long)tmp->addr;
|
|
va->va_end = va->va_start + tmp->size;
|
|
va->vm = tmp;
|
|
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
|
|
}
|
|
|
|
/*
|
|
* Now we can initialize a free vmap space.
|
|
*/
|
|
vmap_init_free_space();
|
|
calc_total_vmalloc_size();
|
|
vmap_initialized = true;
|
|
}
|
|
|
|
/**
|
|
* map_kernel_range_noflush - map kernel VM area with the specified pages
|
|
* @addr: start of the VM area to map
|
|
* @size: size of the VM area to map
|
|
* @prot: page protection flags to use
|
|
* @pages: pages to map
|
|
*
|
|
* Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
|
|
* specify should have been allocated using get_vm_area() and its
|
|
* friends.
|
|
*
|
|
* NOTE:
|
|
* This function does NOT do any cache flushing. The caller is
|
|
* responsible for calling flush_cache_vmap() on to-be-mapped areas
|
|
* before calling this function.
|
|
*
|
|
* RETURNS:
|
|
* The number of pages mapped on success, -errno on failure.
|
|
*/
|
|
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
|
|
pgprot_t prot, struct page **pages)
|
|
{
|
|
return vmap_page_range_noflush(addr, addr + size, prot, pages);
|
|
}
|
|
|
|
/**
|
|
* unmap_kernel_range_noflush - unmap kernel VM area
|
|
* @addr: start of the VM area to unmap
|
|
* @size: size of the VM area to unmap
|
|
*
|
|
* Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
|
|
* specify should have been allocated using get_vm_area() and its
|
|
* friends.
|
|
*
|
|
* NOTE:
|
|
* This function does NOT do any cache flushing. The caller is
|
|
* responsible for calling flush_cache_vunmap() on to-be-mapped areas
|
|
* before calling this function and flush_tlb_kernel_range() after.
|
|
*/
|
|
void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
|
|
{
|
|
vunmap_page_range(addr, addr + size);
|
|
}
|
|
EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
|
|
|
|
/**
|
|
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
|
|
* @addr: start of the VM area to unmap
|
|
* @size: size of the VM area to unmap
|
|
*
|
|
* Similar to unmap_kernel_range_noflush() but flushes vcache before
|
|
* the unmapping and tlb after.
|
|
*/
|
|
void unmap_kernel_range(unsigned long addr, unsigned long size)
|
|
{
|
|
unsigned long end = addr + size;
|
|
|
|
flush_cache_vunmap(addr, end);
|
|
vunmap_page_range(addr, end);
|
|
flush_tlb_kernel_range(addr, end);
|
|
}
|
|
EXPORT_SYMBOL_GPL(unmap_kernel_range);
|
|
|
|
int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
|
|
{
|
|
unsigned long addr = (unsigned long)area->addr;
|
|
unsigned long end = addr + get_vm_area_size(area);
|
|
int err;
|
|
|
|
err = vmap_page_range(addr, end, prot, pages);
|
|
|
|
return err > 0 ? 0 : err;
|
|
}
|
|
EXPORT_SYMBOL_GPL(map_vm_area);
|
|
|
|
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
|
|
unsigned long flags, const void *caller)
|
|
{
|
|
spin_lock(&vmap_area_lock);
|
|
vm->flags = flags;
|
|
vm->addr = (void *)va->va_start;
|
|
vm->size = va->va_end - va->va_start;
|
|
vm->caller = caller;
|
|
va->vm = vm;
|
|
va->flags |= VM_VM_AREA;
|
|
spin_unlock(&vmap_area_lock);
|
|
}
|
|
|
|
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
|
|
{
|
|
/*
|
|
* Before removing VM_UNINITIALIZED,
|
|
* we should make sure that vm has proper values.
|
|
* Pair with smp_rmb() in show_numa_info().
|
|
*/
|
|
smp_wmb();
|
|
vm->flags &= ~VM_UNINITIALIZED;
|
|
}
|
|
|
|
static struct vm_struct *__get_vm_area_node(unsigned long size,
|
|
unsigned long align, unsigned long flags, unsigned long start,
|
|
unsigned long end, int node, gfp_t gfp_mask, const void *caller)
|
|
{
|
|
struct vmap_area *va;
|
|
struct vm_struct *area;
|
|
|
|
BUG_ON(in_interrupt());
|
|
size = PAGE_ALIGN(size);
|
|
if (unlikely(!size))
|
|
return NULL;
|
|
|
|
if (flags & VM_IOREMAP)
|
|
align = 1ul << clamp_t(int, get_count_order_long(size),
|
|
PAGE_SHIFT, IOREMAP_MAX_ORDER);
|
|
|
|
area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
|
|
if (unlikely(!area))
|
|
return NULL;
|
|
|
|
if (!(flags & VM_NO_GUARD))
|
|
size += PAGE_SIZE;
|
|
|
|
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
|
|
if (IS_ERR(va)) {
|
|
kfree(area);
|
|
return NULL;
|
|
}
|
|
|
|
setup_vmalloc_vm(area, va, flags, caller);
|
|
|
|
return area;
|
|
}
|
|
|
|
struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
|
|
GFP_KERNEL, __builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL_GPL(__get_vm_area);
|
|
|
|
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
|
|
unsigned long start, unsigned long end,
|
|
const void *caller)
|
|
{
|
|
return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
|
|
GFP_KERNEL, caller);
|
|
}
|
|
|
|
/**
|
|
* get_vm_area - reserve a contiguous kernel virtual area
|
|
* @size: size of the area
|
|
* @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
|
|
*
|
|
* Search an area of @size in the kernel virtual mapping area,
|
|
* and reserved it for out purposes. Returns the area descriptor
|
|
* on success or %NULL on failure.
|
|
*/
|
|
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
|
|
{
|
|
#ifdef CONFIG_ENABLE_VMALLOC_SAVING
|
|
return __get_vm_area_node(size, 1, flags, PAGE_OFFSET, VMALLOC_END,
|
|
NUMA_NO_NODE, GFP_KERNEL,
|
|
__builtin_return_address(0));
|
|
#else
|
|
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
|
|
NUMA_NO_NODE, GFP_KERNEL,
|
|
__builtin_return_address(0));
|
|
#endif
|
|
}
|
|
|
|
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
|
|
const void *caller)
|
|
{
|
|
#ifdef CONFIG_ENABLE_VMALLOC_SAVING
|
|
return __get_vm_area_node(size, 1, flags, PAGE_OFFSET, VMALLOC_END,
|
|
NUMA_NO_NODE, GFP_KERNEL, caller);
|
|
#else
|
|
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
|
|
NUMA_NO_NODE, GFP_KERNEL, caller);
|
|
#endif
|
|
}
|
|
|
|
/**
|
|
* find_vm_area - find a continuous kernel virtual area
|
|
* @addr: base address
|
|
*
|
|
* Search for the kernel VM area starting at @addr, and return it.
|
|
* It is up to the caller to do all required locking to keep the returned
|
|
* pointer valid.
|
|
*/
|
|
struct vm_struct *find_vm_area(const void *addr)
|
|
{
|
|
struct vmap_area *va;
|
|
|
|
va = find_vmap_area((unsigned long)addr);
|
|
if (va && va->flags & VM_VM_AREA)
|
|
return va->vm;
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static struct vm_struct *__remove_vm_area(struct vmap_area *va)
|
|
{
|
|
struct vm_struct *vm = va->vm;
|
|
|
|
spin_lock(&vmap_area_lock);
|
|
va->vm = NULL;
|
|
va->flags &= ~VM_VM_AREA;
|
|
va->flags |= VM_LAZY_FREE;
|
|
spin_unlock(&vmap_area_lock);
|
|
|
|
kasan_free_shadow(vm);
|
|
free_unmap_vmap_area(va);
|
|
|
|
return vm;
|
|
}
|
|
|
|
/**
|
|
* remove_vm_area - find and remove a continuous kernel virtual area
|
|
* @addr: base address
|
|
*
|
|
* Search for the kernel VM area starting at @addr, and remove it.
|
|
* This function returns the found VM area, but using it is NOT safe
|
|
* on SMP machines, except for its size or flags.
|
|
*/
|
|
struct vm_struct *remove_vm_area(const void *addr)
|
|
{
|
|
struct vm_struct *vm = NULL;
|
|
struct vmap_area *va;
|
|
|
|
va = find_vmap_area((unsigned long)addr);
|
|
if (va && va->flags & VM_VM_AREA)
|
|
vm = __remove_vm_area(va);
|
|
|
|
return vm;
|
|
}
|
|
|
|
static void __vunmap(const void *addr, int deallocate_pages)
|
|
{
|
|
struct vm_struct *area;
|
|
struct vmap_area *va;
|
|
|
|
if (!addr)
|
|
return;
|
|
|
|
if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
|
|
addr))
|
|
return;
|
|
|
|
va = find_vmap_area((unsigned long)addr);
|
|
if (unlikely(!va || !(va->flags & VM_VM_AREA))) {
|
|
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
|
|
addr);
|
|
return;
|
|
}
|
|
|
|
area = va->vm;
|
|
debug_check_no_locks_freed(addr, get_vm_area_size(area));
|
|
debug_check_no_obj_freed(addr, get_vm_area_size(area));
|
|
|
|
__remove_vm_area(va);
|
|
if (deallocate_pages) {
|
|
int i;
|
|
|
|
for (i = 0; i < area->nr_pages; i++) {
|
|
struct page *page = area->pages[i];
|
|
|
|
BUG_ON(!page);
|
|
__free_pages(page, 0);
|
|
}
|
|
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
|
|
|
|
kvfree(area->pages);
|
|
}
|
|
|
|
kfree(area);
|
|
}
|
|
|
|
static inline void __vfree_deferred(const void *addr)
|
|
{
|
|
/*
|
|
* Use raw_cpu_ptr() because this can be called from preemptible
|
|
* context. Preemption is absolutely fine here, because the llist_add()
|
|
* implementation is lockless, so it works even if we are adding to
|
|
* nother cpu's list. schedule_work() should be fine with this too.
|
|
*/
|
|
struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
|
|
|
|
if (llist_add((struct llist_node *)addr, &p->list))
|
|
schedule_work(&p->wq);
|
|
}
|
|
|
|
/**
|
|
* vfree_atomic - release memory allocated by vmalloc()
|
|
* @addr: memory base address
|
|
*
|
|
* This one is just like vfree() but can be called in any atomic context
|
|
* except NMIs.
|
|
*/
|
|
void vfree_atomic(const void *addr)
|
|
{
|
|
BUG_ON(in_nmi());
|
|
|
|
kmemleak_free(addr);
|
|
|
|
if (!addr)
|
|
return;
|
|
__vfree_deferred(addr);
|
|
}
|
|
|
|
/**
|
|
* vfree - release memory allocated by vmalloc()
|
|
* @addr: memory base address
|
|
*
|
|
* Free the virtually continuous memory area starting at @addr, as
|
|
* obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
|
|
* NULL, no operation is performed.
|
|
*
|
|
* Must not be called in NMI context (strictly speaking, only if we don't
|
|
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
|
|
* conventions for vfree() arch-depenedent would be a really bad idea)
|
|
*
|
|
* NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
|
|
*/
|
|
void vfree(const void *addr)
|
|
{
|
|
BUG_ON(in_nmi());
|
|
|
|
kmemleak_free(addr);
|
|
|
|
if (!addr)
|
|
return;
|
|
if (unlikely(in_interrupt()))
|
|
__vfree_deferred(addr);
|
|
else
|
|
__vunmap(addr, 1);
|
|
}
|
|
EXPORT_SYMBOL(vfree);
|
|
|
|
/**
|
|
* vunmap - release virtual mapping obtained by vmap()
|
|
* @addr: memory base address
|
|
*
|
|
* Free the virtually contiguous memory area starting at @addr,
|
|
* which was created from the page array passed to vmap().
|
|
*
|
|
* Must not be called in interrupt context.
|
|
*/
|
|
void vunmap(const void *addr)
|
|
{
|
|
BUG_ON(in_interrupt());
|
|
might_sleep();
|
|
if (addr)
|
|
__vunmap(addr, 0);
|
|
}
|
|
EXPORT_SYMBOL(vunmap);
|
|
|
|
/**
|
|
* vmap - map an array of pages into virtually contiguous space
|
|
* @pages: array of page pointers
|
|
* @count: number of pages to map
|
|
* @flags: vm_area->flags
|
|
* @prot: page protection for the mapping
|
|
*
|
|
* Maps @count pages from @pages into contiguous kernel virtual
|
|
* space.
|
|
*/
|
|
void *vmap(struct page **pages, unsigned int count,
|
|
unsigned long flags, pgprot_t prot)
|
|
{
|
|
struct vm_struct *area;
|
|
unsigned long size; /* In bytes */
|
|
|
|
might_sleep();
|
|
|
|
if (count > totalram_pages)
|
|
return NULL;
|
|
|
|
size = (unsigned long)count << PAGE_SHIFT;
|
|
area = get_vm_area_caller(size, flags, __builtin_return_address(0));
|
|
if (!area)
|
|
return NULL;
|
|
|
|
if (map_vm_area(area, prot, pages)) {
|
|
vunmap(area->addr);
|
|
return NULL;
|
|
}
|
|
|
|
return area->addr;
|
|
}
|
|
EXPORT_SYMBOL(vmap);
|
|
|
|
static void *__vmalloc_node(unsigned long size, unsigned long align,
|
|
gfp_t gfp_mask, pgprot_t prot,
|
|
int node, const void *caller);
|
|
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
|
|
pgprot_t prot, int node)
|
|
{
|
|
struct page **pages;
|
|
unsigned int nr_pages, array_size, i;
|
|
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
|
|
const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
|
|
const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
|
|
0 :
|
|
__GFP_HIGHMEM;
|
|
|
|
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
|
|
array_size = (nr_pages * sizeof(struct page *));
|
|
|
|
area->nr_pages = nr_pages;
|
|
/* Please note that the recursion is strictly bounded. */
|
|
if (array_size > PAGE_SIZE) {
|
|
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
|
|
PAGE_KERNEL, node, area->caller);
|
|
} else {
|
|
pages = kmalloc_node(array_size, nested_gfp, node);
|
|
}
|
|
area->pages = pages;
|
|
if (!area->pages) {
|
|
remove_vm_area(area->addr);
|
|
kfree(area);
|
|
return NULL;
|
|
}
|
|
|
|
for (i = 0; i < area->nr_pages; i++) {
|
|
struct page *page;
|
|
|
|
if (node == NUMA_NO_NODE)
|
|
page = alloc_page(alloc_mask|highmem_mask);
|
|
else
|
|
page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
|
|
|
|
if (unlikely(!page)) {
|
|
/* Successfully allocated i pages, free them in __vunmap() */
|
|
area->nr_pages = i;
|
|
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
|
|
goto fail;
|
|
}
|
|
area->pages[i] = page;
|
|
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
|
|
cond_resched();
|
|
}
|
|
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
|
|
|
|
if (map_vm_area(area, prot, pages))
|
|
goto fail;
|
|
return area->addr;
|
|
|
|
fail:
|
|
warn_alloc(gfp_mask, NULL,
|
|
"vmalloc: allocation failure, allocated %ld of %ld bytes",
|
|
(area->nr_pages*PAGE_SIZE), area->size);
|
|
vfree(area->addr);
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* __vmalloc_node_range - allocate virtually contiguous memory
|
|
* @size: allocation size
|
|
* @align: desired alignment
|
|
* @start: vm area range start
|
|
* @end: vm area range end
|
|
* @gfp_mask: flags for the page level allocator
|
|
* @prot: protection mask for the allocated pages
|
|
* @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
|
|
* @node: node to use for allocation or NUMA_NO_NODE
|
|
* @caller: caller's return address
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator with @gfp_mask flags. Map them into contiguous
|
|
* kernel virtual space, using a pagetable protection of @prot.
|
|
*/
|
|
void *__vmalloc_node_range(unsigned long size, unsigned long align,
|
|
unsigned long start, unsigned long end, gfp_t gfp_mask,
|
|
pgprot_t prot, unsigned long vm_flags, int node,
|
|
const void *caller)
|
|
{
|
|
struct vm_struct *area;
|
|
void *addr;
|
|
unsigned long real_size = size;
|
|
|
|
size = PAGE_ALIGN(size);
|
|
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
|
|
goto fail;
|
|
|
|
area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
|
|
vm_flags, start, end, node, gfp_mask, caller);
|
|
if (!area)
|
|
goto fail;
|
|
|
|
addr = __vmalloc_area_node(area, gfp_mask, prot, node);
|
|
if (!addr)
|
|
return NULL;
|
|
|
|
/*
|
|
* First make sure the mappings are removed from all page-tables
|
|
* before they are freed.
|
|
*/
|
|
vmalloc_sync_all();
|
|
|
|
/*
|
|
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
|
|
* flag. It means that vm_struct is not fully initialized.
|
|
* Now, it is fully initialized, so remove this flag here.
|
|
*/
|
|
clear_vm_uninitialized_flag(area);
|
|
|
|
kmemleak_vmalloc(area, size, gfp_mask);
|
|
|
|
return addr;
|
|
|
|
fail:
|
|
warn_alloc(gfp_mask, NULL,
|
|
"vmalloc: allocation failure: %lu bytes", real_size);
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* __vmalloc_node - allocate virtually contiguous memory
|
|
* @size: allocation size
|
|
* @align: desired alignment
|
|
* @gfp_mask: flags for the page level allocator
|
|
* @prot: protection mask for the allocated pages
|
|
* @node: node to use for allocation or NUMA_NO_NODE
|
|
* @caller: caller's return address
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator with @gfp_mask flags. Map them into contiguous
|
|
* kernel virtual space, using a pagetable protection of @prot.
|
|
*
|
|
* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
|
|
* and __GFP_NOFAIL are not supported
|
|
*
|
|
* Any use of gfp flags outside of GFP_KERNEL should be consulted
|
|
* with mm people.
|
|
*
|
|
*/
|
|
static void *__vmalloc_node(unsigned long size, unsigned long align,
|
|
gfp_t gfp_mask, pgprot_t prot,
|
|
int node, const void *caller)
|
|
{
|
|
#ifdef CONFIG_ENABLE_VMALLOC_SAVING
|
|
return __vmalloc_node_range(size, align, PAGE_OFFSET, VMALLOC_END,
|
|
gfp_mask, prot, 0, node, caller);
|
|
#else
|
|
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
|
|
gfp_mask, prot, 0, node, caller);
|
|
#endif
|
|
}
|
|
|
|
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
|
|
{
|
|
return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
|
|
__builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(__vmalloc);
|
|
|
|
static inline void *__vmalloc_node_flags(unsigned long size,
|
|
int node, gfp_t flags)
|
|
{
|
|
return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
|
|
node, __builtin_return_address(0));
|
|
}
|
|
|
|
|
|
void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
|
|
void *caller)
|
|
{
|
|
return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
|
|
}
|
|
|
|
/**
|
|
* vmalloc - allocate virtually contiguous memory
|
|
* @size: allocation size
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc() instead.
|
|
*/
|
|
void *vmalloc(unsigned long size)
|
|
{
|
|
return __vmalloc_node_flags(size, NUMA_NO_NODE,
|
|
GFP_KERNEL);
|
|
}
|
|
EXPORT_SYMBOL(vmalloc);
|
|
|
|
/**
|
|
* vzalloc - allocate virtually contiguous memory with zero fill
|
|
* @size: allocation size
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
* The memory allocated is set to zero.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc() instead.
|
|
*/
|
|
void *vzalloc(unsigned long size)
|
|
{
|
|
return __vmalloc_node_flags(size, NUMA_NO_NODE,
|
|
GFP_KERNEL | __GFP_ZERO);
|
|
}
|
|
EXPORT_SYMBOL(vzalloc);
|
|
|
|
/**
|
|
* vmalloc_user - allocate zeroed virtually contiguous memory for userspace
|
|
* @size: allocation size
|
|
*
|
|
* The resulting memory area is zeroed so it can be mapped to userspace
|
|
* without leaking data.
|
|
*/
|
|
void *vmalloc_user(unsigned long size)
|
|
{
|
|
struct vm_struct *area;
|
|
void *ret;
|
|
|
|
ret = __vmalloc_node(size, SHMLBA,
|
|
GFP_KERNEL | __GFP_ZERO,
|
|
PAGE_KERNEL, NUMA_NO_NODE,
|
|
__builtin_return_address(0));
|
|
if (ret) {
|
|
area = find_vm_area(ret);
|
|
area->flags |= VM_USERMAP;
|
|
}
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_user);
|
|
|
|
/**
|
|
* vmalloc_node - allocate memory on a specific node
|
|
* @size: allocation size
|
|
* @node: numa node
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc() instead.
|
|
*/
|
|
void *vmalloc_node(unsigned long size, int node)
|
|
{
|
|
return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
|
|
node, __builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_node);
|
|
|
|
/**
|
|
* vzalloc_node - allocate memory on a specific node with zero fill
|
|
* @size: allocation size
|
|
* @node: numa node
|
|
*
|
|
* Allocate enough pages to cover @size from the page level
|
|
* allocator and map them into contiguous kernel virtual space.
|
|
* The memory allocated is set to zero.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc_node() instead.
|
|
*/
|
|
void *vzalloc_node(unsigned long size, int node)
|
|
{
|
|
return __vmalloc_node_flags(size, node,
|
|
GFP_KERNEL | __GFP_ZERO);
|
|
}
|
|
EXPORT_SYMBOL(vzalloc_node);
|
|
|
|
/**
|
|
* vmalloc_exec - allocate virtually contiguous, executable memory
|
|
* @size: allocation size
|
|
*
|
|
* Kernel-internal function to allocate enough pages to cover @size
|
|
* the page level allocator and map them into contiguous and
|
|
* executable kernel virtual space.
|
|
*
|
|
* For tight control over page level allocator and protection flags
|
|
* use __vmalloc() instead.
|
|
*/
|
|
|
|
void *vmalloc_exec(unsigned long size)
|
|
{
|
|
return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
|
|
NUMA_NO_NODE, __builtin_return_address(0));
|
|
}
|
|
|
|
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
|
|
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
|
|
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
|
|
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
|
|
#else
|
|
/*
|
|
* 64b systems should always have either DMA or DMA32 zones. For others
|
|
* GFP_DMA32 should do the right thing and use the normal zone.
|
|
*/
|
|
#define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
|
|
#endif
|
|
|
|
/**
|
|
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
|
|
* @size: allocation size
|
|
*
|
|
* Allocate enough 32bit PA addressable pages to cover @size from the
|
|
* page level allocator and map them into contiguous kernel virtual space.
|
|
*/
|
|
void *vmalloc_32(unsigned long size)
|
|
{
|
|
return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
|
|
NUMA_NO_NODE, __builtin_return_address(0));
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_32);
|
|
|
|
/**
|
|
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
|
|
* @size: allocation size
|
|
*
|
|
* The resulting memory area is 32bit addressable and zeroed so it can be
|
|
* mapped to userspace without leaking data.
|
|
*/
|
|
void *vmalloc_32_user(unsigned long size)
|
|
{
|
|
struct vm_struct *area;
|
|
void *ret;
|
|
|
|
ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
|
|
NUMA_NO_NODE, __builtin_return_address(0));
|
|
if (ret) {
|
|
area = find_vm_area(ret);
|
|
area->flags |= VM_USERMAP;
|
|
}
|
|
return ret;
|
|
}
|
|
EXPORT_SYMBOL(vmalloc_32_user);
|
|
|
|
/*
|
|
* small helper routine , copy contents to buf from addr.
|
|
* If the page is not present, fill zero.
|
|
*/
|
|
|
|
static int aligned_vread(char *buf, char *addr, unsigned long count)
|
|
{
|
|
struct page *p;
|
|
int copied = 0;
|
|
|
|
while (count) {
|
|
unsigned long offset, length;
|
|
|
|
offset = offset_in_page(addr);
|
|
length = PAGE_SIZE - offset;
|
|
if (length > count)
|
|
length = count;
|
|
p = vmalloc_to_page(addr);
|
|
/*
|
|
* To do safe access to this _mapped_ area, we need
|
|
* lock. But adding lock here means that we need to add
|
|
* overhead of vmalloc()/vfree() calles for this _debug_
|
|
* interface, rarely used. Instead of that, we'll use
|
|
* kmap() and get small overhead in this access function.
|
|
*/
|
|
if (p) {
|
|
/*
|
|
* we can expect USER0 is not used (see vread/vwrite's
|
|
* function description)
|
|
*/
|
|
void *map = kmap_atomic(p);
|
|
memcpy(buf, map + offset, length);
|
|
kunmap_atomic(map);
|
|
} else
|
|
memset(buf, 0, length);
|
|
|
|
addr += length;
|
|
buf += length;
|
|
copied += length;
|
|
count -= length;
|
|
}
|
|
return copied;
|
|
}
|
|
|
|
static int aligned_vwrite(char *buf, char *addr, unsigned long count)
|
|
{
|
|
struct page *p;
|
|
int copied = 0;
|
|
|
|
while (count) {
|
|
unsigned long offset, length;
|
|
|
|
offset = offset_in_page(addr);
|
|
length = PAGE_SIZE - offset;
|
|
if (length > count)
|
|
length = count;
|
|
p = vmalloc_to_page(addr);
|
|
/*
|
|
* To do safe access to this _mapped_ area, we need
|
|
* lock. But adding lock here means that we need to add
|
|
* overhead of vmalloc()/vfree() calles for this _debug_
|
|
* interface, rarely used. Instead of that, we'll use
|
|
* kmap() and get small overhead in this access function.
|
|
*/
|
|
if (p) {
|
|
/*
|
|
* we can expect USER0 is not used (see vread/vwrite's
|
|
* function description)
|
|
*/
|
|
void *map = kmap_atomic(p);
|
|
memcpy(map + offset, buf, length);
|
|
kunmap_atomic(map);
|
|
}
|
|
addr += length;
|
|
buf += length;
|
|
copied += length;
|
|
count -= length;
|
|
}
|
|
return copied;
|
|
}
|
|
|
|
/**
|
|
* vread() - read vmalloc area in a safe way.
|
|
* @buf: buffer for reading data
|
|
* @addr: vm address.
|
|
* @count: number of bytes to be read.
|
|
*
|
|
* Returns # of bytes which addr and buf should be increased.
|
|
* (same number to @count). Returns 0 if [addr...addr+count) doesn't
|
|
* includes any intersect with alive vmalloc area.
|
|
*
|
|
* This function checks that addr is a valid vmalloc'ed area, and
|
|
* copy data from that area to a given buffer. If the given memory range
|
|
* of [addr...addr+count) includes some valid address, data is copied to
|
|
* proper area of @buf. If there are memory holes, they'll be zero-filled.
|
|
* IOREMAP area is treated as memory hole and no copy is done.
|
|
*
|
|
* If [addr...addr+count) doesn't includes any intersects with alive
|
|
* vm_struct area, returns 0. @buf should be kernel's buffer.
|
|
*
|
|
* Note: In usual ops, vread() is never necessary because the caller
|
|
* should know vmalloc() area is valid and can use memcpy().
|
|
* This is for routines which have to access vmalloc area without
|
|
* any informaion, as /dev/kmem.
|
|
*
|
|
*/
|
|
|
|
long vread(char *buf, char *addr, unsigned long count)
|
|
{
|
|
struct vmap_area *va;
|
|
struct vm_struct *vm;
|
|
char *vaddr, *buf_start = buf;
|
|
unsigned long buflen = count;
|
|
unsigned long n;
|
|
|
|
/* Don't allow overflow */
|
|
if ((unsigned long) addr + count < count)
|
|
count = -(unsigned long) addr;
|
|
|
|
spin_lock(&vmap_area_lock);
|
|
list_for_each_entry(va, &vmap_area_list, list) {
|
|
if (!count)
|
|
break;
|
|
|
|
if (!(va->flags & VM_VM_AREA))
|
|
continue;
|
|
|
|
vm = va->vm;
|
|
vaddr = (char *) vm->addr;
|
|
if (addr >= vaddr + get_vm_area_size(vm))
|
|
continue;
|
|
while (addr < vaddr) {
|
|
if (count == 0)
|
|
goto finished;
|
|
*buf = '\0';
|
|
buf++;
|
|
addr++;
|
|
count--;
|
|
}
|
|
n = vaddr + get_vm_area_size(vm) - addr;
|
|
if (n > count)
|
|
n = count;
|
|
if (!(vm->flags & VM_IOREMAP))
|
|
aligned_vread(buf, addr, n);
|
|
else /* IOREMAP area is treated as memory hole */
|
|
memset(buf, 0, n);
|
|
buf += n;
|
|
addr += n;
|
|
count -= n;
|
|
}
|
|
finished:
|
|
spin_unlock(&vmap_area_lock);
|
|
|
|
if (buf == buf_start)
|
|
return 0;
|
|
/* zero-fill memory holes */
|
|
if (buf != buf_start + buflen)
|
|
memset(buf, 0, buflen - (buf - buf_start));
|
|
|
|
return buflen;
|
|
}
|
|
|
|
/**
|
|
* vwrite() - write vmalloc area in a safe way.
|
|
* @buf: buffer for source data
|
|
* @addr: vm address.
|
|
* @count: number of bytes to be read.
|
|
*
|
|
* Returns # of bytes which addr and buf should be incresed.
|
|
* (same number to @count).
|
|
* If [addr...addr+count) doesn't includes any intersect with valid
|
|
* vmalloc area, returns 0.
|
|
*
|
|
* This function checks that addr is a valid vmalloc'ed area, and
|
|
* copy data from a buffer to the given addr. If specified range of
|
|
* [addr...addr+count) includes some valid address, data is copied from
|
|
* proper area of @buf. If there are memory holes, no copy to hole.
|
|
* IOREMAP area is treated as memory hole and no copy is done.
|
|
*
|
|
* If [addr...addr+count) doesn't includes any intersects with alive
|
|
* vm_struct area, returns 0. @buf should be kernel's buffer.
|
|
*
|
|
* Note: In usual ops, vwrite() is never necessary because the caller
|
|
* should know vmalloc() area is valid and can use memcpy().
|
|
* This is for routines which have to access vmalloc area without
|
|
* any informaion, as /dev/kmem.
|
|
*/
|
|
|
|
long vwrite(char *buf, char *addr, unsigned long count)
|
|
{
|
|
struct vmap_area *va;
|
|
struct vm_struct *vm;
|
|
char *vaddr;
|
|
unsigned long n, buflen;
|
|
int copied = 0;
|
|
|
|
/* Don't allow overflow */
|
|
if ((unsigned long) addr + count < count)
|
|
count = -(unsigned long) addr;
|
|
buflen = count;
|
|
|
|
spin_lock(&vmap_area_lock);
|
|
list_for_each_entry(va, &vmap_area_list, list) {
|
|
if (!count)
|
|
break;
|
|
|
|
if (!(va->flags & VM_VM_AREA))
|
|
continue;
|
|
|
|
vm = va->vm;
|
|
vaddr = (char *) vm->addr;
|
|
if (addr >= vaddr + get_vm_area_size(vm))
|
|
continue;
|
|
while (addr < vaddr) {
|
|
if (count == 0)
|
|
goto finished;
|
|
buf++;
|
|
addr++;
|
|
count--;
|
|
}
|
|
n = vaddr + get_vm_area_size(vm) - addr;
|
|
if (n > count)
|
|
n = count;
|
|
if (!(vm->flags & VM_IOREMAP)) {
|
|
aligned_vwrite(buf, addr, n);
|
|
copied++;
|
|
}
|
|
buf += n;
|
|
addr += n;
|
|
count -= n;
|
|
}
|
|
finished:
|
|
spin_unlock(&vmap_area_lock);
|
|
if (!copied)
|
|
return 0;
|
|
return buflen;
|
|
}
|
|
|
|
/**
|
|
* remap_vmalloc_range_partial - map vmalloc pages to userspace
|
|
* @vma: vma to cover
|
|
* @uaddr: target user address to start at
|
|
* @kaddr: virtual address of vmalloc kernel memory
|
|
* @size: size of map area
|
|
*
|
|
* Returns: 0 for success, -Exxx on failure
|
|
*
|
|
* This function checks that @kaddr is a valid vmalloc'ed area,
|
|
* and that it is big enough to cover the range starting at
|
|
* @uaddr in @vma. Will return failure if that criteria isn't
|
|
* met.
|
|
*
|
|
* Similar to remap_pfn_range() (see mm/memory.c)
|
|
*/
|
|
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
|
|
void *kaddr, unsigned long size)
|
|
{
|
|
struct vm_struct *area;
|
|
|
|
size = PAGE_ALIGN(size);
|
|
|
|
if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
|
|
return -EINVAL;
|
|
|
|
area = find_vm_area(kaddr);
|
|
if (!area)
|
|
return -EINVAL;
|
|
|
|
if (!(area->flags & VM_USERMAP))
|
|
return -EINVAL;
|
|
|
|
if (kaddr + size > area->addr + get_vm_area_size(area))
|
|
return -EINVAL;
|
|
|
|
do {
|
|
struct page *page = vmalloc_to_page(kaddr);
|
|
int ret;
|
|
|
|
ret = vm_insert_page(vma, uaddr, page);
|
|
if (ret)
|
|
return ret;
|
|
|
|
uaddr += PAGE_SIZE;
|
|
kaddr += PAGE_SIZE;
|
|
size -= PAGE_SIZE;
|
|
} while (size > 0);
|
|
|
|
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
|
|
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL(remap_vmalloc_range_partial);
|
|
|
|
/**
|
|
* remap_vmalloc_range - map vmalloc pages to userspace
|
|
* @vma: vma to cover (map full range of vma)
|
|
* @addr: vmalloc memory
|
|
* @pgoff: number of pages into addr before first page to map
|
|
*
|
|
* Returns: 0 for success, -Exxx on failure
|
|
*
|
|
* This function checks that addr is a valid vmalloc'ed area, and
|
|
* that it is big enough to cover the vma. Will return failure if
|
|
* that criteria isn't met.
|
|
*
|
|
* Similar to remap_pfn_range() (see mm/memory.c)
|
|
*/
|
|
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
|
|
unsigned long pgoff)
|
|
{
|
|
return remap_vmalloc_range_partial(vma, vma->vm_start,
|
|
addr + (pgoff << PAGE_SHIFT),
|
|
vma->vm_end - vma->vm_start);
|
|
}
|
|
EXPORT_SYMBOL(remap_vmalloc_range);
|
|
|
|
/*
|
|
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
|
|
* have one.
|
|
*
|
|
* The purpose of this function is to make sure the vmalloc area
|
|
* mappings are identical in all page-tables in the system.
|
|
*/
|
|
void __weak vmalloc_sync_all(void)
|
|
{
|
|
}
|
|
|
|
|
|
static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
|
|
{
|
|
pte_t ***p = data;
|
|
|
|
if (p) {
|
|
*(*p) = pte;
|
|
(*p)++;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* alloc_vm_area - allocate a range of kernel address space
|
|
* @size: size of the area
|
|
* @ptes: returns the PTEs for the address space
|
|
*
|
|
* Returns: NULL on failure, vm_struct on success
|
|
*
|
|
* This function reserves a range of kernel address space, and
|
|
* allocates pagetables to map that range. No actual mappings
|
|
* are created.
|
|
*
|
|
* If @ptes is non-NULL, pointers to the PTEs (in init_mm)
|
|
* allocated for the VM area are returned.
|
|
*/
|
|
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
|
|
{
|
|
struct vm_struct *area;
|
|
|
|
area = get_vm_area_caller(size, VM_IOREMAP,
|
|
__builtin_return_address(0));
|
|
if (area == NULL)
|
|
return NULL;
|
|
|
|
/*
|
|
* This ensures that page tables are constructed for this region
|
|
* of kernel virtual address space and mapped into init_mm.
|
|
*/
|
|
if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
|
|
size, f, ptes ? &ptes : NULL)) {
|
|
free_vm_area(area);
|
|
return NULL;
|
|
}
|
|
|
|
return area;
|
|
}
|
|
EXPORT_SYMBOL_GPL(alloc_vm_area);
|
|
|
|
void free_vm_area(struct vm_struct *area)
|
|
{
|
|
struct vm_struct *ret;
|
|
ret = remove_vm_area(area->addr);
|
|
BUG_ON(ret != area);
|
|
kfree(area);
|
|
}
|
|
EXPORT_SYMBOL_GPL(free_vm_area);
|
|
|
|
#ifdef CONFIG_SMP
|
|
static struct vmap_area *node_to_va(struct rb_node *n)
|
|
{
|
|
return rb_entry_safe(n, struct vmap_area, rb_node);
|
|
}
|
|
|
|
/**
|
|
* pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
|
|
* @addr: target address
|
|
*
|
|
* Returns: vmap_area if it is found. If there is no such area
|
|
* the first highest(reverse order) vmap_area is returned
|
|
* i.e. va->va_start < addr && va->va_end < addr or NULL
|
|
* if there are no any areas before @addr.
|
|
*/
|
|
static struct vmap_area *
|
|
pvm_find_va_enclose_addr(unsigned long addr)
|
|
{
|
|
struct vmap_area *va, *tmp;
|
|
struct rb_node *n;
|
|
|
|
n = free_vmap_area_root.rb_node;
|
|
va = NULL;
|
|
|
|
while (n) {
|
|
tmp = rb_entry(n, struct vmap_area, rb_node);
|
|
if (tmp->va_start <= addr) {
|
|
va = tmp;
|
|
if (tmp->va_end >= addr)
|
|
break;
|
|
|
|
n = n->rb_right;
|
|
} else {
|
|
n = n->rb_left;
|
|
}
|
|
}
|
|
|
|
return va;
|
|
}
|
|
|
|
/**
|
|
* pvm_determine_end_from_reverse - find the highest aligned address
|
|
* of free block below VMALLOC_END
|
|
* @va:
|
|
* in - the VA we start the search(reverse order);
|
|
* out - the VA with the highest aligned end address.
|
|
*
|
|
* Returns: determined end address within vmap_area
|
|
*/
|
|
static unsigned long
|
|
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
|
|
{
|
|
unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
|
|
unsigned long addr;
|
|
|
|
if (likely(*va)) {
|
|
list_for_each_entry_from_reverse((*va),
|
|
&free_vmap_area_list, list) {
|
|
addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
|
|
if ((*va)->va_start < addr)
|
|
return addr;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
|
|
* @offsets: array containing offset of each area
|
|
* @sizes: array containing size of each area
|
|
* @nr_vms: the number of areas to allocate
|
|
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
|
|
*
|
|
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
|
|
* vm_structs on success, %NULL on failure
|
|
*
|
|
* Percpu allocator wants to use congruent vm areas so that it can
|
|
* maintain the offsets among percpu areas. This function allocates
|
|
* congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
|
|
* be scattered pretty far, distance between two areas easily going up
|
|
* to gigabytes. To avoid interacting with regular vmallocs, these
|
|
* areas are allocated from top.
|
|
*
|
|
* Despite its complicated look, this allocator is rather simple. It
|
|
* does everything top-down and scans free blocks from the end looking
|
|
* for matching base. While scanning, if any of the areas do not fit the
|
|
* base address is pulled down to fit the area. Scanning is repeated till
|
|
* all the areas fit and then all necessary data structures are inserted
|
|
* and the result is returned.
|
|
*/
|
|
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
|
|
const size_t *sizes, int nr_vms,
|
|
size_t align)
|
|
{
|
|
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
|
|
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
|
|
struct vmap_area **vas, *va;
|
|
struct vm_struct **vms;
|
|
int area, area2, last_area, term_area;
|
|
unsigned long base, start, size, end, last_end;
|
|
bool purged = false;
|
|
enum fit_type type;
|
|
|
|
/* verify parameters and allocate data structures */
|
|
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
|
|
for (last_area = 0, area = 0; area < nr_vms; area++) {
|
|
start = offsets[area];
|
|
end = start + sizes[area];
|
|
|
|
/* is everything aligned properly? */
|
|
BUG_ON(!IS_ALIGNED(offsets[area], align));
|
|
BUG_ON(!IS_ALIGNED(sizes[area], align));
|
|
|
|
/* detect the area with the highest address */
|
|
if (start > offsets[last_area])
|
|
last_area = area;
|
|
|
|
for (area2 = area + 1; area2 < nr_vms; area2++) {
|
|
unsigned long start2 = offsets[area2];
|
|
unsigned long end2 = start2 + sizes[area2];
|
|
|
|
BUG_ON(start2 < end && start < end2);
|
|
}
|
|
}
|
|
last_end = offsets[last_area] + sizes[last_area];
|
|
|
|
if (vmalloc_end - vmalloc_start < last_end) {
|
|
WARN_ON(true);
|
|
return NULL;
|
|
}
|
|
|
|
vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
|
|
vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
|
|
if (!vas || !vms)
|
|
goto err_free2;
|
|
|
|
for (area = 0; area < nr_vms; area++) {
|
|
vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
|
|
vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
|
|
if (!vas[area] || !vms[area])
|
|
goto err_free;
|
|
}
|
|
retry:
|
|
spin_lock(&vmap_area_lock);
|
|
|
|
/* start scanning - we scan from the top, begin with the last area */
|
|
area = term_area = last_area;
|
|
start = offsets[area];
|
|
end = start + sizes[area];
|
|
|
|
va = pvm_find_va_enclose_addr(vmalloc_end);
|
|
base = pvm_determine_end_from_reverse(&va, align) - end;
|
|
|
|
while (true) {
|
|
/*
|
|
* base might have underflowed, add last_end before
|
|
* comparing.
|
|
*/
|
|
if (base + last_end < vmalloc_start + last_end)
|
|
goto overflow;
|
|
|
|
/*
|
|
* Fitting base has not been found.
|
|
*/
|
|
if (va == NULL)
|
|
goto overflow;
|
|
|
|
/*
|
|
* If required width exeeds current VA block, move
|
|
* base downwards and then recheck.
|
|
*/
|
|
if (base + end > va->va_end) {
|
|
base = pvm_determine_end_from_reverse(&va, align) - end;
|
|
term_area = area;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* If this VA does not fit, move base downwards and recheck.
|
|
*/
|
|
if (base + start < va->va_start) {
|
|
va = node_to_va(rb_prev(&va->rb_node));
|
|
base = pvm_determine_end_from_reverse(&va, align) - end;
|
|
term_area = area;
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
* This area fits, move on to the previous one. If
|
|
* the previous one is the terminal one, we're done.
|
|
*/
|
|
area = (area + nr_vms - 1) % nr_vms;
|
|
if (area == term_area)
|
|
break;
|
|
|
|
start = offsets[area];
|
|
end = start + sizes[area];
|
|
va = pvm_find_va_enclose_addr(base + end);
|
|
}
|
|
|
|
/* we've found a fitting base, insert all va's */
|
|
for (area = 0; area < nr_vms; area++) {
|
|
int ret;
|
|
|
|
start = base + offsets[area];
|
|
size = sizes[area];
|
|
|
|
va = pvm_find_va_enclose_addr(start);
|
|
if (WARN_ON_ONCE(va == NULL))
|
|
/* It is a BUG(), but trigger recovery instead. */
|
|
goto recovery;
|
|
|
|
type = classify_va_fit_type(va, start, size);
|
|
if (WARN_ON_ONCE(type == NOTHING_FIT))
|
|
/* It is a BUG(), but trigger recovery instead. */
|
|
goto recovery;
|
|
|
|
ret = adjust_va_to_fit_type(va, start, size, type);
|
|
if (unlikely(ret))
|
|
goto recovery;
|
|
|
|
/* Allocated area. */
|
|
va = vas[area];
|
|
va->va_start = start;
|
|
va->va_end = start + size;
|
|
|
|
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
|
|
}
|
|
|
|
spin_unlock(&vmap_area_lock);
|
|
|
|
/* insert all vm's */
|
|
for (area = 0; area < nr_vms; area++)
|
|
setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
|
|
pcpu_get_vm_areas);
|
|
|
|
kfree(vas);
|
|
return vms;
|
|
|
|
recovery:
|
|
/* Remove previously inserted areas. */
|
|
while (area--) {
|
|
__free_vmap_area(vas[area]);
|
|
vas[area] = NULL;
|
|
}
|
|
|
|
overflow:
|
|
spin_unlock(&vmap_area_lock);
|
|
if (!purged) {
|
|
purge_vmap_area_lazy();
|
|
purged = true;
|
|
|
|
/* Before "retry", check if we recover. */
|
|
for (area = 0; area < nr_vms; area++) {
|
|
if (vas[area])
|
|
continue;
|
|
|
|
vas[area] = kmem_cache_zalloc(
|
|
vmap_area_cachep, GFP_KERNEL);
|
|
if (!vas[area])
|
|
goto err_free;
|
|
}
|
|
|
|
goto retry;
|
|
}
|
|
|
|
err_free:
|
|
for (area = 0; area < nr_vms; area++) {
|
|
if (vas[area])
|
|
kmem_cache_free(vmap_area_cachep, vas[area]);
|
|
|
|
kfree(vms[area]);
|
|
}
|
|
err_free2:
|
|
kfree(vas);
|
|
kfree(vms);
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* pcpu_free_vm_areas - free vmalloc areas for percpu allocator
|
|
* @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
|
|
* @nr_vms: the number of allocated areas
|
|
*
|
|
* Free vm_structs and the array allocated by pcpu_get_vm_areas().
|
|
*/
|
|
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < nr_vms; i++)
|
|
free_vm_area(vms[i]);
|
|
kfree(vms);
|
|
}
|
|
#endif /* CONFIG_SMP */
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
static void *s_start(struct seq_file *m, loff_t *pos)
|
|
__acquires(&vmap_area_lock)
|
|
{
|
|
spin_lock(&vmap_area_lock);
|
|
return seq_list_start(&vmap_area_list, *pos);
|
|
}
|
|
|
|
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
|
|
{
|
|
return seq_list_next(p, &vmap_area_list, pos);
|
|
}
|
|
|
|
static void s_stop(struct seq_file *m, void *p)
|
|
__releases(&vmap_area_lock)
|
|
{
|
|
spin_unlock(&vmap_area_lock);
|
|
}
|
|
|
|
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
|
|
{
|
|
if (IS_ENABLED(CONFIG_NUMA)) {
|
|
unsigned int nr, *counters = m->private;
|
|
|
|
if (!counters)
|
|
return;
|
|
|
|
if (v->flags & VM_UNINITIALIZED)
|
|
return;
|
|
/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
|
|
smp_rmb();
|
|
|
|
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
|
|
|
|
for (nr = 0; nr < v->nr_pages; nr++)
|
|
counters[page_to_nid(v->pages[nr])]++;
|
|
|
|
for_each_node_state(nr, N_HIGH_MEMORY)
|
|
if (counters[nr])
|
|
seq_printf(m, " N%u=%u", nr, counters[nr]);
|
|
}
|
|
}
|
|
|
|
static int s_show(struct seq_file *m, void *p)
|
|
{
|
|
struct vmap_area *va;
|
|
struct vm_struct *v;
|
|
|
|
va = list_entry(p, struct vmap_area, list);
|
|
|
|
/*
|
|
* s_show can encounter race with remove_vm_area, !VM_VM_AREA on
|
|
* behalf of vmap area is being tear down or vm_map_ram allocation.
|
|
*/
|
|
if (!(va->flags & VM_VM_AREA)) {
|
|
seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
|
|
(void *)va->va_start, (void *)va->va_end,
|
|
va->va_end - va->va_start,
|
|
va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
|
|
|
|
return 0;
|
|
}
|
|
|
|
v = va->vm;
|
|
|
|
if (v->flags & VM_LOWMEM)
|
|
return 0;
|
|
|
|
seq_printf(m, "0x%pK-0x%pK %7ld",
|
|
v->addr, v->addr + v->size, v->size);
|
|
|
|
if (v->caller)
|
|
seq_printf(m, " %pS", v->caller);
|
|
|
|
if (v->nr_pages)
|
|
seq_printf(m, " pages=%d", v->nr_pages);
|
|
|
|
if (v->phys_addr)
|
|
seq_printf(m, " phys=%pa", &v->phys_addr);
|
|
|
|
if (v->flags & VM_IOREMAP)
|
|
seq_puts(m, " ioremap");
|
|
|
|
if (v->flags & VM_ALLOC)
|
|
seq_puts(m, " vmalloc");
|
|
|
|
if (v->flags & VM_MAP)
|
|
seq_puts(m, " vmap");
|
|
|
|
if (v->flags & VM_USERMAP)
|
|
seq_puts(m, " user");
|
|
|
|
if (is_vmalloc_addr(v->pages))
|
|
seq_puts(m, " vpages");
|
|
|
|
show_numa_info(m, v);
|
|
seq_putc(m, '\n');
|
|
return 0;
|
|
}
|
|
|
|
static const struct seq_operations vmalloc_op = {
|
|
.start = s_start,
|
|
.next = s_next,
|
|
.stop = s_stop,
|
|
.show = s_show,
|
|
};
|
|
|
|
static int __init proc_vmalloc_init(void)
|
|
{
|
|
if (IS_ENABLED(CONFIG_NUMA))
|
|
proc_create_seq_private("vmallocinfo", 0400, NULL,
|
|
&vmalloc_op,
|
|
nr_node_ids * sizeof(unsigned int), NULL);
|
|
else
|
|
proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
|
|
return 0;
|
|
}
|
|
module_init(proc_vmalloc_init);
|
|
|
|
#endif
|
|
|