Merge "vmscan: Support multiple kswapd threads per node"

This commit is contained in:
qctecmdr 2020-04-26 04:01:41 -07:00 committed by Gerrit - the friendly Code Review server
commit f6b3aa8de5
6 changed files with 156 additions and 19 deletions

View file

@ -33,6 +33,7 @@ Currently, these files are in /proc/sys/vm:
- extfrag_threshold
- extra_free_kbytes
- hugetlb_shm_group
- kswapd_threads
- laptop_mode
- legacy_va_layout
- lowmem_reserve_ratio
@ -300,6 +301,28 @@ shared memory segment using hugetlb page.
==============================================================
kswapd_threads
kswapd_threads allows you to control the number of kswapd threads per node
running on the system. This provides the ability to devote additional CPU
resources toward proactive page replacement with the goal of reducing
direct reclaims. When direct reclaims are prevented, the CPU consumed
by them is prevented as well. Depending on the workload, the result can
cause aggregate CPU usage on the system to go up, down or stay the same.
More aggressive page replacement can reduce direct reclaims which cause
latency for tasks and decrease throughput when doing filesystem IO through
the pagecache. Direct reclaims are recorded using the allocstall counter
in /proc/vmstat.
The default value is 1 and the range of acceptable values is 1-16.
Always start with lower values in the 2-6 range. Higher values should
be justified with testing. If direct reclaims occur in spite of high
values, the cost of direct reclaims (in latency) that occur can be
higher due to increased lock contention.
==============================================================
laptop_mode
laptop_mode is a knob that controls "laptop mode". All the things that are

View file

@ -2331,6 +2331,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
enum memmap_context, struct vmem_altmap *);
extern void setup_per_zone_wmarks(void);
extern void update_kswapd_threads(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
@ -2351,6 +2352,7 @@ extern void zone_pcp_update(struct zone *zone);
extern void zone_pcp_reset(struct zone *zone);
/* page_alloc.c */
extern int kswapd_threads;
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;

View file

@ -36,6 +36,8 @@
*/
#define PAGE_ALLOC_COSTLY_ORDER 3
#define MAX_KSWAPD_THREADS 16
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
@ -676,8 +678,10 @@ typedef struct pglist_data {
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
/*
* Protected by mem_hotplug_begin/end()
*/
struct task_struct *kswapd[MAX_KSWAPD_THREADS];
int kswapd_order;
enum zone_type kswapd_classzone_idx;
@ -904,6 +908,8 @@ static inline int is_highmem(struct zone *zone)
/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
int kswapd_threads_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,

View file

@ -140,6 +140,7 @@ static int ten_thousand = 10000;
#ifdef CONFIG_PERF_EVENTS
static int six_hundred_forty_kb = 640 * 1024;
#endif
static int max_kswapd_threads = MAX_KSWAPD_THREADS;
static int two_hundred_fifty_five = 255;
static int __maybe_unused two_hundred_million = 200000000;
@ -1773,6 +1774,15 @@ static struct ctl_table vm_table[] = {
.proc_handler = watermark_boost_factor_sysctl_handler,
.extra1 = &zero,
},
{
.procname = "kswapd_threads",
.data = &kswapd_threads,
.maxlen = sizeof(kswapd_threads),
.mode = 0644,
.proc_handler = kswapd_threads_sysctl_handler,
.extra1 = &one,
.extra2 = &max_kswapd_threads,
},
{
.procname = "watermark_scale_factor",
.data = &watermark_scale_factor,

View file

@ -7872,6 +7872,21 @@ int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
/*
 * sysctl handler for vm.kswapd_threads.
 *
 * Validates and stores the new value via proc_dointvec_minmax() (bounds
 * come from the ctl_table extra1/extra2 fields), and on a successful
 * write resizes the per-node kswapd thread pools to match.
 *
 * Returns 0 on success or the proc_dointvec_minmax() error code.
 */
int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (ret)
		return ret;

	/* Only a write changes kswapd_threads; reads need no follow-up. */
	if (write)
		update_kswapd_threads();

	return 0;
}
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{

View file

@ -135,6 +135,13 @@ struct scan_control {
struct vm_area_struct *target_vma;
};
/*
* Number of active kswapd threads
*/
#define DEF_KSWAPD_THREADS_PER_NODE 1
int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE;
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
@ -4081,21 +4088,83 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
restore their cpu bindings. */
/*
 * CPU-hotplug (online) callback: a CPU came up, so restore the CPU
 * affinity of every kswapd thread on any node that now has one of its
 * CPUs online.
 *
 * Fix: the stripped diff left the pre-change single-thread lines
 * (duplicate "int nid;" declaration and the old un-braced
 * set_cpus_allowed_ptr(pgdat->kswapd, mask) call) interleaved with the
 * multi-thread version; only the multi-thread code is kept.
 */
static int kswapd_cpu_online(unsigned int cpu)
{
	int nid, hid;
	int nr_threads = kswapd_threads_current;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);

		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) {
			for (hid = 0; hid < nr_threads; hid++) {
				/* One of our CPUs online: restore mask */
				set_cpus_allowed_ptr(pgdat->kswapd[hid], mask);
			}
		}
	}
	return 0;
}
/*
 * Bring the number of kswapd threads on one node in line with the new
 * global kswapd_threads value: when shrinking, stop surplus threads
 * from the highest index downward; when growing, spawn new threads
 * after the current last one.
 *
 * Caller must hold the memory hotplug lock (see update_kswapd_threads()).
 */
static void update_kswapd_threads_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int nr_threads = kswapd_threads_current;
	int last_idx = nr_threads - 1;
	int hid;

	if (kswapd_threads < nr_threads) {
		/* Shrinking: stop the top (nr_threads - kswapd_threads) threads. */
		int first_kept = last_idx - (nr_threads - kswapd_threads);

		for (hid = last_idx; hid > first_kept; hid--) {
			if (pgdat->kswapd[hid]) {
				kthread_stop(pgdat->kswapd[hid]);
				pgdat->kswapd[hid] = NULL;
			}
		}
	} else {
		/* Growing: start threads at indices after the current last. */
		int end_idx = last_idx + 1 + (kswapd_threads - nr_threads);

		for (hid = last_idx + 1; hid < end_idx; hid++) {
			pgdat->kswapd[hid] = kthread_run(kswapd, pgdat,
							"kswapd%d:%d", nid, hid);
			if (IS_ERR(pgdat->kswapd[hid])) {
				pr_err("Failed to start kswapd%d on node %d\n",
					hid, nid);
				pgdat->kswapd[hid] = NULL;
				/*
				 * We are out of resources. Do not start any
				 * more threads.
				 */
				break;
			}
		}
	}
}
/*
 * Resize the kswapd thread pool on every memory node after a change to
 * the vm.kswapd_threads sysctl.  No-op when the current count already
 * matches the requested one.  kswapd_threads_current is only committed
 * after all nodes have been updated, while still under the hotplug lock.
 */
void update_kswapd_threads(void)
{
	int nid;

	if (kswapd_threads_current == kswapd_threads)
		return;

	/*
	 * Hold the memory hotplug lock to avoid racing with memory
	 * hotplug initiated updates
	 */
	mem_hotplug_begin();
	for_each_node_state(nid, N_MEMORY)
		update_kswapd_threads_node(nid);

	pr_info("kswapd_thread count changed, old:%d new:%d\n",
		kswapd_threads_current, kswapd_threads);
	kswapd_threads_current = kswapd_threads;
	mem_hotplug_done();
}
/*
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
@ -4104,18 +4173,25 @@ int kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
int ret = 0;
int hid, nr_threads;
if (pgdat->kswapd)
if (pgdat->kswapd[0])
return 0;
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
nr_threads = kswapd_threads;
for (hid = 0; hid < nr_threads; hid++) {
pgdat->kswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d",
nid, hid);
if (IS_ERR(pgdat->kswapd[hid])) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd%d on node %d\n",
hid, nid);
ret = PTR_ERR(pgdat->kswapd[hid]);
pgdat->kswapd[hid] = NULL;
}
}
kswapd_threads_current = nr_threads;
return ret;
}
@ -4125,11 +4201,16 @@ int kswapd_run(int nid)
*/
/*
 * Stop every running kswapd thread on node @nid (called e.g. on memory
 * hot-remove) and clear the per-node task pointers.
 *
 * Fix: the stripped diff left the pre-change single-thread lines (the
 * old initialized declaration and the old single kthread_stop block)
 * interleaved with the multi-thread version — as merged, the first
 * "if (kswapd)" read an uninitialized pointer; only the multi-thread
 * code is kept.
 */
void kswapd_stop(int nid)
{
	struct task_struct *kswapd;
	int hid;
	int nr_threads = kswapd_threads_current;

	for (hid = 0; hid < nr_threads; hid++) {
		kswapd = NODE_DATA(nid)->kswapd[hid];
		if (kswapd) {
			kthread_stop(kswapd);
			NODE_DATA(nid)->kswapd[hid] = NULL;
		}
	}
}