Merge "vmscan: Support multiple kswapd threads per node"
This commit is contained in:
commit
f6b3aa8de5
6 changed files with 156 additions and 19 deletions
|
@ -33,6 +33,7 @@ Currently, these files are in /proc/sys/vm:
|
|||
- extfrag_threshold
|
||||
- extra_free_kbytes
|
||||
- hugetlb_shm_group
|
||||
- kswapd_threads
|
||||
- laptop_mode
|
||||
- legacy_va_layout
|
||||
- lowmem_reserve_ratio
|
||||
|
@ -300,6 +301,28 @@ shared memory segment using hugetlb page.
|
|||
|
||||
==============================================================
|
||||
|
||||
kswapd_threads
|
||||
|
||||
kswapd_threads allows you to control the number of kswapd threads per node
|
||||
running on the system. This provides the ability to devote additional CPU
|
||||
resources toward proactive page replacement with the goal of reducing
|
||||
direct reclaims. When direct reclaims are prevented, the CPU cycles
|
||||
they would have consumed are saved as well. Depending on the workload, the result can
|
||||
cause aggregate CPU usage on the system to go up, down or stay the same.
|
||||
|
||||
More aggressive page replacement can reduce direct reclaims which cause
|
||||
latency for tasks and decrease throughput when doing filesystem IO through
|
||||
the pagecache. Direct reclaims are recorded using the allocstall counter
|
||||
in /proc/vmstat.
|
||||
|
||||
The default value is 1 and the range of acceptable values is 1-16.
|
||||
Always start with lower values in the 2-6 range. Higher values should
|
||||
be justified with testing. If direct reclaims occur in spite of high
|
||||
values, the cost of direct reclaims (in latency) that occur can be
|
||||
higher due to increased lock contention.
|
||||
|
||||
==============================================================
|
||||
|
||||
laptop_mode
|
||||
|
||||
laptop_mode is a knob that controls "laptop mode". All the things that are
|
||||
|
|
|
@ -2331,6 +2331,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
|
|||
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
|
||||
enum memmap_context, struct vmem_altmap *);
|
||||
extern void setup_per_zone_wmarks(void);
|
||||
extern void update_kswapd_threads(void);
|
||||
extern int __meminit init_per_zone_wmark_min(void);
|
||||
extern void mem_init(void);
|
||||
extern void __init mmap_init(void);
|
||||
|
@ -2351,6 +2352,7 @@ extern void zone_pcp_update(struct zone *zone);
|
|||
extern void zone_pcp_reset(struct zone *zone);
|
||||
|
||||
/* page_alloc.c */
|
||||
extern int kswapd_threads;
|
||||
extern int min_free_kbytes;
|
||||
extern int watermark_boost_factor;
|
||||
extern int watermark_scale_factor;
|
||||
|
|
|
@ -36,6 +36,8 @@
|
|||
*/
|
||||
#define PAGE_ALLOC_COSTLY_ORDER 3
|
||||
|
||||
#define MAX_KSWAPD_THREADS 16
|
||||
|
||||
enum migratetype {
|
||||
MIGRATE_UNMOVABLE,
|
||||
MIGRATE_MOVABLE,
|
||||
|
@ -676,8 +678,10 @@ typedef struct pglist_data {
|
|||
int node_id;
|
||||
wait_queue_head_t kswapd_wait;
|
||||
wait_queue_head_t pfmemalloc_wait;
|
||||
struct task_struct *kswapd; /* Protected by
|
||||
mem_hotplug_begin/end() */
|
||||
/*
|
||||
* Protected by mem_hotplug_begin/end()
|
||||
*/
|
||||
struct task_struct *kswapd[MAX_KSWAPD_THREADS];
|
||||
int kswapd_order;
|
||||
enum zone_type kswapd_classzone_idx;
|
||||
|
||||
|
@ -904,6 +908,8 @@ static inline int is_highmem(struct zone *zone)
|
|||
|
||||
/* These two functions are used to setup the per zone pages min values */
|
||||
struct ctl_table;
|
||||
int kswapd_threads_sysctl_handler(struct ctl_table *, int,
|
||||
void __user *, size_t *, loff_t *);
|
||||
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
|
||||
void __user *, size_t *, loff_t *);
|
||||
int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
|
||||
|
|
|
@ -140,6 +140,7 @@ static int ten_thousand = 10000;
|
|||
#ifdef CONFIG_PERF_EVENTS
|
||||
static int six_hundred_forty_kb = 640 * 1024;
|
||||
#endif
|
||||
static int max_kswapd_threads = MAX_KSWAPD_THREADS;
|
||||
static int two_hundred_fifty_five = 255;
|
||||
static int __maybe_unused two_hundred_million = 200000000;
|
||||
|
||||
|
@ -1773,6 +1774,15 @@ static struct ctl_table vm_table[] = {
|
|||
.proc_handler = watermark_boost_factor_sysctl_handler,
|
||||
.extra1 = &zero,
|
||||
},
|
||||
{
|
||||
.procname = "kswapd_threads",
|
||||
.data = &kswapd_threads,
|
||||
.maxlen = sizeof(kswapd_threads),
|
||||
.mode = 0644,
|
||||
.proc_handler = kswapd_threads_sysctl_handler,
|
||||
.extra1 = &one,
|
||||
.extra2 = &max_kswapd_threads,
|
||||
},
|
||||
{
|
||||
.procname = "watermark_scale_factor",
|
||||
.data = &watermark_scale_factor,
|
||||
|
|
|
@ -7872,6 +7872,21 @@ int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
|
|||
return 0;
|
||||
}
|
||||
|
||||
int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *length, loff_t *ppos)
|
||||
{
|
||||
int rc;
|
||||
|
||||
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
|
||||
if (rc)
|
||||
return rc;
|
||||
|
||||
if (write)
|
||||
update_kswapd_threads();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
|
||||
void __user *buffer, size_t *length, loff_t *ppos)
|
||||
{
|
||||
|
|
105
mm/vmscan.c
105
mm/vmscan.c
|
@ -135,6 +135,13 @@ struct scan_control {
|
|||
struct vm_area_struct *target_vma;
|
||||
};
|
||||
|
||||
/*
|
||||
* Number of active kswapd threads
|
||||
*/
|
||||
#define DEF_KSWAPD_THREADS_PER_NODE 1
|
||||
int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
|
||||
int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE;
|
||||
|
||||
#ifdef ARCH_HAS_PREFETCH
|
||||
#define prefetch_prev_lru_page(_page, _base, _field) \
|
||||
do { \
|
||||
|
@ -4081,21 +4088,83 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
|
|||
restore their cpu bindings. */
|
||||
static int kswapd_cpu_online(unsigned int cpu)
|
||||
{
|
||||
int nid;
|
||||
int nid, hid;
|
||||
int nr_threads = kswapd_threads_current;
|
||||
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
const struct cpumask *mask;
|
||||
|
||||
mask = cpumask_of_node(pgdat->node_id);
|
||||
|
||||
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
|
||||
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) {
|
||||
for (hid = 0; hid < nr_threads; hid++) {
|
||||
/* One of our CPUs online: restore mask */
|
||||
set_cpus_allowed_ptr(pgdat->kswapd, mask);
|
||||
set_cpus_allowed_ptr(pgdat->kswapd[hid], mask);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void update_kswapd_threads_node(int nid)
|
||||
{
|
||||
pg_data_t *pgdat;
|
||||
int drop, increase;
|
||||
int last_idx, start_idx, hid;
|
||||
int nr_threads = kswapd_threads_current;
|
||||
|
||||
pgdat = NODE_DATA(nid);
|
||||
last_idx = nr_threads - 1;
|
||||
if (kswapd_threads < nr_threads) {
|
||||
drop = nr_threads - kswapd_threads;
|
||||
for (hid = last_idx; hid > (last_idx - drop); hid--) {
|
||||
if (pgdat->kswapd[hid]) {
|
||||
kthread_stop(pgdat->kswapd[hid]);
|
||||
pgdat->kswapd[hid] = NULL;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
increase = kswapd_threads - nr_threads;
|
||||
start_idx = last_idx + 1;
|
||||
for (hid = start_idx; hid < (start_idx + increase); hid++) {
|
||||
pgdat->kswapd[hid] = kthread_run(kswapd, pgdat,
|
||||
"kswapd%d:%d", nid, hid);
|
||||
if (IS_ERR(pgdat->kswapd[hid])) {
|
||||
pr_err("Failed to start kswapd%d on node %d\n",
|
||||
hid, nid);
|
||||
pgdat->kswapd[hid] = NULL;
|
||||
/*
|
||||
* We are out of resources. Do not start any
|
||||
* more threads.
|
||||
*/
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void update_kswapd_threads(void)
|
||||
{
|
||||
int nid;
|
||||
|
||||
if (kswapd_threads_current == kswapd_threads)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Hold the memory hotplug lock to avoid racing with memory
|
||||
* hotplug initiated updates
|
||||
*/
|
||||
mem_hotplug_begin();
|
||||
for_each_node_state(nid, N_MEMORY)
|
||||
update_kswapd_threads_node(nid);
|
||||
|
||||
pr_info("kswapd_thread count changed, old:%d new:%d\n",
|
||||
kswapd_threads_current, kswapd_threads);
|
||||
kswapd_threads_current = kswapd_threads;
|
||||
mem_hotplug_done();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This kswapd start function will be called by init and node-hot-add.
|
||||
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
|
||||
|
@ -4104,18 +4173,25 @@ int kswapd_run(int nid)
|
|||
{
|
||||
pg_data_t *pgdat = NODE_DATA(nid);
|
||||
int ret = 0;
|
||||
int hid, nr_threads;
|
||||
|
||||
if (pgdat->kswapd)
|
||||
if (pgdat->kswapd[0])
|
||||
return 0;
|
||||
|
||||
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
|
||||
if (IS_ERR(pgdat->kswapd)) {
|
||||
nr_threads = kswapd_threads;
|
||||
for (hid = 0; hid < nr_threads; hid++) {
|
||||
pgdat->kswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d",
|
||||
nid, hid);
|
||||
if (IS_ERR(pgdat->kswapd[hid])) {
|
||||
/* failure at boot is fatal */
|
||||
BUG_ON(system_state < SYSTEM_RUNNING);
|
||||
pr_err("Failed to start kswapd on node %d\n", nid);
|
||||
ret = PTR_ERR(pgdat->kswapd);
|
||||
pgdat->kswapd = NULL;
|
||||
pr_err("Failed to start kswapd%d on node %d\n",
|
||||
hid, nid);
|
||||
ret = PTR_ERR(pgdat->kswapd[hid]);
|
||||
pgdat->kswapd[hid] = NULL;
|
||||
}
|
||||
}
|
||||
kswapd_threads_current = nr_threads;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -4125,11 +4201,16 @@ int kswapd_run(int nid)
|
|||
*/
|
||||
void kswapd_stop(int nid)
|
||||
{
|
||||
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
|
||||
struct task_struct *kswapd;
|
||||
int hid;
|
||||
int nr_threads = kswapd_threads_current;
|
||||
|
||||
for (hid = 0; hid < nr_threads; hid++) {
|
||||
kswapd = NODE_DATA(nid)->kswapd[hid];
|
||||
if (kswapd) {
|
||||
kthread_stop(kswapd);
|
||||
NODE_DATA(nid)->kswapd = NULL;
|
||||
NODE_DATA(nid)->kswapd[hid] = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue