2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* kernel/sched.c
|
|
|
|
*
|
|
|
|
* Kernel scheduler and related syscalls
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991-2002 Linus Torvalds
|
|
|
|
*
|
|
|
|
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
|
|
|
|
* make semaphores SMP safe
|
|
|
|
* 1998-11-19 Implemented schedule_timeout() and related stuff
|
|
|
|
* by Andrea Arcangeli
|
|
|
|
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
|
|
|
|
* hybrid priority-list and round-robin design with
|
|
|
|
* an array-switch method of distributing timeslices
|
|
|
|
* and per-CPU runqueues. Cleanups and useful suggestions
|
|
|
|
* by Davide Libenzi, preemptible kernel bits by Robert Love.
|
|
|
|
* 2003-09-03 Interactivity tuning by Con Kolivas.
|
|
|
|
* 2004-04-02 Scheduler domains code by Nick Piggin
|
2007-07-09 18:52:01 +02:00
|
|
|
* 2007-04-15 Work begun on replacing all interactivity tuning with a
|
|
|
|
* fair scheduling design by Con Kolivas.
|
|
|
|
* 2007-05-05 Load balancing (smp-nice) and other improvements
|
|
|
|
* by Peter Williams
|
|
|
|
* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
|
|
|
|
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
|
2008-01-25 21:08:19 +01:00
|
|
|
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
|
|
|
|
* Thomas Gleixner, Mike Kravetz
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/nmi.h>
|
|
|
|
#include <linux/init.h>
|
2007-07-09 18:52:00 +02:00
|
|
|
#include <linux/uaccess.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/smp_lock.h>
|
|
|
|
#include <asm/mmu_context.h>
|
|
|
|
#include <linux/interrupt.h>
|
2006-01-11 21:17:46 +01:00
|
|
|
#include <linux/capability.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/kernel_stat.h>
|
2006-07-03 09:24:33 +02:00
|
|
|
#include <linux/debug_locks.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
#include <linux/security.h>
|
|
|
|
#include <linux/notifier.h>
|
|
|
|
#include <linux/profile.h>
|
2006-12-07 05:34:23 +01:00
|
|
|
#include <linux/freezer.h>
|
[PATCH] scheduler cache-hot-autodetect
)
From: Ingo Molnar <mingo@elte.hu>
This is the latest version of the scheduler cache-hot-auto-tune patch.
The first problem was that detection time scaled with O(N^2), which is
unacceptable on larger SMP and NUMA systems. To solve this:
- I've added a 'domain distance' function, which is used to cache
measurement results. Each distance is only measured once. This means
that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT
distances 0 and 1, and on SMP distance 0 is measured. The code walks
the domain tree to determine the distance, so it automatically follows
whatever hierarchy an architecture sets up. This cuts down on the boot
time significantly and removes the O(N^2) limit. The only assumption
is that migration costs can be expressed as a function of domain
distance - this covers the overwhelming majority of existing systems,
and is a good guess even for more asymmetric systems.
[ People hacking systems that have asymmetries that break this
assumption (e.g. different CPU speeds) should experiment a bit with
the cpu_distance() function. Adding a ->migration_distance factor to
the domain structure would be one possible solution - but let's first
see the problem systems, if they exist at all. Let's not overdesign. ]
Another problem was that only a single cache-size was used for measuring
the cost of migration, and most architectures didnt set that variable
up. Furthermore, a single cache-size does not fit NUMA hierarchies with
L3 caches and does not fit HT setups, where different CPUs will often
have different 'effective cache sizes'. To solve this problem:
- Instead of relying on a single cache-size provided by the platform and
sticking to it, the code now auto-detects the 'effective migration
cost' between two measured CPUs, via iterating through a wide range of
cachesizes. The code searches for the maximum migration cost, which
occurs when the working set of the test-workload falls just below the
'effective cache size'. I.e. real-life optimized search is done for
the maximum migration cost, between two real CPUs.
This, amongst other things, has the positive effect that if e.g. two
CPUs share a L2/L3 cache, a different (and accurate) migration cost
will be found than between two CPUs on the same system that don't share
any caches.
(The reliable measurement of migration costs is tricky - see the source
for details.)
Furthermore i've added various boot-time options to override/tune
migration behavior.
Firstly, there's a blanket override for autodetection:
migration_cost=1000,2000,3000
will override the depth 0/1/2 values with 1msec/2msec/3msec values.
Secondly, there's a global factor that can be used to increase (or
decrease) the autodetected values:
migration_factor=120
will increase the autodetected values by 20%. This option is useful to
tune things in a workload-dependent way - e.g. if a workload is
cache-insensitive then CPU utilization can be maximized by specifying
migration_factor=0.
I've tested the autodetection code quite extensively on x86, on 3
P3/Xeon/2MB, and the autodetected values look pretty good:
Dual Celeron (128K L2 cache):
---------------------
migration cost matrix (max_cache_size: 131072, cpu: 467 MHz):
---------------------
[00] [01]
[00]: - 1.7(1)
[01]: 1.7(1) -
---------------------
cacheflush times [2]: 0.0 (0) 1.7 (1784008)
---------------------
Here the slow memory subsystem dominates system performance, and even
though caches are small, the migration cost is 1.7 msecs.
Dual HT P4 (512K L2 cache):
---------------------
migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz):
---------------------
[00] [01] [02] [03]
[00]: - 0.4(1) 0.0(0) 0.4(1)
[01]: 0.4(1) - 0.4(1) 0.0(0)
[02]: 0.0(0) 0.4(1) - 0.4(1)
[03]: 0.4(1) 0.0(0) 0.4(1) -
---------------------
cacheflush times [2]: 0.0 (33900) 0.4 (448514)
---------------------
Here it can be seen that there is no migration cost between two HT
siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory
system makes inter-physical-CPU migration pretty cheap: 0.4 msecs.
8-way P3/Xeon [2MB L2 cache]:
---------------------
migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz):
---------------------
[00] [01] [02] [03] [04] [05] [06] [07]
[00]: - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[01]: 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[02]: 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[03]: 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[04]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1)
[05]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1)
[06]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1)
[07]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) -
---------------------
cacheflush times [2]: 0.0 (0) 19.2 (19281756)
---------------------
This one has huge caches and a relatively slow memory subsystem - so the
migration cost is 19 msecs.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: <wilder@us.ibm.com>
Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-12 10:05:30 +01:00
|
|
|
#include <linux/vmalloc.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/delay.h>
|
2007-10-19 08:40:14 +02:00
|
|
|
#include <linux/pid_namespace.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/timer.h>
|
|
|
|
#include <linux/rcupdate.h>
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/cpuset.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/kthread.h>
|
|
|
|
#include <linux/seq_file.h>
|
2007-07-26 13:40:43 +02:00
|
|
|
#include <linux/sysctl.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
#include <linux/syscalls.h>
|
|
|
|
#include <linux/times.h>
|
2006-10-01 08:28:59 +02:00
|
|
|
#include <linux/tsacct_kern.h>
|
2006-03-26 11:38:20 +02:00
|
|
|
#include <linux/kprobes.h>
|
2006-07-14 09:24:37 +02:00
|
|
|
#include <linux/delayacct.h>
|
2007-05-08 09:32:57 +02:00
|
|
|
#include <linux/reciprocal_div.h>
|
2007-07-09 18:52:00 +02:00
|
|
|
#include <linux/unistd.h>
|
2007-09-21 09:19:54 +02:00
|
|
|
#include <linux/pagemap.h>
|
2008-01-25 21:08:29 +01:00
|
|
|
#include <linux/hrtimer.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-05-08 09:32:57 +02:00
|
|
|
#include <asm/tlb.h>
|
2007-10-24 18:23:50 +02:00
|
|
|
#include <asm/irq_regs.h>
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-02-10 10:45:10 +01:00
|
|
|
/*
|
|
|
|
* Scheduler clock - returns current time in nanosec units.
|
|
|
|
* This is default implementation.
|
|
|
|
* Architectures and sub-architectures can override this.
|
|
|
|
*/
|
|
|
|
unsigned long long __attribute__((weak)) sched_clock(void)
|
|
|
|
{
|
2007-11-09 22:39:38 +01:00
|
|
|
return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
|
2007-02-10 10:45:10 +01:00
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
 * Nice value <-> static priority conversion.
 *
 * User-nice values span [ -20 ... 0 ... 19 ]; they are shifted onto the
 * static priority range [ MAX_RT_PRIO..MAX_PRIO-1 ], and back again.
 * TASK_NICE() applies the reverse mapping to a task's ->static_prio.
 */
#define NICE_TO_PRIO(nice)	((nice) + 20 + MAX_RT_PRIO)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
|
|
|
|
|
|
|
|
/*
 * 'User priority': the nice value rebased into a [ 0 ... 39 ] range,
 * which is easier to work with when scaling various scheduler
 * parameters.
 *
 * USER_PRIO()      - rebase a priority value by subtracting MAX_RT_PRIO
 * TASK_USER_PRIO() - the same, applied to a task's ->static_prio
 * MAX_USER_PRIO    - USER_PRIO() of MAX_PRIO
 */
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
|
|
|
|
|
|
|
|
/*
 * Helper for converting nanosecond timing to jiffy resolution.
 *
 * The argument is cast to unsigned long and then divided by the length
 * of one jiffy in nanoseconds (NSEC_PER_SEC / HZ).
 * NOTE(review): where unsigned long is narrower than the value passed
 * in, the cast truncates it before the division - confirm that callers
 * stay within range.
 */
#define NS_TO_JIFFIES(ns)	((unsigned long)(ns) / (NSEC_PER_SEC / HZ))
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/* NICE_0_* are aliases for the SCHED_LOAD_* scaling constants. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
 * The 'tuning knobs' of the scheduler:
 *
 * DEF_TIMESLICE is the default timeslice of 100 msecs, scaled by HZ;
 * it is used only for SCHED_RR tasks.  Timeslices get refilled after
 * they expire.
 */
#define DEF_TIMESLICE		(100 * HZ / 1000)
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
|
2007-05-08 09:32:57 +02:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
|
|
|
|
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
|
|
|
|
* Since cpu_power is a 'constant', we can use a reciprocal divide.
|
|
|
|
*/
|
|
|
|
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
|
|
|
|
{
|
|
|
|
return reciprocal_divide(load, sg->reciprocal_cpu_power);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Each time a sched group cpu_power is changed,
|
|
|
|
* we must compute its reciprocal value
|
|
|
|
*/
|
|
|
|
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
|
|
|
|
{
|
|
|
|
sg->__cpu_power += val;
|
|
|
|
sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
static inline int rt_policy(int policy)
|
|
|
|
{
|
|
|
|
if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
|
|
|
|
return 1;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int task_has_rt_policy(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return rt_policy(p->policy);
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2007-07-09 18:51:58 +02:00
|
|
|
* This is the priority-queue data structure of the RT scheduling class:
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-07-09 18:51:58 +02:00
|
|
|
struct rt_prio_array {
|
|
|
|
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
|
|
|
|
struct list_head queue[MAX_RT_PRIO];
|
|
|
|
};
|
|
|
|
|
2007-10-15 17:00:07 +02:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
|
2007-10-19 08:41:03 +02:00
|
|
|
#include <linux/cgroup.h>
|
|
|
|
|
2007-10-15 17:00:07 +02:00
|
|
|
/* Forward declaration: struct task_group below only stores pointers to it. */
struct cfs_rq;
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
/*
 * File-local list head.
 * NOTE(review): presumably links together all task groups - confirm
 * against the code that adds to and walks this list.
 */
static LIST_HEAD(task_groups);
|
|
|
|
|
2007-10-15 17:00:07 +02:00
|
|
|
/* task group related information */
|
2007-10-15 17:00:14 +02:00
|
|
|
struct task_group {
|
2007-10-19 08:41:03 +02:00
|
|
|
#ifdef CONFIG_FAIR_CGROUP_SCHED
|
|
|
|
struct cgroup_subsys_state css;
|
|
|
|
#endif
|
2007-10-15 17:00:07 +02:00
|
|
|
/* schedulable entities of this group on each cpu */
|
|
|
|
struct sched_entity **se;
|
|
|
|
/* runqueue "owned" by this group on each cpu */
|
|
|
|
struct cfs_rq **cfs_rq;
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
struct sched_rt_entity **rt_se;
|
|
|
|
struct rt_rq **rt_rq;
|
|
|
|
|
|
|
|
unsigned int rt_ratio;
|
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
/*
|
|
|
|
* shares assigned to a task group govern how much of cpu bandwidth
|
|
|
|
* is allocated to the group. The more shares a group has, the more is
|
|
|
|
* the cpu bandwidth allocated to it.
|
|
|
|
*
|
|
|
|
* For ex, lets say that there are three task groups, A, B and C which
|
|
|
|
* have been assigned shares 1000, 2000 and 3000 respectively. Then,
|
|
|
|
* cpu bandwidth allocated by the scheduler to task groups A, B and C
|
|
|
|
* should be:
|
|
|
|
*
|
|
|
|
* Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
|
|
|
|
* Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
|
2008-01-25 21:08:28 +01:00
|
|
|
* Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
*
|
|
|
|
* The weight assigned to a task group's schedulable entities on every
|
|
|
|
* cpu (task_group.se[a_cpu]->load.weight) is derived from the task
|
|
|
|
* group's shares. For ex: lets say that task group A has been
|
|
|
|
* assigned shares of 1000 and there are two CPUs in a system. Then,
|
|
|
|
*
|
|
|
|
* tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
|
|
|
|
*
|
|
|
|
* Note: It's not necessary that each of a task group's schedulable
|
2008-01-25 21:08:28 +01:00
|
|
|
* entities have the same weight on all CPUs. If the group
|
|
|
|
* has 2 of its tasks on CPU0 and 1 task on CPU1, then a
|
|
|
|
* better distribution of weight could be:
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
*
|
|
|
|
* tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
|
|
|
|
* tg_A->se[1]->load.weight = 1/3 * 2000 = 667
|
|
|
|
*
|
|
|
|
* rebalance_shares() is responsible for distributing the shares of a
|
|
|
|
* task group like this among the group's schedulable entities across
|
|
|
|
* cpus.
|
|
|
|
*
|
|
|
|
*/
|
2007-10-15 17:00:07 +02:00
|
|
|
unsigned long shares;
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
|
2007-10-29 21:18:11 +01:00
|
|
|
struct rcu_head rcu;
|
2008-01-25 21:08:30 +01:00
|
|
|
struct list_head list;
|
2007-10-15 17:00:07 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Default task group's sched entity on each cpu */
|
|
|
|
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
|
|
|
|
/* Default task group's cfs_rq on each cpu */
|
|
|
|
static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
|
|
|
|
static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
/*
 * Pointer tables with NR_CPUS entries each. They are installed below as
 * the .se and .cfs_rq members of init_task_group.
 * NOTE(review): the code that fills these tables is not visible in this
 * part of the file - confirm where they are populated.
 */
static struct sched_entity *init_sched_entity_p[NR_CPUS];
|
|
|
|
static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
/*
 * Pointer tables with NR_CPUS entries each. They are installed below as
 * the .rt_se and .rt_rq members of init_task_group.
 * NOTE(review): the code that fills these tables is not visible in this
 * part of the file - confirm where they are populated.
 */
static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
|
|
|
|
static struct rt_rq *init_rt_rq_p[NR_CPUS];
|
|
|
|
|
2008-01-25 21:07:59 +01:00
|
|
|
/* task_group_mutex serializes add/remove of task groups and also changes to
|
|
|
|
* a task group's cpu shares.
|
|
|
|
*/
|
|
|
|
static DEFINE_MUTEX(task_group_mutex);
|
|
|
|
|
2008-01-25 21:08:00 +01:00
|
|
|
/* doms_cur_mutex serializes access to doms_cur[] array */
|
|
|
|
static DEFINE_MUTEX(doms_cur_mutex);
|
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/* kernel thread that runs rebalance_shares() periodically */
|
|
|
|
static struct task_struct *lb_monitor_task;
|
|
|
|
static int load_balance_monitor(void *unused);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static void set_se_shares(struct sched_entity *se, unsigned long shares);
|
|
|
|
|
2007-10-15 17:00:07 +02:00
|
|
|
/* Default task group.
|
2007-10-15 17:00:12 +02:00
|
|
|
* Every task in the system belongs to this group at bootup.
|
2007-10-15 17:00:07 +02:00
|
|
|
*/
|
2007-10-15 17:00:14 +02:00
|
|
|
struct task_group init_task_group = {
|
2008-01-25 21:08:19 +01:00
|
|
|
.se = init_sched_entity_p,
|
2007-10-15 17:00:12 +02:00
|
|
|
.cfs_rq = init_cfs_rq_p,
|
2008-01-25 21:08:30 +01:00
|
|
|
|
|
|
|
.rt_se = init_sched_rt_entity_p,
|
|
|
|
.rt_rq = init_rt_rq_p,
|
2007-10-15 17:00:12 +02:00
|
|
|
};
|
2007-10-15 17:00:09 +02:00
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
#ifdef CONFIG_FAIR_USER_SCHED
|
2008-01-25 21:08:19 +01:00
|
|
|
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
|
2007-10-15 17:00:09 +02:00
|
|
|
#else
|
2008-01-25 21:07:59 +01:00
|
|
|
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
|
2007-10-15 17:00:09 +02:00
|
|
|
#endif
|
|
|
|
|
2008-01-25 21:08:19 +01:00
|
|
|
#define MIN_GROUP_SHARES 2
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
|
2008-01-25 21:07:59 +01:00
|
|
|
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
/* return group to which a task belongs */
|
2007-10-15 17:00:14 +02:00
|
|
|
static inline struct task_group *task_group(struct task_struct *p)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
struct task_group *tg;
|
2007-10-15 17:00:09 +02:00
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
#ifdef CONFIG_FAIR_USER_SCHED
|
|
|
|
tg = p->user->tg;
|
2007-10-19 08:41:03 +02:00
|
|
|
#elif defined(CONFIG_FAIR_CGROUP_SCHED)
|
|
|
|
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
|
|
|
|
struct task_group, css);
|
2007-10-15 17:00:09 +02:00
|
|
|
#else
|
2007-12-05 15:46:09 +01:00
|
|
|
tg = &init_task_group;
|
2007-10-15 17:00:09 +02:00
|
|
|
#endif
|
2007-10-15 17:00:09 +02:00
|
|
|
return tg;
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
|
2008-01-25 21:08:30 +01:00
|
|
|
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
2007-11-15 20:57:40 +01:00
|
|
|
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
|
|
|
|
p->se.parent = task_group(p)->se[cpu];
|
2008-01-25 21:08:30 +01:00
|
|
|
|
|
|
|
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
|
|
|
|
p->rt.parent = task_group(p)->rt_se[cpu];
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:07:59 +01:00
|
|
|
static inline void lock_task_group_list(void)
|
|
|
|
{
|
|
|
|
mutex_lock(&task_group_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void unlock_task_group_list(void)
|
|
|
|
{
|
|
|
|
mutex_unlock(&task_group_mutex);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:00 +01:00
|
|
|
static inline void lock_doms_cur(void)
|
|
|
|
{
|
|
|
|
mutex_lock(&doms_cur_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void unlock_doms_cur(void)
|
|
|
|
{
|
|
|
|
mutex_unlock(&doms_cur_mutex);
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:07 +02:00
|
|
|
#else
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
/*
 * !CONFIG_FAIR_GROUP_SCHED: the group helpers have nothing to do, so they
 * are provided as empty inlines to keep callers free of #ifdefs.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
}

static inline void lock_task_group_list(void)
{
}

static inline void unlock_task_group_list(void)
{
}

static inline void lock_doms_cur(void)
{
}

static inline void unlock_doms_cur(void)
{
}
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/* CFS-related fields in a runqueue */
|
|
|
|
struct cfs_rq {
|
|
|
|
struct load_weight load;
|
|
|
|
unsigned long nr_running;
|
|
|
|
|
|
|
|
u64 exec_clock;
|
2007-10-15 17:00:04 +02:00
|
|
|
u64 min_vruntime;
|
2007-07-09 18:51:58 +02:00
|
|
|
|
|
|
|
struct rb_root tasks_timeline;
|
|
|
|
struct rb_node *rb_leftmost;
|
|
|
|
struct rb_node *rb_load_balance_curr;
|
|
|
|
/* 'curr' points to currently running entity on this cfs_rq.
|
|
|
|
* It is set to NULL otherwise (i.e when none are currently running).
|
|
|
|
*/
|
|
|
|
struct sched_entity *curr;
|
2007-10-15 17:00:10 +02:00
|
|
|
|
|
|
|
unsigned long nr_spread_over;
|
|
|
|
|
2007-10-15 17:00:03 +02:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
2007-07-09 18:51:58 +02:00
|
|
|
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
|
|
|
|
|
2007-12-05 15:46:09 +01:00
|
|
|
/*
|
|
|
|
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
|
2007-07-09 18:51:58 +02:00
|
|
|
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
|
|
|
|
* (like users, containers etc.)
|
|
|
|
*
|
|
|
|
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
|
|
|
|
* list is used during load balance.
|
|
|
|
*/
|
2007-12-05 15:46:09 +01:00
|
|
|
struct list_head leaf_cfs_rq_list;
|
|
|
|
struct task_group *tg; /* group that "owns" this runqueue */
|
2007-07-09 18:51:58 +02:00
|
|
|
#endif
|
|
|
|
};
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/* Real-Time classes' related field in a runqueue: */
|
|
|
|
struct rt_rq {
|
|
|
|
struct rt_prio_array active;
|
2008-01-25 21:08:03 +01:00
|
|
|
unsigned long rt_nr_running;
|
2008-01-25 21:08:30 +01:00
|
|
|
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
|
|
|
|
int highest_prio; /* highest queued rt task prio */
|
|
|
|
#endif
|
2008-01-25 21:08:29 +01:00
|
|
|
#ifdef CONFIG_SMP
|
2008-01-25 21:08:07 +01:00
|
|
|
unsigned long rt_nr_migratory;
|
2008-01-25 21:08:12 +01:00
|
|
|
int overloaded;
|
2008-01-25 21:08:29 +01:00
|
|
|
#endif
|
2008-01-25 21:08:30 +01:00
|
|
|
int rt_throttled;
|
2008-01-25 21:08:29 +01:00
|
|
|
u64 rt_time;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
struct rq *rq;
|
|
|
|
struct list_head leaf_rt_rq_list;
|
|
|
|
struct task_group *tg;
|
|
|
|
struct sched_rt_entity *rt_se;
|
|
|
|
#endif
|
2007-07-09 18:51:58 +02:00
|
|
|
};
|
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We add the notion of a root-domain which will be used to define per-domain
|
2008-01-25 21:08:19 +01:00
|
|
|
* variables. Each exclusive cpuset essentially defines an island domain by
|
|
|
|
* fully partitioning the member cpus from any other cpuset. Whenever a new
|
2008-01-25 21:08:18 +01:00
|
|
|
* exclusive cpuset is created, we also create and attach a new root-domain
|
|
|
|
* object.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
struct root_domain {
|
|
|
|
atomic_t refcount;
|
|
|
|
cpumask_t span;
|
|
|
|
cpumask_t online;
|
2008-01-25 21:08:18 +01:00
|
|
|
|
2008-01-25 21:08:19 +01:00
|
|
|
/*
|
2008-01-25 21:08:18 +01:00
|
|
|
* The "RT overload" flag: it gets set if a CPU has more than
|
|
|
|
* one runnable RT task.
|
|
|
|
*/
|
|
|
|
cpumask_t rto_mask;
|
2008-01-25 21:08:19 +01:00
|
|
|
atomic_t rto_count;
|
2008-01-25 21:08:18 +01:00
|
|
|
};
|
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
/*
|
|
|
|
* By default the system creates a single root-domain with all cpus as
|
|
|
|
* members (mimicking the global state we have today).
|
|
|
|
*/
|
2008-01-25 21:08:18 +01:00
|
|
|
static struct root_domain def_root_domain;
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* This is the main, per-CPU runqueue data structure.
|
|
|
|
*
|
|
|
|
* Locking rule: those places that want to lock multiple runqueues
|
|
|
|
* (such as the load balancing or the thread migration code), lock
|
|
|
|
* acquire operations must be ordered by ascending &runqueue.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq {
|
2007-10-18 21:32:55 +02:00
|
|
|
/* runqueue lock: */
|
|
|
|
spinlock_t lock;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* nr_running and cpu_load should be in the same cacheline because
|
|
|
|
* remote CPUs use both these fields when doing load calculation.
|
|
|
|
*/
|
|
|
|
unsigned long nr_running;
|
2007-07-09 18:51:58 +02:00
|
|
|
#define CPU_LOAD_IDX_MAX 5
|
|
|
|
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
|
2007-05-08 09:32:48 +02:00
|
|
|
unsigned char idle_at_tick;
|
2007-05-08 09:32:51 +02:00
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
unsigned char in_nohz_recently;
|
|
|
|
#endif
|
2007-10-18 21:32:55 +02:00
|
|
|
/* capture load from *all* tasks on this cpu: */
|
|
|
|
struct load_weight load;
|
2007-07-09 18:51:58 +02:00
|
|
|
unsigned long nr_load_updates;
|
|
|
|
u64 nr_switches;
|
|
|
|
|
|
|
|
struct cfs_rq cfs;
|
2008-01-25 21:08:30 +01:00
|
|
|
struct rt_rq rt;
|
|
|
|
u64 rt_period_expire;
|
2008-01-25 21:08:31 +01:00
|
|
|
int rt_throttled;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
2007-10-18 21:32:55 +02:00
|
|
|
/* list of leaf cfs_rq on this cpu: */
|
|
|
|
struct list_head leaf_cfs_rq_list;
|
2008-01-25 21:08:30 +01:00
|
|
|
struct list_head leaf_rt_rq_list;
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is part of a global counter where only the total sum
|
|
|
|
* over all CPUs matters. A task can increase this counter on
|
|
|
|
* one CPU and if it got migrated afterwards it may decrease
|
|
|
|
* it on another CPU. Always updated under the runqueue lock:
|
|
|
|
*/
|
|
|
|
unsigned long nr_uninterruptible;
|
|
|
|
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *curr, *idle;
|
2006-12-10 11:20:25 +01:00
|
|
|
unsigned long next_balance;
|
2005-04-17 00:20:36 +02:00
|
|
|
struct mm_struct *prev_mm;
|
2007-07-09 18:51:58 +02:00
|
|
|
|
|
|
|
u64 clock, prev_clock_raw;
|
|
|
|
s64 clock_max_delta;
|
|
|
|
|
2008-01-25 21:08:34 +01:00
|
|
|
unsigned int clock_warps, clock_overflows, clock_underflows;
|
2007-08-23 15:18:02 +02:00
|
|
|
u64 idle_clock;
|
|
|
|
unsigned int clock_deep_idle_events;
|
2007-08-10 23:05:11 +02:00
|
|
|
u64 tick_timestamp;
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
atomic_t nr_iowait;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2008-01-25 21:08:19 +01:00
|
|
|
struct root_domain *rd;
|
2005-04-17 00:20:36 +02:00
|
|
|
struct sched_domain *sd;
|
|
|
|
|
|
|
|
/* For active balancing */
|
|
|
|
int active_balance;
|
|
|
|
int push_cpu;
|
2007-10-18 21:32:55 +02:00
|
|
|
/* cpu of this runqueue: */
|
|
|
|
int cpu;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *migration_thread;
|
2005-04-17 00:20:36 +02:00
|
|
|
struct list_head migration_queue;
|
|
|
|
#endif
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
|
|
unsigned long hrtick_flags;
|
|
|
|
ktime_t hrtick_expire;
|
|
|
|
struct hrtimer hrtick_timer;
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
/* latency stats */
|
|
|
|
struct sched_info rq_sched_info;
|
|
|
|
|
|
|
|
/* sys_sched_yield() stats */
|
2007-10-18 21:32:56 +02:00
|
|
|
unsigned int yld_exp_empty;
|
|
|
|
unsigned int yld_act_empty;
|
|
|
|
unsigned int yld_both_empty;
|
|
|
|
unsigned int yld_count;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* schedule() stats */
|
2007-10-18 21:32:56 +02:00
|
|
|
unsigned int sched_switch;
|
|
|
|
unsigned int sched_count;
|
|
|
|
unsigned int sched_goidle;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* try_to_wake_up() stats */
|
2007-10-18 21:32:56 +02:00
|
|
|
unsigned int ttwu_count;
|
|
|
|
unsigned int ttwu_local;
|
2007-10-15 17:00:10 +02:00
|
|
|
|
|
|
|
/* BKL stats */
|
2007-10-18 21:32:56 +02:00
|
|
|
unsigned int bkl_count;
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
2006-07-03 09:25:10 +02:00
|
|
|
struct lock_class_key rq_lock_key;
|
2005-04-17 00:20:36 +02:00
|
|
|
};
|
|
|
|
|
2007-07-19 10:48:13 +02:00
|
|
|
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
|
|
|
rq->curr->sched_class->check_preempt_curr(rq, p);
|
|
|
|
}
|
|
|
|
|
2006-09-26 08:30:51 +02:00
|
|
|
/*
 * Return the cpu number a runqueue belongs to. On uniprocessor builds the
 * runqueue carries no cpu field and the answer is always 0.
 */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}
|
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/*
|
2007-08-09 11:16:46 +02:00
|
|
|
* Update the per-runqueue clock, as finegrained as the platform can give
|
|
|
|
* us, but without assuming monotonicity, etc.:
|
2007-07-09 18:51:58 +02:00
|
|
|
*/
|
2007-08-09 11:16:46 +02:00
|
|
|
static void __update_rq_clock(struct rq *rq)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
|
|
|
u64 prev_raw = rq->prev_clock_raw;
|
|
|
|
u64 now = sched_clock();
|
|
|
|
s64 delta = now - prev_raw;
|
|
|
|
u64 clock = rq->clock;
|
|
|
|
|
2007-08-09 11:16:46 +02:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
|
|
|
|
#endif
|
2007-07-09 18:51:58 +02:00
|
|
|
/*
|
|
|
|
* Protect against sched_clock() occasionally going backwards:
|
|
|
|
*/
|
|
|
|
if (unlikely(delta < 0)) {
|
|
|
|
clock++;
|
|
|
|
rq->clock_warps++;
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Catch too large forward jumps too:
|
|
|
|
*/
|
2007-08-10 23:05:11 +02:00
|
|
|
if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) {
|
|
|
|
if (clock < rq->tick_timestamp + TICK_NSEC)
|
|
|
|
clock = rq->tick_timestamp + TICK_NSEC;
|
|
|
|
else
|
|
|
|
clock++;
|
2007-07-09 18:51:58 +02:00
|
|
|
rq->clock_overflows++;
|
|
|
|
} else {
|
|
|
|
if (unlikely(delta > rq->clock_max_delta))
|
|
|
|
rq->clock_max_delta = delta;
|
|
|
|
clock += delta;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
rq->prev_clock_raw = now;
|
|
|
|
rq->clock = clock;
|
2007-08-09 11:16:46 +02:00
|
|
|
}
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2007-08-09 11:16:46 +02:00
|
|
|
/*
 * Update the clock of @rq, but only when @rq is the runqueue of the CPU
 * we are executing on; remote runqueues are left untouched.
 */
static void update_rq_clock(struct rq *rq)
{
	if (unlikely(smp_processor_id() != cpu_of(rq)))
		return;

	__update_rq_clock(rq);
}
|
|
|
|
|
2005-06-25 23:57:27 +02:00
|
|
|
/*
|
|
|
|
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
|
2005-06-25 23:57:33 +02:00
|
|
|
* See detach_destroy_domains: synchronize_sched for details.
|
2005-06-25 23:57:27 +02:00
|
|
|
*
|
|
|
|
* The domain tree of any CPU may only be accessed from within
|
|
|
|
* preempt-disabled sections.
|
|
|
|
*/
|
2006-07-03 09:25:40 +02:00
|
|
|
#define for_each_domain(cpu, __sd) \
|
|
|
|
for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
|
|
|
|
#define this_rq() (&__get_cpu_var(runqueues))
|
|
|
|
#define task_rq(p) cpu_rq(task_cpu(p))
|
|
|
|
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
|
|
|
|
|
2008-01-25 21:08:31 +01:00
|
|
|
unsigned long rt_needs_cpu(int cpu)
|
|
|
|
{
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
u64 delta;
|
|
|
|
|
|
|
|
if (!rq->rt_throttled)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (rq->clock > rq->rt_period_expire)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
delta = rq->rt_period_expire - rq->clock;
|
|
|
|
do_div(delta, NSEC_PER_SEC / HZ);
|
|
|
|
|
|
|
|
return (unsigned long)delta;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:04 +02:00
|
|
|
/*
|
|
|
|
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
# define const_debug __read_mostly
|
|
|
|
#else
|
|
|
|
# define const_debug static const
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Debugging: scheduler feature bits. Each feature owns one bit so that
 * features can be OR-ed together in sysctl_sched_features.
 */
enum {
	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1 << 0,
	SCHED_FEAT_WAKEUP_PREEMPT	= 1 << 1,
	SCHED_FEAT_START_DEBIT		= 1 << 2,
	SCHED_FEAT_TREE_AVG		= 1 << 3,
	SCHED_FEAT_APPROX_AVG		= 1 << 4,
	SCHED_FEAT_HRTICK		= 1 << 5,
	SCHED_FEAT_DOUBLE_TICK		= 1 << 6,
};
|
|
|
|
|
|
|
|
const_debug unsigned int sysctl_sched_features =
|
2007-10-18 21:32:55 +02:00
|
|
|
SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
|
2007-11-15 20:57:40 +01:00
|
|
|
SCHED_FEAT_WAKEUP_PREEMPT * 1 |
|
2007-10-18 21:32:55 +02:00
|
|
|
SCHED_FEAT_START_DEBIT * 1 |
|
|
|
|
SCHED_FEAT_TREE_AVG * 0 |
|
2008-01-25 21:08:29 +01:00
|
|
|
SCHED_FEAT_APPROX_AVG * 0 |
|
|
|
|
SCHED_FEAT_HRTICK * 1 |
|
|
|
|
SCHED_FEAT_DOUBLE_TICK * 0;
|
2007-10-15 17:00:04 +02:00
|
|
|
|
|
|
|
#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
|
|
|
|
|
2007-11-09 22:39:39 +01:00
|
|
|
/*
|
|
|
|
* Number of tasks to iterate in a single balance run.
|
|
|
|
* Limited because this is done with IRQs disabled.
|
|
|
|
*/
|
|
|
|
const_debug unsigned int sysctl_sched_nr_migrate = 32;
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
/*
|
|
|
|
* period over which we measure -rt task cpu usage in ms.
|
|
|
|
* default: 1s
|
|
|
|
*/
|
|
|
|
const_debug unsigned int sysctl_sched_rt_period = 1000;
|
|
|
|
|
|
|
|
#define SCHED_RT_FRAC_SHIFT 16
|
|
|
|
#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ratio of time -rt tasks may consume.
|
2008-01-25 21:08:30 +01:00
|
|
|
* default: 95%
|
2008-01-25 21:08:29 +01:00
|
|
|
*/
|
2008-01-25 21:08:30 +01:00
|
|
|
const_debug unsigned int sysctl_sched_rt_ratio = 62259;
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2007-07-19 21:28:35 +02:00
|
|
|
/*
|
|
|
|
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
|
|
|
|
* clock constructed from sched_clock():
|
|
|
|
*/
|
|
|
|
unsigned long long cpu_clock(int cpu)
|
|
|
|
{
|
|
|
|
unsigned long long now;
|
|
|
|
unsigned long flags;
|
2007-08-09 11:16:46 +02:00
|
|
|
struct rq *rq;
|
2007-07-19 21:28:35 +02:00
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
local_irq_save(flags);
|
2007-08-09 11:16:46 +02:00
|
|
|
rq = cpu_rq(cpu);
|
2007-12-07 19:02:47 +01:00
|
|
|
/*
|
|
|
|
* Only call sched_clock() if the scheduler has already been
|
|
|
|
* initialized (some code might call cpu_clock() very early):
|
|
|
|
*/
|
|
|
|
if (rq->idle)
|
|
|
|
update_rq_clock(rq);
|
2007-08-09 11:16:46 +02:00
|
|
|
now = rq->clock;
|
2007-07-26 13:40:43 +02:00
|
|
|
local_irq_restore(flags);
|
2007-07-19 21:28:35 +02:00
|
|
|
|
|
|
|
return now;
|
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
EXPORT_SYMBOL_GPL(cpu_clock);
|
2007-07-19 21:28:35 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifndef prepare_arch_switch
|
2005-06-25 23:57:23 +02:00
|
|
|
# define prepare_arch_switch(next) do { } while (0)
|
|
|
|
#endif
|
|
|
|
#ifndef finish_arch_switch
|
|
|
|
# define finish_arch_switch(prev) do { } while (0)
|
|
|
|
#endif
|
|
|
|
|
2007-12-18 15:21:13 +01:00
|
|
|
static inline int task_current(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
|
|
|
return rq->curr == p;
|
|
|
|
}
|
|
|
|
|
2005-06-25 23:57:23 +02:00
|
|
|
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
|
2006-07-03 09:25:42 +02:00
|
|
|
/*
 * Locked context switch: the runqueue lock is held across the switch, so
 * "running" is simply "is the current task of the runqueue".
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return task_current(rq, p);
}
|
|
|
|
|
2006-07-03 09:25:42 +02:00
|
|
|
/* Locked context switch: nothing to prepare, the lock stays held. */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
|
|
|
|
|
2006-07-03 09:25:42 +02:00
|
|
|
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
|
2005-06-25 23:57:23 +02:00
|
|
|
{
|
2005-09-13 11:17:59 +02:00
|
|
|
#ifdef CONFIG_DEBUG_SPINLOCK
|
|
|
|
/* this is a valid case when another task releases the spinlock */
|
|
|
|
rq->lock.owner = current;
|
|
|
|
#endif
|
2006-07-03 09:24:54 +02:00
|
|
|
/*
|
|
|
|
* If we are tracking spinlock dependencies then we have to
|
|
|
|
* fix up the runqueue lock - which gets 'carried over' from
|
|
|
|
* prev into current:
|
|
|
|
*/
|
|
|
|
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
|
|
|
|
|
2005-06-25 23:57:23 +02:00
|
|
|
spin_unlock_irq(&rq->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
|
2006-07-03 09:25:42 +02:00
|
|
|
/*
 * Unlocked context switch: on SMP a task counts as running for as long as
 * its ->oncpu flag is set; on UP fall back to comparing with rq->curr.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->oncpu;
#endif
}
|
|
|
|
|
2006-07-03 09:25:42 +02:00
|
|
|
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
|
2005-06-25 23:57:23 +02:00
|
|
|
{
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
|
|
|
|
* We can optimise this out completely for !SMP, because the
|
|
|
|
* SMP rebalancing from interrupt is the only thing that cares
|
|
|
|
* here.
|
|
|
|
*/
|
|
|
|
next->oncpu = 1;
|
|
|
|
#endif
|
|
|
|
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
|
|
|
|
spin_unlock_irq(&rq->lock);
|
|
|
|
#else
|
|
|
|
spin_unlock(&rq->lock);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2006-07-03 09:25:42 +02:00
|
|
|
/*
 * Unlocked context switch: the switch is done, so let go of @prev and
 * turn interrupts back on if they were kept off during the switch.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * Once ->oncpu is cleared the task may be moved to another CPU.
	 * That must not happen before the switch is completely finished,
	 * hence the write barrier ahead of the store.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
|
|
|
|
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-06-27 11:54:51 +02:00
|
|
|
/*
|
|
|
|
* __task_rq_lock - lock the runqueue a given task resides on.
|
|
|
|
* Must be called interrupts disabled.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static inline struct rq *__task_rq_lock(struct task_struct *p)
|
2006-06-27 11:54:51 +02:00
|
|
|
__acquires(rq->lock)
|
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
for (;;) {
|
|
|
|
struct rq *rq = task_rq(p);
|
|
|
|
spin_lock(&rq->lock);
|
|
|
|
if (likely(rq == task_rq(p)))
|
|
|
|
return rq;
|
2006-06-27 11:54:51 +02:00
|
|
|
spin_unlock(&rq->lock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* task_rq_lock - lock the runqueue a given task resides on and disable
|
2007-12-05 15:46:09 +01:00
|
|
|
* interrupts. Note the ordering: we can safely lookup the task_rq without
|
2005-04-17 00:20:36 +02:00
|
|
|
* explicitly disabling preemption.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
|
2005-04-17 00:20:36 +02:00
|
|
|
__acquires(rq->lock)
|
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
for (;;) {
|
|
|
|
local_irq_save(*flags);
|
|
|
|
rq = task_rq(p);
|
|
|
|
spin_lock(&rq->lock);
|
|
|
|
if (likely(rq == task_rq(p)))
|
|
|
|
return rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock_irqrestore(&rq->lock, *flags);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
static void __task_rq_unlock(struct rq *rq)
|
2006-06-27 11:54:51 +02:00
|
|
|
__releases(rq->lock)
|
|
|
|
{
|
|
|
|
spin_unlock(&rq->lock);
|
|
|
|
}
|
|
|
|
|
2006-07-03 09:25:42 +02:00
|
|
|
static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
|
2005-04-17 00:20:36 +02:00
|
|
|
__releases(rq->lock)
|
|
|
|
{
|
|
|
|
spin_unlock_irqrestore(&rq->lock, *flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-12-10 11:20:00 +01:00
|
|
|
* this_rq_lock - lock this runqueue and disable interrupts.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static struct rq *this_rq_lock(void)
|
2005-04-17 00:20:36 +02:00
|
|
|
__acquires(rq->lock)
|
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
local_irq_disable();
|
|
|
|
rq = this_rq();
|
|
|
|
spin_lock(&rq->lock);
|
|
|
|
|
|
|
|
return rq;
|
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2007-08-23 15:18:02 +02:00
|
|
|
* We are going deep-idle (irqs are disabled):
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
2007-08-23 15:18:02 +02:00
|
|
|
void sched_clock_idle_sleep_event(void)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2007-08-23 15:18:02 +02:00
|
|
|
struct rq *rq = cpu_rq(smp_processor_id());
|
|
|
|
|
|
|
|
spin_lock(&rq->lock);
|
|
|
|
__update_rq_clock(rq);
|
|
|
|
spin_unlock(&rq->lock);
|
|
|
|
rq->clock_deep_idle_events++;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We just idled delta nanoseconds (called with irqs disabled):
|
|
|
|
*/
|
|
|
|
void sched_clock_idle_wakeup_event(u64 delta_ns)
|
|
|
|
{
|
|
|
|
struct rq *rq = cpu_rq(smp_processor_id());
|
|
|
|
u64 now = sched_clock();
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2007-08-23 15:18:02 +02:00
|
|
|
rq->idle_clock += delta_ns;
|
|
|
|
/*
|
|
|
|
* Override the previous timestamp and ignore all
|
|
|
|
* sched_clock() deltas that occured while we idled,
|
|
|
|
* and use the PM-provided delta_ns to advance the
|
|
|
|
* rq clock:
|
|
|
|
*/
|
|
|
|
spin_lock(&rq->lock);
|
|
|
|
rq->prev_clock_raw = now;
|
|
|
|
rq->clock += delta_ns;
|
|
|
|
spin_unlock(&rq->lock);
|
2008-01-25 21:08:33 +01:00
|
|
|
touch_softlockup_watchdog();
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2007-08-23 15:18:02 +02:00
|
|
|
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
static void __resched_task(struct task_struct *p, int tif_bit);
|
|
|
|
|
|
|
|
static inline void resched_task(struct task_struct *p)
|
|
|
|
{
|
|
|
|
__resched_task(p, TIF_NEED_RESCHED);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
|
|
/*
|
|
|
|
* Use HR-timers to deliver accurate preemption points.
|
|
|
|
*
|
|
|
|
* Its all a bit involved since we cannot program an hrt while holding the
|
|
|
|
* rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
|
|
|
|
* reschedule event.
|
|
|
|
*
|
|
|
|
* When we get rescheduled we reprogram the hrtick_timer outside of the
|
|
|
|
* rq->lock.
|
|
|
|
*/
|
|
|
|
static inline void resched_hrt(struct task_struct *p)
|
|
|
|
{
|
|
|
|
__resched_task(p, TIF_HRTICK_RESCHED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void resched_rq(struct rq *rq)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
resched_task(rq->curr);
|
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Bit numbers used in rq->hrtick_flags. */
enum {
	HRTICK_SET	= 0,	/* re-program hrtick_timer */
	HRTICK_RESET	= 1,	/* not a new slice */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use hrtick when:
|
|
|
|
* - enabled by features
|
|
|
|
* - hrtimer is actually high res
|
|
|
|
*/
|
|
|
|
static inline int hrtick_enabled(struct rq *rq)
|
|
|
|
{
|
|
|
|
if (!sched_feat(HRTICK))
|
|
|
|
return 0;
|
|
|
|
return hrtimer_is_hres_active(&rq->hrtick_timer);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Called to set the hrtick timer state.
|
|
|
|
*
|
|
|
|
* called with rq->lock held and irqs disabled
|
|
|
|
*/
|
|
|
|
static void hrtick_start(struct rq *rq, u64 delay, int reset)
|
|
|
|
{
|
|
|
|
assert_spin_locked(&rq->lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* preempt at: now + delay
|
|
|
|
*/
|
|
|
|
rq->hrtick_expire =
|
|
|
|
ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
|
|
|
|
/*
|
|
|
|
* indicate we need to program the timer
|
|
|
|
*/
|
|
|
|
__set_bit(HRTICK_SET, &rq->hrtick_flags);
|
|
|
|
if (reset)
|
|
|
|
__set_bit(HRTICK_RESET, &rq->hrtick_flags);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* New slices are called from the schedule path and don't need a
|
|
|
|
* forced reschedule.
|
|
|
|
*/
|
|
|
|
if (reset)
|
|
|
|
resched_hrt(rq->curr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void hrtick_clear(struct rq *rq)
|
|
|
|
{
|
|
|
|
if (hrtimer_active(&rq->hrtick_timer))
|
|
|
|
hrtimer_cancel(&rq->hrtick_timer);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update the timer from the possible pending state.
|
|
|
|
*/
|
|
|
|
static void hrtick_set(struct rq *rq)
|
|
|
|
{
|
|
|
|
ktime_t time;
|
|
|
|
int set, reset;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
|
|
|
|
|
|
|
|
spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
|
|
|
|
reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
|
|
|
|
time = rq->hrtick_expire;
|
|
|
|
clear_thread_flag(TIF_HRTICK_RESCHED);
|
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
|
|
|
|
if (set) {
|
|
|
|
hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
|
|
|
|
if (reset && !hrtimer_active(&rq->hrtick_timer))
|
|
|
|
resched_rq(rq);
|
|
|
|
} else
|
|
|
|
hrtick_clear(rq);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* High-resolution timer tick.
|
|
|
|
* Runs from hardirq context with interrupts disabled.
|
|
|
|
*/
|
|
|
|
static enum hrtimer_restart hrtick(struct hrtimer *timer)
|
|
|
|
{
|
|
|
|
struct rq *rq = container_of(timer, struct rq, hrtick_timer);
|
|
|
|
|
|
|
|
WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
|
|
|
|
|
|
|
|
spin_lock(&rq->lock);
|
|
|
|
__update_rq_clock(rq);
|
|
|
|
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
|
|
|
|
spin_unlock(&rq->lock);
|
|
|
|
|
|
|
|
return HRTIMER_NORESTART;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void init_rq_hrtick(struct rq *rq)
|
|
|
|
{
|
|
|
|
rq->hrtick_flags = 0;
|
|
|
|
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
|
|
rq->hrtick_timer.function = hrtick;
|
|
|
|
rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
|
|
|
|
}
|
|
|
|
|
|
|
|
void hrtick_resched(void)
|
|
|
|
{
|
|
|
|
struct rq *rq;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
if (!test_thread_flag(TIF_HRTICK_RESCHED))
|
|
|
|
return;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
rq = cpu_rq(smp_processor_id());
|
|
|
|
hrtick_set(rq);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/* !CONFIG_SCHED_HRTICK: the hrtick hooks are no-ops. */
static inline void hrtick_clear(struct rq *rq) { }

static inline void hrtick_set(struct rq *rq) { }

static inline void init_rq_hrtick(struct rq *rq) { }

void hrtick_resched(void) { }
|
|
|
|
#endif
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* resched_task - mark a task 'to be rescheduled now'.
|
|
|
|
*
|
|
|
|
* On UP this means the setting of the need_resched flag, on SMP it
|
|
|
|
* might also involve a cross-CPU call to trigger the scheduler on
|
|
|
|
* the target CPU.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
#ifndef tsk_is_polling
|
|
|
|
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
|
|
|
|
#endif
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
static void __resched_task(struct task_struct *p, int tif_bit)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
assert_spin_locked(&task_rq(p)->lock);
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
if (unlikely(test_tsk_thread_flag(p, tif_bit)))
|
2007-07-09 18:51:59 +02:00
|
|
|
return;
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
set_tsk_thread_flag(p, tif_bit);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
cpu = task_cpu(p);
|
|
|
|
if (cpu == smp_processor_id())
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* NEED_RESCHED must be visible before we test polling */
|
|
|
|
smp_mb();
|
|
|
|
if (!tsk_is_polling(p))
|
|
|
|
smp_send_reschedule(cpu);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void resched_cpu(int cpu)
|
|
|
|
{
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
if (!spin_trylock_irqsave(&rq->lock, flags))
|
|
|
|
return;
|
|
|
|
resched_task(cpu_curr(cpu));
|
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
}
|
|
|
|
#else
|
2008-01-25 21:08:29 +01:00
|
|
|
static void __resched_task(struct task_struct *p, int tif_bit)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
assert_spin_locked(&task_rq(p)->lock);
|
2008-01-25 21:08:29 +01:00
|
|
|
set_tsk_thread_flag(p, tif_bit);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
#if BITS_PER_LONG == 32
|
|
|
|
# define WMULT_CONST (~0UL)
|
|
|
|
#else
|
|
|
|
# define WMULT_CONST (1UL << 32)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define WMULT_SHIFT 32
|
|
|
|
|
2007-08-09 11:16:51 +02:00
|
|
|
/*
|
|
|
|
* Shift right and round:
|
|
|
|
*/
|
2007-09-05 14:32:49 +02:00
|
|
|
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
|
2007-08-09 11:16:51 +02:00
|
|
|
|
2007-08-02 17:41:40 +02:00
|
|
|
static unsigned long
|
2007-07-09 18:51:59 +02:00
|
|
|
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
|
|
|
|
struct load_weight *lw)
|
|
|
|
{
|
|
|
|
u64 tmp;
|
|
|
|
|
|
|
|
if (unlikely(!lw->inv_weight))
|
2007-08-09 11:16:51 +02:00
|
|
|
lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
tmp = (u64)delta_exec * weight;
|
|
|
|
/*
|
|
|
|
* Check whether we'd overflow the 64-bit multiplication:
|
|
|
|
*/
|
2007-08-09 11:16:51 +02:00
|
|
|
if (unlikely(tmp > WMULT_CONST))
|
2007-09-05 14:32:49 +02:00
|
|
|
tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
|
2007-08-09 11:16:51 +02:00
|
|
|
WMULT_SHIFT/2);
|
|
|
|
else
|
2007-09-05 14:32:49 +02:00
|
|
|
tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2007-08-02 17:41:40 +02:00
|
|
|
return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long
|
|
|
|
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
|
|
|
|
{
|
|
|
|
return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:04 +02:00
|
|
|
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
lw->weight += inc;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:04 +02:00
|
|
|
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
lw->weight -= dec;
|
|
|
|
}
|
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
/*
|
|
|
|
* To aid in avoiding the subversion of "niceness" due to uneven distribution
|
|
|
|
* of tasks with abnormal "nice" values across CPUs the contribution that
|
|
|
|
* each task makes to its run queue's load is weighted according to its
|
2007-12-05 15:46:09 +01:00
|
|
|
* scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
* scaled version of the new time slice allocation that they receive on time
|
|
|
|
* slice expiry etc.
|
|
|
|
*/
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
 * Load weight given to SCHED_IDLE tasks, and its precalculated inverse
 * (2^32 / WEIGHT_IDLEPRIO).
 *
 * The inverse must be an unsigned constant: a plain int "1 << 31" shifts
 * into the sign bit, and the resulting negative value would be
 * sign-extended when stored into a wider unsigned inverse weight.
 */
#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1UL << 31)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Nice levels are multiplicative, with a gentle 10% change for every
|
|
|
|
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
|
|
|
|
* nice 1, it will get ~10% less CPU time than another CPU-bound task
|
|
|
|
* that remained on nice 0.
|
|
|
|
*
|
|
|
|
* The "10% effect" is relative and cumulative: from _any_ nice level,
|
|
|
|
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
|
2007-07-16 09:46:30 +02:00
|
|
|
* it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
|
|
|
|
* If a task goes up by ~10% and another task goes down by ~10% then
|
|
|
|
* the relative distance between them is ~25%.)
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
|
|
|
/*
 * Load weight per nice level, indexed by (nice + 20); nice 0 maps to 1024.
 * Each row below holds five consecutive nice levels, starting at the
 * level named in the row comment.
 */
static const int prio_to_weight[40] = {
	/* nice -20 */	88761,	71755,	56483,	46273,	36291,
	/* nice -15 */	29154,	23254,	18705,	14949,	11916,
	/* nice -10 */	 9548,	 7620,	 6100,	 4904,	 3906,
	/* nice  -5 */	 3121,	 2501,	 1991,	 1586,	 1277,
	/* nice   0 */	 1024,	  820,	  655,	  526,	  423,
	/* nice   5 */	  335,	  272,	  215,	  172,	  137,
	/* nice  10 */	  110,	   87,	   70,	   56,	   45,
	/* nice  15 */	   36,	   29,	   23,	   18,	   15,
};
|
|
|
|
|
2007-07-16 09:46:31 +02:00
|
|
|
/*
|
|
|
|
* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
|
|
|
|
*
|
|
|
|
* In cases where the weight does not change often, we can use the
|
|
|
|
* precalculated inverse to speed up arithmetics by turning divisions
|
|
|
|
* into multiplications:
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
 * Precalculated inverse weights, laid out like prio_to_weight[]: entry i
 * is the inverse for the weight at prio_to_weight[i] (nice 0 maps to
 * 4194304 == 2^32 / 1024). Five consecutive nice levels per row.
 */
static const u32 prio_to_wmult[40] = {
	/* nice -20 */	    48388,     59856,     76040,     92818,    118348,
	/* nice -15 */	   147320,    184698,    229616,    287308,    360437,
	/* nice -10 */	   449829,    563644,    704093,    875809,   1099582,
	/* nice  -5 */	  1376151,   1717300,   2157191,   2708050,   3363326,
	/* nice   0 */	  4194304,   5237765,   6557202,   8165337,  10153587,
	/* nice   5 */	 12820798,  15790321,  19976592,  24970740,  31350126,
	/* nice  10 */	 39045157,  49367440,  61356676,  76695844,  95443717,
	/* nice  15 */	119304647, 148102320, 186737708, 238609294, 286331153,
};
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* runqueue iterator, to support SMP load-balancing between different
|
|
|
|
* scheduling classes, without having to expose their internal data
|
|
|
|
* structures to the load-balancing proper:
|
|
|
|
*/
|
|
|
|
struct rq_iterator {
	/* opaque cookie; presumably what start()/next() receive - confirm at the call sites */
	void			*arg;
	/* returns a task, given a void * argument */
	struct task_struct	*(*start)(void *);
	/* returns a task, given a void * argument */
	struct task_struct	*(*next)(void *);
};
|
|
|
|
|
2007-10-24 18:23:51 +02:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
static unsigned long
|
|
|
|
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
|
|
|
unsigned long max_load_move, struct sched_domain *sd,
|
|
|
|
enum cpu_idle_type idle, int *all_pinned,
|
|
|
|
int *this_best_prio, struct rq_iterator *iterator);
|
|
|
|
|
|
|
|
static int
|
|
|
|
iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
|
|
|
struct sched_domain *sd, enum cpu_idle_type idle,
|
|
|
|
struct rq_iterator *iterator);
|
|
|
|
#endif
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2007-12-02 20:04:49 +01:00
|
|
|
#ifdef CONFIG_CGROUP_CPUACCT
|
|
|
|
static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
|
|
|
|
#else
|
|
|
|
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
|
|
|
|
#endif
|
|
|
|
|
2008-01-25 21:08:00 +01:00
|
|
|
static inline void inc_cpu_load(struct rq *rq, unsigned long load)
|
|
|
|
{
|
|
|
|
update_load_add(&rq->load, load);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void dec_cpu_load(struct rq *rq, unsigned long load)
|
|
|
|
{
|
|
|
|
update_load_sub(&rq->load, load);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:09 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
static unsigned long source_load(int cpu, int type);
|
|
|
|
static unsigned long target_load(int cpu, int type);
|
|
|
|
static unsigned long cpu_avg_load_per_task(int cpu);
|
|
|
|
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
#include "sched_stats.h"
|
|
|
|
#include "sched_idletask.c"
|
2007-10-15 17:00:12 +02:00
|
|
|
#include "sched_fair.c"
|
|
|
|
#include "sched_rt.c"
|
2007-07-09 18:51:59 +02:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
# include "sched_debug.c"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define sched_class_highest (&rt_sched_class)
|
|
|
|
|
2008-01-31 22:45:23 +01:00
|
|
|
static void inc_nr_running(struct rq *rq)
|
2007-08-02 17:41:40 +02:00
|
|
|
{
|
|
|
|
rq->nr_running++;
|
|
|
|
}
|
|
|
|
|
2008-01-31 22:45:23 +01:00
|
|
|
static void dec_nr_running(struct rq *rq)
|
2007-08-02 17:41:40 +02:00
|
|
|
{
|
|
|
|
rq->nr_running--;
|
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
static void set_load_weight(struct task_struct *p)
|
|
|
|
{
|
|
|
|
if (task_has_rt_policy(p)) {
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.load.weight = prio_to_weight[0] * 2;
|
|
|
|
p->se.load.inv_weight = prio_to_wmult[0] >> 1;
|
|
|
|
return;
|
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* SCHED_IDLE tasks get minimal weight:
|
|
|
|
*/
|
|
|
|
if (p->policy == SCHED_IDLE) {
|
|
|
|
p->se.load.weight = WEIGHT_IDLEPRIO;
|
|
|
|
p->se.load.inv_weight = WMULT_IDLEPRIO;
|
|
|
|
return;
|
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
|
|
|
|
p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2007-08-09 11:16:49 +02:00
|
|
|
static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
sched_info_queued(p);
|
2007-08-09 11:16:48 +02:00
|
|
|
p->sched_class->enqueue_task(rq, p, wakeup);
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.on_rq = 1;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2007-08-09 11:16:49 +02:00
|
|
|
static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2007-08-09 11:16:48 +02:00
|
|
|
p->sched_class->dequeue_task(rq, p, sleep);
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.on_rq = 0;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* __normal_prio - return the priority that is based on the static prio
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
|
|
|
static inline int __normal_prio(struct task_struct *p)
|
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
return p->static_prio;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2006-06-27 11:54:51 +02:00
|
|
|
/*
|
|
|
|
* Calculate the expected normal priority: i.e. priority
|
|
|
|
* without taking RT-inheritance into account. Might be
|
|
|
|
* boosted by interactivity modifiers. Changes upon fork,
|
|
|
|
* setprio syscalls, and whenever the interactivity
|
|
|
|
* estimator recalculates.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
static inline int normal_prio(struct task_struct *p)
|
2006-06-27 11:54:51 +02:00
|
|
|
{
|
|
|
|
int prio;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (task_has_rt_policy(p))
|
2006-06-27 11:54:51 +02:00
|
|
|
prio = MAX_RT_PRIO-1 - p->rt_priority;
|
|
|
|
else
|
|
|
|
prio = __normal_prio(p);
|
|
|
|
return prio;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the current priority, i.e. the priority
|
|
|
|
* taken into account by the scheduler. This value might
|
|
|
|
* be boosted by RT tasks, or might be boosted by
|
|
|
|
* interactivity modifiers. Will be RT if the task got
|
|
|
|
* RT-boosted. If not then it returns p->normal_prio.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
static int effective_prio(struct task_struct *p)
|
2006-06-27 11:54:51 +02:00
|
|
|
{
|
|
|
|
p->normal_prio = normal_prio(p);
|
|
|
|
/*
|
|
|
|
* If we are RT tasks or we were boosted to RT priority,
|
|
|
|
* keep the priority unchanged. Otherwise, update priority
|
|
|
|
* to the normal priority:
|
|
|
|
*/
|
|
|
|
if (!rt_prio(p->prio))
|
|
|
|
return p->normal_prio;
|
|
|
|
return p->prio;
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* activate_task - move a task to the runqueue.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-12-06 17:07:07 +01:00
|
|
|
if (task_contributes_to_load(p))
|
2007-07-09 18:51:59 +02:00
|
|
|
rq->nr_uninterruptible--;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-08-09 11:16:49 +02:00
|
|
|
enqueue_task(rq, p, wakeup);
|
2008-01-31 22:45:23 +01:00
|
|
|
inc_nr_running(rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* deactivate_task - remove a task from the runqueue.
|
|
|
|
*/
|
2007-08-09 11:16:49 +02:00
|
|
|
static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-12-06 17:07:07 +01:00
|
|
|
if (task_contributes_to_load(p))
|
2007-07-09 18:51:59 +02:00
|
|
|
rq->nr_uninterruptible++;
|
|
|
|
|
2007-08-09 11:16:49 +02:00
|
|
|
dequeue_task(rq, p, sleep);
|
2008-01-31 22:45:23 +01:00
|
|
|
dec_nr_running(rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Returns nonzero when @p is the current task of the CPU that
 * task_cpu(@p) reports, zero otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	int cpu = task_cpu(p);

	return cpu_curr(cpu) == p;
}
|
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
/* Used instead of source_load when we know the type == 0 */
|
|
|
|
unsigned long weighted_cpuload(const int cpu)
|
|
|
|
{
|
2007-10-15 17:00:06 +02:00
|
|
|
return cpu_rq(cpu)->load.weight;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * __set_task_cpu - low-level switch of @p to @cpu.
 *
 * Always updates the task's runqueue association via set_task_rq(). On
 * SMP builds it additionally publishes the new CPU number in the task's
 * thread_info; the statement order below is significant.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * Once ->cpu holds the new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. Every update of per-task
	 * data must be complete by then, hence the write barrier ahead of
	 * the store.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}
|
|
|
|
|
2008-01-25 21:08:22 +01:00
|
|
|
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
|
|
|
|
const struct sched_class *prev_class,
|
|
|
|
int oldprio, int running)
|
|
|
|
{
|
|
|
|
if (prev_class != p->sched_class) {
|
|
|
|
if (prev_class->switched_from)
|
|
|
|
prev_class->switched_from(rq, p, running);
|
|
|
|
p->sched_class->switched_to(rq, p, running);
|
|
|
|
} else
|
|
|
|
p->sched_class->prio_changed(rq, p, oldprio, running);
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef CONFIG_SMP
|
2007-07-09 18:51:58 +02:00
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
/*
|
|
|
|
* Is this task likely cache-hot:
|
|
|
|
*/
|
2008-01-25 21:08:09 +01:00
|
|
|
static int
|
2007-10-15 17:00:18 +02:00
|
|
|
task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
|
|
|
|
{
|
|
|
|
s64 delta;
|
|
|
|
|
|
|
|
if (p->sched_class != &fair_sched_class)
|
|
|
|
return 0;
|
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
if (sysctl_sched_migration_cost == -1)
|
|
|
|
return 1;
|
|
|
|
if (sysctl_sched_migration_cost == 0)
|
|
|
|
return 0;
|
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
delta = now - p->se.exec_start;
|
|
|
|
|
|
|
|
return delta < (s64)sysctl_sched_migration_cost;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
|
2007-07-09 18:51:58 +02:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
int old_cpu = task_cpu(p);
|
|
|
|
struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
|
2007-10-15 17:00:12 +02:00
|
|
|
struct cfs_rq *old_cfsrq = task_cfs_rq(p),
|
|
|
|
*new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
|
2007-10-15 17:00:06 +02:00
|
|
|
u64 clock_offset;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
clock_offset = old_rq->clock - new_rq->clock;
|
2007-08-02 17:41:40 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
if (p->se.wait_start)
|
|
|
|
p->se.wait_start -= clock_offset;
|
2007-07-09 18:51:59 +02:00
|
|
|
if (p->se.sleep_start)
|
|
|
|
p->se.sleep_start -= clock_offset;
|
|
|
|
if (p->se.block_start)
|
|
|
|
p->se.block_start -= clock_offset;
|
2007-10-15 17:00:18 +02:00
|
|
|
if (old_cpu != new_cpu) {
|
|
|
|
schedstat_inc(p, se.nr_migrations);
|
|
|
|
if (task_hot(p, old_rq->clock, NULL))
|
|
|
|
schedstat_inc(p, se.nr_forced2_migrations);
|
|
|
|
}
|
2007-08-02 17:41:40 +02:00
|
|
|
#endif
|
2007-10-15 17:00:12 +02:00
|
|
|
p->se.vruntime -= old_cfsrq->min_vruntime -
|
|
|
|
new_cfsrq->min_vruntime;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
__set_task_cpu(p, new_cpu);
|
2007-07-09 18:51:58 +02:00
|
|
|
}
|
|
|
|
|
2006-07-03 09:25:42 +02:00
|
|
|
struct migration_req {
|
2005-04-17 00:20:36 +02:00
|
|
|
struct list_head list;
|
|
|
|
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *task;
|
2005-04-17 00:20:36 +02:00
|
|
|
int dest_cpu;
|
|
|
|
|
|
|
|
struct completion done;
|
2006-07-03 09:25:42 +02:00
|
|
|
};
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The task's runqueue lock must be held.
|
|
|
|
* Returns true if you have to wait for migration thread.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
static int
|
2006-07-03 09:25:42 +02:00
|
|
|
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = task_rq(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the task is not on a runqueue (and not running), then
|
|
|
|
* it is sufficient to simply update the task's cpu field.
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
if (!p->se.on_rq && !task_running(rq, p)) {
|
2005-04-17 00:20:36 +02:00
|
|
|
set_task_cpu(p, dest_cpu);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
init_completion(&req->done);
|
|
|
|
req->task = p;
|
|
|
|
req->dest_cpu = dest_cpu;
|
|
|
|
list_add(&req->list, &rq->migration_queue);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* wait_task_inactive - wait for a thread to unschedule.
|
|
|
|
*
|
|
|
|
* The caller must ensure that the task *will* unschedule sometime soon,
|
|
|
|
* else this function might spin for a *long* time. This function can't
|
|
|
|
* be called with interrupts off, or it may introduce deadlock with
|
|
|
|
* smp_call_function() if an IPI is sent by the same process we are
|
|
|
|
* waiting to become inactive.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
void wait_task_inactive(struct task_struct *p)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-07-09 18:51:59 +02:00
|
|
|
int running, on_rq;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
for (;;) {
|
|
|
|
/*
|
|
|
|
* We do the initial early heuristics without holding
|
|
|
|
* any task-queue locks at all. We'll only try to get
|
|
|
|
* the runqueue lock when things look like they will
|
|
|
|
* work out!
|
|
|
|
*/
|
|
|
|
rq = task_rq(p);
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 18:34:40 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* If the task is actively running on another CPU
|
|
|
|
* still, just relax and busy-wait without holding
|
|
|
|
* any locks.
|
|
|
|
*
|
|
|
|
* NOTE! Since we don't hold any locks, it's not
|
|
|
|
* even sure that "rq" stays as the right runqueue!
|
|
|
|
* But we don't care, since "task_running()" will
|
|
|
|
* return false if the runqueue has changed and p
|
|
|
|
* is actually now running somewhere else!
|
|
|
|
*/
|
|
|
|
while (task_running(rq, p))
|
|
|
|
cpu_relax();
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 18:34:40 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Ok, time to look more closely! We need the rq
|
|
|
|
* lock now, to be *sure*. If we're wrong, we'll
|
|
|
|
* just go back and repeat.
|
|
|
|
*/
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
running = task_running(rq, p);
|
|
|
|
on_rq = p->se.on_rq;
|
|
|
|
task_rq_unlock(rq, &flags);
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 18:34:40 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Was it really running after all now that we
|
|
|
|
* checked with the proper locks actually held?
|
|
|
|
*
|
|
|
|
* Oops. Go back and try again..
|
|
|
|
*/
|
|
|
|
if (unlikely(running)) {
|
|
|
|
cpu_relax();
|
|
|
|
continue;
|
|
|
|
}
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 18:34:40 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* It's not enough that it's not actively running,
|
|
|
|
* it must be off the runqueue _entirely_, and not
|
|
|
|
* preempted!
|
|
|
|
*
|
|
|
|
* So if it wa still runnable (but just not actively
|
|
|
|
* running right now), it's preempted, and we should
|
|
|
|
* yield - it could be a while.
|
|
|
|
*/
|
|
|
|
if (unlikely(on_rq)) {
|
|
|
|
schedule_timeout_uninterruptible(1);
|
|
|
|
continue;
|
|
|
|
}
|
Fix possible runqueue lock starvation in wait_task_inactive()
Miklos Szeredi reported very long pauses (several seconds, sometimes
more) on his T60 (with a Core2Duo) which he managed to track down to
wait_task_inactive()'s open-coded busy-loop.
He observed that an interrupt on one core tries to acquire the
runqueue-lock but does not succeed in doing so for a very long time -
while wait_task_inactive() on the other core loops waiting for the first
core to deschedule a task (which it wont do while spinning in an
interrupt handler).
This rewrites wait_task_inactive() to do all its waiting optimistically
without any locks taken at all, and then just double-check the end
result with the proper runqueue lock held over just a very short
section. If there were races in the optimistic wait, of a preemption
event scheduled the process away, we simply re-synchronize, and start
over.
So the code now looks like this:
repeat:
/* Unlocked, optimistic looping! */
rq = task_rq(p);
while (task_running(rq, p))
cpu_relax();
/* Get the *real* values */
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/* Check them.. */
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/* Preempted away? Yield if so.. */
if (unlikely(array)) {
yield();
goto repeat;
}
Basically, that first "while()" loop is done entirely without any
locking at all (and doesn't check for the case where the target process
might have been preempted away), and so it's possibly "incorrect", but
we don't really care. Both the runqueue used, and the "task_running()"
check might be the wrong tests, but they won't oops - they just mean
that we could possibly get the wrong results due to lack of locking and
exit the loop early in the case of a race condition.
So once we've exited the loop, we then get the proper (and careful) rq
lock, and check the running/runnable state _safely_. And if it turns
out that our quick-and-dirty and unsafe loop was wrong after all, we
just go back and try it all again.
(The patch also adds a lot of comments, which is the actual bulk of it
all, to make it more obvious why we can do these things without holding
the locks).
Thanks to Miklos for all the testing and tracking it down.
Tested-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-18 18:34:40 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Ahh, all good. It wasn't running, and it wasn't
|
|
|
|
* runnable, which means that it will never become
|
|
|
|
* running in the future either. We're all done!
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	/* Keep smp_processor_id() stable while we compare against it. */
	preempt_disable();
	cpu = task_cpu(p);
	/* Only a task currently executing on a *different* CPU needs an IPI. */
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
|
|
|
|
|
|
|
|
/*
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calcuations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalanace as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
* Return a low guess at the load of a migration-source cpu weighted
|
|
|
|
* according to the scheduling class and "nice" value.
|
2005-04-17 00:20:36 +02:00
|
|
|
*
|
|
|
|
* We want to under-estimate the load of migration sources, to
|
|
|
|
* balance conservatively.
|
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static unsigned long source_load(int cpu, int type)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
unsigned long total = weighted_cpuload(cpu);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
|
2005-11-09 06:38:58 +01:00
|
|
|
if (type == 0)
|
2007-07-09 18:51:59 +02:00
|
|
|
return total;
|
2005-11-09 06:38:55 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
return min(rq->cpu_load[type-1], total);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
approximation on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
* Return a high guess at the load of a migration-target cpu weighted
|
|
|
|
* according to the scheduling class and "nice" value.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static unsigned long target_load(int cpu, int type)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
unsigned long total = weighted_cpuload(cpu);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
approximation on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
|
2005-06-25 23:57:13 +02:00
|
|
|
if (type == 0)
|
2007-07-09 18:51:59 +02:00
|
|
|
return total;
|
2005-11-09 06:38:58 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
return max(rq->cpu_load[type-1], total);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
approximation on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the average load per task on the cpu's run queue
|
|
|
|
*/
|
2008-01-25 21:08:09 +01:00
|
|
|
static unsigned long cpu_avg_load_per_task(int cpu)
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
unsigned long total = weighted_cpuload(cpu);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice==19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount, it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
unsigned long n = rq->nr_running;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
return n ? total / n : SCHED_LOAD_SCALE;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2005-06-25 23:57:19 +02:00
|
|
|
/*
|
|
|
|
* find_idlest_group finds and returns the least busy CPU group within the
|
|
|
|
* domain.
|
|
|
|
*/
|
|
|
|
static struct sched_group *
|
|
|
|
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
|
|
|
|
{
|
|
|
|
struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
|
|
|
|
unsigned long min_load = ULONG_MAX, this_load = 0;
|
|
|
|
int load_idx = sd->forkexec_idx;
|
|
|
|
int imbalance = 100 + (sd->imbalance_pct-100)/2;
|
|
|
|
|
|
|
|
do {
|
|
|
|
unsigned long load, avg_load;
|
|
|
|
int local_group;
|
|
|
|
int i;
|
|
|
|
|
2005-09-10 09:26:09 +02:00
|
|
|
/* Skip over this group if it has no CPUs allowed */
|
|
|
|
if (!cpus_intersects(group->cpumask, p->cpus_allowed))
|
2007-10-15 17:00:14 +02:00
|
|
|
continue;
|
2005-09-10 09:26:09 +02:00
|
|
|
|
2005-06-25 23:57:19 +02:00
|
|
|
local_group = cpu_isset(this_cpu, group->cpumask);
|
|
|
|
|
|
|
|
/* Tally up the load of all CPUs in the group */
|
|
|
|
avg_load = 0;
|
|
|
|
|
|
|
|
for_each_cpu_mask(i, group->cpumask) {
|
|
|
|
/* Bias balancing toward cpus of our domain */
|
|
|
|
if (local_group)
|
|
|
|
load = source_load(i, load_idx);
|
|
|
|
else
|
|
|
|
load = target_load(i, load_idx);
|
|
|
|
|
|
|
|
avg_load += load;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Adjust by relative CPU power of the group */
|
2007-05-08 09:32:57 +02:00
|
|
|
avg_load = sg_div_cpu_power(group,
|
|
|
|
avg_load * SCHED_LOAD_SCALE);
|
2005-06-25 23:57:19 +02:00
|
|
|
|
|
|
|
if (local_group) {
|
|
|
|
this_load = avg_load;
|
|
|
|
this = group;
|
|
|
|
} else if (avg_load < min_load) {
|
|
|
|
min_load = avg_load;
|
|
|
|
idlest = group;
|
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
} while (group = group->next, group != sd->groups);
|
2005-06-25 23:57:19 +02:00
|
|
|
|
|
|
|
if (!idlest || 100*this_load < imbalance*min_load)
|
|
|
|
return NULL;
|
|
|
|
return idlest;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2006-10-03 10:14:10 +02:00
|
|
|
* find_idlest_cpu - find the idlest cpu among the cpus in group.
|
2005-06-25 23:57:19 +02:00
|
|
|
*/
|
2005-09-10 09:26:11 +02:00
|
|
|
static int
|
|
|
|
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
2005-06-25 23:57:19 +02:00
|
|
|
{
|
2005-09-10 09:26:09 +02:00
|
|
|
cpumask_t tmp;
|
2005-06-25 23:57:19 +02:00
|
|
|
unsigned long load, min_load = ULONG_MAX;
|
|
|
|
int idlest = -1;
|
|
|
|
int i;
|
|
|
|
|
2005-09-10 09:26:09 +02:00
|
|
|
/* Traverse only the allowed CPUs */
|
|
|
|
cpus_and(tmp, group->cpumask, p->cpus_allowed);
|
|
|
|
|
|
|
|
for_each_cpu_mask(i, tmp) {
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calcuations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalanace as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
load = weighted_cpuload(i);
|
2005-06-25 23:57:19 +02:00
|
|
|
|
|
|
|
if (load < min_load || (load == min_load && i == this_cpu)) {
|
|
|
|
min_load = load;
|
|
|
|
idlest = i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return idlest;
|
|
|
|
}
|
|
|
|
|
2005-06-25 23:57:29 +02:00
|
|
|
/*
|
|
|
|
* sched_balance_self: balance the current task (running on cpu) in domains
|
|
|
|
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
|
|
|
|
* SD_BALANCE_EXEC.
|
|
|
|
*
|
|
|
|
* Balance, ie. select the least loaded group.
|
|
|
|
*
|
|
|
|
* Returns the target CPU number, or the same CPU if no balancing is needed.
|
|
|
|
*
|
|
|
|
* preempt must be disabled.
|
|
|
|
*/
|
|
|
|
static int sched_balance_self(int cpu, int flag)
|
|
|
|
{
|
|
|
|
struct task_struct *t = current;
|
|
|
|
struct sched_domain *tmp, *sd = NULL;
|
2005-06-25 23:57:19 +02:00
|
|
|
|
2006-06-27 11:54:28 +02:00
|
|
|
for_each_domain(cpu, tmp) {
|
2007-07-09 18:52:00 +02:00
|
|
|
/*
|
|
|
|
* If power savings logic is enabled for a domain, stop there.
|
|
|
|
*/
|
2006-06-27 11:54:42 +02:00
|
|
|
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
|
|
|
|
break;
|
2005-06-25 23:57:29 +02:00
|
|
|
if (tmp->flags & flag)
|
|
|
|
sd = tmp;
|
2006-06-27 11:54:28 +02:00
|
|
|
}
|
2005-06-25 23:57:29 +02:00
|
|
|
|
|
|
|
while (sd) {
|
|
|
|
cpumask_t span;
|
|
|
|
struct sched_group *group;
|
2006-10-03 10:14:08 +02:00
|
|
|
int new_cpu, weight;
|
|
|
|
|
|
|
|
if (!(sd->flags & flag)) {
|
|
|
|
sd = sd->child;
|
|
|
|
continue;
|
|
|
|
}
|
2005-06-25 23:57:29 +02:00
|
|
|
|
|
|
|
span = sd->span;
|
|
|
|
group = find_idlest_group(sd, t, cpu);
|
2006-10-03 10:14:08 +02:00
|
|
|
if (!group) {
|
|
|
|
sd = sd->child;
|
|
|
|
continue;
|
|
|
|
}
|
2005-06-25 23:57:29 +02:00
|
|
|
|
2005-09-10 09:26:09 +02:00
|
|
|
new_cpu = find_idlest_cpu(group, t, cpu);
|
2006-10-03 10:14:08 +02:00
|
|
|
if (new_cpu == -1 || new_cpu == cpu) {
|
|
|
|
/* Now try balancing at a lower domain level of cpu */
|
|
|
|
sd = sd->child;
|
|
|
|
continue;
|
|
|
|
}
|
2005-06-25 23:57:29 +02:00
|
|
|
|
2006-10-03 10:14:08 +02:00
|
|
|
/* Now try balancing at a lower domain level of new_cpu */
|
2005-06-25 23:57:29 +02:00
|
|
|
cpu = new_cpu;
|
|
|
|
sd = NULL;
|
|
|
|
weight = cpus_weight(span);
|
|
|
|
for_each_domain(cpu, tmp) {
|
|
|
|
if (weight <= cpus_weight(tmp->span))
|
|
|
|
break;
|
|
|
|
if (tmp->flags & flag)
|
|
|
|
sd = tmp;
|
|
|
|
}
|
|
|
|
/* while loop will break here if sd == NULL */
|
|
|
|
}
|
|
|
|
|
|
|
|
return cpu;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/***
|
|
|
|
* try_to_wake_up - wake up a thread
|
|
|
|
* @p: the to-be-woken-up thread
|
|
|
|
* @state: the mask of task states that can be woken
|
|
|
|
* @sync: do a synchronous wakeup?
|
|
|
|
*
|
|
|
|
* Put it on the run-queue if it's not already there. The "current"
|
|
|
|
* thread is always on the run-queue (except when the actual
|
|
|
|
* re-schedule is in progress), and as such you're allowed to do
|
|
|
|
* the simpler "current->state = TASK_RUNNING" to mark yourself
|
|
|
|
* runnable without the overhead of this.
|
|
|
|
*
|
|
|
|
* returns failure only if the task is already active.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-10-15 17:00:18 +02:00
|
|
|
int cpu, orig_cpu, this_cpu, success = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
|
|
|
long old_state;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
old_state = p->state;
|
|
|
|
if (!(old_state & state))
|
|
|
|
goto out;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (p->se.on_rq)
|
2005-04-17 00:20:36 +02:00
|
|
|
goto out_running;
|
|
|
|
|
|
|
|
cpu = task_cpu(p);
|
2007-10-15 17:00:18 +02:00
|
|
|
orig_cpu = cpu;
|
2005-04-17 00:20:36 +02:00
|
|
|
this_cpu = smp_processor_id();
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (unlikely(task_running(rq, p)))
|
|
|
|
goto out_activate;
|
|
|
|
|
2008-01-25 21:08:21 +01:00
|
|
|
cpu = p->sched_class->select_task_rq(p, sync);
|
|
|
|
if (cpu != orig_cpu) {
|
|
|
|
set_task_cpu(p, cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
/* might preempt at this point */
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
old_state = p->state;
|
|
|
|
if (!(old_state & state))
|
|
|
|
goto out;
|
2007-07-09 18:51:59 +02:00
|
|
|
if (p->se.on_rq)
|
2005-04-17 00:20:36 +02:00
|
|
|
goto out_running;
|
|
|
|
|
|
|
|
this_cpu = smp_processor_id();
|
|
|
|
cpu = task_cpu(p);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:09 +01:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
schedstat_inc(rq, ttwu_count);
|
|
|
|
if (cpu == this_cpu)
|
|
|
|
schedstat_inc(rq, ttwu_local);
|
|
|
|
else {
|
|
|
|
struct sched_domain *sd;
|
|
|
|
for_each_domain(this_cpu, sd) {
|
|
|
|
if (cpu_isset(cpu, sd->span)) {
|
|
|
|
schedstat_inc(sd, ttwu_wake_remote);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
out_activate:
|
|
|
|
#endif /* CONFIG_SMP */
|
2007-10-15 17:00:18 +02:00
|
|
|
schedstat_inc(p, se.nr_wakeups);
|
|
|
|
if (sync)
|
|
|
|
schedstat_inc(p, se.nr_wakeups_sync);
|
|
|
|
if (orig_cpu != cpu)
|
|
|
|
schedstat_inc(p, se.nr_wakeups_migrate);
|
|
|
|
if (cpu == this_cpu)
|
|
|
|
schedstat_inc(p, se.nr_wakeups_local);
|
|
|
|
else
|
|
|
|
schedstat_inc(p, se.nr_wakeups_remote);
|
2007-08-09 11:16:51 +02:00
|
|
|
update_rq_clock(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
activate_task(rq, p, 1);
|
2007-10-15 17:00:20 +02:00
|
|
|
check_preempt_curr(rq, p);
|
2005-04-17 00:20:36 +02:00
|
|
|
success = 1;
|
|
|
|
|
|
|
|
out_running:
|
|
|
|
p->state = TASK_RUNNING;
|
2008-01-25 21:08:22 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (p->sched_class->task_wake_up)
|
|
|
|
p->sched_class->task_wake_up(rq, p);
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
out:
|
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
|
|
|
|
return success;
|
|
|
|
}
|
|
|
|
|
2006-07-03 09:25:41 +02:00
|
|
|
/*
 * wake_up_process - wake @p whatever sleep state it is currently in.
 *
 * Delegates to try_to_wake_up() with the TASK_ALL state mask and 0 as
 * the final argument, and returns that function's result unchanged.
 */
int fastcall wake_up_process(struct task_struct *p)
{
	int woken = try_to_wake_up(p, TASK_ALL, 0);

	return woken;
}
|
|
|
|
EXPORT_SYMBOL(wake_up_process);
|
|
|
|
|
2006-07-03 09:25:41 +02:00
|
|
|
/*
 * wake_up_state - wake @p only if its state matches the @state mask.
 *
 * Delegates to try_to_wake_up() with the caller's mask and 0 as the
 * final argument, and returns that function's result unchanged.
 */
int fastcall wake_up_state(struct task_struct *p, unsigned int state)
{
	int woken = try_to_wake_up(p, state, 0);

	return woken;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Perform scheduler related setup for a newly forked process p.
|
|
|
|
* p is forked by current.
|
2007-07-09 18:51:59 +02:00
|
|
|
*
|
|
|
|
* __sched_fork() is basic setup used by init_idle() too:
|
|
|
|
*/
|
|
|
|
static void __sched_fork(struct task_struct *p)
|
|
|
|
{
|
|
|
|
p->se.exec_start = 0;
|
|
|
|
p->se.sum_exec_runtime = 0;
|
2007-08-28 12:53:24 +02:00
|
|
|
p->se.prev_sum_exec_runtime = 0;
|
2007-08-02 17:41:40 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
p->se.wait_start = 0;
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.sum_sleep_runtime = 0;
|
|
|
|
p->se.sleep_start = 0;
|
|
|
|
p->se.block_start = 0;
|
|
|
|
p->se.sleep_max = 0;
|
|
|
|
p->se.block_max = 0;
|
|
|
|
p->se.exec_max = 0;
|
2007-10-15 17:00:02 +02:00
|
|
|
p->se.slice_max = 0;
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.wait_max = 0;
|
2007-08-02 17:41:40 +02:00
|
|
|
#endif
|
2005-06-25 23:57:29 +02:00
|
|
|
|
2008-01-25 21:08:27 +01:00
|
|
|
INIT_LIST_HEAD(&p->rt.run_list);
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.on_rq = 0;
|
2005-06-25 23:57:29 +02:00
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
|
|
INIT_HLIST_HEAD(&p->preempt_notifiers);
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* We mark the process as running here, but have not actually
|
|
|
|
* inserted it onto the runqueue yet. This guarantees that
|
|
|
|
* nobody will actually run it, and a signal or other external
|
|
|
|
* event cannot wake it up and insert it on the runqueue either.
|
|
|
|
*/
|
|
|
|
p->state = TASK_RUNNING;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* fork()/clone()-time setup:
|
|
|
|
*/
|
|
|
|
void sched_fork(struct task_struct *p, int clone_flags)
|
|
|
|
{
|
|
|
|
int cpu = get_cpu();
|
|
|
|
|
|
|
|
__sched_fork(p);
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
|
|
|
|
#endif
|
2007-10-15 17:00:11 +02:00
|
|
|
set_task_cpu(p, cpu);
|
2006-06-27 11:54:51 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure we do not leak PI boosting priority to the child:
|
|
|
|
*/
|
|
|
|
p->prio = current->normal_prio;
|
2007-10-15 17:00:11 +02:00
|
|
|
if (!rt_prio(p->prio))
|
|
|
|
p->sched_class = &fair_sched_class;
|
2006-06-27 11:54:51 +02:00
|
|
|
|
2006-07-14 09:24:38 +02:00
|
|
|
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
|
2007-07-09 18:51:59 +02:00
|
|
|
if (likely(sched_info_on()))
|
2006-07-14 09:24:38 +02:00
|
|
|
memset(&p->sched_info, 0, sizeof(p->sched_info));
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
[PATCH] sched: revert "filter affine wakeups"
Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6:
[PATCH] sched: filter affine wakeups
Apparently caused more than 10% performance regression for aim7 benchmark.
The setup in use is 16-cpu HP rx8620, 64Gb of memory and 12 MSA1000s with 144
disks. Each disk is 72Gb with a single ext3 filesystem (courtesy of HP, who
supplied benchmark results).
The problem is, for aim7, the wake-up pattern is random, but it still needs
load balancing action in the wake-up path to achieve best performance. With
the above commit, lack of load balancing hurts that workload.
However, for workloads like database transaction processing, the requirement
is exactly opposite. In the wake up path, best performance is achieved with
absolutely zero load balancing. We simply wake up the process on the CPU that
it was previously run. Worst performance is obtained when we do load
balancing at wake up.
There isn't an easy way to auto detect the workload characteristics. Ingo's
earlier patch that detects idle CPU and decide whether to load balance or not
doesn't perform with aim7 either since all CPUs are busy (it causes even
bigger perf. regression).
Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6, which causes more
than 10% performance regression with aim7.
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-02-14 22:53:10 +01:00
|
|
|
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
|
2005-06-25 23:57:23 +02:00
|
|
|
p->oncpu = 0;
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef CONFIG_PREEMPT
|
2005-06-25 23:57:23 +02:00
|
|
|
/* Want to start with kernel preemption disabled. */
|
2005-11-14 01:06:55 +01:00
|
|
|
task_thread_info(p)->preempt_count = 1;
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
2005-06-25 23:57:29 +02:00
|
|
|
put_cpu();
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* wake_up_new_task - wake up a newly created task for the first time.
|
|
|
|
*
|
|
|
|
* This function will do some initial scheduler statistics housekeeping
|
|
|
|
* that must be done for every newly created context, then puts the task
|
|
|
|
* on the runqueue and wakes it.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-07-09 18:51:59 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
2005-06-25 23:57:19 +02:00
|
|
|
BUG_ON(p->state != TASK_RUNNING);
|
2007-08-09 11:16:47 +02:00
|
|
|
update_rq_clock(rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
p->prio = effective_prio(p);
|
|
|
|
|
2007-10-17 16:55:11 +02:00
|
|
|
if (!p->sched_class->task_new || !current->se.on_rq) {
|
2007-07-09 18:51:59 +02:00
|
|
|
activate_task(rq, p, 0);
|
2005-04-17 00:20:36 +02:00
|
|
|
} else {
|
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Let the scheduling class do new task startup
|
|
|
|
* management (if any):
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-08-09 11:16:49 +02:00
|
|
|
p->sched_class->task_new(rq, p);
|
2008-01-31 22:45:23 +01:00
|
|
|
inc_nr_running(rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
check_preempt_curr(rq, p);
|
2008-01-25 21:08:22 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (p->sched_class->task_wake_up)
|
|
|
|
p->sched_class->task_wake_up(rq, p);
|
|
|
|
#endif
|
2007-07-09 18:51:59 +02:00
|
|
|
task_rq_unlock(rq, &flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
|
|
|
|
|
|
/**
|
2007-07-31 09:37:50 +02:00
|
|
|
* preempt_notifier_register - tell me when current is being being preempted & rescheduled
|
|
|
|
* @notifier: notifier struct to register
|
2007-07-26 13:40:43 +02:00
|
|
|
*/
|
|
|
|
void preempt_notifier_register(struct preempt_notifier *notifier)
|
|
|
|
{
|
|
|
|
hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(preempt_notifier_register);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* preempt_notifier_unregister - no longer interested in preemption notifications
|
2007-07-31 09:37:50 +02:00
|
|
|
* @notifier: notifier struct to unregister
|
2007-07-26 13:40:43 +02:00
|
|
|
*
|
|
|
|
* This is safe to call from within a preemption notifier.
|
|
|
|
*/
|
|
|
|
void preempt_notifier_unregister(struct preempt_notifier *notifier)
|
|
|
|
{
|
|
|
|
hlist_del(¬ifier->link);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
|
|
|
|
|
|
|
|
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
|
|
|
|
{
|
|
|
|
struct preempt_notifier *notifier;
|
|
|
|
struct hlist_node *node;
|
|
|
|
|
|
|
|
hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
|
|
|
|
notifier->ops->sched_in(notifier, raw_smp_processor_id());
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
fire_sched_out_preempt_notifiers(struct task_struct *curr,
|
|
|
|
struct task_struct *next)
|
|
|
|
{
|
|
|
|
struct preempt_notifier *notifier;
|
|
|
|
struct hlist_node *node;
|
|
|
|
|
|
|
|
hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
|
|
|
|
notifier->ops->sched_out(notifier, next);
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
/* Stub for builds without CONFIG_PREEMPT_NOTIFIERS: nothing to notify. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	/* intentionally empty */
}
|
|
|
|
|
|
|
|
/* Stub for builds without CONFIG_PREEMPT_NOTIFIERS: nothing to notify. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	/* intentionally empty */
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2005-06-25 23:57:23 +02:00
|
|
|
/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * Called with the rq lock held and interrupts off.  Every call must be
 * followed by a matching finish_task_switch() once the context switch
 * has happened.
 *
 * Fires the sched_out preempt notifiers of @prev, then sets up locking
 * and runs the architecture specific hook for @next.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* Notifiers first, while @prev is still the running task. */
	fire_sched_out_preempt_notifiers(prev, next);

	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/**
|
|
|
|
* finish_task_switch - clean up after a task-switch
|
2005-09-07 07:15:17 +02:00
|
|
|
* @rq: runqueue associated with task-switch
|
2005-04-17 00:20:36 +02:00
|
|
|
* @prev: the thread we just switched away from.
|
|
|
|
*
|
2005-06-25 23:57:23 +02:00
|
|
|
* finish_task_switch must be called after the context switch, paired
|
|
|
|
* with a prepare_task_switch call before the context switch.
|
|
|
|
* finish_task_switch will reconcile locking set up by prepare_task_switch,
|
|
|
|
* and do any other architecture-specific cleanup actions.
|
2005-04-17 00:20:36 +02:00
|
|
|
*
|
|
|
|
* Note that we may have delayed dropping an mm in context_switch(). If
|
2007-12-05 15:46:09 +01:00
|
|
|
* so, we finish that here outside of the runqueue lock. (Doing it
|
2005-04-17 00:20:36 +02:00
|
|
|
* with the lock held can cause deadlocks; see schedule() for
|
|
|
|
* details.)
|
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
|
2005-04-17 00:20:36 +02:00
|
|
|
__releases(rq->lock)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = rq->prev_mm;
|
2006-09-29 11:01:10 +02:00
|
|
|
long prev_state;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
rq->prev_mm = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A task struct has one reference for the use as "current".
|
2006-09-29 11:01:11 +02:00
|
|
|
* If a task dies, then it sets TASK_DEAD in tsk->state and calls
|
2006-09-29 11:01:10 +02:00
|
|
|
* schedule one last time. The schedule call will never return, and
|
|
|
|
* the scheduled task must drop that reference.
|
2006-09-29 11:01:11 +02:00
|
|
|
* The test for TASK_DEAD must occur while the runqueue locks are
|
2005-04-17 00:20:36 +02:00
|
|
|
* still held, otherwise prev could be scheduled on another cpu, die
|
|
|
|
* there before we look at prev->state, and then the reference would
|
|
|
|
* be dropped twice.
|
|
|
|
* Manfred Spraul <manfred@colorfullife.com>
|
|
|
|
*/
|
2006-09-29 11:01:10 +02:00
|
|
|
prev_state = prev->state;
|
2005-06-25 23:57:23 +02:00
|
|
|
finish_arch_switch(prev);
|
|
|
|
finish_lock_switch(rq, prev);
|
2008-01-25 21:08:22 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (current->sched_class->post_schedule)
|
|
|
|
current->sched_class->post_schedule(rq);
|
|
|
|
#endif
|
2008-01-25 21:08:05 +01:00
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
fire_sched_in_preempt_notifiers(current);
|
2005-04-17 00:20:36 +02:00
|
|
|
if (mm)
|
|
|
|
mmdrop(mm);
|
2006-09-29 11:01:11 +02:00
|
|
|
if (unlikely(prev_state == TASK_DEAD)) {
|
2006-03-26 11:38:20 +02:00
|
|
|
/*
|
|
|
|
* Remove function-return probe instances associated with this
|
|
|
|
* task and put them back on the free list.
|
2007-07-09 18:52:00 +02:00
|
|
|
*/
|
2006-03-26 11:38:20 +02:00
|
|
|
kprobe_flush_task(prev);
|
2005-04-17 00:20:36 +02:00
|
|
|
put_task_struct(prev);
|
2006-03-26 11:38:20 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* schedule_tail - first thing a freshly forked thread must call.
|
|
|
|
* @prev: the thread we just switched away from.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
asmlinkage void schedule_tail(struct task_struct *prev)
|
2005-04-17 00:20:36 +02:00
|
|
|
__releases(rq->lock)
|
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = this_rq();
|
|
|
|
|
2005-06-25 23:57:23 +02:00
|
|
|
finish_task_switch(rq, prev);
|
|
|
|
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
|
|
|
|
/* In this case, finish_task_switch does not reenable preemption */
|
|
|
|
preempt_enable();
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
if (current->set_child_tid)
|
2007-10-19 08:40:14 +02:00
|
|
|
put_user(task_pid_vnr(current), current->set_child_tid);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* context_switch - switch to the new MM and the new
|
|
|
|
* thread's register state.
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
static inline void
|
2006-07-03 09:25:42 +02:00
|
|
|
context_switch(struct rq *rq, struct task_struct *prev,
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *next)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
struct mm_struct *mm, *oldmm;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
prepare_task_switch(rq, prev, next);
|
2007-07-09 18:51:59 +02:00
|
|
|
mm = next->mm;
|
|
|
|
oldmm = prev->active_mm;
|
2007-02-13 13:26:21 +01:00
|
|
|
/*
|
|
|
|
* For paravirt, this is coupled with an exit in switch_to to
|
|
|
|
* combine the page table reload and the switch backend into
|
|
|
|
* one hypercall.
|
|
|
|
*/
|
|
|
|
arch_enter_lazy_cpu_mode();
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (unlikely(!mm)) {
|
2005-04-17 00:20:36 +02:00
|
|
|
next->active_mm = oldmm;
|
|
|
|
atomic_inc(&oldmm->mm_count);
|
|
|
|
enter_lazy_tlb(oldmm, next);
|
|
|
|
} else
|
|
|
|
switch_mm(oldmm, mm, next);
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (unlikely(!prev->mm)) {
|
2005-04-17 00:20:36 +02:00
|
|
|
prev->active_mm = NULL;
|
|
|
|
rq->prev_mm = oldmm;
|
|
|
|
}
|
2006-07-14 09:24:27 +02:00
|
|
|
/*
|
|
|
|
* Since the runqueue lock will be released by the next
|
|
|
|
* task (which is an invalid locking op but in the case
|
|
|
|
* of the scheduler it's an obvious special-case), so we
|
|
|
|
* do an early lockdep release here:
|
|
|
|
*/
|
|
|
|
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
|
2006-07-03 09:24:54 +02:00
|
|
|
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
|
2006-07-14 09:24:27 +02:00
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* Here we just switch the register state and the stack. */
|
|
|
|
switch_to(prev, next, prev);
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
barrier();
|
|
|
|
/*
|
|
|
|
* this_rq must be evaluated again because prev may have moved
|
|
|
|
* CPUs since it called schedule(), thus the 'rq' on its stack
|
|
|
|
* frame will be invalid.
|
|
|
|
*/
|
|
|
|
finish_task_switch(this_rq(), prev);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* nr_running, nr_uninterruptible and nr_context_switches:
|
|
|
|
*
|
|
|
|
* externally visible scheduler statistics: current number of runnable
|
|
|
|
* threads, current number of uninterruptible-sleeping threads, total
|
|
|
|
* number of context switches performed since bootup.
|
|
|
|
*/
|
|
|
|
unsigned long nr_running(void)
|
|
|
|
{
|
|
|
|
unsigned long i, sum = 0;
|
|
|
|
|
|
|
|
for_each_online_cpu(i)
|
|
|
|
sum += cpu_rq(i)->nr_running;
|
|
|
|
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long nr_uninterruptible(void)
|
|
|
|
{
|
|
|
|
unsigned long i, sum = 0;
|
|
|
|
|
2006-03-28 11:56:37 +02:00
|
|
|
for_each_possible_cpu(i)
|
2005-04-17 00:20:36 +02:00
|
|
|
sum += cpu_rq(i)->nr_uninterruptible;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Since we read the counters lockless, it might be slightly
|
|
|
|
* inaccurate. Do not allow it to go below zero though:
|
|
|
|
*/
|
|
|
|
if (unlikely((long)sum < 0))
|
|
|
|
sum = 0;
|
|
|
|
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long long nr_context_switches(void)
|
|
|
|
{
|
2006-06-27 11:54:31 +02:00
|
|
|
int i;
|
|
|
|
unsigned long long sum = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-03-28 11:56:37 +02:00
|
|
|
for_each_possible_cpu(i)
|
2005-04-17 00:20:36 +02:00
|
|
|
sum += cpu_rq(i)->nr_switches;
|
|
|
|
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long nr_iowait(void)
|
|
|
|
{
|
|
|
|
unsigned long i, sum = 0;
|
|
|
|
|
2006-03-28 11:56:37 +02:00
|
|
|
for_each_possible_cpu(i)
|
2005-04-17 00:20:36 +02:00
|
|
|
sum += atomic_read(&cpu_rq(i)->nr_iowait);
|
|
|
|
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
2006-03-31 12:31:21 +02:00
|
|
|
unsigned long nr_active(void)
|
|
|
|
{
|
|
|
|
unsigned long i, running = 0, uninterruptible = 0;
|
|
|
|
|
|
|
|
for_each_online_cpu(i) {
|
|
|
|
running += cpu_rq(i)->nr_running;
|
|
|
|
uninterruptible += cpu_rq(i)->nr_uninterruptible;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely((long)uninterruptible < 0))
|
|
|
|
uninterruptible = 0;
|
|
|
|
|
|
|
|
return running + uninterruptible;
|
|
|
|
}
|
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Update rq->cpu_load[] statistics. This function is usually called every
|
|
|
|
* scheduler tick (TICK_NSEC).
|
2006-07-03 09:25:40 +02:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
static void update_cpu_load(struct rq *this_rq)
|
2006-07-03 09:25:40 +02:00
|
|
|
{
|
2007-10-15 17:00:06 +02:00
|
|
|
unsigned long this_load = this_rq->load.weight;
|
2007-07-09 18:51:59 +02:00
|
|
|
int i, scale;
|
|
|
|
|
|
|
|
this_rq->nr_load_updates++;
|
|
|
|
|
|
|
|
/* Update our load: */
|
|
|
|
for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
|
|
|
|
unsigned long old_load, new_load;
|
|
|
|
|
|
|
|
/* scale is effectively 1 << i now, and >> i divides by scale */
|
|
|
|
|
|
|
|
old_load = this_rq->cpu_load[i];
|
|
|
|
new_load = this_load;
|
2007-10-15 17:00:03 +02:00
|
|
|
/*
|
|
|
|
* Round up the averaging division if load is increasing. This
|
|
|
|
* prevents us from getting stuck on 9 if the load is 10, for
|
|
|
|
* example.
|
|
|
|
*/
|
|
|
|
if (new_load > old_load)
|
|
|
|
new_load += scale-1;
|
2007-07-09 18:51:59 +02:00
|
|
|
this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
|
|
|
|
}
|
2006-07-03 09:25:40 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* double_rq_lock - safely lock two runqueues
|
|
|
|
*
|
|
|
|
* Note this does not disable interrupts like task_rq_lock,
|
|
|
|
* you need to do so manually before calling.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static void double_rq_lock(struct rq *rq1, struct rq *rq2)
|
2005-04-17 00:20:36 +02:00
|
|
|
__acquires(rq1->lock)
|
|
|
|
__acquires(rq2->lock)
|
|
|
|
{
|
2006-12-10 11:20:11 +01:00
|
|
|
BUG_ON(!irqs_disabled());
|
2005-04-17 00:20:36 +02:00
|
|
|
if (rq1 == rq2) {
|
|
|
|
spin_lock(&rq1->lock);
|
|
|
|
__acquire(rq2->lock); /* Fake it out ;) */
|
|
|
|
} else {
|
2006-06-27 11:54:28 +02:00
|
|
|
if (rq1 < rq2) {
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_lock(&rq1->lock);
|
|
|
|
spin_lock(&rq2->lock);
|
|
|
|
} else {
|
|
|
|
spin_lock(&rq2->lock);
|
|
|
|
spin_lock(&rq1->lock);
|
|
|
|
}
|
|
|
|
}
|
2007-08-09 11:16:51 +02:00
|
|
|
update_rq_clock(rq1);
|
|
|
|
update_rq_clock(rq2);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* double_rq_unlock - safely unlock two runqueues
|
|
|
|
*
|
|
|
|
* Note this does not restore interrupts like task_rq_unlock,
|
|
|
|
* you need to do so manually after calling.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
|
2005-04-17 00:20:36 +02:00
|
|
|
__releases(rq1->lock)
|
|
|
|
__releases(rq2->lock)
|
|
|
|
{
|
|
|
|
spin_unlock(&rq1->lock);
|
|
|
|
if (rq1 != rq2)
|
|
|
|
spin_unlock(&rq2->lock);
|
|
|
|
else
|
|
|
|
__release(rq2->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
|
|
|
|
*/
|
2008-01-25 21:08:05 +01:00
|
|
|
static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
|
2005-04-17 00:20:36 +02:00
|
|
|
__releases(this_rq->lock)
|
|
|
|
__acquires(busiest->lock)
|
|
|
|
__acquires(this_rq->lock)
|
|
|
|
{
|
2008-01-25 21:08:05 +01:00
|
|
|
int ret = 0;
|
|
|
|
|
2006-12-10 11:20:11 +01:00
|
|
|
if (unlikely(!irqs_disabled())) {
|
|
|
|
/* printk() doesn't work good under rq->lock */
|
|
|
|
spin_unlock(&this_rq->lock);
|
|
|
|
BUG_ON(1);
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
if (unlikely(!spin_trylock(&busiest->lock))) {
|
2006-06-27 11:54:28 +02:00
|
|
|
if (busiest < this_rq) {
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock(&this_rq->lock);
|
|
|
|
spin_lock(&busiest->lock);
|
|
|
|
spin_lock(&this_rq->lock);
|
2008-01-25 21:08:05 +01:00
|
|
|
ret = 1;
|
2005-04-17 00:20:36 +02:00
|
|
|
} else
|
|
|
|
spin_lock(&busiest->lock);
|
|
|
|
}
|
2008-01-25 21:08:05 +01:00
|
|
|
return ret;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If dest_cpu is allowed for this process, migrate the task to it.
|
|
|
|
* This is accomplished by forcing the cpu_allowed mask to only
|
2007-12-05 15:46:09 +01:00
|
|
|
* allow dest_cpu, which will force the cpu onto dest_cpu. Then
|
2005-04-17 00:20:36 +02:00
|
|
|
* the cpu_allowed mask is restored.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct migration_req req;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
if (!cpu_isset(dest_cpu, p->cpus_allowed)
|
|
|
|
|| unlikely(cpu_is_offline(dest_cpu)))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* force the process onto the specified CPU */
|
|
|
|
if (migrate_task(p, dest_cpu, &req)) {
|
|
|
|
/* Need to wait for migration thread (might exit: take ref). */
|
|
|
|
struct task_struct *mt = rq->migration_thread;
|
2006-07-03 09:25:41 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
get_task_struct(mt);
|
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
wake_up_process(mt);
|
|
|
|
put_task_struct(mt);
|
|
|
|
wait_for_completion(&req.done);
|
2006-07-03 09:25:41 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2005-06-25 23:57:29 +02:00
|
|
|
* sched_exec - execve() is a valuable balancing opportunity, because at
|
|
|
|
* this point the task has the smallest effective memory and cache footprint.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
void sched_exec(void)
|
|
|
|
{
|
|
|
|
int new_cpu, this_cpu = get_cpu();
|
2005-06-25 23:57:29 +02:00
|
|
|
new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
|
2005-04-17 00:20:36 +02:00
|
|
|
put_cpu();
|
2005-06-25 23:57:29 +02:00
|
|
|
if (new_cpu != this_cpu)
|
|
|
|
sched_migrate_task(current, new_cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 *
 * @src_rq:   runqueue the task is taken from
 * @p:        task to move
 * @this_rq:  runqueue the task is put on
 * @this_cpu: CPU recorded as the task's new CPU
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	/* Dequeue, retarget, enqueue - in exactly this order. */
	deactivate_task(src_rq, p, 0);
	set_task_cpu(p, this_cpu);
	activate_task(this_rq, p, 0);

	/*
	 * Note that idle threads have a prio of MAX_PRIO, for this test
	 * to be always true for them.
	 */
	check_preempt_curr(this_rq, p);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
|
|
|
|
*/
|
2006-01-14 22:20:43 +01:00
|
|
|
static
|
2006-07-03 09:25:42 +02:00
|
|
|
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
|
2007-07-09 18:51:57 +02:00
|
|
|
struct sched_domain *sd, enum cpu_idle_type idle,
|
2005-09-10 09:26:11 +02:00
|
|
|
int *all_pinned)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
/*
|
|
|
|
* We do not migrate tasks that are:
|
|
|
|
* 1) running (obviously), or
|
|
|
|
* 2) cannot be migrated to this CPU due to cpus_allowed, or
|
|
|
|
* 3) are cache-hot on their current CPU.
|
|
|
|
*/
|
2007-10-15 17:00:18 +02:00
|
|
|
if (!cpu_isset(this_cpu, p->cpus_allowed)) {
|
|
|
|
schedstat_inc(p, se.nr_failed_migrations_affine);
|
2005-04-17 00:20:36 +02:00
|
|
|
return 0;
|
2007-10-15 17:00:18 +02:00
|
|
|
}
|
2005-06-25 23:57:07 +02:00
|
|
|
*all_pinned = 0;
|
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
if (task_running(rq, p)) {
|
|
|
|
schedstat_inc(p, se.nr_failed_migrations_running);
|
2005-06-25 23:57:07 +02:00
|
|
|
return 0;
|
2007-10-15 17:00:18 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
/*
|
|
|
|
* Aggressive migration if:
|
|
|
|
* 1) task is cache cold, or
|
|
|
|
* 2) too many balance attempts have failed.
|
|
|
|
*/
|
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
if (!task_hot(p, rq->clock, sd) ||
|
|
|
|
sd->nr_balance_failed > sd->cache_nice_tries) {
|
2007-10-15 17:00:18 +02:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
2007-10-15 17:00:18 +02:00
|
|
|
if (task_hot(p, rq->clock, sd)) {
|
2007-10-15 17:00:18 +02:00
|
|
|
schedstat_inc(sd, lb_hot_gained[idle]);
|
2007-10-15 17:00:18 +02:00
|
|
|
schedstat_inc(p, se.nr_forced_migrations);
|
|
|
|
}
|
2007-10-15 17:00:18 +02:00
|
|
|
#endif
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
if (task_hot(p, rq->clock, sd)) {
|
|
|
|
schedstat_inc(p, se.nr_failed_migrations_hot);
|
2007-10-15 17:00:18 +02:00
|
|
|
return 0;
|
2007-10-15 17:00:18 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2007-10-24 18:23:51 +02:00
|
|
|
static unsigned long
|
|
|
|
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
|
|
|
unsigned long max_load_move, struct sched_domain *sd,
|
|
|
|
enum cpu_idle_type idle, int *all_pinned,
|
|
|
|
int *this_best_prio, struct rq_iterator *iterator)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-11-09 22:39:39 +01:00
|
|
|
int loops = 0, pulled = 0, pinned = 0, skip_for_load;
|
2007-07-09 18:51:59 +02:00
|
|
|
struct task_struct *p;
|
|
|
|
long rem_load_move = max_load_move;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:51 +02:00
|
|
|
if (max_load_move == 0)
|
2005-04-17 00:20:36 +02:00
|
|
|
goto out;
|
|
|
|
|
2005-06-25 23:57:07 +02:00
|
|
|
pinned = 1;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Start the load-balancing iterator:
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
p = iterator->start(iterator->arg);
|
|
|
|
next:
|
2007-11-09 22:39:39 +01:00
|
|
|
if (!p || loops++ > sysctl_sched_nr_migrate)
|
2005-04-17 00:20:36 +02:00
|
|
|
goto out;
|
2006-06-27 11:54:36 +02:00
|
|
|
/*
|
2007-11-09 22:39:39 +01:00
|
|
|
* To help distribute high priority tasks across CPUs we don't
|
2006-06-27 11:54:36 +02:00
|
|
|
* skip a task if it will be the highest priority task (i.e. smallest
|
|
|
|
* prio value) on its new queue regardless of its load weight
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
|
|
|
|
SCHED_LOAD_SCALE_FUZZ;
|
2007-08-09 11:16:46 +02:00
|
|
|
if ((skip_for_load && p->prio >= *this_best_prio) ||
|
2007-07-09 18:51:59 +02:00
|
|
|
!can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
|
|
|
|
p = iterator->next(iterator->arg);
|
|
|
|
goto next;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
pull_task(busiest, p, this_rq, this_cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
pulled++;
|
2007-07-09 18:51:59 +02:00
|
|
|
rem_load_move -= p->se.load.weight;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
/*
|
2007-11-09 22:39:39 +01:00
|
|
|
* We only want to steal up to the prescribed amount of weighted load.
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
*/
|
2007-10-24 18:23:51 +02:00
|
|
|
if (rem_load_move > 0) {
|
2007-08-09 11:16:46 +02:00
|
|
|
if (p->prio < *this_best_prio)
|
|
|
|
*this_best_prio = p->prio;
|
2007-07-09 18:51:59 +02:00
|
|
|
p = iterator->next(iterator->arg);
|
|
|
|
goto next;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
out:
|
|
|
|
/*
|
2007-10-24 18:23:51 +02:00
|
|
|
* Right now, this is one of only two places pull_task() is called,
|
2005-04-17 00:20:36 +02:00
|
|
|
* so we can safely collect pull_task() stats here rather than
|
|
|
|
* inside pull_task().
|
|
|
|
*/
|
|
|
|
schedstat_add(sd, lb_gained[idle], pulled);
|
2005-06-25 23:57:07 +02:00
|
|
|
|
|
|
|
if (all_pinned)
|
|
|
|
*all_pinned = pinned;
|
2007-10-24 18:23:51 +02:00
|
|
|
|
|
|
|
return max_load_move - rem_load_move;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
* move_tasks tries to move up to max_load_move weighted load from busiest to
|
|
|
|
* this_rq, as part of a balancing operation within domain "sd".
|
|
|
|
* Returns 1 if successful and 0 otherwise.
|
2007-07-09 18:51:59 +02:00
|
|
|
*
|
|
|
|
* Called with both runqueues locked.
|
|
|
|
*/
|
|
|
|
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
unsigned long max_load_move,
|
2007-07-09 18:51:59 +02:00
|
|
|
struct sched_domain *sd, enum cpu_idle_type idle,
|
|
|
|
int *all_pinned)
|
|
|
|
{
|
2007-10-15 17:00:12 +02:00
|
|
|
const struct sched_class *class = sched_class_highest;
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
unsigned long total_load_moved = 0;
|
2007-08-09 11:16:46 +02:00
|
|
|
int this_best_prio = this_rq->curr->prio;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
do {
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
total_load_moved +=
|
|
|
|
class->load_balance(this_rq, this_cpu, busiest,
|
2007-10-24 18:23:51 +02:00
|
|
|
max_load_move - total_load_moved,
|
2007-08-09 11:16:46 +02:00
|
|
|
sd, idle, all_pinned, &this_best_prio);
|
2007-07-09 18:51:59 +02:00
|
|
|
class = class->next;
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
} while (class && max_load_move > total_load_moved);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
return total_load_moved > 0;
|
|
|
|
}
|
|
|
|
|
2007-10-24 18:23:51 +02:00
|
|
|
static int
|
|
|
|
iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
|
|
|
struct sched_domain *sd, enum cpu_idle_type idle,
|
|
|
|
struct rq_iterator *iterator)
|
|
|
|
{
|
|
|
|
struct task_struct *p = iterator->start(iterator->arg);
|
|
|
|
int pinned = 0;
|
|
|
|
|
|
|
|
while (p) {
|
|
|
|
if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
|
|
|
|
pull_task(busiest, p, this_rq, this_cpu);
|
|
|
|
/*
|
|
|
|
* Right now, this is only the second place pull_task()
|
|
|
|
* is called, so we can safely collect pull_task()
|
|
|
|
* stats here rather than inside pull_task().
|
|
|
|
*/
|
|
|
|
schedstat_inc(sd, lb_gained[idle]);
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
p = iterator->next(iterator->arg);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
/*
|
|
|
|
* move_one_task tries to move exactly one task from busiest to this_rq, as
|
|
|
|
* part of active balancing operations within "domain".
|
|
|
|
* Returns 1 if successful and 0 otherwise.
|
|
|
|
*
|
|
|
|
* Called with both runqueues locked.
|
|
|
|
*/
|
|
|
|
static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
|
|
|
struct sched_domain *sd, enum cpu_idle_type idle)
|
|
|
|
{
|
2007-10-15 17:00:12 +02:00
|
|
|
const struct sched_class *class;
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
|
|
|
|
for (class = sched_class_highest; class; class = class->next)
|
2007-10-24 18:23:51 +02:00
|
|
|
if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
return 1;
|
|
|
|
|
|
|
|
return 0;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* find_busiest_group finds and returns the busiest CPU group within the
|
2006-07-03 09:25:40 +02:00
|
|
|
* domain. It calculates and returns the amount of weighted load which
|
|
|
|
* should be moved to restore balance via the imbalance parameter.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
static struct sched_group *
|
|
|
|
find_busiest_group(struct sched_domain *sd, int this_cpu,
|
2007-07-09 18:51:59 +02:00
|
|
|
unsigned long *imbalance, enum cpu_idle_type idle,
|
|
|
|
int *sd_idle, cpumask_t *cpus, int *balance)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
|
|
|
|
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
|
2005-09-10 09:26:21 +02:00
|
|
|
unsigned long max_pull;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load
field is updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
unsigned long busiest_load_per_task, busiest_nr_running;
|
|
|
|
unsigned long this_load_per_task, this_nr_running;
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, kernel fails to perform proper load balance at
these domains, due to several logic in find_busiest_group() miss
identified busiest sched group within a given domain. This leads to
inadequate load balance and causes 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5. There are situation
where cpu0 has run queue length of 2, and cpu2 will be idle. The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three logics in find_busiest_group()
that heavily bias load balance towards power saving mode. e.g. while
determining "busiest" variable, kernel only set it when
"sum_nr_running > group_capacity". This test is flawed that
"sum_nr_running" is not necessary same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that kernel "think" everything is balanced, but in reality we have an
imbalance and thus causing one CPU to be over-subscribed and leaving
other idle. There are two other logic in the same function will also
causing similar effect. The nastiness of this bug is that kernel not
be able to get unstuck in this unfortunate broken state. From what
we've seen in our environment, kernel will stuck in imbalanced state
for extended period of time and it is also very easy for the kernel to
stuck into that state (it's pretty much 100% reproducible for us).
So proposing the following fix: add addition logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such condition is detected, load balance goes into spread
mode instead of default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
int load_idx, group_imb = 0;
|
2006-06-27 11:54:42 +02:00
|
|
|
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
|
|
|
int power_savings_balance = 1;
|
|
|
|
unsigned long leader_nr_running = 0, min_load_per_task = 0;
|
|
|
|
unsigned long min_nr_running = ULONG_MAX;
|
|
|
|
struct sched_group *group_min = NULL, *group_leader = NULL;
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
max_load = this_load = total_load = total_pwr = 0;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
busiest_load_per_task = busiest_nr_running = 0;
|
|
|
|
this_load_per_task = this_nr_running = 0;
|
2007-07-09 18:51:57 +02:00
|
|
|
if (idle == CPU_NOT_IDLE)
|
2005-06-25 23:57:13 +02:00
|
|
|
load_idx = sd->busy_idx;
|
2007-07-09 18:51:57 +02:00
|
|
|
else if (idle == CPU_NEWLY_IDLE)
|
2005-06-25 23:57:13 +02:00
|
|
|
load_idx = sd->newidle_idx;
|
|
|
|
else
|
|
|
|
load_idx = sd->idle_idx;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
do {
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, kernel fails to perform proper load balance at
these domains, due to several logic in find_busiest_group() miss
identified busiest sched group within a given domain. This leads to
inadequate load balance and causes 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5. There are situation
where cpu0 has run queue length of 2, and cpu2 will be idle. The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three logics in find_busiest_group()
that heavily bias load balance towards power saving mode. e.g. while
determining "busiest" variable, kernel only set it when
"sum_nr_running > group_capacity". This test is flawed that
"sum_nr_running" is not necessary same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that kernel "think" everything is balanced, but in reality we have an
imbalance and thus causing one CPU to be over-subscribed and leaving
other idle. There are two other logic in the same function will also
causing similar effect. The nastiness of this bug is that kernel not
be able to get unstuck in this unfortunate broken state. From what
we've seen in our environment, kernel will stuck in imbalanced state
for extended period of time and it is also very easy for the kernel to
stuck into that state (it's pretty much 100% reproducible for us).
So proposing the following fix: add addition logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such condition is detected, load balance goes into spread
mode instead of default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
|
2005-04-17 00:20:36 +02:00
|
|
|
int local_group;
|
|
|
|
int i;
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by a 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, the kernel fails to perform proper load balancing at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balancing and causes a 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5, there are situations
where cpu0 has a run queue length of 2 and cpu2 is idle. The
kernel load balancer is unable to balance these two tasks across
cpu0 and cpu2 because at least three pieces of logic in find_busiest_group()
heavily bias load balancing towards power saving mode. E.g. while
determining the "busiest" variable, the kernel only sets it when
"sum_nr_running > group_capacity". This test is flawed in that
"sum_nr_running" is not necessarily the same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that the kernel "thinks" everything is balanced, but in reality we have an
imbalance, causing one CPU to be over-subscribed and leaving the
other idle. Two other pieces of logic in the same function
cause a similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will be stuck in the imbalanced state
for an extended period of time, and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So we propose the following fix: add additional logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such a condition is detected, load balancing goes into spread
mode instead of the default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
int __group_imb = 0;
|
2006-12-10 11:20:33 +01:00
|
|
|
unsigned int balance_cpu = -1, first_idle_cpu = 0;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero, it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes, and also to ensure that its run queue's raw_weighted_load
field is updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) On a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. But with current
smpnice the extra normal priority task keeps jumping from one cpu to another
cpu having a normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic. We are not including the
cpu with the high priority task in max_load calculations but are including it in
total and avg_load calculations, leading to max_load < avg_load; load
balance between cpus running normal priority tasks (2 vs 1) will always show
imbalance as one normal priority task, and the extra normal priority task will
keep moving from one cpu to another cpu having a normal priority task.
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
unsigned long sum_nr_running, sum_weighted_load;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
local_group = cpu_isset(this_cpu, group->cpumask);
|
|
|
|
|
2006-12-10 11:20:33 +01:00
|
|
|
if (local_group)
|
|
|
|
balance_cpu = first_cpu(group->cpumask);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/* Tally up the load of all CPUs in the group */
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero, it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes, and also to ensure that its run queue's raw_weighted_load
field is updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) On a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. But with current
smpnice the extra normal priority task keeps jumping from one cpu to another
cpu having a normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic. We are not including the
cpu with the high priority task in max_load calculations but are including it in
total and avg_load calculations, leading to max_load < avg_load; load
balance between cpus running normal priority tasks (2 vs 1) will always show
imbalance as one normal priority task, and the extra normal priority task will
keep moving from one cpu to another cpu having a normal priority task.
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
sum_weighted_load = sum_nr_running = avg_load = 0;
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by a 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, the kernel fails to perform proper load balancing at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balancing and causes a 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5, there are situations
where cpu0 has a run queue length of 2 and cpu2 is idle. The
kernel load balancer is unable to balance these two tasks across
cpu0 and cpu2 because at least three pieces of logic in find_busiest_group()
heavily bias load balancing towards power saving mode. E.g. while
determining the "busiest" variable, the kernel only sets it when
"sum_nr_running > group_capacity". This test is flawed in that
"sum_nr_running" is not necessarily the same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that the kernel "thinks" everything is balanced, but in reality we have an
imbalance, causing one CPU to be over-subscribed and leaving the
other idle. Two other pieces of logic in the same function
cause a similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will be stuck in the imbalanced state
for an extended period of time, and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So we propose the following fix: add additional logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such a condition is detected, load balancing goes into spread
mode instead of the default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
max_cpu_load = 0;
|
|
|
|
min_cpu_load = ~0UL;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
for_each_cpu_mask(i, group->cpumask) {
|
2006-09-26 08:30:51 +02:00
|
|
|
struct rq *rq;
|
|
|
|
|
|
|
|
if (!cpu_isset(i, *cpus))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
rq = cpu_rq(i);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero, it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
|
2007-07-19 21:28:35 +02:00
|
|
|
if (*sd_idle && rq->nr_running)
|
2005-09-10 09:26:19 +02:00
|
|
|
*sd_idle = 0;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/* Bias balancing toward cpus of our domain */
|
2006-12-10 11:20:33 +01:00
|
|
|
if (local_group) {
|
|
|
|
if (idle_cpu(i) && !first_idle_cpu) {
|
|
|
|
first_idle_cpu = 1;
|
|
|
|
balance_cpu = i;
|
|
|
|
}
|
|
|
|
|
[PATCH] sched: remove smpnice
I don't think the code is quite ready, which is why I asked for Peter's
additions to also be merged before I acked it (although it turned out that
it still isn't quite ready with his additions either).
Basically I have had similar observations to Suresh in that it does not
play nicely with the rest of the balancing infrastructure (and raised
similar concerns in my review).
The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
SMP+HT Xeon are as follows:
| Following boot | hackbench 20 | hackbench 40
-----------+----------------+---------------------+---------------------
2.6.16-rc2 | 30,37,100,112 | 5600,5530,6020,6090 | 6390,7090,8760,8470
+nosmpnice | 3, 2, 4, 2 | 28, 150, 294, 132 | 348, 348, 294, 347
Hackbench raw performance is down around 15% with smpnice (but that in
itself isn't a huge deal because it is just a benchmark). However, the
samples show that the imbalance passed into move_tasks is increased by
about a factor of 10-30. I think this would also go some way to explaining
latency blips turning up in the balancing code (though I haven't actually
measured that).
We'll probably have to revert this in the SUSE kernel.
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: "Martin J. Bligh" <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-02-10 10:51:02 +01:00
|
|
|
load = target_load(i, load_idx);
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, the kernel fails to perform proper load balancing at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balancing and causes a 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5, there are situations
where cpu0 has a run queue length of 2, and cpu2 will be idle. The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three pieces of logic in find_busiest_group()
that heavily bias load balancing towards power saving mode. E.g. while
determining the "busiest" variable, the kernel only sets it when
"sum_nr_running > group_capacity". This test is flawed in that
"sum_nr_running" is not necessarily the same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that the kernel "thinks" everything is balanced, but in reality we have an
imbalance, causing one CPU to be over-subscribed while leaving the
other idle. Two other pieces of logic in the same function
cause a similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will be stuck in an imbalanced state
for an extended period of time, and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So the proposed fix is: add additional logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such a condition is detected, load balancing goes into spread
mode instead of the default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
} else {
|
[PATCH] sched: remove smpnice
I don't think the code is quite ready, which is why I asked for Peter's
additions to also be merged before I acked it (although it turned out that
it still isn't quite ready with his additions either).
Basically I have had similar observations to Suresh in that it does not
play nicely with the rest of the balancing infrastructure (and raised
similar concerns in my review).
The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
SMP+HT Xeon are as follows:
| Following boot | hackbench 20 | hackbench 40
-----------+----------------+---------------------+---------------------
2.6.16-rc2 | 30,37,100,112 | 5600,5530,6020,6090 | 6390,7090,8760,8470
+nosmpnice | 3, 2, 4, 2 | 28, 150, 294, 132 | 348, 348, 294, 347
Hackbench raw performance is down around 15% with smpnice (but that in
itself isn't a huge deal because it is just a benchmark). However, the
samples show that the imbalance passed into move_tasks is increased by
about a factor of 10-30. I think this would also go some way to explaining
latency blips turning up in the balancing code (though I haven't actually
measured that).
We'll probably have to revert this in the SUSE kernel.
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: "Martin J. Bligh" <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-02-10 10:51:02 +01:00
|
|
|
load = source_load(i, load_idx);
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, the kernel fails to perform proper load balancing at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balancing and causes a 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5, there are situations
where cpu0 has a run queue length of 2, and cpu2 will be idle. The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three pieces of logic in find_busiest_group()
that heavily bias load balancing towards power saving mode. E.g. while
determining the "busiest" variable, the kernel only sets it when
"sum_nr_running > group_capacity". This test is flawed in that
"sum_nr_running" is not necessarily the same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that the kernel "thinks" everything is balanced, but in reality we have an
imbalance, causing one CPU to be over-subscribed while leaving the
other idle. Two other pieces of logic in the same function
cause a similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will be stuck in an imbalanced state
for an extended period of time, and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So the proposed fix is: add additional logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such a condition is detected, load balancing goes into spread
mode instead of the default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
if (load > max_cpu_load)
|
|
|
|
max_cpu_load = load;
|
|
|
|
if (min_cpu_load > load)
|
|
|
|
min_cpu_load = load;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
avg_load += load;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it no longer needs to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
sum_nr_running += rq->nr_running;
|
2007-07-09 18:51:59 +02:00
|
|
|
sum_weighted_load += weighted_cpuload(i);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2006-12-10 11:20:33 +01:00
|
|
|
/*
|
|
|
|
* First idle cpu or the first cpu(busiest) in this sched group
|
|
|
|
* is eligible for doing load balancing at this and above
|
2007-07-19 21:28:35 +02:00
|
|
|
* domains. In the newly idle case, we will allow all the cpu's
|
|
|
|
* to do the newly idle load balance.
|
2006-12-10 11:20:33 +01:00
|
|
|
*/
|
2007-07-19 21:28:35 +02:00
|
|
|
if (idle != CPU_NEWLY_IDLE && local_group &&
|
|
|
|
balance_cpu != this_cpu && balance) {
|
2006-12-10 11:20:33 +01:00
|
|
|
*balance = 0;
|
|
|
|
goto ret;
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
total_load += avg_load;
|
2007-05-08 09:32:57 +02:00
|
|
|
total_pwr += group->__cpu_power;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* Adjust by relative CPU power of the group */
|
2007-05-08 09:32:57 +02:00
|
|
|
avg_load = sg_div_cpu_power(group,
|
|
|
|
avg_load * SCHED_LOAD_SCALE);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, the kernel fails to perform proper load balancing at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balancing and causes a 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5, there are situations
where cpu0 has a run queue length of 2, and cpu2 will be idle. The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three pieces of logic in find_busiest_group()
that heavily bias load balancing towards power saving mode. E.g. while
determining the "busiest" variable, the kernel only sets it when
"sum_nr_running > group_capacity". This test is flawed in that
"sum_nr_running" is not necessarily the same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that the kernel "thinks" everything is balanced, but in reality we have an
imbalance, causing one CPU to be over-subscribed while leaving the
other idle. Two other pieces of logic in the same function
cause a similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will be stuck in an imbalanced state
for an extended period of time, and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So the proposed fix is: add additional logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such a condition is detected, load balancing goes into spread
mode instead of the default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
|
|
|
|
__group_imb = 1;
|
|
|
|
|
2007-05-08 09:32:57 +02:00
|
|
|
group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
|
2006-06-27 11:54:42 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
if (local_group) {
|
|
|
|
this_load = avg_load;
|
|
|
|
this = group;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
this_nr_running = sum_nr_running;
|
|
|
|
this_load_per_task = sum_weighted_load;
|
|
|
|
} else if (avg_load > max_load &&
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, kernel fails to perform proper load balance at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balance and causes 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5. There are situation
where cpu0 has run queue length of 2, and cpu2 will be idle. The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three logics in find_busiest_group()
that heavily bias load balance towards power saving mode. e.g. while
determining "busiest" variable, kernel only set it when
"sum_nr_running > group_capacity". This test is flawed that
"sum_nr_running" is not necessary same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that kernel "think" everything is balanced, but in reality we have an
imbalance and thus causing one CPU to be over-subscribed and leaving
other idle. Two other pieces of logic in the same function also cause a
similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will be stuck in an imbalanced state
for an extended period of time and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So proposing the following fix: add addition logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such condition is detected, load balance goes into spread
mode instead of default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
(sum_nr_running > group_capacity || __group_imb)) {
|
2005-04-17 00:20:36 +02:00
|
|
|
max_load = avg_load;
|
|
|
|
busiest = group;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
busiest_nr_running = sum_nr_running;
|
|
|
|
busiest_load_per_task = sum_weighted_load;
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, kernel fails to perform proper load balance at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balance and causes 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5. There are situation
where cpu0 has run queue length of 2, and cpu2 will be idle. The
kernel load balancer is unable to balance out these two tasks over
cpu0 and cpu2 due to at least three logics in find_busiest_group()
that heavily bias load balance towards power saving mode. e.g. while
determining "busiest" variable, kernel only set it when
"sum_nr_running > group_capacity". This test is flawed that
"sum_nr_running" is not necessary same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that kernel "think" everything is balanced, but in reality we have an
imbalance and thus causing one CPU to be over-subscribed and leaving
other idle. Two other pieces of logic in the same function also cause a
similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will be stuck in an imbalanced state
for an extended period of time and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So proposing the following fix: add addition logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such condition is detected, load balance goes into spread
mode instead of default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
group_imb = __group_imb;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-06-27 11:54:42 +02:00
|
|
|
|
|
|
|
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
|
|
|
/*
|
|
|
|
* Busy processors will not participate in power savings
|
|
|
|
* balance.
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
if (idle == CPU_NOT_IDLE ||
|
|
|
|
!(sd->flags & SD_POWERSAVINGS_BALANCE))
|
|
|
|
goto group_next;
|
2006-06-27 11:54:42 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the local group is idle or completely loaded
|
|
|
|
* no need to do power savings balance at this domain
|
|
|
|
*/
|
|
|
|
if (local_group && (this_nr_running >= group_capacity ||
|
|
|
|
!this_nr_running))
|
|
|
|
power_savings_balance = 0;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2006-06-27 11:54:42 +02:00
|
|
|
* If a group is already running at full capacity or idle,
|
|
|
|
* don't include that group in power savings calculations
|
2007-07-09 18:51:59 +02:00
|
|
|
*/
|
|
|
|
if (!power_savings_balance || sum_nr_running >= group_capacity
|
2006-06-27 11:54:42 +02:00
|
|
|
|| !sum_nr_running)
|
2007-07-09 18:51:59 +02:00
|
|
|
goto group_next;
|
2006-06-27 11:54:42 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2006-06-27 11:54:42 +02:00
|
|
|
* Calculate the group which has the least non-idle load.
|
2007-07-09 18:51:59 +02:00
|
|
|
* This is the group from where we need to pick up the load
|
|
|
|
* for saving power
|
|
|
|
*/
|
|
|
|
if ((sum_nr_running < min_nr_running) ||
|
|
|
|
(sum_nr_running == min_nr_running &&
|
2006-06-27 11:54:42 +02:00
|
|
|
first_cpu(group->cpumask) <
|
|
|
|
first_cpu(group_min->cpumask))) {
|
2007-07-09 18:51:59 +02:00
|
|
|
group_min = group;
|
|
|
|
min_nr_running = sum_nr_running;
|
2006-06-27 11:54:42 +02:00
|
|
|
min_load_per_task = sum_weighted_load /
|
|
|
|
sum_nr_running;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2006-06-27 11:54:42 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
2006-06-27 11:54:42 +02:00
|
|
|
* Calculate the group which is almost near its
|
2007-07-09 18:51:59 +02:00
|
|
|
* capacity but still has some space to pick up some load
|
|
|
|
* from other group and save more power
|
|
|
|
*/
|
|
|
|
if (sum_nr_running <= group_capacity - 1) {
|
|
|
|
if (sum_nr_running > leader_nr_running ||
|
|
|
|
(sum_nr_running == leader_nr_running &&
|
|
|
|
first_cpu(group->cpumask) >
|
|
|
|
first_cpu(group_leader->cpumask))) {
|
|
|
|
group_leader = group;
|
|
|
|
leader_nr_running = sum_nr_running;
|
|
|
|
}
|
2006-07-03 09:25:40 +02:00
|
|
|
}
|
2006-06-27 11:54:42 +02:00
|
|
|
group_next:
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
group = group->next;
|
|
|
|
} while (group != sd->groups);
|
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
if (!busiest || this_load >= max_load || busiest_nr_running == 0)
|
2005-04-17 00:20:36 +02:00
|
|
|
goto out_balanced;
|
|
|
|
|
|
|
|
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
|
|
|
|
|
|
|
|
if (this_load >= avg_load ||
|
|
|
|
100*max_load <= sd->imbalance_pct*this_load)
|
|
|
|
goto out_balanced;
|
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks (2 vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
busiest_load_per_task /= busiest_nr_running;
|
sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by 50% performance regression.
When tasks are assigned to a subset of CPUs that span across
sched_domains (either ccNUMA node or the new multi-core domain) via
cpu affinity, the kernel fails to perform proper load balancing at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads to
inadequate load balancing and causes a 50% performance hit.
To give you a concrete example, on a dual-core, 2 socket numa system,
there are 4 logical cpu, organized as:
CPU0 attaching sched-domain:
domain 0: span 0003 groups: 0001 0002
domain 1: span 000f groups: 0003 000c
CPU1 attaching sched-domain:
domain 0: span 0003 groups: 0002 0001
domain 1: span 000f groups: 0003 000c
CPU2 attaching sched-domain:
domain 0: span 000c groups: 0004 0008
domain 1: span 000f groups: 000c 0003
CPU3 attaching sched-domain:
domain 0: span 000c groups: 0008 0004
domain 1: span 000f groups: 000c 0003
If I run 2 tasks with CPU affinity set to 0x5, there are situations
where cpu0 has a run queue length of 2 and cpu2 is idle. The
kernel load balancer is unable to balance these two tasks across
cpu0 and cpu2 due to at least three pieces of logic in find_busiest_group()
that heavily bias load balancing towards power saving mode. E.g. while
determining the "busiest" variable, the kernel only sets it when
"sum_nr_running > group_capacity". This test is flawed in that
"sum_nr_running" is not necessarily the same as
sum-tasks-allowed-to-run-within-the sched-group. The end result is
that the kernel "thinks" everything is balanced, but in reality we have an
imbalance, causing one CPU to be over-subscribed while leaving the
other idle. Two other pieces of logic in the same function
cause a similar effect. The nastiness of this bug is that the kernel is not
able to get unstuck from this unfortunate broken state. From what
we've seen in our environment, the kernel will stay stuck in the imbalanced state
for an extended period of time, and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
So proposing the following fix: add additional logic in
find_busiest_group to detect intrinsic imbalance within the busiest
group. When such a condition is detected, load balance goes into spread
mode instead of the default grouping mode.
Signed-off-by: Ken Chen <kenchen@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-10-17 16:55:11 +02:00
|
|
|
if (group_imb)
|
|
|
|
busiest_load_per_task = min(busiest_load_per_task, avg_load);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* We're trying to get all the cpus to the average_load, so we don't
|
|
|
|
* want to push ourselves above the average load, nor do we wish to
|
|
|
|
* reduce the max loaded cpu below the average load, as either of these
|
|
|
|
* actions would just result in more rebalancing later, and ping-pong
|
|
|
|
* tasks around. Thus we look for the minimum possible imbalance.
|
|
|
|
* Negative imbalances (*we* are more loaded than anyone else) will
|
|
|
|
* be counted as no imbalance for these purposes -- we can't fix that
|
2007-12-05 15:46:09 +01:00
|
|
|
* by pulling tasks to us. Be careful of negative numbers as they'll
|
2005-04-17 00:20:36 +02:00
|
|
|
* appear as very large values with unsigned longs.
|
|
|
|
*/
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice==19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount, it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks (2 vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
if (max_load <= busiest_load_per_task)
|
|
|
|
goto out_balanced;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In the presence of smp nice balancing, certain scenarios can have
|
|
|
|
* max load less than avg load(as we skip the groups at or below
|
|
|
|
* its cpu_power, while calculating max_load..)
|
|
|
|
*/
|
|
|
|
if (max_load < avg_load) {
|
|
|
|
*imbalance = 0;
|
|
|
|
goto small_imbalance;
|
|
|
|
}
|
2005-09-10 09:26:21 +02:00
|
|
|
|
|
|
|
/* Don't want to pull so many tasks that a group would go idle */
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice==19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount, it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks (2 vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
|
2005-09-10 09:26:21 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/* How much load to actually move to equalise the imbalance */
|
2007-05-08 09:32:57 +02:00
|
|
|
*imbalance = min(max_pull * busiest->__cpu_power,
|
|
|
|
(avg_load - this_load) * this->__cpu_power)
|
2005-04-17 00:20:36 +02:00
|
|
|
/ SCHED_LOAD_SCALE;
|
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice==19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount, it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
/*
|
|
|
|
* if *imbalance is less than the average load per runnable task
|
|
|
|
* there is no guarantee that any tasks will be moved so we'll have
|
|
|
|
* a think about bumping its value to force at least one task to be
|
|
|
|
* moved
|
|
|
|
*/
|
2007-09-05 14:32:48 +02:00
|
|
|
if (*imbalance < busiest_load_per_task) {
|
2006-07-03 09:25:40 +02:00
|
|
|
unsigned long tmp, pwr_now, pwr_move;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
unsigned int imbn;
|
|
|
|
|
|
|
|
small_imbalance:
|
|
|
|
pwr_move = pwr_now = 0;
|
|
|
|
imbn = 2;
|
|
|
|
if (this_nr_running) {
|
|
|
|
this_load_per_task /= this_nr_running;
|
|
|
|
if (busiest_load_per_task > this_load_per_task)
|
|
|
|
imbn = 1;
|
|
|
|
} else
|
|
|
|
this_load_per_task = SCHED_LOAD_SCALE;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
|
|
|
|
busiest_load_per_task * imbn) {
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
*imbalance = busiest_load_per_task;
|
2005-04-17 00:20:36 +02:00
|
|
|
return busiest;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* OK, we don't have enough imbalance to justify moving tasks,
|
|
|
|
* however we may be able to increase total CPU power used by
|
|
|
|
* moving them.
|
|
|
|
*/
|
|
|
|
|
2007-05-08 09:32:57 +02:00
|
|
|
pwr_now += busiest->__cpu_power *
|
|
|
|
min(busiest_load_per_task, max_load);
|
|
|
|
pwr_now += this->__cpu_power *
|
|
|
|
min(this_load_per_task, this_load);
|
2005-04-17 00:20:36 +02:00
|
|
|
pwr_now /= SCHED_LOAD_SCALE;
|
|
|
|
|
|
|
|
/* Amount of load we'd subtract */
|
2007-05-08 09:32:57 +02:00
|
|
|
tmp = sg_div_cpu_power(busiest,
|
|
|
|
busiest_load_per_task * SCHED_LOAD_SCALE);
|
2005-04-17 00:20:36 +02:00
|
|
|
if (max_load > tmp)
|
2007-05-08 09:32:57 +02:00
|
|
|
pwr_move += busiest->__cpu_power *
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the user's
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice==19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer necessary to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
min(busiest_load_per_task, max_load - tmp);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* Amount of load we'd add */
|
2007-05-08 09:32:57 +02:00
|
|
|
if (max_load * busiest->__cpu_power <
|
2006-12-10 11:20:38 +01:00
|
|
|
busiest_load_per_task * SCHED_LOAD_SCALE)
|
2007-05-08 09:32:57 +02:00
|
|
|
tmp = sg_div_cpu_power(this,
|
|
|
|
max_load * busiest->__cpu_power);
|
2005-04-17 00:20:36 +02:00
|
|
|
else
|
2007-05-08 09:32:57 +02:00
|
|
|
tmp = sg_div_cpu_power(this,
|
|
|
|
busiest_load_per_task * SCHED_LOAD_SCALE);
|
|
|
|
pwr_move += this->__cpu_power *
|
|
|
|
min(this_load_per_task, this_load + tmp);
|
2005-04-17 00:20:36 +02:00
|
|
|
pwr_move /= SCHED_LOAD_SCALE;
|
|
|
|
|
|
|
|
/* Move if we gain throughput */
|
2007-09-05 14:32:48 +02:00
|
|
|
if (pwr_move > pwr_now)
|
|
|
|
*imbalance = busiest_load_per_task;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return busiest;
|
|
|
|
|
|
|
|
out_balanced:
|
2006-06-27 11:54:42 +02:00
|
|
|
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
2007-07-09 18:51:57 +02:00
|
|
|
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
|
2006-06-27 11:54:42 +02:00
|
|
|
goto ret;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-06-27 11:54:42 +02:00
|
|
|
if (this == group_leader && group_leader != group_min) {
|
|
|
|
*imbalance = min_load_per_task;
|
|
|
|
return group_min;
|
|
|
|
}
|
|
|
|
#endif
|
2006-12-10 11:20:33 +01:00
|
|
|
ret:
|
2005-04-17 00:20:36 +02:00
|
|
|
*imbalance = 0;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* find_busiest_queue - find the busiest runqueue among the cpus in group.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static struct rq *
|
2007-07-09 18:51:57 +02:00
|
|
|
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
|
2006-09-26 08:30:51 +02:00
|
|
|
unsigned long imbalance, cpumask_t *cpus)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *busiest = NULL, *rq;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it no longer needs to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
unsigned long max_load = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_cpu_mask(i, group->cpumask) {
|
2007-07-09 18:51:59 +02:00
|
|
|
unsigned long wl;
|
2006-09-26 08:30:51 +02:00
|
|
|
|
|
|
|
if (!cpu_isset(i, *cpus))
|
|
|
|
continue;
|
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
rq = cpu_rq(i);
|
2007-07-09 18:51:59 +02:00
|
|
|
wl = weighted_cpuload(i);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it no longer needs to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (rq->nr_running == 1 && wl > imbalance)
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it no longer needs to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the scenarios listed below by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it needs
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
continue;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (wl > max_load) {
|
|
|
|
max_load = wl;
|
2006-07-03 09:25:40 +02:00
|
|
|
busiest = rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return busiest;
|
|
|
|
}
|
|
|
|
|
2005-06-25 23:57:30 +02:00
|
|
|
/*
|
|
|
|
* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
|
|
|
|
* that is fine so long as it is large enough.
|
|
|
|
*/
|
|
|
|
#define MAX_PINNED_INTERVAL 512
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* Check this_cpu to ensure it is balanced within domain. Attempt to move
|
|
|
|
* tasks if there is an imbalance.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static int load_balance(int this_cpu, struct rq *this_rq,
|
2007-07-09 18:51:57 +02:00
|
|
|
struct sched_domain *sd, enum cpu_idle_type idle,
|
2006-12-10 11:20:33 +01:00
|
|
|
int *balance)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
struct sched_group *group;
|
|
|
|
unsigned long imbalance;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *busiest;
|
2006-09-26 08:30:51 +02:00
|
|
|
cpumask_t cpus = CPU_MASK_ALL;
|
2006-12-10 11:20:21 +01:00
|
|
|
unsigned long flags;
|
2005-09-10 09:26:19 +02:00
|
|
|
|
2006-10-03 10:14:09 +02:00
|
|
|
/*
|
|
|
|
* When power savings policy is enabled for the parent domain, idle
|
|
|
|
* sibling can pick up load irrespective of busy siblings. In this case,
|
2007-07-09 18:51:59 +02:00
|
|
|
* let the state of idle sibling percolate up as CPU_IDLE, instead of
|
2007-07-09 18:51:57 +02:00
|
|
|
* portraying it as CPU_NOT_IDLE.
|
2006-10-03 10:14:09 +02:00
|
|
|
*/
|
2007-07-09 18:51:57 +02:00
|
|
|
if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
|
2006-10-03 10:14:09 +02:00
|
|
|
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
|
2005-09-10 09:26:19 +02:00
|
|
|
sd_idle = 1;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(sd, lb_count[idle]);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-09-26 08:30:51 +02:00
|
|
|
redo:
|
|
|
|
group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
|
2006-12-10 11:20:33 +01:00
|
|
|
&cpus, balance);
|
|
|
|
|
2006-12-10 11:20:35 +01:00
|
|
|
if (*balance == 0)
|
2006-12-10 11:20:33 +01:00
|
|
|
goto out_balanced;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
if (!group) {
|
|
|
|
schedstat_inc(sd, lb_nobusyg[idle]);
|
|
|
|
goto out_balanced;
|
|
|
|
}
|
|
|
|
|
2006-09-26 08:30:51 +02:00
|
|
|
busiest = find_busiest_queue(group, idle, imbalance, &cpus);
|
2005-04-17 00:20:36 +02:00
|
|
|
if (!busiest) {
|
|
|
|
schedstat_inc(sd, lb_nobusyq[idle]);
|
|
|
|
goto out_balanced;
|
|
|
|
}
|
|
|
|
|
2005-06-25 23:57:11 +02:00
|
|
|
BUG_ON(busiest == this_rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
schedstat_add(sd, lb_imbalance[idle], imbalance);
|
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
ld_moved = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
if (busiest->nr_running > 1) {
|
|
|
|
/*
|
|
|
|
* Attempt to move tasks. If find_busiest_group has found
|
|
|
|
* an imbalance but busiest->nr_running <= 1, the group is
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
* still unbalanced. ld_moved simply stays zero, so it is
|
2005-04-17 00:20:36 +02:00
|
|
|
* correctly treated as an imbalance.
|
|
|
|
*/
|
2006-12-10 11:20:21 +01:00
|
|
|
local_irq_save(flags);
|
2005-09-10 09:26:18 +02:00
|
|
|
double_rq_lock(this_rq, busiest);
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
ld_moved = move_tasks(this_rq, this_cpu, busiest,
|
2006-07-03 09:25:40 +02:00
|
|
|
imbalance, sd, idle, &all_pinned);
|
2005-09-10 09:26:18 +02:00
|
|
|
double_rq_unlock(this_rq, busiest);
|
2006-12-10 11:20:21 +01:00
|
|
|
local_irq_restore(flags);
|
2005-06-25 23:57:07 +02:00
|
|
|
|
2007-05-08 09:32:51 +02:00
|
|
|
/*
|
|
|
|
* some other cpu did the load balance for us.
|
|
|
|
*/
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
if (ld_moved && this_cpu != smp_processor_id())
|
2007-05-08 09:32:51 +02:00
|
|
|
resched_cpu(this_cpu);
|
|
|
|
|
2005-06-25 23:57:07 +02:00
|
|
|
/* All tasks on this runqueue were pinned by CPU affinity */
|
2006-09-26 08:30:51 +02:00
|
|
|
if (unlikely(all_pinned)) {
|
|
|
|
cpu_clear(cpu_of(busiest), cpus);
|
|
|
|
if (!cpus_empty(cpus))
|
|
|
|
goto redo;
|
2005-06-25 23:57:07 +02:00
|
|
|
goto out_balanced;
|
2006-09-26 08:30:51 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2005-06-25 23:57:07 +02:00
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
if (!ld_moved) {
|
2005-04-17 00:20:36 +02:00
|
|
|
schedstat_inc(sd, lb_failed[idle]);
|
|
|
|
sd->nr_balance_failed++;
|
|
|
|
|
|
|
|
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
|
|
|
|
|
2006-12-10 11:20:21 +01:00
|
|
|
spin_lock_irqsave(&busiest->lock, flags);
|
2005-09-10 09:26:21 +02:00
|
|
|
|
|
|
|
/* don't kick the migration_thread, if the curr
|
|
|
|
* task on busiest cpu can't be moved to this_cpu
|
|
|
|
*/
|
|
|
|
if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
|
2006-12-10 11:20:21 +01:00
|
|
|
spin_unlock_irqrestore(&busiest->lock, flags);
|
2005-09-10 09:26:21 +02:00
|
|
|
all_pinned = 1;
|
|
|
|
goto out_one_pinned;
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
if (!busiest->active_balance) {
|
|
|
|
busiest->active_balance = 1;
|
|
|
|
busiest->push_cpu = this_cpu;
|
2005-06-25 23:57:07 +02:00
|
|
|
active_balance = 1;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-12-10 11:20:21 +01:00
|
|
|
spin_unlock_irqrestore(&busiest->lock, flags);
|
2005-06-25 23:57:07 +02:00
|
|
|
if (active_balance)
|
2005-04-17 00:20:36 +02:00
|
|
|
wake_up_process(busiest->migration_thread);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We've kicked active balancing, reset the failure
|
|
|
|
* counter.
|
|
|
|
*/
|
2005-06-25 23:57:09 +02:00
|
|
|
sd->nr_balance_failed = sd->cache_nice_tries+1;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2005-06-25 23:57:07 +02:00
|
|
|
} else
|
2005-04-17 00:20:36 +02:00
|
|
|
sd->nr_balance_failed = 0;
|
|
|
|
|
2005-06-25 23:57:07 +02:00
|
|
|
if (likely(!active_balance)) {
|
2005-04-17 00:20:36 +02:00
|
|
|
/* We were unbalanced, so reset the balancing interval */
|
|
|
|
sd->balance_interval = sd->min_interval;
|
2005-06-25 23:57:07 +02:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* If we've begun active balancing, start to back off. This
|
|
|
|
* case may not be covered by the all_pinned logic if there
|
|
|
|
* is only 1 task on the busy runqueue (because we don't call
|
|
|
|
* move_tasks).
|
|
|
|
*/
|
|
|
|
if (sd->balance_interval < sd->max_interval)
|
|
|
|
sd->balance_interval *= 2;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
|
2006-10-03 10:14:09 +02:00
|
|
|
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
|
2005-09-10 09:26:19 +02:00
|
|
|
return -1;
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
return ld_moved;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
out_balanced:
|
|
|
|
schedstat_inc(sd, lb_balanced[idle]);
|
|
|
|
|
2005-06-25 23:57:08 +02:00
|
|
|
sd->nr_balance_failed = 0;
|
2005-09-10 09:26:21 +02:00
|
|
|
|
|
|
|
out_one_pinned:
|
2005-04-17 00:20:36 +02:00
|
|
|
/* tune up the balancing interval */
|
2005-06-25 23:57:30 +02:00
|
|
|
if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
|
|
|
|
(sd->balance_interval < sd->max_interval))
|
2005-04-17 00:20:36 +02:00
|
|
|
sd->balance_interval *= 2;
|
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
|
2006-10-03 10:14:09 +02:00
|
|
|
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
|
2005-09-10 09:26:19 +02:00
|
|
|
return -1;
|
2005-04-17 00:20:36 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check this_cpu to ensure it is balanced within domain. Attempt to move
|
|
|
|
* tasks if there is an imbalance.
|
|
|
|
*
|
2007-07-09 18:51:57 +02:00
|
|
|
* Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
|
2005-04-17 00:20:36 +02:00
|
|
|
* this_rq is locked.
|
|
|
|
*/
|
2006-07-03 09:25:40 +02:00
|
|
|
static int
|
2006-07-03 09:25:42 +02:00
|
|
|
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
struct sched_group *group;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *busiest = NULL;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long imbalance;
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
int ld_moved = 0;
|
2005-09-10 09:26:19 +02:00
|
|
|
int sd_idle = 0;
|
2007-07-19 21:28:35 +02:00
|
|
|
int all_pinned = 0;
|
2006-09-26 08:30:51 +02:00
|
|
|
cpumask_t cpus = CPU_MASK_ALL;
|
2005-09-10 09:26:19 +02:00
|
|
|
|
2006-10-03 10:14:09 +02:00
|
|
|
/*
|
|
|
|
* When power savings policy is enabled for the parent domain, idle
|
|
|
|
* sibling can pick up load irrespective of busy siblings. In this case,
|
|
|
|
* let the state of idle sibling percolate up as IDLE, instead of
|
2007-07-09 18:51:57 +02:00
|
|
|
* portraying it as CPU_NOT_IDLE.
|
2006-10-03 10:14:09 +02:00
|
|
|
*/
|
|
|
|
if (sd->flags & SD_SHARE_CPUPOWER &&
|
|
|
|
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
|
2005-09-10 09:26:19 +02:00
|
|
|
sd_idle = 1;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
|
2006-09-26 08:30:51 +02:00
|
|
|
redo:
|
2007-07-09 18:51:57 +02:00
|
|
|
group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
|
2006-12-10 11:20:33 +01:00
|
|
|
&sd_idle, &cpus, NULL);
|
2005-04-17 00:20:36 +02:00
|
|
|
if (!group) {
|
2007-07-09 18:51:57 +02:00
|
|
|
schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
|
2005-06-25 23:57:08 +02:00
|
|
|
goto out_balanced;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:57 +02:00
|
|
|
busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
|
2006-09-26 08:30:51 +02:00
|
|
|
&cpus);
|
2005-06-25 23:57:11 +02:00
|
|
|
if (!busiest) {
|
2007-07-09 18:51:57 +02:00
|
|
|
schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
|
2005-06-25 23:57:08 +02:00
|
|
|
goto out_balanced;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2005-06-25 23:57:11 +02:00
|
|
|
BUG_ON(busiest == this_rq);
|
|
|
|
|
2007-07-09 18:51:57 +02:00
|
|
|
schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
|
2005-09-10 09:26:16 +02:00
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
nor the new move_tasks() cares how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
ld_moved = 0;
|
2005-09-10 09:26:16 +02:00
|
|
|
if (busiest->nr_running > 1) {
|
|
|
|
/* Attempt to move tasks */
|
|
|
|
double_lock_balance(this_rq, busiest);
|
2007-08-09 11:16:51 +02:00
|
|
|
/* this_rq->clock is already updated */
|
|
|
|
update_rq_clock(busiest);
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of move_tasks() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
ld_moved = move_tasks(this_rq, this_cpu, busiest,
|
2007-07-19 21:28:35 +02:00
|
|
|
imbalance, sd, CPU_NEWLY_IDLE,
|
|
|
|
&all_pinned);
|
2005-09-10 09:26:16 +02:00
|
|
|
spin_unlock(&busiest->lock);
|
2006-09-26 08:30:51 +02:00
|
|
|
|
2007-07-19 21:28:35 +02:00
|
|
|
if (unlikely(all_pinned)) {
|
2006-09-26 08:30:51 +02:00
|
|
|
cpu_clear(cpu_of(busiest), cpus);
|
|
|
|
if (!cpus_empty(cpus))
|
|
|
|
goto redo;
|
|
|
|
}
|
2005-09-10 09:26:16 +02:00
|
|
|
}
|
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
if (!ld_moved) {
|
2007-07-09 18:51:57 +02:00
|
|
|
schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
|
2006-10-03 10:14:09 +02:00
|
|
|
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
|
|
|
|
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
|
2005-09-10 09:26:19 +02:00
|
|
|
return -1;
|
|
|
|
} else
|
2005-06-25 23:57:08 +02:00
|
|
|
sd->nr_balance_failed = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
return ld_moved;
|
2005-06-25 23:57:08 +02:00
|
|
|
|
|
|
|
out_balanced:
|
2007-07-09 18:51:57 +02:00
|
|
|
schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
|
2006-07-03 09:25:40 +02:00
|
|
|
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
|
2006-10-03 10:14:09 +02:00
|
|
|
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
|
2005-09-10 09:26:19 +02:00
|
|
|
return -1;
|
2005-06-25 23:57:08 +02:00
|
|
|
sd->nr_balance_failed = 0;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-06-25 23:57:08 +02:00
|
|
|
return 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* idle_balance is called by schedule() if this_cpu is about to become
|
|
|
|
* idle. Attempts to pull tasks from other CPUs.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static void idle_balance(int this_cpu, struct rq *this_rq)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
struct sched_domain *sd;
|
2007-07-09 18:51:59 +02:00
|
|
|
int pulled_task = -1;
|
|
|
|
unsigned long next_balance = jiffies + HZ;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
for_each_domain(this_cpu, sd) {
|
2007-06-24 02:16:33 +02:00
|
|
|
unsigned long interval;
|
|
|
|
|
|
|
|
if (!(sd->flags & SD_LOAD_BALANCE))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (sd->flags & SD_BALANCE_NEWIDLE)
|
2006-07-03 09:25:40 +02:00
|
|
|
/* If we've pulled tasks over stop searching: */
|
2006-12-10 11:20:27 +01:00
|
|
|
pulled_task = load_balance_newidle(this_cpu,
|
2007-06-24 02:16:33 +02:00
|
|
|
this_rq, sd);
|
|
|
|
|
|
|
|
interval = msecs_to_jiffies(sd->balance_interval);
|
|
|
|
if (time_after(next_balance, sd->last_balance + interval))
|
|
|
|
next_balance = sd->last_balance + interval;
|
|
|
|
if (pulled_task)
|
|
|
|
break;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
|
2006-12-10 11:20:27 +01:00
|
|
|
/*
|
|
|
|
* We are going idle. next_balance may be set based on
|
|
|
|
* a busy processor. So reset next_balance.
|
|
|
|
*/
|
|
|
|
this_rq->next_balance = next_balance;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* active_load_balance is run by migration threads. It pushes running tasks
|
|
|
|
* off the busiest CPU onto idle CPUs. It requires at least 1 task to be
|
|
|
|
* running on each physical CPU where possible, and avoids physical /
|
|
|
|
* logical imbalances.
|
|
|
|
*
|
|
|
|
* Called with busiest_rq locked.
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2005-06-25 23:57:09 +02:00
|
|
|
int target_cpu = busiest_rq->push_cpu;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct sched_domain *sd;
|
|
|
|
struct rq *target_rq;
|
2005-06-25 23:57:09 +02:00
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
/* Is there any task to move? */
|
2005-06-25 23:57:09 +02:00
|
|
|
if (busiest_rq->nr_running <= 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
target_rq = cpu_rq(target_cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
2005-06-25 23:57:09 +02:00
|
|
|
* This condition is "impossible", if it occurs
|
2007-12-05 15:46:09 +01:00
|
|
|
* we need to fix it. Originally reported by
|
2005-06-25 23:57:09 +02:00
|
|
|
* Bjorn Helgaas on a 128-cpu setup.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2005-06-25 23:57:09 +02:00
|
|
|
BUG_ON(busiest_rq == target_rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2005-06-25 23:57:09 +02:00
|
|
|
/* move a task from busiest_rq to target_rq */
|
|
|
|
double_lock_balance(busiest_rq, target_rq);
|
2007-08-09 11:16:51 +02:00
|
|
|
update_rq_clock(busiest_rq);
|
|
|
|
update_rq_clock(target_rq);
|
2005-06-25 23:57:09 +02:00
|
|
|
|
|
|
|
/* Search for an sd spanning us and the target CPU. */
|
2006-06-27 11:54:28 +02:00
|
|
|
for_each_domain(target_cpu, sd) {
|
2005-06-25 23:57:09 +02:00
|
|
|
if ((sd->flags & SD_LOAD_BALANCE) &&
|
2006-07-03 09:25:40 +02:00
|
|
|
cpu_isset(busiest_cpu, sd->span))
|
2005-06-25 23:57:09 +02:00
|
|
|
break;
|
2006-06-27 11:54:28 +02:00
|
|
|
}
|
2005-06-25 23:57:09 +02:00
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
if (likely(sd)) {
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(sd, alb_count);
|
2005-06-25 23:57:09 +02:00
|
|
|
|
sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct
capabilities:
1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.
The first of these capabilities is used in two places, load_balance()
and load_balance_idle(), and in both of these cases the return value of
move_tasks() is used purely to decide if tasks/load were moved and no
notice of the actual number of tasks moved is taken.
The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task and, as
before, the return value is only used as an indicator of success or failure.
This multiplexing of sched_task() was introduced, by me, as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()). However, the new modular
design of the new CFS scheduler allows a simpler solution to be adopted
and this patch addresses that solution by:
1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure.
One of the consequences of these changes is that neither move_one_task()
or the new move_tasks() care how many tasks sched_class.load_balance()
moves and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list. This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
Further simplification, e.g. changes to balance_tasks(), are possible
but (slightly) complicated by the special needs of load_balance_fair()
so I've left them to a later patch (if this one gets accepted).
NB Since move_tasks() gets called with two run queue locks held even
small reductions in overhead are worthwhile.
[ mingo@elte.hu ]
this change also reduces code size nicely:
text data bss dec hex filename
39216 3618 24 42858 a76a sched.o.before
39173 3618 24 42815 a73f sched.o.after
Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2007-08-09 11:16:46 +02:00
|
|
|
if (move_one_task(target_rq, target_cpu, busiest_rq,
|
|
|
|
sd, CPU_IDLE))
|
2006-07-03 09:25:40 +02:00
|
|
|
schedstat_inc(sd, alb_pushed);
|
|
|
|
else
|
|
|
|
schedstat_inc(sd, alb_failed);
|
|
|
|
}
|
2005-06-25 23:57:09 +02:00
|
|
|
spin_unlock(&target_rq->lock);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-05-08 09:32:51 +02:00
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
static struct {
|
|
|
|
atomic_t load_balancer;
|
2007-12-05 15:46:09 +01:00
|
|
|
cpumask_t cpu_mask;
|
2007-05-08 09:32:51 +02:00
|
|
|
} nohz ____cacheline_aligned = {
|
|
|
|
.load_balancer = ATOMIC_INIT(-1),
|
|
|
|
.cpu_mask = CPU_MASK_NONE,
|
|
|
|
};
|
|
|
|
|
2006-12-10 11:20:22 +01:00
|
|
|
/*
|
2007-05-08 09:32:51 +02:00
|
|
|
* This routine will try to nominate the ilb (idle load balancing)
|
|
|
|
* owner among the cpus whose ticks are stopped. ilb owner will do the idle
|
|
|
|
* load balancing on behalf of all those cpus. If all the cpus in the system
|
|
|
|
* go into this tickless mode, then there will be no ilb owner (as there is
|
|
|
|
* no need for one) and all the cpus will sleep till the next wakeup event
|
|
|
|
* arrives...
|
|
|
|
*
|
|
|
|
* For the ilb owner, tick is not stopped. And this tick will be used
|
|
|
|
* for idle load balancing. ilb owner will still be part of
|
|
|
|
* nohz.cpu_mask..
|
2006-12-10 11:20:22 +01:00
|
|
|
*
|
2007-05-08 09:32:51 +02:00
|
|
|
* While stopping the tick, this cpu will become the ilb owner if there
|
|
|
|
* is no other owner. And will be the owner till that cpu becomes busy
|
|
|
|
* or if all cpus in the system stop their ticks at which point
|
|
|
|
* there is no need for ilb owner.
|
|
|
|
*
|
|
|
|
* When the ilb owner becomes busy, it nominates another owner, during the
|
|
|
|
* next busy scheduler_tick()
|
|
|
|
*/
|
|
|
|
int select_nohz_load_balancer(int stop_tick)
|
|
|
|
{
|
|
|
|
int cpu = smp_processor_id();
|
|
|
|
|
|
|
|
if (stop_tick) {
|
|
|
|
cpu_set(cpu, nohz.cpu_mask);
|
|
|
|
cpu_rq(cpu)->in_nohz_recently = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If we are going offline and still the leader, give up!
|
|
|
|
*/
|
|
|
|
if (cpu_is_offline(cpu) &&
|
|
|
|
atomic_read(&nohz.load_balancer) == cpu) {
|
|
|
|
if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
|
|
|
|
BUG();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* time for ilb owner also to sleep */
|
|
|
|
if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
|
|
|
|
if (atomic_read(&nohz.load_balancer) == cpu)
|
|
|
|
atomic_set(&nohz.load_balancer, -1);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (atomic_read(&nohz.load_balancer) == -1) {
|
|
|
|
/* make me the ilb owner */
|
|
|
|
if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
|
|
|
|
return 1;
|
|
|
|
} else if (atomic_read(&nohz.load_balancer) == cpu)
|
|
|
|
return 1;
|
|
|
|
} else {
|
|
|
|
if (!cpu_isset(cpu, nohz.cpu_mask))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
cpu_clear(cpu, nohz.cpu_mask);
|
|
|
|
|
|
|
|
if (atomic_read(&nohz.load_balancer) == cpu)
|
|
|
|
if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
static DEFINE_SPINLOCK(balancing);
|
|
|
|
|
|
|
|
/*
|
2006-12-10 11:20:22 +01:00
|
|
|
* It checks each scheduling domain to see if it is due to be balanced,
|
|
|
|
* and initiates a balancing operation if so.
|
|
|
|
*
|
|
|
|
* Balancing parameters are set up in arch_init_sched_domains.
|
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
|
2006-12-10 11:20:22 +01:00
|
|
|
{
|
2007-05-08 09:32:51 +02:00
|
|
|
int balance = 1;
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2006-12-10 11:20:22 +01:00
|
|
|
unsigned long interval;
|
|
|
|
struct sched_domain *sd;
|
2007-05-08 09:32:51 +02:00
|
|
|
/* Earliest time when we have to do rebalance again */
|
2006-12-10 11:20:25 +01:00
|
|
|
unsigned long next_balance = jiffies + 60*HZ;
|
2007-08-23 15:18:02 +02:00
|
|
|
int update_next_balance = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-05-08 09:32:51 +02:00
|
|
|
for_each_domain(cpu, sd) {
|
2005-04-17 00:20:36 +02:00
|
|
|
if (!(sd->flags & SD_LOAD_BALANCE))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
interval = sd->balance_interval;
|
2007-07-09 18:51:57 +02:00
|
|
|
if (idle != CPU_IDLE)
|
2005-04-17 00:20:36 +02:00
|
|
|
interval *= sd->busy_factor;
|
|
|
|
|
|
|
|
/* scale ms to jiffies */
|
|
|
|
interval = msecs_to_jiffies(interval);
|
|
|
|
if (unlikely(!interval))
|
|
|
|
interval = 1;
|
2007-07-09 18:51:59 +02:00
|
|
|
if (interval > HZ*NR_CPUS/10)
|
|
|
|
interval = HZ*NR_CPUS/10;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-12-10 11:20:29 +01:00
|
|
|
if (sd->flags & SD_SERIALIZE) {
|
|
|
|
if (!spin_trylock(&balancing))
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2006-12-10 11:20:25 +01:00
|
|
|
if (time_after_eq(jiffies, sd->last_balance + interval)) {
|
2007-05-08 09:32:51 +02:00
|
|
|
if (load_balance(cpu, rq, sd, idle, &balance)) {
|
2005-09-10 09:26:21 +02:00
|
|
|
/*
|
|
|
|
* We've pulled tasks over so either we're no
|
2005-09-10 09:26:19 +02:00
|
|
|
* longer idle, or one of our SMT siblings is
|
|
|
|
* not idle.
|
|
|
|
*/
|
2007-07-09 18:51:57 +02:00
|
|
|
idle = CPU_NOT_IDLE;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-12-10 11:20:27 +01:00
|
|
|
sd->last_balance = jiffies;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-12-10 11:20:29 +01:00
|
|
|
if (sd->flags & SD_SERIALIZE)
|
|
|
|
spin_unlock(&balancing);
|
|
|
|
out:
|
2007-08-23 15:18:02 +02:00
|
|
|
if (time_after(next_balance, sd->last_balance + interval)) {
|
2006-12-10 11:20:25 +01:00
|
|
|
next_balance = sd->last_balance + interval;
|
2007-08-23 15:18:02 +02:00
|
|
|
update_next_balance = 1;
|
|
|
|
}
|
2006-12-10 11:20:33 +01:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Stop the load balance at this level. There is another
|
|
|
|
* CPU in our sched group which is doing load balancing more
|
|
|
|
* actively.
|
|
|
|
*/
|
|
|
|
if (!balance)
|
|
|
|
break;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2007-08-23 15:18:02 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* next_balance will be updated only when there is a need.
|
|
|
|
* When the cpu is attached to null domain for ex, it will not be
|
|
|
|
* updated.
|
|
|
|
*/
|
|
|
|
if (likely(update_next_balance))
|
|
|
|
rq->next_balance = next_balance;
|
2007-05-08 09:32:51 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* run_rebalance_domains is triggered when needed from the scheduler tick.
|
|
|
|
* In CONFIG_NO_HZ case, the idle load balance owner will do the
|
|
|
|
* rebalancing for all the cpus for whom scheduler ticks are stopped.
|
|
|
|
*/
|
|
|
|
static void run_rebalance_domains(struct softirq_action *h)
|
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
int this_cpu = smp_processor_id();
|
|
|
|
struct rq *this_rq = cpu_rq(this_cpu);
|
|
|
|
enum cpu_idle_type idle = this_rq->idle_at_tick ?
|
|
|
|
CPU_IDLE : CPU_NOT_IDLE;
|
2007-05-08 09:32:51 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
rebalance_domains(this_cpu, idle);
|
2007-05-08 09:32:51 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
/*
|
|
|
|
* If this cpu is the owner for idle load balancing, then do the
|
|
|
|
* balancing on behalf of the other idle cpus whose ticks are
|
|
|
|
* stopped.
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
if (this_rq->idle_at_tick &&
|
|
|
|
atomic_read(&nohz.load_balancer) == this_cpu) {
|
2007-05-08 09:32:51 +02:00
|
|
|
cpumask_t cpus = nohz.cpu_mask;
|
|
|
|
struct rq *rq;
|
|
|
|
int balance_cpu;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
cpu_clear(this_cpu, cpus);
|
2007-05-08 09:32:51 +02:00
|
|
|
for_each_cpu_mask(balance_cpu, cpus) {
|
|
|
|
/*
|
|
|
|
* If this cpu gets work to do, stop the load balancing
|
|
|
|
* work being done for other cpus. Next load
|
|
|
|
* balancing owner will pick it up.
|
|
|
|
*/
|
|
|
|
if (need_resched())
|
|
|
|
break;
|
|
|
|
|
2007-08-12 18:08:19 +02:00
|
|
|
rebalance_domains(balance_cpu, CPU_IDLE);
|
2007-05-08 09:32:51 +02:00
|
|
|
|
|
|
|
rq = cpu_rq(balance_cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
if (time_after(this_rq->next_balance, rq->next_balance))
|
|
|
|
this_rq->next_balance = rq->next_balance;
|
2007-05-08 09:32:51 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
|
|
|
|
*
|
|
|
|
* In case of CONFIG_NO_HZ, this is the place where we nominate a new
|
|
|
|
* idle load balancing owner or decide to stop the periodic load balancing,
|
|
|
|
* if the whole system is idle.
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
static inline void trigger_load_balance(struct rq *rq, int cpu)
|
2007-05-08 09:32:51 +02:00
|
|
|
{
|
|
|
|
#ifdef CONFIG_NO_HZ
|
|
|
|
/*
|
|
|
|
* If we were in the nohz mode recently and busy at the current
|
|
|
|
* scheduler tick, then check if we need to nominate new idle
|
|
|
|
* load balancer.
|
|
|
|
*/
|
|
|
|
if (rq->in_nohz_recently && !rq->idle_at_tick) {
|
|
|
|
rq->in_nohz_recently = 0;
|
|
|
|
|
|
|
|
if (atomic_read(&nohz.load_balancer) == cpu) {
|
|
|
|
cpu_clear(cpu, nohz.cpu_mask);
|
|
|
|
atomic_set(&nohz.load_balancer, -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (atomic_read(&nohz.load_balancer) == -1) {
|
|
|
|
/*
|
|
|
|
* simple selection for now: Nominate the
|
|
|
|
* first cpu in the nohz list to be the next
|
|
|
|
* ilb owner.
|
|
|
|
*
|
|
|
|
* TBD: Traverse the sched domains and nominate
|
|
|
|
* the nearest cpu in the nohz.cpu_mask.
|
|
|
|
*/
|
|
|
|
int ilb = first_cpu(nohz.cpu_mask);
|
|
|
|
|
|
|
|
if (ilb != NR_CPUS)
|
|
|
|
resched_cpu(ilb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this cpu is idle and doing idle load balancing for all the
|
|
|
|
* cpus with ticks stopped, is it time for that to stop?
|
|
|
|
*/
|
|
|
|
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
|
|
|
|
cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
|
|
|
|
resched_cpu(cpu);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this cpu is idle and the idle load balancing is done by
|
|
|
|
* someone else, then no need raise the SCHED_SOFTIRQ
|
|
|
|
*/
|
|
|
|
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
|
|
|
|
cpu_isset(cpu, nohz.cpu_mask))
|
|
|
|
return;
|
|
|
|
#endif
|
|
|
|
if (time_after_eq(jiffies, rq->next_balance))
|
|
|
|
raise_softirq(SCHED_SOFTIRQ);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
 * on UP we do not need to balance between CPUs, so this is an empty stub
 * that keeps callers free of CONFIG_SMP conditionals.
 */
static inline void idle_balance(int cpu, struct rq *rq) { }
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Per-cpu kernel statistics; the accounting functions below update its cpustat. */
DEFINE_PER_CPU(struct kernel_stat, kstat);

EXPORT_PER_CPU_SYMBOL(kstat);
|
|
|
|
|
|
|
|
/*
|
2007-07-09 18:51:58 +02:00
|
|
|
* Return p->sum_exec_runtime plus any more ns on the sched_clock
|
|
|
|
* that have not yet been banked in case the task is currently running.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-07-09 18:51:58 +02:00
|
|
|
unsigned long long task_sched_runtime(struct task_struct *p)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-07-09 18:51:58 +02:00
|
|
|
u64 ns, delta_exec;
|
|
|
|
struct rq *rq;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
ns = p->se.sum_exec_runtime;
|
2007-12-18 15:21:13 +01:00
|
|
|
if (task_current(rq, p)) {
|
2007-08-09 11:16:47 +02:00
|
|
|
update_rq_clock(rq);
|
|
|
|
delta_exec = rq->clock - p->se.exec_start;
|
2007-07-09 18:51:58 +02:00
|
|
|
if ((s64)delta_exec > 0)
|
|
|
|
ns += delta_exec;
|
|
|
|
}
|
|
|
|
task_rq_unlock(rq, &flags);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return ns;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Account user cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @cputime: the cpu time spent in user space since the last update
|
|
|
|
*/
|
|
|
|
void account_user_time(struct task_struct *p, cputime_t cputime)
|
|
|
|
{
|
|
|
|
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
|
|
|
|
cputime64_t tmp;
|
|
|
|
|
|
|
|
p->utime = cputime_add(p->utime, cputime);
|
|
|
|
|
|
|
|
/* Add user time to cpustat. */
|
|
|
|
tmp = cputime_to_cputime64(cputime);
|
|
|
|
if (TASK_NICE(p) > 0)
|
|
|
|
cpustat->nice = cputime64_add(cpustat->nice, tmp);
|
|
|
|
else
|
|
|
|
cpustat->user = cputime64_add(cpustat->user, tmp);
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
/*
|
|
|
|
* Account guest cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @cputime: the cpu time spent in virtual machine since the last update
|
|
|
|
*/
|
2007-10-29 21:18:10 +01:00
|
|
|
static void account_guest_time(struct task_struct *p, cputime_t cputime)
|
2007-10-15 17:00:19 +02:00
|
|
|
{
|
|
|
|
cputime64_t tmp;
|
|
|
|
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
|
|
|
|
|
|
|
|
tmp = cputime_to_cputime64(cputime);
|
|
|
|
|
|
|
|
p->utime = cputime_add(p->utime, cputime);
|
|
|
|
p->gtime = cputime_add(p->gtime, cputime);
|
|
|
|
|
|
|
|
cpustat->user = cputime64_add(cpustat->user, tmp);
|
|
|
|
cpustat->guest = cputime64_add(cpustat->guest, tmp);
|
|
|
|
}
|
|
|
|
|
2007-10-18 12:06:34 +02:00
|
|
|
/*
|
|
|
|
* Account scaled user cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @cputime: the cpu time spent in user space since the last update
|
|
|
|
*/
|
|
|
|
void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
|
|
|
|
{
|
|
|
|
p->utimescaled = cputime_add(p->utimescaled, cputime);
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* Account system cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @hardirq_offset: the offset to subtract from hardirq_count()
|
|
|
|
* @cputime: the cpu time spent in kernel space since the last update
|
|
|
|
*/
|
|
|
|
void account_system_time(struct task_struct *p, int hardirq_offset,
|
|
|
|
cputime_t cputime)
|
|
|
|
{
|
|
|
|
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = this_rq();
|
2005-04-17 00:20:36 +02:00
|
|
|
cputime64_t tmp;
|
|
|
|
|
2007-11-15 20:57:39 +01:00
|
|
|
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
|
|
|
|
return account_guest_time(p, cputime);
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
p->stime = cputime_add(p->stime, cputime);
|
|
|
|
|
|
|
|
/* Add system time to cpustat. */
|
|
|
|
tmp = cputime_to_cputime64(cputime);
|
|
|
|
if (hardirq_count() - hardirq_offset)
|
|
|
|
cpustat->irq = cputime64_add(cpustat->irq, tmp);
|
|
|
|
else if (softirq_count())
|
|
|
|
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
|
2007-11-15 01:59:45 +01:00
|
|
|
else if (p != rq->idle)
|
2005-04-17 00:20:36 +02:00
|
|
|
cpustat->system = cputime64_add(cpustat->system, tmp);
|
2007-11-15 01:59:45 +01:00
|
|
|
else if (atomic_read(&rq->nr_iowait) > 0)
|
2005-04-17 00:20:36 +02:00
|
|
|
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
|
|
|
|
else
|
|
|
|
cpustat->idle = cputime64_add(cpustat->idle, tmp);
|
|
|
|
/* Account for system time used */
|
|
|
|
acct_update_integrals(p);
|
|
|
|
}
|
|
|
|
|
2007-10-18 12:06:34 +02:00
|
|
|
/*
|
|
|
|
* Account scaled system cpu time to a process.
|
|
|
|
* @p: the process that the cpu time gets accounted to
|
|
|
|
* @hardirq_offset: the offset to subtract from hardirq_count()
|
|
|
|
* @cputime: the cpu time spent in kernel space since the last update
|
|
|
|
*/
|
|
|
|
void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
|
|
|
|
{
|
|
|
|
p->stimescaled = cputime_add(p->stimescaled, cputime);
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* Account for involuntary wait time.
|
|
|
|
* @p: the process from which the cpu time has been stolen
|
|
|
|
* @steal: the cpu time spent in involuntary wait
|
|
|
|
*/
|
|
|
|
void account_steal_time(struct task_struct *p, cputime_t steal)
|
|
|
|
{
|
|
|
|
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
|
|
|
|
cputime64_t tmp = cputime_to_cputime64(steal);
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = this_rq();
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
if (p == rq->idle) {
|
|
|
|
p->stime = cputime_add(p->stime, steal);
|
|
|
|
if (atomic_read(&rq->nr_iowait) > 0)
|
|
|
|
cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
|
|
|
|
else
|
|
|
|
cpustat->idle = cputime64_add(cpustat->idle, tmp);
|
2007-11-15 01:59:45 +01:00
|
|
|
} else
|
2005-04-17 00:20:36 +02:00
|
|
|
cpustat->steal = cputime64_add(cpustat->steal, tmp);
|
|
|
|
}
|
|
|
|
|
2006-12-10 11:20:22 +01:00
|
|
|
/*
|
|
|
|
* This function gets called by the timer code, with HZ frequency.
|
|
|
|
* We call it with interrupts disabled.
|
|
|
|
*
|
|
|
|
* It also gets called by the fork code, when changing the parent's
|
|
|
|
* timeslices.
|
|
|
|
*/
|
|
|
|
void scheduler_tick(void)
|
|
|
|
{
|
|
|
|
int cpu = smp_processor_id();
|
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
struct task_struct *curr = rq->curr;
|
2007-08-10 23:05:11 +02:00
|
|
|
u64 next_tick = rq->tick_timestamp + TICK_NSEC;
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
spin_lock(&rq->lock);
|
2007-08-09 11:16:51 +02:00
|
|
|
__update_rq_clock(rq);
|
2007-08-10 23:05:11 +02:00
|
|
|
/*
|
|
|
|
* Let rq->clock advance by at least TICK_NSEC:
|
|
|
|
*/
|
2008-01-25 21:08:34 +01:00
|
|
|
if (unlikely(rq->clock < next_tick)) {
|
2007-08-10 23:05:11 +02:00
|
|
|
rq->clock = next_tick;
|
2008-01-25 21:08:34 +01:00
|
|
|
rq->clock_underflows++;
|
|
|
|
}
|
2007-08-10 23:05:11 +02:00
|
|
|
rq->tick_timestamp = rq->clock;
|
2007-08-09 11:16:45 +02:00
|
|
|
update_cpu_load(rq);
|
2008-01-25 21:08:29 +01:00
|
|
|
curr->sched_class->task_tick(rq, curr, 0);
|
|
|
|
update_sched_rt_period(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
spin_unlock(&rq->lock);
|
2006-12-10 11:20:22 +01:00
|
|
|
|
2006-12-10 11:20:23 +01:00
|
|
|
#ifdef CONFIG_SMP
|
2007-07-09 18:51:59 +02:00
|
|
|
rq->idle_at_tick = idle_cpu(cpu);
|
|
|
|
trigger_load_balance(rq, cpu);
|
2006-12-10 11:20:23 +01:00
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
|
|
|
|
|
|
|
|
void fastcall add_preempt_count(int val)
{
	/* A count that is already negative has underflowed: warn, bail out. */
	if (DEBUG_LOCKS_WARN_ON(preempt_count() < 0))
		return;

	preempt_count() += val;

	/* Warn when the PREEMPT_MASK part is within 10 of its maximum. */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
}
EXPORT_SYMBOL(add_preempt_count);
|
|
|
|
|
|
|
|
void fastcall sub_preempt_count(int val)
{
	/* Subtracting more than the current count would underflow it. */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
		return;

	/*
	 * A decrement smaller than PREEMPT_MASK while the PREEMPT_MASK
	 * bits are all clear would underflow the spinlock portion.
	 */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK)))
		return;

	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Print scheduling while atomic bug:
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
static noinline void __schedule_bug(struct task_struct *prev)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-10-24 18:23:50 +02:00
|
|
|
struct pt_regs *regs = get_irq_regs();
|
|
|
|
|
|
|
|
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
|
|
|
|
prev->comm, prev->pid, preempt_count());
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
debug_show_held_locks(prev);
|
|
|
|
if (irqs_disabled())
|
|
|
|
print_irqtrace_events(prev);
|
2007-10-24 18:23:50 +02:00
|
|
|
|
|
|
|
if (regs)
|
|
|
|
show_regs(regs);
|
|
|
|
else
|
|
|
|
dump_stack();
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* Various schedule()-time debugging checks and statistics:
|
|
|
|
*/
|
|
|
|
static inline void schedule_debug(struct task_struct *prev)
|
|
|
|
{
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Test if we are atomic. Since do_exit() needs to call into
|
2005-04-17 00:20:36 +02:00
|
|
|
* schedule() atomically, we ignore that path for now.
|
|
|
|
* Otherwise, whine if we are scheduling when we should not be.
|
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
|
|
|
|
__schedule_bug(prev);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
|
|
|
|
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(this_rq(), sched_count);
|
2007-10-15 17:00:10 +02:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
if (unlikely(prev->lock_depth >= 0)) {
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(this_rq(), bkl_count);
|
|
|
|
schedstat_inc(prev, sched_info.bkl_count);
|
2007-10-15 17:00:10 +02:00
|
|
|
}
|
|
|
|
#endif
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pick up the highest-prio task:
|
|
|
|
*/
|
|
|
|
static inline struct task_struct *
|
2007-08-09 11:16:49 +02:00
|
|
|
pick_next_task(struct rq *rq, struct task_struct *prev)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
2007-10-15 17:00:12 +02:00
|
|
|
const struct sched_class *class;
|
2007-07-09 18:51:59 +02:00
|
|
|
struct task_struct *p;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
2007-07-09 18:51:59 +02:00
|
|
|
* Optimization: we know that if all tasks are in
|
|
|
|
* the fair class we can call that function directly:
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
if (likely(rq->nr_running == rq->cfs.nr_running)) {
|
2007-08-09 11:16:48 +02:00
|
|
|
p = fair_sched_class.pick_next_task(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
if (likely(p))
|
|
|
|
return p;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
class = sched_class_highest;
|
|
|
|
for ( ; ; ) {
|
2007-08-09 11:16:48 +02:00
|
|
|
p = class->pick_next_task(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
if (p)
|
|
|
|
return p;
|
|
|
|
/*
|
|
|
|
* Will never be NULL as the idle class always
|
|
|
|
* returns a non-NULL p:
|
|
|
|
*/
|
|
|
|
class = class->next;
|
|
|
|
}
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* schedule() is the main scheduler function.
|
|
|
|
*/
|
|
|
|
asmlinkage void __sched schedule(void)
|
|
|
|
{
|
|
|
|
struct task_struct *prev, *next;
|
|
|
|
long *switch_count;
|
|
|
|
struct rq *rq;
|
|
|
|
int cpu;
|
|
|
|
|
|
|
|
need_resched:
|
|
|
|
preempt_disable();
|
|
|
|
cpu = smp_processor_id();
|
|
|
|
rq = cpu_rq(cpu);
|
|
|
|
rcu_qsctr_inc(cpu);
|
|
|
|
prev = rq->curr;
|
|
|
|
switch_count = &prev->nivcsw;
|
|
|
|
|
|
|
|
release_kernel_lock(prev);
|
|
|
|
need_resched_nonpreemptible:
|
|
|
|
|
|
|
|
schedule_debug(prev);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
hrtick_clear(rq);
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
/*
|
|
|
|
* Do the rq-clock update outside the rq lock:
|
|
|
|
*/
|
|
|
|
local_irq_disable();
|
2007-08-09 11:16:47 +02:00
|
|
|
__update_rq_clock(rq);
|
2007-10-15 17:00:13 +02:00
|
|
|
spin_lock(&rq->lock);
|
|
|
|
clear_tsk_need_resched(prev);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
|
|
|
|
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
|
2007-07-09 18:51:59 +02:00
|
|
|
unlikely(signal_pending(prev)))) {
|
2005-04-17 00:20:36 +02:00
|
|
|
prev->state = TASK_RUNNING;
|
2007-07-09 18:51:59 +02:00
|
|
|
} else {
|
2007-08-09 11:16:49 +02:00
|
|
|
deactivate_task(rq, prev, 1);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
switch_count = &prev->nvcsw;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:22 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
if (prev->sched_class->pre_schedule)
|
|
|
|
prev->sched_class->pre_schedule(rq, prev);
|
|
|
|
#endif
|
2008-01-25 21:08:07 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (unlikely(!rq->nr_running))
|
2005-04-17 00:20:36 +02:00
|
|
|
idle_balance(cpu, rq);
|
|
|
|
|
2007-08-09 11:16:49 +02:00
|
|
|
prev->sched_class->put_prev_task(rq, prev);
|
2007-08-09 11:16:49 +02:00
|
|
|
next = pick_next_task(rq, prev);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
sched_info_switch(prev, next);
|
2007-07-09 18:51:59 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
if (likely(prev != next)) {
|
|
|
|
rq->nr_switches++;
|
|
|
|
rq->curr = next;
|
|
|
|
++*switch_count;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
context_switch(rq, prev, next); /* unlocks the rq */
|
2008-01-25 21:08:29 +01:00
|
|
|
/*
|
|
|
|
* the context switch might have flipped the stack from under
|
|
|
|
* us, hence refresh the local variables.
|
|
|
|
*/
|
|
|
|
cpu = smp_processor_id();
|
|
|
|
rq = cpu_rq(cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
} else
|
|
|
|
spin_unlock_irq(&rq->lock);
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
hrtick_set(rq);
|
|
|
|
|
|
|
|
if (unlikely(reacquire_kernel_lock(current) < 0))
|
2005-04-17 00:20:36 +02:00
|
|
|
goto need_resched_nonpreemptible;
|
2008-01-25 21:08:29 +01:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
preempt_enable_no_resched();
|
|
|
|
if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
|
|
|
|
goto need_resched;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(schedule);
|
|
|
|
|
|
|
|
#ifdef CONFIG_PREEMPT
|
|
|
|
/*
|
2006-07-10 13:43:52 +02:00
|
|
|
* this is the entry point to schedule() from in-kernel preemption
|
2007-12-05 15:46:09 +01:00
|
|
|
* off of preempt_enable. Kernel preemptions off return from interrupt
|
2005-04-17 00:20:36 +02:00
|
|
|
* occur there and call schedule directly.
|
|
|
|
*/
|
|
|
|
asmlinkage void __sched preempt_schedule(void)
|
|
|
|
{
|
|
|
|
struct thread_info *ti = current_thread_info();
|
|
|
|
struct task_struct *task = current;
|
|
|
|
int saved_lock_depth;
|
2008-01-25 21:08:33 +01:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* If there is a non-zero preempt_count or interrupts are disabled,
|
2007-12-05 15:46:09 +01:00
|
|
|
* we do not want to preempt the current task. Just return..
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2006-10-11 10:21:52 +02:00
|
|
|
if (likely(ti->preempt_count || irqs_disabled()))
|
2005-04-17 00:20:36 +02:00
|
|
|
return;
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
do {
|
|
|
|
add_preempt_count(PREEMPT_ACTIVE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We keep the big kernel semaphore locked, but we
|
|
|
|
* clear ->lock_depth so that schedule() doesnt
|
|
|
|
* auto-release the semaphore:
|
|
|
|
*/
|
|
|
|
saved_lock_depth = task->lock_depth;
|
|
|
|
task->lock_depth = -1;
|
|
|
|
schedule();
|
|
|
|
task->lock_depth = saved_lock_depth;
|
|
|
|
sub_preempt_count(PREEMPT_ACTIVE);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Check again in case we missed a preemption opportunity
|
|
|
|
* between schedule and now.
|
|
|
|
*/
|
|
|
|
barrier();
|
|
|
|
} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(preempt_schedule);
|
|
|
|
|
|
|
|
/*
|
2006-07-10 13:43:52 +02:00
|
|
|
* this is the entry point to schedule() from kernel preemption
|
2005-04-17 00:20:36 +02:00
|
|
|
* off of irq context.
|
|
|
|
* Note, that this is called and return with irqs disabled. This will
|
|
|
|
* protect us against recursive calling from irq.
|
|
|
|
*/
|
|
|
|
asmlinkage void __sched preempt_schedule_irq(void)
|
|
|
|
{
|
|
|
|
struct thread_info *ti = current_thread_info();
|
|
|
|
struct task_struct *task = current;
|
|
|
|
int saved_lock_depth;
|
2008-01-25 21:08:33 +01:00
|
|
|
|
2006-07-10 13:43:52 +02:00
|
|
|
/* Catch callers which need to be fixed */
|
2005-04-17 00:20:36 +02:00
|
|
|
BUG_ON(ti->preempt_count || !irqs_disabled());
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
do {
|
|
|
|
add_preempt_count(PREEMPT_ACTIVE);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We keep the big kernel semaphore locked, but we
|
|
|
|
* clear ->lock_depth so that schedule() doesnt
|
|
|
|
* auto-release the semaphore:
|
|
|
|
*/
|
|
|
|
saved_lock_depth = task->lock_depth;
|
|
|
|
task->lock_depth = -1;
|
|
|
|
local_irq_enable();
|
|
|
|
schedule();
|
|
|
|
local_irq_disable();
|
|
|
|
task->lock_depth = saved_lock_depth;
|
|
|
|
sub_preempt_count(PREEMPT_ACTIVE);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Check again in case we missed a preemption opportunity
|
|
|
|
* between schedule and now.
|
|
|
|
*/
|
|
|
|
barrier();
|
|
|
|
} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_PREEMPT */
|
|
|
|
|
2005-09-10 09:26:11 +02:00
|
|
|
/*
 * Wake function for a wait-queue entry: hands the entry's ->private
 * pointer to try_to_wake_up() and returns its result. @key is unused.
 */
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
			  void *key)
{
	struct task_struct *task = curr->private;

	return try_to_wake_up(task, mode, sync);
}
EXPORT_SYMBOL(default_wake_function);
|
|
|
|
|
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
|
|
|
|
* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
|
2005-04-17 00:20:36 +02:00
|
|
|
* number) then we wake all the non-exclusive tasks and one exclusive task.
|
|
|
|
*
|
|
|
|
* There are circumstances in which we can try to wake a task which has already
|
2007-12-05 15:46:09 +01:00
|
|
|
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
|
2005-04-17 00:20:36 +02:00
|
|
|
* zero in this (rare) case, and we handle it by continuing to scan the queue.
|
|
|
|
*/
|
|
|
|
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
|
|
|
|
int nr_exclusive, int sync, void *key)
|
|
|
|
{
|
2007-10-15 17:00:02 +02:00
|
|
|
wait_queue_t *curr, *next;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:02 +02:00
|
|
|
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
|
2006-07-03 09:25:40 +02:00
|
|
|
unsigned flags = curr->flags;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
if (curr->func(curr, mode, sync, key) &&
|
2006-07-03 09:25:40 +02:00
|
|
|
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
|
2005-04-17 00:20:36 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* __wake_up - wake up threads blocked on a waitqueue.
|
|
|
|
* @q: the waitqueue
|
|
|
|
* @mode: which threads
|
|
|
|
* @nr_exclusive: how many wake-one or wake-many threads to wake up
|
2005-05-01 17:59:26 +02:00
|
|
|
* @key: is directly passed to the wakeup function
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
|
2005-09-10 09:26:11 +02:00
|
|
|
int nr_exclusive, void *key)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&q->lock, flags);
|
|
|
|
__wake_up_common(q, mode, nr_exclusive, 0, key);
|
|
|
|
spin_unlock_irqrestore(&q->lock, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(__wake_up);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
|
|
|
|
*/
|
|
|
|
void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
|
|
|
|
{
|
|
|
|
__wake_up_common(q, mode, 1, 0, NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2005-05-01 17:59:26 +02:00
|
|
|
* __wake_up_sync - wake up threads blocked on a waitqueue.
|
2005-04-17 00:20:36 +02:00
|
|
|
* @q: the waitqueue
|
|
|
|
* @mode: which threads
|
|
|
|
* @nr_exclusive: how many wake-one or wake-many threads to wake up
|
|
|
|
*
|
|
|
|
* The sync wakeup differs that the waker knows that it will schedule
|
|
|
|
* away soon, so while the target thread will be woken up, it will not
|
|
|
|
* be migrated to another CPU - ie. the two threads are 'synchronized'
|
|
|
|
* with each other. This can prevent needless bouncing between CPUs.
|
|
|
|
*
|
|
|
|
* On UP it can prevent extra preemption.
|
|
|
|
*/
|
2005-09-10 09:26:11 +02:00
|
|
|
void fastcall
|
|
|
|
__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
int sync = 1;
|
|
|
|
|
|
|
|
if (unlikely(!q))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (unlikely(!nr_exclusive))
|
|
|
|
sync = 0;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&q->lock, flags);
|
|
|
|
__wake_up_common(q, mode, nr_exclusive, sync, NULL);
|
|
|
|
spin_unlock_irqrestore(&q->lock, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
void complete(struct completion *x)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&x->wait.lock, flags);
|
|
|
|
x->done++;
|
2007-12-06 17:07:07 +01:00
|
|
|
__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock_irqrestore(&x->wait.lock, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(complete);
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
void complete_all(struct completion *x)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&x->wait.lock, flags);
|
|
|
|
x->done += UINT_MAX/2;
|
2007-12-06 17:07:07 +01:00
|
|
|
__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock_irqrestore(&x->wait.lock, flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(complete_all);
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
static inline long __sched
|
|
|
|
do_wait_for_common(struct completion *x, long timeout, int state)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
if (!x->done) {
|
|
|
|
DECLARE_WAITQUEUE(wait, current);
|
|
|
|
|
|
|
|
wait.flags |= WQ_FLAG_EXCLUSIVE;
|
|
|
|
__add_wait_queue_tail(&x->wait, &wait);
|
|
|
|
do {
|
2007-12-06 18:29:54 +01:00
|
|
|
if ((state == TASK_INTERRUPTIBLE &&
|
|
|
|
signal_pending(current)) ||
|
|
|
|
(state == TASK_KILLABLE &&
|
|
|
|
fatal_signal_pending(current))) {
|
2007-10-15 17:00:14 +02:00
|
|
|
__remove_wait_queue(&x->wait, &wait);
|
|
|
|
return -ERESTARTSYS;
|
|
|
|
}
|
|
|
|
__set_current_state(state);
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock_irq(&x->wait.lock);
|
|
|
|
timeout = schedule_timeout(timeout);
|
|
|
|
spin_lock_irq(&x->wait.lock);
|
|
|
|
if (!timeout) {
|
|
|
|
__remove_wait_queue(&x->wait, &wait);
|
2007-10-15 17:00:14 +02:00
|
|
|
return timeout;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
} while (!x->done);
|
|
|
|
__remove_wait_queue(&x->wait, &wait);
|
|
|
|
}
|
|
|
|
x->done--;
|
|
|
|
return timeout;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
static long __sched
|
|
|
|
wait_for_common(struct completion *x, long timeout, int state)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
might_sleep();
|
|
|
|
|
|
|
|
spin_lock_irq(&x->wait.lock);
|
2007-10-15 17:00:14 +02:00
|
|
|
timeout = do_wait_for_common(x, timeout, state);
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock_irq(&x->wait.lock);
|
2007-10-15 17:00:14 +02:00
|
|
|
return timeout;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
/*
 * Wait for @x in TASK_UNINTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT as
 * the timeout. The result of the wait is discarded.
 */
void __sched wait_for_completion(struct completion *x)
{
	(void)wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
unsigned long __sched
|
2007-10-15 17:00:14 +02:00
|
|
|
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
EXPORT_SYMBOL(wait_for_completion_timeout);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
 * Wait for @x in TASK_INTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT as
 * the timeout. Returns -ERESTARTSYS if the wait reported that, and 0 for
 * every other outcome.
 */
int __sched wait_for_completion_interruptible(struct completion *x)
{
	long ret = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);

	return (ret == -ERESTARTSYS) ? ret : 0;
}
EXPORT_SYMBOL(wait_for_completion_interruptible);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
unsigned long __sched
|
2007-10-15 17:00:14 +02:00
|
|
|
wait_for_completion_interruptible_timeout(struct completion *x,
|
|
|
|
unsigned long timeout)
|
2007-07-09 18:52:01 +02:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
|
2007-07-09 18:52:01 +02:00
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-12-06 18:29:54 +01:00
|
|
|
/*
 * Wait for @x in TASK_KILLABLE state with MAX_SCHEDULE_TIMEOUT as the
 * timeout. Returns -ERESTARTSYS if the wait reported that, and 0 for
 * every other outcome.
 */
int __sched wait_for_completion_killable(struct completion *x)
{
	long ret = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);

	return (ret == -ERESTARTSYS) ? ret : 0;
}
EXPORT_SYMBOL(wait_for_completion_killable);
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
static long __sched
|
|
|
|
sleep_on_common(wait_queue_head_t *q, int state, long timeout)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-07-09 18:52:01 +02:00
|
|
|
unsigned long flags;
|
|
|
|
wait_queue_t wait;
|
|
|
|
|
|
|
|
init_waitqueue_entry(&wait, current);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
__set_current_state(state);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
spin_lock_irqsave(&q->lock, flags);
|
|
|
|
__add_wait_queue(q, &wait);
|
|
|
|
spin_unlock(&q->lock);
|
|
|
|
timeout = schedule_timeout(timeout);
|
|
|
|
spin_lock_irq(&q->lock);
|
|
|
|
__remove_wait_queue(q, &wait);
|
|
|
|
spin_unlock_irqrestore(&q->lock, flags);
|
|
|
|
|
|
|
|
return timeout;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Sleep on @q in TASK_INTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT as
 * the timeout; the remaining-time result is discarded.
 */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	(void)sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);
|
|
|
|
|
2007-07-09 18:52:01 +02:00
|
|
|
long __sched
|
2005-09-10 09:26:11 +02:00
|
|
|
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
|
|
|
|
|
2007-07-09 18:52:01 +02:00
|
|
|
/*
 * Sleep on @q in TASK_UNINTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT as
 * the timeout; the remaining-time result is discarded.
 */
void __sched sleep_on(wait_queue_head_t *q)
{
	(void)sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);
|
|
|
|
|
2007-07-09 18:52:01 +02:00
|
|
|
/*
 * Sleep on @q in TASK_UNINTERRUPTIBLE state for at most @timeout jiffies.
 * Returns the value sleep_on_common() got back from schedule_timeout().
 */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	long left = sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);

	return left;
}
EXPORT_SYMBOL(sleep_on_timeout);
|
|
|
|
|
2006-06-27 11:54:51 +02:00
|
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
|
|
|
|
|
|
/*
|
|
|
|
* rt_mutex_setprio - set the current priority of a task
|
|
|
|
* @p: task
|
|
|
|
* @prio: prio value (kernel-internal form)
|
|
|
|
*
|
|
|
|
* This function changes the 'effective' priority of a task. It does
|
|
|
|
* not touch ->normal_prio like __setscheduler().
|
|
|
|
*
|
|
|
|
* Used by the rt_mutex code to implement priority inheritance logic.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
void rt_mutex_setprio(struct task_struct *p, int prio)
|
2006-06-27 11:54:51 +02:00
|
|
|
{
|
|
|
|
unsigned long flags;
|
2007-10-15 17:00:08 +02:00
|
|
|
int oldprio, on_rq, running;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2008-01-25 21:08:22 +01:00
|
|
|
const struct sched_class *prev_class = p->sched_class;
|
2006-06-27 11:54:51 +02:00
|
|
|
|
|
|
|
BUG_ON(prio < 0 || prio > MAX_PRIO);
|
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
2007-08-09 11:16:47 +02:00
|
|
|
update_rq_clock(rq);
|
2006-06-27 11:54:51 +02:00
|
|
|
|
2007-05-09 05:27:06 +02:00
|
|
|
oldprio = p->prio;
|
2007-07-09 18:51:59 +02:00
|
|
|
on_rq = p->se.on_rq;
|
2007-12-18 15:21:13 +01:00
|
|
|
running = task_current(rq, p);
|
2007-10-15 17:00:08 +02:00
|
|
|
if (on_rq) {
|
2007-08-09 11:16:49 +02:00
|
|
|
dequeue_task(rq, p, 0);
|
2007-10-15 17:00:08 +02:00
|
|
|
if (running)
|
|
|
|
p->sched_class->put_prev_task(rq, p);
|
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
|
|
|
|
if (rt_prio(prio))
|
|
|
|
p->sched_class = &rt_sched_class;
|
|
|
|
else
|
|
|
|
p->sched_class = &fair_sched_class;
|
|
|
|
|
2006-06-27 11:54:51 +02:00
|
|
|
p->prio = prio;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (on_rq) {
|
2007-10-15 17:00:08 +02:00
|
|
|
if (running)
|
|
|
|
p->sched_class->set_curr_task(rq);
|
2008-01-25 21:08:22 +01:00
|
|
|
|
2007-08-09 11:16:49 +02:00
|
|
|
enqueue_task(rq, p, 0);
|
2008-01-25 21:08:22 +01:00
|
|
|
|
|
|
|
check_class_changed(rq, p, prev_class, oldprio, running);
|
2006-06-27 11:54:51 +02:00
|
|
|
}
|
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2006-07-03 09:25:41 +02:00
|
|
|
void set_user_nice(struct task_struct *p, long nice)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
int old_prio, delta, on_rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
|
|
|
|
return;
|
|
|
|
/*
|
|
|
|
* We have to be careful, if called from sys_setpriority(),
|
|
|
|
* the task might be in the middle of scheduling on another CPU.
|
|
|
|
*/
|
|
|
|
rq = task_rq_lock(p, &flags);
|
2007-08-09 11:16:47 +02:00
|
|
|
update_rq_clock(rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* The RT priorities are set via sched_setscheduler(), but we still
|
|
|
|
* allow the 'normal' nice value to be set - but as expected
|
|
|
|
* it wont have any effect on scheduling until the task is
|
2007-07-09 18:51:59 +02:00
|
|
|
* SCHED_FIFO/SCHED_RR:
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-07-09 18:51:59 +02:00
|
|
|
if (task_has_rt_policy(p)) {
|
2005-04-17 00:20:36 +02:00
|
|
|
p->static_prio = NICE_TO_PRIO(nice);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
on_rq = p->se.on_rq;
|
2008-01-25 21:08:00 +01:00
|
|
|
if (on_rq)
|
2007-08-09 11:16:49 +02:00
|
|
|
dequeue_task(rq, p, 0);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
p->static_prio = NICE_TO_PRIO(nice);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it finds ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_SCALE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it no longer scales down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when its nice
value changes and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups within the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calculations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalance as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
set_load_weight(p);
|
2006-06-27 11:54:51 +02:00
|
|
|
old_prio = p->prio;
|
|
|
|
p->prio = effective_prio(p);
|
|
|
|
delta = p->prio - old_prio;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (on_rq) {
|
2007-08-09 11:16:49 +02:00
|
|
|
enqueue_task(rq, p, 0);
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2007-05-09 05:27:06 +02:00
|
|
|
* If the task increased its priority or is running and
|
|
|
|
* lowered its priority, then reschedule its CPU:
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2007-05-09 05:27:06 +02:00
|
|
|
if (delta < 0 || (delta > 0 && task_running(rq, p)))
|
2005-04-17 00:20:36 +02:00
|
|
|
resched_task(rq->curr);
|
|
|
|
}
|
|
|
|
out_unlock:
|
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(set_user_nice);
|
|
|
|
|
2005-05-01 17:59:00 +02:00
|
|
|
/*
|
|
|
|
* can_nice - check if a task can reduce its nice value
|
|
|
|
* @p: task
|
|
|
|
* @nice: nice value
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
int can_nice(const struct task_struct *p, const int nice)
|
2005-05-01 17:59:00 +02:00
|
|
|
{
|
2005-08-18 20:24:19 +02:00
|
|
|
/* convert nice value [19,-20] to rlimit style value [1,40] */
|
|
|
|
int nice_rlim = 20 - nice;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-05-01 17:59:00 +02:00
|
|
|
return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
|
|
|
|
capable(CAP_SYS_NICE));
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * The increment is clamped to [-40, 40] and the resulting nice value to
 * [-20, 19]. Returns 0 on success, -EPERM when a lower nice value is
 * requested and can_nice() refuses it, or the error reported by
 * security_task_setnice().
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
asmlinkage long sys_nice(int increment)
{
	long nice, retval;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	if (increment < -40)
		increment = -40;
	else if (increment > 40)
		increment = 40;

	nice = PRIO_TO_NICE(current->static_prio) + increment;
	if (nice < -20)
		nice = -20;
	else if (nice > 19)
		nice = 19;

	/* only a request for a lower nice value needs a permission check */
	if (increment < 0 && !can_nice(current, nice))
		return -EPERM;

	retval = security_task_setnice(current, nice);
	if (!retval)
		set_user_nice(current, nice);

	return retval;
}

#endif
|
|
|
|
|
|
|
|
/**
|
|
|
|
* task_prio - return the priority value of a given task.
|
|
|
|
* @p: the task in question.
|
|
|
|
*
|
|
|
|
* This is the priority value as seen by users in /proc.
|
|
|
|
* RT tasks are offset by -200. Normal tasks are centered
|
|
|
|
* around 0, value goes from -16 to +15.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
int task_prio(const struct task_struct *p)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
return p->prio - MAX_RT_PRIO;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Thin exported wrapper around the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL_GPL(task_nice);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* idle_cpu - is a given cpu idle currently?
|
|
|
|
* @cpu: the processor in question.
|
|
|
|
*/
|
|
|
|
int idle_cpu(int cpu)
|
|
|
|
{
|
|
|
|
return cpu_curr(cpu) == cpu_rq(cpu)->idle;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* idle_task - return the idle task for a given cpu.
|
|
|
|
* @cpu: the processor in question.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *idle_task(int cpu)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
return cpu_rq(cpu)->idle;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* find_process_by_pid - find a process with a matching PID value.
|
|
|
|
* @pid: the pid in question.
|
|
|
|
*/
|
2007-10-15 17:00:13 +02:00
|
|
|
static struct task_struct *find_process_by_pid(pid_t pid)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-10-19 08:40:16 +02:00
|
|
|
return pid ? find_task_by_vpid(pid) : current;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Actually do priority change: must hold rq lock. */
|
2007-07-09 18:51:59 +02:00
|
|
|
static void
|
|
|
|
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-07-09 18:51:59 +02:00
|
|
|
BUG_ON(p->se.on_rq);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
p->policy = policy;
|
2007-07-09 18:51:59 +02:00
|
|
|
switch (p->policy) {
|
|
|
|
case SCHED_NORMAL:
|
|
|
|
case SCHED_BATCH:
|
|
|
|
case SCHED_IDLE:
|
|
|
|
p->sched_class = &fair_sched_class;
|
|
|
|
break;
|
|
|
|
case SCHED_FIFO:
|
|
|
|
case SCHED_RR:
|
|
|
|
p->sched_class = &rt_sched_class;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
p->rt_priority = prio;
|
2006-06-27 11:54:51 +02:00
|
|
|
p->normal_prio = normal_prio(p);
|
|
|
|
/* we are holding p->pi_lock already */
|
|
|
|
p->prio = rt_mutex_getprio(p);
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calcuations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalanace as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:34 +02:00
|
|
|
set_load_weight(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2007-02-10 10:45:59 +01:00
|
|
|
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
|
2005-04-17 00:20:36 +02:00
|
|
|
* @p: the task in question.
|
|
|
|
* @policy: new policy.
|
|
|
|
* @param: structure containing the new RT priority.
|
2006-09-29 11:00:48 +02:00
|
|
|
*
|
2007-02-10 10:45:59 +01:00
|
|
|
* NOTE that the task may be already dead.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2005-09-10 09:26:11 +02:00
|
|
|
int sched_setscheduler(struct task_struct *p, int policy,
|
|
|
|
struct sched_param *param)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-10-15 17:00:08 +02:00
|
|
|
int retval, oldprio, oldpolicy = -1, on_rq, running;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
2008-01-25 21:08:22 +01:00
|
|
|
const struct sched_class *prev_class = p->sched_class;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-06-27 11:54:44 +02:00
|
|
|
/* may grab non-irq protected spin_locks */
|
|
|
|
BUG_ON(in_interrupt());
|
2005-04-17 00:20:36 +02:00
|
|
|
recheck:
|
|
|
|
/* double check policy once rq lock held */
|
|
|
|
if (policy < 0)
|
|
|
|
policy = oldpolicy = p->policy;
|
|
|
|
else if (policy != SCHED_FIFO && policy != SCHED_RR &&
|
2007-07-09 18:51:59 +02:00
|
|
|
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
|
|
|
|
policy != SCHED_IDLE)
|
2006-01-14 22:20:41 +01:00
|
|
|
return -EINVAL;
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* Valid priorities for SCHED_FIFO and SCHED_RR are
|
2007-07-09 18:51:59 +02:00
|
|
|
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
|
|
|
|
* SCHED_BATCH and SCHED_IDLE is 0.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
if (param->sched_priority < 0 ||
|
2005-09-10 09:26:11 +02:00
|
|
|
(p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
|
2005-07-25 22:28:39 +02:00
|
|
|
(!p->mm && param->sched_priority > MAX_RT_PRIO-1))
|
2005-04-17 00:20:36 +02:00
|
|
|
return -EINVAL;
|
2007-07-09 18:51:59 +02:00
|
|
|
if (rt_policy(policy) != (param->sched_priority != 0))
|
2005-04-17 00:20:36 +02:00
|
|
|
return -EINVAL;
|
|
|
|
|
[PATCH] Changing RT priority without CAP_SYS_NICE
Presently, a process without the capability CAP_SYS_NICE can not change
its own policy, which is OK.
But it can also not decrease its RT priority (if scheduled with policy
SCHED_RR or SCHED_FIFO), which is what this patch changes.
The rationale is the same as for the nice value: a process should be
able to require less priority for itself. Increasing the priority is
still not allowed.
This is for example useful if you give a multithreaded user process a RT
priority, and the process would like to organize its internal threads
using priorities also. Then you can give the process the highest
priority needed N, and the process starts its threads with lower
priorities: N-1, N-2...
The POSIX norm says that the permissions are implementation specific, so
I think we can do that.
In a sense, it makes the permissions consistent whatever the policy is:
with this patch, process scheduled by SCHED_FIFO, SCHED_RR and
SCHED_OTHER can all decrease their priority.
From: Ingo Molnar <mingo@elte.hu>
cleaned up and merged to -mm.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 23:57:32 +02:00
|
|
|
/*
|
|
|
|
* Allow unprivileged RT tasks to decrease priority:
|
|
|
|
*/
|
|
|
|
if (!capable(CAP_SYS_NICE)) {
|
2007-07-09 18:51:59 +02:00
|
|
|
if (rt_policy(policy)) {
|
2006-09-29 11:00:50 +02:00
|
|
|
unsigned long rlim_rtprio;
|
|
|
|
|
|
|
|
if (!lock_task_sighand(p, &flags))
|
|
|
|
return -ESRCH;
|
|
|
|
rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
|
|
|
|
unlock_task_sighand(p, &flags);
|
|
|
|
|
|
|
|
/* can't set/change the rt policy */
|
|
|
|
if (policy != p->policy && !rlim_rtprio)
|
|
|
|
return -EPERM;
|
|
|
|
|
|
|
|
/* can't increase priority */
|
|
|
|
if (param->sched_priority > p->rt_priority &&
|
|
|
|
param->sched_priority > rlim_rtprio)
|
|
|
|
return -EPERM;
|
|
|
|
}
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* Like positive nice levels, dont allow tasks to
|
|
|
|
* move out of SCHED_IDLE either:
|
|
|
|
*/
|
|
|
|
if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
|
|
|
|
return -EPERM;
|
2006-09-29 11:00:48 +02:00
|
|
|
|
[PATCH] Changing RT priority without CAP_SYS_NICE
Presently, a process without the capability CAP_SYS_NICE can not change
its own policy, which is OK.
But it can also not decrease its RT priority (if scheduled with policy
SCHED_RR or SCHED_FIFO), which is what this patch changes.
The rationale is the same as for the nice value: a process should be
able to require less priority for itself. Increasing the priority is
still not allowed.
This is for example useful if you give a multithreaded user process a RT
priority, and the process would like to organize its internal threads
using priorities also. Then you can give the process the highest
priority needed N, and the process starts its threads with lower
priorities: N-1, N-2...
The POSIX norm says that the permissions are implementation specific, so
I think we can do that.
In a sense, it makes the permissions consistent whatever the policy is:
with this patch, process scheduled by SCHED_FIFO, SCHED_RR and
SCHED_OTHER can all decrease their priority.
From: Ingo Molnar <mingo@elte.hu>
cleaned up and merged to -mm.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 23:57:32 +02:00
|
|
|
/* can't change other user's priorities */
|
|
|
|
if ((current->euid != p->euid) &&
|
|
|
|
(current->euid != p->uid))
|
|
|
|
return -EPERM;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
retval = security_task_setscheduler(p, policy, param);
|
|
|
|
if (retval)
|
|
|
|
return retval;
|
2006-06-27 11:54:51 +02:00
|
|
|
/*
|
|
|
|
* make sure no PI-waiters arrive (or leave) while we are
|
|
|
|
* changing the priority of the task:
|
|
|
|
*/
|
|
|
|
spin_lock_irqsave(&p->pi_lock, flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* To be able to change p->policy safely, the apropriate
|
|
|
|
* runqueue lock must be held.
|
|
|
|
*/
|
2006-06-27 11:54:51 +02:00
|
|
|
rq = __task_rq_lock(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
/* recheck policy now with rq lock held */
|
|
|
|
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
|
|
|
|
policy = oldpolicy = -1;
|
2006-06-27 11:54:51 +02:00
|
|
|
__task_rq_unlock(rq);
|
|
|
|
spin_unlock_irqrestore(&p->pi_lock, flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
goto recheck;
|
|
|
|
}
|
2007-08-09 11:16:51 +02:00
|
|
|
update_rq_clock(rq);
|
2007-07-09 18:51:59 +02:00
|
|
|
on_rq = p->se.on_rq;
|
2007-12-18 15:21:13 +01:00
|
|
|
running = task_current(rq, p);
|
2007-10-15 17:00:08 +02:00
|
|
|
if (on_rq) {
|
2007-08-09 11:16:49 +02:00
|
|
|
deactivate_task(rq, p, 0);
|
2007-10-15 17:00:08 +02:00
|
|
|
if (running)
|
|
|
|
p->sched_class->put_prev_task(rq, p);
|
|
|
|
}
|
2007-10-15 17:00:08 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
oldprio = p->prio;
|
2007-07-09 18:51:59 +02:00
|
|
|
__setscheduler(rq, p, policy, param->sched_priority);
|
2007-10-15 17:00:08 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
if (on_rq) {
|
2007-10-15 17:00:08 +02:00
|
|
|
if (running)
|
|
|
|
p->sched_class->set_curr_task(rq);
|
2008-01-25 21:08:22 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
activate_task(rq, p, 0);
|
2008-01-25 21:08:22 +01:00
|
|
|
|
|
|
|
check_class_changed(rq, p, prev_class, oldprio, running);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-06-27 11:54:51 +02:00
|
|
|
__task_rq_unlock(rq);
|
|
|
|
spin_unlock_irqrestore(&p->pi_lock, flags);
|
|
|
|
|
2006-06-27 11:55:02 +02:00
|
|
|
rt_mutex_adjust_pi(p);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(sched_setscheduler);
|
|
|
|
|
2005-09-10 09:26:11 +02:00
|
|
|
static int
|
|
|
|
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
struct sched_param lparam;
|
|
|
|
struct task_struct *p;
|
2006-07-03 09:25:41 +02:00
|
|
|
int retval;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
if (!param || pid < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
|
|
|
|
return -EFAULT;
|
2006-09-29 11:00:48 +02:00
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
retval = -ESRCH;
|
2005-04-17 00:20:36 +02:00
|
|
|
p = find_process_by_pid(pid);
|
2006-09-29 11:00:48 +02:00
|
|
|
if (p != NULL)
|
|
|
|
retval = sched_setscheduler(p, policy, &lparam);
|
|
|
|
rcu_read_unlock();
|
2006-07-03 09:25:41 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
|
|
|
|
* @pid: the pid in question.
|
|
|
|
* @policy: new policy.
|
|
|
|
* @param: structure containing the new RT priority.
|
|
|
|
*/
|
2007-12-05 15:46:09 +01:00
|
|
|
asmlinkage long
|
|
|
|
sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-01-19 02:43:03 +01:00
|
|
|
/* negative values for policy are not valid */
|
|
|
|
if (policy < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return do_sched_setscheduler(pid, policy, param);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_setparam - set/change the RT priority of a thread
|
|
|
|
* @pid: the pid in question.
|
|
|
|
* @param: structure containing the new RT priority.
|
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
|
|
|
|
{
|
|
|
|
return do_sched_setscheduler(pid, -1, param);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_getscheduler - get the policy (scheduling class) of a thread
|
|
|
|
* @pid: the pid in question.
|
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_getscheduler(pid_t pid)
|
|
|
|
{
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *p;
|
2007-10-15 17:00:14 +02:00
|
|
|
int retval;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
if (pid < 0)
|
2007-10-15 17:00:14 +02:00
|
|
|
return -EINVAL;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
retval = -ESRCH;
|
|
|
|
read_lock(&tasklist_lock);
|
|
|
|
p = find_process_by_pid(pid);
|
|
|
|
if (p) {
|
|
|
|
retval = security_task_getscheduler(p);
|
|
|
|
if (!retval)
|
|
|
|
retval = p->policy;
|
|
|
|
}
|
|
|
|
read_unlock(&tasklist_lock);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_getscheduler - get the RT priority of a thread
|
|
|
|
* @pid: the pid in question.
|
|
|
|
* @param: structure containing the RT priority.
|
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
|
|
|
|
{
|
|
|
|
struct sched_param lp;
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *p;
|
2007-10-15 17:00:14 +02:00
|
|
|
int retval;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
if (!param || pid < 0)
|
2007-10-15 17:00:14 +02:00
|
|
|
return -EINVAL;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
read_lock(&tasklist_lock);
|
|
|
|
p = find_process_by_pid(pid);
|
|
|
|
retval = -ESRCH;
|
|
|
|
if (!p)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
retval = security_task_getscheduler(p);
|
|
|
|
if (retval)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
lp.sched_priority = p->rt_priority;
|
|
|
|
read_unlock(&tasklist_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This one might sleep, we cannot do it with a spinlock held ...
|
|
|
|
*/
|
|
|
|
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
|
|
|
|
|
|
|
|
return retval;
|
|
|
|
|
|
|
|
out_unlock:
|
|
|
|
read_unlock(&tasklist_lock);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * sched_setaffinity - restrict the task identified by @pid to @new_mask.
 *
 * The requested mask is intersected with the CPUs that
 * cpuset_cpus_allowed() reports for the task before it is applied.
 *
 * Returns 0 on success, -ESRCH when no task matches, -EPERM when the
 * caller neither matches the task's uid/euid nor passes
 * capable(CAP_SYS_NICE), or the error reported by
 * security_task_setscheduler() or set_cpus_allowed().
 */
long sched_setaffinity(pid_t pid, cpumask_t new_mask)
{
	cpumask_t cpus_allowed;
	struct task_struct *p;
	int retval;

	get_online_cpus();
	read_lock(&tasklist_lock);

	p = find_process_by_pid(pid);
	if (!p) {
		read_unlock(&tasklist_lock);
		put_online_cpus();
		return -ESRCH;
	}

	/*
	 * It is not safe to call set_cpus_allowed with the
	 * tasklist_lock held. We will bump the task_struct's
	 * usage count and then drop tasklist_lock.
	 */
	get_task_struct(p);
	read_unlock(&tasklist_lock);

	if (current->euid != p->euid && current->euid != p->uid &&
	    !capable(CAP_SYS_NICE)) {
		retval = -EPERM;
		goto out_put;
	}

	retval = security_task_setscheduler(p, 0, NULL);
	if (retval)
		goto out_put;

	cpus_allowed = cpuset_cpus_allowed(p);
	cpus_and(new_mask, new_mask, cpus_allowed);

	for (;;) {
		retval = set_cpus_allowed(p, new_mask);
		if (retval)
			break;

		cpus_allowed = cpuset_cpus_allowed(p);
		if (cpus_subset(new_mask, cpus_allowed))
			break;

		/*
		 * We must have raced with a concurrent cpuset
		 * update. Just reset the cpus_allowed to the
		 * cpuset's cpus_allowed and try again.
		 */
		new_mask = cpus_allowed;
	}

out_put:
	put_task_struct(p);
	put_online_cpus();
	return retval;
}
|
|
|
|
|
|
|
|
/*
 * Copy a cpu mask of @len bytes from user space into @new_mask.
 *
 * A user buffer shorter than cpumask_t leaves the remainder of @new_mask
 * zeroed; a longer one is truncated to sizeof(cpumask_t).
 *
 * Returns 0 on success or -EFAULT when the user copy fails.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     cpumask_t *new_mask)
{
	const size_t mask_size = sizeof(cpumask_t);

	if (len < mask_size)
		memset(new_mask, 0, mask_size);
	else if (len > mask_size)
		len = mask_size;

	if (copy_from_user(new_mask, user_mask_ptr, len))
		return -EFAULT;

	return 0;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_setaffinity - set the cpu affinity of a process
|
|
|
|
* @pid: pid of the process
|
|
|
|
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
|
|
|
|
* @user_mask_ptr: user-space pointer to the new cpu mask
|
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
|
|
|
|
unsigned long __user *user_mask_ptr)
|
|
|
|
{
|
|
|
|
cpumask_t new_mask;
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
|
|
|
|
if (retval)
|
|
|
|
return retval;
|
|
|
|
|
|
|
|
return sched_setaffinity(pid, new_mask);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Represents all cpu's present in the system
|
|
|
|
* In systems capable of hotplug, this map could dynamically grow
|
|
|
|
* as new cpu's are detected in the system via any platform specific
|
|
|
|
* method, such as ACPI for e.g.
|
|
|
|
*/
|
|
|
|
|
2006-01-11 22:44:57 +01:00
|
|
|
cpumask_t cpu_present_map __read_mostly;
|
2005-04-17 00:20:36 +02:00
|
|
|
EXPORT_SYMBOL(cpu_present_map);
|
|
|
|
|
|
|
|
#ifndef CONFIG_SMP
|
2006-01-11 22:44:57 +01:00
|
|
|
cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
|
2006-10-02 11:17:40 +02:00
|
|
|
EXPORT_SYMBOL(cpu_online_map);
|
|
|
|
|
2006-01-11 22:44:57 +01:00
|
|
|
cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
|
2006-10-02 11:17:40 +02:00
|
|
|
EXPORT_SYMBOL(cpu_possible_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Read the affinity mask of the task identified by @pid into @mask.
 *
 * The result is the task's cpus_allowed restricted to cpu_online_map.
 * The lookup runs between get_online_cpus()/put_online_cpus() and under
 * tasklist_lock held for reading.
 *
 * Returns 0 on success, -ESRCH if find_process_by_pid() finds no task,
 * or the non-zero value returned by security_task_getscheduler().
 * @mask is only written on success.
 */
long sched_getaffinity(pid_t pid, cpumask_t *mask)
{
	struct task_struct *task;
	int err = -ESRCH;

	get_online_cpus();
	read_lock(&tasklist_lock);

	task = find_process_by_pid(pid);
	if (task) {
		err = security_task_getscheduler(task);
		if (!err)
			cpus_and(*mask, task->cpus_allowed, cpu_online_map);
	}

	read_unlock(&tasklist_lock);
	put_online_cpus();

	return err;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_getaffinity - get the cpu affinity of a process
|
|
|
|
* @pid: pid of the process
|
|
|
|
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
|
|
|
|
* @user_mask_ptr: user-space pointer to hold the current cpu mask
|
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
|
|
|
|
unsigned long __user *user_mask_ptr)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
cpumask_t mask;
|
|
|
|
|
|
|
|
if (len < sizeof(cpumask_t))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
ret = sched_getaffinity(pid, &mask);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
return sizeof(cpumask_t);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_yield - yield the current processor to other threads.
|
|
|
|
*
|
2007-07-09 18:51:59 +02:00
|
|
|
* This function yields the current CPU to other tasks. If there are no
|
|
|
|
* other threads running on this CPU then this function will return.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_yield(void)
|
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = this_rq_lock();
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:12 +02:00
|
|
|
schedstat_inc(rq, yld_count);
|
2007-10-15 17:00:08 +02:00
|
|
|
current->sched_class->yield_task(rq);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Since we are going to call schedule() anyway, there's
|
|
|
|
* no need to preempt or enable interrupts:
|
|
|
|
*/
|
|
|
|
__release(rq->lock);
|
2006-07-03 09:24:54 +02:00
|
|
|
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
|
2005-04-17 00:20:36 +02:00
|
|
|
_raw_spin_unlock(&rq->lock);
|
|
|
|
preempt_enable_no_resched();
|
|
|
|
|
|
|
|
schedule();
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2006-06-30 10:56:00 +02:00
|
|
|
static void __cond_resched(void)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-06-23 11:05:23 +02:00
|
|
|
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
|
|
|
|
__might_sleep(__FILE__, __LINE__);
|
|
|
|
#endif
|
2005-07-08 02:57:04 +02:00
|
|
|
/*
|
|
|
|
* The BKS might be reacquired before we have dropped
|
|
|
|
* PREEMPT_ACTIVE, which could trigger a second
|
|
|
|
* cond_resched() call.
|
|
|
|
*/
|
2005-04-17 00:20:36 +02:00
|
|
|
do {
|
|
|
|
add_preempt_count(PREEMPT_ACTIVE);
|
|
|
|
schedule();
|
|
|
|
sub_preempt_count(PREEMPT_ACTIVE);
|
|
|
|
} while (need_resched());
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:28 +01:00
|
|
|
#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
|
|
|
|
int __sched _cond_resched(void)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-12-30 01:48:13 +01:00
|
|
|
if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
|
|
|
|
system_state == SYSTEM_RUNNING) {
|
2005-04-17 00:20:36 +02:00
|
|
|
__cond_resched();
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2008-01-25 21:08:28 +01:00
|
|
|
EXPORT_SYMBOL(_cond_resched);
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* cond_resched_lock() - if a reschedule is pending, drop the given lock,
|
|
|
|
* call schedule, and on return reacquire the lock.
|
|
|
|
*
|
2007-12-05 15:46:09 +01:00
|
|
|
* This works OK both with and without CONFIG_PREEMPT. We do strange low-level
|
2005-04-17 00:20:36 +02:00
|
|
|
* operations here to prevent schedule() from being called twice (once via
|
|
|
|
* spin_unlock(), once by hand).
|
|
|
|
*/
|
2005-09-10 09:26:11 +02:00
|
|
|
int cond_resched_lock(spinlock_t *lock)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2008-01-30 13:31:20 +01:00
|
|
|
int resched = need_resched() && system_state == SYSTEM_RUNNING;
|
2005-06-14 00:52:32 +02:00
|
|
|
int ret = 0;
|
|
|
|
|
2008-01-30 13:31:20 +01:00
|
|
|
if (spin_needbreak(lock) || resched) {
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock(lock);
|
2008-01-30 13:31:20 +01:00
|
|
|
if (resched && need_resched())
|
|
|
|
__cond_resched();
|
|
|
|
else
|
|
|
|
cpu_relax();
|
2005-06-14 00:52:32 +02:00
|
|
|
ret = 1;
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_lock(lock);
|
|
|
|
}
|
2005-06-14 00:52:32 +02:00
|
|
|
return ret;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(cond_resched_lock);
|
|
|
|
|
|
|
|
/*
 * Conditional reschedule for callers running with softirqs disabled.
 *
 * BUGs if in_softirq() is false.  When a reschedule is pending and the
 * system is in SYSTEM_RUNNING, bottom halves are enabled around the
 * call to __cond_resched() and disabled again afterwards.
 *
 * Returns 1 if a reschedule was performed, 0 otherwise.
 */
int __sched cond_resched_softirq(void)
{
	BUG_ON(!in_softirq());

	if (!need_resched() || system_state != SYSTEM_RUNNING)
		return 0;

	local_bh_enable();
	__cond_resched();
	local_bh_disable();

	return 1;
}
EXPORT_SYMBOL(cond_resched_softirq);
|
|
|
|
|
|
|
|
/**
|
|
|
|
* yield - yield the current processor to other threads.
|
|
|
|
*
|
2007-02-10 10:45:59 +01:00
|
|
|
* This is a shortcut for kernel-space yielding - it marks the
|
2005-04-17 00:20:36 +02:00
|
|
|
* thread runnable and calls sys_sched_yield().
|
|
|
|
*/
|
|
|
|
void __sched yield(void)
|
|
|
|
{
|
|
|
|
set_current_state(TASK_RUNNING);
|
|
|
|
sys_sched_yield();
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(yield);
|
|
|
|
|
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
|
2005-04-17 00:20:36 +02:00
|
|
|
* that process accounting knows that this is a task in IO wait state.
|
|
|
|
*
|
|
|
|
* But don't do that if it is a deliberate, throttling IO wait (this task
|
|
|
|
* has set its backing_dev_info: the queue against which it should throttle)
|
|
|
|
*/
|
|
|
|
void __sched io_schedule(void)
|
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = &__raw_get_cpu_var(runqueues);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-07-14 09:24:37 +02:00
|
|
|
delayacct_blkio_start();
|
2005-04-17 00:20:36 +02:00
|
|
|
atomic_inc(&rq->nr_iowait);
|
|
|
|
schedule();
|
|
|
|
atomic_dec(&rq->nr_iowait);
|
2006-07-14 09:24:37 +02:00
|
|
|
delayacct_blkio_end();
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(io_schedule);
|
|
|
|
|
|
|
|
/*
 * Same accounting as io_schedule(), but sleeps via schedule_timeout().
 *
 * @timeout is passed unchanged to schedule_timeout(), whose return
 * value is handed back to the caller.
 */
long __sched io_schedule_timeout(long timeout)
{
	struct rq *runq = &__raw_get_cpu_var(runqueues);
	long remaining;

	delayacct_blkio_start();

	atomic_inc(&runq->nr_iowait);
	remaining = schedule_timeout(timeout);
	atomic_dec(&runq->nr_iowait);

	delayacct_blkio_end();

	return remaining;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_get_priority_max - return maximum RT priority.
|
|
|
|
* @policy: scheduling class.
|
|
|
|
*
|
|
|
|
* this syscall returns the maximum rt_priority that can be used
|
|
|
|
* by a given scheduling class.
|
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_get_priority_max(int policy)
|
|
|
|
{
|
|
|
|
int ret = -EINVAL;
|
|
|
|
|
|
|
|
switch (policy) {
|
|
|
|
case SCHED_FIFO:
|
|
|
|
case SCHED_RR:
|
|
|
|
ret = MAX_USER_RT_PRIO-1;
|
|
|
|
break;
|
|
|
|
case SCHED_NORMAL:
|
2006-01-14 22:20:41 +01:00
|
|
|
case SCHED_BATCH:
|
2007-07-09 18:51:59 +02:00
|
|
|
case SCHED_IDLE:
|
2005-04-17 00:20:36 +02:00
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_get_priority_min - return minimum RT priority.
|
|
|
|
* @policy: scheduling class.
|
|
|
|
*
|
|
|
|
* this syscall returns the minimum rt_priority that can be used
|
|
|
|
* by a given scheduling class.
|
|
|
|
*/
|
|
|
|
asmlinkage long sys_sched_get_priority_min(int policy)
|
|
|
|
{
|
|
|
|
int ret = -EINVAL;
|
|
|
|
|
|
|
|
switch (policy) {
|
|
|
|
case SCHED_FIFO:
|
|
|
|
case SCHED_RR:
|
|
|
|
ret = 1;
|
|
|
|
break;
|
|
|
|
case SCHED_NORMAL:
|
2006-01-14 22:20:41 +01:00
|
|
|
case SCHED_BATCH:
|
2007-07-09 18:51:59 +02:00
|
|
|
case SCHED_IDLE:
|
2005-04-17 00:20:36 +02:00
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sys_sched_rr_get_interval - return the default timeslice of a process.
|
|
|
|
* @pid: pid of the process.
|
|
|
|
* @interval: userspace pointer to the timeslice value.
|
|
|
|
*
|
|
|
|
* this syscall writes the default timeslice value of a given process
|
|
|
|
* into the user-space timespec buffer. A value of '0' means infinity.
|
|
|
|
*/
|
|
|
|
asmlinkage
|
|
|
|
long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
|
|
|
|
{
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *p;
|
2007-10-15 17:00:13 +02:00
|
|
|
unsigned int time_slice;
|
2007-10-15 17:00:14 +02:00
|
|
|
int retval;
|
2005-04-17 00:20:36 +02:00
|
|
|
struct timespec t;
|
|
|
|
|
|
|
|
if (pid < 0)
|
2007-10-15 17:00:14 +02:00
|
|
|
return -EINVAL;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
retval = -ESRCH;
|
|
|
|
read_lock(&tasklist_lock);
|
|
|
|
p = find_process_by_pid(pid);
|
|
|
|
if (!p)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
retval = security_task_getscheduler(p);
|
|
|
|
if (retval)
|
|
|
|
goto out_unlock;
|
|
|
|
|
2007-12-04 17:04:39 +01:00
|
|
|
/*
|
|
|
|
* Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
|
|
|
|
* tasks that are on an otherwise idle runqueue:
|
|
|
|
*/
|
|
|
|
time_slice = 0;
|
|
|
|
if (p->policy == SCHED_RR) {
|
2007-10-15 17:00:13 +02:00
|
|
|
time_slice = DEF_TIMESLICE;
|
2007-12-04 17:04:39 +01:00
|
|
|
} else {
|
2007-10-15 17:00:13 +02:00
|
|
|
struct sched_entity *se = &p->se;
|
|
|
|
unsigned long flags;
|
|
|
|
struct rq *rq;
|
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
2007-12-04 17:04:39 +01:00
|
|
|
if (rq->cfs.load.weight)
|
|
|
|
time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
|
2007-10-15 17:00:13 +02:00
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
read_unlock(&tasklist_lock);
|
2007-10-15 17:00:13 +02:00
|
|
|
jiffies_to_timespec(time_slice, &t);
|
2005-04-17 00:20:36 +02:00
|
|
|
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
|
|
|
|
return retval;
|
2007-10-15 17:00:14 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
out_unlock:
|
|
|
|
read_unlock(&tasklist_lock);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2006-07-10 13:43:52 +02:00
|
|
|
static const char stat_nam[] = "RSDTtZX";
|
2006-07-03 09:25:41 +02:00
|
|
|
|
2008-01-25 21:08:02 +01:00
|
|
|
void sched_show_task(struct task_struct *p)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
unsigned long free = 0;
|
2006-07-03 09:25:41 +02:00
|
|
|
unsigned state;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
state = p->state ? __ffs(p->state) + 1 : 0;
|
2007-10-18 21:32:56 +02:00
|
|
|
printk(KERN_INFO "%-13.13s %c", p->comm,
|
2006-07-10 13:43:52 +02:00
|
|
|
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
|
2007-07-11 21:21:47 +02:00
|
|
|
#if BITS_PER_LONG == 32
|
2005-04-17 00:20:36 +02:00
|
|
|
if (state == TASK_RUNNING)
|
2007-10-18 21:32:56 +02:00
|
|
|
printk(KERN_CONT " running ");
|
2005-04-17 00:20:36 +02:00
|
|
|
else
|
2007-10-18 21:32:56 +02:00
|
|
|
printk(KERN_CONT " %08lx ", thread_saved_pc(p));
|
2005-04-17 00:20:36 +02:00
|
|
|
#else
|
|
|
|
if (state == TASK_RUNNING)
|
2007-10-18 21:32:56 +02:00
|
|
|
printk(KERN_CONT " running task ");
|
2005-04-17 00:20:36 +02:00
|
|
|
else
|
2007-10-18 21:32:56 +02:00
|
|
|
printk(KERN_CONT " %016lx ", thread_saved_pc(p));
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_DEBUG_STACK_USAGE
|
|
|
|
{
|
2005-11-14 01:06:56 +01:00
|
|
|
unsigned long *n = end_of_stack(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
while (!*n)
|
|
|
|
n++;
|
2005-11-14 01:06:56 +01:00
|
|
|
free = (unsigned long)n - (unsigned long)end_of_stack(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
#endif
|
2007-10-19 08:40:40 +02:00
|
|
|
printk(KERN_CONT "%5lu %5d %6d\n", free,
|
2008-01-09 09:03:23 +01:00
|
|
|
task_pid_nr(p), task_pid_nr(p->real_parent));
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2008-01-25 21:08:34 +01:00
|
|
|
show_stack(p, NULL);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2006-12-07 05:35:59 +01:00
|
|
|
void show_state_filter(unsigned long state_filter)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:41 +02:00
|
|
|
struct task_struct *g, *p;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-07-11 21:21:47 +02:00
|
|
|
#if BITS_PER_LONG == 32
|
|
|
|
printk(KERN_INFO
|
|
|
|
" task PC stack pid father\n");
|
2005-04-17 00:20:36 +02:00
|
|
|
#else
|
2007-07-11 21:21:47 +02:00
|
|
|
printk(KERN_INFO
|
|
|
|
" task PC stack pid father\n");
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
read_lock(&tasklist_lock);
|
|
|
|
do_each_thread(g, p) {
|
|
|
|
/*
|
|
|
|
* reset the NMI-timeout, listing all files on a slow
|
|
|
|
* console might take alot of time:
|
|
|
|
*/
|
|
|
|
touch_nmi_watchdog();
|
2007-04-26 05:50:03 +02:00
|
|
|
if (!state_filter || (p->state & state_filter))
|
2008-01-25 21:08:02 +01:00
|
|
|
sched_show_task(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
} while_each_thread(g, p);
|
|
|
|
|
2007-05-08 09:28:05 +02:00
|
|
|
touch_all_softlockup_watchdogs();
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
sysrq_sched_debug_show();
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
read_unlock(&tasklist_lock);
|
2006-12-07 05:35:59 +01:00
|
|
|
/*
|
|
|
|
* Only show locks if all tasks are dumped:
|
|
|
|
*/
|
|
|
|
if (state_filter == -1)
|
|
|
|
debug_show_all_locks();
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:58 +02:00
|
|
|
/* Put the given boot-time idle task into the idle scheduling class. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}
|
|
|
|
|
2005-06-28 16:40:42 +02:00
|
|
|
/**
|
|
|
|
* init_idle - set up an idle thread for a given CPU
|
|
|
|
* @idle: task in question
|
|
|
|
* @cpu: cpu the idle task belongs to
|
|
|
|
*
|
|
|
|
* NOTE: this function does not set the idle thread's NEED_RESCHED
|
|
|
|
* flag, to make booting more robust.
|
|
|
|
*/
|
2006-10-03 10:14:04 +02:00
|
|
|
void __cpuinit init_idle(struct task_struct *idle, int cpu)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
__sched_fork(idle);
|
|
|
|
idle->se.exec_start = sched_clock();
|
|
|
|
|
2006-06-27 11:54:51 +02:00
|
|
|
idle->prio = idle->normal_prio = MAX_PRIO;
|
2005-04-17 00:20:36 +02:00
|
|
|
idle->cpus_allowed = cpumask_of_cpu(cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
__set_task_cpu(idle, cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
rq->curr = rq->idle = idle;
|
2005-06-25 23:57:23 +02:00
|
|
|
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
|
|
|
|
idle->oncpu = 1;
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
|
|
|
|
/* Set the preempt count _outside_ the spinlocks! */
|
2005-11-14 01:06:55 +01:00
|
|
|
task_thread_info(idle)->preempt_count = 0;
|
2008-01-25 21:08:33 +01:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* The idle tasks have their own, simple scheduling class:
|
|
|
|
*/
|
|
|
|
idle->sched_class = &idle_sched_class;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In a system that switches off the HZ timer nohz_cpu_mask
|
|
|
|
* indicates which cpus entered this state. This is used
|
|
|
|
* in the rcu update to wait only for active cpus. For system
|
|
|
|
* which do not switch off the HZ timer nohz_cpu_mask should
|
|
|
|
* always be CPU_MASK_NONE.
|
|
|
|
*/
|
|
|
|
cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
|
|
|
|
|
2007-11-09 22:39:38 +01:00
|
|
|
/*
|
|
|
|
* Increase the granularity value when there are more CPUs,
|
|
|
|
* because with more CPUs the 'effective latency' as visible
|
|
|
|
* to users decreases. But the relationship is not linear,
|
|
|
|
* so pick a second-best guess by going with the log2 of the
|
|
|
|
* number of CPUs.
|
|
|
|
*
|
|
|
|
* This idea comes from the SD scheduler of Con Kolivas:
|
|
|
|
*/
|
|
|
|
static inline void sched_init_granularity(void)
|
|
|
|
{
|
|
|
|
unsigned int factor = 1 + ilog2(num_online_cpus());
|
|
|
|
const unsigned long limit = 200000000;
|
|
|
|
|
|
|
|
sysctl_sched_min_granularity *= factor;
|
|
|
|
if (sysctl_sched_min_granularity > limit)
|
|
|
|
sysctl_sched_min_granularity = limit;
|
|
|
|
|
|
|
|
sysctl_sched_latency *= factor;
|
|
|
|
if (sysctl_sched_latency > limit)
|
|
|
|
sysctl_sched_latency = limit;
|
|
|
|
|
|
|
|
sysctl_sched_wakeup_granularity *= factor;
|
|
|
|
sysctl_sched_batch_wakeup_granularity *= factor;
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
 * This is how migration works:
 *
 * 1) we queue a struct migration_req structure in the source CPU's
 *    runqueue and wake up that CPU's migration thread.
 * 2) we wait_for_completion() on the request => thread blocks.
 * 3) migration thread wakes up (implicitly it forces the migrated
 *    thread off the CPU)
 * 4) it gets the migration request and checks whether the migrated
 *    task is still in the wrong runqueue.
 * 5) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 6) migration thread complete()s the request.
 * 7) we wake up and the migration is done.
 */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Change a given task's CPU affinity. Migrate the thread to a
|
|
|
|
* proper CPU and schedule it away if the CPU it's executing on
|
|
|
|
* is removed from the allowed bitmask.
|
|
|
|
*
|
|
|
|
* NOTE: the caller must have a valid reference to the task, the
|
2007-12-05 15:46:09 +01:00
|
|
|
* task must not exit() & deallocate itself prematurely. The
|
2005-04-17 00:20:36 +02:00
|
|
|
* call is not atomic; no spinlocks may be held.
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct migration_req req;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2006-07-03 09:25:40 +02:00
|
|
|
int ret = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
rq = task_rq_lock(p, &flags);
|
|
|
|
if (!cpus_intersects(new_mask, cpu_online_map)) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:07 +01:00
|
|
|
if (p->sched_class->set_cpus_allowed)
|
|
|
|
p->sched_class->set_cpus_allowed(p, &new_mask);
|
|
|
|
else {
|
2008-01-25 21:08:19 +01:00
|
|
|
p->cpus_allowed = new_mask;
|
2008-01-25 21:08:30 +01:00
|
|
|
p->rt.nr_cpus_allowed = cpus_weight(new_mask);
|
2008-01-25 21:08:07 +01:00
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/* Can the task run on the task's current CPU? If so, we're done */
|
|
|
|
if (cpu_isset(task_cpu(p), new_mask))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (migrate_task(p, any_online_cpu(new_mask), &req)) {
|
|
|
|
/* Need help from migration thread: drop lock and wait. */
|
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
wake_up_process(rq->migration_thread);
|
|
|
|
wait_for_completion(&req.done);
|
|
|
|
tlb_migrate_finish(p->mm);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
out:
|
|
|
|
task_rq_unlock(rq, &flags);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(set_cpus_allowed);
|
|
|
|
|
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Move (not current) task off this cpu, onto dest cpu. We're doing
|
2005-04-17 00:20:36 +02:00
|
|
|
* this because either it can't run here any more (set_cpus_allowed()
|
|
|
|
* away from this CPU, or CPU going down), or because we're
|
|
|
|
* attempting to rebalance this task on exec (sched_exec).
|
|
|
|
*
|
|
|
|
* So we race with normal scheduler movements, but that's OK, as long
|
|
|
|
* as the task is no longer on this CPU.
|
2006-06-27 11:54:32 +02:00
|
|
|
*
|
|
|
|
* Returns non-zero if task was successfully migrated.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2006-06-27 11:54:32 +02:00
|
|
|
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq_dest, *rq_src;
|
2007-07-09 18:51:59 +02:00
|
|
|
int ret = 0, on_rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
if (unlikely(cpu_is_offline(dest_cpu)))
|
2006-06-27 11:54:32 +02:00
|
|
|
return ret;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
rq_src = cpu_rq(src_cpu);
|
|
|
|
rq_dest = cpu_rq(dest_cpu);
|
|
|
|
|
|
|
|
double_rq_lock(rq_src, rq_dest);
|
|
|
|
/* Already moved. */
|
|
|
|
if (task_cpu(p) != src_cpu)
|
|
|
|
goto out;
|
|
|
|
/* Affinity changed (again). */
|
|
|
|
if (!cpu_isset(dest_cpu, p->cpus_allowed))
|
|
|
|
goto out;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
on_rq = p->se.on_rq;
|
2007-08-09 11:16:51 +02:00
|
|
|
if (on_rq)
|
2007-08-09 11:16:49 +02:00
|
|
|
deactivate_task(rq_src, p, 0);
|
2007-08-09 11:16:51 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
set_task_cpu(p, dest_cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
if (on_rq) {
|
|
|
|
activate_task(rq_dest, p, 0);
|
|
|
|
check_preempt_curr(rq_dest, p);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-06-27 11:54:32 +02:00
|
|
|
ret = 1;
|
2005-04-17 00:20:36 +02:00
|
|
|
out:
|
|
|
|
double_rq_unlock(rq_src, rq_dest);
|
2006-06-27 11:54:32 +02:00
|
|
|
return ret;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* migration_thread - this is a highprio system thread that performs
|
|
|
|
* thread migration by bumping thread off CPU then 'pushing' onto
|
|
|
|
* another runqueue.
|
|
|
|
*/
|
2005-09-10 09:26:11 +02:00
|
|
|
static int migration_thread(void *data)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
int cpu = (long)data;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
rq = cpu_rq(cpu);
|
|
|
|
BUG_ON(rq->migration_thread != current);
|
|
|
|
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
while (!kthread_should_stop()) {
|
2006-07-03 09:25:42 +02:00
|
|
|
struct migration_req *req;
|
2005-04-17 00:20:36 +02:00
|
|
|
struct list_head *head;
|
|
|
|
|
|
|
|
spin_lock_irq(&rq->lock);
|
|
|
|
|
|
|
|
if (cpu_is_offline(cpu)) {
|
|
|
|
spin_unlock_irq(&rq->lock);
|
|
|
|
goto wait_to_die;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (rq->active_balance) {
|
|
|
|
active_load_balance(rq, cpu);
|
|
|
|
rq->active_balance = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
head = &rq->migration_queue;
|
|
|
|
|
|
|
|
if (list_empty(head)) {
|
|
|
|
spin_unlock_irq(&rq->lock);
|
|
|
|
schedule();
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
continue;
|
|
|
|
}
|
2006-07-03 09:25:42 +02:00
|
|
|
req = list_entry(head->next, struct migration_req, list);
|
2005-04-17 00:20:36 +02:00
|
|
|
list_del_init(head->next);
|
|
|
|
|
2005-06-25 23:57:27 +02:00
|
|
|
spin_unlock(&rq->lock);
|
|
|
|
__migrate_task(req->task, cpu, req->dest_cpu);
|
|
|
|
local_irq_enable();
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
complete(&req->done);
|
|
|
|
}
|
|
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
wait_to_die:
|
|
|
|
/* Wait for kthread_stop */
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
while (!kthread_should_stop()) {
|
|
|
|
schedule();
|
|
|
|
set_current_state(TASK_INTERRUPTIBLE);
|
|
|
|
}
|
|
|
|
__set_current_state(TASK_RUNNING);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
2007-10-17 08:30:56 +02:00
|
|
|
|
|
|
|
/*
 * Call __migrate_task() with local interrupts disabled, re-enabling
 * them afterwards.  Returns the result of __migrate_task().
 */
static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
{
	int migrated;

	local_irq_disable();
	migrated = __migrate_task(p, src_cpu, dest_cpu);
	local_irq_enable();

	return migrated;
}
|
|
|
|
|
2006-12-10 11:20:11 +01:00
|
|
|
/*
|
2007-10-19 23:10:43 +02:00
|
|
|
* Figure out where task on dead CPU should go, use force if necessary.
|
2006-12-10 11:20:11 +01:00
|
|
|
* NOTE: interrupts should be disabled by the caller
|
|
|
|
*/
|
2006-07-03 09:25:40 +02:00
|
|
|
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-06-27 11:54:32 +02:00
|
|
|
unsigned long flags;
|
2005-04-17 00:20:36 +02:00
|
|
|
cpumask_t mask;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
|
|
|
int dest_cpu;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
do {
|
|
|
|
/* On same node? */
|
|
|
|
mask = node_to_cpumask(cpu_to_node(dead_cpu));
|
|
|
|
cpus_and(mask, mask, p->cpus_allowed);
|
|
|
|
dest_cpu = any_online_cpu(mask);
|
|
|
|
|
|
|
|
/* On any allowed CPU? */
|
|
|
|
if (dest_cpu == NR_CPUS)
|
|
|
|
dest_cpu = any_online_cpu(p->cpus_allowed);
|
|
|
|
|
|
|
|
/* No more Mr. Nice Guy. */
|
|
|
|
if (dest_cpu == NR_CPUS) {
|
2007-10-19 08:40:46 +02:00
|
|
|
cpumask_t cpus_allowed = cpuset_cpus_allowed_locked(p);
|
|
|
|
/*
|
|
|
|
* Try to stay on the same cpuset, where the
|
|
|
|
* current cpuset may be a subset of all cpus.
|
|
|
|
* The cpuset_cpus_allowed_locked() variant of
|
2007-12-05 15:46:09 +01:00
|
|
|
* cpuset_cpus_allowed() will not block. It must be
|
2007-10-19 08:40:46 +02:00
|
|
|
* called within calls to cpuset_lock/cpuset_unlock.
|
|
|
|
*/
|
2007-10-15 17:00:14 +02:00
|
|
|
rq = task_rq_lock(p, &flags);
|
2007-10-19 08:40:46 +02:00
|
|
|
p->cpus_allowed = cpus_allowed;
|
2007-10-15 17:00:14 +02:00
|
|
|
dest_cpu = any_online_cpu(p->cpus_allowed);
|
|
|
|
task_rq_unlock(rq, &flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
/*
|
|
|
|
* Don't tell them about moving exiting tasks or
|
|
|
|
* kernel threads (both mm NULL), since they never
|
|
|
|
* leave kernel.
|
|
|
|
*/
|
2007-12-05 15:46:09 +01:00
|
|
|
if (p->mm && printk_ratelimit()) {
|
2007-10-15 17:00:14 +02:00
|
|
|
printk(KERN_INFO "process %d (%s) no "
|
|
|
|
"longer affine to cpu%d\n",
|
2007-12-05 15:46:09 +01:00
|
|
|
task_pid_nr(p), p->comm, dead_cpu);
|
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
}
|
2007-10-17 08:30:56 +02:00
|
|
|
} while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* While a dead CPU has no uninterruptible tasks queued at this point,
|
|
|
|
* it might still have a nonzero ->nr_uninterruptible counter, because
|
|
|
|
* for performance reasons the counter is not stricly tracking tasks to
|
|
|
|
* their home CPUs. So we just add the counter to another CPU's counter,
|
|
|
|
* to keep the global sum constant after CPU-down:
|
|
|
|
*/
|
2006-07-03 09:25:42 +02:00
|
|
|
static void migrate_nr_uninterruptible(struct rq *rq_src)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
local_irq_save(flags);
|
|
|
|
double_rq_lock(rq_src, rq_dest);
|
|
|
|
rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
|
|
|
|
rq_src->nr_uninterruptible = 0;
|
|
|
|
double_rq_unlock(rq_src, rq_dest);
|
|
|
|
local_irq_restore(flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Run through task list and migrate tasks from the dead cpu. */
|
|
|
|
static void migrate_live_tasks(int src_cpu)
|
|
|
|
{
|
2006-07-03 09:25:40 +02:00
|
|
|
struct task_struct *p, *t;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-17 08:30:56 +02:00
|
|
|
read_lock(&tasklist_lock);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
do_each_thread(t, p) {
|
|
|
|
if (p == current)
|
2005-04-17 00:20:36 +02:00
|
|
|
continue;
|
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
if (task_cpu(p) == src_cpu)
|
|
|
|
move_task_off_dead_cpu(src_cpu, p);
|
|
|
|
} while_each_thread(t, p);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-17 08:30:56 +02:00
|
|
|
read_unlock(&tasklist_lock);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
/*
|
|
|
|
* Schedules idle task to be the next runnable task on current CPU.
|
2007-11-15 20:57:40 +01:00
|
|
|
* It does so by boosting its priority to highest possible.
|
|
|
|
* Used by CPU offline code.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
void sched_idle_next(void)
|
|
|
|
{
|
2006-07-03 09:25:40 +02:00
|
|
|
int this_cpu = smp_processor_id();
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(this_cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
struct task_struct *p = rq->idle;
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
/* cpu has to be offline */
|
2006-07-03 09:25:40 +02:00
|
|
|
BUG_ON(cpu_online(this_cpu));
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
/*
|
|
|
|
* Strictly not necessary since rest of the CPUs are stopped by now
|
|
|
|
* and interrupts disabled on the current cpu.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
|
|
|
spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2007-11-15 20:57:40 +01:00
|
|
|
update_rq_clock(rq);
|
|
|
|
activate_task(rq, p, 0);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
}
|
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
/*
|
|
|
|
* Ensures that the idle task is using init_mm right before its cpu goes
|
2005-04-17 00:20:36 +02:00
|
|
|
* offline.
|
|
|
|
*/
|
|
|
|
void idle_task_exit(void)
|
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->active_mm;
|
|
|
|
|
|
|
|
BUG_ON(cpu_online(smp_processor_id()));
|
|
|
|
|
|
|
|
if (mm != &init_mm)
|
|
|
|
switch_mm(mm, &init_mm, current);
|
|
|
|
mmdrop(mm);
|
|
|
|
}
|
|
|
|
|
2006-12-10 11:20:11 +01:00
|
|
|
/* called under rq->lock with disabled interrupts */
|
2006-07-03 09:25:41 +02:00
|
|
|
static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(dead_cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* Must be exiting, otherwise would be on tasklist. */
|
2007-10-19 08:40:38 +02:00
|
|
|
BUG_ON(!p->exit_state);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* Cannot have done final schedule yet: would have vanished. */
|
2006-09-29 11:01:11 +02:00
|
|
|
BUG_ON(p->state == TASK_DEAD);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
get_task_struct(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Drop lock around migration; if someone else moves it,
|
2007-12-05 15:46:09 +01:00
|
|
|
* that's OK. No task can be added to this CPU, so iteration is
|
2005-04-17 00:20:36 +02:00
|
|
|
* fine.
|
|
|
|
*/
|
2007-10-17 08:30:56 +02:00
|
|
|
spin_unlock_irq(&rq->lock);
|
2006-07-03 09:25:40 +02:00
|
|
|
move_task_off_dead_cpu(dead_cpu, p);
|
2007-10-17 08:30:56 +02:00
|
|
|
spin_lock_irq(&rq->lock);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
put_task_struct(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* release_task() removes task from tasklist, so we won't find dead tasks. */
|
|
|
|
static void migrate_dead_tasks(unsigned int dead_cpu)
|
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(dead_cpu);
|
2007-07-09 18:51:59 +02:00
|
|
|
struct task_struct *next;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
for ( ; ; ) {
|
|
|
|
if (!rq->nr_running)
|
|
|
|
break;
|
2007-08-09 11:16:47 +02:00
|
|
|
update_rq_clock(rq);
|
2007-08-09 11:16:49 +02:00
|
|
|
next = pick_next_task(rq, rq->curr);
|
2007-07-09 18:51:59 +02:00
|
|
|
if (!next)
|
|
|
|
break;
|
|
|
|
migrate_dead(dead_cpu, next);
|
2007-07-26 13:40:43 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_HOTPLUG_CPU */
|
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
|
|
|
|
|
|
|
|
/*
 * The "sched_domain" sysctl directory.  Its ->child is filled in by
 * register_sched_domain_sysctl() and torn down again on unregister.
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{ 0 },	/* terminator */
};
|
|
|
|
|
|
|
|
static struct ctl_table sd_ctl_root[] = {
|
2007-08-09 11:16:46 +02:00
|
|
|
{
|
2007-08-23 15:18:02 +02:00
|
|
|
.ctl_name = CTL_KERN,
|
2007-08-09 11:16:46 +02:00
|
|
|
.procname = "kernel",
|
2007-08-23 15:18:02 +02:00
|
|
|
.mode = 0555,
|
2007-08-09 11:16:46 +02:00
|
|
|
.child = sd_ctl_dir,
|
|
|
|
},
|
2007-10-29 21:18:11 +01:00
|
|
|
{0, },
|
2007-07-26 13:40:43 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
static struct ctl_table *sd_alloc_ctl_entry(int n)
|
|
|
|
{
|
|
|
|
struct ctl_table *entry =
|
2007-10-15 17:00:19 +02:00
|
|
|
kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
|
2007-07-26 13:40:43 +02:00
|
|
|
|
|
|
|
return entry;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
static void sd_free_ctl_entry(struct ctl_table **tablep)
|
|
|
|
{
|
2007-10-17 16:55:11 +02:00
|
|
|
struct ctl_table *entry;
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2007-10-17 16:55:11 +02:00
|
|
|
/*
|
|
|
|
* In the intermediate directories, both the child directory and
|
|
|
|
* procname are dynamically allocated and could fail but the mode
|
2007-12-05 15:46:09 +01:00
|
|
|
* will always be set. In the lowest directory the names are
|
2007-10-17 16:55:11 +02:00
|
|
|
* static strings and all have proc handlers.
|
|
|
|
*/
|
|
|
|
for (entry = *tablep; entry->mode; entry++) {
|
2007-10-15 17:00:19 +02:00
|
|
|
if (entry->child)
|
|
|
|
sd_free_ctl_entry(&entry->child);
|
2007-10-17 16:55:11 +02:00
|
|
|
if (entry->proc_handler == NULL)
|
|
|
|
kfree(entry->procname);
|
|
|
|
}
|
2007-10-15 17:00:19 +02:00
|
|
|
|
|
|
|
kfree(*tablep);
|
|
|
|
*tablep = NULL;
|
|
|
|
}
|
|
|
|
|
2007-07-26 13:40:43 +02:00
|
|
|
static void
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(struct ctl_table *entry,
|
2007-07-26 13:40:43 +02:00
|
|
|
const char *procname, void *data, int maxlen,
|
|
|
|
mode_t mode, proc_handler *proc_handler)
|
|
|
|
{
|
|
|
|
entry->procname = procname;
|
|
|
|
entry->data = data;
|
|
|
|
entry->maxlen = maxlen;
|
|
|
|
entry->mode = mode;
|
|
|
|
entry->proc_handler = proc_handler;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ctl_table *
|
|
|
|
sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
struct ctl_table *table = sd_alloc_ctl_entry(12);
|
2007-07-26 13:40:43 +02:00
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
if (table == NULL)
|
|
|
|
return NULL;
|
|
|
|
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[0], "min_interval", &sd->min_interval,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(long), 0644, proc_doulongvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[1], "max_interval", &sd->max_interval,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(long), 0644, proc_doulongvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-08-09 11:16:46 +02:00
|
|
|
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-10-15 17:00:14 +02:00
|
|
|
set_table_entry(&table[9], "cache_nice_tries",
|
2007-07-26 13:40:43 +02:00
|
|
|
&sd->cache_nice_tries,
|
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-10-15 17:00:14 +02:00
|
|
|
set_table_entry(&table[10], "flags", &sd->flags,
|
2007-07-26 13:40:43 +02:00
|
|
|
sizeof(int), 0644, proc_dointvec_minmax);
|
2007-10-15 17:00:19 +02:00
|
|
|
/* &table[11] is terminator */
|
2007-07-26 13:40:43 +02:00
|
|
|
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
2007-11-28 15:52:56 +01:00
|
|
|
static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
|
2007-07-26 13:40:43 +02:00
|
|
|
{
|
|
|
|
struct ctl_table *entry, *table;
|
|
|
|
struct sched_domain *sd;
|
|
|
|
int domain_num = 0, i;
|
|
|
|
char buf[32];
|
|
|
|
|
|
|
|
for_each_domain(cpu, sd)
|
|
|
|
domain_num++;
|
|
|
|
entry = table = sd_alloc_ctl_entry(domain_num + 1);
|
2007-10-15 17:00:19 +02:00
|
|
|
if (table == NULL)
|
|
|
|
return NULL;
|
2007-07-26 13:40:43 +02:00
|
|
|
|
|
|
|
i = 0;
|
|
|
|
for_each_domain(cpu, sd) {
|
|
|
|
snprintf(buf, 32, "domain%d", i);
|
|
|
|
entry->procname = kstrdup(buf, GFP_KERNEL);
|
2007-08-23 15:18:02 +02:00
|
|
|
entry->mode = 0555;
|
2007-07-26 13:40:43 +02:00
|
|
|
entry->child = sd_alloc_ctl_domain_table(sd);
|
|
|
|
entry++;
|
|
|
|
i++;
|
|
|
|
}
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct ctl_table_header *sd_sysctl_header;
|
2007-10-15 17:00:19 +02:00
|
|
|
static void register_sched_domain_sysctl(void)
|
2007-07-26 13:40:43 +02:00
|
|
|
{
|
|
|
|
int i, cpu_num = num_online_cpus();
|
|
|
|
struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
|
|
|
|
char buf[32];
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
WARN_ON(sd_ctl_dir[0].child);
|
|
|
|
sd_ctl_dir[0].child = entry;
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
if (entry == NULL)
|
|
|
|
return;
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
for_each_online_cpu(i) {
|
2007-07-26 13:40:43 +02:00
|
|
|
snprintf(buf, 32, "cpu%d", i);
|
|
|
|
entry->procname = kstrdup(buf, GFP_KERNEL);
|
2007-08-23 15:18:02 +02:00
|
|
|
entry->mode = 0555;
|
2007-07-26 13:40:43 +02:00
|
|
|
entry->child = sd_alloc_ctl_cpu_table(i);
|
2007-10-15 17:00:19 +02:00
|
|
|
entry++;
|
2007-07-26 13:40:43 +02:00
|
|
|
}
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
WARN_ON(sd_sysctl_header);
|
2007-07-26 13:40:43 +02:00
|
|
|
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
|
|
|
|
}
|
2007-10-15 17:00:19 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
/* may be called multiple times per register */
|
2007-10-15 17:00:19 +02:00
|
|
|
static void unregister_sched_domain_sysctl(void)
|
|
|
|
{
|
2007-10-24 18:23:48 +02:00
|
|
|
if (sd_sysctl_header)
|
|
|
|
unregister_sysctl_table(sd_sysctl_header);
|
2007-10-15 17:00:19 +02:00
|
|
|
sd_sysctl_header = NULL;
|
2007-10-24 18:23:48 +02:00
|
|
|
if (sd_ctl_dir[0].child)
|
|
|
|
sd_free_ctl_entry(&sd_ctl_dir[0].child);
|
2007-10-15 17:00:19 +02:00
|
|
|
}
|
2007-07-26 13:40:43 +02:00
|
|
|
#else
|
2007-10-15 17:00:19 +02:00
|
|
|
/* No-op variants used when the sched_domain sysctl tree is compiled out. */
static void register_sched_domain_sysctl(void)
{
}

static void unregister_sched_domain_sysctl(void)
{
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* migration_call - callback that gets triggered when a CPU is added.
|
|
|
|
* Here we can start up the necessary migration thread for the new CPU.
|
|
|
|
*/
|
2006-07-03 09:25:40 +02:00
|
|
|
static int __cpuinit
|
|
|
|
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
struct task_struct *p;
|
2006-07-03 09:25:40 +02:00
|
|
|
int cpu = (long)hcpu;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
switch (action) {
|
2007-05-09 11:34:04 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
case CPU_UP_PREPARE:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_UP_PREPARE_FROZEN:
|
2007-07-09 18:51:59 +02:00
|
|
|
p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
if (IS_ERR(p))
|
|
|
|
return NOTIFY_BAD;
|
|
|
|
kthread_bind(p, cpu);
|
|
|
|
/* Must be high prio: stop_machine expects to yield to it. */
|
|
|
|
rq = task_rq_lock(p, &flags);
|
2007-07-09 18:51:59 +02:00
|
|
|
__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
|
2005-04-17 00:20:36 +02:00
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
cpu_rq(cpu)->migration_thread = p;
|
|
|
|
break;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
case CPU_ONLINE:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_ONLINE_FROZEN:
|
2007-10-19 23:10:43 +02:00
|
|
|
/* Strictly unnecessary, as first user will wake it. */
|
2005-04-17 00:20:36 +02:00
|
|
|
wake_up_process(cpu_rq(cpu)->migration_thread);
|
2008-01-25 21:08:18 +01:00
|
|
|
|
|
|
|
/* Update our root-domain */
|
|
|
|
rq = cpu_rq(cpu);
|
|
|
|
spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
if (rq->rd) {
|
|
|
|
BUG_ON(!cpu_isset(cpu, rq->rd->span));
|
|
|
|
cpu_set(cpu, rq->rd->online);
|
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
break;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
|
|
case CPU_UP_CANCELED:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_UP_CANCELED_FROZEN:
|
2006-06-25 14:49:10 +02:00
|
|
|
if (!cpu_rq(cpu)->migration_thread)
|
|
|
|
break;
|
2007-12-05 15:46:09 +01:00
|
|
|
/* Unbind it from offline cpu so it can run. Fall thru. */
|
2005-11-07 09:58:38 +01:00
|
|
|
kthread_bind(cpu_rq(cpu)->migration_thread,
|
|
|
|
any_online_cpu(cpu_online_map));
|
2005-04-17 00:20:36 +02:00
|
|
|
kthread_stop(cpu_rq(cpu)->migration_thread);
|
|
|
|
cpu_rq(cpu)->migration_thread = NULL;
|
|
|
|
break;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
case CPU_DEAD:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_DEAD_FROZEN:
|
2007-10-19 08:40:46 +02:00
|
|
|
cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
|
2005-04-17 00:20:36 +02:00
|
|
|
migrate_live_tasks(cpu);
|
|
|
|
rq = cpu_rq(cpu);
|
|
|
|
kthread_stop(rq->migration_thread);
|
|
|
|
rq->migration_thread = NULL;
|
|
|
|
/* Idle task back to normal (off runqueue, low prio) */
|
2007-10-17 08:30:56 +02:00
|
|
|
spin_lock_irq(&rq->lock);
|
2007-08-09 11:16:47 +02:00
|
|
|
update_rq_clock(rq);
|
2007-08-09 11:16:49 +02:00
|
|
|
deactivate_task(rq, rq->idle, 0);
|
2005-04-17 00:20:36 +02:00
|
|
|
rq->idle->static_prio = MAX_PRIO;
|
2007-07-09 18:51:59 +02:00
|
|
|
__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
|
|
|
|
rq->idle->sched_class = &idle_sched_class;
|
2005-04-17 00:20:36 +02:00
|
|
|
migrate_dead_tasks(cpu);
|
2007-10-17 08:30:56 +02:00
|
|
|
spin_unlock_irq(&rq->lock);
|
2007-10-19 08:40:46 +02:00
|
|
|
cpuset_unlock();
|
2005-04-17 00:20:36 +02:00
|
|
|
migrate_nr_uninterruptible(rq);
|
|
|
|
BUG_ON(rq->nr_running != 0);
|
|
|
|
|
2007-12-05 15:46:09 +01:00
|
|
|
/*
|
|
|
|
* No need to migrate the tasks: it was best-effort if
|
|
|
|
* they didn't take sched_hotcpu_mutex. Just wake up
|
|
|
|
* the requestors.
|
|
|
|
*/
|
2005-04-17 00:20:36 +02:00
|
|
|
spin_lock_irq(&rq->lock);
|
|
|
|
while (!list_empty(&rq->migration_queue)) {
|
2006-07-03 09:25:42 +02:00
|
|
|
struct migration_req *req;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
req = list_entry(rq->migration_queue.next,
|
2006-07-03 09:25:42 +02:00
|
|
|
struct migration_req, list);
|
2005-04-17 00:20:36 +02:00
|
|
|
list_del_init(&req->list);
|
|
|
|
complete(&req->done);
|
|
|
|
}
|
|
|
|
spin_unlock_irq(&rq->lock);
|
|
|
|
break;
|
2008-01-25 21:08:18 +01:00
|
|
|
|
|
|
|
case CPU_DOWN_PREPARE:
|
|
|
|
/* Update our root-domain */
|
|
|
|
rq = cpu_rq(cpu);
|
|
|
|
spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
if (rq->rd) {
|
|
|
|
BUG_ON(!cpu_isset(cpu, rq->rd->span));
|
|
|
|
cpu_clear(cpu, rq->rd->online);
|
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
break;
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Register at highest priority so that task migration (migrate_all_tasks)
|
|
|
|
* happens before everything else.
|
|
|
|
*/
|
2006-06-27 11:54:10 +02:00
|
|
|
static struct notifier_block __cpuinitdata migration_notifier = {
|
2005-04-17 00:20:36 +02:00
|
|
|
.notifier_call = migration_call,
|
|
|
|
.priority = 10
|
|
|
|
};
|
|
|
|
|
2007-11-09 22:39:39 +01:00
|
|
|
/*
 * Boot-time setup: run the UP_PREPARE and ONLINE steps by hand for the
 * boot CPU, then register the notifier for the CPUs that follow.
 */
void __init migration_init(void)
{
	void *boot_cpu = (void *)(long)smp_processor_id();
	int ret;

	/* Start one for the boot CPU: */
	ret = migration_call(&migration_notifier, CPU_UP_PREPARE, boot_cpu);
	BUG_ON(ret == NOTIFY_BAD);

	migration_call(&migration_notifier, CPU_ONLINE, boot_cpu);
	register_cpu_notifier(&migration_notifier);
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2007-05-06 23:48:58 +02:00
|
|
|
|
|
|
|
/* Number of possible processor ids */
|
|
|
|
int nr_cpu_ids __read_mostly = NR_CPUS;
|
|
|
|
EXPORT_SYMBOL(nr_cpu_ids);
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2007-10-24 18:23:48 +02:00
|
|
|
struct sched_group *group = sd->groups;
|
|
|
|
cpumask_t groupmask;
|
|
|
|
char str[NR_CPUS];
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
cpumask_scnprintf(str, NR_CPUS, sd->span);
|
|
|
|
cpus_clear(groupmask);
|
|
|
|
|
|
|
|
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
|
|
|
|
|
|
|
|
if (!(sd->flags & SD_LOAD_BALANCE)) {
|
|
|
|
printk("does not load-balance\n");
|
|
|
|
if (sd->parent)
|
|
|
|
printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
|
|
|
|
" has parent");
|
|
|
|
return -1;
|
2005-06-25 23:57:24 +02:00
|
|
|
}
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
printk(KERN_CONT "span %s\n", str);
|
|
|
|
|
|
|
|
if (!cpu_isset(cpu, sd->span)) {
|
|
|
|
printk(KERN_ERR "ERROR: domain->span does not contain "
|
|
|
|
"CPU%d\n", cpu);
|
|
|
|
}
|
|
|
|
if (!cpu_isset(cpu, group->cpumask)) {
|
|
|
|
printk(KERN_ERR "ERROR: domain->groups does not contain"
|
|
|
|
" CPU%d\n", cpu);
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
printk(KERN_DEBUG "%*s groups:", level + 1, "");
|
2005-04-17 00:20:36 +02:00
|
|
|
do {
|
2007-10-24 18:23:48 +02:00
|
|
|
if (!group) {
|
|
|
|
printk("\n");
|
|
|
|
printk(KERN_ERR "ERROR: group is NULL\n");
|
2005-04-17 00:20:36 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
if (!group->__cpu_power) {
|
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
printk(KERN_ERR "ERROR: domain->cpu_power not "
|
|
|
|
"set\n");
|
|
|
|
break;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
if (!cpus_weight(group->cpumask)) {
|
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
printk(KERN_ERR "ERROR: empty group\n");
|
|
|
|
break;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
if (cpus_intersects(groupmask, group->cpumask)) {
|
|
|
|
printk(KERN_CONT "\n");
|
|
|
|
printk(KERN_ERR "ERROR: repeated CPUs\n");
|
|
|
|
break;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
cpus_or(groupmask, groupmask, group->cpumask);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
cpumask_scnprintf(str, NR_CPUS, group->cpumask);
|
|
|
|
printk(KERN_CONT " %s", str);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
group = group->next;
|
|
|
|
} while (group != sd->groups);
|
|
|
|
printk(KERN_CONT "\n");
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
if (!cpus_equal(sd->span, groupmask))
|
|
|
|
printk(KERN_ERR "ERROR: groups don't span domain->span\n");
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
if (sd->parent && !cpus_subset(groupmask, sd->parent->span))
|
|
|
|
printk(KERN_ERR "ERROR: parent span is not a superset "
|
|
|
|
"of domain->span\n");
|
|
|
|
return 0;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
static void sched_domain_debug(struct sched_domain *sd, int cpu)
|
|
|
|
{
|
|
|
|
int level = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
if (!sd) {
|
|
|
|
printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
|
|
|
|
return;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
if (sched_domain_debug_one(sd, cpu, level))
|
|
|
|
break;
|
2005-04-17 00:20:36 +02:00
|
|
|
level++;
|
|
|
|
sd = sd->parent;
|
2006-12-10 11:20:38 +01:00
|
|
|
if (!sd)
|
2007-10-24 18:23:48 +02:00
|
|
|
break;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
#else
|
2006-07-03 09:25:40 +02:00
|
|
|
# define sched_domain_debug(sd, cpu) do { } while (0)
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
|
2005-06-25 23:57:33 +02:00
|
|
|
static int sd_degenerate(struct sched_domain *sd)
|
2005-06-25 23:57:25 +02:00
|
|
|
{
|
|
|
|
if (cpus_weight(sd->span) == 1)
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
/* Following flags need at least 2 groups */
|
|
|
|
if (sd->flags & (SD_LOAD_BALANCE |
|
|
|
|
SD_BALANCE_NEWIDLE |
|
|
|
|
SD_BALANCE_FORK |
|
2006-10-03 10:14:09 +02:00
|
|
|
SD_BALANCE_EXEC |
|
|
|
|
SD_SHARE_CPUPOWER |
|
|
|
|
SD_SHARE_PKG_RESOURCES)) {
|
2005-06-25 23:57:25 +02:00
|
|
|
if (sd->groups != sd->groups->next)
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Following flags don't use groups */
|
|
|
|
if (sd->flags & (SD_WAKE_IDLE |
|
|
|
|
SD_WAKE_AFFINE |
|
|
|
|
SD_WAKE_BALANCE))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
static int
|
|
|
|
sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
|
2005-06-25 23:57:25 +02:00
|
|
|
{
|
|
|
|
unsigned long cflags = sd->flags, pflags = parent->flags;
|
|
|
|
|
|
|
|
if (sd_degenerate(parent))
|
|
|
|
return 1;
|
|
|
|
|
|
|
|
if (!cpus_equal(sd->span, parent->span))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* Does parent contain flags not in child? */
|
|
|
|
/* WAKE_BALANCE is a subset of WAKE_AFFINE */
|
|
|
|
if (cflags & SD_WAKE_AFFINE)
|
|
|
|
pflags &= ~SD_WAKE_BALANCE;
|
|
|
|
/* Flags needing groups don't count if only 1 group in parent */
|
|
|
|
if (parent->groups == parent->groups->next) {
|
|
|
|
pflags &= ~(SD_LOAD_BALANCE |
|
|
|
|
SD_BALANCE_NEWIDLE |
|
|
|
|
SD_BALANCE_FORK |
|
2006-10-03 10:14:09 +02:00
|
|
|
SD_BALANCE_EXEC |
|
|
|
|
SD_SHARE_CPUPOWER |
|
|
|
|
SD_SHARE_PKG_RESOURCES);
|
2005-06-25 23:57:25 +02:00
|
|
|
}
|
|
|
|
if (~cflags & pflags)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
const struct sched_class *class;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&rq->lock, flags);
|
|
|
|
|
|
|
|
if (rq->rd) {
|
|
|
|
struct root_domain *old_rd = rq->rd;
|
|
|
|
|
2008-01-25 21:08:19 +01:00
|
|
|
for (class = sched_class_highest; class; class = class->next) {
|
2008-01-25 21:08:18 +01:00
|
|
|
if (class->leave_domain)
|
|
|
|
class->leave_domain(rq);
|
2008-01-25 21:08:19 +01:00
|
|
|
}
|
2008-01-25 21:08:18 +01:00
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
cpu_clear(rq->cpu, old_rd->span);
|
|
|
|
cpu_clear(rq->cpu, old_rd->online);
|
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
if (atomic_dec_and_test(&old_rd->refcount))
|
|
|
|
kfree(old_rd);
|
|
|
|
}
|
|
|
|
|
|
|
|
atomic_inc(&rd->refcount);
|
|
|
|
rq->rd = rd;
|
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
cpu_set(rq->cpu, rd->span);
|
|
|
|
if (cpu_isset(rq->cpu, cpu_online_map))
|
|
|
|
cpu_set(rq->cpu, rd->online);
|
|
|
|
|
2008-01-25 21:08:19 +01:00
|
|
|
for (class = sched_class_highest; class; class = class->next) {
|
2008-01-25 21:08:18 +01:00
|
|
|
if (class->join_domain)
|
|
|
|
class->join_domain(rq);
|
2008-01-25 21:08:19 +01:00
|
|
|
}
|
2008-01-25 21:08:18 +01:00
|
|
|
|
|
|
|
spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
static void init_rootdomain(struct root_domain *rd)
|
2008-01-25 21:08:18 +01:00
|
|
|
{
|
|
|
|
memset(rd, 0, sizeof(*rd));
|
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
cpus_clear(rd->span);
|
|
|
|
cpus_clear(rd->online);
|
2008-01-25 21:08:18 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static void init_defrootdomain(void)
|
|
|
|
{
|
2008-01-25 21:08:26 +01:00
|
|
|
init_rootdomain(&def_root_domain);
|
2008-01-25 21:08:18 +01:00
|
|
|
atomic_set(&def_root_domain.refcount, 1);
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
static struct root_domain *alloc_rootdomain(void)
|
2008-01-25 21:08:18 +01:00
|
|
|
{
|
|
|
|
struct root_domain *rd;
|
|
|
|
|
|
|
|
rd = kmalloc(sizeof(*rd), GFP_KERNEL);
|
|
|
|
if (!rd)
|
|
|
|
return NULL;
|
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
init_rootdomain(rd);
|
2008-01-25 21:08:18 +01:00
|
|
|
|
|
|
|
return rd;
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2008-01-25 21:08:19 +01:00
|
|
|
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
|
2005-04-17 00:20:36 +02:00
|
|
|
* hold the hotplug lock.
|
|
|
|
*/
|
2008-01-25 21:08:19 +01:00
|
|
|
static void
|
|
|
|
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq = cpu_rq(cpu);
|
2005-06-25 23:57:25 +02:00
|
|
|
struct sched_domain *tmp;
|
|
|
|
|
|
|
|
/* Remove the sched domains which do not contribute to scheduling. */
|
|
|
|
for (tmp = sd; tmp; tmp = tmp->parent) {
|
|
|
|
struct sched_domain *parent = tmp->parent;
|
|
|
|
if (!parent)
|
|
|
|
break;
|
2006-10-03 10:14:08 +02:00
|
|
|
if (sd_parent_degenerate(tmp, parent)) {
|
2005-06-25 23:57:25 +02:00
|
|
|
tmp->parent = parent->parent;
|
2006-10-03 10:14:08 +02:00
|
|
|
if (parent->parent)
|
|
|
|
parent->parent->child = tmp;
|
|
|
|
}
|
2005-06-25 23:57:25 +02:00
|
|
|
}
|
|
|
|
|
2006-10-03 10:14:08 +02:00
|
|
|
if (sd && sd_degenerate(sd)) {
|
2005-06-25 23:57:25 +02:00
|
|
|
sd = sd->parent;
|
2006-10-03 10:14:08 +02:00
|
|
|
if (sd)
|
|
|
|
sd->child = NULL;
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
sched_domain_debug(sd, cpu);
|
|
|
|
|
2008-01-25 21:08:18 +01:00
|
|
|
rq_attach_root(rq, rd);
|
2005-06-25 23:57:27 +02:00
|
|
|
rcu_assign_pointer(rq->sd, sd);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* cpus with isolated domains */
|
2006-12-22 10:07:50 +01:00
|
|
|
static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/* Setup the mask of cpus configured for isolated domains */
|
|
|
|
static int __init isolated_cpu_setup(char *str)
|
|
|
|
{
|
|
|
|
int ints[NR_CPUS], i;
|
|
|
|
|
|
|
|
str = get_options(str, ARRAY_SIZE(ints), ints);
|
|
|
|
cpus_clear(cpu_isolated_map);
|
|
|
|
for (i = 1; i <= ints[0]; i++)
|
|
|
|
if (ints[i] < NR_CPUS)
|
|
|
|
cpu_set(ints[i], cpu_isolated_map);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
__setup("isolcpus=", isolated_cpu_setup);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
/*
|
2006-12-10 11:20:07 +01:00
|
|
|
* init_sched_build_groups takes the cpumask we wish to span, and a pointer
|
|
|
|
* to a function which identifies what group(along with sched group) a CPU
|
|
|
|
* belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
|
|
|
|
* (due to the fact that we keep track of groups covered with a cpumask_t).
|
2005-04-17 00:20:36 +02:00
|
|
|
*
|
|
|
|
* init_sched_build_groups will build a circular linked list of the groups
|
|
|
|
* covered by the given span, and will set each group's ->cpumask correctly,
|
|
|
|
* and ->cpu_power to 0.
|
|
|
|
*/
|
2006-10-03 10:14:06 +02:00
|
|
|
static void
|
2006-12-10 11:20:07 +01:00
|
|
|
init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
|
|
|
|
int (*group_fn)(int cpu, const cpumask_t *cpu_map,
|
|
|
|
struct sched_group **sg))
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
struct sched_group *first = NULL, *last = NULL;
|
|
|
|
cpumask_t covered = CPU_MASK_NONE;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_cpu_mask(i, span) {
|
2006-12-10 11:20:07 +01:00
|
|
|
struct sched_group *sg;
|
|
|
|
int group = group_fn(i, cpu_map, &sg);
|
2005-04-17 00:20:36 +02:00
|
|
|
int j;
|
|
|
|
|
|
|
|
if (cpu_isset(i, covered))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
sg->cpumask = CPU_MASK_NONE;
|
2007-05-08 09:32:57 +02:00
|
|
|
sg->__cpu_power = 0;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
for_each_cpu_mask(j, span) {
|
2006-12-10 11:20:07 +01:00
|
|
|
if (group_fn(j, cpu_map, NULL) != group)
|
2005-04-17 00:20:36 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
cpu_set(j, covered);
|
|
|
|
cpu_set(j, sg->cpumask);
|
|
|
|
}
|
|
|
|
if (!first)
|
|
|
|
first = sg;
|
|
|
|
if (last)
|
|
|
|
last->next = sg;
|
|
|
|
last = sg;
|
|
|
|
}
|
|
|
|
last->next = first;
|
|
|
|
}
|
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
#define SD_NODES_PER_DOMAIN 16
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
#ifdef CONFIG_NUMA
|
[PATCH] scheduler cache-hot-autodetect
)
From: Ingo Molnar <mingo@elte.hu>
This is the latest version of the scheduler cache-hot-auto-tune patch.
The first problem was that detection time scaled with O(N^2), which is
unacceptable on larger SMP and NUMA systems. To solve this:
- I've added a 'domain distance' function, which is used to cache
measurement results. Each distance is only measured once. This means
that e.g. on NUMA distances of 0, 1 and 2 might be measured, on HT
distances 0 and 1, and on SMP distance 0 is measured. The code walks
the domain tree to determine the distance, so it automatically follows
whatever hierarchy an architecture sets up. This cuts down on the boot
time significantly and removes the O(N^2) limit. The only assumption
is that migration costs can be expressed as a function of domain
distance - this covers the overwhelming majority of existing systems,
and is a good guess even for more asymmetric systems.
[ People hacking systems that have asymmetries that break this
assumption (e.g. different CPU speeds) should experiment a bit with
the cpu_distance() function. Adding a ->migration_distance factor to
the domain structure would be one possible solution - but let's first
see the problem systems, if they exist at all. Let's not overdesign. ]
Another problem was that only a single cache-size was used for measuring
the cost of migration, and most architectures didn't set that variable
up. Furthermore, a single cache-size does not fit NUMA hierarchies with
L3 caches and does not fit HT setups, where different CPUs will often
have different 'effective cache sizes'. To solve this problem:
- Instead of relying on a single cache-size provided by the platform and
sticking to it, the code now auto-detects the 'effective migration
cost' between two measured CPUs, via iterating through a wide range of
cachesizes. The code searches for the maximum migration cost, which
occurs when the working set of the test-workload falls just below the
'effective cache size'. I.e. real-life optimized search is done for
the maximum migration cost, between two real CPUs.
This, amongst other things, has the positive effect that if e.g. two
CPUs share a L2/L3 cache, a different (and accurate) migration cost
will be found than between two CPUs on the same system that don't share
any caches.
(The reliable measurement of migration costs is tricky - see the source
for details.)
Furthermore I've added various boot-time options to override/tune
migration behavior.
Firstly, there's a blanket override for autodetection:
migration_cost=1000,2000,3000
will override the depth 0/1/2 values with 1msec/2msec/3msec values.
Secondly, there's a global factor that can be used to increase (or
decrease) the autodetected values:
migration_factor=120
will increase the autodetected values by 20%. This option is useful to
tune things in a workload-dependent way - e.g. if a workload is
cache-insensitive then CPU utilization can be maximized by specifying
migration_factor=0.
I've tested the autodetection code quite extensively on x86, on 3
P3/Xeon/2MB, and the autodetected values look pretty good:
Dual Celeron (128K L2 cache):
---------------------
migration cost matrix (max_cache_size: 131072, cpu: 467 MHz):
---------------------
[00] [01]
[00]: - 1.7(1)
[01]: 1.7(1) -
---------------------
cacheflush times [2]: 0.0 (0) 1.7 (1784008)
---------------------
Here the slow memory subsystem dominates system performance, and even
though caches are small, the migration cost is 1.7 msecs.
Dual HT P4 (512K L2 cache):
---------------------
migration cost matrix (max_cache_size: 524288, cpu: 2379 MHz):
---------------------
[00] [01] [02] [03]
[00]: - 0.4(1) 0.0(0) 0.4(1)
[01]: 0.4(1) - 0.4(1) 0.0(0)
[02]: 0.0(0) 0.4(1) - 0.4(1)
[03]: 0.4(1) 0.0(0) 0.4(1) -
---------------------
cacheflush times [2]: 0.0 (33900) 0.4 (448514)
---------------------
Here it can be seen that there is no migration cost between two HT
siblings (CPU#0/2 and CPU#1/3 are separate physical CPUs). A fast memory
system makes inter-physical-CPU migration pretty cheap: 0.4 msecs.
8-way P3/Xeon [2MB L2 cache]:
---------------------
migration cost matrix (max_cache_size: 2097152, cpu: 700 MHz):
---------------------
[00] [01] [02] [03] [04] [05] [06] [07]
[00]: - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[01]: 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[02]: 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[03]: 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1) 19.2(1)
[04]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1) 19.2(1)
[05]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1) 19.2(1)
[06]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) - 19.2(1)
[07]: 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) 19.2(1) -
---------------------
cacheflush times [2]: 0.0 (0) 19.2 (19281756)
---------------------
This one has huge caches and a relatively slow memory subsystem - so the
migration cost is 19 msecs.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: <wilder@us.ibm.com>
Signed-off-by: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-12 10:05:30 +01:00
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
/**
|
|
|
|
* find_next_best_node - find the next node to include in a sched_domain
|
|
|
|
* @node: node whose sched_domain we're building
|
|
|
|
* @used_nodes: nodes already in the sched_domain
|
|
|
|
*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Find the next node to include in a given scheduling domain. Simply
|
2005-09-07 00:18:14 +02:00
|
|
|
* finds the closest node not already in the @used_nodes map.
|
|
|
|
*
|
|
|
|
* Should use nodemask_t.
|
|
|
|
*/
|
|
|
|
static int find_next_best_node(int node, unsigned long *used_nodes)
|
|
|
|
{
|
|
|
|
int i, n, val, min_val, best_node = 0;
|
|
|
|
|
|
|
|
min_val = INT_MAX;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
|
|
/* Start at @node */
|
|
|
|
n = (node + i) % MAX_NUMNODES;
|
|
|
|
|
|
|
|
if (!nr_cpus_node(n))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Skip already used nodes */
|
|
|
|
if (test_bit(n, used_nodes))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/* Simple min distance search */
|
|
|
|
val = node_distance(node, n);
|
|
|
|
|
|
|
|
if (val < min_val) {
|
|
|
|
min_val = val;
|
|
|
|
best_node = n;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
set_bit(best_node, used_nodes);
|
|
|
|
return best_node;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* sched_domain_node_span - get a cpumask for a node's sched_domain
|
|
|
|
* @node: node whose cpumask we're constructing
|
|
|
|
* @size: number of nodes to include in this span
|
|
|
|
*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Given a node, construct a good cpumask for its sched_domain to span. It
|
2005-09-07 00:18:14 +02:00
|
|
|
* should be one that prevents unnecessary balancing, but also spreads tasks
|
|
|
|
* out optimally.
|
|
|
|
*/
|
|
|
|
static cpumask_t sched_domain_node_span(int node)
|
|
|
|
{
|
|
|
|
DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
|
2006-07-03 09:25:40 +02:00
|
|
|
cpumask_t span, nodemask;
|
|
|
|
int i;
|
2005-09-07 00:18:14 +02:00
|
|
|
|
|
|
|
cpus_clear(span);
|
|
|
|
bitmap_zero(used_nodes, MAX_NUMNODES);
|
|
|
|
|
|
|
|
nodemask = node_to_cpumask(node);
|
|
|
|
cpus_or(span, span, nodemask);
|
|
|
|
set_bit(node, used_nodes);
|
|
|
|
|
|
|
|
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
|
|
|
|
int next_node = find_next_best_node(node, used_nodes);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
nodemask = node_to_cpumask(next_node);
|
|
|
|
cpus_or(span, span, nodemask);
|
|
|
|
}
|
|
|
|
|
|
|
|
return span;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-06-27 11:54:42 +02:00
|
|
|
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
/*
|
2006-07-03 09:25:40 +02:00
|
|
|
* SMT sched-domains:
|
2005-09-07 00:18:14 +02:00
|
|
|
*/
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
|
2006-12-10 11:20:07 +01:00
|
|
|
static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2007-12-05 15:46:09 +01:00
|
|
|
static int
|
|
|
|
cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-12-10 11:20:07 +01:00
|
|
|
if (sg)
|
|
|
|
*sg = &per_cpu(sched_group_cpus, cpu);
|
2005-04-17 00:20:36 +02:00
|
|
|
return cpu;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-07-03 09:25:40 +02:00
|
|
|
/*
|
|
|
|
* multi-core sched-domains:
|
|
|
|
*/
|
2006-03-27 11:15:22 +02:00
|
|
|
#ifdef CONFIG_SCHED_MC
|
|
|
|
static DEFINE_PER_CPU(struct sched_domain, core_domains);
|
2006-12-10 11:20:07 +01:00
|
|
|
static DEFINE_PER_CPU(struct sched_group, sched_group_core);
|
2006-03-27 11:15:22 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
|
2007-12-05 15:46:09 +01:00
|
|
|
static int
|
|
|
|
cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
|
2006-03-27 11:15:22 +02:00
|
|
|
{
|
2006-12-10 11:20:07 +01:00
|
|
|
int group;
|
2007-10-16 10:24:05 +02:00
|
|
|
cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
|
2006-10-03 10:14:06 +02:00
|
|
|
cpus_and(mask, mask, *cpu_map);
|
2006-12-10 11:20:07 +01:00
|
|
|
group = first_cpu(mask);
|
|
|
|
if (sg)
|
|
|
|
*sg = &per_cpu(sched_group_core, group);
|
|
|
|
return group;
|
2006-03-27 11:15:22 +02:00
|
|
|
}
|
|
|
|
#elif defined(CONFIG_SCHED_MC)
|
2007-12-05 15:46:09 +01:00
|
|
|
static int
|
|
|
|
cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
|
2006-03-27 11:15:22 +02:00
|
|
|
{
|
2006-12-10 11:20:07 +01:00
|
|
|
if (sg)
|
|
|
|
*sg = &per_cpu(sched_group_core, cpu);
|
2006-03-27 11:15:22 +02:00
|
|
|
return cpu;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
|
2006-12-10 11:20:07 +01:00
|
|
|
static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
|
2006-07-03 09:25:40 +02:00
|
|
|
|
2007-12-05 15:46:09 +01:00
|
|
|
static int
|
|
|
|
cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
2006-12-10 11:20:07 +01:00
|
|
|
int group;
|
2006-07-03 09:25:40 +02:00
|
|
|
#ifdef CONFIG_SCHED_MC
|
2006-03-27 11:15:22 +02:00
|
|
|
cpumask_t mask = cpu_coregroup_map(cpu);
|
2006-10-03 10:14:06 +02:00
|
|
|
cpus_and(mask, mask, *cpu_map);
|
2006-12-10 11:20:07 +01:00
|
|
|
group = first_cpu(mask);
|
2006-03-27 11:15:22 +02:00
|
|
|
#elif defined(CONFIG_SCHED_SMT)
|
2007-10-16 10:24:05 +02:00
|
|
|
cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
|
2006-10-03 10:14:06 +02:00
|
|
|
cpus_and(mask, mask, *cpu_map);
|
2006-12-10 11:20:07 +01:00
|
|
|
group = first_cpu(mask);
|
2005-04-17 00:20:36 +02:00
|
|
|
#else
|
2006-12-10 11:20:07 +01:00
|
|
|
group = cpu;
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
2006-12-10 11:20:07 +01:00
|
|
|
if (sg)
|
|
|
|
*sg = &per_cpu(sched_group_phys, group);
|
|
|
|
return group;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/*
|
2005-09-07 00:18:14 +02:00
|
|
|
* The init_sched_build_groups can't handle what we want to do with node
|
|
|
|
* groups, so roll our own. Now each node has its own list of groups which
|
|
|
|
* gets dynamically allocated.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2005-09-07 00:18:14 +02:00
|
|
|
static DEFINE_PER_CPU(struct sched_domain, node_domains);
|
2005-09-07 00:18:14 +02:00
|
|
|
static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
|
2006-12-10 11:20:07 +01:00
|
|
|
static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
|
2005-09-07 00:18:14 +02:00
|
|
|
|
2006-12-10 11:20:07 +01:00
|
|
|
static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
|
|
|
|
struct sched_group **sg)
|
2005-09-07 00:18:14 +02:00
|
|
|
{
|
2006-12-10 11:20:07 +01:00
|
|
|
cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
|
|
|
|
int group;
|
|
|
|
|
|
|
|
cpus_and(nodemask, nodemask, *cpu_map);
|
|
|
|
group = first_cpu(nodemask);
|
|
|
|
|
|
|
|
if (sg)
|
|
|
|
*sg = &per_cpu(sched_group_allnodes, group);
|
|
|
|
return group;
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-12-10 11:20:07 +01:00
|
|
|
|
2006-03-27 11:15:23 +02:00
|
|
|
static void init_numa_sched_groups_power(struct sched_group *group_head)
|
|
|
|
{
|
|
|
|
struct sched_group *sg = group_head;
|
|
|
|
int j;
|
|
|
|
|
|
|
|
if (!sg)
|
|
|
|
return;
|
2007-10-15 17:00:14 +02:00
|
|
|
do {
|
|
|
|
for_each_cpu_mask(j, sg->cpumask) {
|
|
|
|
struct sched_domain *sd;
|
2006-03-27 11:15:23 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
sd = &per_cpu(phys_domains, j);
|
|
|
|
if (j != first_cpu(sd->groups->cpumask)) {
|
|
|
|
/*
|
|
|
|
* Only add "power" once for each
|
|
|
|
* physical package.
|
|
|
|
*/
|
|
|
|
continue;
|
|
|
|
}
|
2006-03-27 11:15:23 +02:00
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
sg_inc_cpu_power(sg, sd->groups->__cpu_power);
|
|
|
|
}
|
|
|
|
sg = sg->next;
|
|
|
|
} while (sg != group_head);
|
2006-03-27 11:15:23 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
|
2006-10-03 10:14:06 +02:00
|
|
|
#ifdef CONFIG_NUMA
|
2006-06-27 11:54:38 +02:00
|
|
|
/* Free memory allocated for various sched_group structures */
|
|
|
|
static void free_sched_groups(const cpumask_t *cpu_map)
|
|
|
|
{
|
2006-10-03 10:14:06 +02:00
|
|
|
int cpu, i;
|
2006-06-27 11:54:38 +02:00
|
|
|
|
|
|
|
for_each_cpu_mask(cpu, *cpu_map) {
|
|
|
|
struct sched_group **sched_group_nodes
|
|
|
|
= sched_group_nodes_bycpu[cpu];
|
|
|
|
|
|
|
|
if (!sched_group_nodes)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
|
|
cpumask_t nodemask = node_to_cpumask(i);
|
|
|
|
struct sched_group *oldsg, *sg = sched_group_nodes[i];
|
|
|
|
|
|
|
|
cpus_and(nodemask, nodemask, *cpu_map);
|
|
|
|
if (cpus_empty(nodemask))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (sg == NULL)
|
|
|
|
continue;
|
|
|
|
sg = sg->next;
|
|
|
|
next_sg:
|
|
|
|
oldsg = sg;
|
|
|
|
sg = sg->next;
|
|
|
|
kfree(oldsg);
|
|
|
|
if (oldsg != sched_group_nodes[i])
|
|
|
|
goto next_sg;
|
|
|
|
}
|
|
|
|
kfree(sched_group_nodes);
|
|
|
|
sched_group_nodes_bycpu[cpu] = NULL;
|
|
|
|
}
|
|
|
|
}
|
2006-10-03 10:14:06 +02:00
|
|
|
#else
|
|
|
|
/* Without CONFIG_NUMA this build has no per-node group lists to free. */
static void free_sched_groups(const cpumask_t *cpu_map)
{
}
|
|
|
|
#endif
|
2006-06-27 11:54:38 +02:00
|
|
|
|
2006-10-03 10:14:09 +02:00
|
|
|
/*
|
|
|
|
* Initialize sched groups cpu_power.
|
|
|
|
*
|
|
|
|
* cpu_power indicates the capacity of sched group, which is used while
|
|
|
|
* distributing the load between different sched groups in a sched domain.
|
|
|
|
* Typically cpu_power for all the groups in a sched domain will be same unless
|
|
|
|
* there are asymmetries in the topology. If there are asymmetries, group
|
|
|
|
* having more cpu_power will pickup more load compared to the group having
|
|
|
|
* less cpu_power.
|
|
|
|
*
|
|
|
|
* cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
|
|
|
|
* the maximum number of tasks a group can handle in the presence of other idle
|
|
|
|
* or lightly loaded groups in the same sched domain.
|
|
|
|
*/
|
|
|
|
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
|
|
|
|
{
|
|
|
|
struct sched_domain *child;
|
|
|
|
struct sched_group *group;
|
|
|
|
|
|
|
|
WARN_ON(!sd || !sd->groups);
|
|
|
|
|
|
|
|
if (cpu != first_cpu(sd->groups->cpumask))
|
|
|
|
return;
|
|
|
|
|
|
|
|
child = sd->child;
|
|
|
|
|
2007-05-08 09:32:57 +02:00
|
|
|
sd->groups->__cpu_power = 0;
|
|
|
|
|
2006-10-03 10:14:09 +02:00
|
|
|
/*
|
|
|
|
* For perf policy, if the groups in child domain share resources
|
|
|
|
* (for example cores sharing some portions of the cache hierarchy
|
|
|
|
* or SMT), then set this domain groups cpu_power such that each group
|
|
|
|
* can handle only one task, when there are other idle groups in the
|
|
|
|
* same sched domain.
|
|
|
|
*/
|
|
|
|
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
|
|
|
|
(child->flags &
|
|
|
|
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
|
2007-05-08 09:32:57 +02:00
|
|
|
sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
|
2006-10-03 10:14:09 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* add cpu_power of each child group to this groups cpu_power
|
|
|
|
*/
|
|
|
|
group = child->groups;
|
|
|
|
do {
|
2007-05-08 09:32:57 +02:00
|
|
|
sg_inc_cpu_power(sd->groups, group->__cpu_power);
|
2006-10-03 10:14:09 +02:00
|
|
|
group = group->next;
|
|
|
|
} while (group != child->groups);
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2005-06-25 23:57:33 +02:00
|
|
|
* Build sched domains for a given set of cpus and attach the sched domains
|
|
|
|
* to the individual cpus
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2006-06-27 11:54:38 +02:00
|
|
|
static int build_sched_domains(const cpumask_t *cpu_map)
|
2005-04-17 00:20:36 +02:00
|
|
|
{
|
|
|
|
int i;
|
2008-01-25 21:08:18 +01:00
|
|
|
struct root_domain *rd;
|
2005-09-07 00:18:14 +02:00
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
struct sched_group **sched_group_nodes = NULL;
|
2006-12-10 11:20:07 +01:00
|
|
|
int sd_allnodes = 0;
|
2005-09-07 00:18:14 +02:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate the per-node list of sched groups
|
|
|
|
*/
|
2007-10-15 17:00:19 +02:00
|
|
|
sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
|
2007-12-05 15:46:09 +01:00
|
|
|
GFP_KERNEL);
|
2005-09-07 00:18:14 +02:00
|
|
|
if (!sched_group_nodes) {
|
|
|
|
printk(KERN_WARNING "Can not alloc sched group node list\n");
|
2006-06-27 11:54:38 +02:00
|
|
|
return -ENOMEM;
|
2005-09-07 00:18:14 +02:00
|
|
|
}
|
|
|
|
sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
|
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2008-01-25 21:08:26 +01:00
|
|
|
rd = alloc_rootdomain();
|
2008-01-25 21:08:18 +01:00
|
|
|
if (!rd) {
|
|
|
|
printk(KERN_WARNING "Cannot alloc root domain\n");
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2005-06-25 23:57:33 +02:00
|
|
|
* Set up domains for cpus specified by the cpu_map.
|
2005-04-17 00:20:36 +02:00
|
|
|
*/
|
2005-06-25 23:57:33 +02:00
|
|
|
for_each_cpu_mask(i, *cpu_map) {
|
2005-04-17 00:20:36 +02:00
|
|
|
struct sched_domain *sd = NULL, *p;
|
|
|
|
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
|
|
|
|
|
2005-06-25 23:57:33 +02:00
|
|
|
cpus_and(nodemask, nodemask, *cpu_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
2007-07-09 18:51:59 +02:00
|
|
|
if (cpus_weight(*cpu_map) >
|
|
|
|
SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
|
2005-09-07 00:18:14 +02:00
|
|
|
sd = &per_cpu(allnodes_domains, i);
|
|
|
|
*sd = SD_ALLNODES_INIT;
|
|
|
|
sd->span = *cpu_map;
|
2006-12-10 11:20:07 +01:00
|
|
|
cpu_to_allnodes_group(i, cpu_map, &sd->groups);
|
2005-09-07 00:18:14 +02:00
|
|
|
p = sd;
|
2006-12-10 11:20:07 +01:00
|
|
|
sd_allnodes = 1;
|
2005-09-07 00:18:14 +02:00
|
|
|
} else
|
|
|
|
p = NULL;
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
sd = &per_cpu(node_domains, i);
|
|
|
|
*sd = SD_NODE_INIT;
|
2005-09-07 00:18:14 +02:00
|
|
|
sd->span = sched_domain_node_span(cpu_to_node(i));
|
|
|
|
sd->parent = p;
|
2006-10-03 10:14:08 +02:00
|
|
|
if (p)
|
|
|
|
p->child = sd;
|
2005-09-07 00:18:14 +02:00
|
|
|
cpus_and(sd->span, sd->span, *cpu_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
p = sd;
|
|
|
|
sd = &per_cpu(phys_domains, i);
|
|
|
|
*sd = SD_CPU_INIT;
|
|
|
|
sd->span = nodemask;
|
|
|
|
sd->parent = p;
|
2006-10-03 10:14:08 +02:00
|
|
|
if (p)
|
|
|
|
p->child = sd;
|
2006-12-10 11:20:07 +01:00
|
|
|
cpu_to_phys_group(i, cpu_map, &sd->groups);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-03-27 11:15:22 +02:00
|
|
|
#ifdef CONFIG_SCHED_MC
|
|
|
|
p = sd;
|
|
|
|
sd = &per_cpu(core_domains, i);
|
|
|
|
*sd = SD_MC_INIT;
|
|
|
|
sd->span = cpu_coregroup_map(i);
|
|
|
|
cpus_and(sd->span, sd->span, *cpu_map);
|
|
|
|
sd->parent = p;
|
2006-10-03 10:14:08 +02:00
|
|
|
p->child = sd;
|
2006-12-10 11:20:07 +01:00
|
|
|
cpu_to_core_group(i, cpu_map, &sd->groups);
|
2006-03-27 11:15:22 +02:00
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
p = sd;
|
|
|
|
sd = &per_cpu(cpu_domains, i);
|
|
|
|
*sd = SD_SIBLING_INIT;
|
2007-10-16 10:24:05 +02:00
|
|
|
sd->span = per_cpu(cpu_sibling_map, i);
|
2005-06-25 23:57:33 +02:00
|
|
|
cpus_and(sd->span, sd->span, *cpu_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
sd->parent = p;
|
2006-10-03 10:14:08 +02:00
|
|
|
p->child = sd;
|
2006-12-10 11:20:07 +01:00
|
|
|
cpu_to_cpu_group(i, cpu_map, &sd->groups);
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
/* Set up CPU (sibling) groups */
|
2005-09-07 00:18:14 +02:00
|
|
|
for_each_cpu_mask(i, *cpu_map) {
|
2007-10-16 10:24:05 +02:00
|
|
|
cpumask_t this_sibling_map = per_cpu(cpu_sibling_map, i);
|
2005-06-25 23:57:33 +02:00
|
|
|
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
if (i != first_cpu(this_sibling_map))
|
|
|
|
continue;
|
|
|
|
|
2007-07-09 18:51:59 +02:00
|
|
|
init_sched_build_groups(this_sibling_map, cpu_map,
|
|
|
|
&cpu_to_cpu_group);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2006-03-27 11:15:22 +02:00
|
|
|
#ifdef CONFIG_SCHED_MC
|
|
|
|
/* Set up multi-core groups */
|
|
|
|
for_each_cpu_mask(i, *cpu_map) {
|
|
|
|
cpumask_t this_core_map = cpu_coregroup_map(i);
|
|
|
|
cpus_and(this_core_map, this_core_map, *cpu_map);
|
|
|
|
if (i != first_cpu(this_core_map))
|
|
|
|
continue;
|
2007-07-09 18:51:59 +02:00
|
|
|
init_sched_build_groups(this_core_map, cpu_map,
|
|
|
|
&cpu_to_core_group);
|
2006-03-27 11:15:22 +02:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/* Set up physical groups */
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
|
|
cpumask_t nodemask = node_to_cpumask(i);
|
|
|
|
|
2005-06-25 23:57:33 +02:00
|
|
|
cpus_and(nodemask, nodemask, *cpu_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
if (cpus_empty(nodemask))
|
|
|
|
continue;
|
|
|
|
|
2006-12-10 11:20:07 +01:00
|
|
|
init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
|
/* Set up node groups */
|
2006-12-10 11:20:07 +01:00
|
|
|
if (sd_allnodes)
|
2007-07-09 18:51:59 +02:00
|
|
|
init_sched_build_groups(*cpu_map, cpu_map,
|
|
|
|
&cpu_to_allnodes_group);
|
2005-09-07 00:18:14 +02:00
|
|
|
|
|
|
|
for (i = 0; i < MAX_NUMNODES; i++) {
|
|
|
|
/* Set up node groups */
|
|
|
|
struct sched_group *sg, *prev;
|
|
|
|
cpumask_t nodemask = node_to_cpumask(i);
|
|
|
|
cpumask_t domainspan;
|
|
|
|
cpumask_t covered = CPU_MASK_NONE;
|
|
|
|
int j;
|
|
|
|
|
|
|
|
cpus_and(nodemask, nodemask, *cpu_map);
|
2005-09-07 00:18:14 +02:00
|
|
|
if (cpus_empty(nodemask)) {
|
|
|
|
sched_group_nodes[i] = NULL;
|
2005-09-07 00:18:14 +02:00
|
|
|
continue;
|
2005-09-07 00:18:14 +02:00
|
|
|
}
|
2005-09-07 00:18:14 +02:00
|
|
|
|
|
|
|
domainspan = sched_domain_node_span(i);
|
|
|
|
cpus_and(domainspan, domainspan, *cpu_map);
|
|
|
|
|
2006-06-27 11:54:40 +02:00
|
|
|
sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
|
2006-06-27 11:54:38 +02:00
|
|
|
if (!sg) {
|
|
|
|
printk(KERN_WARNING "Can not alloc domain group for "
|
|
|
|
"node %d\n", i);
|
|
|
|
goto error;
|
|
|
|
}
|
2005-09-07 00:18:14 +02:00
|
|
|
sched_group_nodes[i] = sg;
|
|
|
|
for_each_cpu_mask(j, nodemask) {
|
|
|
|
struct sched_domain *sd;
|
2007-07-09 18:52:00 +02:00
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
sd = &per_cpu(node_domains, j);
|
|
|
|
sd->groups = sg;
|
|
|
|
}
|
2007-05-08 09:32:57 +02:00
|
|
|
sg->__cpu_power = 0;
|
2005-09-07 00:18:14 +02:00
|
|
|
sg->cpumask = nodemask;
|
2006-06-27 11:54:38 +02:00
|
|
|
sg->next = sg;
|
2005-09-07 00:18:14 +02:00
|
|
|
cpus_or(covered, covered, nodemask);
|
|
|
|
prev = sg;
|
|
|
|
|
|
|
|
for (j = 0; j < MAX_NUMNODES; j++) {
|
|
|
|
cpumask_t tmp, notcovered;
|
|
|
|
int n = (i + j) % MAX_NUMNODES;
|
|
|
|
|
|
|
|
cpus_complement(notcovered, covered);
|
|
|
|
cpus_and(tmp, notcovered, *cpu_map);
|
|
|
|
cpus_and(tmp, tmp, domainspan);
|
|
|
|
if (cpus_empty(tmp))
|
|
|
|
break;
|
|
|
|
|
|
|
|
nodemask = node_to_cpumask(n);
|
|
|
|
cpus_and(tmp, tmp, nodemask);
|
|
|
|
if (cpus_empty(tmp))
|
|
|
|
continue;
|
|
|
|
|
2006-06-27 11:54:40 +02:00
|
|
|
sg = kmalloc_node(sizeof(struct sched_group),
|
|
|
|
GFP_KERNEL, i);
|
2005-09-07 00:18:14 +02:00
|
|
|
if (!sg) {
|
|
|
|
printk(KERN_WARNING
|
|
|
|
"Can not alloc domain group for node %d\n", j);
|
2006-06-27 11:54:38 +02:00
|
|
|
goto error;
|
2005-09-07 00:18:14 +02:00
|
|
|
}
|
2007-05-08 09:32:57 +02:00
|
|
|
sg->__cpu_power = 0;
|
2005-09-07 00:18:14 +02:00
|
|
|
sg->cpumask = tmp;
|
2006-06-27 11:54:38 +02:00
|
|
|
sg->next = prev->next;
|
2005-09-07 00:18:14 +02:00
|
|
|
cpus_or(covered, covered, tmp);
|
|
|
|
prev->next = sg;
|
|
|
|
prev = sg;
|
|
|
|
}
|
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Calculate CPU power for physical packages and nodes */
|
2006-06-27 11:54:42 +02:00
|
|
|
#ifdef CONFIG_SCHED_SMT
|
2005-06-25 23:57:33 +02:00
|
|
|
for_each_cpu_mask(i, *cpu_map) {
|
2007-07-09 18:51:59 +02:00
|
|
|
struct sched_domain *sd = &per_cpu(cpu_domains, i);
|
|
|
|
|
2006-10-03 10:14:09 +02:00
|
|
|
init_sched_groups_power(i, sd);
|
2006-06-27 11:54:42 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|
2006-03-27 11:15:22 +02:00
|
|
|
#ifdef CONFIG_SCHED_MC
|
2006-06-27 11:54:42 +02:00
|
|
|
for_each_cpu_mask(i, *cpu_map) {
|
2007-07-09 18:51:59 +02:00
|
|
|
struct sched_domain *sd = &per_cpu(core_domains, i);
|
|
|
|
|
2006-10-03 10:14:09 +02:00
|
|
|
init_sched_groups_power(i, sd);
|
2006-06-27 11:54:42 +02:00
|
|
|
}
|
|
|
|
#endif
|
2006-03-27 11:15:22 +02:00
|
|
|
|
2006-06-27 11:54:42 +02:00
|
|
|
for_each_cpu_mask(i, *cpu_map) {
|
2007-07-09 18:51:59 +02:00
|
|
|
struct sched_domain *sd = &per_cpu(phys_domains, i);
|
|
|
|
|
2006-10-03 10:14:09 +02:00
|
|
|
init_sched_groups_power(i, sd);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
|
|
|
|
2005-09-07 00:18:14 +02:00
|
|
|
#ifdef CONFIG_NUMA
|
2006-03-27 11:15:23 +02:00
|
|
|
for (i = 0; i < MAX_NUMNODES; i++)
|
|
|
|
init_numa_sched_groups_power(sched_group_nodes[i]);
|
2005-09-07 00:18:14 +02:00
|
|
|
|
2006-12-10 11:20:07 +01:00
|
|
|
if (sd_allnodes) {
|
|
|
|
struct sched_group *sg;
|
2006-07-30 12:02:59 +02:00
|
|
|
|
2006-12-10 11:20:07 +01:00
|
|
|
cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
|
2006-07-30 12:02:59 +02:00
|
|
|
init_numa_sched_groups_power(sg);
|
|
|
|
}
|
2005-09-07 00:18:14 +02:00
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/* Attach the domains */
|
2005-06-25 23:57:33 +02:00
|
|
|
for_each_cpu_mask(i, *cpu_map) {
|
2005-04-17 00:20:36 +02:00
|
|
|
struct sched_domain *sd;
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
sd = &per_cpu(cpu_domains, i);
|
2006-03-27 11:15:22 +02:00
|
|
|
#elif defined(CONFIG_SCHED_MC)
|
|
|
|
sd = &per_cpu(core_domains, i);
|
2005-04-17 00:20:36 +02:00
|
|
|
#else
|
|
|
|
sd = &per_cpu(phys_domains, i);
|
|
|
|
#endif
|
2008-01-25 21:08:18 +01:00
|
|
|
cpu_attach_domain(sd, rd, i);
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2006-06-27 11:54:38 +02:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
2006-10-03 10:14:06 +02:00
|
|
|
#ifdef CONFIG_NUMA
|
2006-06-27 11:54:38 +02:00
|
|
|
error:
|
|
|
|
free_sched_groups(cpu_map);
|
|
|
|
return -ENOMEM;
|
2006-10-03 10:14:06 +02:00
|
|
|
#endif
|
2005-04-17 00:20:36 +02:00
|
|
|
}
|
2007-10-19 08:40:20 +02:00
|
|
|
|
|
|
|
static cpumask_t *doms_cur; /* current sched domains */
|
|
|
|
static int ndoms_cur; /* number of sched domains in 'doms_cur' */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Special case: If a kmalloc of a doms_cur partition (array of
|
|
|
|
* cpumask_t) fails, then fallback to a single sched domain,
|
|
|
|
* as determined by the single cpumask_t fallback_doms.
|
|
|
|
*/
|
|
|
|
static cpumask_t fallback_doms;
|
|
|
|
|
2005-06-25 23:57:33 +02:00
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
|
2007-10-19 08:40:20 +02:00
|
|
|
* For now this just excludes isolated cpus, but could be used to
|
|
|
|
* exclude other special cases in the future.
|
2005-06-25 23:57:33 +02:00
|
|
|
*/
|
2006-06-27 11:54:38 +02:00
|
|
|
static int arch_init_sched_domains(const cpumask_t *cpu_map)
|
2005-06-25 23:57:33 +02:00
|
|
|
{
|
2007-10-24 18:23:48 +02:00
|
|
|
int err;
|
|
|
|
|
2007-10-19 08:40:20 +02:00
|
|
|
ndoms_cur = 1;
|
|
|
|
doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
|
|
|
|
if (!doms_cur)
|
|
|
|
doms_cur = &fallback_doms;
|
|
|
|
cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
|
2007-10-24 18:23:48 +02:00
|
|
|
err = build_sched_domains(doms_cur);
|
2007-10-15 17:00:19 +02:00
|
|
|
register_sched_domain_sysctl();
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
return err;
|
2005-06-25 23:57:33 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Release the sched_group memory that backs the domains of @cpu_map. */
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
	free_sched_groups(cpu_map);
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2005-06-25 23:57:33 +02:00
|
|
|
/*
|
|
|
|
* Detach sched domains from a group of cpus specified in cpu_map
|
|
|
|
* These cpus will now be attached to the NULL domain
|
|
|
|
*/
|
2006-01-14 22:20:43 +01:00
|
|
|
static void detach_destroy_domains(const cpumask_t *cpu_map)
|
2005-06-25 23:57:33 +02:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2007-10-15 17:00:19 +02:00
|
|
|
unregister_sched_domain_sysctl();
|
|
|
|
|
2005-06-25 23:57:33 +02:00
|
|
|
for_each_cpu_mask(i, *cpu_map)
|
2008-01-25 21:08:18 +01:00
|
|
|
cpu_attach_domain(NULL, &def_root_domain, i);
|
2005-06-25 23:57:33 +02:00
|
|
|
synchronize_sched();
|
|
|
|
arch_destroy_sched_domains(cpu_map);
|
|
|
|
}
|
|
|
|
|
2007-10-19 08:40:20 +02:00
|
|
|
/*
|
|
|
|
* Partition sched domains as specified by the 'ndoms_new'
|
2007-12-05 15:46:09 +01:00
|
|
|
* cpumasks in the array doms_new[] of cpumasks. This compares
|
2007-10-19 08:40:20 +02:00
|
|
|
* doms_new[] to the current sched domain partitioning, doms_cur[].
|
|
|
|
* It destroys each deleted domain and builds each new domain.
|
|
|
|
*
|
|
|
|
* 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
|
2007-12-05 15:46:09 +01:00
|
|
|
* The masks don't intersect (don't overlap.) We should setup one
|
|
|
|
* sched domain for each mask. CPUs not in any of the cpumasks will
|
|
|
|
* not be load balanced. If the same cpumask appears both in the
|
2007-10-19 08:40:20 +02:00
|
|
|
* current 'doms_cur' domains and in the new 'doms_new', we can leave
|
|
|
|
* it as it is.
|
|
|
|
*
|
2007-12-05 15:46:09 +01:00
|
|
|
* The passed in 'doms_new' should be kmalloc'd. This routine takes
|
|
|
|
* ownership of it and will kfree it when done with it. If the caller
|
2007-10-19 08:40:20 +02:00
|
|
|
* failed the kmalloc call, then it can pass in doms_new == NULL,
|
|
|
|
* and partition_sched_domains() will fallback to the single partition
|
|
|
|
* 'fallback_doms'.
|
|
|
|
*
|
|
|
|
* Call with hotplug lock held
|
|
|
|
*/
|
|
|
|
void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
|
|
|
|
{
|
|
|
|
int i, j;
|
|
|
|
|
2008-01-25 21:08:00 +01:00
|
|
|
lock_doms_cur();
|
|
|
|
|
2007-10-24 18:23:48 +02:00
|
|
|
/* always unregister in case we don't destroy any domains */
|
|
|
|
unregister_sched_domain_sysctl();
|
|
|
|
|
2007-10-19 08:40:20 +02:00
|
|
|
if (doms_new == NULL) {
|
|
|
|
ndoms_new = 1;
|
|
|
|
doms_new = &fallback_doms;
|
|
|
|
cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Destroy deleted domains */
|
|
|
|
for (i = 0; i < ndoms_cur; i++) {
|
|
|
|
for (j = 0; j < ndoms_new; j++) {
|
|
|
|
if (cpus_equal(doms_cur[i], doms_new[j]))
|
|
|
|
goto match1;
|
|
|
|
}
|
|
|
|
/* no match - a current sched domain not in new doms_new[] */
|
|
|
|
detach_destroy_domains(doms_cur + i);
|
|
|
|
match1:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Build new domains */
|
|
|
|
for (i = 0; i < ndoms_new; i++) {
|
|
|
|
for (j = 0; j < ndoms_cur; j++) {
|
|
|
|
if (cpus_equal(doms_new[i], doms_cur[j]))
|
|
|
|
goto match2;
|
|
|
|
}
|
|
|
|
/* no match - add a new doms_new */
|
|
|
|
build_sched_domains(doms_new + i);
|
|
|
|
match2:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Remember the new sched domains */
|
|
|
|
if (doms_cur != &fallback_doms)
|
|
|
|
kfree(doms_cur);
|
|
|
|
doms_cur = doms_new;
|
|
|
|
ndoms_cur = ndoms_new;
|
2007-10-24 18:23:48 +02:00
|
|
|
|
|
|
|
register_sched_domain_sysctl();
|
2008-01-25 21:08:00 +01:00
|
|
|
|
|
|
|
unlock_doms_cur();
|
2007-10-19 08:40:20 +02:00
|
|
|
}
|
|
|
|
|
2006-06-27 11:54:42 +02:00
|
|
|
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
|
2007-08-12 18:08:19 +02:00
|
|
|
static int arch_reinit_sched_domains(void)
|
2006-06-27 11:54:42 +02:00
|
|
|
{
|
|
|
|
int err;
|
|
|
|
|
2008-01-25 21:08:02 +01:00
|
|
|
get_online_cpus();
|
2006-06-27 11:54:42 +02:00
|
|
|
detach_destroy_domains(&cpu_online_map);
|
|
|
|
err = arch_init_sched_domains(&cpu_online_map);
|
2008-01-25 21:08:02 +01:00
|
|
|
put_online_cpus();
|
2006-06-27 11:54:42 +02:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Common store handler for the two power-savings sysfs files.
 *
 * Only the first character of @buf is examined: '0' clears and '1' sets
 * the tunable, anything else yields -EINVAL.  @smt selects
 * sched_smt_power_savings (non-zero) or sched_mc_power_savings (zero).
 * The domains are then rebuilt.  Returns the rebuild error if there was
 * one, @count otherwise.
 */
static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
{
	int enable;
	int ret;

	switch (buf[0]) {
	case '0':
		enable = 0;
		break;
	case '1':
		enable = 1;
		break;
	default:
		return -EINVAL;
	}

	if (smt)
		sched_smt_power_savings = enable;
	else
		sched_mc_power_savings = enable;

	ret = arch_reinit_sched_domains();
	if (ret)
		return ret;

	return count;
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_MC
|
|
|
|
static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
|
|
|
|
{
|
|
|
|
return sprintf(page, "%u\n", sched_mc_power_savings);
|
|
|
|
}
|
2006-07-03 09:25:40 +02:00
|
|
|
/* sysfs store: forward to the common handler, selecting the MC tunable. */
static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
					    const char *buf, size_t count)
{
	const int smt = 0;

	return sched_power_savings_store(buf, count, smt);
}
|
2007-08-12 18:08:19 +02:00
|
|
|
static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
|
|
|
|
sched_mc_power_savings_store);
|
2006-06-27 11:54:42 +02:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
|
|
|
|
{
|
|
|
|
return sprintf(page, "%u\n", sched_smt_power_savings);
|
|
|
|
}
|
2006-07-03 09:25:40 +02:00
|
|
|
/* sysfs store: forward to the common handler, selecting the SMT tunable. */
static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
					     const char *buf, size_t count)
{
	const int smt = 1;

	return sched_power_savings_store(buf, count, smt);
}
|
2007-08-12 18:08:19 +02:00
|
|
|
static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
|
|
|
|
sched_smt_power_savings_store);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Create the power-savings sysfs files under @cls.
 *
 * The SMT file is created when CONFIG_SCHED_SMT is set and smt_capable()
 * is true; the MC file when CONFIG_SCHED_MC is set and mc_capable() is
 * true.  The MC file is not attempted if creating the SMT file failed.
 * Returns 0 or the first sysfs_create_file() error.
 */
int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int err = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable()) {
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
		if (err)
			return err;
	}
#endif
#ifdef CONFIG_SCHED_MC
	if (mc_capable())
		err = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
#endif
	return err;
}
|
2006-06-27 11:54:42 +02:00
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
2007-12-05 15:46:09 +01:00
|
|
|
* Force a reinitialization of the sched domains hierarchy. The domains
|
2005-04-17 00:20:36 +02:00
|
|
|
* and groups cannot be updated in place without racing with the balancing
|
2005-06-25 23:57:24 +02:00
|
|
|
* code, so we temporarily attach all running cpus to the NULL domain
|
2005-04-17 00:20:36 +02:00
|
|
|
* which will prevent rebalancing while the sched domains are recalculated.
|
|
|
|
*/
|
|
|
|
static int update_sched_domains(struct notifier_block *nfb,
|
|
|
|
unsigned long action, void *hcpu)
|
|
|
|
{
|
|
|
|
switch (action) {
|
|
|
|
case CPU_UP_PREPARE:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_UP_PREPARE_FROZEN:
|
2005-04-17 00:20:36 +02:00
|
|
|
case CPU_DOWN_PREPARE:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_DOWN_PREPARE_FROZEN:
|
2005-06-25 23:57:33 +02:00
|
|
|
detach_destroy_domains(&cpu_online_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
return NOTIFY_OK;
|
|
|
|
|
|
|
|
case CPU_UP_CANCELED:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_UP_CANCELED_FROZEN:
|
2005-04-17 00:20:36 +02:00
|
|
|
case CPU_DOWN_FAILED:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_DOWN_FAILED_FROZEN:
|
2005-04-17 00:20:36 +02:00
|
|
|
case CPU_ONLINE:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_ONLINE_FROZEN:
|
2005-04-17 00:20:36 +02:00
|
|
|
case CPU_DEAD:
|
2007-05-09 11:35:10 +02:00
|
|
|
case CPU_DEAD_FROZEN:
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* Fall through and re-initialise the domains.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The hotplug lock is already held by cpu_up/cpu_down */
|
2005-06-25 23:57:33 +02:00
|
|
|
arch_init_sched_domains(&cpu_online_map);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
return NOTIFY_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * SMP part of scheduler initialisation.
 *
 * Builds the initial sched domains for the online cpus, registers
 * update_sched_domains() as the cpu hotplug notifier, moves the calling
 * task onto the set of non-isolated cpus and initialises the scheduler
 * granularity. With CONFIG_FAIR_GROUP_SCHED and more than one cpu id it
 * also creates and starts the "group_balance" load balance monitor thread.
 */
void __init sched_init_smp(void)
{
	cpumask_t non_isolated_cpus;

	get_online_cpus();
	arch_init_sched_domains(&cpu_online_map);
	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
	/* Every possible cpu is isolated: fall back to the cpu we run on. */
	if (cpus_empty(non_isolated_cpus))
		cpu_set(smp_processor_id(), non_isolated_cpus);
	put_online_cpus();
	/* XXX: Theoretical race here - CPU may be hotplugged now */
	hotcpu_notifier(update_sched_domains, 0);

	/* Move init over to a non-isolated CPU */
	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
		BUG();
	sched_init_granularity();

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* No monitor thread is started when there is only one cpu id. */
	if (nr_cpu_ids == 1)
		return;

	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
					 "group_balance");
	if (!IS_ERR(lb_monitor_task)) {
		lb_monitor_task->flags |= PF_NOFREEZE;
		wake_up_process(lb_monitor_task);
	} else {
		/*
		 * The two literals are concatenated by the compiler, so the
		 * separating space has to be part of one of them.
		 */
		printk(KERN_ERR "Could not create load balance monitor thread "
			"(error = %ld)\n", PTR_ERR(lb_monitor_task));
	}
#endif
}
|
|
|
|
#else
|
|
|
|
/*
 * Variant of sched_init_smp() used when CONFIG_SMP is not set: only the
 * scheduler granularity needs to be initialised.
 */
void __init sched_init_smp(void)
{
	sched_init_granularity();
}
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
|
|
|
int in_sched_functions(unsigned long addr)
|
|
|
|
{
|
|
|
|
return in_lock_functions(addr) ||
|
|
|
|
(addr >= (unsigned long)__sched_text_start
|
|
|
|
&& addr < (unsigned long)__sched_text_end);
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:13 +02:00
|
|
|
static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
|
2007-07-09 18:51:59 +02:00
|
|
|
{
|
|
|
|
cfs_rq->tasks_timeline = RB_ROOT;
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
cfs_rq->rq = rq;
|
|
|
|
#endif
|
2007-10-15 17:00:10 +02:00
|
|
|
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:29 +01:00
|
|
|
static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
|
|
|
|
{
|
|
|
|
struct rt_prio_array *array;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
array = &rt_rq->active;
|
|
|
|
for (i = 0; i < MAX_RT_PRIO; i++) {
|
|
|
|
INIT_LIST_HEAD(array->queue + i);
|
|
|
|
__clear_bit(i, array->bitmap);
|
|
|
|
}
|
|
|
|
/* delimiter for bitsearch: */
|
|
|
|
__set_bit(MAX_RT_PRIO, array->bitmap);
|
|
|
|
|
2008-01-25 21:08:31 +01:00
|
|
|
#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
|
|
|
|
rt_rq->highest_prio = MAX_RT_PRIO;
|
|
|
|
#endif
|
2008-01-25 21:08:29 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
rt_rq->rt_nr_migratory = 0;
|
|
|
|
rt_rq->overloaded = 0;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
rt_rq->rt_time = 0;
|
|
|
|
rt_rq->rt_throttled = 0;
|
2008-01-25 21:08:30 +01:00
|
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
rt_rq->rq = rq;
|
|
|
|
#endif
|
2008-01-25 21:08:29 +01:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
|
|
|
|
struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
|
|
int cpu, int add)
|
|
|
|
{
|
|
|
|
tg->cfs_rq[cpu] = cfs_rq;
|
|
|
|
init_cfs_rq(cfs_rq, rq);
|
|
|
|
cfs_rq->tg = tg;
|
|
|
|
if (add)
|
|
|
|
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
|
|
|
|
|
|
|
|
tg->se[cpu] = se;
|
|
|
|
se->cfs_rq = &rq->cfs;
|
|
|
|
se->my_q = cfs_rq;
|
|
|
|
se->load.weight = tg->shares;
|
|
|
|
se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
|
|
|
|
se->parent = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
|
|
|
|
struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
|
|
|
|
int cpu, int add)
|
|
|
|
{
|
|
|
|
tg->rt_rq[cpu] = rt_rq;
|
|
|
|
init_rt_rq(rt_rq, rq);
|
|
|
|
rt_rq->tg = tg;
|
|
|
|
rt_rq->rt_se = rt_se;
|
|
|
|
if (add)
|
|
|
|
list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
|
|
|
|
|
|
|
|
tg->rt_se[cpu] = rt_se;
|
|
|
|
rt_se->rt_rq = &rq->rt;
|
|
|
|
rt_se->my_q = rt_rq;
|
|
|
|
rt_se->parent = NULL;
|
|
|
|
INIT_LIST_HEAD(&rt_se->run_list);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
/*
 * Boot-time initialisation of the scheduler.
 *
 * Initialises the runqueue of every possible cpu (lock, CFS and RT
 * runqueues, load history and, on SMP, the balancing and migration state),
 * sets up the scheduler related fields of init_task, and finally turns the
 * calling task into the idle thread of the current cpu.
 */
void __init sched_init(void)
{
	int highest_cpu = 0;
	int i, j;

#ifdef CONFIG_SMP
	init_defrootdomain();
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	list_add(&init_task_group.list, &task_groups);
#endif

	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);

		/* Lock and basic bookkeeping. */
		spin_lock_init(&rq->lock);
		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
		rq->nr_running = 0;
		rq->clock = 1;

		/* The runqueue's own CFS and RT queues. */
		init_cfs_rq(&rq->cfs, rq);
		init_rt_rq(&rq->rt, rq);

#ifdef CONFIG_FAIR_GROUP_SCHED
		/* Per-cpu entries of the initial task group. */
		init_task_group.shares = init_task_group_load;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		init_tg_cfs_entry(rq, &init_task_group,
				  &per_cpu(init_cfs_rq, i),
				  &per_cpu(init_sched_entity, i), i, 1);

		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
		init_tg_rt_entry(rq, &init_task_group,
				 &per_cpu(init_rt_rq, i),
				 &per_cpu(init_sched_rt_entity, i), i, 1);
#endif

		rq->rt_period_expire = 0;
		rq->rt_throttled = 0;

		/* Clear the cpu load history. */
		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
			rq->cpu_load[j] = 0;

#ifdef CONFIG_SMP
		/* Load balancing and migration state. */
		rq->sd = NULL;
		rq->rd = NULL;
		rq->active_balance = 0;
		rq->next_balance = jiffies;
		rq->push_cpu = 0;
		rq->cpu = i;
		rq->migration_thread = NULL;
		INIT_LIST_HEAD(&rq->migration_queue);
		rq_attach_root(rq, &def_root_domain);
#endif

		init_rq_hrtick(rq);
		atomic_set(&rq->nr_iowait, 0);

		/* Ends up holding the last cpu id visited by the loop. */
		highest_cpu = i;
	}

	set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_SMP
	nr_cpu_ids = highest_cpu + 1;
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
#endif

#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif

	/*
	 * The boot idle thread does lazy MMU switching as well:
	 */
	atomic_inc(&init_mm.mm_count);
	enter_lazy_tlb(&init_mm, current);

	/*
	 * Make us the idle thread. Technically, schedule() should not be
	 * called from this thread, however somewhere below it might be,
	 * but because we are the idle thread, we just pick up running again
	 * when this runqueue becomes "idle".
	 */
	init_idle(current, smp_processor_id());

	/*
	 * During early bootup we pretend to be a normal task:
	 */
	current->sched_class = &fair_sched_class;
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
|
|
|
|
/*
 * Debugging aid: complain when a function that may sleep is called from a
 * context where sleeping is not allowed.
 *
 * @file: source file of the call site, printed in the report
 * @line: source line of the call site, printed in the report
 *
 * A report is produced only if we are in atomic context or have interrupts
 * disabled, the system is in the SYSTEM_RUNNING state and no oops is in
 * progress. Reports are rate limited to one per HZ jiffies. The function
 * does nothing when in_atomic is not defined as a macro.
 */
void __might_sleep(char *file, int line)
{
#ifdef in_atomic
	static unsigned long prev_jiffy;	/* ratelimiting */

	if (!in_atomic() && !irqs_disabled())
		return;
	if (system_state != SYSTEM_RUNNING || oops_in_progress)
		return;

	/* prev_jiffy == 0 means nothing has been reported yet. */
	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
		return;
	prev_jiffy = jiffies;

	printk(KERN_ERR "BUG: sleeping function called from invalid"
			" context at %s:%d\n", file, line);
	/*
	 * Give the second line the same log level as the first one so that
	 * both halves of the report are filtered identically.
	 */
	printk(KERN_ERR "in_atomic():%d, irqs_disabled():%d\n",
			in_atomic(), irqs_disabled());
	debug_show_held_locks(current);
	if (irqs_disabled())
		print_irqtrace_events(current);
	dump_stack();
#endif
}
|
|
|
|
EXPORT_SYMBOL(__might_sleep);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_MAGIC_SYSRQ
|
2007-10-15 17:00:15 +02:00
|
|
|
static void normalize_task(struct rq *rq, struct task_struct *p)
|
|
|
|
{
|
|
|
|
int on_rq;
|
|
|
|
update_rq_clock(rq);
|
|
|
|
on_rq = p->se.on_rq;
|
|
|
|
if (on_rq)
|
|
|
|
deactivate_task(rq, p, 0);
|
|
|
|
__setscheduler(rq, p, SCHED_NORMAL, 0);
|
|
|
|
if (on_rq) {
|
|
|
|
activate_task(rq, p, 0);
|
|
|
|
resched_task(rq->curr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
void normalize_rt_tasks(void)
|
|
|
|
{
|
2007-06-17 18:37:45 +02:00
|
|
|
struct task_struct *g, *p;
|
2005-04-17 00:20:36 +02:00
|
|
|
unsigned long flags;
|
2006-07-03 09:25:42 +02:00
|
|
|
struct rq *rq;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
read_lock_irq(&tasklist_lock);
|
2007-06-17 18:37:45 +02:00
|
|
|
do_each_thread(g, p) {
|
2007-10-15 17:00:18 +02:00
|
|
|
/*
|
|
|
|
* Only normalize user tasks:
|
|
|
|
*/
|
|
|
|
if (!p->mm)
|
|
|
|
continue;
|
|
|
|
|
2007-08-02 17:41:40 +02:00
|
|
|
p->se.exec_start = 0;
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
2007-07-09 18:51:59 +02:00
|
|
|
p->se.wait_start = 0;
|
|
|
|
p->se.sleep_start = 0;
|
|
|
|
p->se.block_start = 0;
|
2007-08-02 17:41:40 +02:00
|
|
|
#endif
|
2007-07-09 18:51:59 +02:00
|
|
|
task_rq(p)->clock = 0;
|
|
|
|
|
|
|
|
if (!rt_task(p)) {
|
|
|
|
/*
|
|
|
|
* Renice negative nice level userspace
|
|
|
|
* tasks back to 0:
|
|
|
|
*/
|
|
|
|
if (TASK_NICE(p) < 0 && p->mm)
|
|
|
|
set_user_nice(p, 0);
|
2005-04-17 00:20:36 +02:00
|
|
|
continue;
|
2007-07-09 18:51:59 +02:00
|
|
|
}
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2006-06-27 11:54:51 +02:00
|
|
|
spin_lock_irqsave(&p->pi_lock, flags);
|
|
|
|
rq = __task_rq_lock(p);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2007-10-15 17:00:18 +02:00
|
|
|
normalize_task(rq, p);
|
2007-10-15 17:00:15 +02:00
|
|
|
|
2006-06-27 11:54:51 +02:00
|
|
|
__task_rq_unlock(rq);
|
|
|
|
spin_unlock_irqrestore(&p->pi_lock, flags);
|
2007-06-17 18:37:45 +02:00
|
|
|
} while_each_thread(g, p);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
read_unlock_irq(&tasklist_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_MAGIC_SYSRQ */
|
2005-09-12 16:59:21 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_IA64
|
|
|
|
/*
|
|
|
|
* These functions are only useful for the IA64 MCA handling.
|
|
|
|
*
|
|
|
|
* They can only be called when the whole system has been
|
|
|
|
* stopped - every CPU needs to be quiescent, and no scheduling
|
|
|
|
* activity can take place. Using them for anything else would
|
|
|
|
* be a serious bug, and as a result, they aren't even visible
|
|
|
|
* under any other configuration.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/**
 * curr_task - return the current task for a given cpu.
 * @cpu: the processor in question.
 *
 * Returns whatever cpu_curr() yields for @cpu.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *task = cpu_curr(cpu);

	return task;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* set_curr_task - set the current task for a given cpu.
|
|
|
|
* @cpu: the processor in question.
|
|
|
|
* @p: the task pointer to set.
|
|
|
|
*
|
|
|
|
* Description: This function must only be used when non-maskable interrupts
|
2007-12-05 15:46:09 +01:00
|
|
|
* are serviced on a separate stack. It allows the architecture to switch the
|
|
|
|
* notion of the current task on a cpu in a non-blocking manner. This function
|
2005-09-12 16:59:21 +02:00
|
|
|
* must be called with all CPU's synchronized, and interrupts disabled, the
|
|
|
|
* and caller must save the original value of the current task (see
|
|
|
|
* curr_task() above) and restore that value before reenabling interrupts and
|
|
|
|
* re-starting the system.
|
|
|
|
*
|
|
|
|
* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
|
|
|
|
*/
|
2006-07-03 09:25:41 +02:00
|
|
|
void set_curr_task(int cpu, struct task_struct *p)
|
2005-09-12 16:59:21 +02:00
|
|
|
{
|
|
|
|
cpu_curr(cpu) = p;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
/*
|
|
|
|
* distribute shares of all task groups among their schedulable entities,
|
2008-01-25 21:08:29 +01:00
|
|
|
* to reflect load distribution across cpus.
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because it's currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
*/
|
|
|
|
static int rebalance_shares(struct sched_domain *sd, int this_cpu)
|
|
|
|
{
|
|
|
|
struct cfs_rq *cfs_rq;
|
|
|
|
struct rq *rq = cpu_rq(this_cpu);
|
|
|
|
cpumask_t sdspan = sd->span;
|
|
|
|
int balanced = 1;
|
|
|
|
|
|
|
|
/* Walk thr' all the task groups that we have */
|
|
|
|
for_each_leaf_cfs_rq(rq, cfs_rq) {
|
|
|
|
int i;
|
|
|
|
unsigned long total_load = 0, total_shares;
|
|
|
|
struct task_group *tg = cfs_rq->tg;
|
|
|
|
|
|
|
|
/* Gather total task load of this group across cpus */
|
|
|
|
for_each_cpu_mask(i, sdspan)
|
|
|
|
total_load += tg->cfs_rq[i]->load.weight;
|
|
|
|
|
2008-01-25 21:08:19 +01:00
|
|
|
/* Nothing to do if this group has no load */
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
if (!total_load)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* tg->shares represents the number of cpu shares the task group
|
|
|
|
* is eligible to hold on a single cpu. On N cpus, it is
|
|
|
|
* eligible to hold (N * tg->shares) number of cpu shares.
|
|
|
|
*/
|
|
|
|
total_shares = tg->shares * cpus_weight(sdspan);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* redistribute total_shares across cpus as per the task load
|
|
|
|
* distribution.
|
|
|
|
*/
|
|
|
|
for_each_cpu_mask(i, sdspan) {
|
|
|
|
unsigned long local_load, local_shares;
|
|
|
|
|
|
|
|
local_load = tg->cfs_rq[i]->load.weight;
|
|
|
|
local_shares = (local_load * total_shares) / total_load;
|
|
|
|
if (!local_shares)
|
|
|
|
local_shares = MIN_GROUP_SHARES;
|
|
|
|
if (local_shares == tg->se[i]->load.weight)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
spin_lock_irq(&cpu_rq(i)->lock);
|
|
|
|
set_se_shares(tg->se[i], local_shares);
|
|
|
|
spin_unlock_irq(&cpu_rq(i)->lock);
|
|
|
|
balanced = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return balanced;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* How frequently should we rebalance_shares() across cpus?
|
|
|
|
*
|
|
|
|
* The more frequently we rebalance shares, the more accurate is the fairness
|
|
|
|
* of cpu bandwidth distribution between task groups. However higher frequency
|
|
|
|
* also implies increased scheduling overhead.
|
|
|
|
*
|
|
|
|
* sysctl_sched_min_bal_int_shares represents the minimum interval between
|
|
|
|
* consecutive calls to rebalance_shares() in the same sched domain.
|
|
|
|
*
|
|
|
|
* sysctl_sched_max_bal_int_shares represents the maximum interval between
|
|
|
|
* consecutive calls to rebalance_shares() in the same sched domain.
|
|
|
|
*
|
2008-01-25 21:08:29 +01:00
|
|
|
* These settings allow for the appropriate trade-off between accuracy of
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
==========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because it's currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
* fairness and the associated overhead.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * Lower bound, in milliseconds, on the sleep between two rebalance_shares()
 * passes. load_balance_monitor() starts from this value and drops back to it
 * whenever a pass had to change some group's shares. Default: 8ms.
 */
|
|
|
|
const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
|
|
|
|
|
|
|
|
/*
 * Upper bound, in milliseconds, on that sleep. load_balance_monitor() keeps
 * doubling its sleep time after balanced passes only while the current value
 * is below this one. Default: 128ms.
 */
|
|
|
|
const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
|
|
|
|
|
|
|
|
/* kernel thread that runs rebalance_shares() periodically */
|
|
|
|
static int load_balance_monitor(void *unused)
|
|
|
|
{
|
|
|
|
unsigned int timeout = sysctl_sched_min_bal_int_shares;
|
|
|
|
struct sched_param schedparm;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't want this thread's execution to be limited by the shares
|
|
|
|
* assigned to default group (init_task_group). Hence make it run
|
|
|
|
* as a SCHED_RR RT task at the lowest priority.
|
|
|
|
*/
|
|
|
|
schedparm.sched_priority = 1;
|
|
|
|
ret = sched_setscheduler(current, SCHED_RR, &schedparm);
|
|
|
|
if (ret)
|
|
|
|
printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
|
|
|
|
" monitor thread (error = %d) \n", ret);
|
|
|
|
|
|
|
|
while (!kthread_should_stop()) {
|
|
|
|
int i, cpu, balanced = 1;
|
|
|
|
|
|
|
|
/* Prevent cpus going down or coming up */
|
2008-01-25 21:08:02 +01:00
|
|
|
get_online_cpus();
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
/* lockout changes to doms_cur[] array */
|
|
|
|
lock_doms_cur();
|
|
|
|
/*
|
|
|
|
* Enter a rcu read-side critical section to safely walk rq->sd
|
|
|
|
* chain on various cpus and to walk task group list
|
|
|
|
* (rq->leaf_cfs_rq_list) in rebalance_shares().
|
|
|
|
*/
|
|
|
|
rcu_read_lock();
|
|
|
|
|
|
|
|
for (i = 0; i < ndoms_cur; i++) {
|
|
|
|
cpumask_t cpumap = doms_cur[i];
|
|
|
|
struct sched_domain *sd = NULL, *sd_prev = NULL;
|
|
|
|
|
|
|
|
cpu = first_cpu(cpumap);
|
|
|
|
|
|
|
|
/* Find the highest domain at which to balance shares */
|
|
|
|
for_each_domain(cpu, sd) {
|
|
|
|
if (!(sd->flags & SD_LOAD_BALANCE))
|
|
|
|
continue;
|
|
|
|
sd_prev = sd;
|
|
|
|
}
|
|
|
|
|
|
|
|
sd = sd_prev;
|
|
|
|
/* sd == NULL? No load balance reqd in this domain */
|
|
|
|
if (!sd)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
balanced &= rebalance_shares(sd, cpu);
|
|
|
|
}
|
|
|
|
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
unlock_doms_cur();
|
2008-01-25 21:08:02 +01:00
|
|
|
put_online_cpus();
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
|
|
|
|
if (!balanced)
|
|
|
|
timeout = sysctl_sched_min_bal_int_shares;
|
|
|
|
else if (timeout < sysctl_sched_max_bal_int_shares)
|
|
|
|
timeout *= 2;
|
|
|
|
|
|
|
|
msleep_interruptible(timeout);
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
static void free_sched_group(struct task_group *tg)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
if (tg->cfs_rq)
|
|
|
|
kfree(tg->cfs_rq[i]);
|
|
|
|
if (tg->se)
|
|
|
|
kfree(tg->se[i]);
|
|
|
|
if (tg->rt_rq)
|
|
|
|
kfree(tg->rt_rq[i]);
|
|
|
|
if (tg->rt_se)
|
|
|
|
kfree(tg->rt_se[i]);
|
|
|
|
}
|
|
|
|
|
|
|
|
kfree(tg->cfs_rq);
|
|
|
|
kfree(tg->se);
|
|
|
|
kfree(tg->rt_rq);
|
|
|
|
kfree(tg->rt_se);
|
|
|
|
kfree(tg);
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:07 +02:00
|
|
|
/* allocate runqueue etc for a new task group */
|
2007-10-15 17:00:14 +02:00
|
|
|
struct task_group *sched_create_group(void)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
2007-10-15 17:00:14 +02:00
|
|
|
struct task_group *tg;
|
2007-10-15 17:00:07 +02:00
|
|
|
struct cfs_rq *cfs_rq;
|
|
|
|
struct sched_entity *se;
|
2008-01-25 21:08:30 +01:00
|
|
|
struct rt_rq *rt_rq;
|
|
|
|
struct sched_rt_entity *rt_se;
|
2007-10-15 17:00:09 +02:00
|
|
|
struct rq *rq;
|
2007-10-15 17:00:07 +02:00
|
|
|
int i;
|
|
|
|
|
|
|
|
tg = kzalloc(sizeof(*tg), GFP_KERNEL);
|
|
|
|
if (!tg)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL);
|
2007-10-15 17:00:07 +02:00
|
|
|
if (!tg->cfs_rq)
|
|
|
|
goto err;
|
2007-10-15 17:00:09 +02:00
|
|
|
tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
|
2007-10-15 17:00:07 +02:00
|
|
|
if (!tg->se)
|
|
|
|
goto err;
|
2008-01-25 21:08:30 +01:00
|
|
|
tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
|
|
|
|
if (!tg->rt_rq)
|
|
|
|
goto err;
|
|
|
|
tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
|
|
|
|
if (!tg->rt_se)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
tg->shares = NICE_0_LOAD;
|
|
|
|
tg->rt_ratio = 0; /* XXX */
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
for_each_possible_cpu(i) {
|
2007-10-15 17:00:09 +02:00
|
|
|
rq = cpu_rq(i);
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
|
|
|
|
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
|
2007-10-15 17:00:07 +02:00
|
|
|
if (!cfs_rq)
|
|
|
|
goto err;
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
se = kmalloc_node(sizeof(struct sched_entity),
|
|
|
|
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
|
2007-10-15 17:00:07 +02:00
|
|
|
if (!se)
|
|
|
|
goto err;
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
rt_rq = kmalloc_node(sizeof(struct rt_rq),
|
|
|
|
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
|
|
|
|
if (!rt_rq)
|
|
|
|
goto err;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
|
|
|
|
GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
|
|
|
|
if (!rt_se)
|
|
|
|
goto err;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
|
|
|
|
init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
2008-01-25 21:07:59 +01:00
|
|
|
lock_task_group_list();
|
2007-10-15 17:00:09 +02:00
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
rq = cpu_rq(i);
|
|
|
|
cfs_rq = tg->cfs_rq[i];
|
|
|
|
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
|
2008-01-25 21:08:30 +01:00
|
|
|
rt_rq = tg->rt_rq[i];
|
|
|
|
list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
|
2007-10-15 17:00:09 +02:00
|
|
|
}
|
2008-01-25 21:08:30 +01:00
|
|
|
list_add_rcu(&tg->list, &task_groups);
|
2008-01-25 21:07:59 +01:00
|
|
|
unlock_task_group_list();
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
return tg;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
err:
|
2008-01-25 21:08:30 +01:00
|
|
|
free_sched_group(tg);
|
2007-10-15 17:00:07 +02:00
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
/* rcu callback to free various structures associated with a task group */
|
2008-01-25 21:08:30 +01:00
|
|
|
static void free_sched_group_rcu(struct rcu_head *rhp)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
|
|
|
/* now it should be safe to free those cfs_rqs */
|
2008-01-25 21:08:30 +01:00
|
|
|
free_sched_group(container_of(rhp, struct task_group, rcu));
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
/* Destroy runqueue etc associated with a task group */
|
2007-10-15 17:00:14 +02:00
|
|
|
void sched_destroy_group(struct task_group *tg)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
2007-10-29 21:18:11 +01:00
|
|
|
struct cfs_rq *cfs_rq = NULL;
|
2008-01-25 21:08:30 +01:00
|
|
|
struct rt_rq *rt_rq = NULL;
|
2007-10-15 17:00:09 +02:00
|
|
|
int i;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2008-01-25 21:07:59 +01:00
|
|
|
lock_task_group_list();
|
2007-10-15 17:00:09 +02:00
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
cfs_rq = tg->cfs_rq[i];
|
|
|
|
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
|
2008-01-25 21:08:30 +01:00
|
|
|
rt_rq = tg->rt_rq[i];
|
|
|
|
list_del_rcu(&rt_rq->leaf_rt_rq_list);
|
2007-10-15 17:00:09 +02:00
|
|
|
}
|
2008-01-25 21:08:30 +01:00
|
|
|
list_del_rcu(&tg->list);
|
2008-01-25 21:07:59 +01:00
|
|
|
unlock_task_group_list();
|
2007-10-15 17:00:09 +02:00
|
|
|
|
2007-10-29 21:18:11 +01:00
|
|
|
BUG_ON(!cfs_rq);
|
2007-10-15 17:00:09 +02:00
|
|
|
|
|
|
|
/* wait for possible concurrent references to cfs_rqs complete */
|
2008-01-25 21:08:30 +01:00
|
|
|
call_rcu(&tg->rcu, free_sched_group_rcu);
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:09 +02:00
|
|
|
/* change task's runqueue when it moves between groups.
|
2007-10-15 17:00:12 +02:00
|
|
|
* The caller of this function should have put the task in its new group
|
|
|
|
* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
|
|
|
|
* reflect its new group.
|
2007-10-15 17:00:09 +02:00
|
|
|
*/
|
|
|
|
void sched_move_task(struct task_struct *tsk)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
|
|
|
int on_rq, running;
|
|
|
|
unsigned long flags;
|
|
|
|
struct rq *rq;
|
|
|
|
|
|
|
|
rq = task_rq_lock(tsk, &flags);
|
|
|
|
|
|
|
|
update_rq_clock(rq);
|
|
|
|
|
2007-12-18 15:21:13 +01:00
|
|
|
running = task_current(rq, tsk);
|
2007-10-15 17:00:07 +02:00
|
|
|
on_rq = tsk->se.on_rq;
|
|
|
|
|
2007-10-15 17:00:08 +02:00
|
|
|
if (on_rq) {
|
2007-10-15 17:00:07 +02:00
|
|
|
dequeue_task(rq, tsk, 0);
|
2007-10-15 17:00:08 +02:00
|
|
|
if (unlikely(running))
|
|
|
|
tsk->sched_class->put_prev_task(rq, tsk);
|
|
|
|
}
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
set_task_rq(tsk, task_cpu(tsk));
|
2007-10-15 17:00:07 +02:00
|
|
|
|
2007-10-15 17:00:08 +02:00
|
|
|
if (on_rq) {
|
|
|
|
if (unlikely(running))
|
|
|
|
tsk->sched_class->set_curr_task(rq);
|
2007-10-15 17:00:07 +02:00
|
|
|
enqueue_task(rq, tsk, 0);
|
2007-10-15 17:00:08 +02:00
|
|
|
}
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
task_rq_unlock(rq, &flags);
|
|
|
|
}
|
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
/* rq->lock to be locked by caller */
|
2007-10-15 17:00:07 +02:00
|
|
|
static void set_se_shares(struct sched_entity *se, unsigned long shares)
|
|
|
|
{
|
|
|
|
struct cfs_rq *cfs_rq = se->cfs_rq;
|
|
|
|
struct rq *rq = cfs_rq->rq;
|
|
|
|
int on_rq;
|
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
if (!shares)
|
|
|
|
shares = MIN_GROUP_SHARES;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
on_rq = se->on_rq;
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
if (on_rq) {
|
2007-10-15 17:00:07 +02:00
|
|
|
dequeue_entity(cfs_rq, se, 0);
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
dec_cpu_load(rq, se->load.weight);
|
|
|
|
}
|
2007-10-15 17:00:07 +02:00
|
|
|
|
|
|
|
se->load.weight = shares;
|
|
|
|
se->load.inv_weight = div64_64((1ULL<<32), shares);
|
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
if (on_rq) {
|
2007-10-15 17:00:07 +02:00
|
|
|
enqueue_entity(cfs_rq, se, 0);
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheuler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIF_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. Its now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
inc_cpu_load(rq, se->load.weight);
|
|
|
|
}
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
|
2007-10-15 17:00:07 +02:00
|
|
|
{
|
|
|
|
int i;
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
struct cfs_rq *cfs_rq;
|
|
|
|
struct rq *rq;
|
2008-01-22 11:24:58 +01:00
|
|
|
|
2008-01-25 21:07:59 +01:00
|
|
|
lock_task_group_list();
|
2007-10-15 17:00:09 +02:00
|
|
|
if (tg->shares == shares)
|
2007-10-15 17:00:14 +02:00
|
|
|
goto done;
|
2007-10-15 17:00:07 +02:00
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
if (shares < MIN_GROUP_SHARES)
|
|
|
|
shares = MIN_GROUP_SHARES;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Prevent any load balance activity (rebalance_shares,
|
|
|
|
* load_balance_fair) from referring to this group first,
|
|
|
|
* by taking it off the rq->leaf_cfs_rq_list on each cpu.
|
|
|
|
*/
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
cfs_rq = tg->cfs_rq[i];
|
|
|
|
list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* wait for any ongoing reference to this group to finish */
|
|
|
|
synchronize_sched();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now we are free to modify the group's share on each cpu
|
|
|
|
* w/o tripping rebalance_share or load_balance_fair.
|
|
|
|
*/
|
2007-10-15 17:00:09 +02:00
|
|
|
tg->shares = shares;
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
spin_lock_irq(&cpu_rq(i)->lock);
|
2007-10-15 17:00:09 +02:00
|
|
|
set_se_shares(tg->se[i], shares);
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
spin_unlock_irq(&cpu_rq(i)->lock);
|
|
|
|
}
|
2007-10-15 17:00:07 +02:00
|
|
|
|
sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups
The current load balancing scheme isn't good enough for precise
group fairness.
For example: on a 8-cpu system, I created 3 groups as under:
a = 8 tasks (cpu.shares = 1024)
b = 4 tasks (cpu.shares = 1024)
c = 3 tasks (cpu.shares = 1024)
a, b and c are task groups that have equal weight. We would expect each
of the groups to receive 33.33% of cpu bandwidth under a fair scheduler.
This is what I get with the latest scheduler git tree:
Signed-off-by: Ingo Molnar <mingo@elte.hu>
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 277.676 | 57.8% | 54.1% 54.1% 54.1% 54.2% 56.7% 62.2% 62.8% 64.5%
b | 116.108 | 24.2% | 47.4% 48.1% 48.7% 49.3%
c | 86.326 | 18.0% | 47.5% 47.9% 48.5%
--------------------------------------------------------------------------------
Explanation of o/p:
Col1 -> Group name
Col2 -> Cumulative execution time (in seconds) received by all tasks of that
group in a 60sec window across 8 cpus
Col3 -> CPU bandwidth received by the group in the 60sec window, expressed in
percentage. Col3 data is derived as:
Col3 = 100 * Col2 / (NR_CPUS * 60)
Col4 -> CPU bandwidth received by each individual task of the group.
Col4 = 100 * cpu_time_recd_by_task / 60
[I can share the test case that produces a similar o/p if reqd]
The deviation from desired group fairness is as below:
a = +24.47%
b = -9.13%
c = -15.33%
which is quite high.
After the patch below is applied, here are the results:
--------------------------------------------------------------------------------
Col1 | Col2 | Col3 | Col4
------|---------|-------|-------------------------------------------------------
a | 163.112 | 34.0% | 33.2% 33.4% 33.5% 33.5% 33.7% 34.4% 34.8% 35.3%
b | 156.220 | 32.5% | 63.3% 64.5% 66.1% 66.5%
c | 160.653 | 33.5% | 85.8% 90.6% 91.4%
--------------------------------------------------------------------------------
Deviation from desired group fairness is as below:
a = +0.67%
b = -0.83%
c = +0.17%
which is far better IMO. Most of other runs have yielded a deviation within
+-2% at the most, which is good.
Why do we see bad (group) fairness with current scheduler?
=========================================================
Currently cpu's weight is just the summation of individual task weights.
This can yield incorrect results. For ex: consider three groups as below
on a 2-cpu system:
CPU0 CPU1
---------------------------
A (10) B(5)
C(5)
---------------------------
Group A has 10 tasks, all on CPU0, Group B and C have 5 tasks each all
of which are on CPU1. Each task has the same weight (NICE_0_LOAD =
1024).
The current scheme would yield a cpu weight of 10240 (10*1024) for each cpu and
the load balancer will think both CPUs are perfectly balanced and won't
move around any tasks. This, however, would yield this bandwidth:
A = 50%
B = 25%
C = 25%
which is not the desired result.
What's changing in the patch?
=============================
- How cpu weights are calculated when CONFIG_FAIR_GROUP_SCHED is
defined (see below)
- API Change
- Two tunables introduced in sysfs (under SCHED_DEBUG) to
control the frequency at which the load balance monitor
thread runs.
The basic change made in this patch is how cpu weight (rq->load.weight) is
calculated. It's now calculated as the summation of group weights on a cpu,
rather than summation of task weights. Weight exerted by a group on a
cpu is dependent on the shares allocated to it and also the number of
tasks the group has on that cpu compared to the total number of
(runnable) tasks the group has in the system.
Let,
W(K,i) = Weight of group K on cpu i
T(K,i) = Task load present in group K's cfs_rq on cpu i
T(K) = Total task load of group K across various cpus
S(K) = Shares allocated to group K
NRCPUS = Number of online cpus in the scheduler domain to
which group K is assigned.
Then,
W(K,i) = S(K) * NRCPUS * T(K,i) / T(K)
A load balance monitor thread is created at bootup, which periodically
runs and adjusts group's weight on each cpu. To avoid its overhead, two
min/max tunables are introduced (under SCHED_DEBUG) to control the rate
at which it runs.
Fixes from: Peter Zijlstra <a.p.zijlstra@chello.nl>
- don't start the load_balance_monitor when there is only a single cpu.
- rename the kthread because its currently longer than TASK_COMM_LEN
Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-01-25 21:08:00 +01:00
|
|
|
/*
|
|
|
|
* Enable load balance activity on this group, by inserting it back on
|
|
|
|
* each cpu's rq->leaf_cfs_rq_list.
|
|
|
|
*/
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
rq = cpu_rq(i);
|
|
|
|
cfs_rq = tg->cfs_rq[i];
|
|
|
|
list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
|
|
|
|
}
|
2007-10-15 17:00:14 +02:00
|
|
|
done:
|
2008-01-25 21:07:59 +01:00
|
|
|
unlock_task_group_list();
|
2007-10-15 17:00:09 +02:00
|
|
|
return 0;
|
2007-10-15 17:00:07 +02:00
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:14 +02:00
|
|
|
unsigned long sched_group_shares(struct task_group *tg)
|
|
|
|
{
|
|
|
|
return tg->shares;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
/*
|
|
|
|
* Ensure the total rt_ratio <= sysctl_sched_rt_ratio
|
|
|
|
*/
|
|
|
|
int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
|
|
|
|
{
|
|
|
|
struct task_group *tgi;
|
|
|
|
unsigned long total = 0;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
list_for_each_entry_rcu(tgi, &task_groups, list)
|
|
|
|
total += tgi->rt_ratio;
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
tg->rt_ratio = rt_ratio;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned long sched_group_rt_ratio(struct task_group *tg)
|
|
|
|
{
|
|
|
|
return tg->rt_ratio;
|
|
|
|
}
|
|
|
|
|
2007-10-15 17:00:12 +02:00
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
2007-10-19 08:41:03 +02:00
|
|
|
|
|
|
|
#ifdef CONFIG_FAIR_CGROUP_SCHED
|
|
|
|
|
|
|
|
/* return corresponding task_group object of a cgroup */
|
2007-10-24 18:23:50 +02:00
|
|
|
static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
|
2007-10-19 08:41:03 +02:00
|
|
|
{
|
2007-10-24 18:23:50 +02:00
|
|
|
return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
|
|
|
|
struct task_group, css);
|
2007-10-19 08:41:03 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct cgroup_subsys_state *
|
2007-10-24 18:23:50 +02:00
|
|
|
cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
|
2007-10-19 08:41:03 +02:00
|
|
|
{
|
|
|
|
struct task_group *tg;
|
|
|
|
|
2007-10-24 18:23:50 +02:00
|
|
|
if (!cgrp->parent) {
|
2007-10-19 08:41:03 +02:00
|
|
|
/* This is early initialization for the top cgroup */
|
2007-10-24 18:23:50 +02:00
|
|
|
init_task_group.css.cgroup = cgrp;
|
2007-10-19 08:41:03 +02:00
|
|
|
return &init_task_group.css;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* we support only 1-level deep hierarchical scheduler atm */
|
2007-10-24 18:23:50 +02:00
|
|
|
if (cgrp->parent->parent)
|
2007-10-19 08:41:03 +02:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
|
|
|
tg = sched_create_group();
|
|
|
|
if (IS_ERR(tg))
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
/* Bind the cgroup to task_group object we just created */
|
2007-10-24 18:23:50 +02:00
|
|
|
tg->css.cgroup = cgrp;
|
2007-10-19 08:41:03 +02:00
|
|
|
|
|
|
|
return &tg->css;
|
|
|
|
}
|
|
|
|
|
2007-12-05 15:46:09 +01:00
|
|
|
/*
 * 'destroy' callback of the cpu cgroup subsystem: hand the task_group that
 * belongs to @cgrp to sched_destroy_group().
 */
static void
cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	sched_destroy_group(cgroup_tg(cgrp));
}
|
|
|
|
|
2007-12-05 15:46:09 +01:00
|
|
|
static int
|
|
|
|
cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
|
|
|
|
struct task_struct *tsk)
|
2007-10-19 08:41:03 +02:00
|
|
|
{
|
|
|
|
/* We don't support RT-tasks being in separate groups */
|
|
|
|
if (tsk->sched_class != &fair_sched_class)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * 'attach' callback of the cpu cgroup subsystem: calls sched_move_task()
 * on @tsk. The cgroup arguments are not used here.
 */
static void
cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
		  struct cgroup *old_cont, struct task_struct *tsk)
{
	sched_move_task(tsk);
}
|
|
|
|
|
2007-10-24 18:23:50 +02:00
|
|
|
static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype,
|
|
|
|
u64 shareval)
|
2007-10-19 08:41:03 +02:00
|
|
|
{
|
2007-10-24 18:23:50 +02:00
|
|
|
return sched_group_set_shares(cgroup_tg(cgrp), shareval);
|
2007-10-19 08:41:03 +02:00
|
|
|
}
|
|
|
|
|
2007-10-24 18:23:50 +02:00
|
|
|
static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
|
2007-10-19 08:41:03 +02:00
|
|
|
{
|
2007-10-24 18:23:50 +02:00
|
|
|
struct task_group *tg = cgroup_tg(cgrp);
|
2007-10-19 08:41:03 +02:00
|
|
|
|
|
|
|
return (u64) tg->shares;
|
|
|
|
}
|
|
|
|
|
2008-01-25 21:08:30 +01:00
|
|
|
static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
|
|
|
|
u64 rt_ratio_val)
|
|
|
|
{
|
|
|
|
return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
|
|
|
|
}
|
|
|
|
|
|
|
|
static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
|
|
|
|
{
|
|
|
|
struct task_group *tg = cgroup_tg(cgrp);
|
|
|
|
|
|
|
|
return (u64) tg->rt_ratio;
|
|
|
|
}
|
|
|
|
|
2007-10-29 21:18:11 +01:00
|
|
|
static struct cftype cpu_files[] = {
|
|
|
|
{
|
|
|
|
.name = "shares",
|
|
|
|
.read_uint = cpu_shares_read_uint,
|
|
|
|
.write_uint = cpu_shares_write_uint,
|
|
|
|
},
|
2008-01-25 21:08:30 +01:00
|
|
|
{
|
|
|
|
.name = "rt_ratio",
|
|
|
|
.read_uint = cpu_rt_ratio_read_uint,
|
|
|
|
.write_uint = cpu_rt_ratio_write_uint,
|
|
|
|
},
|
2007-10-19 08:41:03 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
|
|
|
|
{
|
2007-10-29 21:18:11 +01:00
|
|
|
return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
|
2007-10-19 08:41:03 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
struct cgroup_subsys cpu_cgroup_subsys = {
|
2007-10-29 21:18:11 +01:00
|
|
|
.name = "cpu",
|
|
|
|
.create = cpu_cgroup_create,
|
|
|
|
.destroy = cpu_cgroup_destroy,
|
|
|
|
.can_attach = cpu_cgroup_can_attach,
|
|
|
|
.attach = cpu_cgroup_attach,
|
|
|
|
.populate = cpu_cgroup_populate,
|
|
|
|
.subsys_id = cpu_cgroup_subsys_id,
|
2007-10-19 08:41:03 +02:00
|
|
|
.early_init = 1,
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* CONFIG_FAIR_CGROUP_SCHED */
|
2007-12-02 20:04:49 +01:00
|
|
|
|
|
|
|
#ifdef CONFIG_CGROUP_CPUACCT
|
|
|
|
|
|
|
|
/*
|
|
|
|
* CPU accounting code for task groups.
|
|
|
|
*
|
|
|
|
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
|
|
|
|
* (balbir@in.ibm.com).
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* track cpu usage of a group of tasks */
|
|
|
|
struct cpuacct {
|
|
|
|
struct cgroup_subsys_state css;
|
|
|
|
/* cpuusage holds pointer to a u64-type object on every cpu */
|
|
|
|
u64 *cpuusage;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct cgroup_subsys cpuacct_subsys;
|
|
|
|
|
|
|
|
/* return cpu accounting group corresponding to this container */
|
|
|
|
static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
|
|
|
|
{
|
|
|
|
return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
|
|
|
|
struct cpuacct, css);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* return cpu accounting group to which this task belongs */
|
|
|
|
static inline struct cpuacct *task_ca(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
|
|
|
|
struct cpuacct, css);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* create a new cpu accounting group */
|
|
|
|
static struct cgroup_subsys_state *cpuacct_create(
|
|
|
|
struct cgroup_subsys *ss, struct cgroup *cont)
|
|
|
|
{
|
|
|
|
struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
|
|
|
|
|
|
|
|
if (!ca)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
|
|
|
ca->cpuusage = alloc_percpu(u64);
|
|
|
|
if (!ca->cpuusage) {
|
|
|
|
kfree(ca);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
|
|
|
|
|
|
|
return &ca->css;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* destroy an existing cpu accounting group */
|
2007-12-05 15:46:09 +01:00
|
|
|
static void
|
|
|
|
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
|
2007-12-02 20:04:49 +01:00
|
|
|
{
|
|
|
|
struct cpuacct *ca = cgroup_ca(cont);
|
|
|
|
|
|
|
|
free_percpu(ca->cpuusage);
|
|
|
|
kfree(ca);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* return total cpu usage (in nanoseconds) of a group */
|
|
|
|
static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
|
|
|
|
{
|
|
|
|
struct cpuacct *ca = cgroup_ca(cont);
|
|
|
|
u64 totalcpuusage = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for_each_possible_cpu(i) {
|
|
|
|
u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Take rq->lock to make 64-bit addition safe on 32-bit
|
|
|
|
* platforms.
|
|
|
|
*/
|
|
|
|
spin_lock_irq(&cpu_rq(i)->lock);
|
|
|
|
totalcpuusage += *cpuusage;
|
|
|
|
spin_unlock_irq(&cpu_rq(i)->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
return totalcpuusage;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct cftype files[] = {
|
|
|
|
{
|
|
|
|
.name = "usage",
|
|
|
|
.read_uint = cpuusage_read,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
|
|
|
|
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
|
|
|
|
{
|
|
|
|
return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* charge this task's execution time to its accounting group.
|
|
|
|
*
|
|
|
|
* called with rq->lock held.
|
|
|
|
*/
|
|
|
|
static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
|
|
|
{
|
|
|
|
struct cpuacct *ca;
|
|
|
|
|
|
|
|
if (!cpuacct_subsys.active)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ca = task_ca(tsk);
|
|
|
|
if (ca) {
|
|
|
|
u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
|
|
|
|
|
|
|
|
*cpuusage += cputime;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct cgroup_subsys cpuacct_subsys = {
|
|
|
|
.name = "cpuacct",
|
|
|
|
.create = cpuacct_create,
|
|
|
|
.destroy = cpuacct_destroy,
|
|
|
|
.populate = cpuacct_populate,
|
|
|
|
.subsys_id = cpuacct_subsys_id,
|
|
|
|
};
|
|
|
|
#endif /* CONFIG_CGROUP_CPUACCT */
|