linux內核分析之調度——實時調度演算法
linux內核分析之調度——實時調度演算法
linux內核中提供了兩種實時調度策略:SCHED_FIFO和SCHED_RR,其中RR是帶有時間片的FIFO。這兩種調度演算法實現的都是靜態優先順序。內核不為實時進程計算動態優先順序。這能保證給定優先順序別的實時進程總能搶佔優先順序比他低得進程。linux的實時調度演算法提供了一種軟實時工作方式。實時優先順序範圍從0到MAX_RT_PRIO減一。默認情況下,MAX_RT_PRIO為100,所以默認的實時優先順序範圍是從0到99.SCHED_NORMAL級進程的nice值共享了這個取值空間;他的取值範圍是從MAX_RT_PRIO到MAX_RT_PRIO+40.也就是說,在默認情況下,nice值從-20到19直接對應的是從100到139的實時優先順序範圍。
數據結構
實時調度的優先順序隊列
實時調度的優先順序隊列是一組鏈表,每個優先順序對應一個鏈表。
view plaincopy to clipboardprint?/*實時調度的優先順序隊列,實時調度中,運行進程
根據優先順序放到對應的隊列裡面,對於相同的優先順序
的進程後面來的進程放到同一優先順序隊列的隊尾
對於FIFO/RR調度,各自的進程需要設置相關的屬性
進程運行時,要根據task中的這些屬性判斷和設置
放棄cpu的時機(運行完或是時間片用完)*/struct rt_prio_array {
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue;
};
/*實時調度的優先順序隊列,實時調度中,運行進程
根據優先順序放到對應的隊列裡面,對於相同的優先順序
的進程後面來的進程放到同一優先順序隊列的隊尾
對於FIFO/RR調度,各自的進程需要設置相關的屬性
進程運行時,要根據task中的這些屬性判斷和設置
放棄cpu的時機(運行完或是時間片用完)*/
struct rt_prio_array {
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue;
};運行隊列
運行隊列組織實時調度的相關信息
view plaincopy to clipboardprint?/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
struct sched_rt_entity *rt_se;
#endif
};
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
int next; /* next highest */
#endif
} highest_prio;
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#endif
int rt_throttled;
u64 rt_time;
u64 rt_runtime;
/* Nests inside the rq lock: */
spinlock_t rt_runtime_lock;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
struct sched_rt_entity *rt_se;
#endif
};進程調度實體
view plaincopy to clipboardprint?struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned int time_slice;
int nr_cpus_allowed;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
struct rt_rq *my_q;
#endif
};
struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned int time_slice;
int nr_cpus_allowed;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
struct rt_rq *my_q;
#endif
};調度類
view plaincopy to clipboardprint?static const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
.check_preempt_curr = check_preempt_curr_rt,
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
.load_balance = load_balance_rt,
.move_one_task = move_one_task_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt,
#endif
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
};
static const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
.check_preempt_curr = check_preempt_curr_rt,
.pick_next_task = pick_next_task_rt,
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
.load_balance = load_balance_rt,
.move_one_task = move_one_task_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt,
#endif
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
.get_rr_interval = get_rr_interval_rt,
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
};從運行隊列中移除函數dequeue_task_rt
view plaincopy to clipboardprint?static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
struct sched_rt_entity *rt_se = &p->rt;
/*更新調度信息*/
update_curr_rt(rq);
/*實際工作,將rt_se從運行隊列中刪除然後
添加到隊列尾部*/
dequeue_rt_entity(rt_se);
/*從hash表中刪除*/
dequeue_pushable_task(rq, p);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
{
struct sched_rt_entity *rt_se = &p->rt;
/*更新調度信息*/
update_curr_rt(rq);
/*實際工作,將rt_se從運行隊列中刪除然後
添加到隊列尾部*/
dequeue_rt_entity(rt_se);
/*從hash表中刪除*/
dequeue_pushable_task(rq, p);
}
view plaincopy to clipboardprint?/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (!task_has_rt_policy(curr))/*判斷是否問實時調度進程*/
return;
/*執行時間*/
delta_exec = rq->clock - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
/*更新當前進程的總的執行時間*/
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
/*更新執行的開始時間*/
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);/*主調度相關*/
/*更新調度信息*/
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled())
return;
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
resched_task(curr);
spin_unlock(&rt_rq->rt_runtime_lock);
}
}
}
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
*/
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (!task_has_rt_policy(curr))/*判斷是否問實時調度進程*/
return;
/*執行時間*/
delta_exec = rq->clock - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
/*更新當前進程的總的執行時間*/
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
/*更新執行的開始時間*/
curr->se.exec_start = rq->clock;
cpuacct_charge(curr, delta_exec);/*主調度相關*/
/*更新調度信息*/
sched_rt_avg_update(rq, delta_exec);
if (!rt_bandwidth_enabled())
return;
for_each_sched_rt_entity(rt_se) {
rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
resched_task(curr);
spin_unlock(&rt_rq->rt_runtime_lock);
}
}
}
view plaincopy to clipboardprint?static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
/*從運行隊列中刪除*/
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se);/*添加到隊列尾*/
}
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
/*從運行隊列中刪除*/
dequeue_rt_stack(rt_se);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se);/*添加到隊列尾*/
}
}
view plaincopy to clipboardprint?/*
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
struct sched_rt_entity *back = NULL;
for_each_sched_rt_entity(rt_se) {/*遍歷整個組調度實體*/
rt_se->back = back;/*可見rt_se的back實體為組調度中前一個調度實體*/
back = rt_se;
}
/*將組中的所有進程從運行隊列中移除*/
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
}
}
/*
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
struct sched_rt_entity *back = NULL;
for_each_sched_rt_entity(rt_se) {/*遍歷整個組調度實體*/
rt_se->back = back;/*可見rt_se的back實體為組調度中前一個調度實體*/
back = rt_se;
}
/*將組中的所有進程從運行隊列中移除*/
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
}
}刪除的實際工作為:view plaincopy to clipboardprint?static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
list_del_init(&rt_se->run_list);
if (list_empty(array->queue + rt_se_prio(rt_se)))
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
}
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
list_del_init(&rt_se->run_list);
if (list_empty(array->queue + rt_se_prio(rt_se)))
__clear_bit(rt_se_prio(rt_se), array->bitmap);
dec_rt_tasks(rt_se, rt_rq);
}添加進程到運行隊列view plaincopy to clipboardprint?/*
* Adding/removing a task to/from a priority array:
*/
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
struct sched_rt_entity *rt_se = &p->rt;
if (wakeup)
rt_se->timeout = 0;
/*實際工作*/
enqueue_rt_entity(rt_se);
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);/*添加到對應的hash表中*/
}
/*
* Adding/removing a task to/from a priority array:
*/
static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
{
struct sched_rt_entity *rt_se = &p->rt;
if (wakeup)
rt_se->timeout = 0;
/*實際工作*/
enqueue_rt_entity(rt_se);
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);/*添加到對應的hash表中*/
}view plaincopy to clipboardprint?static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
dequeue_rt_stack(rt_se);/*從運行隊列中刪除*/
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se);/*添加到運行隊列尾部*/
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
dequeue_rt_stack(rt_se);/*從運行隊列中刪除*/
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se);/*添加到運行隊列尾部*/
}實際的添加工作:view plaincopy to clipboardprint?static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
struct list_head *queue = array->queue + rt_se_prio(rt_se);
/*
* Don't enqueue the group if its throttled, or when empty.
* The latter is a consequence of the former when a child group
* get throttled and the current group doesn't have any other
* active members.
*/
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
list_add_tail(&rt_se->run_list, queue);
__set_bit(rt_se_prio(rt_se), array->bitmap);
inc_rt_tasks(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
struct list_head *queue = array->queue + rt_se_prio(rt_se);
/*
* Don't enqueue the group if its throttled, or when empty.
* The latter is a consequence of the former when a child group
* get throttled and the current group doesn't have any other
* active members.
*/
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
return;
list_add_tail(&rt_se->run_list, queue);
__set_bit(rt_se_prio(rt_se), array->bitmap);
inc_rt_tasks(rt_se, rt_rq);
}選擇下一個進程運行view plaincopy to clipboardprint?static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p = _pick_next_task_rt(rq);/*實際工作*/
/* The running task is never eligible for pushing */
if (p)
dequeue_pushable_task(rq, p);
#ifdef CONFIG_SMP
/*
* We detect this state here so that we can avoid taking the RQ
* lock again later if there is no need to push
*/
rq->post_schedule = has_pushable_tasks(rq);
#endif
return p;
}
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p = _pick_next_task_rt(rq);/*實際工作*/
/* The running task is never eligible for pushing */
if (p)
dequeue_pushable_task(rq, p);
#ifdef CONFIG_SMP
/*
* We detect this state here so that we can avoid taking the RQ
* lock again later if there is no need to push
*/
rq->post_schedule = has_pushable_tasks(rq);
#endif
return p;
}
view plaincopy to clipboardprint?static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
struct rt_rq *rt_rq;
rt_rq = &rq->rt;
if (unlikely(!rt_rq->rt_nr_running))
return NULL;
if (rt_rq_throttled(rt_rq))
return NULL;
do {/*遍歷組調度中的每個進程*/
rt_se = pick_next_rt_entity(rq, rt_rq);
BUG_ON(!rt_se);
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
p = rt_task_of(rt_se);
/*更新執行域*/
p->se.exec_start = rq->clock;
return p;
}
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
struct rt_rq *rt_rq;
rt_rq = &rq->rt;
if (unlikely(!rt_rq->rt_nr_running))
return NULL;
if (rt_rq_throttled(rt_rq))
return NULL;
do {/*遍歷組調度中的每個進程*/
rt_se = pick_next_rt_entity(rq, rt_rq);
BUG_ON(!rt_se);
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
p = rt_task_of(rt_se);
/*更新執行域*/
p->se.exec_start = rq->clock;
return p;
}
view plaincopy to clipboardprint?static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
struct list_head *queue;
int idx;
/*找到第一個可用的*/
idx = sched_find_first_bit(array->bitmap);
BUG_ON(idx >= MAX_RT_PRIO);
/*從鏈表組中找到對應的鏈表*/
queue = array->queue + idx;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
/*返回找到的運行實體*/
return next;
}
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
struct list_head *queue;
int idx;
/*找到第一個可用的*/
idx = sched_find_first_bit(array->bitmap);
BUG_ON(idx >= MAX_RT_PRIO);
/*從鏈表組中找到對應的鏈表*/
queue = array->queue + idx;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
/*返回找到的運行實體*/
return next;
}總結:對於實時調度,基於linux內核調度框架下作的工作比較簡單,把所有的運行進程根據優先順序放到不用的隊列裡面,採用點陣圖方式進行使用記錄。進隊列僅僅是刪除原來隊列裡面的本進程,然後將他掛到隊列尾部;而對於「移除」操作,也僅僅是從隊列裡面移除后添加到運行隊列尾部。
《解決方案》
謝謝分享