Linux RT调度器之带宽控制

fanxiaoyu321

已于 2024-08-25 18:49:50 修改

阅读量1.2k

点赞数 1

分类专栏： Linux进程调度文章标签： linux 带宽控制 RUNTIME_SHARE RT

于 2023-11-12 16:42:32 首次发布

本文链接：https://blog.csdn.net/xiaoyu_750516366/article/details/134362371

版权

Linux进程调度专栏收录该内容

8 篇文章

订阅专栏

这篇笔记详细介绍了Linux RT调度器的带宽控制，包括CPU和任务组级别的配置参数，rt_bandwidth数据结构，rt_rq运行队列，以及初始化和带宽控制的实现，如sched_rt_runtime_exceeded函数和带宽控制定时器。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

所谓带宽控制，指的是限制任务长时间占用CPU的能力。Deadline调度器、RT调度器、CFS调度器均支持这种能力。这篇笔记记录了RT调度器的带宽控制相关实现，代码基于Linux 5.10内核。

RT调度器的带宽控制核心思想是：限制CPU上可运行的RT任务在检测周期内占用CPU的时长不能超过运行门限，检测周期和运行门限是RT带宽控制的两个可配置参数。

RT调度类的带宽控制默认在cpu运行队列上生效。支持组调度后，可以扩展到任务组的运行队列上生效。

可配置参数

如开篇提到的，有两个配置参数，可以在CPU和任务组两个级别生效。

CPU级别配置参数

在CPU级别上，由下面两个全局变量分别表示检测周期和运行门限两个带宽控制参数。默认的检测周期为1s，RT任务的运行门限为0.95s，即RT任务最多可以占用95%的CPU。

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 * sysctl node: /proc/sys/kernel/sched_rt_period_us
 */
unsigned int sysctl_sched_rt_period = 1000000;
/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 * sysctl node: /proc/sys/kernel/sched_rt_runtime_us
 * Setting this to -1, or to the same value as sysctl_sched_rt_period, means
 * the run time of RT tasks on a CPU run queue is not limited; in every other
 * case the value must be smaller than sysctl_sched_rt_period.
 */
int sysctl_sched_rt_runtime = 950000;

RT调度器通过判断sysctl_sched_rt_runtime变量来确定系统是否开启了带宽控制。

/*
 * Report whether RT bandwidth control is switched on system-wide.
 * Returns 1 when sysctl_sched_rt_runtime is non-negative, 0 otherwise.
 */
static inline int rt_bandwidth_enabled(void)
{
    if (sysctl_sched_rt_runtime < 0)
        return 0;

    return 1;
}

任务组级别配置参数

支持组调度后，可以在任务组的运行队列上配置检测周期和运行门限，配置节点是每个任务组目录下的rt_period_us和rt_runtime_us属性文件。

/* Attribute files of the cpu controller; only the RT bandwidth entries are shown. */
static struct cftype cpu_files[] = {
...

#ifdef CONFIG_RT_GROUP_SCHED
    /* Per-task-group runtime threshold, signed 64-bit read/write handlers. */
    {
        .name = "rt_runtime_us",
        .read_s64 = cpu_rt_runtime_read,
        .write_s64 = cpu_rt_runtime_write,
    },
    /* Per-task-group check period, unsigned 64-bit read/write handlers. */
    {
        .name = "rt_period_us",
        .read_u64 = cpu_rt_period_read_uint,
        .write_u64 = cpu_rt_period_write_uint,
    },
#endif
    { }    /* terminate */
};

Linux内核引入了struct rt_bandwidth结构体来保存任务组的带宽配置，在task_group中包含了该结构。root_task_group.rt_bandwidth保存的是cpu级别的配置信息。

/* RT bandwidth configuration of one task group, plus the timer that lifts throttling. */
struct rt_bandwidth {

    raw_spinlock_t rt_runtime_lock; // protects the fields below
    ktime_t rt_period; // check period setting, set from a nanosecond value via ns_to_ktime()
    u64 rt_runtime; // runtime threshold setting, in nanoseconds
    struct hrtimer rt_period_timer; // high-resolution timer used to lift throttling
    unsigned int rt_period_active; // non-zero means the timer has been started
};

/* Task group descriptor; only the RT bandwidth member is shown in this excerpt. */
struct task_group {
...
#ifdef CONFIG_RT_GROUP_SCHED

    // RT bandwidth information of this task group
    struct rt_bandwidth rt_bandwidth;
#endif
}

rt_period

任务组的检测周期配置参数，检查周期必须大于0。

rt_runtime

任务组的运行门限配置参数。如果配置为0，表示该任务组中的任务没有带宽配额，效果是任务组中的任务将无法被调度器调度到。如果配置为rt_period或负值，表示该任务组中的任务有无限的带宽配额，不会被带宽限制机制限制运行。

此外，还设计了一个全局变量def_rt_bandwidth保存了cpu级别的带宽配置参数，该变量在不支持组调度时使用。

/* CPU-level RT bandwidth settings; sched_init() fills it from the global period/runtime. */
struct rt_bandwidth def_rt_bandwidth;

设置参数

系统中的所有任务组是一个倒长的树，可以想象，低层次任务组的配置参数必须要服从高层次任务组的配置参数。下面以支持组调度的情况讨论上述两个可配置参数的设置过程。

/*
 * Change the runtime threshold of task group @tg.
 *
 * @rt_runtime_us is given in microseconds. A negative value selects
 * RUNTIME_INF (no bandwidth limit for the group); a value too large to be
 * expressed in nanoseconds is rejected with -EINVAL. The group's current
 * period is kept, and the pair is validated and applied by
 * tg_set_rt_bandwidth(), whose result is returned.
 */
int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
    u64 period_ns = ktime_to_ns(tg->rt_bandwidth.rt_period);
    u64 runtime_ns;

    if (rt_runtime_us < 0) {
        runtime_ns = RUNTIME_INF;
    } else {
        /* Refuse values whose conversion to nanoseconds would overflow. */
        if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
            return -EINVAL;
        runtime_ns = (u64)rt_runtime_us * NSEC_PER_USEC;
    }

    return tg_set_rt_bandwidth(tg, period_ns, runtime_ns);
}

/*
 * Change the check period of task group @tg.
 *
 * @rt_period_us is given in microseconds; a value too large to be expressed
 * in nanoseconds is rejected with -EINVAL. The group's current runtime
 * threshold is kept, and the pair is validated and applied by
 * tg_set_rt_bandwidth(), whose result is returned.
 */
int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
    /* Guard the microsecond-to-nanosecond conversion against overflow. */
    if (rt_period_us > U64_MAX / NSEC_PER_USEC)
        return -EINVAL;

    return tg_set_rt_bandwidth(tg, rt_period_us * NSEC_PER_USEC,
                               tg->rt_bandwidth.rt_runtime);
}

设置检测周期和运行门限配置参数最后都会调用tg_set_rt_bandwidth()函数检查设置的参数是否符合整个任务组树的约束，如果满足则完成任务组带宽参数的设置。

/*
 * Validate a new (period, runtime) pair for task group @tg and, if it is
 * acceptable, store it in the group and in each of its per-CPU rt_rq.
 *
 * Both values are in nanoseconds. Returns 0 on success, -EINVAL for an
 * invalid pair, or whatever error __rt_schedulable() reports.
 */
static int tg_set_rt_bandwidth(struct task_group *tg,
        u64 rt_period, u64 rt_runtime)
{
    int i, err = 0;

    // A zero runtime on the root group is rejected; per the original note it
    // would leave the kernel unable to create RT tasks.
    if (tg == &root_task_group && rt_runtime == 0)
        return -EINVAL;

    // A zero check period is meaningless and is not allowed.
    if (rt_period == 0)
        return -EINVAL;

    // A finite runtime may not exceed max_rt_runtime (per the original note
    // this bound is very large, typically several hours).
    if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
        return -EINVAL;

    mutex_lock(&rt_constraints_mutex);
    // 1. Check that the new settings satisfy the task-group tree constraints.
    err = __rt_schedulable(tg, rt_period, rt_runtime);
    if (err)
        goto unlock;

    // 2. The settings are valid: commit them to the task group.
    raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
    tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
    tg->rt_bandwidth.rt_runtime = rt_runtime;
    for_each_possible_cpu(i) {
        struct rt_rq *rt_rq = tg->rt_rq[i];

        // Propagate the new quota to the group's run queue on CPU i.
        raw_spin_lock(&rt_rq->rt_runtime_lock);
        rt_rq->rt_runtime = rt_runtime;
        raw_spin_unlock(&rt_rq->rt_runtime_lock);
    }
    raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
    mutex_unlock(&rt_constraints_mutex);
    return err;
}

任务组参数合法性检查

__rt_schedulable()函数负责检查任务组的新参数是否符合任务组树的约束。

/*
 * Check whether giving @tg the proposed @period / @runtime keeps the whole
 * task-group tree consistent.
 *
 * The proposal is packed into an rt_schedulable_data and handed to
 * walk_tg_tree(), which invokes tg_rt_schedulable() on the groups of the tree
 * (per the original note: depth-first, starting at the root). The walk runs
 * inside an RCU read-side critical section. Returns the result of the walk.
 */
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
    struct rt_schedulable_data proposal = {
        .tg = tg,
        .rt_period = period,
        .rt_runtime = runtime,
    };
    int rc;

    rcu_read_lock();
    rc = walk_tg_tree(tg_rt_schedulable, tg_nop, &proposal);
    rcu_read_unlock();

    return rc;
}

/*
 * Per-group callback of the task-group tree walk: validate the RT bandwidth
 * settings of @tg. For the group that is being reconfigured (the one named in
 * @data) the proposed values are used instead of the stored ones.
 *
 * Returns 0 when the settings are acceptable, -EINVAL when a limit is
 * violated, and -EBUSY when the runtime would become zero while the group
 * still has RT tasks.
 */
static int tg_rt_schedulable(struct task_group *tg, void *data)
{
    struct rt_schedulable_data *req = data;
    struct task_group *kid;
    unsigned long own_bw, kids_bw = 0;
    u64 period, runtime;

    /* Effective settings of this group: proposed ones if it is the target. */
    if (tg == req->tg) {
        period = req->rt_period;
        runtime = req->rt_runtime;
    } else {
        period = ktime_to_ns(tg->rt_bandwidth.rt_period);
        runtime = tg->rt_bandwidth.rt_runtime;
    }

    /* A finite runtime larger than the period is an invalid setting. */
    if (runtime != RUNTIME_INF && runtime > period)
        return -EINVAL;

    /*
     * Dropping a non-zero runtime to zero is refused while the group still
     * contains RT tasks (only checked when bandwidth control is enabled).
     */
    if (rt_bandwidth_enabled() && !runtime &&
            tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
        return -EBUSY;

    /* Bandwidth of this group, expressed as a runtime/period ratio. */
    own_bw = to_ratio(period, runtime);

    /* No group may ask for more than the global RT setting allows. */
    if (own_bw > to_ratio(global_rt_period(), global_rt_runtime()))
        return -EINVAL;

    /* The children together must not exceed what this group owns. */
    list_for_each_entry_rcu(kid, &tg->children, siblings) {
        if (kid == req->tg) {
            period = req->rt_period;
            runtime = req->rt_runtime;
        } else {
            period = ktime_to_ns(kid->rt_bandwidth.rt_period);
            runtime = kid->rt_bandwidth.rt_runtime;
        }
        kids_bw += to_ratio(period, runtime);
    }

    return kids_bw > own_bw ? -EINVAL : 0;
}

可以看出，任务组树的参数配置约束还是比较简洁的：属于同一个高层次任务组的低层次任务组之间会共享高层次任务组的带宽，所以，低层次任务组的带宽配置参数不能超过高层次任务组的配置。举个例子：

                                 <---------------->
                                 |       Root     |
                                 v----------------v
                               /                    \
                              /                      \
                      <--------------->       <---------------->
                      |   Professor   |       |     Student    |
                      v---------------v       v----------------v
                    /                   \
                   /                     \
           <---------------->      <----------------->
           |      Mail      |      |      Web        |
           v----------------v      v-----------------v

假设cpu的总带宽为B，根分组上设置的RT任务带宽（即cpu级别的RT带宽配置）为0.5B：

Professor分组、Student分组和根分组中的RT任务将共享这0.5B的带宽。
Professor分组和Student分组的带宽配置加起来不能超过0.5B。
假设Professor分组的RT带宽配置为0.3B，同理，Mail分组和Web分组的配置也必须遵守上面的约束，即二者的带宽配置加起来不能超过0.3B。

RT运行队列扩展

开篇有提到，RT带宽控制是作用在各级cpu运行队列上的，为了支持RT带宽控制，对RT运行队列rt_rq进行了如下扩展：

/* RT run queue; only the fields added for bandwidth control are shown. */
struct rt_rq {
...
    int rt_throttled; // set to 1 once the quota is exhausted (queue is throttled)
    u64 rt_time; // run time accumulated by RT tasks on this queue in the current period
    u64 rt_runtime; // bandwidth quota of this queue
    /* Nests inside the rq lock: */
    raw_spinlock_t rt_runtime_lock;
};

rt_runtime

该运行队列的RT带宽配额，初始值来自运行门限配置参数，但是在开启RT_RUNTIME_SHARE特性后，该值在运行过程中可以发生变化。见下面分析。

/*
 * Bandwidth quota of @rt_rq: the queue's own rt_runtime when it belongs to a
 * task group, RUNTIME_INF (unlimited) when it has no task group.
 */
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
    return rt_rq->tg ? rt_rq->rt_runtime : RUNTIME_INF;
}

rt_time

检测周期内统计到的运行队列上RT任务已运行时长。

rt_throttled

当运行队列上的带宽配额被耗尽后会设置该字段为1，表示运行队列进入“限流”状态，处于限流状态的运行队列上的任务是不会被调度运行的。

/*
 * Tell whether @rt_rq is effectively throttled: its rt_throttled flag is set
 * and it has no boosted entities (rt_nr_boosted is zero). Returns 1 or 0.
 */
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
    if (!rt_rq->rt_throttled)
        return 0;

    /* The boost case is not covered by this article. */
    return !rt_rq->rt_nr_boosted;
}

开机初始化

在sched_init()函数中，有如下和带宽控制相关的初始化逻辑。

/* Boot-time scheduler initialization; only the RT bandwidth part is shown. */
void __init sched_init(void)
{

...
    // Initialize def_rt_bandwidth and the root task group's rt_bandwidth
    // from the CPU-level check period and runtime threshold.
    init_rt_bandwidth(&def_rt_bandwidth,
        global_rt_period(), global_rt_runtime());


#ifdef CONFIG_RT_GROUP_SCHED
    init_rt_bandwidth(&root_task_group.rt_bandwidth,
        global_rt_period(), global_rt_runtime());
#endif /* CONFIG_RT_GROUP_SCHED */

    for_each_possible_cpu(i) {
        struct rq *rq = cpu_rq(i);
        // The quota of each CPU run queue starts at the CPU-level runtime threshold.
        rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
    } 
}

/*
 * Set up an rt_bandwidth descriptor.
 *
 * @period and @runtime are in nanoseconds. The lock is initialized, both
 * settings are stored, and the period timer is prepared (CLOCK_MONOTONIC,
 * relative mode) with sched_rt_period_timer() as its handler. The timer is
 * not started here.
 */
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
    struct hrtimer *period_timer = &rt_b->rt_period_timer;

    raw_spin_lock_init(&rt_b->rt_runtime_lock);

    rt_b->rt_runtime = runtime;
    rt_b->rt_period = ns_to_ktime(period);

    hrtimer_init(period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    period_timer->function = sched_rt_period_timer;
}

此外，在任务组创建时，也会调用init_rt_bandwidth()函数为新的task_group初始化rt_bandwidth，具体见alloc_rt_sched_group()函数。

带宽控制流程

每当调用update_curr_rt()函数更新RT任务的cpu运行统计数据时，都会进行带宽控制检查。

/*
 * Account the CPU time consumed by the current RT task of @rq and, when RT
 * bandwidth control is enabled, charge it to the run queue of every level of
 * the task's group hierarchy, rescheduling if a quota is exceeded.
 * (Excerpt: part of the function is elided.)
 */
static void update_curr_rt(struct rq *rq)
{
    struct task_struct *curr = rq->curr;
    struct sched_rt_entity *rt_se = &curr->rt;
    u64 delta_exec;
    u64 now;
...
    // Time elapsed since the previous accounting point.
    now = rq_clock_task(rq);
    delta_exec = now - curr->se.exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;

    // Move the accounting timestamp forward.
    curr->se.exec_start = now;

    // Nothing more to do when RT bandwidth control is disabled.
    if (!rt_bandwidth_enabled())
        return;

    // Walk from the lowest level upwards, adding the delta to the run queue
    // of each task-group level and checking that no level exceeds its quota.
    for_each_sched_rt_entity(rt_se) {
        // rt_rq the entity is queued on, i.e. the run queue of the group one level up.
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

        // Only queues with a finite quota are checked.
        if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
            raw_spin_lock(&rt_rq->rt_runtime_lock);
            // Charge the delta to this run queue.
            rt_rq->rt_time += delta_exec;
            // Reschedule when the queue has run past its quota.
            if (sched_rt_runtime_exceeded(rt_rq))
                resched_curr(rq);
            raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
    }
}

带宽限制检查

带宽控制的关键逻辑都在sched_rt_runtime_exceeded()函数中。该函数检查运行队列上RT任务的运行时长是否超过了其带宽配额，如果超过了，将运行队列标记为“限流”状态，然后将该队列从cpu运行队列上移除，这样该队列中的任务在“限流”状态被解除前将不会被调度运行。

/*
 * Check whether @rt_rq has consumed more than its bandwidth quota in the
 * current period. If so, mark it throttled and dequeue it.
 *
 * Returns 1 when the queue is (effectively) throttled, 0 otherwise.
 * The caller shown in this file holds rt_rq->rt_runtime_lock.
 */
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
    // Bandwidth quota of rt_rq.
    u64 runtime = sched_rt_runtime(rt_rq);

    // Already throttled: do not re-check, just report the effective state.
    if (rt_rq->rt_throttled)
        return rt_rq_throttled(rt_rq);

    // A quota that is not smaller than the period can never be exceeded.
    if (runtime >= sched_rt_period(rt_rq))
        return 0;

    // With RT_RUNTIME_SHARE the queue may borrow quota from other CPUs.
    // balance_runtime() may change the quota, so read it again afterwards.
    balance_runtime(rt_rq);
    runtime = sched_rt_runtime(rt_rq);
    if (runtime == RUNTIME_INF)
        return 0;

    // The time used in this period exceeds the quota: throttle the queue.
    if (rt_rq->rt_time > runtime) {
        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
        if (likely(rt_b->rt_runtime)) {
            rt_rq->rt_throttled = 1;
            printk_deferred_once("sched: RT throttling activated\n");
        } else {
            /*
             * In case we did anyway, make it go away,
             * replenishment is a joke, since it will replenish us
             * with exactly 0 ns.
             */
            rt_rq->rt_time = 0;
        }
        // Take the throttled run queue off its parent queue.
        if (rt_rq_throttled(rt_rq)) {
            sched_rt_rq_dequeue(rt_rq);
            return 1;
        }
    }
    // Every other case: the quota has not been exceeded.
    return 0;
}

解除限流状态

每个任务组的tg->rt_bandwidth结构中都包含一个高精度定时器，只要任务组中包含了等待运行的任务，就会启动该定时器，调用链为：__enqueue_rt_entity()->inc_rt_tasks()->inc_rt_group()->start_rt_bandwidth()。

/*
 * Enqueue-side group accounting (excerpt): when @rt_rq belongs to a task
 * group, make sure that group's bandwidth period timer is running.
 */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
...
    if (rt_rq->tg)
        start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

/*
 * Start the period timer of @rt_b unless it is already running.
 * Does nothing when RT bandwidth control is disabled or the quota is unlimited.
 */
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
    // No timer is needed without bandwidth control or with an unlimited quota.
    if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
        return;

    // Start the timer if it has not been started yet.
    raw_spin_lock(&rt_b->rt_runtime_lock);
    if (!rt_b->rt_period_active) {
        rt_b->rt_period_active = 1;  // prevents the timer from being started twice

        // NOTE(review): forwarding by a zero interval presumably just arms the
        // timer to expire right away - confirm against the hrtimer API.
        hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
        hrtimer_start_expires(&rt_b->rt_period_timer,
              HRTIMER_MODE_ABS_PINNED_HARD);
    }
    raw_spin_unlock(&rt_b->rt_runtime_lock);
}

定时器处理函数为sched_rt_period_timer()。

/*
 * Handler of the per-rt_bandwidth period timer.
 *
 * For every elapsed period it runs do_sched_rt_period_timer(). Returns
 * HRTIMER_NORESTART when that function reported idle, HRTIMER_RESTART
 * otherwise.
 */
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
    struct rt_bandwidth *rt_b =
        container_of(timer, struct rt_bandwidth, rt_period_timer);
    int idle = 0;
    int overrun;

    raw_spin_lock(&rt_b->rt_runtime_lock);
    for (;;) {
        // Push the expiry to the next check period; overrun is the number of
        // periods the timer has overrun since it was last armed.
        overrun = hrtimer_forward_now(timer, rt_b->rt_period);
        // Per the original note, a timer may be handled slightly early, so
        // bandwidth processing only runs once a period has definitely elapsed.
        if (!overrun)
            break;

        // A period has elapsed: run the bandwidth logic with the lock dropped.
        raw_spin_unlock(&rt_b->rt_runtime_lock);
        idle = do_sched_rt_period_timer(rt_b, overrun);
        raw_spin_lock(&rt_b->rt_runtime_lock);
    }
    // idle means the run queues have nothing left to do: the timer will not be
    // restarted, so clear the "timer started" flag.
    if (idle)
        rt_b->rt_period_active = 0;
    raw_spin_unlock(&rt_b->rt_runtime_lock);

    // Restart the timer only when not idle.
    return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

// Period-timer work for @rt_b after @overrun periods have elapsed: refresh the
// quota, age the accumulated run time and unthrottle run queues on each CPU.
// Returns the "idle" state used by sched_rt_period_timer(): non-zero means the
// timer does NOT need to be restarted, 0 means it must be restarted.
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
    int i, idle = 1, throttled = 0;
    const struct cpumask *span;

    // A task group has a single timer, so the handler has to process the
    // group's run queue on every CPU.
    span = sched_rt_period_mask();
#ifdef CONFIG_RT_GROUP_SCHED
    if (rt_b == &root_task_group.rt_bandwidth)
        span = cpu_online_mask;
#endif
    for_each_cpu(i, span) {
        int enqueue = 0;
        // Run queue of the task group on CPU i.
        struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
        struct rq *rq = rq_of_rt_rq(rt_rq);
        int skip;

        raw_spin_lock(&rt_rq->rt_runtime_lock);
        // Reset the quota to the configured value (only when RT_RUNTIME_SHARE
        // is off and the queue's quota is finite).
        if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
            rt_rq->rt_runtime = rt_b->rt_runtime;
        // No run time accumulated and nothing runnable: skip this queue.
        skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
        raw_spin_unlock(&rt_rq->rt_runtime_lock);
        if (skip)
            continue;

        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);

        if (rt_rq->rt_time) {
            // 1. RT tasks of this queue ran during the check period.
            u64 runtime;

            raw_spin_lock(&rt_rq->rt_runtime_lock);
            if (rt_rq->rt_throttled)
                balance_runtime(rt_rq);
            runtime = rt_rq->rt_runtime;
            // The period is over: subtract overrun * runtime from the
            // accumulated run time, without going below zero.
            rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
            // The queue was throttled and is now back under its quota: unthrottle it.
            if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
                rt_rq->rt_throttled = 0;
                enqueue = 1; // mark the queue for re-enqueueing

                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
                    rq_clock_cancel_skipupdate(rq);
            }
            // Run time still to be aged or tasks still runnable: not idle,
            // the timer has to keep running.
            if (rt_rq->rt_time || rt_rq->rt_nr_running)
                idle = 0;
            raw_spin_unlock(&rt_rq->rt_runtime_lock);
        } else if (rt_rq->rt_nr_running) {
            // 2. The queue has runnable tasks but none ran during the period.
            idle = 0;
            // NOTE(review): the original author asks why the queue is
            // re-enqueued here - left as an open question.
            if (!rt_rq_throttled(rt_rq))
                enqueue = 1;
        }
        // The queue is still throttled.
        if (rt_rq->rt_throttled)
            throttled = 1;

        if (enqueue) // put the run queue back
            sched_rt_rq_enqueue(rt_rq);
        raw_spin_unlock(&rq->lock);
    }

    // Nothing is throttled and bandwidth control is disabled or the quota is
    // unlimited: report idle so the timer is not restarted.
    if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
        return 1;

    return idle;
}

可以看出，定时器到期后，核心逻辑包括两点：

递减运行队列上任务的CPU使用时长。因为检测是按周期进行的，这样相当于按周期将使用时长复位。
对于限流的运行队列则解除限流，并将其重新入队。

RT_RUNTIME_SHARE特性

带宽控制要求一个运行队列上的RT任务在检测周期内的运行时长不能超过带宽配额，这里有一个隐含条件：该限制是针对单个cpu而言的（因为运行队列是per-cpu的）。开启RT_RUNTIME_SHARE特性后，cpu之间的带宽可以借用，即当一个运行队列上的带宽配额不够用时，可以检查其它cpu上是否有多余的带宽，如果能借到，则该运行队列不会进入“限流”状态，可以继续运行。下面来看这部分逻辑是如何实现的。

借用带宽是通过balance_runtime()函数实现的，该函数尝试借用，如果能借到会直接修改运行队列的带宽配额。

/*
 * Try to borrow bandwidth for @rt_rq from other CPUs.
 *
 * Only acts when the RT_RUNTIME_SHARE feature is on and the queue has already
 * used more than its quota. rt_rq->rt_runtime_lock is released around the
 * call to do_balance_runtime() and re-taken afterwards, so the quota may have
 * changed by the time this function returns.
 */
static void balance_runtime(struct rt_rq *rt_rq)
{
    if (!sched_feat(RT_RUNTIME_SHARE))
        return;

    /* Quota not used up yet: nothing to borrow. */
    if (rt_rq->rt_time <= rt_rq->rt_runtime)
        return;

    raw_spin_unlock(&rt_rq->rt_runtime_lock);
    do_balance_runtime(rt_rq);
    raw_spin_lock(&rt_rq->rt_runtime_lock);
}

/*
 * Borrow spare bandwidth for @rt_rq from the same task group's run queues on
 * the other CPUs of the root domain, never raising the quota above the period.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
    // Bandwidth settings that govern this run queue.
    struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
    struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
    int i, weight;
    u64 rt_period;

    weight = cpumask_weight(rd->span);

    raw_spin_lock(&rt_b->rt_runtime_lock);
    rt_period = ktime_to_ns(rt_b->rt_period);
    // Try to borrow from the other CPUs of the root domain.
    for_each_cpu(i, rd->span) {
        // Run queue on the candidate CPU.
        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
        s64 diff;

        if (iter == rt_rq) // never borrow from ourselves
            continue;

        raw_spin_lock(&iter->rt_runtime_lock);

        // Only queues with a finite quota can lend.
        if (iter->rt_runtime == RUNTIME_INF)
            goto next;

        // Spare quota of the candidate (its quota minus what it already used).
        diff = iter->rt_runtime - iter->rt_time;
        if (diff > 0) {
            // Take only a 1/weight share, leaving some for the other CPUs.
            diff = div_u64((u64)diff, weight);
            // Cap the loan so the borrower's quota does not exceed the period.
            if (rt_rq->rt_runtime + diff > rt_period)
                diff = rt_period - rt_rq->rt_runtime;
            // Move the quota from lender to borrower.
            iter->rt_runtime -= diff;
            rt_rq->rt_runtime += diff;
            // The borrower's quota has reached the period: stop borrowing.
            if (rt_rq->rt_runtime == rt_period) {
                raw_spin_unlock(&iter->rt_runtime_lock);
                break;
            }
        }
next:
        raw_spin_unlock(&iter->rt_runtime_lock);
    }
    raw_spin_unlock(&rt_b->rt_runtime_lock);
}

整个带宽借用流程是比较清晰的，但是需要特别注意的是：RT_RUNTIME_SHARE特性可能会导致一个RT任务长时间占用某个cpu，这可能会导致一些只能在该cpu上运行的任务长时间得不到调度，典型的如特定cpu上的kworker线程，这可能会引起系统稳定性问题。

此外，当一个cpu offline时，会尝试将其借出去的配额进行回收，见__disable_runtime()函数实现，流程比较简单，这里不再展开。