Source Code
Version: 5.16-rc3
Core Algorithm
On each ACK, update our model of the network path:
bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
min_rtt = windowed_min(rtt, 10 seconds)
pacing_rate = pacing_gain * bottleneck_bandwidth
cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
* Here is a state transition diagram for BBR:
*
*             |
*             V
* +---> STARTUP  ----+
* |        |         |
* |        V         |
* |      DRAIN   ----+
* |        |         |
* |        V         |
* +---> PROBE_BW ----+
* |      ^    |      |
* |      |    |      |
* |      +----+      |
* |                  |
* +---- PROBE_RTT <--+
Global Parameters
Bandwidth Calculation
#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)
A u32, whose low BW_SCALE bits form the fractional part, expresses the sending rate in packets per microsecond. Assuming 1500-byte packets, this u32 covers bandwidths from 0.06 pps (715 bps) up to 256 Mpps (3 Tbps).
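A quick sanity check of those numbers, as a standalone userspace sketch (not kernel code; the 1500-byte packet size is the assumption stated above):

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)

int main(void)
{
        /* One unit of the scaled rate is 2^-24 packets per microsecond. */
        double min_pps = 1e6 / BW_UNIT;              /* ~0.06 packets/sec */
        double min_bps = min_pps * 1500 * 8;         /* ~715 bits/sec */
        /* A u32 tops out at 2^32 units = 256 packets per microsecond. */
        double max_pps = (double)UINT32_MAX / BW_UNIT * 1e6;
        double max_bps = max_pps * 1500 * 8;         /* ~3 Tbit/sec */

        printf("granularity: %.4f pps (%.0f bps)\n", min_pps, min_bps);
        printf("max:         %.0f pps (%.2f Tbps)\n", max_pps, max_bps / 1e12);
        return 0;
}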
Fixed-Point Scaling
#define BBR_SCALE 8 /* scaling factor for fractions in BBR (e.g. gains) */
#define BBR_UNIT (1 << BBR_SCALE)
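Gains use the same trick with 8 fractional bits: bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1 = 739, i.e. about 2.887, which approximates 2/ln(2). Applying a gain is a widening multiply followed by a shift. A minimal userspace sketch (apply_gain is a hypothetical helper mirroring the kernel's multiply-then-shift pattern, not a kernel function):

#include <stdio.h>
#include <stdint.h>

#define BBR_SCALE 8     /* scaling factor for fractions in BBR (e.g. gains) */
#define BBR_UNIT (1 << BBR_SCALE)

/* Hypothetical helper: scale val by a BBR_UNIT-based fixed-point gain. */
static uint32_t apply_gain(uint32_t val, uint32_t gain)
{
        return (uint32_t)(((uint64_t)val * gain) >> BBR_SCALE);
}

int main(void)
{
        uint32_t high_gain = BBR_UNIT * 2885 / 1000 + 1;  /* 739 ~= 2.887x */

        printf("high_gain = %u (~%.3fx)\n", high_gain,
               (double)high_gain / BBR_UNIT);
        /* e.g. a BDP of 100 packets scaled by high_gain: */
        printf("100 pkts * high_gain = %u pkts\n", apply_gain(100, high_gain));
        return 0;
}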
Global Variables
/* Window length of bw filter (in rounds): */
static const int bbr_bw_rtts = CYCLE_LEN + 2;
/* Window length of min_rtt filter (in sec): */
static const u32 bbr_min_rtt_win_sec = 10;
/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000; /* below ~1.2 Mbit/s, TSO is not worth it */
/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck: */
static const int bbr_pacing_margin_percent = 1;
/* Gain of 2/ln(2), the smallest pacing gain that still doubles the sending rate each RTT in STARTUP: */
static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
/* Inverse of high_gain, used in DRAIN to drain the queue created during STARTUP: */
static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
/* Steady-state cwnd gain of 2, to tolerate delayed/stretched ACKs: */
static const int bbr_cwnd_gain = BBR_UNIT * 2;
static const u32 bbr_cycle_rand = 7; /* randomize the starting phase of gain cycling */
static const u32 bbr_cwnd_min_target = 4; /* keep a cwnd of at least 4 packets */
/* If bw has increased significantly (1.25x), there may be more bw available: */
static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
/* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
static const u32 bbr_full_bw_cnt = 3;
Long-Term ("LT") Parameters
/* "long-term" ("LT") bandwidth estimator parameters... */
/* The minimum number of rounds in an LT bw sampling interval: */
static const u32 bbr_lt_intvl_min_rtts = 4;
/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
static const u32 bbr_lt_loss_thresh = 50;
/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;
bbr_lt_loss_thresh: if the loss rate over a sampling interval keeps exceeding roughly 20%, BBR suspects it is being traffic-policed and switches to long-term (LT) mode.
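The comparison happens in BBR_UNIT fixed point: 50/256 is about 19.5%, i.e. roughly 20%. The relevant check inside bbr_lt_bw_sampling() looks like this (condensed from the 5.16 source; the surrounding interval bookkeeping is elided):

        /* Is the loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
        lost = tp->lost - bbr->lt_last_lost;
        delivered = tp->delivered - bbr->lt_last_delivered;
        if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
                return;         /* not "lossy" enough yet; keep sampling */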
ACK Aggregation Parameters
/* Gain factor for adding extra_acked to target cwnd: */
static const int bbr_extra_acked_gain = BBR_UNIT;
/* Window length of extra_acked window. */
static const u32 bbr_extra_acked_win_rtts = 5;
/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
/* Time period for clamping cwnd increment due to ack aggregation */
static const u32 bbr_extra_acked_max_us = 100 * 1000;
Key Data Structures
static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
        .flags          = TCP_CONG_NON_RESTRICTED,
        .name           = "bbr",
        .owner          = THIS_MODULE,
        .init           = bbr_init,
        .cong_control   = bbr_main,
        .sndbuf_expand  = bbr_sndbuf_expand,
        .undo_cwnd      = bbr_undo_cwnd,
        .cwnd_event     = bbr_cwnd_event,
        .ssthresh       = bbr_ssthresh,
        .min_tso_segs   = bbr_min_tso_segs,
        .get_info       = bbr_get_info,
        .set_state      = bbr_set_state,
};
/* BBR congestion control block */
struct bbr {
        u32     min_rtt_us;             /* min RTT in min_rtt_win_sec window */
        u32     min_rtt_stamp;          /* timestamp of min_rtt_us */
        u32     probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
        struct minmax bw;       /* Max recent delivery rate in pkts/uS << 24 */
        u32     rtt_cnt;            /* count of packet-timed rounds elapsed */
        u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
        u64     cycle_mstamp;        /* time of this cycle phase start */
        u32     mode:3,              /* current bbr_mode in state machine */
                prev_ca_state:3,     /* CA state on previous ACK */
                packet_conservation:1,  /* use packet conservation? */
                round_start:1,       /* start of packet-timed tx->ack round? */
                idle_restart:1,      /* restarting after idle? */
                probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
                unused:13,
                lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
                lt_rtt_cnt:7,        /* round trips in long-term interval */
                lt_use_bw:1;         /* use lt_bw as our bw estimate? */
        u32     lt_bw;               /* LT est delivery rate in pkts/uS << 24 */
        u32     lt_last_delivered;   /* LT intvl start: tp->delivered */
        u32     lt_last_stamp;       /* LT intvl start: tp->delivered_mstamp */
        u32     lt_last_lost;        /* LT intvl start: tp->lost */
        u32     pacing_gain:10, /* current gain for setting pacing rate */
                cwnd_gain:10,   /* current gain for setting cwnd */
                full_bw_reached:1,   /* reached full bw in Startup? */
                full_bw_cnt:2,  /* number of rounds without large bw gains */
                cycle_idx:3,    /* current index in pacing_gain cycle array */
                has_seen_rtt:1, /* have we seen an RTT sample yet? */
                unused_b:5;
        u32     prior_cwnd;     /* prior cwnd upon entering loss recovery */
        u32     full_bw;        /* recent bw, to estimate if pipe is full */

        /* For tracking ACK aggregation: */
        u64     ack_epoch_mstamp;       /* start of ACK sampling epoch */
        u16     extra_acked[2];         /* max excess data ACKed in epoch */
        u32     ack_epoch_acked:20,     /* packets (S)ACKed in sampling epoch */
                extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
                extra_acked_win_idx:1,  /* current index in extra_acked array */
                unused_c:6;
};
Key Functions
bbr_init
Initializes the fields of the bbr congestion control block.
bbr_main
static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
        struct bbr *bbr = inet_csk_ca(sk);
        u32 bw;

        bbr_update_model(sk, rs);

        bw = bbr_bw(sk);
        bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
        bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}
The work to do on every ACK: update the model (mode and bw), then the pacing_rate and cwnd.
This implements the cong_control hook, which is invoked after the ca_state has been handled and is responsible for updating cwnd and pacing_rate.
bbr_sndbuf_expand
static u32 bbr_sndbuf_expand(struct sock *sk)
{
        /* Provision 3 * cwnd since BBR may slow-start even during recovery. */
        return 3;
}
bbr_undo_cwnd
/* In theory BBR does not need to undo the cwnd since it does not
* always reduce cwnd on losses (see bbr_main()). Keep it for now.
*/
static u32 bbr_undo_cwnd(struct sock *sk)
{
        struct bbr *bbr = inet_csk_ca(sk);

        bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
        bbr->full_bw_cnt = 0;
        bbr_reset_lt_bw_sampling(sk);
        return tcp_sk(sk)->snd_cwnd;
}
bbr_ssthresh
/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
static u32 bbr_ssthresh(struct sock *sk)
{
        bbr_save_cwnd(sk);
        return tcp_sk(sk)->snd_ssthresh;
}
bbr_min_tso_segs
/* override sysctl_tcp_min_tso_segs */
static u32 bbr_min_tso_segs(struct sock *sk)
{
        return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
tcp_min_tso_segs: the minimum number of segments per TSO frame. (sk_pacing_rate is in bytes/sec while bbr_min_tso_rate is in bits/sec, hence the >> 3.)
On a low-bandwidth link, TSO hands several segments to the NIC as one frame, which turns the slow link's traffic into bursts, so counterintuitively TSO should be restricted there. For example, at 1.2 Mbit/s a single 1500-byte segment already takes about 10 ms to pace out, so batching several segments means bursts tens of milliseconds long.
bbr_get_info
Exports BBR's internal state (bandwidth estimate, min_rtt, pacing_gain, cwnd_gain) for diagnostics (e.g. via TCP_CC_INFO / ss).
bbr_set_state
static void bbr_set_state(struct sock *sk, u8 new_state)
{
        struct bbr *bbr = inet_csk_ca(sk);

        if (new_state == TCP_CA_Loss) {
                struct rate_sample rs = { .losses = 1 };

                bbr->prev_ca_state = TCP_CA_Loss;
                bbr->full_bw = 0;
                bbr->round_start = 1;   /* treat RTO like end of a round */
                bbr_lt_bw_sampling(sk, &rs);
        }
}
Called whenever the ca_state changes. The possible ca_state values:
/*
* Sender's congestion state indicating normal or abnormal situations
* in the last round of packets sent. The state is driven by the ACK
* information and timer events.
*/
enum tcp_ca_state {
        /*
         * Nothing bad has been observed recently.
         * No apparent reordering, packet loss, or ECN marks.
         */
        TCP_CA_Open = 0,
#define TCPF_CA_Open    (1<<TCP_CA_Open)
        /*
         * The sender enters disordered state when it has received DUPACKs or
         * SACKs in the last round of packets sent. This could be due to packet
         * loss or reordering but needs further information to confirm packets
         * have been lost.
         */
        TCP_CA_Disorder = 1,
#define TCPF_CA_Disorder        (1<<TCP_CA_Disorder)
        /*
         * The sender enters Congestion Window Reduction (CWR) state when it
         * has received ACKs with ECN-ECE marks, or has experienced congestion
         * or packet discard on the sender host (e.g. qdisc).
         */
        TCP_CA_CWR = 2,
#define TCPF_CA_CWR     (1<<TCP_CA_CWR)
        /*
         * The sender is in fast recovery and retransmitting lost packets,
         * typically triggered by ACK events.
         */
        TCP_CA_Recovery = 3,
#define TCPF_CA_Recovery        (1<<TCP_CA_Recovery)
        /*
         * The sender is in loss recovery triggered by retransmission timeout.
         */
        TCP_CA_Loss = 4
#define TCPF_CA_Loss    (1<<TCP_CA_Loss)
};
So on an RTO (entering TCP_CA_Loss), bbr_set_state resets the full-bandwidth baseline (full_bw = 0) so that full-pipe detection starts over, treats the RTO as a round boundary, and reports the loss to the long-term sampler; note that it does not literally force the mode back to STARTUP.
Internal Functions
bbr_save_cwnd
/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
static void bbr_save_cwnd(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct bbr *bbr = inet_csk_ca(sk);

        if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
                bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
        else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
                bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
}
Saves the cwnd so that it can be restored after exiting loss recovery or finishing PROBE_RTT.
bbr_update_model
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
        bbr_update_bw(sk, rs);
        bbr_update_ack_aggregation(sk, rs);
        bbr_update_cycle_phase(sk, rs);
        bbr_check_full_bw_reached(sk, rs);
        bbr_check_drain(sk, rs);
        bbr_update_min_rtt(sk, rs);
        bbr_update_gains(sk);
}
bbr_update_bw
Updates the round count and the bandwidth samples (including the long-term-mode bandwidth sampling).
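A condensed sketch of bbr_update_bw() (based on the 5.16 tcp_bbr.c; comments paraphrased):

static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct bbr *bbr = inet_csk_ca(sk);
        u64 bw;

        bbr->round_start = 0;
        if (rs->delivered < 0 || rs->interval_us <= 0)
                return; /* not a valid observation */

        /* A round ends when the packet that started it is (S)ACKed. */
        if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
                bbr->next_rtt_delivered = tp->delivered;
                bbr->rtt_cnt++;
                bbr->round_start = 1;
                bbr->packet_conservation = 0;
        }

        bbr_lt_bw_sampling(sk, rs);     /* long-term (policer) sampling */

        /* Bandwidth sample: delivered pkts / elapsed us, in BW_UNIT units. */
        bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);

        /* App-limited samples only count if they beat the current max. */
        if (!rs->is_app_limited || bw >= bbr_max_bw(sk))
                minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
}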
bbr_update_ack_aggregation
extra_acked[0] and extra_acked[1] each track the maximum ACK aggregation over a 5-round window, so together they cover the most recent 10 rounds. The window index rotates on the ACK that starts a new round; within a window, the maximum is updated on every ACK. The aggregation estimate is max(extra_acked[0], extra_acked[1]).
If the number of packets ACKed in the current epoch falls to or below expected_acked, or the epoch total grows very large (beyond 2^20 packets), the sampling epoch for extra_acked is reset.
extra_acked[0]/extra_acked[1] are capped at the current snd_cwnd.
Finally, when computing the target cwnd, the extra_acked bonus is capped at bw * bbr_extra_acked_max_us (100 ms worth of data), as shown below.
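That 100 ms cap is applied when the cwnd bonus is computed, in bbr_ack_aggregation_cwnd() (5.16 source, lightly condensed):

/* Find the cwnd increment based on the estimate of ACK aggregation,
 * capped at bw * bbr_extra_acked_max_us (i.e. 100ms worth of data).
 */
static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
{
        u32 max_aggr_cwnd, aggr_cwnd = 0;

        if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
                max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
                                / BW_UNIT;
                aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
                             >> BBR_SCALE;
                aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
        }
        return aggr_cwnd;
}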
bbr_update_cycle_phase
For the pacing_gain == 1 phases (the six cruise phases of PROBE_BW), a full min_rtt of wall-clock time must elapse before moving to the next phase.
For the pacing_gain > 1 phase (the up-probing phase of PROBE_BW; STARTUP uses the same gain but exits via full-bandwidth detection instead), two conditions must both hold before advancing: (a) the phase has lasted one min_rtt of wall-clock time, and (b) there was packet loss, or bytes-in-flight reached pacing_gain * BDP (bw * min_rtt). Note that condition (b) uses pacing_gain rather than cwnd_gain.
For the pacing_gain < 1 phase (the draining phase of PROBE_BW; DRAIN similarly exits via its own check), satisfying either of two conditions advances the phase: (a) the phase has lasted one min_rtt of wall-clock time, or (b) bytes-in-flight is at most the BDP (bw * min_rtt). The sketch below shows the corresponding checks.
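The three cases map directly onto bbr_is_next_cycle_phase(), which only runs in PROBE_BW (condensed from the 5.16 source; comments paraphrased):

static bool bbr_is_next_cycle_phase(struct sock *sk,
                                    const struct rate_sample *rs)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct bbr *bbr = inet_csk_ca(sk);
        bool is_full_length =
                tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
                bbr->min_rtt_us;
        u32 inflight, bw;

        /* pacing_gain == 1.0: cruise for a full min_rtt. */
        if (bbr->pacing_gain == BBR_UNIT)
                return is_full_length;

        inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
        bw = bbr_max_bw(sk);

        /* pacing_gain > 1.0: need a full min_rtt AND
         * (loss OR inflight >= pacing_gain * BDP).
         */
        if (bbr->pacing_gain > BBR_UNIT)
                return is_full_length &&
                        (rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
                         inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));

        /* pacing_gain < 1.0: a full min_rtt OR inflight <= 1.0 * BDP. */
        return is_full_length ||
                inflight <= bbr_inflight(sk, bw, BBR_UNIT);
}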
bbr_check_full_bw_reached
If three consecutive non-app-limited rounds each fail to grow the bandwidth by at least 25%, STARTUP is considered to have probed the maximum bandwidth. Why three rounds? The function's comment explains:
/* Estimate when the pipe is full, using the change in delivery rate: BBR
* estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
* at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
* rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
* higher rwin, 3: we get higher delivery rate samples. Or transient
* cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
* design goal, but uses delay and inter-ACK spacing instead of bandwidth.
*/
One round for the receiver's auto-tuning to grow the receive window, one round to fill the grown receive window, and one more round to collect the resulting higher delivery-rate samples.
The Linux sysctl that controls receive-window auto-tuning:
net.ipv4.tcp_moderate_rcvbuf
So for a receiver that does not implement auto-tuning, waiting three rounds is arguably unnecessary, and two rounds would suffice?
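For reference, the detection logic itself (5.16 source, lightly condensed):

static void bbr_check_full_bw_reached(struct sock *sk,
                                      const struct rate_sample *rs)
{
        struct bbr *bbr = inet_csk_ca(sk);
        u32 bw_thresh;

        if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
                return;

        /* Did bw grow by >= 25% over the best bw seen so far in STARTUP? */
        bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
        if (bbr_max_bw(sk) >= bw_thresh) {
                bbr->full_bw = bbr_max_bw(sk);
                bbr->full_bw_cnt = 0;
                return;
        }
        ++bbr->full_bw_cnt;
        bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
}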
bbr_check_drain
Handles leaving STARTUP: once the pipe is estimated full, switch to DRAIN to drain the queue that STARTUP created; once inflight falls to the estimated BDP, enter PROBE_BW. See the sketch below.
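The transitions live in bbr_check_drain() (5.16 source, lightly condensed):

static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
{
        struct bbr *bbr = inet_csk_ca(sk);

        if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
                bbr->mode = BBR_DRAIN;  /* drain the queue we created */
                tcp_sk(sk)->snd_ssthresh =
                                bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
        }       /* fall through to check if inflight is already small: */
        if (bbr->mode == BBR_DRAIN &&
            bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
            bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
                bbr_reset_probe_bw_mode(sk);  /* queue is estimated drained */
}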
bbr_update_min_rtt
On each ACK, check whether min_rtt has expired (10 s); if it has, enter PROBE_RTT, which lasts at least max(200 ms, 1 RTT). The 1-RTT floor is enforced by waiting for the ACK of a packet sent after entering PROBE_RTT.
The kernel implementation looks slightly questionable here: even when the current rate_sample already carries a new, smaller RTT, PROBE_RTT is still entered as long as filter_expired is true. An arguably better implementation would skip PROBE_RTT once a fresh, smaller RTT sample has just been taken? See the excerpt below.
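The excerpt in question (condensed from the 5.16 bbr_update_min_rtt()); note that the PROBE_RTT entry is keyed off filter_expired alone, even when the same ACK has just lowered min_rtt_us:

        bool filter_expired;

        /* Track min RTT seen in the min_rtt_win_sec filter window: */
        filter_expired = after(tcp_jiffies32,
                               bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
        if (rs->rtt_us >= 0 &&
            (rs->rtt_us < bbr->min_rtt_us ||
             (filter_expired && !rs->is_ack_delayed))) {
                bbr->min_rtt_us = rs->rtt_us;
                bbr->min_rtt_stamp = tcp_jiffies32;
        }

        if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
            !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
                bbr->mode = BBR_PROBE_RTT;      /* dip cwnd to drain queue */
                bbr_save_cwnd(sk);      /* note cwnd so we can restore it */
                bbr->probe_rtt_done_stamp = 0;
        }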
bbr_update_gains
Sets pacing_gain and cwnd_gain according to BBR's current mode; in long-term mode, PROBE_BW uses a pacing_gain of 1 (BBR_UNIT) instead of cycling, as shown below.
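The full mode-to-gain mapping (5.16 source, lightly condensed):

static void bbr_update_gains(struct sock *sk)
{
        struct bbr *bbr = inet_csk_ca(sk);

        switch (bbr->mode) {
        case BBR_STARTUP:
                bbr->pacing_gain = bbr_high_gain;
                bbr->cwnd_gain   = bbr_high_gain;
                break;
        case BBR_DRAIN:
                bbr->pacing_gain = bbr_drain_gain;      /* slow, to drain */
                bbr->cwnd_gain   = bbr_high_gain;       /* keep cwnd */
                break;
        case BBR_PROBE_BW:
                bbr->pacing_gain = (bbr->lt_use_bw ?
                                    BBR_UNIT :
                                    bbr_pacing_gain[bbr->cycle_idx]);
                bbr->cwnd_gain   = bbr_cwnd_gain;
                break;
        case BBR_PROBE_RTT:
                bbr->pacing_gain = BBR_UNIT;
                bbr->cwnd_gain   = BBR_UNIT;
                break;
        default:
                WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
                break;
        }
}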