
Notes on the Linux Kernel BBR Code

Source

Kernel version 5.16-rc3 (net/ipv4/tcp_bbr.c)

Core algorithm

   On each ACK, update our model of the network path:

   bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)

   min_rtt = windowed_min(rtt, 10 seconds)

   pacing_rate = pacing_gain * bottleneck_bandwidth

   cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)

Here is a state transition diagram for BBR:

            |
            V
   +---> STARTUP  ----+
   |        |         |
   |        V         |
   |      DRAIN   ----+
   |        |         |
   |        V         |
   +---> PROBE_BW ----+
   |      ^    |      |
   |      |    |      |
   |      +----+      |
   |                  |
   +---- PROBE_RTT <--+

Global parameters

Bandwidth representation

#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)

Bandwidth is kept in a u32 as packets sent per microsecond, with the low BW_SCALE (24) bits holding the fractional part. Assuming 1500-byte packets, this u32 covers a range from 0.06 pps (715 bps) up to 256 Mpps (3 Tbps).
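
To sanity-check that range, a minimal standalone sketch (not kernel code; plain C, 1500-byte packets assumed):

#include <stdio.h>
#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT (1 << BW_SCALE)

int main(void)
{
	/* Smallest nonzero value: 1/2^24 pkts/usec =~ 0.06 pkts/sec. */
	double min_pps = 1e6 / BW_UNIT;
	/* Largest value a u32 holds: 2^32/2^24 = 256 pkts/usec = 256 Mpps. */
	double max_pps = 1e6 * 4294967296.0 / BW_UNIT;

	printf("min: %.3f pps = %.0f bps\n", min_pps, min_pps * 1500 * 8);
	printf("max: %.0f Mpps = %.2f Tbps\n", max_pps / 1e6,
	       max_pps * 1500 * 8 / 1e12);
	return 0;
}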

Fixed-point gains

#define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
#define BBR_UNIT (1 << BBR_SCALE)
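
Gains are fixed-point fractions with 8 fractional bits, so applying a gain is a multiply followed by a right shift. A small standalone sketch (the helper apply_gain is my naming, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define BBR_SCALE 8
#define BBR_UNIT (1 << BBR_SCALE)

/* Hypothetical helper: scale a rate by a BBR_UNIT-based gain. */
static uint64_t apply_gain(uint64_t rate, int gain)
{
	return rate * gain >> BBR_SCALE;
}

int main(void)
{
	int high_gain = BBR_UNIT * 2885 / 1000 + 1;	/* 739, =~ 2/ln(2) */

	/* 739/256 =~ 2.887: a rate of 1000 becomes 2886. */
	printf("high_gain=%d, 1000 -> %llu\n", high_gain,
	       (unsigned long long)apply_gain(1000, high_gain));
	return 0;
}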

Global constants

/* Window length of bw filter (in rounds): */
static const int bbr_bw_rtts = CYCLE_LEN + 2;
/* Window length of min_rtt filter (in sec): */
static const u32 bbr_min_rtt_win_sec = 10;
/* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000; /* below 1.2 Mbit/s, skip TSO */
/* Don't run at the full bandwidth; leave 1% headroom: */
static const int bbr_pacing_margin_percent = 1;
/* STARTUP pacing gain 2/ln(2) =~ 2.89, doubling the sending rate each round: */
static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
/* DRAIN gain 1/high_gain, draining the queue STARTUP built up: */
static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
/* Steady-state cwnd gain; tolerates delayed/stretched ACKs: */
static const int bbr_cwnd_gain  = BBR_UNIT * 2;
/* Randomize the starting phase of the pacing_gain cycle: */
static const u32 bbr_cycle_rand = 7;
/* Guarantee a cwnd of at least 4 packets: */
static const u32 bbr_cwnd_min_target = 4;
/* If bw has grown by >= 1.25x, there may be more bw available: */
static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
/* After 3 rounds without significant bw growth, estimate pipe is full: */
static const u32 bbr_full_bw_cnt = 3;

Long-term ("LT") bandwidth parameters

/* "long-term" ("LT") bandwidth estimator parameters... */
/* The minimum number of rounds in an LT bw sampling interval: */
static const u32 bbr_lt_intvl_min_rtts = 4;
/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
static const u32 bbr_lt_loss_thresh = 50;
/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;

bbr_lt_loss_thresh: if an interval's loss rate exceeds ~20% (50/256), BBR suspects the flow is being policed (rate-limited by a traffic policer) and may enter long-term mode.
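
For reference, the fixed-point form of that test (as applied in bbr_lt_bw_sampling(); the standalone helper lt_interval_is_lossy is my naming):

#include <stdio.h>
#include <stdint.h>

#define BBR_SCALE 8
static const uint32_t bbr_lt_loss_thresh = 50;

/* Is lost/delivered >= 50/256 (~20%)? Compare in fixed point, no division. */
static int lt_interval_is_lossy(uint32_t lost, uint32_t delivered)
{
	return delivered &&
	       ((uint64_t)lost << BBR_SCALE) >=
	       (uint64_t)bbr_lt_loss_thresh * delivered;
}

int main(void)
{
	printf("%d %d\n", lt_interval_is_lossy(19, 100),	/* 0: below 50/256 */
	       lt_interval_is_lossy(20, 100));			/* 1: 20% qualifies */
	return 0;
}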

ACK aggregation parameters

/* Gain factor for adding extra_acked to target cwnd: */
static const int bbr_extra_acked_gain = BBR_UNIT;
/* Window length of extra_acked window. */
static const u32 bbr_extra_acked_win_rtts = 5;
/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
/* Time period for clamping cwnd increment due to ack aggregation */
static const u32 bbr_extra_acked_max_us = 100 * 1000;

Key data structures

static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
	.flags		= TCP_CONG_NON_RESTRICTED,
	.name		= "bbr",
	.owner		= THIS_MODULE,
	.init		= bbr_init,
	.cong_control	= bbr_main,
	.sndbuf_expand	= bbr_sndbuf_expand,
	.undo_cwnd	= bbr_undo_cwnd,
	.cwnd_event	= bbr_cwnd_event,
	.ssthresh	= bbr_ssthresh,
	.min_tso_segs	= bbr_min_tso_segs,
	.get_info	= bbr_get_info,
	.set_state	= bbr_set_state,
};
/* BBR congestion control block */
struct bbr {
	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
	u64	cycle_mstamp;	     /* time of this cycle phase start */
	u32     mode:3,		     /* current bbr_mode in state machine */
		prev_ca_state:3,     /* CA state on previous ACK */
		packet_conservation:1,  /* use packet conservation? */
		round_start:1,	     /* start of packet-timed tx->ack round? */
		idle_restart:1,	     /* restarting after idle? */
		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
		unused:13,
		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
		lt_rtt_cnt:7,	     /* round trips in long-term interval */
		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
	u32	lt_bw;		     /* LT est delivery rate in pkts/uS << 24 */
	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp */
	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
	u32	pacing_gain:10,	/* current gain for setting pacing rate */
		cwnd_gain:10,	/* current gain for setting cwnd */
		full_bw_reached:1,   /* reached full bw in Startup? */
		full_bw_cnt:2,	/* number of rounds without large bw gains */
		cycle_idx:3,	/* current index in pacing_gain cycle array */
		has_seen_rtt:1, /* have we seen an RTT sample yet? */
		unused_b:5;
	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
	u32	full_bw;	/* recent bw, to estimate if pipe is full */

	/* For tracking ACK aggregation: */
	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
	u16	extra_acked[2];		/* max excess data ACKed in epoch */
	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
		extra_acked_win_idx:1,	/* current index in extra_acked array */
		unused_c:6;
};

Key functions

bbr_init

Initializes the fields of the bbr control block.

bbr_main

static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
	struct bbr *bbr = inet_csk_ca(sk);
	u32 bw;

	bbr_update_model(sk, rs);

	bw = bbr_bw(sk);
	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}

Actions to perform on every ACK: update the mode, bw, pacing_rate, and cwnd.

This is the implementation of the cong_control hook, which is called after the ca_state has been handled and takes over updating cwnd and pacing_rate.
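
For context, the dispatch on the ACK path looks roughly like this (paraphrased from tcp_cong_control() in net/ipv4/tcp_input.c; when a module provides cong_control, the default cwnd logic is bypassed entirely):

/* Paraphrased sketch of tcp_cong_control() */
static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
			     int flag, const struct rate_sample *rs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);

	if (icsk->icsk_ca_ops->cong_control) {
		/* BBR: full control of cwnd and pacing rate */
		icsk->icsk_ca_ops->cong_control(sk, rs);
		return;
	}
	/* ... otherwise the classic cwnd-reduction / cong_avoid path ... */
}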

bbr_sndbuf_expand

static u32 bbr_sndbuf_expand(struct sock *sk)
{
	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
	return 3;
}
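
The return value is the multiplier that tcp_sndbuf_expand() applies when sizing the send buffer (algorithms without this hook default to 2).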

bbr_undo_cwnd

/* In theory BBR does not need to undo the cwnd since it does not
 * always reduce cwnd on losses (see bbr_main()). Keep it for now.
 */
static u32 bbr_undo_cwnd(struct sock *sk)
{
	struct bbr *bbr = inet_csk_ca(sk);

	bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
	bbr->full_bw_cnt = 0;
	bbr_reset_lt_bw_sampling(sk);
	return tcp_sk(sk)->snd_cwnd;
}

bbr_ssthresh

/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
static u32 bbr_ssthresh(struct sock *sk)
{
	bbr_save_cwnd(sk);
	return tcp_sk(sk)->snd_ssthresh;
}

bbr_min_tso_segs

/* override sysctl_tcp_min_tso_segs */
static u32 bbr_min_tso_segs(struct sock *sk)
{
	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
}

tcp_min_tso_segs: the minimum number of segments packed into a single TSO frame. Note that sk_pacing_rate is in bytes/sec while bbr_min_tso_rate is in bits/sec, hence the >> 3: the threshold is 1200000 / 8 = 150000 bytes/sec, i.e. 1.2 Mbit/s.

On a low-bandwidth link, TSO hands several segments to the NIC as one frame, which turns into a burst on the wire; such links should therefore restrict TSO rather than benefit from it.

See: explanation of the tcp_min_tso_segs parameter.

Related kernel code changes

bbr_get_info

Returns BBR's internal state (exported to userspace via inet_diag / TCP_CC_INFO, e.g. shown by ss).

bbr_set_state

static void bbr_set_state(struct sock *sk, u8 new_state)
{
	struct bbr *bbr = inet_csk_ca(sk);

	if (new_state == TCP_CA_Loss) {
		struct rate_sample rs = { .losses = 1 };

		bbr->prev_ca_state = TCP_CA_Loss;
		bbr->full_bw = 0;
		bbr->round_start = 1;	/* treat RTO like end of a round */
		bbr_lt_bw_sampling(sk, &rs);
	}
}

Called when ca_state changes; the possible values of ca_state:

/*
 * Sender's congestion state indicating normal or abnormal situations
 * in the last round of packets sent. The state is driven by the ACK
 * information and timer events.
 */
enum tcp_ca_state {
	/*
	 * Nothing bad has been observed recently.
	 * No apparent reordering, packet loss, or ECN marks.
	 */
	TCP_CA_Open = 0,
#define TCPF_CA_Open	(1<<TCP_CA_Open)
	/*
	 * The sender enters disordered state when it has received DUPACKs or
	 * SACKs in the last round of packets sent. This could be due to packet
	 * loss or reordering but needs further information to confirm packets
	 * have been lost.
	 */
	TCP_CA_Disorder = 1,
#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
	/*
	 * The sender enters Congestion Window Reduction (CWR) state when it
	 * has received ACKs with ECN-ECE marks, or has experienced congestion
	 * or packet discard on the sender host (e.g. qdisc).
	 */
	TCP_CA_CWR = 2,
#define TCPF_CA_CWR	(1<<TCP_CA_CWR)
	/*
	 * The sender is in fast recovery and retransmitting lost packets,
	 * typically triggered by ACK events.
	 */
	TCP_CA_Recovery = 3,
#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
	/*
	 * The sender is in loss recovery triggered by retransmission timeout.
	 */
	TCP_CA_Loss = 4
#define TCPF_CA_Loss	(1<<TCP_CA_Loss)
};

So what bbr_set_state does here is handle an RTO: on entering TCP_CA_Loss it records the state, treats the RTO like the end of a round, resets the full_bw estimate used for full-pipe detection, and feeds a loss sample into long-term bw sampling. Note that it leaves bbr->mode untouched, so strictly speaking it does not re-enter STARTUP.

Internal functions

bbr_save_cwnd

/* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
static void bbr_save_cwnd(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bbr *bbr = inet_csk_ca(sk);

	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
		bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
		bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
}

Saves the cwnd so it can be restored after exiting loss recovery or after PROBE_RTT ends.

bbr_update_model

static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
	bbr_update_bw(sk, rs);
	bbr_update_ack_aggregation(sk, rs);
	bbr_update_cycle_phase(sk, rs);
	bbr_check_full_bw_reached(sk, rs);
	bbr_check_drain(sk, rs);
	bbr_update_min_rtt(sk, rs);
	bbr_update_gains(sk);
}

bbr_update_bw

Updates the round count and the bandwidth samples fed into the max filter (including long-term bandwidth sampling).
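
The sample computation itself, as a standalone sketch (types simplified; in bbr_update_bw() app-limited samples are additionally discarded unless they are at least as large as the current estimate):

#include <stdint.h>

#define BW_SCALE 24
#define BW_UNIT (1ULL << BW_SCALE)

/* A bw sample in BBR's fixed-point unit: packets delivered over the
 * sampling interval, scaled by BW_UNIT to keep precision, since the
 * ratio is usually far below 1 packet per microsecond.
 */
static uint64_t bw_sample(uint32_t delivered, int64_t interval_us)
{
	if (interval_us <= 0)
		return 0;	/* not a valid observation */
	return (uint64_t)delivered * BW_UNIT / (uint64_t)interval_us;
}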

bbr_update_ack_aggregation

extra_acked[0] and extra_acked[1] each track the maximum ACK aggregation over a 5-round window, covering roughly the most recent rounds 1-5 and 6-10; the aggregation estimate is the larger of the two. The windows are rotated on the ACK that starts a new round, while the running max is updated on every ACK.

If the packets actually ACKed in the sampling epoch fall to or below expected_acked, or run far beyond it (past 2^20 packets), the epoch tracking extra_acked is reset.

extra_acked[0/1] is capped at the current snd_cwnd.

Finally, when computing target_cwnd, the extra_acked contribution is capped at bw * bbr_extra_acked_max_us (100 ms). The bookkeeping is sketched below.
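
A condensed paraphrase of the per-ACK bookkeeping in bbr_update_ack_aggregation() (glue code trimmed):

/* Paraphrased sketch of bbr_update_ack_aggregation() */
u32 epoch_us, expected_acked, extra_acked;

/* What we expected to be ACKed over the epoch at the estimated bw: */
epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp, bbr->ack_epoch_mstamp);
expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;

/* Reset the epoch if the ACK rate fell below bw or the epoch got too old: */
if (bbr->ack_epoch_acked <= expected_acked ||
    bbr->ack_epoch_acked + rs->acked_sacked >=
    bbr_ack_epoch_acked_reset_thresh) {
	bbr->ack_epoch_acked = 0;
	bbr->ack_epoch_mstamp = tp->delivered_mstamp;
	expected_acked = 0;
}

/* The excess ACKed beyond expectation is the aggregation estimate: */
bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
			     bbr->ack_epoch_acked + rs->acked_sacked);
extra_acked = bbr->ack_epoch_acked - expected_acked;
extra_acked = min(extra_acked, tp->snd_cwnd);
if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
	bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;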

bbr_update_cycle_phase

In the phases with pacing_gain == 1 (the six cruising phases of PROBE_BW's gain cycle), a full min_rtt of wall-clock time must elapse before advancing to the next phase.

In the phase with pacing_gain > 1 (the probing-up phase of the cycle), advancing requires both: (a) the phase has lasted one min_rtt of wall-clock time, and (b) there was packet loss, or bytes in flight reached pacing_gain * BDP (bw * min_rtt). Note that condition (b) uses pacing_gain, not cwnd_gain.

In the phase with pacing_gain < 1 (the draining-down phase of the cycle), either condition suffices: (a) the phase has lasted one min_rtt of wall-clock time, or (b) bytes in flight dropped to at most BDP (bw * min_rtt). (Gain cycling only runs in PROBE_BW; STARTUP and DRAIN use fixed gains with their own exit conditions.)
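
These three cases come straight from bbr_is_next_cycle_phase(); a condensed paraphrase:

/* Paraphrased sketch of bbr_is_next_cycle_phase() */
bool is_full_length =
	tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
	bbr->min_rtt_us;

if (bbr->pacing_gain == BBR_UNIT)
	return is_full_length;		/* cruise: wall-clock time only */

if (bbr->pacing_gain > BBR_UNIT)	/* probing up */
	return is_full_length &&
	       (rs->losses ||		/* perhaps pacing_gain*BDP won't fit */
		inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));

return is_full_length ||		/* draining down */
       inflight <= bbr_inflight(sk, bw, BBR_UNIT);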

bbr_check_full_bw_reached

If three consecutive non-app-limited rounds each fail to grow bandwidth by at least 25%, STARTUP is considered to have found the maximum bandwidth. Why three rounds? The function's comment says:

/* Estimate when the pipe is full, using the change in delivery rate: BBR
 * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
 * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
 * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
 * higher rwin, 3: we get higher delivery rate samples. Or transient
 * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
 * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
 */

One round for the receiver's autotuning to grow the receive window, one round to fill the grown receive window, and one round to collect the higher delivery-rate samples.
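
The check itself, condensed from the source:

/* Paraphrased sketch of bbr_check_full_bw_reached() */
u32 bw_thresh;

if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
	return;

/* bw still growing by >= 25%? Take a new baseline, restart the count. */
bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
if (bbr_max_bw(sk) >= bw_thresh) {
	bbr->full_bw = bbr_max_bw(sk);
	bbr->full_bw_cnt = 0;
	return;
}
/* Three rounds in a row without 25% growth: estimate the pipe is full. */
++bbr->full_bw_cnt;
bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;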

The Linux sysctl controlling receive-window autotuning:

net.ipv4.tcp_moderate_rcvbuf

So for a receiver that does not implement autotuning, there is no need to wait three rounds; two would be enough?

bbr_check_drain

The drain phase after STARTUP: once full bandwidth is reached, switch to DRAIN; once bytes in flight fall to the estimated BDP, switch on to PROBE_BW.
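
Condensed from the source:

/* Paraphrased sketch of bbr_check_drain() */
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
	bbr->mode = BBR_DRAIN;	/* drain the queue STARTUP created */
	tcp_sk(sk)->snd_ssthresh =
		bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
}
if (bbr->mode == BBR_DRAIN &&
    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
	bbr_reset_probe_bw_mode(sk);	/* queue drained; go probe bw */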

bbr_update_min_rtt

On each ACK, check whether min_rtt has expired (older than 10 s); if so, enter PROBE_RTT and stay for at least max(200 ms, 1 RTT). The 1-RTT part is guaranteed by waiting for the ACK of a packet sent after entering PROBE_RTT.

The kernel implementation looks slightly questionable here: even when the current rate_sample already carries a new, smaller RTT, PROBE_RTT is still entered as long as filter_expired is true (it is computed before the sample refreshes the filter). A better implementation would arguably skip PROBE_RTT when a smaller RTT has just been observed?
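
The ordering in question, condensed from bbr_update_min_rtt() (note that filter_expired is computed before this sample refreshes the filter):

/* Paraphrased sketch of bbr_update_min_rtt() */
filter_expired = after(tcp_jiffies32,
		       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
if (rs->rtt_us >= 0 &&
    (rs->rtt_us < bbr->min_rtt_us ||
     (filter_expired && !rs->is_ack_delayed))) {
	bbr->min_rtt_us = rs->rtt_us;
	bbr->min_rtt_stamp = tcp_jiffies32;
}

if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
	bbr->mode = BBR_PROBE_RTT;	/* entered even on this fresh sample */
	/* ... */
}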

bbr_update_gains

Sets the pacing_gain and cwnd_gain used for sending according to the current BBR mode; in long-term mode, PROBE_BW uses a pacing_gain of 1 (no gain cycling).
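
The whole function is a small switch over the mode (paraphrased from the 5.16 source):

/* Paraphrased sketch of bbr_update_gains() */
switch (bbr->mode) {
case BBR_STARTUP:
	bbr->pacing_gain = bbr_high_gain;
	bbr->cwnd_gain	 = bbr_high_gain;
	break;
case BBR_DRAIN:
	bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain the queue */
	bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd up */
	break;
case BBR_PROBE_BW:
	bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
			   bbr_pacing_gain[bbr->cycle_idx];
	bbr->cwnd_gain	 = bbr_cwnd_gain;
	break;
case BBR_PROBE_RTT:
	bbr->pacing_gain = BBR_UNIT;
	bbr->cwnd_gain	 = BBR_UNIT;
	break;
}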