Skip to content

Commit 9e23616

Browse files
committed
WIP: testing some congestion metrics and fixes
1 parent 60d5310 commit 9e23616

File tree

12 files changed

+532
-49
lines changed

12 files changed

+532
-49
lines changed

Cargo.lock

Lines changed: 3 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,6 @@ unexpected_cfgs = { level = "warn", check-cfg = ["cfg(iroh_docsrs)", "cfg(iroh_l
4040

4141
[workspace.lints.clippy]
4242
unused-async = "warn"
43+
44+
[patch.crates-io]
45+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms" }

iroh-dns-server/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ hickory-server = { version = "0.25.1", features = ["https-ring"] }
2828
http = "1.0.0"
2929
humantime = "2.2.0"
3030
humantime-serde = "1.1.1"
31-
iroh-metrics = { version = "0.35", features = ["service"] }
31+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms", features = ["service"] }
3232
lru = "0.13"
3333
n0-future = "0.1.2"
3434
n0-snafu = "0.2.2"

iroh-relay/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ http-body-util = "0.1.0"
3232
hyper = { version = "1", features = ["server", "client", "http1"] }
3333
hyper-util = "0.1.1"
3434
iroh-base = { version = "0.92.0", path = "../iroh-base", default-features = false, features = ["key", "relay"] }
35-
iroh-metrics = { version = "0.35", default-features = false }
35+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms", default-features = false }
3636
n0-future = "0.1.2"
3737
num_enum = "0.7"
3838
pin-project = "1"

iroh/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ futures-buffered = "0.2.11"
8585
spki = { version = "0.7.3", features = ["std"] }
8686

8787
# metrics
88-
iroh-metrics = { version = "0.35", default-features = false }
88+
iroh-metrics = { git = "https://github.com/n0-computer/iroh-metrics", branch = "arqu/histograms", default-features = false }
8989

9090
# local-swarm-discovery
9191
swarm-discovery = { version = "0.4", optional = true }

iroh/src/magicsock.rs

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,19 @@ const ENDPOINTS_FRESH_ENOUGH_DURATION: Duration = Duration::from_secs(27);
9595

9696
const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5);
9797

98+
/// Jitter range for heartbeat intervals (±25% of base interval).
99+
/// This prevents synchronized heartbeat storms across many nodes.
100+
const HEARTBEAT_JITTER_PCT: f64 = 0.25;
101+
102+
/// Create a jittered interval to prevent synchronized heartbeat storms.
103+
fn jittered_interval(base: Duration) -> time::Interval {
104+
let jitter_range = base.as_secs_f64() * HEARTBEAT_JITTER_PCT;
105+
let jitter = rand::thread_rng().gen_range(-jitter_range..=jitter_range);
106+
let jittered = base.as_secs_f64() + jitter;
107+
let duration = Duration::from_secs_f64(jittered.max(0.1));
108+
time::interval(duration)
109+
}
110+
98111
/// Contains options for `MagicSock::listen`.
99112
#[derive(derive_more::Debug)]
100113
pub(crate) struct Options {
@@ -795,7 +808,8 @@ impl MagicSock {
795808
}
796809
disco::Message::Pong(pong) => {
797810
self.metrics.magicsock.recv_disco_pong.inc();
798-
self.node_map.handle_pong(sender, src, pong);
811+
self.node_map
812+
.handle_pong(sender, src, pong, &self.metrics.magicsock);
799813
}
800814
disco::Message::CallMeMaybe(cm) => {
801815
self.metrics.magicsock.recv_disco_call_me_maybe.inc();
@@ -1849,7 +1863,7 @@ impl Actor {
18491863
let mut current_netmon_state = self.netmon_watcher.get();
18501864

18511865
#[cfg(not(wasm_browser))]
1852-
let mut direct_addr_heartbeat_timer = time::interval(HEARTBEAT_INTERVAL);
1866+
let mut direct_addr_heartbeat_timer = jittered_interval(HEARTBEAT_INTERVAL);
18531867

18541868
#[cfg(not(wasm_browser))]
18551869
let mut portmap_watcher = self
@@ -2063,7 +2077,9 @@ impl Actor {
20632077
async fn handle_actor_message(&mut self, msg: ActorMessage) {
20642078
match msg {
20652079
ActorMessage::EndpointPingExpired(id, txid) => {
2066-
self.msock.node_map.notify_ping_timeout(id, txid);
2080+
self.msock
2081+
.node_map
2082+
.notify_ping_timeout(id, txid, &self.msock.metrics.magicsock);
20672083
}
20682084
ActorMessage::NetworkChange => {
20692085
self.network_monitor.network_change().await.ok();
@@ -2255,7 +2271,9 @@ impl Actor {
22552271
/// This is called when connectivity changes enough that we no longer trust the old routes.
22562272
#[instrument(skip_all)]
22572273
fn reset_endpoint_states(&mut self) {
2258-
self.msock.node_map.reset_node_states()
2274+
self.msock
2275+
.node_map
2276+
.reset_node_states(&self.msock.metrics.magicsock)
22592277
}
22602278
}
22612279

iroh/src/magicsock/metrics.rs

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
use iroh_metrics::{Counter, MetricsGroup};
1+
use iroh_metrics::{Counter, Histogram, MetricsGroup};
22
use serde::{Deserialize, Serialize};
33

44
/// Enum of metrics for the module
55
// TODO(frando): Add description doc strings for each metric.
66
#[allow(missing_docs)]
7-
#[derive(Debug, Default, Serialize, Deserialize, MetricsGroup)]
7+
#[derive(Debug, Serialize, Deserialize, MetricsGroup)]
88
#[non_exhaustive]
99
#[metrics(name = "magicsock")]
1010
pub struct Metrics {
@@ -77,4 +77,88 @@ pub struct Metrics {
7777
pub connection_handshake_success: Counter,
7878
/// Number of connections with a successful handshake that became direct.
7979
pub connection_became_direct: Counter,
80+
81+
/*
82+
* Path Congestion Metrics
83+
*/
84+
/// Number of times a path was marked as outdated due to consecutive ping failures.
85+
pub path_marked_outdated: Counter,
86+
/// Number of ping failures recorded across all paths.
87+
pub path_ping_failures: Counter,
88+
/// Number of consecutive failure resets (path recovered).
89+
pub path_failure_resets: Counter,
90+
/// Histogram of packet loss rates (0.0-1.0) observed on UDP paths.
91+
pub path_packet_loss_rate: Histogram,
92+
/// Histogram of RTT variance (in milliseconds) as a congestion indicator.
93+
pub path_rtt_variance_ms: Histogram,
94+
/// Histogram of path quality scores (0.0-1.0).
95+
pub path_quality_score: Histogram,
96+
}
97+
98+
impl Default for Metrics {
99+
fn default() -> Self {
100+
Self {
101+
update_direct_addrs: Counter::default(),
102+
send_ipv4: Counter::default(),
103+
send_ipv6: Counter::default(),
104+
send_relay: Counter::default(),
105+
send_relay_error: Counter::default(),
106+
send_data: Counter::default(),
107+
send_data_network_down: Counter::default(),
108+
recv_data_relay: Counter::default(),
109+
recv_data_ipv4: Counter::default(),
110+
recv_data_ipv6: Counter::default(),
111+
recv_datagrams: Counter::default(),
112+
recv_gro_datagrams: Counter::default(),
113+
send_disco_udp: Counter::default(),
114+
send_disco_relay: Counter::default(),
115+
sent_disco_udp: Counter::default(),
116+
sent_disco_relay: Counter::default(),
117+
sent_disco_ping: Counter::default(),
118+
sent_disco_pong: Counter::default(),
119+
sent_disco_call_me_maybe: Counter::default(),
120+
recv_disco_bad_key: Counter::default(),
121+
recv_disco_bad_parse: Counter::default(),
122+
recv_disco_udp: Counter::default(),
123+
recv_disco_relay: Counter::default(),
124+
recv_disco_ping: Counter::default(),
125+
recv_disco_pong: Counter::default(),
126+
recv_disco_call_me_maybe: Counter::default(),
127+
recv_disco_call_me_maybe_bad_disco: Counter::default(),
128+
relay_home_change: Counter::default(),
129+
num_direct_conns_added: Counter::default(),
130+
num_direct_conns_removed: Counter::default(),
131+
num_relay_conns_added: Counter::default(),
132+
num_relay_conns_removed: Counter::default(),
133+
actor_tick_main: Counter::default(),
134+
actor_tick_msg: Counter::default(),
135+
actor_tick_re_stun: Counter::default(),
136+
actor_tick_portmap_changed: Counter::default(),
137+
actor_tick_direct_addr_heartbeat: Counter::default(),
138+
actor_link_change: Counter::default(),
139+
actor_tick_other: Counter::default(),
140+
nodes_contacted: Counter::default(),
141+
nodes_contacted_directly: Counter::default(),
142+
connection_handshake_success: Counter::default(),
143+
connection_became_direct: Counter::default(),
144+
path_marked_outdated: Counter::default(),
145+
path_ping_failures: Counter::default(),
146+
path_failure_resets: Counter::default(),
147+
path_packet_loss_rate: packet_loss_buckets(),
148+
path_rtt_variance_ms: rtt_variance_buckets(),
149+
path_quality_score: quality_score_buckets(),
150+
}
151+
}
152+
}
153+
154+
fn packet_loss_buckets() -> Histogram {
155+
Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0])
156+
}
157+
158+
fn rtt_variance_buckets() -> Histogram {
159+
Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0])
160+
}
161+
162+
fn quality_score_buckets() -> Histogram {
163+
Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0])
80164
}

iroh/src/magicsock/node_map.rs

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,14 +192,19 @@ impl NodeMap {
192192
}
193193
}
194194

195-
pub(super) fn notify_ping_timeout(&self, id: usize, tx_id: stun_rs::TransactionId) {
195+
pub(super) fn notify_ping_timeout(
196+
&self,
197+
id: usize,
198+
tx_id: stun_rs::TransactionId,
199+
metrics: &Metrics,
200+
) {
196201
if let Some(ep) = self
197202
.inner
198203
.lock()
199204
.expect("poisoned")
200205
.get_mut(NodeStateKey::Idx(id))
201206
{
202-
ep.ping_timeout(tx_id, Instant::now());
207+
ep.ping_timeout(tx_id, Instant::now(), metrics);
203208
}
204209
}
205210

@@ -228,11 +233,17 @@ impl NodeMap {
228233
.handle_ping(sender, src, tx_id)
229234
}
230235

231-
pub(super) fn handle_pong(&self, sender: PublicKey, src: &transports::Addr, pong: Pong) {
236+
pub(super) fn handle_pong(
237+
&self,
238+
sender: PublicKey,
239+
src: &transports::Addr,
240+
pong: Pong,
241+
metrics: &Metrics,
242+
) {
232243
self.inner
233244
.lock()
234245
.expect("poisoned")
235-
.handle_pong(sender, src, pong)
246+
.handle_pong(sender, src, pong, metrics)
236247
}
237248

238249
#[must_use = "actions must be handled"]
@@ -268,11 +279,11 @@ impl NodeMap {
268279
Some((public_key, udp_addr, relay_url, ping_actions))
269280
}
270281

271-
pub(super) fn reset_node_states(&self) {
282+
pub(super) fn reset_node_states(&self, metrics: &Metrics) {
272283
let now = Instant::now();
273284
let mut inner = self.inner.lock().expect("poisoned");
274285
for (_, ep) in inner.node_states_mut() {
275-
ep.note_connectivity_change(now);
286+
ep.note_connectivity_change(now, metrics);
276287
}
277288
}
278289

@@ -506,9 +517,15 @@ impl NodeMapInner {
506517
.map(|ep| ep.conn_type())
507518
}
508519

509-
fn handle_pong(&mut self, sender: NodeId, src: &transports::Addr, pong: Pong) {
520+
fn handle_pong(
521+
&mut self,
522+
sender: NodeId,
523+
src: &transports::Addr,
524+
pong: Pong,
525+
metrics: &Metrics,
526+
) {
510527
if let Some(ns) = self.get_mut(NodeStateKey::NodeId(sender)).as_mut() {
511-
let insert = ns.handle_pong(&pong, src.clone().into());
528+
let insert = ns.handle_pong(&pong, src.clone().into(), metrics);
512529
if let Some((src, key)) = insert {
513530
self.set_node_key_for_ip_port(src, &key);
514531
}
@@ -541,7 +558,7 @@ impl NodeMapInner {
541558
Some(ns) => {
542559
debug!(endpoints = ?cm.my_numbers, "received call-me-maybe");
543560

544-
ns.handle_call_me_maybe(cm)
561+
ns.handle_call_me_maybe(cm, metrics)
545562
}
546563
}
547564
}

0 commit comments

Comments
 (0)