From 20414e51f64df50b174cbda498794571411edd91 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Thu, 26 Jun 2025 10:21:51 +0300 Subject: [PATCH 01/26] Add handler pool So far no tests or anything, just to put it somewhere. --- iroh-handler-pool/Cargo.toml | 12 + iroh-handler-pool/src/handler_pool.rs | 255 +++++++++++++++++ iroh-handler-pool/src/handler_pool_0rtt.rs | 317 +++++++++++++++++++++ iroh-handler-pool/src/lib.rs | 2 + 4 files changed, 586 insertions(+) create mode 100644 iroh-handler-pool/Cargo.toml create mode 100644 iroh-handler-pool/src/handler_pool.rs create mode 100644 iroh-handler-pool/src/handler_pool_0rtt.rs create mode 100644 iroh-handler-pool/src/lib.rs diff --git a/iroh-handler-pool/Cargo.toml b/iroh-handler-pool/Cargo.toml new file mode 100644 index 0000000..8974c69 --- /dev/null +++ b/iroh-handler-pool/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "iroh-handler-pool" +version = "0.1.0" +edition = "2024" + +[dependencies] +anyhow = "1" +iroh = "0.35.0" +n0-future = "0.1.3" +tokio = "1.45" +tokio-util = { version = "0.7", features = ["time"] } +tracing = "0.1.41" diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-handler-pool/src/handler_pool.rs new file mode 100644 index 0000000..7e7c29b --- /dev/null +++ b/iroh-handler-pool/src/handler_pool.rs @@ -0,0 +1,255 @@ +use std::{collections::HashMap, sync::Arc}; + +use anyhow::anyhow; +use iroh::{Endpoint, NodeId, endpoint::Connection}; +use tokio::{ + sync::mpsc, + task::{JoinError, JoinSet}, +}; +use tokio_util::time::FutureExt; + +pub struct Options { + pub idle_timeout: std::time::Duration, + pub connect_timeout: std::time::Duration, + pub alpn: Vec, +} + +type BoxedHandler = Box< + dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + Send + 'static, +>; + +pub enum ConnectResult { + Connected(Connection), + Timeout, + ConnectError(anyhow::Error), + ConnectionError(anyhow::Error), + JoinError(JoinError), +} + +enum ActorMessage { + Handle { id: NodeId, handler: BoxedHandler }, + ConnectionShutdown { id: NodeId }, +} + +/// Run a connection actor for a single node +async fn run_connection_actor( + endpoint: Endpoint, + node_id: NodeId, + mut rx: mpsc::Receiver, + main_tx: mpsc::Sender, + options: Arc, +) -> anyhow::Result<()> { + // Connect to the node + let mut state = match endpoint + .connect(node_id, &options.alpn) + .timeout(options.connect_timeout) + .await + { + Ok(Ok(conn)) => ConnectResult::Connected(conn), + Ok(Err(e)) => ConnectResult::ConnectError(e), + Err(_) => ConnectResult::Timeout, + }; + if !matches!(state, ConnectResult::Connected(_)) { + let _ = main_tx + .send(ActorMessage::ConnectionShutdown { id: node_id }) + .await; + } + + let mut tasks = JoinSet::new(); + let idle_timer = tokio::time::sleep(options.idle_timeout); + tokio::pin!(idle_timer); + + loop { + tokio::select! { + biased; + + // Handle new work + handler = rx.recv() => { + match handler { + Some(handler) => { + // Reset idle timer by creating a new one + idle_timer.set(tokio::time::sleep(options.idle_timeout)); + tasks.spawn(handler(&state)); + } + None => { + // Channel closed - finish remaining tasks and exit + break; + } + } + } + + // Handle completed tasks + task_result = tasks.join_next(), if !tasks.is_empty() => { + match task_result { + Some(Ok(Ok(()))) => { + tracing::debug!("Task completed for node {}", node_id); + } + Some(Ok(Err(e))) => { + tracing::error!("Task failed for node {}: {}", node_id, e); + if let ConnectResult::Connected(conn) = state { + conn.close(1u32.into(), b""); + } + state = ConnectResult::ConnectionError(e); + let _ = main_tx.send(ActorMessage::ConnectionShutdown { id: node_id }).await; + } + Some(Err(e)) => { + tracing::error!("Task panicked for node {}: {}", node_id, e); + if let ConnectResult::Connected(conn) = state { + conn.close(1u32.into(), b""); + } + state = ConnectResult::JoinError(e); + let _ = main_tx.send(ActorMessage::ConnectionShutdown { id: node_id }).await; + } + None => { + tracing::debug!("Task was cancelled or already completed for node {}", node_id); + } + } + + // If no tasks left and channel is closed, we can exit + if tasks.is_empty() && rx.is_closed() { + break; + } + } + + // Idle timeout - request shutdown + _ = &mut idle_timer, if tasks.is_empty() => { + tracing::debug!("Connection to {} idle, requesting shutdown", node_id); + let _ = main_tx.send(ActorMessage::ConnectionShutdown { id: node_id }).await; + // Don't break here - wait for main actor to close our channel + } + } + } + + // Wait for remaining tasks to complete + while let Some(task_result) = tasks.join_next().await { + if let Err(e) = task_result { + tracing::error!("Task failed during shutdown for node {}: {}", node_id, e); + } + } + + if let ConnectResult::Connected(connection) = &state { + connection.close(0u32.into(), b""); + } + + tracing::debug!("Connection actor for {} shutting down", node_id); + Ok(()) +} + +struct Actor { + tx: mpsc::Sender, + rx: mpsc::Receiver, + endpoint: Endpoint, + connections: HashMap>, + options: Arc, +} + +impl Actor { + pub fn new(endpoint: Endpoint, options: Options) -> (Self, mpsc::Sender) { + let (tx, rx) = mpsc::channel(100); + ( + Self { + rx, + tx: tx.clone(), + endpoint, + connections: HashMap::new(), + options: Arc::new(options), + }, + tx, + ) + } + + pub async fn run(mut self) -> anyhow::Result<()> { + while let Some(msg) = self.rx.recv().await { + match msg { + ActorMessage::Handle { id, mut handler } => { + // Try to send to existing connection actor + if let Some(conn_tx) = self.connections.get(&id) { + if let Err(tokio::sync::mpsc::error::SendError(e)) = + conn_tx.send(handler).await + { + handler = e; + } else { + continue; + } + // Connection actor died, remove it + self.connections.remove(&id); + } + + // No connection actor or it died - spawn a new one + let (conn_tx, conn_rx) = mpsc::channel(100); + self.connections.insert(id, conn_tx.clone()); + + let endpoint = self.endpoint.clone(); + let main_tx = self.tx.clone(); // Assuming we store the sender + let options = self.options.clone(); + + tokio::spawn(async move { + if let Err(e) = + run_connection_actor(endpoint, id, conn_rx, main_tx, options).await + { + tracing::error!("Connection actor for {} failed: {}", id, e); + } + }); + + // Send the handler to the new actor + if conn_tx.send(handler).await.is_err() { + tracing::error!( + "Failed to send handler to new connection actor for {}", + id + ); + self.connections.remove(&id); + } + } + ActorMessage::ConnectionShutdown { id } => { + // Remove the connection from our map - this closes the channel + self.connections.remove(&id); + tracing::debug!("Approved shutdown for connection {}", id); + } + } + } + Ok(()) + } +} + +pub struct HandlerPool { + tx: mpsc::Sender, +} + +impl HandlerPool { + pub fn new(endpoint: Endpoint, options: Options) -> Self { + let (actor, tx) = Actor::new(endpoint, options); + + // Spawn the main actor + tokio::spawn(async move { + if let Err(e) = actor.run().await { + tracing::error!("Main actor failed: {}", e); + } + }); + + Self { tx } + } + + /// Connect to a node and execute the given handler function + /// + /// The connection will either be a new connection or an existing one if it is already established. + /// If connection establishment succeeds, the handler will be called with a `ConnectResult::Connected(Connection)`. + /// If connection establishment fails, the handler will get passed a `ConnectResult` containing the error. + /// + /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. + pub async fn connect(&self, id: NodeId, f: F) -> anyhow::Result<()> + where + F: FnOnce(&ConnectResult) -> Fut + Send + 'static, + Fut: std::future::Future> + Send + 'static, + { + let handler = Box::new(move |conn: &ConnectResult| { + Box::pin(f(conn)) as n0_future::boxed::BoxFuture> + }); + + self.tx + .send(ActorMessage::Handle { id, handler }) + .await + .map_err(|_| anyhow!("Main actor has shut down"))?; + + Ok(()) + } +} diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-handler-pool/src/handler_pool_0rtt.rs new file mode 100644 index 0000000..c82ea0f --- /dev/null +++ b/iroh-handler-pool/src/handler_pool_0rtt.rs @@ -0,0 +1,317 @@ +use std::{collections::HashMap, sync::Arc}; + +use anyhow::anyhow; +use iroh::{ + Endpoint, NodeId, + endpoint::{ConnectOptions, Connection}, +}; +use tokio::{ + sync::{broadcast, mpsc}, + task::{JoinError, JoinSet}, +}; +use tokio_util::time::FutureExt; +use tracing::{error, trace}; + +pub struct Options { + pub idle_timeout: std::time::Duration, + pub connect_timeout: std::time::Duration, + pub alpn: Vec, +} + +type BoxedHandler = Box< + dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + Send + 'static, +>; + +pub enum ConnectResult { + Connected(Connection, broadcast::Receiver), + Timeout, + ConnectError(anyhow::Error), + ConnectError2(iroh::endpoint::ConnectionError), + HandlerError(anyhow::Error), + JoinError(JoinError), +} + +enum ActorMessage { + Handle { id: NodeId, handler: BoxedHandler }, + ConnectionShutdown { id: NodeId }, +} + +fn accepted(value: bool) -> broadcast::Receiver { + let (tx, rx) = broadcast::channel(1); + let _ = tx.send(value); + rx +} + +async fn connect( + endpoint: Endpoint, + node_id: NodeId, + alpn: &[u8], +) -> (ConnectResult, Option>) { + let connecting = match endpoint + .connect_with_opts(node_id, alpn, ConnectOptions::default()) + .await + { + Ok(connecting) => connecting, + Err(cause) => { + trace!("Failed to connect to node {}: {}", node_id, cause); + return (ConnectResult::ConnectError(cause), None); + } + }; + let (conn, zero_rtt_accepted) = match connecting.into_0rtt() { + Ok((conn, accepted)) => { + trace!("Connected to node {} with 0-RTT", node_id); + (conn, accepted) + } + Err(connecting) => { + trace!("Failed to connect using 0-RTT to node {}", node_id); + let res = match connecting.await { + Err(cause) => ConnectResult::ConnectError2(cause), + Ok(connection) => ConnectResult::Connected(connection, accepted(true)), + }; + return (res, None); + } + }; + let (tx, rx) = broadcast::channel(1); + let complete = Box::pin(async move { + tx.send(zero_rtt_accepted.await).ok(); + }); + (ConnectResult::Connected(conn, rx), Some(complete)) +} + +/// Run a connection actor for a single node +async fn run_connection_actor( + endpoint: Endpoint, + node_id: NodeId, + mut rx: mpsc::Receiver, + owner: HandlerPool0Rtt, + options: Arc, +) -> anyhow::Result<()> { + // Connect to the node + let (mut state, mut forwarder) = match connect(endpoint, node_id, &options.alpn) + .timeout(options.connect_timeout) + .await + { + Ok((state, forwarder)) => (state, forwarder), + Err(_) => (ConnectResult::Timeout, None), + }; + if !matches!(state, ConnectResult::Connected(_, _)) { + owner.close(node_id).await.ok(); + } + let mut tasks = JoinSet::new(); + let idle_timer = tokio::time::sleep(options.idle_timeout); + tokio::pin!(idle_timer); + + loop { + tokio::select! { + biased; + + // Handle new work + handler = rx.recv() => { + match handler { + Some(handler) => { + // Reset idle timer by creating a new one + idle_timer.set(tokio::time::sleep(options.idle_timeout)); + tasks.spawn(handler(&state)); + } + None => { + // Channel closed - finish remaining tasks and exit + break; + } + } + } + + // Handle completed tasks + task_result = tasks.join_next(), if !tasks.is_empty() => { + match task_result { + Some(Ok(Ok(()))) => { + trace!("Task completed for node {}", node_id); + } + Some(Ok(Err(e))) => { + trace!("Task failed for node {}: {}", node_id, e); + if let ConnectResult::Connected(conn, _) = state { + conn.close(1u32.into(), b""); + } + state = ConnectResult::HandlerError(e); + owner.close(node_id).await.ok(); + } + Some(Err(e)) => { + error!("Task panicked for node {}: {}", node_id, e); + if let ConnectResult::Connected(conn, _) = state { + conn.close(1u32.into(), b""); + } + state = ConnectResult::JoinError(e); + owner.close(node_id).await.ok(); + } + None => { + trace!("Task was cancelled or already completed for node {}", node_id); + } + } + + // If no tasks left and channel is closed, we can exit + if tasks.is_empty() && rx.is_closed() { + break; + } + } + + // Idle timeout - request shutdown + _ = &mut idle_timer, if tasks.is_empty() => { + trace!("Connection to {} idle, requesting shutdown", node_id); + owner.close(node_id).await.ok(); + // Don't break here - wait for main actor to close our channel + } + + _ = forwarder.as_mut().unwrap(), if forwarder.is_some() => { + forwarder = None; + } + } + } + + // Wait for remaining tasks to complete + while let Some(task_result) = tasks.join_next().await { + if let Err(e) = task_result { + error!("Task panicked during shutdown for node {}: {}", node_id, e); + } + } + + if let ConnectResult::Connected(conn, _) = state { + conn.close(0u32.into(), b""); + } + + trace!("Connection actor for {} shutting down", node_id); + Ok(()) +} + +struct Actor { + tx: HandlerPool0Rtt, + rx: mpsc::Receiver, + endpoint: Endpoint, + connections: HashMap>, + options: Arc, +} + +impl Actor { + pub fn new(endpoint: Endpoint, options: Options) -> (Self, mpsc::Sender) { + let (tx, rx) = mpsc::channel(100); + ( + Self { + rx, + tx: HandlerPool0Rtt { tx: tx.clone() }, + endpoint, + connections: HashMap::new(), + options: Arc::new(options), + }, + tx, + ) + } + + pub async fn run(mut self) -> anyhow::Result<()> { + while let Some(msg) = self.rx.recv().await { + match msg { + ActorMessage::Handle { id, mut handler } => { + // Try to send to existing connection actor + if let Some(conn_tx) = self.connections.get(&id) { + if let Err(tokio::sync::mpsc::error::SendError(e)) = + conn_tx.send(handler).await + { + handler = e; + } else { + continue; + } + // Connection actor died, remove it + self.connections.remove(&id); + } + + // No connection actor or it died - spawn a new one + let (conn_tx, conn_rx) = mpsc::channel(100); + self.connections.insert(id, conn_tx.clone()); + + let endpoint = self.endpoint.clone(); + let main_tx = self.tx.clone(); // Assuming we store the sender + let options = self.options.clone(); + + tokio::spawn(async move { + if let Err(e) = + run_connection_actor(endpoint, id, conn_rx, main_tx, options).await + { + tracing::error!("Connection actor for {} failed: {}", id, e); + } + }); + + // Send the handler to the new actor + if conn_tx.send(handler).await.is_err() { + tracing::error!( + "Failed to send handler to new connection actor for {}", + id + ); + self.connections.remove(&id); + } + } + ActorMessage::ConnectionShutdown { id } => { + // Remove the connection from our map - this closes the channel + self.connections.remove(&id); + tracing::debug!("Approved shutdown for connection {}", id); + } + } + } + Ok(()) + } +} + +#[derive(Debug, Clone)] +#[repr(transparent)] +pub struct HandlerPool0Rtt { + tx: mpsc::Sender, +} + +impl HandlerPool0Rtt { + pub fn new(endpoint: Endpoint, options: Options) -> Self { + let (actor, tx) = Actor::new(endpoint, options); + + // Spawn the main actor + tokio::spawn(async move { + if let Err(e) = actor.run().await { + tracing::error!("Main actor failed: {}", e); + } + }); + + Self { tx } + } + + /// Close an existing connection, if it exists + /// + /// This will finish pending tasks and close the connection. New tasks will + /// get a new connection if they are submitted after this call + pub async fn close(&self, id: NodeId) -> anyhow::Result<()> { + self.tx + .send(ActorMessage::ConnectionShutdown { id }) + .await + .map_err(|_| anyhow!("Main actor has shut down"))?; + Ok(()) + } + + /// Connect to a node and execute the given handler function + /// + /// The connection will either be a new connection or an existing one if it is already established. + /// If connection establishment succeeds, the handler will be called with a [`ConnectResult::Connected`]. + /// If connection establishment fails, the handler will get passed a [`ConnectResult`] containing the error. + /// + /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. + /// If the fn returns an error, it is assumed that the connection is no longer valid. This will cause + /// the connection to be closed and a new one to be established for future calls. + pub async fn connect(&self, id: NodeId, f: F) -> anyhow::Result<()> + where + F: FnOnce(&ConnectResult) -> Fut + Send + 'static, + Fut: std::future::Future> + Send + 'static, + { + let handler = Box::new(move |conn: &ConnectResult| { + Box::pin(f(conn)) as n0_future::boxed::BoxFuture> + }); + + self.tx + .send(ActorMessage::Handle { id, handler }) + .await + .map_err(|_| anyhow!("Main actor has shut down"))?; + + Ok(()) + } +} diff --git a/iroh-handler-pool/src/lib.rs b/iroh-handler-pool/src/lib.rs new file mode 100644 index 0000000..b7754b0 --- /dev/null +++ b/iroh-handler-pool/src/lib.rs @@ -0,0 +1,2 @@ +pub mod handler_pool; +pub mod handler_pool_0rtt; From b9148260d09a4251d25e49054dcca2128819d061 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 14:51:00 +0200 Subject: [PATCH 02/26] Update iroh to 0.91.1 and eliminate anyhow --- iroh-handler-pool/Cargo.toml | 4 +- iroh-handler-pool/src/handler_pool.rs | 59 +++++++++++--------- iroh-handler-pool/src/handler_pool_0rtt.rs | 64 ++++++++++++---------- 3 files changed, 69 insertions(+), 58 deletions(-) diff --git a/iroh-handler-pool/Cargo.toml b/iroh-handler-pool/Cargo.toml index 8974c69..f9fdf50 100644 --- a/iroh-handler-pool/Cargo.toml +++ b/iroh-handler-pool/Cargo.toml @@ -4,9 +4,9 @@ version = "0.1.0" edition = "2024" [dependencies] -anyhow = "1" -iroh = "0.35.0" +iroh = "0.91.1" n0-future = "0.1.3" +snafu = "0.8.6" tokio = "1.45" tokio-util = { version = "0.7", features = ["time"] } tracing = "0.1.41" diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-handler-pool/src/handler_pool.rs index 7e7c29b..9d3567f 100644 --- a/iroh-handler-pool/src/handler_pool.rs +++ b/iroh-handler-pool/src/handler_pool.rs @@ -1,7 +1,10 @@ use std::{collections::HashMap, sync::Arc}; -use anyhow::anyhow; -use iroh::{Endpoint, NodeId, endpoint::Connection}; +use iroh::{ + Endpoint, NodeId, + endpoint::{ConnectError, Connection}, +}; +use snafu::Snafu; use tokio::{ sync::mpsc, task::{JoinError, JoinSet}, @@ -15,14 +18,16 @@ pub struct Options { } type BoxedHandler = Box< - dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + Send + 'static, + dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + + Send + + 'static, >; pub enum ConnectResult { Connected(Connection), Timeout, - ConnectError(anyhow::Error), - ConnectionError(anyhow::Error), + ConnectError(ConnectError), + ExecuteError(ExecuteError), JoinError(JoinError), } @@ -38,7 +43,7 @@ async fn run_connection_actor( mut rx: mpsc::Receiver, main_tx: mpsc::Sender, options: Arc, -) -> anyhow::Result<()> { +) { // Connect to the node let mut state = match endpoint .connect(node_id, &options.alpn) @@ -89,7 +94,7 @@ async fn run_connection_actor( if let ConnectResult::Connected(conn) = state { conn.close(1u32.into(), b""); } - state = ConnectResult::ConnectionError(e); + state = ConnectResult::ExecuteError(e); let _ = main_tx.send(ActorMessage::ConnectionShutdown { id: node_id }).await; } Some(Err(e)) => { @@ -132,7 +137,6 @@ async fn run_connection_actor( } tracing::debug!("Connection actor for {} shutting down", node_id); - Ok(()) } struct Actor { @@ -158,7 +162,7 @@ impl Actor { ) } - pub async fn run(mut self) -> anyhow::Result<()> { + pub async fn run(mut self) { while let Some(msg) = self.rx.recv().await { match msg { ActorMessage::Handle { id, mut handler } => { @@ -183,13 +187,9 @@ impl Actor { let main_tx = self.tx.clone(); // Assuming we store the sender let options = self.options.clone(); - tokio::spawn(async move { - if let Err(e) = - run_connection_actor(endpoint, id, conn_rx, main_tx, options).await - { - tracing::error!("Connection actor for {} failed: {}", id, e); - } - }); + tokio::spawn(run_connection_actor( + endpoint, id, conn_rx, main_tx, options, + )); // Send the handler to the new actor if conn_tx.send(handler).await.is_err() { @@ -207,10 +207,17 @@ impl Actor { } } } - Ok(()) } } +#[derive(Debug, Snafu)] +pub enum HandlerPoolError { + Shutdown, +} + +#[derive(Debug, Snafu)] +pub struct ExecuteError; + pub struct HandlerPool { tx: mpsc::Sender, } @@ -220,11 +227,7 @@ impl HandlerPool { let (actor, tx) = Actor::new(endpoint, options); // Spawn the main actor - tokio::spawn(async move { - if let Err(e) = actor.run().await { - tracing::error!("Main actor failed: {}", e); - } - }); + tokio::spawn(actor.run()); Self { tx } } @@ -236,19 +239,23 @@ impl HandlerPool { /// If connection establishment fails, the handler will get passed a `ConnectResult` containing the error. /// /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. - pub async fn connect(&self, id: NodeId, f: F) -> anyhow::Result<()> + pub async fn connect( + &self, + id: NodeId, + f: F, + ) -> std::result::Result<(), HandlerPoolError> where F: FnOnce(&ConnectResult) -> Fut + Send + 'static, - Fut: std::future::Future> + Send + 'static, + Fut: std::future::Future> + Send + 'static, { let handler = Box::new(move |conn: &ConnectResult| { - Box::pin(f(conn)) as n0_future::boxed::BoxFuture> + Box::pin(f(conn)) as n0_future::boxed::BoxFuture> }); self.tx .send(ActorMessage::Handle { id, handler }) .await - .map_err(|_| anyhow!("Main actor has shut down"))?; + .map_err(|_| HandlerPoolError::Shutdown)?; Ok(()) } diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-handler-pool/src/handler_pool_0rtt.rs index c82ea0f..cdba1e2 100644 --- a/iroh-handler-pool/src/handler_pool_0rtt.rs +++ b/iroh-handler-pool/src/handler_pool_0rtt.rs @@ -1,10 +1,10 @@ use std::{collections::HashMap, sync::Arc}; -use anyhow::anyhow; use iroh::{ Endpoint, NodeId, - endpoint::{ConnectOptions, Connection}, + endpoint::{ConnectOptions, ConnectWithOptsError, Connection}, }; +use snafu::Snafu; use tokio::{ sync::{broadcast, mpsc}, task::{JoinError, JoinSet}, @@ -19,15 +19,17 @@ pub struct Options { } type BoxedHandler = Box< - dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + Send + 'static, + dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + + Send + + 'static, >; pub enum ConnectResult { Connected(Connection, broadcast::Receiver), Timeout, - ConnectError(anyhow::Error), - ConnectError2(iroh::endpoint::ConnectionError), - HandlerError(anyhow::Error), + ConnectError(ConnectWithOptsError), + ConnectionError(iroh::endpoint::ConnectionError), + ExecuteError(ExecuteError), JoinError(JoinError), } @@ -65,7 +67,7 @@ async fn connect( Err(connecting) => { trace!("Failed to connect using 0-RTT to node {}", node_id); let res = match connecting.await { - Err(cause) => ConnectResult::ConnectError2(cause), + Err(cause) => ConnectResult::ConnectionError(cause), Ok(connection) => ConnectResult::Connected(connection, accepted(true)), }; return (res, None); @@ -85,7 +87,7 @@ async fn run_connection_actor( mut rx: mpsc::Receiver, owner: HandlerPool0Rtt, options: Arc, -) -> anyhow::Result<()> { +) { // Connect to the node let (mut state, mut forwarder) = match connect(endpoint, node_id, &options.alpn) .timeout(options.connect_timeout) @@ -131,7 +133,7 @@ async fn run_connection_actor( if let ConnectResult::Connected(conn, _) = state { conn.close(1u32.into(), b""); } - state = ConnectResult::HandlerError(e); + state = ConnectResult::ExecuteError(e); owner.close(node_id).await.ok(); } Some(Err(e)) => { @@ -178,7 +180,6 @@ async fn run_connection_actor( } trace!("Connection actor for {} shutting down", node_id); - Ok(()) } struct Actor { @@ -204,7 +205,7 @@ impl Actor { ) } - pub async fn run(mut self) -> anyhow::Result<()> { + pub async fn run(mut self) { while let Some(msg) = self.rx.recv().await { match msg { ActorMessage::Handle { id, mut handler } => { @@ -229,13 +230,9 @@ impl Actor { let main_tx = self.tx.clone(); // Assuming we store the sender let options = self.options.clone(); - tokio::spawn(async move { - if let Err(e) = - run_connection_actor(endpoint, id, conn_rx, main_tx, options).await - { - tracing::error!("Connection actor for {} failed: {}", id, e); - } - }); + tokio::spawn(run_connection_actor( + endpoint, id, conn_rx, main_tx, options, + )); // Send the handler to the new actor if conn_tx.send(handler).await.is_err() { @@ -253,10 +250,17 @@ impl Actor { } } } - Ok(()) } } +#[derive(Debug, Snafu)] +pub enum HandlerPoolError { + Shutdown, +} + +#[derive(Debug, Snafu)] +pub struct ExecuteError; + #[derive(Debug, Clone)] #[repr(transparent)] pub struct HandlerPool0Rtt { @@ -268,11 +272,7 @@ impl HandlerPool0Rtt { let (actor, tx) = Actor::new(endpoint, options); // Spawn the main actor - tokio::spawn(async move { - if let Err(e) = actor.run().await { - tracing::error!("Main actor failed: {}", e); - } - }); + tokio::spawn(actor.run()); Self { tx } } @@ -281,11 +281,11 @@ impl HandlerPool0Rtt { /// /// This will finish pending tasks and close the connection. New tasks will /// get a new connection if they are submitted after this call - pub async fn close(&self, id: NodeId) -> anyhow::Result<()> { + pub async fn close(&self, id: NodeId) -> std::result::Result<(), HandlerPoolError> { self.tx .send(ActorMessage::ConnectionShutdown { id }) .await - .map_err(|_| anyhow!("Main actor has shut down"))?; + .map_err(|_| HandlerPoolError::Shutdown)?; Ok(()) } @@ -298,19 +298,23 @@ impl HandlerPool0Rtt { /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. /// If the fn returns an error, it is assumed that the connection is no longer valid. This will cause /// the connection to be closed and a new one to be established for future calls. - pub async fn connect(&self, id: NodeId, f: F) -> anyhow::Result<()> + pub async fn connect( + &self, + id: NodeId, + f: F, + ) -> std::result::Result<(), HandlerPoolError> where F: FnOnce(&ConnectResult) -> Fut + Send + 'static, - Fut: std::future::Future> + Send + 'static, + Fut: std::future::Future> + Send + 'static, { let handler = Box::new(move |conn: &ConnectResult| { - Box::pin(f(conn)) as n0_future::boxed::BoxFuture> + Box::pin(f(conn)) as n0_future::boxed::BoxFuture> }); self.tx .send(ActorMessage::Handle { id, handler }) .await - .map_err(|_| anyhow!("Main actor has shut down"))?; + .map_err(|_| HandlerPoolError::Shutdown)?; Ok(()) } From 829871307f147a788c9bc123609c1d3e74ef58be Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:04:28 +0200 Subject: [PATCH 03/26] docs --- iroh-handler-pool/src/handler_pool.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-handler-pool/src/handler_pool.rs index 9d3567f..bd23f32 100644 --- a/iroh-handler-pool/src/handler_pool.rs +++ b/iroh-handler-pool/src/handler_pool.rs @@ -24,10 +24,15 @@ type BoxedHandler = Box< >; pub enum ConnectResult { + /// We got a connection Connected(Connection), + /// Timeout during connect Timeout, + /// Error during connect ConnectError(ConnectError), + /// Error during last execute ExecuteError(ExecuteError), + /// Handler actor panicked JoinError(JoinError), } @@ -212,6 +217,7 @@ impl Actor { #[derive(Debug, Snafu)] pub enum HandlerPoolError { + /// The handler pool has been shut down Shutdown, } @@ -235,8 +241,8 @@ impl HandlerPool { /// Connect to a node and execute the given handler function /// /// The connection will either be a new connection or an existing one if it is already established. - /// If connection establishment succeeds, the handler will be called with a `ConnectResult::Connected(Connection)`. - /// If connection establishment fails, the handler will get passed a `ConnectResult` containing the error. + /// If connection establishment succeeds, the handler will be called with a [`ConnectResult::Connected`]. + /// If connection establishment fails, the handler will get passed a [`ConnectResult`] containing the error. /// /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. pub async fn connect( From e00556a4f45e7fb77c8416cddb2e175bc84473e4 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:09:48 +0200 Subject: [PATCH 04/26] Add limit for the number of connections --- iroh-handler-pool/src/handler_pool.rs | 7 +++++++ iroh-handler-pool/src/handler_pool_0rtt.rs | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-handler-pool/src/handler_pool.rs index bd23f32..2a2d519 100644 --- a/iroh-handler-pool/src/handler_pool.rs +++ b/iroh-handler-pool/src/handler_pool.rs @@ -14,6 +14,7 @@ use tokio_util::time::FutureExt; pub struct Options { pub idle_timeout: std::time::Duration, pub connect_timeout: std::time::Duration, + pub max_connections: usize, pub alpn: Vec, } @@ -28,6 +29,8 @@ pub enum ConnectResult { Connected(Connection), /// Timeout during connect Timeout, + /// Too many connections + TooManyConnections, /// Error during connect ConnectError(ConnectError), /// Error during last execute @@ -185,6 +188,10 @@ impl Actor { } // No connection actor or it died - spawn a new one + if self.connections.len() >= self.options.max_connections { + handler(&ConnectResult::TooManyConnections).await.ok(); + continue; + } let (conn_tx, conn_rx) = mpsc::channel(100); self.connections.insert(id, conn_tx.clone()); diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-handler-pool/src/handler_pool_0rtt.rs index cdba1e2..2f82111 100644 --- a/iroh-handler-pool/src/handler_pool_0rtt.rs +++ b/iroh-handler-pool/src/handler_pool_0rtt.rs @@ -15,6 +15,7 @@ use tracing::{error, trace}; pub struct Options { pub idle_timeout: std::time::Duration, pub connect_timeout: std::time::Duration, + pub max_connections: usize, pub alpn: Vec, } @@ -27,6 +28,7 @@ type BoxedHandler = Box< pub enum ConnectResult { Connected(Connection, broadcast::Receiver), Timeout, + TooManyConnections, ConnectError(ConnectWithOptsError), ConnectionError(iroh::endpoint::ConnectionError), ExecuteError(ExecuteError), @@ -223,6 +225,10 @@ impl Actor { } // No connection actor or it died - spawn a new one + if self.connections.len() >= self.options.max_connections { + handler(&ConnectResult::TooManyConnections).await.ok(); + continue; + } let (conn_tx, conn_rx) = mpsc::channel(100); self.connections.insert(id, conn_tx.clone()); From d41fb3f021a508a6f2ec3d15afc2e0f727bc184c Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:14:18 +0200 Subject: [PATCH 05/26] separate out alpn --- iroh-handler-pool/src/handler_pool.rs | 32 +++++++++++++++++----- iroh-handler-pool/src/handler_pool_0rtt.rs | 30 ++++++++++++++++---- 2 files changed, 49 insertions(+), 13 deletions(-) diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-handler-pool/src/handler_pool.rs index 2a2d519..849b50c 100644 --- a/iroh-handler-pool/src/handler_pool.rs +++ b/iroh-handler-pool/src/handler_pool.rs @@ -11,11 +11,21 @@ use tokio::{ }; use tokio_util::time::FutureExt; +#[derive(Debug, Clone, Copy)] pub struct Options { pub idle_timeout: std::time::Duration, pub connect_timeout: std::time::Duration, pub max_connections: usize, - pub alpn: Vec, +} + +impl Default for Options { + fn default() -> Self { + Self { + idle_timeout: std::time::Duration::from_secs(5), + connect_timeout: std::time::Duration::from_secs(1), + max_connections: 1024, + } + } } type BoxedHandler = Box< @@ -51,10 +61,11 @@ async fn run_connection_actor( mut rx: mpsc::Receiver, main_tx: mpsc::Sender, options: Arc, + alpn: Arc>, ) { // Connect to the node let mut state = match endpoint - .connect(node_id, &options.alpn) + .connect(node_id, &alpn) .timeout(options.connect_timeout) .await { @@ -153,18 +164,24 @@ struct Actor { endpoint: Endpoint, connections: HashMap>, options: Arc, + alpn: Arc>, } impl Actor { - pub fn new(endpoint: Endpoint, options: Options) -> (Self, mpsc::Sender) { + pub fn new( + endpoint: Endpoint, + alpn: &[u8], + options: Options, + ) -> (Self, mpsc::Sender) { let (tx, rx) = mpsc::channel(100); ( Self { rx, tx: tx.clone(), endpoint, - connections: HashMap::new(), options: Arc::new(options), + connections: HashMap::new(), + alpn: Arc::new(alpn.to_vec()), }, tx, ) @@ -198,9 +215,10 @@ impl Actor { let endpoint = self.endpoint.clone(); let main_tx = self.tx.clone(); // Assuming we store the sender let options = self.options.clone(); + let alpn = self.alpn.clone(); tokio::spawn(run_connection_actor( - endpoint, id, conn_rx, main_tx, options, + endpoint, id, conn_rx, main_tx, options, alpn, )); // Send the handler to the new actor @@ -236,8 +254,8 @@ pub struct HandlerPool { } impl HandlerPool { - pub fn new(endpoint: Endpoint, options: Options) -> Self { - let (actor, tx) = Actor::new(endpoint, options); + pub fn new(endpoint: Endpoint, alpn: &[u8], options: Options) -> Self { + let (actor, tx) = Actor::new(endpoint, alpn, options); // Spawn the main actor tokio::spawn(actor.run()); diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-handler-pool/src/handler_pool_0rtt.rs index 2f82111..8cec89b 100644 --- a/iroh-handler-pool/src/handler_pool_0rtt.rs +++ b/iroh-handler-pool/src/handler_pool_0rtt.rs @@ -12,11 +12,21 @@ use tokio::{ use tokio_util::time::FutureExt; use tracing::{error, trace}; +#[derive(Debug, Clone, Copy)] pub struct Options { pub idle_timeout: std::time::Duration, pub connect_timeout: std::time::Duration, pub max_connections: usize, - pub alpn: Vec, +} + +impl Default for Options { + fn default() -> Self { + Self { + idle_timeout: std::time::Duration::from_secs(5), + connect_timeout: std::time::Duration::from_secs(1), + max_connections: 1024, + } + } } type BoxedHandler = Box< @@ -88,10 +98,11 @@ async fn run_connection_actor( node_id: NodeId, mut rx: mpsc::Receiver, owner: HandlerPool0Rtt, + alpn: Arc>, options: Arc, ) { // Connect to the node - let (mut state, mut forwarder) = match connect(endpoint, node_id, &options.alpn) + let (mut state, mut forwarder) = match connect(endpoint, node_id, &alpn) .timeout(options.connect_timeout) .await { @@ -190,10 +201,15 @@ struct Actor { endpoint: Endpoint, connections: HashMap>, options: Arc, + alpn: Arc>, } impl Actor { - pub fn new(endpoint: Endpoint, options: Options) -> (Self, mpsc::Sender) { + pub fn new( + endpoint: Endpoint, + alpn: &[u8], + options: Options, + ) -> (Self, mpsc::Sender) { let (tx, rx) = mpsc::channel(100); ( Self { @@ -202,6 +218,7 @@ impl Actor { endpoint, connections: HashMap::new(), options: Arc::new(options), + alpn: Arc::new(alpn.to_vec()), }, tx, ) @@ -235,9 +252,10 @@ impl Actor { let endpoint = self.endpoint.clone(); let main_tx = self.tx.clone(); // Assuming we store the sender let options = self.options.clone(); + let alpn = self.alpn.clone(); tokio::spawn(run_connection_actor( - endpoint, id, conn_rx, main_tx, options, + endpoint, id, conn_rx, main_tx, alpn, options, )); // Send the handler to the new actor @@ -274,8 +292,8 @@ pub struct HandlerPool0Rtt { } impl HandlerPool0Rtt { - pub fn new(endpoint: Endpoint, options: Options) -> Self { - let (actor, tx) = Actor::new(endpoint, options); + pub fn new(endpoint: Endpoint, alpn: &[u8], options: Options) -> Self { + let (actor, tx) = Actor::new(endpoint, alpn, options); // Spawn the main actor tokio::spawn(actor.run()); From 6c60556b06ce15530a8486b16c9a9abc7ec09591 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:29:19 +0200 Subject: [PATCH 06/26] Group stuff that the child actor needs in a context. --- iroh-handler-pool/src/handler_pool.rs | 72 ++++++++++++---------- iroh-handler-pool/src/handler_pool_0rtt.rs | 58 +++++++++-------- 2 files changed, 69 insertions(+), 61 deletions(-) diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-handler-pool/src/handler_pool.rs index 849b50c..a26830c 100644 --- a/iroh-handler-pool/src/handler_pool.rs +++ b/iroh-handler-pool/src/handler_pool.rs @@ -28,6 +28,13 @@ impl Default for Options { } } +struct Context { + options: Options, + endpoint: Endpoint, + owner: HandlerPool, + alpn: Vec, +} + type BoxedHandler = Box< dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + Send @@ -56,17 +63,15 @@ enum ActorMessage { /// Run a connection actor for a single node async fn run_connection_actor( - endpoint: Endpoint, node_id: NodeId, mut rx: mpsc::Receiver, - main_tx: mpsc::Sender, - options: Arc, - alpn: Arc>, + context: Arc, ) { // Connect to the node - let mut state = match endpoint - .connect(node_id, &alpn) - .timeout(options.connect_timeout) + let mut state = match context + .endpoint + .connect(node_id, &context.alpn) + .timeout(context.options.connect_timeout) .await { Ok(Ok(conn)) => ConnectResult::Connected(conn), @@ -74,13 +79,11 @@ async fn run_connection_actor( Err(_) => ConnectResult::Timeout, }; if !matches!(state, ConnectResult::Connected(_)) { - let _ = main_tx - .send(ActorMessage::ConnectionShutdown { id: node_id }) - .await; + context.owner.close(node_id).await.ok(); } let mut tasks = JoinSet::new(); - let idle_timer = tokio::time::sleep(options.idle_timeout); + let idle_timer = tokio::time::sleep(context.options.idle_timeout); tokio::pin!(idle_timer); loop { @@ -92,7 +95,7 @@ async fn run_connection_actor( match handler { Some(handler) => { // Reset idle timer by creating a new one - idle_timer.set(tokio::time::sleep(options.idle_timeout)); + idle_timer.set(tokio::time::sleep(context.options.idle_timeout)); tasks.spawn(handler(&state)); } None => { @@ -114,7 +117,7 @@ async fn run_connection_actor( conn.close(1u32.into(), b""); } state = ConnectResult::ExecuteError(e); - let _ = main_tx.send(ActorMessage::ConnectionShutdown { id: node_id }).await; + let _ = context.owner.close(node_id).await; } Some(Err(e)) => { tracing::error!("Task panicked for node {}: {}", node_id, e); @@ -122,7 +125,7 @@ async fn run_connection_actor( conn.close(1u32.into(), b""); } state = ConnectResult::JoinError(e); - let _ = main_tx.send(ActorMessage::ConnectionShutdown { id: node_id }).await; + let _ = context.owner.close(node_id).await; } None => { tracing::debug!("Task was cancelled or already completed for node {}", node_id); @@ -138,7 +141,8 @@ async fn run_connection_actor( // Idle timeout - request shutdown _ = &mut idle_timer, if tasks.is_empty() => { tracing::debug!("Connection to {} idle, requesting shutdown", node_id); - let _ = main_tx.send(ActorMessage::ConnectionShutdown { id: node_id }).await; + + context.owner.close(node_id).await.ok(); // Don't break here - wait for main actor to close our channel } } @@ -159,12 +163,9 @@ async fn run_connection_actor( } struct Actor { - tx: mpsc::Sender, rx: mpsc::Receiver, - endpoint: Endpoint, connections: HashMap>, - options: Arc, - alpn: Arc>, + context: Arc, } impl Actor { @@ -177,11 +178,13 @@ impl Actor { ( Self { rx, - tx: tx.clone(), - endpoint, - options: Arc::new(options), connections: HashMap::new(), - alpn: Arc::new(alpn.to_vec()), + context: Arc::new(Context { + options, + alpn: alpn.to_vec(), + endpoint, + owner: HandlerPool { tx: tx.clone() }, + }), }, tx, ) @@ -205,21 +208,16 @@ impl Actor { } // No connection actor or it died - spawn a new one - if self.connections.len() >= self.options.max_connections { + if self.connections.len() >= self.context.options.max_connections { handler(&ConnectResult::TooManyConnections).await.ok(); continue; } let (conn_tx, conn_rx) = mpsc::channel(100); self.connections.insert(id, conn_tx.clone()); - let endpoint = self.endpoint.clone(); - let main_tx = self.tx.clone(); // Assuming we store the sender - let options = self.options.clone(); - let alpn = self.alpn.clone(); + let context = self.context.clone(); - tokio::spawn(run_connection_actor( - endpoint, id, conn_rx, main_tx, options, alpn, - )); + tokio::spawn(run_connection_actor(id, conn_rx, context)); // Send the handler to the new actor if conn_tx.send(handler).await.is_err() { @@ -290,4 +288,16 @@ impl HandlerPool { Ok(()) } + + /// Close an existing connection, if it exists + /// + /// This will finish pending tasks and close the connection. New tasks will + /// get a new connection if they are submitted after this call + pub async fn close(&self, id: NodeId) -> std::result::Result<(), HandlerPoolError> { + self.tx + .send(ActorMessage::ConnectionShutdown { id }) + .await + .map_err(|_| HandlerPoolError::Shutdown)?; + Ok(()) + } } diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-handler-pool/src/handler_pool_0rtt.rs index 8cec89b..ea6502a 100644 --- a/iroh-handler-pool/src/handler_pool_0rtt.rs +++ b/iroh-handler-pool/src/handler_pool_0rtt.rs @@ -29,6 +29,14 @@ impl Default for Options { } } +/// Part of the main actor state that is needed by the connection actors +struct Context { + options: Options, + alpn: Vec, + endpoint: Endpoint, + owner: HandlerPool0Rtt, +} + type BoxedHandler = Box< dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> + Send @@ -57,7 +65,7 @@ fn accepted(value: bool) -> broadcast::Receiver { } async fn connect( - endpoint: Endpoint, + endpoint: &Endpoint, node_id: NodeId, alpn: &[u8], ) -> (ConnectResult, Option>) { @@ -94,26 +102,23 @@ async fn connect( /// Run a connection actor for a single node async fn run_connection_actor( - endpoint: Endpoint, node_id: NodeId, mut rx: mpsc::Receiver, - owner: HandlerPool0Rtt, - alpn: Arc>, - options: Arc, + context: Arc, ) { // Connect to the node - let (mut state, mut forwarder) = match connect(endpoint, node_id, &alpn) - .timeout(options.connect_timeout) + let (mut state, mut forwarder) = match connect(&context.endpoint, node_id, &context.alpn) + .timeout(context.options.connect_timeout) .await { Ok((state, forwarder)) => (state, forwarder), Err(_) => (ConnectResult::Timeout, None), }; if !matches!(state, ConnectResult::Connected(_, _)) { - owner.close(node_id).await.ok(); + context.owner.close(node_id).await.ok(); } let mut tasks = JoinSet::new(); - let idle_timer = tokio::time::sleep(options.idle_timeout); + let idle_timer = tokio::time::sleep(context.options.idle_timeout); tokio::pin!(idle_timer); loop { @@ -125,7 +130,7 @@ async fn run_connection_actor( match handler { Some(handler) => { // Reset idle timer by creating a new one - idle_timer.set(tokio::time::sleep(options.idle_timeout)); + idle_timer.set(tokio::time::sleep(context.options.idle_timeout)); tasks.spawn(handler(&state)); } None => { @@ -147,7 +152,7 @@ async fn run_connection_actor( conn.close(1u32.into(), b""); } state = ConnectResult::ExecuteError(e); - owner.close(node_id).await.ok(); + context.owner.close(node_id).await.ok(); } Some(Err(e)) => { error!("Task panicked for node {}: {}", node_id, e); @@ -155,7 +160,7 @@ async fn run_connection_actor( conn.close(1u32.into(), b""); } state = ConnectResult::JoinError(e); - owner.close(node_id).await.ok(); + context.owner.close(node_id).await.ok(); } None => { trace!("Task was cancelled or already completed for node {}", node_id); @@ -171,7 +176,7 @@ async fn run_connection_actor( // Idle timeout - request shutdown _ = &mut idle_timer, if tasks.is_empty() => { trace!("Connection to {} idle, requesting shutdown", node_id); - owner.close(node_id).await.ok(); + context.owner.close(node_id).await.ok(); // Don't break here - wait for main actor to close our channel } @@ -196,12 +201,9 @@ async fn run_connection_actor( } struct Actor { - tx: HandlerPool0Rtt, rx: mpsc::Receiver, - endpoint: Endpoint, connections: HashMap>, - options: Arc, - alpn: Arc>, + context: Arc, } impl Actor { @@ -214,11 +216,13 @@ impl Actor { ( Self { rx, - tx: HandlerPool0Rtt { tx: tx.clone() }, - endpoint, connections: HashMap::new(), - options: Arc::new(options), - alpn: Arc::new(alpn.to_vec()), + context: Arc::new(Context { + options, + alpn: alpn.to_vec(), + endpoint, + owner: HandlerPool0Rtt { tx: tx.clone() }, + }), }, tx, ) @@ -242,21 +246,15 @@ impl Actor { } // No connection actor or it died - spawn a new one - if self.connections.len() >= self.options.max_connections { + if self.connections.len() >= self.context.options.max_connections { handler(&ConnectResult::TooManyConnections).await.ok(); continue; } let (conn_tx, conn_rx) = mpsc::channel(100); self.connections.insert(id, conn_tx.clone()); - let endpoint = self.endpoint.clone(); - let main_tx = self.tx.clone(); // Assuming we store the sender - let options = self.options.clone(); - let alpn = self.alpn.clone(); - - tokio::spawn(run_connection_actor( - endpoint, id, conn_rx, main_tx, alpn, options, - )); + let context = self.context.clone(); + tokio::spawn(run_connection_actor(id, conn_rx, context)); // Send the handler to the new actor if conn_tx.send(handler).await.is_err() { From 08cef071ed3f1c1fd23d42539e5007978a20059c Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:36:50 +0200 Subject: [PATCH 07/26] docs --- iroh-handler-pool/src/handler_pool_0rtt.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-handler-pool/src/handler_pool_0rtt.rs index ea6502a..09293e8 100644 --- a/iroh-handler-pool/src/handler_pool_0rtt.rs +++ b/iroh-handler-pool/src/handler_pool_0rtt.rs @@ -2,7 +2,7 @@ use std::{collections::HashMap, sync::Arc}; use iroh::{ Endpoint, NodeId, - endpoint::{ConnectOptions, ConnectWithOptsError, Connection}, + endpoint::{ConnectOptions, ConnectWithOptsError, Connection, ConnectionError}, }; use snafu::Snafu; use tokio::{ @@ -44,12 +44,19 @@ type BoxedHandler = Box< >; pub enum ConnectResult { + /// We got a connection Connected(Connection, broadcast::Receiver), + /// Timeout during connect Timeout, + /// Too many connections TooManyConnections, + /// Error in the first stage of connect ConnectError(ConnectWithOptsError), - ConnectionError(iroh::endpoint::ConnectionError), + /// Error in the second stage of connect + ConnectionError(ConnectionError), + /// Error during usage of the connection ExecuteError(ExecuteError), + /// Connection actor panicked JoinError(JoinError), } From 2950448c811e39c9083bce5ea19b049c2705a687 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:43:41 +0200 Subject: [PATCH 08/26] type aliases --- iroh-handler-pool/src/handler_pool.rs | 26 ++++++++++------------ iroh-handler-pool/src/handler_pool_0rtt.rs | 26 ++++++++++------------ 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-handler-pool/src/handler_pool.rs index a26830c..6f8872b 100644 --- a/iroh-handler-pool/src/handler_pool.rs +++ b/iroh-handler-pool/src/handler_pool.rs @@ -1,9 +1,10 @@ -use std::{collections::HashMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc, time::Duration}; use iroh::{ Endpoint, NodeId, endpoint::{ConnectError, Connection}, }; +use n0_future::boxed::BoxFuture; use snafu::Snafu; use tokio::{ sync::mpsc, @@ -13,16 +14,16 @@ use tokio_util::time::FutureExt; #[derive(Debug, Clone, Copy)] pub struct Options { - pub idle_timeout: std::time::Duration, - pub connect_timeout: std::time::Duration, + pub idle_timeout: Duration, + pub connect_timeout: Duration, pub max_connections: usize, } impl Default for Options { fn default() -> Self { Self { - idle_timeout: std::time::Duration::from_secs(5), - connect_timeout: std::time::Duration::from_secs(1), + idle_timeout: Duration::from_secs(5), + connect_timeout: Duration::from_secs(1), max_connections: 1024, } } @@ -35,11 +36,7 @@ struct Context { alpn: Vec, } -type BoxedHandler = Box< - dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> - + Send - + 'static, ->; +type BoxedHandler = Box BoxFuture + Send + 'static>; pub enum ConnectResult { /// We got a connection @@ -247,6 +244,8 @@ pub enum HandlerPoolError { #[derive(Debug, Snafu)] pub struct ExecuteError; +pub type ExecuteResult = std::result::Result<(), ExecuteError>; + pub struct HandlerPool { tx: mpsc::Sender, } @@ -275,11 +274,10 @@ impl HandlerPool { ) -> std::result::Result<(), HandlerPoolError> where F: FnOnce(&ConnectResult) -> Fut + Send + 'static, - Fut: std::future::Future> + Send + 'static, + Fut: std::future::Future + Send + 'static, { - let handler = Box::new(move |conn: &ConnectResult| { - Box::pin(f(conn)) as n0_future::boxed::BoxFuture> - }); + let handler = + Box::new(move |conn: &ConnectResult| Box::pin(f(conn)) as BoxFuture); self.tx .send(ActorMessage::Handle { id, handler }) diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-handler-pool/src/handler_pool_0rtt.rs index 09293e8..6eed595 100644 --- a/iroh-handler-pool/src/handler_pool_0rtt.rs +++ b/iroh-handler-pool/src/handler_pool_0rtt.rs @@ -1,9 +1,10 @@ -use std::{collections::HashMap, sync::Arc}; +use std::{collections::HashMap, sync::Arc, time::Duration}; use iroh::{ Endpoint, NodeId, endpoint::{ConnectOptions, ConnectWithOptsError, Connection, ConnectionError}, }; +use n0_future::boxed::BoxFuture; use snafu::Snafu; use tokio::{ sync::{broadcast, mpsc}, @@ -14,16 +15,16 @@ use tracing::{error, trace}; #[derive(Debug, Clone, Copy)] pub struct Options { - pub idle_timeout: std::time::Duration, - pub connect_timeout: std::time::Duration, + pub idle_timeout: Duration, + pub connect_timeout: Duration, pub max_connections: usize, } impl Default for Options { fn default() -> Self { Self { - idle_timeout: std::time::Duration::from_secs(5), - connect_timeout: std::time::Duration::from_secs(1), + idle_timeout: Duration::from_secs(5), + connect_timeout: Duration::from_secs(1), max_connections: 1024, } } @@ -37,11 +38,7 @@ struct Context { owner: HandlerPool0Rtt, } -type BoxedHandler = Box< - dyn FnOnce(&ConnectResult) -> n0_future::boxed::BoxFuture> - + Send - + 'static, ->; +type BoxedHandler = Box BoxFuture + Send + 'static>; pub enum ConnectResult { /// We got a connection @@ -290,6 +287,8 @@ pub enum HandlerPoolError { #[derive(Debug, Snafu)] pub struct ExecuteError; +pub type ExecuteResult = std::result::Result<(), ExecuteError>; + #[derive(Debug, Clone)] #[repr(transparent)] pub struct HandlerPool0Rtt { @@ -334,11 +333,10 @@ impl HandlerPool0Rtt { ) -> std::result::Result<(), HandlerPoolError> where F: FnOnce(&ConnectResult) -> Fut + Send + 'static, - Fut: std::future::Future> + Send + 'static, + Fut: std::future::Future + Send + 'static, { - let handler = Box::new(move |conn: &ConnectResult| { - Box::pin(f(conn)) as n0_future::boxed::BoxFuture> - }); + let handler = + Box::new(move |conn: &ConnectResult| Box::pin(f(conn)) as BoxFuture); self.tx .send(ActorMessage::Handle { id, handler }) From d3b77862e3f99471366fe2407fa2337fbb8da37b Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:50:51 +0200 Subject: [PATCH 09/26] Rename --- .../Cargo.toml | 2 +- .../src/connection_pool.rs | 30 ++++++++++++------- .../src/connection_pool_0rtt.rs | 18 +++++------ iroh-connection-pool/src/lib.rs | 2 ++ iroh-handler-pool/src/lib.rs | 2 -- 5 files changed, 32 insertions(+), 22 deletions(-) rename {iroh-handler-pool => iroh-connection-pool}/Cargo.toml (87%) rename iroh-handler-pool/src/handler_pool.rs => iroh-connection-pool/src/connection_pool.rs (91%) rename iroh-handler-pool/src/handler_pool_0rtt.rs => iroh-connection-pool/src/connection_pool_0rtt.rs (96%) create mode 100644 iroh-connection-pool/src/lib.rs delete mode 100644 iroh-handler-pool/src/lib.rs diff --git a/iroh-handler-pool/Cargo.toml b/iroh-connection-pool/Cargo.toml similarity index 87% rename from iroh-handler-pool/Cargo.toml rename to iroh-connection-pool/Cargo.toml index f9fdf50..b40bfc2 100644 --- a/iroh-handler-pool/Cargo.toml +++ b/iroh-connection-pool/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "iroh-handler-pool" +name = "iroh-connection-pool" version = "0.1.0" edition = "2024" diff --git a/iroh-handler-pool/src/handler_pool.rs b/iroh-connection-pool/src/connection_pool.rs similarity index 91% rename from iroh-handler-pool/src/handler_pool.rs rename to iroh-connection-pool/src/connection_pool.rs index 6f8872b..ac8d1ea 100644 --- a/iroh-handler-pool/src/handler_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -1,3 +1,13 @@ +//! A simple iroh connection pool +//! +//! Entry point is [`ConnectionPool`]. You create a connection pool for a specific +//! ALPN and [`Options`]. Then the pool will manage connections for you. +//! +//! Access to connections is via the [`ConnectionPool::connect`] method, which +//! gives you access to a connection if possible. +//! +//! It is important that you use the connection only in the future passed to +//! connect, and don't clone it out of the future. use std::{collections::HashMap, sync::Arc, time::Duration}; use iroh::{ @@ -32,7 +42,7 @@ impl Default for Options { struct Context { options: Options, endpoint: Endpoint, - owner: HandlerPool, + owner: ConnectionPool, alpn: Vec, } @@ -180,7 +190,7 @@ impl Actor { options, alpn: alpn.to_vec(), endpoint, - owner: HandlerPool { tx: tx.clone() }, + owner: ConnectionPool { tx: tx.clone() }, }), }, tx, @@ -236,8 +246,8 @@ impl Actor { } #[derive(Debug, Snafu)] -pub enum HandlerPoolError { - /// The handler pool has been shut down +pub enum ConnectionPoolError { + /// The connection pool has been shut down Shutdown, } @@ -246,11 +256,11 @@ pub struct ExecuteError; pub type ExecuteResult = std::result::Result<(), ExecuteError>; -pub struct HandlerPool { +pub struct ConnectionPool { tx: mpsc::Sender, } -impl HandlerPool { +impl ConnectionPool { pub fn new(endpoint: Endpoint, alpn: &[u8], options: Options) -> Self { let (actor, tx) = Actor::new(endpoint, alpn, options); @@ -271,7 +281,7 @@ impl HandlerPool { &self, id: NodeId, f: F, - ) -> std::result::Result<(), HandlerPoolError> + ) -> std::result::Result<(), ConnectionPoolError> where F: FnOnce(&ConnectResult) -> Fut + Send + 'static, Fut: std::future::Future + Send + 'static, @@ -282,7 +292,7 @@ impl HandlerPool { self.tx .send(ActorMessage::Handle { id, handler }) .await - .map_err(|_| HandlerPoolError::Shutdown)?; + .map_err(|_| ConnectionPoolError::Shutdown)?; Ok(()) } @@ -291,11 +301,11 @@ impl HandlerPool { /// /// This will finish pending tasks and close the connection. New tasks will /// get a new connection if they are submitted after this call - pub async fn close(&self, id: NodeId) -> std::result::Result<(), HandlerPoolError> { + pub async fn close(&self, id: NodeId) -> std::result::Result<(), ConnectionPoolError> { self.tx .send(ActorMessage::ConnectionShutdown { id }) .await - .map_err(|_| HandlerPoolError::Shutdown)?; + .map_err(|_| ConnectionPoolError::Shutdown)?; Ok(()) } } diff --git a/iroh-handler-pool/src/handler_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs similarity index 96% rename from iroh-handler-pool/src/handler_pool_0rtt.rs rename to iroh-connection-pool/src/connection_pool_0rtt.rs index 6eed595..8375514 100644 --- a/iroh-handler-pool/src/handler_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -35,7 +35,7 @@ struct Context { options: Options, alpn: Vec, endpoint: Endpoint, - owner: HandlerPool0Rtt, + owner: ConnectionPool0Rtt, } type BoxedHandler = Box BoxFuture + Send + 'static>; @@ -225,7 +225,7 @@ impl Actor { options, alpn: alpn.to_vec(), endpoint, - owner: HandlerPool0Rtt { tx: tx.clone() }, + owner: ConnectionPool0Rtt { tx: tx.clone() }, }), }, tx, @@ -280,7 +280,7 @@ impl Actor { } #[derive(Debug, Snafu)] -pub enum HandlerPoolError { +pub enum ConnectionPoolError { Shutdown, } @@ -291,11 +291,11 @@ pub type ExecuteResult = std::result::Result<(), ExecuteError>; #[derive(Debug, Clone)] #[repr(transparent)] -pub struct HandlerPool0Rtt { +pub struct ConnectionPool0Rtt { tx: mpsc::Sender, } -impl HandlerPool0Rtt { +impl ConnectionPool0Rtt { pub fn new(endpoint: Endpoint, alpn: &[u8], options: Options) -> Self { let (actor, tx) = Actor::new(endpoint, alpn, options); @@ -309,11 +309,11 @@ impl HandlerPool0Rtt { /// /// This will finish pending tasks and close the connection. New tasks will /// get a new connection if they are submitted after this call - pub async fn close(&self, id: NodeId) -> std::result::Result<(), HandlerPoolError> { + pub async fn close(&self, id: NodeId) -> std::result::Result<(), ConnectionPoolError> { self.tx .send(ActorMessage::ConnectionShutdown { id }) .await - .map_err(|_| HandlerPoolError::Shutdown)?; + .map_err(|_| ConnectionPoolError::Shutdown)?; Ok(()) } @@ -330,7 +330,7 @@ impl HandlerPool0Rtt { &self, id: NodeId, f: F, - ) -> std::result::Result<(), HandlerPoolError> + ) -> std::result::Result<(), ConnectionPoolError> where F: FnOnce(&ConnectResult) -> Fut + Send + 'static, Fut: std::future::Future + Send + 'static, @@ -341,7 +341,7 @@ impl HandlerPool0Rtt { self.tx .send(ActorMessage::Handle { id, handler }) .await - .map_err(|_| HandlerPoolError::Shutdown)?; + .map_err(|_| ConnectionPoolError::Shutdown)?; Ok(()) } diff --git a/iroh-connection-pool/src/lib.rs b/iroh-connection-pool/src/lib.rs new file mode 100644 index 0000000..48490b8 --- /dev/null +++ b/iroh-connection-pool/src/lib.rs @@ -0,0 +1,2 @@ +pub mod connection_pool; +pub mod connection_pool_0rtt; diff --git a/iroh-handler-pool/src/lib.rs b/iroh-handler-pool/src/lib.rs deleted file mode 100644 index b7754b0..0000000 --- a/iroh-handler-pool/src/lib.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub mod handler_pool; -pub mod handler_pool_0rtt; From 8565f1953e4c4c0ed1b73890539b7dfcedff687e Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 15:53:09 +0200 Subject: [PATCH 10/26] docs --- iroh-connection-pool/src/connection_pool_0rtt.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index 8375514..ccf31d1 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -1,3 +1,16 @@ +//! An iroh connection pool that supports 0Rtt connections +//! +//! Entry point is [`ConnectionPool0Rtt`]. You create a connection pool for a specific +//! ALPN and [`Options`]. Then the pool will manage connections for you. +//! +//! Access to connections is via the [`ConnectionPool0Rtt::connect`] method, which +//! gives you access to a connection if possible. +//! +//! It is important that you use the connection only in the future passed to +//! connect, and don't clone it out of the future. +//! +//! For what 0Rtt connections are and why you might want to use them, see this +//! [blog post](https://www.iroh.computer/blog/0rtt-api). use std::{collections::HashMap, sync::Arc, time::Duration}; use iroh::{ From 0445f6c963465fe596c4f6a068fea58cfa62d4e6 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 16:13:41 +0200 Subject: [PATCH 11/26] more refactoring --- iroh-connection-pool/src/connection_pool.rs | 54 ++++++++++------ .../src/connection_pool_0rtt.rs | 61 ++++++++++++------- 2 files changed, 74 insertions(+), 41 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index ac8d1ea..722cbf5 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -22,6 +22,7 @@ use tokio::{ }; use tokio_util::time::FutureExt; +/// Configuration options for the connection pool #[derive(Debug, Clone, Copy)] pub struct Options { pub idle_timeout: Duration, @@ -46,11 +47,14 @@ struct Context { alpn: Vec, } -type BoxedHandler = Box BoxFuture + Send + 'static>; +type BoxedHandler = + Box BoxFuture + Send + 'static>; -pub enum ConnectResult { - /// We got a connection - Connected(Connection), +/// Error when a connection can not be acquired +/// +/// This includes the normal iroh connection errors as well as pool specific +/// errors such as timeouts and connection limits. +pub enum PoolConnectError { /// Timeout during connect Timeout, /// Too many connections @@ -63,6 +67,8 @@ pub enum ConnectResult { JoinError(JoinError), } +pub type PoolConnectResult = std::result::Result; + enum ActorMessage { Handle { id: NodeId, handler: BoxedHandler }, ConnectionShutdown { id: NodeId }, @@ -81,11 +87,11 @@ async fn run_connection_actor( .timeout(context.options.connect_timeout) .await { - Ok(Ok(conn)) => ConnectResult::Connected(conn), - Ok(Err(e)) => ConnectResult::ConnectError(e), - Err(_) => ConnectResult::Timeout, + Ok(Ok(conn)) => Ok(conn), + Ok(Err(e)) => Err(PoolConnectError::ConnectError(e)), + Err(_) => Err(PoolConnectError::Timeout), }; - if !matches!(state, ConnectResult::Connected(_)) { + if !matches!(state, Ok(_)) { context.owner.close(node_id).await.ok(); } @@ -120,18 +126,18 @@ async fn run_connection_actor( } Some(Ok(Err(e))) => { tracing::error!("Task failed for node {}: {}", node_id, e); - if let ConnectResult::Connected(conn) = state { + if let Ok(conn) = state { conn.close(1u32.into(), b""); } - state = ConnectResult::ExecuteError(e); + state = Err(PoolConnectError::ExecuteError(e)); let _ = context.owner.close(node_id).await; } Some(Err(e)) => { tracing::error!("Task panicked for node {}: {}", node_id, e); - if let ConnectResult::Connected(conn) = state { + if let Ok(conn) = state { conn.close(1u32.into(), b""); } - state = ConnectResult::JoinError(e); + state = Err(PoolConnectError::JoinError(e)); let _ = context.owner.close(node_id).await; } None => { @@ -162,7 +168,7 @@ async fn run_connection_actor( } } - if let ConnectResult::Connected(connection) = &state { + if let Ok(connection) = &state { connection.close(0u32.into(), b""); } @@ -216,7 +222,9 @@ impl Actor { // No connection actor or it died - spawn a new one if self.connections.len() >= self.context.options.max_connections { - handler(&ConnectResult::TooManyConnections).await.ok(); + handler(&Err(PoolConnectError::TooManyConnections)) + .await + .ok(); continue; } let (conn_tx, conn_rx) = mpsc::channel(100); @@ -245,17 +253,25 @@ impl Actor { } } +/// Error when calling a fn on the [`ConnectionPool`]. +/// +/// The only thing that can go wrong is that the connection pool is shut down. #[derive(Debug, Snafu)] pub enum ConnectionPoolError { /// The connection pool has been shut down Shutdown, } +/// An error during the usage of the connection. +/// +/// The connection pool will recreate the connection if a handler returns this +/// error. If you don't want this, swallow the error in the handler. #[derive(Debug, Snafu)] pub struct ExecuteError; -pub type ExecuteResult = std::result::Result<(), ExecuteError>; +type ExecuteResult = std::result::Result<(), ExecuteError>; +/// A connection pool pub struct ConnectionPool { tx: mpsc::Sender, } @@ -273,8 +289,8 @@ impl ConnectionPool { /// Connect to a node and execute the given handler function /// /// The connection will either be a new connection or an existing one if it is already established. - /// If connection establishment succeeds, the handler will be called with a [`ConnectResult::Connected`]. - /// If connection establishment fails, the handler will get passed a [`ConnectResult`] containing the error. + /// If connection establishment succeeds, the handler will be called with a [`Ok`]. + /// If connection establishment fails, the handler will get passed a [`Err`] containing the error. /// /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. pub async fn connect( @@ -283,11 +299,11 @@ impl ConnectionPool { f: F, ) -> std::result::Result<(), ConnectionPoolError> where - F: FnOnce(&ConnectResult) -> Fut + Send + 'static, + F: FnOnce(&PoolConnectResult) -> Fut + Send + 'static, Fut: std::future::Future + Send + 'static, { let handler = - Box::new(move |conn: &ConnectResult| Box::pin(f(conn)) as BoxFuture); + Box::new(move |conn: &PoolConnectResult| Box::pin(f(conn)) as BoxFuture); self.tx .send(ActorMessage::Handle { id, handler }) diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index ccf31d1..f2cb631 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -26,6 +26,7 @@ use tokio::{ use tokio_util::time::FutureExt; use tracing::{error, trace}; +/// Configuration options for the 0rtt connection pool #[derive(Debug, Clone, Copy)] pub struct Options { pub idle_timeout: Duration, @@ -51,11 +52,17 @@ struct Context { owner: ConnectionPool0Rtt, } -type BoxedHandler = Box BoxFuture + Send + 'static>; +type BoxedHandler = + Box BoxFuture + Send + 'static>; -pub enum ConnectResult { - /// We got a connection - Connected(Connection, broadcast::Receiver), +pub type PoolConnectResult = + std::result::Result<(Connection, broadcast::Receiver), PoolConnectError>; + +/// Error when a connection can not be acquired +/// +/// This includes the normal iroh connection errors as well as pool specific +/// errors such as timeouts and connection limits. +pub enum PoolConnectError { /// Timeout during connect Timeout, /// Too many connections @@ -85,7 +92,7 @@ async fn connect( endpoint: &Endpoint, node_id: NodeId, alpn: &[u8], -) -> (ConnectResult, Option>) { +) -> (PoolConnectResult, Option>) { let connecting = match endpoint .connect_with_opts(node_id, alpn, ConnectOptions::default()) .await @@ -93,7 +100,7 @@ async fn connect( Ok(connecting) => connecting, Err(cause) => { trace!("Failed to connect to node {}: {}", node_id, cause); - return (ConnectResult::ConnectError(cause), None); + return (Err(PoolConnectError::ConnectError(cause)), None); } }; let (conn, zero_rtt_accepted) = match connecting.into_0rtt() { @@ -104,8 +111,8 @@ async fn connect( Err(connecting) => { trace!("Failed to connect using 0-RTT to node {}", node_id); let res = match connecting.await { - Err(cause) => ConnectResult::ConnectionError(cause), - Ok(connection) => ConnectResult::Connected(connection, accepted(true)), + Err(cause) => Err(PoolConnectError::ConnectionError(cause)), + Ok(connection) => Ok((connection, accepted(true))), }; return (res, None); } @@ -114,7 +121,7 @@ async fn connect( let complete = Box::pin(async move { tx.send(zero_rtt_accepted.await).ok(); }); - (ConnectResult::Connected(conn, rx), Some(complete)) + (Ok((conn, rx)), Some(complete)) } /// Run a connection actor for a single node @@ -129,9 +136,9 @@ async fn run_connection_actor( .await { Ok((state, forwarder)) => (state, forwarder), - Err(_) => (ConnectResult::Timeout, None), + Err(_) => (Err(PoolConnectError::Timeout), None), }; - if !matches!(state, ConnectResult::Connected(_, _)) { + if !matches!(state, Ok((_, _))) { context.owner.close(node_id).await.ok(); } let mut tasks = JoinSet::new(); @@ -165,18 +172,18 @@ async fn run_connection_actor( } Some(Ok(Err(e))) => { trace!("Task failed for node {}: {}", node_id, e); - if let ConnectResult::Connected(conn, _) = state { + if let Ok((conn, _)) = state { conn.close(1u32.into(), b""); } - state = ConnectResult::ExecuteError(e); + state = Err(PoolConnectError::ExecuteError(e)); context.owner.close(node_id).await.ok(); } Some(Err(e)) => { error!("Task panicked for node {}: {}", node_id, e); - if let ConnectResult::Connected(conn, _) = state { + if let Ok((conn, _)) = state { conn.close(1u32.into(), b""); } - state = ConnectResult::JoinError(e); + state = Err(PoolConnectError::JoinError(e)); context.owner.close(node_id).await.ok(); } None => { @@ -210,7 +217,7 @@ async fn run_connection_actor( } } - if let ConnectResult::Connected(conn, _) = state { + if let Ok((conn, _)) = state { conn.close(0u32.into(), b""); } @@ -264,7 +271,9 @@ impl Actor { // No connection actor or it died - spawn a new one if self.connections.len() >= self.context.options.max_connections { - handler(&ConnectResult::TooManyConnections).await.ok(); + handler(&Err(PoolConnectError::TooManyConnections)) + .await + .ok(); continue; } let (conn_tx, conn_rx) = mpsc::channel(100); @@ -292,16 +301,24 @@ impl Actor { } } +/// Error when calling a fn on the [`ConnectionPool0Rtt`]. +/// +/// The only thing that can go wrong is that the connection pool is shut down. #[derive(Debug, Snafu)] pub enum ConnectionPoolError { Shutdown, } +/// An error during the usage of the connection. +/// +/// The connection pool will recreate the connection if a handler returns this +/// error. If you don't want this, swallow the error in the handler. #[derive(Debug, Snafu)] pub struct ExecuteError; -pub type ExecuteResult = std::result::Result<(), ExecuteError>; +type ExecuteResult = std::result::Result<(), ExecuteError>; +/// A connection pool for 0-RTT connections #[derive(Debug, Clone)] #[repr(transparent)] pub struct ConnectionPool0Rtt { @@ -333,8 +350,8 @@ impl ConnectionPool0Rtt { /// Connect to a node and execute the given handler function /// /// The connection will either be a new connection or an existing one if it is already established. - /// If connection establishment succeeds, the handler will be called with a [`ConnectResult::Connected`]. - /// If connection establishment fails, the handler will get passed a [`ConnectResult`] containing the error. + /// If connection establishment succeeds, the handler will be called with a [`Ok`]. + /// If connection establishment fails, the handler will get passed a [`Err`] containing the error. /// /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. /// If the fn returns an error, it is assumed that the connection is no longer valid. This will cause @@ -345,11 +362,11 @@ impl ConnectionPool0Rtt { f: F, ) -> std::result::Result<(), ConnectionPoolError> where - F: FnOnce(&ConnectResult) -> Fut + Send + 'static, + F: FnOnce(&PoolConnectResult) -> Fut + Send + 'static, Fut: std::future::Future + Send + 'static, { let handler = - Box::new(move |conn: &ConnectResult| Box::pin(f(conn)) as BoxFuture); + Box::new(move |conn: &PoolConnectResult| Box::pin(f(conn)) as BoxFuture); self.tx .send(ActorMessage::Handle { id, handler }) From 2dc40632570c7f070cfb5cbfa3189d66e78781dd Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 16:44:15 +0200 Subject: [PATCH 12/26] Use MaybeFuture --- iroh-connection-pool/Cargo.toml | 2 +- iroh-connection-pool/src/connection_pool.rs | 27 +++++++---- .../src/connection_pool_0rtt.rs | 47 +++++++++++-------- 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/iroh-connection-pool/Cargo.toml b/iroh-connection-pool/Cargo.toml index b40bfc2..c3bafa3 100644 --- a/iroh-connection-pool/Cargo.toml +++ b/iroh-connection-pool/Cargo.toml @@ -5,7 +5,7 @@ edition = "2024" [dependencies] iroh = "0.91.1" -n0-future = "0.1.3" +n0-future = "0.2.0" snafu = "0.8.6" tokio = "1.45" tokio-util = { version = "0.7", features = ["time"] } diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 722cbf5..c0c859c 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -14,7 +14,7 @@ use iroh::{ Endpoint, NodeId, endpoint::{ConnectError, Connection}, }; -use n0_future::boxed::BoxFuture; +use n0_future::{MaybeFuture, boxed::BoxFuture}; use snafu::Snafu; use tokio::{ sync::mpsc, @@ -91,12 +91,14 @@ async fn run_connection_actor( Ok(Err(e)) => Err(PoolConnectError::ConnectError(e)), Err(_) => Err(PoolConnectError::Timeout), }; - if !matches!(state, Ok(_)) { - context.owner.close(node_id).await.ok(); + if state.is_err() { + if context.owner.close(node_id).await.is_err() { + return; + } } let mut tasks = JoinSet::new(); - let idle_timer = tokio::time::sleep(context.options.idle_timeout); + let idle_timer = MaybeFuture::default(); tokio::pin!(idle_timer); loop { @@ -107,8 +109,8 @@ async fn run_connection_actor( handler = rx.recv() => { match handler { Some(handler) => { - // Reset idle timer by creating a new one - idle_timer.set(tokio::time::sleep(context.options.idle_timeout)); + // clear the idle timer + idle_timer.as_mut().set_none(); tasks.spawn(handler(&state)); } None => { @@ -145,14 +147,19 @@ async fn run_connection_actor( } } - // If no tasks left and channel is closed, we can exit - if tasks.is_empty() && rx.is_closed() { - break; + // We are idle + if tasks.is_empty() { + // If the channel is closed, we can exit + if rx.is_closed() { + break; + } + // set the idle timer + idle_timer.as_mut().set_future(tokio::time::sleep(context.options.idle_timeout)); } } // Idle timeout - request shutdown - _ = &mut idle_timer, if tasks.is_empty() => { + _ = &mut idle_timer => { tracing::debug!("Connection to {} idle, requesting shutdown", node_id); context.owner.close(node_id).await.ok(); diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index f2cb631..06423d7 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -17,7 +17,7 @@ use iroh::{ Endpoint, NodeId, endpoint::{ConnectOptions, ConnectWithOptsError, Connection, ConnectionError}, }; -use n0_future::boxed::BoxFuture; +use n0_future::{MaybeFuture, boxed::BoxFuture}; use snafu::Snafu; use tokio::{ sync::{broadcast, mpsc}, @@ -92,7 +92,7 @@ async fn connect( endpoint: &Endpoint, node_id: NodeId, alpn: &[u8], -) -> (PoolConnectResult, Option>) { +) -> (PoolConnectResult, MaybeFuture>) { let connecting = match endpoint .connect_with_opts(node_id, alpn, ConnectOptions::default()) .await @@ -100,7 +100,10 @@ async fn connect( Ok(connecting) => connecting, Err(cause) => { trace!("Failed to connect to node {}: {}", node_id, cause); - return (Err(PoolConnectError::ConnectError(cause)), None); + return ( + Err(PoolConnectError::ConnectError(cause)), + MaybeFuture::None, + ); } }; let (conn, zero_rtt_accepted) = match connecting.into_0rtt() { @@ -114,14 +117,14 @@ async fn connect( Err(cause) => Err(PoolConnectError::ConnectionError(cause)), Ok(connection) => Ok((connection, accepted(true))), }; - return (res, None); + return (res, MaybeFuture::None); } }; let (tx, rx) = broadcast::channel(1); let complete = Box::pin(async move { tx.send(zero_rtt_accepted.await).ok(); }); - (Ok((conn, rx)), Some(complete)) + (Ok((conn, rx)), MaybeFuture::Some(complete)) } /// Run a connection actor for a single node @@ -131,19 +134,22 @@ async fn run_connection_actor( context: Arc, ) { // Connect to the node - let (mut state, mut forwarder) = match connect(&context.endpoint, node_id, &context.alpn) + let (mut state, forwarder) = match connect(&context.endpoint, node_id, &context.alpn) .timeout(context.options.connect_timeout) .await { Ok((state, forwarder)) => (state, forwarder), - Err(_) => (Err(PoolConnectError::Timeout), None), + Err(_) => (Err(PoolConnectError::Timeout), MaybeFuture::None), }; - if !matches!(state, Ok((_, _))) { - context.owner.close(node_id).await.ok(); + if state.is_err() { + if context.owner.close(node_id).await.is_err() { + return; + } } let mut tasks = JoinSet::new(); - let idle_timer = tokio::time::sleep(context.options.idle_timeout); + let idle_timer = MaybeFuture::default(); tokio::pin!(idle_timer); + tokio::pin!(forwarder); loop { tokio::select! { @@ -153,8 +159,7 @@ async fn run_connection_actor( handler = rx.recv() => { match handler { Some(handler) => { - // Reset idle timer by creating a new one - idle_timer.set(tokio::time::sleep(context.options.idle_timeout)); + idle_timer.as_mut().set_none(); tasks.spawn(handler(&state)); } None => { @@ -191,22 +196,26 @@ async fn run_connection_actor( } } - // If no tasks left and channel is closed, we can exit - if tasks.is_empty() && rx.is_closed() { - break; + // We are idle + if tasks.is_empty() { + // If the channel is closed, we can exit + if rx.is_closed() { + break; + } + // set the idle timer + idle_timer.as_mut().set_future(tokio::time::sleep(context.options.idle_timeout)); } } // Idle timeout - request shutdown - _ = &mut idle_timer, if tasks.is_empty() => { + _ = &mut idle_timer => { trace!("Connection to {} idle, requesting shutdown", node_id); context.owner.close(node_id).await.ok(); // Don't break here - wait for main actor to close our channel } - _ = forwarder.as_mut().unwrap(), if forwarder.is_some() => { - forwarder = None; - } + // Poll the forwarder if we have one + _ = &mut forwarder => {} } } From 6a7653ba04d0910ca504b29498472086d11a5b2b Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 16:44:56 +0200 Subject: [PATCH 13/26] clippy --- iroh-connection-pool/src/connection_pool.rs | 5 ++--- iroh-connection-pool/src/connection_pool_0rtt.rs | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index c0c859c..d16a3a9 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -91,11 +91,10 @@ async fn run_connection_actor( Ok(Err(e)) => Err(PoolConnectError::ConnectError(e)), Err(_) => Err(PoolConnectError::Timeout), }; - if state.is_err() { - if context.owner.close(node_id).await.is_err() { + if state.is_err() + && context.owner.close(node_id).await.is_err() { return; } - } let mut tasks = JoinSet::new(); let idle_timer = MaybeFuture::default(); diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index 06423d7..3c5bc2d 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -141,11 +141,10 @@ async fn run_connection_actor( Ok((state, forwarder)) => (state, forwarder), Err(_) => (Err(PoolConnectError::Timeout), MaybeFuture::None), }; - if state.is_err() { - if context.owner.close(node_id).await.is_err() { + if state.is_err() + && context.owner.close(node_id).await.is_err() { return; } - } let mut tasks = JoinSet::new(); let idle_timer = MaybeFuture::default(); tokio::pin!(idle_timer); From 79cc245ab63add8dc30bb6e602daa8550d7ee671 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 13 Aug 2025 16:56:25 +0200 Subject: [PATCH 14/26] fmt --- iroh-connection-pool/src/connection_pool.rs | 24 ++++++++----------- .../src/connection_pool_0rtt.rs | 24 ++++++++----------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index d16a3a9..51cd5e1 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -91,10 +91,9 @@ async fn run_connection_actor( Ok(Err(e)) => Err(PoolConnectError::ConnectError(e)), Err(_) => Err(PoolConnectError::Timeout), }; - if state.is_err() - && context.owner.close(node_id).await.is_err() { - return; - } + if state.is_err() && context.owner.close(node_id).await.is_err() { + return; + } let mut tasks = JoinSet::new(); let idle_timer = MaybeFuture::default(); @@ -120,30 +119,27 @@ async fn run_connection_actor( } // Handle completed tasks - task_result = tasks.join_next(), if !tasks.is_empty() => { + Some(task_result) = tasks.join_next(), if !tasks.is_empty() => { match task_result { - Some(Ok(Ok(()))) => { + Ok(Ok(())) => { tracing::debug!("Task completed for node {}", node_id); } - Some(Ok(Err(e))) => { + Ok(Err(e)) => { tracing::error!("Task failed for node {}: {}", node_id, e); if let Ok(conn) = state { - conn.close(1u32.into(), b""); + conn.close(1u32.into(), b"error"); } state = Err(PoolConnectError::ExecuteError(e)); let _ = context.owner.close(node_id).await; } - Some(Err(e)) => { + Err(e) => { tracing::error!("Task panicked for node {}: {}", node_id, e); if let Ok(conn) = state { - conn.close(1u32.into(), b""); + conn.close(1u32.into(), b"panic"); } state = Err(PoolConnectError::JoinError(e)); let _ = context.owner.close(node_id).await; } - None => { - tracing::debug!("Task was cancelled or already completed for node {}", node_id); - } } // We are idle @@ -175,7 +171,7 @@ async fn run_connection_actor( } if let Ok(connection) = &state { - connection.close(0u32.into(), b""); + connection.close(0u32.into(), b"idle"); } tracing::debug!("Connection actor for {} shutting down", node_id); diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index 3c5bc2d..6c571df 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -141,10 +141,9 @@ async fn run_connection_actor( Ok((state, forwarder)) => (state, forwarder), Err(_) => (Err(PoolConnectError::Timeout), MaybeFuture::None), }; - if state.is_err() - && context.owner.close(node_id).await.is_err() { - return; - } + if state.is_err() && context.owner.close(node_id).await.is_err() { + return; + } let mut tasks = JoinSet::new(); let idle_timer = MaybeFuture::default(); tokio::pin!(idle_timer); @@ -169,30 +168,27 @@ async fn run_connection_actor( } // Handle completed tasks - task_result = tasks.join_next(), if !tasks.is_empty() => { + Some(task_result) = tasks.join_next(), if !tasks.is_empty() => { match task_result { - Some(Ok(Ok(()))) => { + Ok(Ok(())) => { trace!("Task completed for node {}", node_id); } - Some(Ok(Err(e))) => { + Ok(Err(e)) => { trace!("Task failed for node {}: {}", node_id, e); if let Ok((conn, _)) = state { - conn.close(1u32.into(), b""); + conn.close(1u32.into(), b"error"); } state = Err(PoolConnectError::ExecuteError(e)); context.owner.close(node_id).await.ok(); } - Some(Err(e)) => { + Err(e) => { error!("Task panicked for node {}: {}", node_id, e); if let Ok((conn, _)) = state { - conn.close(1u32.into(), b""); + conn.close(1u32.into(), b"panic"); } state = Err(PoolConnectError::JoinError(e)); context.owner.close(node_id).await.ok(); } - None => { - trace!("Task was cancelled or already completed for node {}", node_id); - } } // We are idle @@ -226,7 +222,7 @@ async fn run_connection_actor( } if let Ok((conn, _)) = state { - conn.close(0u32.into(), b""); + conn.close(0u32.into(), b"idle"); } trace!("Connection actor for {} shutting down", node_id); From f9951e768efa50123f16bf0bafa1839ce01c6318 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Thu, 14 Aug 2025 12:42:53 +0200 Subject: [PATCH 15/26] Update smoke test to check that connection id changes after delay --- iroh-connection-pool/Cargo.toml | 8 +- iroh-connection-pool/src/connection_pool.rs | 100 ++++++++--- iroh-connection-pool/src/lib.rs | 3 + iroh-connection-pool/src/tests.rs | 185 ++++++++++++++++++++ 4 files changed, 272 insertions(+), 24 deletions(-) create mode 100644 iroh-connection-pool/src/tests.rs diff --git a/iroh-connection-pool/Cargo.toml b/iroh-connection-pool/Cargo.toml index c3bafa3..d0bbd1b 100644 --- a/iroh-connection-pool/Cargo.toml +++ b/iroh-connection-pool/Cargo.toml @@ -4,9 +4,15 @@ version = "0.1.0" edition = "2024" [dependencies] -iroh = "0.91.1" +iroh = { version = "0.91.1", git = "https://github.com/n0-computer/iroh", branch = "connection-speed-test" } n0-future = "0.2.0" snafu = "0.8.6" tokio = "1.45" tokio-util = { version = "0.7", features = ["time"] } tracing = "0.1.41" + +[dev-dependencies] +anyhow = "1.0.99" +n0-snafu = "0.2.1" +testresult = "0.4.1" +tracing-subscriber = { version = "0.3.19", features = ["env-filter"] } diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 51cd5e1..f0f94bd 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -17,10 +17,11 @@ use iroh::{ use n0_future::{MaybeFuture, boxed::BoxFuture}; use snafu::Snafu; use tokio::{ - sync::mpsc, + sync::{mpsc, oneshot}, task::{JoinError, JoinSet}, }; use tokio_util::time::FutureExt; +use tracing::{debug, error, trace}; /// Configuration options for the connection pool #[derive(Debug, Clone, Copy)] @@ -47,24 +48,36 @@ struct Context { alpn: Vec, } -type BoxedHandler = - Box BoxFuture + Send + 'static>; +type BoxedHandler = Box BoxFuture + Send + 'static>; /// Error when a connection can not be acquired /// /// This includes the normal iroh connection errors as well as pool specific /// errors such as timeouts and connection limits. +#[derive(Debug, Clone)] pub enum PoolConnectError { /// Timeout during connect Timeout, /// Too many connections TooManyConnections, /// Error during connect - ConnectError(ConnectError), + ConnectError(Arc), /// Error during last execute - ExecuteError(ExecuteError), + ExecuteError(Arc), /// Handler actor panicked - JoinError(JoinError), + JoinError(Arc), +} + +impl std::fmt::Display for PoolConnectError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PoolConnectError::Timeout => write!(f, "Connection timed out"), + PoolConnectError::TooManyConnections => write!(f, "Too many connections"), + PoolConnectError::ConnectError(e) => write!(f, "Connection error: {}", e), + PoolConnectError::ExecuteError(e) => write!(f, "Execution error: {}", e), + PoolConnectError::JoinError(e) => write!(f, "Join error: {}", e), + } + } } pub type PoolConnectResult = std::result::Result; @@ -88,11 +101,14 @@ async fn run_connection_actor( .await { Ok(Ok(conn)) => Ok(conn), - Ok(Err(e)) => Err(PoolConnectError::ConnectError(e)), + Ok(Err(e)) => Err(PoolConnectError::ConnectError(Arc::new(e))), Err(_) => Err(PoolConnectError::Timeout), }; - if state.is_err() && context.owner.close(node_id).await.is_err() { - return; + if let Err(e) = &state { + debug!(%node_id, "Failed to connect {e:?}, requesting shutdown"); + if context.owner.close(node_id).await.is_err() { + return; + } } let mut tasks = JoinSet::new(); @@ -107,9 +123,10 @@ async fn run_connection_actor( handler = rx.recv() => { match handler { Some(handler) => { + trace!(%node_id, "Received new task"); // clear the idle timer idle_timer.as_mut().set_none(); - tasks.spawn(handler(&state)); + tasks.spawn(handler(state.clone())); } None => { // Channel closed - finish remaining tasks and exit @@ -122,22 +139,22 @@ async fn run_connection_actor( Some(task_result) = tasks.join_next(), if !tasks.is_empty() => { match task_result { Ok(Ok(())) => { - tracing::debug!("Task completed for node {}", node_id); + debug!(%node_id, "Task completed"); } Ok(Err(e)) => { - tracing::error!("Task failed for node {}: {}", node_id, e); + error!(%node_id, "Task failed: {}", e); if let Ok(conn) = state { conn.close(1u32.into(), b"error"); } - state = Err(PoolConnectError::ExecuteError(e)); + state = Err(PoolConnectError::ExecuteError(Arc::new(e))); let _ = context.owner.close(node_id).await; } Err(e) => { - tracing::error!("Task panicked for node {}: {}", node_id, e); + error!(%node_id, "Task panicked: {}", e); if let Ok(conn) = state { conn.close(1u32.into(), b"panic"); } - state = Err(PoolConnectError::JoinError(e)); + state = Err(PoolConnectError::JoinError(Arc::new(e))); let _ = context.owner.close(node_id).await; } } @@ -155,8 +172,7 @@ async fn run_connection_actor( // Idle timeout - request shutdown _ = &mut idle_timer => { - tracing::debug!("Connection to {} idle, requesting shutdown", node_id); - + debug!(%node_id, "Connection idle, requesting shutdown"); context.owner.close(node_id).await.ok(); // Don't break here - wait for main actor to close our channel } @@ -166,7 +182,7 @@ async fn run_connection_actor( // Wait for remaining tasks to complete while let Some(task_result) = tasks.join_next().await { if let Err(e) = task_result { - tracing::error!("Task failed during shutdown for node {}: {}", node_id, e); + error!(%node_id, "Task failed during shutdown: {}", e); } } @@ -174,7 +190,7 @@ async fn run_connection_actor( connection.close(0u32.into(), b"idle"); } - tracing::debug!("Connection actor for {} shutting down", node_id); + debug!(%node_id, "Connection actor shutting down"); } struct Actor { @@ -224,7 +240,7 @@ impl Actor { // No connection actor or it died - spawn a new one if self.connections.len() >= self.context.options.max_connections { - handler(&Err(PoolConnectError::TooManyConnections)) + handler(Err(PoolConnectError::TooManyConnections)) .await .ok(); continue; @@ -273,7 +289,14 @@ pub struct ExecuteError; type ExecuteResult = std::result::Result<(), ExecuteError>; +impl From for ExecuteError { + fn from(_: PoolConnectError) -> Self { + ExecuteError + } +} + /// A connection pool +#[derive(Debug, Clone)] pub struct ConnectionPool { tx: mpsc::Sender, } @@ -301,11 +324,11 @@ impl ConnectionPool { f: F, ) -> std::result::Result<(), ConnectionPoolError> where - F: FnOnce(&PoolConnectResult) -> Fut + Send + 'static, - Fut: std::future::Future + Send + 'static, + F: FnOnce(PoolConnectResult) -> Fut + Send + 'static, + Fut: Future + Send + 'static, { let handler = - Box::new(move |conn: &PoolConnectResult| Box::pin(f(conn)) as BoxFuture); + Box::new(move |conn: PoolConnectResult| Box::pin(f(conn)) as BoxFuture); self.tx .send(ActorMessage::Handle { id, handler }) @@ -315,6 +338,37 @@ impl ConnectionPool { Ok(()) } + pub async fn with_connection( + &self, + id: NodeId, + f: F, + ) -> Result, PoolConnectError>, ConnectionPoolError> + where + F: FnOnce(Connection) -> Fut + Send + 'static, + Fut: Future> + Send + 'static, + I: Send + 'static, + E: Send + 'static, + { + let (tx, rx) = oneshot::channel(); + self.connect(id, |conn| async move { + let (res, ret) = match conn { + Ok(connection) => { + let res = f(connection).await; + let ret = match &res { + Ok(_) => Ok(()), + Err(_) => Err(ExecuteError), + }; + (Ok(res), ret) + } + Err(e) => (Err(e), Err(ExecuteError)), + }; + tx.send(res).ok(); + ret + }) + .await?; + rx.await.map_err(|_| ConnectionPoolError::Shutdown) + } + /// Close an existing connection, if it exists /// /// This will finish pending tasks and close the connection. New tasks will diff --git a/iroh-connection-pool/src/lib.rs b/iroh-connection-pool/src/lib.rs index 48490b8..0c5a5be 100644 --- a/iroh-connection-pool/src/lib.rs +++ b/iroh-connection-pool/src/lib.rs @@ -1,2 +1,5 @@ pub mod connection_pool; pub mod connection_pool_0rtt; + +#[cfg(test)] +mod tests; diff --git a/iroh-connection-pool/src/tests.rs b/iroh-connection-pool/src/tests.rs new file mode 100644 index 0000000..fd77547 --- /dev/null +++ b/iroh-connection-pool/src/tests.rs @@ -0,0 +1,185 @@ +use std::{collections::BTreeMap, time::Duration}; + +use iroh::{ + NodeAddr, NodeId, SecretKey, Watcher, + discovery::static_provider::StaticProvider, + endpoint::Connection, + protocol::{AcceptError, ProtocolHandler, Router}, +}; +use n0_future::{BufferedStreamExt, StreamExt, stream}; +use n0_snafu::ResultExt; +use testresult::TestResult; +use tracing::trace; + +use crate::connection_pool::{ConnectionPool, ConnectionPoolError, Options, PoolConnectError}; + +const ECHO_ALPN: &[u8] = b"echo"; + +#[derive(Debug, Clone)] +struct Echo; + +impl ProtocolHandler for Echo { + async fn accept(&self, connection: Connection) -> Result<(), AcceptError> { + let conn_id = connection.stable_id(); + let id = connection.remote_node_id().map_err(AcceptError::from_err)?; + trace!(%id, %conn_id, "Accepting echo connection"); + loop { + match connection.accept_bi().await { + Ok((mut send, mut recv)) => { + trace!(%id, %conn_id, "Accepted echo request"); + tokio::io::copy(&mut recv, &mut send).await?; + send.finish().map_err(AcceptError::from_err)?; + } + Err(e) => { + trace!(%id, %conn_id, "Failed to accept echo request {e}"); + break; + } + } + } + Ok(()) + } +} + +async fn echo_client(conn: Connection, text: &[u8]) -> n0_snafu::Result> { + let conn_id = conn.stable_id(); + let id = conn.remote_node_id().e()?; + trace!(%id, %conn_id, "Sending echo request"); + let (mut send, mut recv) = conn.open_bi().await.e()?; + send.write_all(text).await.e()?; + send.finish().e()?; + let response = recv.read_to_end(1000).await.e()?; + trace!(%id, %conn_id, "Received echo response"); + Ok(response) +} + +async fn echo_server() -> TestResult<(NodeAddr, Router)> { + let endpoint = iroh::Endpoint::builder() + .alpns(vec![ECHO_ALPN.to_vec()]) + .bind() + .await?; + let addr = endpoint.node_addr().initialized().await; + let router = iroh::protocol::Router::builder(endpoint) + .accept(ECHO_ALPN, Echo) + .spawn(); + + Ok((addr, router)) +} + +async fn echo_servers(n: usize) -> TestResult> { + stream::iter(0..n) + .map(|_| echo_server()) + .buffered_unordered(16) + .collect::>() + .await + .into_iter() + .collect() +} + +fn test_options() -> Options { + Options { + idle_timeout: Duration::from_millis(100), + connect_timeout: Duration::from_secs(2), + max_connections: 32, + } +} + +struct EchoClient { + pool: ConnectionPool, +} + +impl EchoClient { + async fn echo( + &self, + id: NodeId, + text: Vec, + ) -> Result< + Result), n0_snafu::Error>, PoolConnectError>, + ConnectionPoolError, + > { + self.pool + .with_connection(id, |conn| async move { + let id = conn.stable_id(); + let res = echo_client(conn, &text).await?; + Ok((id, res)) + }) + .await + } +} + +#[tokio::test] +async fn connection_pool_errors() -> TestResult<()> { + // set up static discovery for all addrs + let discovery = StaticProvider::new(); + let endpoint = iroh::Endpoint::builder() + .discovery(discovery.clone()) + .bind() + .await?; + let pool = ConnectionPool::new(endpoint, ECHO_ALPN, test_options()); + let client = EchoClient { pool }; + { + let non_existing = SecretKey::from_bytes(&[0; 32]).public(); + let res = client.echo(non_existing, b"Hello, world!".to_vec()).await?; + // trying to connect to a non-existing id will fail with ConnectError + // because we don't have any information about the node + assert!(matches!(res, Err(PoolConnectError::ConnectError(_)))); + } + { + let non_listening = SecretKey::from_bytes(&[0; 32]).public(); + // make up fake node info + discovery.add_node_info(NodeAddr { + node_id: non_listening, + relay_url: None, + direct_addresses: vec!["127.0.0.1:12121".parse().unwrap()] + .into_iter() + .collect(), + }); + // trying to connect to an id for which we have info, but the other + // end is not listening, will lead to a timeout. + let res = client + .echo(non_listening, b"Hello, world!".to_vec()) + .await?; + assert!(matches!(res, Err(PoolConnectError::Timeout))); + } + Ok(()) +} + +#[tokio::test] +async fn connection_pool_smoke() -> TestResult<()> { + let n = 32; + let filter = tracing_subscriber::EnvFilter::from_default_env(); + tracing_subscriber::fmt().with_env_filter(filter).init(); + let nodes = echo_servers(n).await?; + let ids = nodes + .iter() + .map(|(addr, _)| addr.node_id) + .collect::>(); + // set up static discovery for all addrs + let discovery = StaticProvider::from_node_info(nodes.iter().map(|(addr, _)| addr.clone())); + // build a client endpoint that can resolve all the node ids + let endpoint = iroh::Endpoint::builder() + .discovery(discovery.clone()) + .bind() + .await?; + let pool = ConnectionPool::new(endpoint.clone(), ECHO_ALPN, test_options()); + let client = EchoClient { pool }; + let mut connection_ids = BTreeMap::new(); + let msg = b"Hello, world!".to_vec(); + for id in &ids { + let (cid1, res) = client.echo(*id, msg.clone()).await???; + println!("First response from {}: {:?}", cid1, res); + assert_eq!(res, msg); + let (cid2, res) = client.echo(*id, msg.clone()).await???; + println!("Second response from {}: {:?}", cid2, res); + assert_eq!(res, msg); + assert_eq!(cid1, cid2); + connection_ids.insert(id, cid1); + } + tokio::time::sleep(Duration::from_millis(200)).await; + for id in &ids { + let cid1 = *connection_ids.get(id).expect("Connection ID not found"); + let (cid2, res) = client.echo(*id, msg.clone()).await???; + assert_eq!(res, msg); + assert_ne!(cid1, cid2); + } + Ok(()) +} From 66925dbbda67a192e2680110ae798f7e80a3b0b6 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Thu, 14 Aug 2025 13:51:19 +0200 Subject: [PATCH 16/26] better idle logic --- iroh-connection-pool/src/connection_pool.rs | 77 ++++++++++++++++----- 1 file changed, 60 insertions(+), 17 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index f0f94bd..0e63719 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -8,7 +8,11 @@ //! //! It is important that you use the connection only in the future passed to //! connect, and don't clone it out of the future. -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::{HashMap, VecDeque}, + sync::Arc, + time::Duration, +}; use iroh::{ Endpoint, NodeId, @@ -17,7 +21,7 @@ use iroh::{ use n0_future::{MaybeFuture, boxed::BoxFuture}; use snafu::Snafu; use tokio::{ - sync::{mpsc, oneshot}, + sync::{mpsc, mpsc::error::SendError as TokioSendError, oneshot}, task::{JoinError, JoinSet}, }; use tokio_util::time::FutureExt; @@ -84,6 +88,7 @@ pub type PoolConnectResult = std::result::Result; enum ActorMessage { Handle { id: NodeId, handler: BoxedHandler }, + ConnectionIdle { id: NodeId }, ConnectionShutdown { id: NodeId }, } @@ -165,6 +170,10 @@ async fn run_connection_actor( if rx.is_closed() { break; } + if context.owner.idle(node_id).await.is_err() { + // If we can't notify the pool, we are shutting down + break; + } // set the idle timer idle_timer.as_mut().set_future(tokio::time::sleep(context.options.idle_timeout)); } @@ -197,6 +206,9 @@ struct Actor { rx: mpsc::Receiver, connections: HashMap>, context: Arc, + // idle set (most recent last) + // todo: use a better data structure if this becomes a performance issue + idle: VecDeque, } impl Actor { @@ -210,6 +222,7 @@ impl Actor { Self { rx, connections: HashMap::new(), + idle: VecDeque::new(), context: Arc::new(Context { options, alpn: alpn.to_vec(), @@ -221,29 +234,47 @@ impl Actor { ) } + fn add_idle(&mut self, id: NodeId) { + self.remove_idle(id); + self.idle.push_back(id); + } + + fn remove_idle(&mut self, id: NodeId) { + self.idle.retain(|&x| x != id); + } + + fn remove_connection(&mut self, id: NodeId) { + self.connections.remove(&id); + self.remove_idle(id); + } + pub async fn run(mut self) { while let Some(msg) = self.rx.recv().await { match msg { ActorMessage::Handle { id, mut handler } => { + self.remove_idle(id); // Try to send to existing connection actor if let Some(conn_tx) = self.connections.get(&id) { - if let Err(tokio::sync::mpsc::error::SendError(e)) = - conn_tx.send(handler).await - { + if let Err(TokioSendError(e)) = conn_tx.send(handler).await { handler = e; } else { continue; } // Connection actor died, remove it - self.connections.remove(&id); + self.remove_connection(id); } - // No connection actor or it died - spawn a new one + // No connection actor or it died - check limits if self.connections.len() >= self.context.options.max_connections { - handler(Err(PoolConnectError::TooManyConnections)) - .await - .ok(); - continue; + if let Some(idle) = self.idle.pop_front() { + // remove the oldest idle connection to make room for one more + self.connections.remove(&idle); + } else { + handler(Err(PoolConnectError::TooManyConnections)) + .await + .ok(); + continue; + } } let (conn_tx, conn_rx) = mpsc::channel(100); self.connections.insert(id, conn_tx.clone()); @@ -254,17 +285,18 @@ impl Actor { // Send the handler to the new actor if conn_tx.send(handler).await.is_err() { - tracing::error!( - "Failed to send handler to new connection actor for {}", - id - ); + error!(%id, "Failed to send handler to new connection actor"); self.connections.remove(&id); } } + ActorMessage::ConnectionIdle { id } => { + self.add_idle(id); + trace!(%id, "connection idle"); + } ActorMessage::ConnectionShutdown { id } => { // Remove the connection from our map - this closes the channel - self.connections.remove(&id); - tracing::debug!("Approved shutdown for connection {}", id); + self.remove_connection(id); + trace!(%id, "removed connection"); } } } @@ -380,4 +412,15 @@ impl ConnectionPool { .map_err(|_| ConnectionPoolError::Shutdown)?; Ok(()) } + + /// Notify the connection pool that a connection is idle. + /// + /// Should only be called from connection handlers. + pub(crate) async fn idle(&self, id: NodeId) -> std::result::Result<(), ConnectionPoolError> { + self.tx + .send(ActorMessage::ConnectionIdle { id }) + .await + .map_err(|_| ConnectionPoolError::Shutdown)?; + Ok(()) + } } From 2ba54259497187ef8d7e6a4488882045e1366ee1 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Thu, 14 Aug 2025 14:01:29 +0200 Subject: [PATCH 17/26] Add test for reclaiming idle conns. --- iroh-connection-pool/src/connection_pool.rs | 7 +++- iroh-connection-pool/src/tests.rs | 39 +++++++++++++++++++-- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 0e63719..77f5470 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -243,6 +243,10 @@ impl Actor { self.idle.retain(|&x| x != id); } + fn pop_oldest_idle(&mut self) -> Option { + self.idle.pop_front() + } + fn remove_connection(&mut self, id: NodeId) { self.connections.remove(&id); self.remove_idle(id); @@ -266,8 +270,9 @@ impl Actor { // No connection actor or it died - check limits if self.connections.len() >= self.context.options.max_connections { - if let Some(idle) = self.idle.pop_front() { + if let Some(idle) = self.pop_oldest_idle() { // remove the oldest idle connection to make room for one more + trace!("removing oldest idle connection {}", idle); self.connections.remove(&idle); } else { handler(Err(PoolConnectError::TooManyConnections)) diff --git a/iroh-connection-pool/src/tests.rs b/iroh-connection-pool/src/tests.rs index fd77547..11e95c9 100644 --- a/iroh-connection-pool/src/tests.rs +++ b/iroh-connection-pool/src/tests.rs @@ -166,10 +166,8 @@ async fn connection_pool_smoke() -> TestResult<()> { let msg = b"Hello, world!".to_vec(); for id in &ids { let (cid1, res) = client.echo(*id, msg.clone()).await???; - println!("First response from {}: {:?}", cid1, res); assert_eq!(res, msg); let (cid2, res) = client.echo(*id, msg.clone()).await???; - println!("Second response from {}: {:?}", cid2, res); assert_eq!(res, msg); assert_eq!(cid1, cid2); connection_ids.insert(id, cid1); @@ -183,3 +181,40 @@ async fn connection_pool_smoke() -> TestResult<()> { } Ok(()) } + +/// Tests that idle connections are being reclaimed to make room if we hit the +/// maximum connection limit. +#[tokio::test] +async fn connection_pool_idle() -> TestResult<()> { + let n = 32; + let filter = tracing_subscriber::EnvFilter::from_default_env(); + tracing_subscriber::fmt().with_env_filter(filter).init(); + let nodes = echo_servers(n).await?; + let ids = nodes + .iter() + .map(|(addr, _)| addr.node_id) + .collect::>(); + // set up static discovery for all addrs + let discovery = StaticProvider::from_node_info(nodes.iter().map(|(addr, _)| addr.clone())); + // build a client endpoint that can resolve all the node ids + let endpoint = iroh::Endpoint::builder() + .discovery(discovery.clone()) + .bind() + .await?; + let pool = ConnectionPool::new( + endpoint.clone(), + ECHO_ALPN, + Options { + idle_timeout: Duration::from_secs(100), + max_connections: 8, + ..test_options() + }, + ); + let client = EchoClient { pool }; + let msg = b"Hello, world!".to_vec(); + for id in &ids { + let (_, res) = client.echo(*id, msg.clone()).await???; + assert_eq!(res, msg); + } + Ok(()) +} From 035284fb7662d7106a551d394daeea6377d1b823 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Tue, 19 Aug 2025 10:18:37 +0200 Subject: [PATCH 18/26] Introduce ConnectionRef instead of sending a fn. --- iroh-connection-pool/src/connection_pool.rs | 209 ++++++++------------ iroh-connection-pool/src/tests.rs | 31 +-- 2 files changed, 104 insertions(+), 136 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 77f5470..68d6795 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -9,20 +9,18 @@ //! It is important that you use the connection only in the future passed to //! connect, and don't clone it out of the future. use std::{ - collections::{HashMap, VecDeque}, - sync::Arc, - time::Duration, + collections::{HashMap, VecDeque}, ops::Deref, sync::Arc, time::Duration }; use iroh::{ Endpoint, NodeId, endpoint::{ConnectError, Connection}, }; -use n0_future::{MaybeFuture, boxed::BoxFuture}; +use n0_future::MaybeFuture; use snafu::Snafu; use tokio::{ - sync::{mpsc, mpsc::error::SendError as TokioSendError, oneshot}, - task::{JoinError, JoinSet}, + sync::{mpsc::{self, error::SendError as TokioSendError}, oneshot, OwnedSemaphorePermit}, + task::JoinError, }; use tokio_util::time::FutureExt; use tracing::{debug, error, trace}; @@ -45,6 +43,30 @@ impl Default for Options { } } +/// A reference to a connection that is owned by a connection pool. +#[derive(Debug)] +pub struct ConnectionRef { + connection: iroh::endpoint::Connection, + _permit: OwnedSemaphorePermit, +} + +impl Deref for ConnectionRef { + type Target = iroh::endpoint::Connection; + + fn deref(&self) -> &Self::Target { + &self.connection + } +} + +impl ConnectionRef { + fn new(connection: iroh::endpoint::Connection, permit: OwnedSemaphorePermit) -> Self { + Self { + connection, + _permit: permit, + } + } +} + struct Context { options: Options, endpoint: Endpoint, @@ -52,8 +74,6 @@ struct Context { alpn: Vec, } -type BoxedHandler = Box BoxFuture + Send + 'static>; - /// Error when a connection can not be acquired /// /// This includes the normal iroh connection errors as well as pool specific @@ -87,19 +107,24 @@ impl std::fmt::Display for PoolConnectError { pub type PoolConnectResult = std::result::Result; enum ActorMessage { - Handle { id: NodeId, handler: BoxedHandler }, + RequestRef(RequestRef), ConnectionIdle { id: NodeId }, ConnectionShutdown { id: NodeId }, } +struct RequestRef { + id: NodeId, + tx: oneshot::Sender>, +} + /// Run a connection actor for a single node async fn run_connection_actor( node_id: NodeId, - mut rx: mpsc::Receiver, + mut rx: mpsc::Receiver, context: Arc, ) { // Connect to the node - let mut state = match context + let state = match context .endpoint .connect(node_id, &context.alpn) .timeout(context.options.connect_timeout) @@ -115,10 +140,10 @@ async fn run_connection_actor( return; } } - - let mut tasks = JoinSet::new(); + let semaphore = Arc::new(tokio::sync::Semaphore::new(u32::MAX as usize)); let idle_timer = MaybeFuture::default(); - tokio::pin!(idle_timer); + let idle_fut = MaybeFuture::default(); + tokio::pin!(idle_timer, idle_fut); loop { tokio::select! { @@ -127,11 +152,26 @@ async fn run_connection_actor( // Handle new work handler = rx.recv() => { match handler { - Some(handler) => { - trace!(%node_id, "Received new task"); - // clear the idle timer - idle_timer.as_mut().set_none(); - tasks.spawn(handler(state.clone())); + Some(RequestRef { id, tx }) => { + assert!(id == node_id, "Not for me!"); + trace!(%node_id, "Received new request"); + match &state { + Ok(state) => { + // first acquire a permit for the op, then aquire all permits for idle + let permit = semaphore.clone().acquire_owned().await.expect("semaphore closed"); + let res = ConnectionRef::new(state.clone(), permit); + if idle_fut.is_none() { + idle_fut.as_mut().set_future(semaphore.clone().acquire_many_owned(u32::MAX)); + } + + // clear the idle timer + idle_timer.as_mut().set_none(); + tx.send(Ok(res)).ok(); + } + Err(cause) => { + tx.send(Err(cause.clone())).ok(); + } + } } None => { // Channel closed - finish remaining tasks and exit @@ -140,43 +180,15 @@ async fn run_connection_actor( } } - // Handle completed tasks - Some(task_result) = tasks.join_next(), if !tasks.is_empty() => { - match task_result { - Ok(Ok(())) => { - debug!(%node_id, "Task completed"); - } - Ok(Err(e)) => { - error!(%node_id, "Task failed: {}", e); - if let Ok(conn) = state { - conn.close(1u32.into(), b"error"); - } - state = Err(PoolConnectError::ExecuteError(Arc::new(e))); - let _ = context.owner.close(node_id).await; - } - Err(e) => { - error!(%node_id, "Task panicked: {}", e); - if let Ok(conn) = state { - conn.close(1u32.into(), b"panic"); - } - state = Err(PoolConnectError::JoinError(Arc::new(e))); - let _ = context.owner.close(node_id).await; - } - } - - // We are idle - if tasks.is_empty() { - // If the channel is closed, we can exit - if rx.is_closed() { - break; - } - if context.owner.idle(node_id).await.is_err() { - // If we can't notify the pool, we are shutting down - break; - } - // set the idle timer - idle_timer.as_mut().set_future(tokio::time::sleep(context.options.idle_timeout)); + _ = &mut idle_fut => { + // notify the pool that we are idle. + trace!(%node_id, "Idle"); + if context.owner.idle(node_id).await.is_err() { + // If we can't notify the pool, we are shutting down + break; } + // set the idle timer + idle_timer.as_mut().set_future(tokio::time::sleep(context.options.idle_timeout)); } // Idle timeout - request shutdown @@ -188,15 +200,13 @@ async fn run_connection_actor( } } - // Wait for remaining tasks to complete - while let Some(task_result) = tasks.join_next().await { - if let Err(e) = task_result { - error!(%node_id, "Task failed during shutdown: {}", e); - } - } - - if let Ok(connection) = &state { - connection.close(0u32.into(), b"idle"); + if let Ok(connection) = state { + let reason = if semaphore.available_permits() == u32::MAX as usize { + "idle" + } else { + "drop" + }; + connection.close(0u32.into(), reason.as_bytes()); } debug!(%node_id, "Connection actor shutting down"); @@ -204,7 +214,7 @@ async fn run_connection_actor( struct Actor { rx: mpsc::Receiver, - connections: HashMap>, + connections: HashMap>, context: Arc, // idle set (most recent last) // todo: use a better data structure if this becomes a performance issue @@ -255,12 +265,13 @@ impl Actor { pub async fn run(mut self) { while let Some(msg) = self.rx.recv().await { match msg { - ActorMessage::Handle { id, mut handler } => { + ActorMessage::RequestRef(mut msg) => { + let id = msg.id; self.remove_idle(id); // Try to send to existing connection actor if let Some(conn_tx) = self.connections.get(&id) { - if let Err(TokioSendError(e)) = conn_tx.send(handler).await { - handler = e; + if let Err(TokioSendError(e)) = conn_tx.send(msg).await { + msg = e; } else { continue; } @@ -275,8 +286,7 @@ impl Actor { trace!("removing oldest idle connection {}", idle); self.connections.remove(&idle); } else { - handler(Err(PoolConnectError::TooManyConnections)) - .await + msg.tx.send(Err(PoolConnectError::TooManyConnections)) .ok(); continue; } @@ -289,7 +299,7 @@ impl Actor { tokio::spawn(run_connection_actor(id, conn_rx, context)); // Send the handler to the new actor - if conn_tx.send(handler).await.is_err() { + if conn_tx.send(msg).await.is_err() { error!(%id, "Failed to send handler to new connection actor"); self.connections.remove(&id); } @@ -324,8 +334,6 @@ pub enum ConnectionPoolError { #[derive(Debug, Snafu)] pub struct ExecuteError; -type ExecuteResult = std::result::Result<(), ExecuteError>; - impl From for ExecuteError { fn from(_: PoolConnectError) -> Self { ExecuteError @@ -348,62 +356,17 @@ impl ConnectionPool { Self { tx } } - /// Connect to a node and execute the given handler function - /// - /// The connection will either be a new connection or an existing one if it is already established. - /// If connection establishment succeeds, the handler will be called with a [`Ok`]. - /// If connection establishment fails, the handler will get passed a [`Err`] containing the error. - /// - /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. - pub async fn connect( + pub async fn connect( &self, id: NodeId, - f: F, - ) -> std::result::Result<(), ConnectionPoolError> - where - F: FnOnce(PoolConnectResult) -> Fut + Send + 'static, - Fut: Future + Send + 'static, + ) -> std::result::Result, ConnectionPoolError> { - let handler = - Box::new(move |conn: PoolConnectResult| Box::pin(f(conn)) as BoxFuture); - + let (tx, rx) = oneshot::channel(); self.tx - .send(ActorMessage::Handle { id, handler }) + .send(ActorMessage::RequestRef(RequestRef { id, tx })) .await .map_err(|_| ConnectionPoolError::Shutdown)?; - - Ok(()) - } - - pub async fn with_connection( - &self, - id: NodeId, - f: F, - ) -> Result, PoolConnectError>, ConnectionPoolError> - where - F: FnOnce(Connection) -> Fut + Send + 'static, - Fut: Future> + Send + 'static, - I: Send + 'static, - E: Send + 'static, - { - let (tx, rx) = oneshot::channel(); - self.connect(id, |conn| async move { - let (res, ret) = match conn { - Ok(connection) => { - let res = f(connection).await; - let ret = match &res { - Ok(_) => Ok(()), - Err(_) => Err(ExecuteError), - }; - (Ok(res), ret) - } - Err(e) => (Err(e), Err(ExecuteError)), - }; - tx.send(res).ok(); - ret - }) - .await?; - rx.await.map_err(|_| ConnectionPoolError::Shutdown) + Ok(rx.await.map_err(|_| ConnectionPoolError::Shutdown)?) } /// Close an existing connection, if it exists diff --git a/iroh-connection-pool/src/tests.rs b/iroh-connection-pool/src/tests.rs index 11e95c9..55cafd8 100644 --- a/iroh-connection-pool/src/tests.rs +++ b/iroh-connection-pool/src/tests.rs @@ -40,7 +40,7 @@ impl ProtocolHandler for Echo { } } -async fn echo_client(conn: Connection, text: &[u8]) -> n0_snafu::Result> { +async fn echo_client(conn: &Connection, text: &[u8]) -> n0_snafu::Result> { let conn_id = conn.stable_id(); let id = conn.remote_node_id().e()?; trace!(%id, %conn_id, "Sending echo request"); @@ -96,18 +96,23 @@ impl EchoClient { Result), n0_snafu::Error>, PoolConnectError>, ConnectionPoolError, > { - self.pool - .with_connection(id, |conn| async move { - let id = conn.stable_id(); - let res = echo_client(conn, &text).await?; - Ok((id, res)) - }) - .await + let conn = self.pool.connect(id).await?; + let conn = match conn { + Ok(conn) => conn, + Err(e) => return Ok(Err(e)), + }; + let id = conn.stable_id(); + match echo_client(&conn, &text).await { + Ok(res) => Ok(Ok(Ok((id, res)))), + Err(e) => Ok(Ok(Err(e))), + } } } #[tokio::test] async fn connection_pool_errors() -> TestResult<()> { + let filter = tracing_subscriber::EnvFilter::from_default_env(); + tracing_subscriber::fmt().with_env_filter(filter).try_init().ok(); // set up static discovery for all addrs let discovery = StaticProvider::new(); let endpoint = iroh::Endpoint::builder() @@ -145,9 +150,9 @@ async fn connection_pool_errors() -> TestResult<()> { #[tokio::test] async fn connection_pool_smoke() -> TestResult<()> { - let n = 32; let filter = tracing_subscriber::EnvFilter::from_default_env(); - tracing_subscriber::fmt().with_env_filter(filter).init(); + tracing_subscriber::fmt().with_env_filter(filter).try_init().ok(); + let n = 32; let nodes = echo_servers(n).await?; let ids = nodes .iter() @@ -172,7 +177,7 @@ async fn connection_pool_smoke() -> TestResult<()> { assert_eq!(cid1, cid2); connection_ids.insert(id, cid1); } - tokio::time::sleep(Duration::from_millis(200)).await; + tokio::time::sleep(Duration::from_millis(1000)).await; for id in &ids { let cid1 = *connection_ids.get(id).expect("Connection ID not found"); let (cid2, res) = client.echo(*id, msg.clone()).await???; @@ -186,9 +191,9 @@ async fn connection_pool_smoke() -> TestResult<()> { /// maximum connection limit. #[tokio::test] async fn connection_pool_idle() -> TestResult<()> { - let n = 32; let filter = tracing_subscriber::EnvFilter::from_default_env(); - tracing_subscriber::fmt().with_env_filter(filter).init(); + tracing_subscriber::fmt().with_env_filter(filter).try_init().ok(); + let n = 32; let nodes = echo_servers(n).await?; let ids = nodes .iter() From b6b7dee154623bb03b30dcb4647a4acddad4fae2 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Tue, 19 Aug 2025 10:29:50 +0200 Subject: [PATCH 19/26] flatten error --- iroh-connection-pool/src/connection_pool.rs | 40 ++++++++----------- iroh-connection-pool/src/tests.rs | 44 ++++++++++----------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 68d6795..9743c9e 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -9,7 +9,10 @@ //! It is important that you use the connection only in the future passed to //! connect, and don't clone it out of the future. use std::{ - collections::{HashMap, VecDeque}, ops::Deref, sync::Arc, time::Duration + collections::{HashMap, VecDeque}, + ops::Deref, + sync::Arc, + time::Duration, }; use iroh::{ @@ -19,7 +22,11 @@ use iroh::{ use n0_future::MaybeFuture; use snafu::Snafu; use tokio::{ - sync::{mpsc::{self, error::SendError as TokioSendError}, oneshot, OwnedSemaphorePermit}, + sync::{ + OwnedSemaphorePermit, + mpsc::{self, error::SendError as TokioSendError}, + oneshot, + }, task::JoinError, }; use tokio_util::time::FutureExt; @@ -80,14 +87,14 @@ struct Context { /// errors such as timeouts and connection limits. #[derive(Debug, Clone)] pub enum PoolConnectError { + /// Connection pool is shut down + Shutdown, /// Timeout during connect Timeout, /// Too many connections TooManyConnections, /// Error during connect ConnectError(Arc), - /// Error during last execute - ExecuteError(Arc), /// Handler actor panicked JoinError(Arc), } @@ -95,10 +102,10 @@ pub enum PoolConnectError { impl std::fmt::Display for PoolConnectError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + PoolConnectError::Shutdown => write!(f, "Connection pool is shut down"), PoolConnectError::Timeout => write!(f, "Connection timed out"), PoolConnectError::TooManyConnections => write!(f, "Too many connections"), PoolConnectError::ConnectError(e) => write!(f, "Connection error: {}", e), - PoolConnectError::ExecuteError(e) => write!(f, "Execution error: {}", e), PoolConnectError::JoinError(e) => write!(f, "Join error: {}", e), } } @@ -286,8 +293,7 @@ impl Actor { trace!("removing oldest idle connection {}", idle); self.connections.remove(&idle); } else { - msg.tx.send(Err(PoolConnectError::TooManyConnections)) - .ok(); + msg.tx.send(Err(PoolConnectError::TooManyConnections)).ok(); continue; } } @@ -327,19 +333,6 @@ pub enum ConnectionPoolError { Shutdown, } -/// An error during the usage of the connection. -/// -/// The connection pool will recreate the connection if a handler returns this -/// error. If you don't want this, swallow the error in the handler. -#[derive(Debug, Snafu)] -pub struct ExecuteError; - -impl From for ExecuteError { - fn from(_: PoolConnectError) -> Self { - ExecuteError - } -} - /// A connection pool #[derive(Debug, Clone)] pub struct ConnectionPool { @@ -359,14 +352,13 @@ impl ConnectionPool { pub async fn connect( &self, id: NodeId, - ) -> std::result::Result, ConnectionPoolError> - { + ) -> std::result::Result { let (tx, rx) = oneshot::channel(); self.tx .send(ActorMessage::RequestRef(RequestRef { id, tx })) .await - .map_err(|_| ConnectionPoolError::Shutdown)?; - Ok(rx.await.map_err(|_| ConnectionPoolError::Shutdown)?) + .map_err(|_| PoolConnectError::Shutdown)?; + Ok(rx.await.map_err(|_| PoolConnectError::Shutdown)??) } /// Close an existing connection, if it exists diff --git a/iroh-connection-pool/src/tests.rs b/iroh-connection-pool/src/tests.rs index 55cafd8..c3fccfc 100644 --- a/iroh-connection-pool/src/tests.rs +++ b/iroh-connection-pool/src/tests.rs @@ -11,7 +11,7 @@ use n0_snafu::ResultExt; use testresult::TestResult; use tracing::trace; -use crate::connection_pool::{ConnectionPool, ConnectionPoolError, Options, PoolConnectError}; +use crate::connection_pool::{ConnectionPool, Options, PoolConnectError}; const ECHO_ALPN: &[u8] = b"echo"; @@ -92,19 +92,12 @@ impl EchoClient { &self, id: NodeId, text: Vec, - ) -> Result< - Result), n0_snafu::Error>, PoolConnectError>, - ConnectionPoolError, - > { + ) -> Result), n0_snafu::Error>, PoolConnectError> { let conn = self.pool.connect(id).await?; - let conn = match conn { - Ok(conn) => conn, - Err(e) => return Ok(Err(e)), - }; let id = conn.stable_id(); match echo_client(&conn, &text).await { - Ok(res) => Ok(Ok(Ok((id, res)))), - Err(e) => Ok(Ok(Err(e))), + Ok(res) => Ok(Ok((id, res))), + Err(e) => Ok(Err(e)), } } } @@ -112,7 +105,10 @@ impl EchoClient { #[tokio::test] async fn connection_pool_errors() -> TestResult<()> { let filter = tracing_subscriber::EnvFilter::from_default_env(); - tracing_subscriber::fmt().with_env_filter(filter).try_init().ok(); + tracing_subscriber::fmt() + .with_env_filter(filter) + .try_init() + .ok(); // set up static discovery for all addrs let discovery = StaticProvider::new(); let endpoint = iroh::Endpoint::builder() @@ -123,7 +119,7 @@ async fn connection_pool_errors() -> TestResult<()> { let client = EchoClient { pool }; { let non_existing = SecretKey::from_bytes(&[0; 32]).public(); - let res = client.echo(non_existing, b"Hello, world!".to_vec()).await?; + let res = client.echo(non_existing, b"Hello, world!".to_vec()).await; // trying to connect to a non-existing id will fail with ConnectError // because we don't have any information about the node assert!(matches!(res, Err(PoolConnectError::ConnectError(_)))); @@ -140,9 +136,7 @@ async fn connection_pool_errors() -> TestResult<()> { }); // trying to connect to an id for which we have info, but the other // end is not listening, will lead to a timeout. - let res = client - .echo(non_listening, b"Hello, world!".to_vec()) - .await?; + let res = client.echo(non_listening, b"Hello, world!".to_vec()).await; assert!(matches!(res, Err(PoolConnectError::Timeout))); } Ok(()) @@ -151,7 +145,10 @@ async fn connection_pool_errors() -> TestResult<()> { #[tokio::test] async fn connection_pool_smoke() -> TestResult<()> { let filter = tracing_subscriber::EnvFilter::from_default_env(); - tracing_subscriber::fmt().with_env_filter(filter).try_init().ok(); + tracing_subscriber::fmt() + .with_env_filter(filter) + .try_init() + .ok(); let n = 32; let nodes = echo_servers(n).await?; let ids = nodes @@ -170,9 +167,9 @@ async fn connection_pool_smoke() -> TestResult<()> { let mut connection_ids = BTreeMap::new(); let msg = b"Hello, world!".to_vec(); for id in &ids { - let (cid1, res) = client.echo(*id, msg.clone()).await???; + let (cid1, res) = client.echo(*id, msg.clone()).await??; assert_eq!(res, msg); - let (cid2, res) = client.echo(*id, msg.clone()).await???; + let (cid2, res) = client.echo(*id, msg.clone()).await??; assert_eq!(res, msg); assert_eq!(cid1, cid2); connection_ids.insert(id, cid1); @@ -180,7 +177,7 @@ async fn connection_pool_smoke() -> TestResult<()> { tokio::time::sleep(Duration::from_millis(1000)).await; for id in &ids { let cid1 = *connection_ids.get(id).expect("Connection ID not found"); - let (cid2, res) = client.echo(*id, msg.clone()).await???; + let (cid2, res) = client.echo(*id, msg.clone()).await??; assert_eq!(res, msg); assert_ne!(cid1, cid2); } @@ -192,7 +189,10 @@ async fn connection_pool_smoke() -> TestResult<()> { #[tokio::test] async fn connection_pool_idle() -> TestResult<()> { let filter = tracing_subscriber::EnvFilter::from_default_env(); - tracing_subscriber::fmt().with_env_filter(filter).try_init().ok(); + tracing_subscriber::fmt() + .with_env_filter(filter) + .try_init() + .ok(); let n = 32; let nodes = echo_servers(n).await?; let ids = nodes @@ -218,7 +218,7 @@ async fn connection_pool_idle() -> TestResult<()> { let client = EchoClient { pool }; let msg = b"Hello, world!".to_vec(); for id in &ids { - let (_, res) = client.echo(*id, msg.clone()).await???; + let (_, res) = client.echo(*id, msg.clone()).await??; assert_eq!(res, msg); } Ok(()) From 501f2eeb8a5a97d255a1df3f76aa5c5e7f006b97 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Tue, 19 Aug 2025 14:44:55 +0200 Subject: [PATCH 20/26] Remove the semaphore hack since it causes an issue sometimes --- iroh-connection-pool/src/connection_pool.rs | 112 +++++++++++++++----- 1 file changed, 87 insertions(+), 25 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 9743c9e..6969614 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -11,7 +11,10 @@ use std::{ collections::{HashMap, VecDeque}, ops::Deref, - sync::Arc, + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, time::Duration, }; @@ -19,18 +22,18 @@ use iroh::{ Endpoint, NodeId, endpoint::{ConnectError, Connection}, }; -use n0_future::MaybeFuture; +use n0_future::{MaybeFuture, Stream, StreamExt}; use snafu::Snafu; use tokio::{ sync::{ - OwnedSemaphorePermit, + Notify, mpsc::{self, error::SendError as TokioSendError}, oneshot, }, task::JoinError, }; -use tokio_util::time::FutureExt; -use tracing::{debug, error, trace}; +use tokio_util::time::FutureExt as TimeFutureExt; +use tracing::{debug, error, info, trace}; /// Configuration options for the connection pool #[derive(Debug, Clone, Copy)] @@ -54,7 +57,7 @@ impl Default for Options { #[derive(Debug)] pub struct ConnectionRef { connection: iroh::endpoint::Connection, - _permit: OwnedSemaphorePermit, + _permit: OneConnection, } impl Deref for ConnectionRef { @@ -66,10 +69,10 @@ impl Deref for ConnectionRef { } impl ConnectionRef { - fn new(connection: iroh::endpoint::Connection, permit: OwnedSemaphorePermit) -> Self { + fn new(connection: iroh::endpoint::Connection, counter: OneConnection) -> Self { Self { connection, - _permit: permit, + _permit: counter, } } } @@ -124,6 +127,67 @@ struct RequestRef { tx: oneshot::Sender>, } +#[derive(Debug)] +struct ConnectionCounterInner { + count: AtomicUsize, + notify: Notify, +} + +#[derive(Debug, Clone)] +struct ConnectionCounter { + inner: Arc, +} + +impl ConnectionCounter { + fn new() -> Self { + Self { + inner: Arc::new(ConnectionCounterInner { + count: Default::default(), + notify: Notify::new(), + }), + } + } + + fn get_one(&self) -> OneConnection { + self.inner.count.fetch_add(1, Ordering::SeqCst); + OneConnection { + inner: self.inner.clone(), + } + } + + fn is_idle(&self) -> bool { + self.inner.count.load(Ordering::SeqCst) == 0 + } + + /// Infinite stream that yields when the connection is briefly idle. + /// + /// Note that you still have to check if the connection is still idle when + /// you get the notification. + /// + /// Also note that this stream is triggered on [OneConnection::drop], so it + /// won't trigger initially even though a [ConnectionCounter] starts up as + /// idle. + fn idle_stream(self) -> impl Stream { + n0_future::stream::unfold(self, |c| async move { + c.inner.notify.notified().await; + Some(((), c)) + }) + } +} + +#[derive(Debug)] +struct OneConnection { + inner: Arc, +} + +impl Drop for OneConnection { + fn drop(&mut self) { + if self.inner.count.fetch_sub(1, Ordering::SeqCst) == 1 { + self.inner.notify.notify_waiters(); + } + } +} + /// Run a connection actor for a single node async fn run_connection_actor( node_id: NodeId, @@ -147,10 +211,11 @@ async fn run_connection_actor( return; } } - let semaphore = Arc::new(tokio::sync::Semaphore::new(u32::MAX as usize)); + let counter = ConnectionCounter::new(); let idle_timer = MaybeFuture::default(); - let idle_fut = MaybeFuture::default(); - tokio::pin!(idle_timer, idle_fut); + let idle_stream = counter.clone().idle_stream(); + + tokio::pin!(idle_timer, idle_stream); loop { tokio::select! { @@ -161,15 +226,9 @@ async fn run_connection_actor( match handler { Some(RequestRef { id, tx }) => { assert!(id == node_id, "Not for me!"); - trace!(%node_id, "Received new request"); match &state { Ok(state) => { - // first acquire a permit for the op, then aquire all permits for idle - let permit = semaphore.clone().acquire_owned().await.expect("semaphore closed"); - let res = ConnectionRef::new(state.clone(), permit); - if idle_fut.is_none() { - idle_fut.as_mut().set_future(semaphore.clone().acquire_many_owned(u32::MAX)); - } + let res = ConnectionRef::new(state.clone(), counter.get_one()); // clear the idle timer idle_timer.as_mut().set_none(); @@ -187,7 +246,10 @@ async fn run_connection_actor( } } - _ = &mut idle_fut => { + _ = idle_stream.next() => { + if !counter.is_idle() { + continue; + }; // notify the pool that we are idle. trace!(%node_id, "Idle"); if context.owner.idle(node_id).await.is_err() { @@ -200,7 +262,7 @@ async fn run_connection_actor( // Idle timeout - request shutdown _ = &mut idle_timer => { - debug!(%node_id, "Connection idle, requesting shutdown"); + trace!(%node_id, "Idle timer expired, requesting shutdown"); context.owner.close(node_id).await.ok(); // Don't break here - wait for main actor to close our channel } @@ -208,15 +270,15 @@ async fn run_connection_actor( } if let Ok(connection) = state { - let reason = if semaphore.available_permits() == u32::MAX as usize { - "idle" + let reason = if counter.inner.count.load(Ordering::SeqCst) > 0 { + b"drop" } else { - "drop" + b"idle" }; - connection.close(0u32.into(), reason.as_bytes()); + connection.close(0u32.into(), reason); } - debug!(%node_id, "Connection actor shutting down"); + trace!(%node_id, "Connection actor shutting down"); } struct Actor { From ac857391229220727c44995c4634d836625dc88f Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Tue, 19 Aug 2025 17:34:16 +0200 Subject: [PATCH 21/26] Change the ConnectionPool0Rtt to return a ConnectionRef --- iroh-connection-pool/src/connection_pool.rs | 108 +---- .../src/connection_pool_0rtt.rs | 403 ++++++++++-------- iroh-connection-pool/src/lib.rs | 98 +++++ 3 files changed, 335 insertions(+), 274 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 6969614..e2039c9 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -10,11 +10,7 @@ //! connect, and don't clone it out of the future. use std::{ collections::{HashMap, VecDeque}, - ops::Deref, - sync::{ - Arc, - atomic::{AtomicUsize, Ordering}, - }, + sync::Arc, time::Duration, }; @@ -22,18 +18,19 @@ use iroh::{ Endpoint, NodeId, endpoint::{ConnectError, Connection}, }; -use n0_future::{MaybeFuture, Stream, StreamExt}; +use n0_future::{MaybeFuture, StreamExt}; use snafu::Snafu; use tokio::{ sync::{ - Notify, mpsc::{self, error::SendError as TokioSendError}, oneshot, }, task::JoinError, }; use tokio_util::time::FutureExt as TimeFutureExt; -use tracing::{debug, error, info, trace}; +use tracing::{debug, error, trace}; + +use crate::{ConnectionCounter, ConnectionRef}; /// Configuration options for the connection pool #[derive(Debug, Clone, Copy)] @@ -53,30 +50,6 @@ impl Default for Options { } } -/// A reference to a connection that is owned by a connection pool. -#[derive(Debug)] -pub struct ConnectionRef { - connection: iroh::endpoint::Connection, - _permit: OneConnection, -} - -impl Deref for ConnectionRef { - type Target = iroh::endpoint::Connection; - - fn deref(&self) -> &Self::Target { - &self.connection - } -} - -impl ConnectionRef { - fn new(connection: iroh::endpoint::Connection, counter: OneConnection) -> Self { - Self { - connection, - _permit: counter, - } - } -} - struct Context { options: Options, endpoint: Endpoint, @@ -127,67 +100,6 @@ struct RequestRef { tx: oneshot::Sender>, } -#[derive(Debug)] -struct ConnectionCounterInner { - count: AtomicUsize, - notify: Notify, -} - -#[derive(Debug, Clone)] -struct ConnectionCounter { - inner: Arc, -} - -impl ConnectionCounter { - fn new() -> Self { - Self { - inner: Arc::new(ConnectionCounterInner { - count: Default::default(), - notify: Notify::new(), - }), - } - } - - fn get_one(&self) -> OneConnection { - self.inner.count.fetch_add(1, Ordering::SeqCst); - OneConnection { - inner: self.inner.clone(), - } - } - - fn is_idle(&self) -> bool { - self.inner.count.load(Ordering::SeqCst) == 0 - } - - /// Infinite stream that yields when the connection is briefly idle. - /// - /// Note that you still have to check if the connection is still idle when - /// you get the notification. - /// - /// Also note that this stream is triggered on [OneConnection::drop], so it - /// won't trigger initially even though a [ConnectionCounter] starts up as - /// idle. - fn idle_stream(self) -> impl Stream { - n0_future::stream::unfold(self, |c| async move { - c.inner.notify.notified().await; - Some(((), c)) - }) - } -} - -#[derive(Debug)] -struct OneConnection { - inner: Arc, -} - -impl Drop for OneConnection { - fn drop(&mut self) { - if self.inner.count.fetch_sub(1, Ordering::SeqCst) == 1 { - self.inner.notify.notify_waiters(); - } - } -} - /// Run a connection actor for a single node async fn run_connection_actor( node_id: NodeId, @@ -270,11 +182,7 @@ async fn run_connection_actor( } if let Ok(connection) = state { - let reason = if counter.inner.count.load(Ordering::SeqCst) > 0 { - b"drop" - } else { - b"idle" - }; + let reason = if counter.is_idle() { b"idle" } else { b"drop" }; connection.close(0u32.into(), reason); } @@ -411,6 +319,10 @@ impl ConnectionPool { Self { tx } } + /// Returns either a fresh connection or a reference to an existing one. + /// + /// This is guaranteed to return after approximately [Options::connect_timeout] + /// with either an error or a connection. pub async fn connect( &self, id: NodeId, diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index 6c571df..7c14c37 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -6,27 +6,36 @@ //! Access to connections is via the [`ConnectionPool0Rtt::connect`] method, which //! gives you access to a connection if possible. //! -//! It is important that you use the connection only in the future passed to -//! connect, and don't clone it out of the future. -//! //! For what 0Rtt connections are and why you might want to use them, see this //! [blog post](https://www.iroh.computer/blog/0rtt-api). -use std::{collections::HashMap, sync::Arc, time::Duration}; +use std::{ + collections::{HashMap, VecDeque}, + pin::Pin, + sync::Arc, + task::Poll, + time::Duration, +}; use iroh::{ Endpoint, NodeId, endpoint::{ConnectOptions, ConnectWithOptsError, Connection, ConnectionError}, }; -use n0_future::{MaybeFuture, boxed::BoxFuture}; +use n0_future::{FutureExt, MaybeFuture, StreamExt}; use snafu::Snafu; use tokio::{ - sync::{broadcast, mpsc}, - task::{JoinError, JoinSet}, + sync::{ + broadcast, + mpsc::{self, error::SendError as TokioSendError}, + oneshot, + }, + task::JoinError, }; -use tokio_util::time::FutureExt; -use tracing::{error, trace}; +use tokio_util::time::FutureExt as TimeFutureExt; +use tracing::{debug, error, trace}; + +use crate::{ConnectionCounter, ConnectionRef}; -/// Configuration options for the 0rtt connection pool +/// Configuration options for the connection pool #[derive(Debug, Clone, Copy)] pub struct Options { pub idle_timeout: Duration, @@ -44,110 +53,104 @@ impl Default for Options { } } -/// Part of the main actor state that is needed by the connection actors struct Context { options: Options, - alpn: Vec, endpoint: Endpoint, owner: ConnectionPool0Rtt, + alpn: Vec, } -type BoxedHandler = - Box BoxFuture + Send + 'static>; - -pub type PoolConnectResult = - std::result::Result<(Connection, broadcast::Receiver), PoolConnectError>; - /// Error when a connection can not be acquired /// /// This includes the normal iroh connection errors as well as pool specific /// errors such as timeouts and connection limits. +#[derive(Debug, Clone)] pub enum PoolConnectError { + /// Connection pool is shut down + Shutdown, /// Timeout during connect Timeout, /// Too many connections TooManyConnections, - /// Error in the first stage of connect - ConnectError(ConnectWithOptsError), - /// Error in the second stage of connect - ConnectionError(ConnectionError), - /// Error during usage of the connection - ExecuteError(ExecuteError), - /// Connection actor panicked - JoinError(JoinError), + /// Error during connect + ConnectError(Arc), + /// Error during connect + ConnectionError(Arc), + /// Handler actor panicked + JoinError(Arc), } -enum ActorMessage { - Handle { id: NodeId, handler: BoxedHandler }, - ConnectionShutdown { id: NodeId }, +impl std::fmt::Display for PoolConnectError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PoolConnectError::Shutdown => write!(f, "Connection pool is shut down"), + PoolConnectError::Timeout => write!(f, "Connection timed out"), + PoolConnectError::TooManyConnections => write!(f, "Too many connections"), + PoolConnectError::ConnectError(e) => write!(f, "Connect error: {}", e), + PoolConnectError::ConnectionError(e) => write!(f, "Connection error: {}", e), + PoolConnectError::JoinError(e) => write!(f, "Join error: {}", e), + } + } } -fn accepted(value: bool) -> broadcast::Receiver { - let (tx, rx) = broadcast::channel(1); - let _ = tx.send(value); - rx +pub type PoolConnectResult = + std::result::Result<(Connection, Option>), PoolConnectError>; + +/// Future that completes when a connection is fully established +pub enum ConnectionState { + /// The connection is in the handshake phase, the future will resolve when the handshake is complete + Handshake(n0_future::boxed::BoxFuture), + /// TThe connection is already fully established + FullyEstablished, } -async fn connect( - endpoint: &Endpoint, - node_id: NodeId, - alpn: &[u8], -) -> (PoolConnectResult, MaybeFuture>) { - let connecting = match endpoint - .connect_with_opts(node_id, alpn, ConnectOptions::default()) - .await - { - Ok(connecting) => connecting, - Err(cause) => { - trace!("Failed to connect to node {}: {}", node_id, cause); - return ( - Err(PoolConnectError::ConnectError(cause)), - MaybeFuture::None, - ); - } - }; - let (conn, zero_rtt_accepted) = match connecting.into_0rtt() { - Ok((conn, accepted)) => { - trace!("Connected to node {} with 0-RTT", node_id); - (conn, accepted) - } - Err(connecting) => { - trace!("Failed to connect using 0-RTT to node {}", node_id); - let res = match connecting.await { - Err(cause) => Err(PoolConnectError::ConnectionError(cause)), - Ok(connection) => Ok((connection, accepted(true))), - }; - return (res, MaybeFuture::None); +impl Future for ConnectionState { + type Output = bool; + + fn poll(self: Pin<&mut Self>, cx: &mut std::task::Context<'_>) -> Poll { + match self.get_mut() { + ConnectionState::Handshake(rx) => rx.poll(cx), + ConnectionState::FullyEstablished => Poll::Ready(true), } - }; - let (tx, rx) = broadcast::channel(1); - let complete = Box::pin(async move { - tx.send(zero_rtt_accepted.await).ok(); - }); - (Ok((conn, rx)), MaybeFuture::Some(complete)) + } +} + +enum ActorMessage { + RequestRef(RequestRef), + ConnectionIdle { id: NodeId }, + ConnectionShutdown { id: NodeId }, +} + +struct RequestRef { + id: NodeId, + tx: oneshot::Sender>, } /// Run a connection actor for a single node async fn run_connection_actor( node_id: NodeId, - mut rx: mpsc::Receiver, + mut rx: mpsc::Receiver, context: Arc, ) { // Connect to the node - let (mut state, forwarder) = match connect(&context.endpoint, node_id, &context.alpn) + let (state, forwarder) = match connect(&context.endpoint, node_id, &context.alpn) .timeout(context.options.connect_timeout) .await { Ok((state, forwarder)) => (state, forwarder), Err(_) => (Err(PoolConnectError::Timeout), MaybeFuture::None), }; - if state.is_err() && context.owner.close(node_id).await.is_err() { - return; + if let Err(e) = &state { + debug!(%node_id, "Failed to connect {e:?}, requesting shutdown"); + if context.owner.close(node_id).await.is_err() { + return; + } } - let mut tasks = JoinSet::new(); + let counter = ConnectionCounter::new(); let idle_timer = MaybeFuture::default(); - tokio::pin!(idle_timer); - tokio::pin!(forwarder); + let idle_stream = counter.clone().idle_stream(); + + tokio::pin!(idle_timer, idle_stream, forwarder); loop { tokio::select! { @@ -156,9 +159,28 @@ async fn run_connection_actor( // Handle new work handler = rx.recv() => { match handler { - Some(handler) => { - idle_timer.as_mut().set_none(); - tasks.spawn(handler(&state)); + Some(RequestRef { id, tx }) => { + assert!(id == node_id, "Not for me!"); + match &state { + Ok(state) => { + let conn = state.0.clone(); + let handshake_complete = match &state.1 { + Some(tx) => ConnectionState::Handshake({ + let mut recv = tx.subscribe(); + Box::pin(async move { recv.recv().await.unwrap_or_default() }) + }), + None => ConnectionState::FullyEstablished, + }; + let res = ConnectionRef::new(conn, counter.get_one()); + + // clear the idle timer + idle_timer.as_mut().set_none(); + tx.send(Ok((res, handshake_complete))).ok(); + } + Err(cause) => { + tx.send(Err(cause.clone())).ok(); + } + } } None => { // Channel closed - finish remaining tasks and exit @@ -167,71 +189,46 @@ async fn run_connection_actor( } } - // Handle completed tasks - Some(task_result) = tasks.join_next(), if !tasks.is_empty() => { - match task_result { - Ok(Ok(())) => { - trace!("Task completed for node {}", node_id); - } - Ok(Err(e)) => { - trace!("Task failed for node {}: {}", node_id, e); - if let Ok((conn, _)) = state { - conn.close(1u32.into(), b"error"); - } - state = Err(PoolConnectError::ExecuteError(e)); - context.owner.close(node_id).await.ok(); - } - Err(e) => { - error!("Task panicked for node {}: {}", node_id, e); - if let Ok((conn, _)) = state { - conn.close(1u32.into(), b"panic"); - } - state = Err(PoolConnectError::JoinError(e)); - context.owner.close(node_id).await.ok(); - } - } - - // We are idle - if tasks.is_empty() { - // If the channel is closed, we can exit - if rx.is_closed() { - break; - } - // set the idle timer - idle_timer.as_mut().set_future(tokio::time::sleep(context.options.idle_timeout)); + _ = idle_stream.next() => { + if !counter.is_idle() { + continue; + }; + // notify the pool that we are idle. + trace!(%node_id, "Idle"); + if context.owner.idle(node_id).await.is_err() { + // If we can't notify the pool, we are shutting down + break; } + // set the idle timer + idle_timer.as_mut().set_future(tokio::time::sleep(context.options.idle_timeout)); } // Idle timeout - request shutdown _ = &mut idle_timer => { - trace!("Connection to {} idle, requesting shutdown", node_id); + trace!(%node_id, "Idle timer expired, requesting shutdown"); context.owner.close(node_id).await.ok(); // Don't break here - wait for main actor to close our channel } - // Poll the forwarder if we have one _ = &mut forwarder => {} } } - // Wait for remaining tasks to complete - while let Some(task_result) = tasks.join_next().await { - if let Err(e) = task_result { - error!("Task panicked during shutdown for node {}: {}", node_id, e); - } - } - - if let Ok((conn, _)) = state { - conn.close(0u32.into(), b"idle"); + if let Ok(connection) = state { + let reason = if counter.is_idle() { b"idle" } else { b"drop" }; + connection.0.close(0u32.into(), reason); } - trace!("Connection actor for {} shutting down", node_id); + trace!(%node_id, "Connection actor shutting down"); } struct Actor { rx: mpsc::Receiver, - connections: HashMap>, + connections: HashMap>, context: Arc, + // idle set (most recent last) + // todo: use a better data structure if this becomes a performance issue + idle: VecDeque, } impl Actor { @@ -245,6 +242,7 @@ impl Actor { Self { rx, connections: HashMap::new(), + idle: VecDeque::new(), context: Arc::new(Context { options, alpn: alpn.to_vec(), @@ -256,75 +254,90 @@ impl Actor { ) } + fn add_idle(&mut self, id: NodeId) { + self.remove_idle(id); + self.idle.push_back(id); + } + + fn remove_idle(&mut self, id: NodeId) { + self.idle.retain(|&x| x != id); + } + + fn pop_oldest_idle(&mut self) -> Option { + self.idle.pop_front() + } + + fn remove_connection(&mut self, id: NodeId) { + self.connections.remove(&id); + self.remove_idle(id); + } + pub async fn run(mut self) { while let Some(msg) = self.rx.recv().await { match msg { - ActorMessage::Handle { id, mut handler } => { + ActorMessage::RequestRef(mut msg) => { + let id = msg.id; + self.remove_idle(id); // Try to send to existing connection actor if let Some(conn_tx) = self.connections.get(&id) { - if let Err(tokio::sync::mpsc::error::SendError(e)) = - conn_tx.send(handler).await - { - handler = e; + if let Err(TokioSendError(e)) = conn_tx.send(msg).await { + msg = e; } else { continue; } // Connection actor died, remove it - self.connections.remove(&id); + self.remove_connection(id); } - // No connection actor or it died - spawn a new one + // No connection actor or it died - check limits if self.connections.len() >= self.context.options.max_connections { - handler(&Err(PoolConnectError::TooManyConnections)) - .await - .ok(); - continue; + if let Some(idle) = self.pop_oldest_idle() { + // remove the oldest idle connection to make room for one more + trace!("removing oldest idle connection {}", idle); + self.connections.remove(&idle); + } else { + msg.tx.send(Err(PoolConnectError::TooManyConnections)).ok(); + continue; + } } let (conn_tx, conn_rx) = mpsc::channel(100); self.connections.insert(id, conn_tx.clone()); let context = self.context.clone(); + tokio::spawn(run_connection_actor(id, conn_rx, context)); // Send the handler to the new actor - if conn_tx.send(handler).await.is_err() { - tracing::error!( - "Failed to send handler to new connection actor for {}", - id - ); + if conn_tx.send(msg).await.is_err() { + error!(%id, "Failed to send handler to new connection actor"); self.connections.remove(&id); } } + ActorMessage::ConnectionIdle { id } => { + self.add_idle(id); + trace!(%id, "connection idle"); + } ActorMessage::ConnectionShutdown { id } => { // Remove the connection from our map - this closes the channel - self.connections.remove(&id); - tracing::debug!("Approved shutdown for connection {}", id); + self.remove_connection(id); + trace!(%id, "removed connection"); } } } } } -/// Error when calling a fn on the [`ConnectionPool0Rtt`]. +/// Error when calling a fn on the [`ConnectionPool`]. /// /// The only thing that can go wrong is that the connection pool is shut down. #[derive(Debug, Snafu)] pub enum ConnectionPoolError { + /// The connection pool has been shut down Shutdown, } -/// An error during the usage of the connection. -/// -/// The connection pool will recreate the connection if a handler returns this -/// error. If you don't want this, swallow the error in the handler. -#[derive(Debug, Snafu)] -pub struct ExecuteError; - -type ExecuteResult = std::result::Result<(), ExecuteError>; - -/// A connection pool for 0-RTT connections +/// A connection pool #[derive(Debug, Clone)] -#[repr(transparent)] pub struct ConnectionPool0Rtt { tx: mpsc::Sender, } @@ -339,6 +352,22 @@ impl ConnectionPool0Rtt { Self { tx } } + /// Returns either a fresh connection or a reference to an existing one. + /// + /// This is guaranteed to return after approximately [Options::connect_timeout] + /// with either an error or a connection. + pub async fn connect( + &self, + id: NodeId, + ) -> std::result::Result<(ConnectionRef, ConnectionState), PoolConnectError> { + let (tx, rx) = oneshot::channel(); + self.tx + .send(ActorMessage::RequestRef(RequestRef { id, tx })) + .await + .map_err(|_| PoolConnectError::Shutdown)?; + Ok(rx.await.map_err(|_| PoolConnectError::Shutdown)??) + } + /// Close an existing connection, if it exists /// /// This will finish pending tasks and close the connection. New tasks will @@ -351,32 +380,54 @@ impl ConnectionPool0Rtt { Ok(()) } - /// Connect to a node and execute the given handler function + /// Notify the connection pool that a connection is idle. /// - /// The connection will either be a new connection or an existing one if it is already established. - /// If connection establishment succeeds, the handler will be called with a [`Ok`]. - /// If connection establishment fails, the handler will get passed a [`Err`] containing the error. - /// - /// The fn f is guaranteed to be called exactly once, unless the tokio runtime is shutting down. - /// If the fn returns an error, it is assumed that the connection is no longer valid. This will cause - /// the connection to be closed and a new one to be established for future calls. - pub async fn connect( - &self, - id: NodeId, - f: F, - ) -> std::result::Result<(), ConnectionPoolError> - where - F: FnOnce(&PoolConnectResult) -> Fut + Send + 'static, - Fut: std::future::Future + Send + 'static, - { - let handler = - Box::new(move |conn: &PoolConnectResult| Box::pin(f(conn)) as BoxFuture); - + /// Should only be called from connection handlers. + pub(crate) async fn idle(&self, id: NodeId) -> std::result::Result<(), ConnectionPoolError> { self.tx - .send(ActorMessage::Handle { id, handler }) + .send(ActorMessage::ConnectionIdle { id }) .await .map_err(|_| ConnectionPoolError::Shutdown)?; - Ok(()) } } + +async fn connect( + endpoint: &Endpoint, + node_id: NodeId, + alpn: &[u8], +) -> (PoolConnectResult, MaybeFuture>) { + let connecting = match endpoint + .connect_with_opts(node_id, alpn, ConnectOptions::default()) + .await + { + Ok(connecting) => connecting, + Err(cause) => { + trace!("Failed to connect to node {}: {}", node_id, cause); + return ( + Err(PoolConnectError::ConnectError(Arc::new(cause))), + MaybeFuture::None, + ); + } + }; + let (conn, zero_rtt_accepted) = match connecting.into_0rtt() { + Ok((conn, accepted)) => { + trace!("Connected to node {} with 0-RTT", node_id); + (conn, accepted) + } + Err(connecting) => { + trace!("Failed to connect using 0-RTT to node {}", node_id); + let res = match connecting.await { + Err(cause) => Err(PoolConnectError::ConnectionError(Arc::new(cause))), + Ok(connection) => Ok((connection, None)), + }; + return (res, MaybeFuture::None); + } + }; + let (tx, _) = broadcast::channel(1); + let tx2 = tx.clone(); + let complete = Box::pin(async move { + tx2.send(zero_rtt_accepted.await).ok(); + }); + (Ok((conn, Some(tx))), MaybeFuture::Some(complete)) +} diff --git a/iroh-connection-pool/src/lib.rs b/iroh-connection-pool/src/lib.rs index 0c5a5be..27f9f09 100644 --- a/iroh-connection-pool/src/lib.rs +++ b/iroh-connection-pool/src/lib.rs @@ -1,5 +1,103 @@ +use std::{ + ops::Deref, + sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, + }, +}; + +use n0_future::Stream; +use tokio::sync::Notify; + pub mod connection_pool; pub mod connection_pool_0rtt; #[cfg(test)] mod tests; + +/// A reference to a connection that is owned by a connection pool. +#[derive(Debug)] +pub struct ConnectionRef { + connection: iroh::endpoint::Connection, + _permit: OneConnection, +} + +impl Deref for ConnectionRef { + type Target = iroh::endpoint::Connection; + + fn deref(&self) -> &Self::Target { + &self.connection + } +} + +impl ConnectionRef { + fn new(connection: iroh::endpoint::Connection, counter: OneConnection) -> Self { + Self { + connection, + _permit: counter, + } + } +} + +#[derive(Debug)] +struct ConnectionCounterInner { + count: AtomicUsize, + notify: Notify, +} + +#[derive(Debug, Clone)] +struct ConnectionCounter { + inner: Arc, +} + +impl ConnectionCounter { + fn new() -> Self { + Self { + inner: Arc::new(ConnectionCounterInner { + count: Default::default(), + notify: Notify::new(), + }), + } + } + + /// Increase the connection count and return a guard for the new connection + fn get_one(&self) -> OneConnection { + self.inner.count.fetch_add(1, Ordering::SeqCst); + OneConnection { + inner: self.inner.clone(), + } + } + + fn is_idle(&self) -> bool { + self.inner.count.load(Ordering::SeqCst) == 0 + } + + /// Infinite stream that yields when the connection is briefly idle. + /// + /// Note that you still have to check if the connection is still idle when + /// you get the notification. + /// + /// Also note that this stream is triggered on [OneConnection::drop], so it + /// won't trigger initially even though a [ConnectionCounter] starts up as + /// idle. + fn idle_stream(self) -> impl Stream { + n0_future::stream::unfold(self, |c| async move { + c.inner.notify.notified().await; + Some(((), c)) + }) + } +} + +/// Guard for one connection +#[derive(Debug)] +struct OneConnection { + inner: Arc, +} + +impl Drop for OneConnection { + fn drop(&mut self) { + if self.inner.count.fetch_sub(1, Ordering::SeqCst) == 1 { + self.inner.notify.notify_waiters(); + } + } +} From 735c1999b111bc879ccb08350555c1b265ac0418 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Tue, 19 Aug 2025 17:49:19 +0200 Subject: [PATCH 22/26] Move ConnectionRef into the two pools --- iroh-connection-pool/src/connection_pool.rs | 27 ++++++++++++++- .../src/connection_pool_0rtt.rs | 27 ++++++++++++++- iroh-connection-pool/src/lib.rs | 33 ++----------------- 3 files changed, 55 insertions(+), 32 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index e2039c9..d60e251 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -10,6 +10,7 @@ //! connect, and don't clone it out of the future. use std::{ collections::{HashMap, VecDeque}, + ops::Deref, sync::Arc, time::Duration, }; @@ -30,7 +31,7 @@ use tokio::{ use tokio_util::time::FutureExt as TimeFutureExt; use tracing::{debug, error, trace}; -use crate::{ConnectionCounter, ConnectionRef}; +use crate::{ConnectionCounter, OneConnection}; /// Configuration options for the connection pool #[derive(Debug, Clone, Copy)] @@ -50,6 +51,30 @@ impl Default for Options { } } +/// A reference to a connection that is owned by a connection pool. +#[derive(Debug)] +pub struct ConnectionRef { + connection: iroh::endpoint::Connection, + _permit: OneConnection, +} + +impl Deref for ConnectionRef { + type Target = iroh::endpoint::Connection; + + fn deref(&self) -> &Self::Target { + &self.connection + } +} + +impl ConnectionRef { + fn new(connection: iroh::endpoint::Connection, counter: OneConnection) -> Self { + Self { + connection, + _permit: counter, + } + } +} + struct Context { options: Options, endpoint: Endpoint, diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index 7c14c37..84a8ef3 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -10,6 +10,7 @@ //! [blog post](https://www.iroh.computer/blog/0rtt-api). use std::{ collections::{HashMap, VecDeque}, + ops::Deref, pin::Pin, sync::Arc, task::Poll, @@ -33,7 +34,7 @@ use tokio::{ use tokio_util::time::FutureExt as TimeFutureExt; use tracing::{debug, error, trace}; -use crate::{ConnectionCounter, ConnectionRef}; +use crate::{ConnectionCounter, OneConnection}; /// Configuration options for the connection pool #[derive(Debug, Clone, Copy)] @@ -53,6 +54,30 @@ impl Default for Options { } } +/// A reference to a connection that is owned by a connection pool. +#[derive(Debug)] +pub struct ConnectionRef { + connection: iroh::endpoint::Connection, + _permit: OneConnection, +} + +impl Deref for ConnectionRef { + type Target = iroh::endpoint::Connection; + + fn deref(&self) -> &Self::Target { + &self.connection + } +} + +impl ConnectionRef { + fn new(connection: iroh::endpoint::Connection, counter: OneConnection) -> Self { + Self { + connection, + _permit: counter, + } + } +} + struct Context { options: Options, endpoint: Endpoint, diff --git a/iroh-connection-pool/src/lib.rs b/iroh-connection-pool/src/lib.rs index 27f9f09..664f073 100644 --- a/iroh-connection-pool/src/lib.rs +++ b/iroh-connection-pool/src/lib.rs @@ -1,9 +1,6 @@ -use std::{ - ops::Deref, - sync::{ - Arc, - atomic::{AtomicUsize, Ordering}, - }, +use std::sync::{ + Arc, + atomic::{AtomicUsize, Ordering}, }; use n0_future::Stream; @@ -15,30 +12,6 @@ pub mod connection_pool_0rtt; #[cfg(test)] mod tests; -/// A reference to a connection that is owned by a connection pool. -#[derive(Debug)] -pub struct ConnectionRef { - connection: iroh::endpoint::Connection, - _permit: OneConnection, -} - -impl Deref for ConnectionRef { - type Target = iroh::endpoint::Connection; - - fn deref(&self) -> &Self::Target { - &self.connection - } -} - -impl ConnectionRef { - fn new(connection: iroh::endpoint::Connection, counter: OneConnection) -> Self { - Self { - connection, - _permit: counter, - } - } -} - #[derive(Debug)] struct ConnectionCounterInner { count: AtomicUsize, From cbeb15a707734c24ba17ab79ac7d07fb2cccb1eb Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Tue, 19 Aug 2025 17:52:29 +0200 Subject: [PATCH 23/26] clippy --- iroh-connection-pool/src/connection_pool.rs | 6 +++--- iroh-connection-pool/src/connection_pool_0rtt.rs | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index d60e251..5a7af76 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -106,8 +106,8 @@ impl std::fmt::Display for PoolConnectError { PoolConnectError::Shutdown => write!(f, "Connection pool is shut down"), PoolConnectError::Timeout => write!(f, "Connection timed out"), PoolConnectError::TooManyConnections => write!(f, "Too many connections"), - PoolConnectError::ConnectError(e) => write!(f, "Connection error: {}", e), - PoolConnectError::JoinError(e) => write!(f, "Join error: {}", e), + PoolConnectError::ConnectError(e) => write!(f, "Connection error: {e}"), + PoolConnectError::JoinError(e) => write!(f, "Join error: {e}"), } } } @@ -357,7 +357,7 @@ impl ConnectionPool { .send(ActorMessage::RequestRef(RequestRef { id, tx })) .await .map_err(|_| PoolConnectError::Shutdown)?; - Ok(rx.await.map_err(|_| PoolConnectError::Shutdown)??) + rx.await.map_err(|_| PoolConnectError::Shutdown)? } /// Close an existing connection, if it exists diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index 84a8ef3..cb636b8 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -111,9 +111,9 @@ impl std::fmt::Display for PoolConnectError { PoolConnectError::Shutdown => write!(f, "Connection pool is shut down"), PoolConnectError::Timeout => write!(f, "Connection timed out"), PoolConnectError::TooManyConnections => write!(f, "Too many connections"), - PoolConnectError::ConnectError(e) => write!(f, "Connect error: {}", e), - PoolConnectError::ConnectionError(e) => write!(f, "Connection error: {}", e), - PoolConnectError::JoinError(e) => write!(f, "Join error: {}", e), + PoolConnectError::ConnectError(e) => write!(f, "Connect error: {e}"), + PoolConnectError::ConnectionError(e) => write!(f, "Connection error: {e}"), + PoolConnectError::JoinError(e) => write!(f, "Join error: {e}"), } } } @@ -390,7 +390,7 @@ impl ConnectionPool0Rtt { .send(ActorMessage::RequestRef(RequestRef { id, tx })) .await .map_err(|_| PoolConnectError::Shutdown)?; - Ok(rx.await.map_err(|_| PoolConnectError::Shutdown)??) + rx.await.map_err(|_| PoolConnectError::Shutdown)? } /// Close an existing connection, if it exists From 67b200ba520c058c43548c02b9fc0138afc945e2 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 20 Aug 2025 11:33:35 +0200 Subject: [PATCH 24/26] Use snafu for errors, and track pool tasks explicitly --- iroh-connection-pool/src/connection_pool.rs | 152 ++++++++++-------- .../src/connection_pool_0rtt.rs | 146 ++++++++++------- iroh-connection-pool/src/tests.rs | 2 +- 3 files changed, 173 insertions(+), 127 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index 5a7af76..fe4d827 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -26,7 +26,7 @@ use tokio::{ mpsc::{self, error::SendError as TokioSendError}, oneshot, }, - task::JoinError, + task::JoinSet, }; use tokio_util::time::FutureExt as TimeFutureExt; use tracing::{debug, error, trace}; @@ -86,7 +86,8 @@ struct Context { /// /// This includes the normal iroh connection errors as well as pool specific /// errors such as timeouts and connection limits. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Snafu)] +#[snafu(module)] pub enum PoolConnectError { /// Connection pool is shut down Shutdown, @@ -95,23 +96,27 @@ pub enum PoolConnectError { /// Too many connections TooManyConnections, /// Error during connect - ConnectError(Arc), - /// Handler actor panicked - JoinError(Arc), + ConnectError { source: Arc }, } -impl std::fmt::Display for PoolConnectError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - PoolConnectError::Shutdown => write!(f, "Connection pool is shut down"), - PoolConnectError::Timeout => write!(f, "Connection timed out"), - PoolConnectError::TooManyConnections => write!(f, "Too many connections"), - PoolConnectError::ConnectError(e) => write!(f, "Connection error: {e}"), - PoolConnectError::JoinError(e) => write!(f, "Join error: {e}"), +impl From for PoolConnectError { + fn from(e: ConnectError) -> Self { + PoolConnectError::ConnectError { + source: Arc::new(e), } } } +/// Error when calling a fn on the [`ConnectionPool`]. +/// +/// The only thing that can go wrong is that the connection pool is shut down. +#[derive(Debug, Snafu)] +#[snafu(module)] +pub enum ConnectionPoolError { + /// The connection pool has been shut down + Shutdown, +} + pub type PoolConnectResult = std::result::Result; enum ActorMessage { @@ -125,7 +130,7 @@ struct RequestRef { tx: oneshot::Sender>, } -/// Run a connection actor for a single node +/// Run a connection actor for a single remote node id async fn run_connection_actor( node_id: NodeId, mut rx: mpsc::Receiver, @@ -139,7 +144,7 @@ async fn run_connection_actor( .await { Ok(Ok(conn)) => Ok(conn), - Ok(Err(e)) => Err(PoolConnectError::ConnectError(Arc::new(e))), + Ok(Err(e)) => Err(PoolConnectError::from(e)), Err(_) => Err(PoolConnectError::Timeout), }; if let Err(e) = &state { @@ -221,6 +226,8 @@ struct Actor { // idle set (most recent last) // todo: use a better data structure if this becomes a performance issue idle: VecDeque, + // per connection tasks + tasks: JoinSet<()>, } impl Actor { @@ -241,6 +248,7 @@ impl Actor { endpoint, owner: ConnectionPool { tx: tx.clone() }, }), + tasks: JoinSet::new(), }, tx, ) @@ -264,70 +272,84 @@ impl Actor { self.remove_idle(id); } - pub async fn run(mut self) { - while let Some(msg) = self.rx.recv().await { - match msg { - ActorMessage::RequestRef(mut msg) => { - let id = msg.id; - self.remove_idle(id); - // Try to send to existing connection actor - if let Some(conn_tx) = self.connections.get(&id) { - if let Err(TokioSendError(e)) = conn_tx.send(msg).await { - msg = e; - } else { - continue; - } - // Connection actor died, remove it - self.remove_connection(id); + async fn handle_msg(&mut self, msg: ActorMessage) { + match msg { + ActorMessage::RequestRef(mut msg) => { + let id = msg.id; + self.remove_idle(id); + // Try to send to existing connection actor + if let Some(conn_tx) = self.connections.get(&id) { + if let Err(TokioSendError(e)) = conn_tx.send(msg).await { + msg = e; + } else { + return; } + // Connection actor died, remove it + self.remove_connection(id); + } - // No connection actor or it died - check limits - if self.connections.len() >= self.context.options.max_connections { - if let Some(idle) = self.pop_oldest_idle() { - // remove the oldest idle connection to make room for one more - trace!("removing oldest idle connection {}", idle); - self.connections.remove(&idle); - } else { - msg.tx.send(Err(PoolConnectError::TooManyConnections)).ok(); - continue; - } + // No connection actor or it died - check limits + if self.connections.len() >= self.context.options.max_connections { + if let Some(idle) = self.pop_oldest_idle() { + // remove the oldest idle connection to make room for one more + trace!("removing oldest idle connection {}", idle); + self.connections.remove(&idle); + } else { + msg.tx.send(Err(PoolConnectError::TooManyConnections)).ok(); + return; } - let (conn_tx, conn_rx) = mpsc::channel(100); - self.connections.insert(id, conn_tx.clone()); + } + let (conn_tx, conn_rx) = mpsc::channel(100); + self.connections.insert(id, conn_tx.clone()); - let context = self.context.clone(); + let context = self.context.clone(); - tokio::spawn(run_connection_actor(id, conn_rx, context)); + self.tasks.spawn(run_connection_actor(id, conn_rx, context)); - // Send the handler to the new actor - if conn_tx.send(msg).await.is_err() { - error!(%id, "Failed to send handler to new connection actor"); - self.connections.remove(&id); - } + // Send the handler to the new actor + if conn_tx.send(msg).await.is_err() { + error!(%id, "Failed to send handler to new connection actor"); + self.connections.remove(&id); } - ActorMessage::ConnectionIdle { id } => { - self.add_idle(id); - trace!(%id, "connection idle"); + } + ActorMessage::ConnectionIdle { id } => { + self.add_idle(id); + trace!(%id, "connection idle"); + } + ActorMessage::ConnectionShutdown { id } => { + // Remove the connection from our map - this closes the channel + self.remove_connection(id); + trace!(%id, "removed connection"); + } + } + } + + pub async fn run(mut self) { + loop { + tokio::select! { + biased; + + msg = self.rx.recv() => { + if let Some(msg) = msg { + self.handle_msg(msg).await; + } else { + break; + } } - ActorMessage::ConnectionShutdown { id } => { - // Remove the connection from our map - this closes the channel - self.remove_connection(id); - trace!(%id, "removed connection"); + + res = self.tasks.join_next(), if !self.tasks.is_empty() => { + if let Some(Err(e)) = res { + // panic during either connection establishment or + // timeout. Message handling is outside the actor's + // control, so we should hopefully never get this. + error!("Connection actor failed: {e}"); + } } } } } } -/// Error when calling a fn on the [`ConnectionPool`]. -/// -/// The only thing that can go wrong is that the connection pool is shut down. -#[derive(Debug, Snafu)] -pub enum ConnectionPoolError { - /// The connection pool has been shut down - Shutdown, -} - /// A connection pool #[derive(Debug, Clone)] pub struct ConnectionPool { diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index cb636b8..7f27820 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -29,7 +29,7 @@ use tokio::{ mpsc::{self, error::SendError as TokioSendError}, oneshot, }, - task::JoinError, + task::JoinSet, }; use tokio_util::time::FutureExt as TimeFutureExt; use tracing::{debug, error, trace}; @@ -89,7 +89,8 @@ struct Context { /// /// This includes the normal iroh connection errors as well as pool specific /// errors such as timeouts and connection limits. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Snafu)] +#[snafu(module)] pub enum PoolConnectError { /// Connection pool is shut down Shutdown, @@ -98,22 +99,23 @@ pub enum PoolConnectError { /// Too many connections TooManyConnections, /// Error during connect - ConnectError(Arc), + ConnectError { source: Arc }, /// Error during connect - ConnectionError(Arc), - /// Handler actor panicked - JoinError(Arc), + ConnectionError { source: Arc }, +} + +impl From for PoolConnectError { + fn from(e: ConnectWithOptsError) -> Self { + PoolConnectError::ConnectError { + source: Arc::new(e), + } + } } -impl std::fmt::Display for PoolConnectError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - PoolConnectError::Shutdown => write!(f, "Connection pool is shut down"), - PoolConnectError::Timeout => write!(f, "Connection timed out"), - PoolConnectError::TooManyConnections => write!(f, "Too many connections"), - PoolConnectError::ConnectError(e) => write!(f, "Connect error: {e}"), - PoolConnectError::ConnectionError(e) => write!(f, "Connection error: {e}"), - PoolConnectError::JoinError(e) => write!(f, "Join error: {e}"), +impl From for PoolConnectError { + fn from(e: ConnectionError) -> Self { + PoolConnectError::ConnectionError { + source: Arc::new(e), } } } @@ -254,6 +256,7 @@ struct Actor { // idle set (most recent last) // todo: use a better data structure if this becomes a performance issue idle: VecDeque, + tasks: JoinSet<()>, } impl Actor { @@ -274,6 +277,7 @@ impl Actor { endpoint, owner: ConnectionPool0Rtt { tx: tx.clone() }, }), + tasks: JoinSet::new(), }, tx, ) @@ -297,55 +301,78 @@ impl Actor { self.remove_idle(id); } - pub async fn run(mut self) { - while let Some(msg) = self.rx.recv().await { - match msg { - ActorMessage::RequestRef(mut msg) => { - let id = msg.id; - self.remove_idle(id); - // Try to send to existing connection actor - if let Some(conn_tx) = self.connections.get(&id) { - if let Err(TokioSendError(e)) = conn_tx.send(msg).await { - msg = e; - } else { - continue; - } - // Connection actor died, remove it - self.remove_connection(id); + async fn handle_msg(&mut self, msg: ActorMessage) { + match msg { + ActorMessage::RequestRef(mut msg) => { + let id = msg.id; + self.remove_idle(id); + // Try to send to existing connection actor + if let Some(conn_tx) = self.connections.get(&id) { + if let Err(TokioSendError(e)) = conn_tx.send(msg).await { + msg = e; + } else { + return; } + // Connection actor died, remove it + self.remove_connection(id); + } - // No connection actor or it died - check limits - if self.connections.len() >= self.context.options.max_connections { - if let Some(idle) = self.pop_oldest_idle() { - // remove the oldest idle connection to make room for one more - trace!("removing oldest idle connection {}", idle); - self.connections.remove(&idle); - } else { - msg.tx.send(Err(PoolConnectError::TooManyConnections)).ok(); - continue; - } + // No connection actor or it died - check limits + if self.connections.len() >= self.context.options.max_connections { + if let Some(idle) = self.pop_oldest_idle() { + // remove the oldest idle connection to make room for one more + trace!("removing oldest idle connection {}", idle); + self.connections.remove(&idle); + } else { + msg.tx.send(Err(PoolConnectError::TooManyConnections)).ok(); + return; } - let (conn_tx, conn_rx) = mpsc::channel(100); - self.connections.insert(id, conn_tx.clone()); + } + let (conn_tx, conn_rx) = mpsc::channel(100); + self.connections.insert(id, conn_tx.clone()); - let context = self.context.clone(); + let context = self.context.clone(); - tokio::spawn(run_connection_actor(id, conn_rx, context)); + tokio::spawn(run_connection_actor(id, conn_rx, context)); - // Send the handler to the new actor - if conn_tx.send(msg).await.is_err() { - error!(%id, "Failed to send handler to new connection actor"); - self.connections.remove(&id); - } + // Send the handler to the new actor + if conn_tx.send(msg).await.is_err() { + error!(%id, "Failed to send handler to new connection actor"); + self.connections.remove(&id); } - ActorMessage::ConnectionIdle { id } => { - self.add_idle(id); - trace!(%id, "connection idle"); + } + ActorMessage::ConnectionIdle { id } => { + self.add_idle(id); + trace!(%id, "connection idle"); + } + ActorMessage::ConnectionShutdown { id } => { + // Remove the connection from our map - this closes the channel + self.remove_connection(id); + trace!(%id, "removed connection"); + } + } + } + + pub async fn run(mut self) { + loop { + tokio::select! { + biased; + + msg = self.rx.recv() => { + if let Some(msg) = msg { + self.handle_msg(msg).await; + } else { + break; + } } - ActorMessage::ConnectionShutdown { id } => { - // Remove the connection from our map - this closes the channel - self.remove_connection(id); - trace!(%id, "removed connection"); + + res = self.tasks.join_next(), if !self.tasks.is_empty() => { + if let Some(Err(e)) = res { + // panic during either connection establishment or + // timeout. Message handling is outside the actor's + // control, so we should hopefully never get this. + error!("Connection actor failed: {e}"); + } } } } @@ -429,10 +456,7 @@ async fn connect( Ok(connecting) => connecting, Err(cause) => { trace!("Failed to connect to node {}: {}", node_id, cause); - return ( - Err(PoolConnectError::ConnectError(Arc::new(cause))), - MaybeFuture::None, - ); + return (Err(PoolConnectError::from(cause)), MaybeFuture::None); } }; let (conn, zero_rtt_accepted) = match connecting.into_0rtt() { @@ -443,7 +467,7 @@ async fn connect( Err(connecting) => { trace!("Failed to connect using 0-RTT to node {}", node_id); let res = match connecting.await { - Err(cause) => Err(PoolConnectError::ConnectionError(Arc::new(cause))), + Err(cause) => Err(PoolConnectError::from(cause)), Ok(connection) => Ok((connection, None)), }; return (res, MaybeFuture::None); diff --git a/iroh-connection-pool/src/tests.rs b/iroh-connection-pool/src/tests.rs index c3fccfc..5d3acca 100644 --- a/iroh-connection-pool/src/tests.rs +++ b/iroh-connection-pool/src/tests.rs @@ -122,7 +122,7 @@ async fn connection_pool_errors() -> TestResult<()> { let res = client.echo(non_existing, b"Hello, world!".to_vec()).await; // trying to connect to a non-existing id will fail with ConnectError // because we don't have any information about the node - assert!(matches!(res, Err(PoolConnectError::ConnectError(_)))); + assert!(matches!(res, Err(PoolConnectError::ConnectError { .. }))); } { let non_listening = SecretKey::from_bytes(&[0; 32]).public(); From 3643d0ee0097ef80b18b6578e9a3124be9d99d6d Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 20 Aug 2025 15:40:04 +0200 Subject: [PATCH 25/26] Add docs with caveats for ConnectionRef. --- .../src/connection_pool_0rtt.rs | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/iroh-connection-pool/src/connection_pool_0rtt.rs b/iroh-connection-pool/src/connection_pool_0rtt.rs index 7f27820..df93db8 100644 --- a/iroh-connection-pool/src/connection_pool_0rtt.rs +++ b/iroh-connection-pool/src/connection_pool_0rtt.rs @@ -55,6 +55,24 @@ impl Default for Options { } /// A reference to a connection that is owned by a connection pool. +/// +/// You need to keep the [`ConnectionRef`] around until you are done with the +/// connection. Otherwise the connection will be closed by the pool after +/// the idle timeout, even if you are still working with it! +/// +/// ```rust +/// # use iroh::NodeId; +/// # async fn test() -> anyhow::Result<()> { +/// # let pool: iroh_connection_pool::connection_pool::ConnectionPool = todo!(); +/// # let node_id: NodeId = todo!(); +/// let conn = pool.connect(node_id).await?; +/// // work with the connection +/// drop(conn); +/// # } +/// ``` +/// +/// ConnectionRef does not implement Clone. If you need multiple you can just +/// wrap them in an Arc or request another one from the pool. #[derive(Debug)] pub struct ConnectionRef { connection: iroh::endpoint::Connection, @@ -379,7 +397,7 @@ impl Actor { } } -/// Error when calling a fn on the [`ConnectionPool`]. +/// Error when calling a fn on the [`ConnectionPool0Rtt`]. /// /// The only thing that can go wrong is that the connection pool is shut down. #[derive(Debug, Snafu)] From d283e57013a987544bb7fe4c9267105211dca257 Mon Sep 17 00:00:00 2001 From: Ruediger Klaehn Date: Wed, 20 Aug 2025 17:24:31 +0200 Subject: [PATCH 26/26] Use FuturesUnordered instead of JoinSet we don't expect connect to panic, so we don't need to isolate it. --- iroh-connection-pool/src/connection_pool.rs | 30 +++++++++------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/iroh-connection-pool/src/connection_pool.rs b/iroh-connection-pool/src/connection_pool.rs index fe4d827..adde757 100644 --- a/iroh-connection-pool/src/connection_pool.rs +++ b/iroh-connection-pool/src/connection_pool.rs @@ -19,14 +19,14 @@ use iroh::{ Endpoint, NodeId, endpoint::{ConnectError, Connection}, }; -use n0_future::{MaybeFuture, StreamExt}; +use n0_future::{ + FuturesUnordered, MaybeFuture, StreamExt, + future::{self}, +}; use snafu::Snafu; -use tokio::{ - sync::{ - mpsc::{self, error::SendError as TokioSendError}, - oneshot, - }, - task::JoinSet, +use tokio::sync::{ + mpsc::{self, error::SendError as TokioSendError}, + oneshot, }; use tokio_util::time::FutureExt as TimeFutureExt; use tracing::{debug, error, trace}; @@ -227,7 +227,7 @@ struct Actor { // todo: use a better data structure if this becomes a performance issue idle: VecDeque, // per connection tasks - tasks: JoinSet<()>, + tasks: FuturesUnordered>, } impl Actor { @@ -248,7 +248,7 @@ impl Actor { endpoint, owner: ConnectionPool { tx: tx.clone() }, }), - tasks: JoinSet::new(), + tasks: FuturesUnordered::new(), }, tx, ) @@ -304,7 +304,8 @@ impl Actor { let context = self.context.clone(); - self.tasks.spawn(run_connection_actor(id, conn_rx, context)); + self.tasks + .push(Box::pin(run_connection_actor(id, conn_rx, context))); // Send the handler to the new actor if conn_tx.send(msg).await.is_err() { @@ -337,14 +338,7 @@ impl Actor { } } - res = self.tasks.join_next(), if !self.tasks.is_empty() => { - if let Some(Err(e)) = res { - // panic during either connection establishment or - // timeout. Message handling is outside the actor's - // control, so we should hopefully never get this. - error!("Connection actor failed: {e}"); - } - } + _ = self.tasks.next(), if !self.tasks.is_empty() => {} } } }