diff --git a/mujina-miner/src/asic/bm13xx/protocol.rs b/mujina-miner/src/asic/bm13xx/protocol.rs index b80c469..0b19724 100644 --- a/mujina-miner/src/asic/bm13xx/protocol.rs +++ b/mujina-miner/src/asic/bm13xx/protocol.rs @@ -44,16 +44,13 @@ pub struct Frequency { impl Frequency { /// Minimum supported frequency in MHz - #[allow(dead_code)] pub const MIN_MHZ: f32 = 50.0; /// Maximum supported frequency in MHz - #[allow(dead_code)] pub const MAX_MHZ: f32 = 800.0; /// Base crystal frequency in MHz - const CRYSTAL_MHZ: f32 = 25.0; + pub const CRYSTAL_MHZ: f32 = 25.0; /// Create frequency from MHz value with validation - #[allow(dead_code)] pub fn from_mhz(mhz: f32) -> Result { if !(Self::MIN_MHZ..=Self::MAX_MHZ).contains(&mhz) { return Err(ProtocolError::InvalidFrequency { mhz: mhz as u32 }); @@ -62,7 +59,6 @@ impl Frequency { } /// Get frequency in MHz - #[allow(dead_code)] pub fn mhz(&self) -> f32 { self.mhz } diff --git a/mujina-miner/src/asic/bm13xx/thread.rs b/mujina-miner/src/asic/bm13xx/thread.rs index 5d26874..76b9722 100644 --- a/mujina-miner/src/asic/bm13xx/thread.rs +++ b/mujina-miner/src/asic/bm13xx/thread.rs @@ -7,11 +7,14 @@ //! The thread is implemented as an actor task that monitors the serial bus for //! chip responses, filters shares, and manages work assignment. -use std::sync::{Arc, RwLock}; +use std::{ + sync::{Arc, RwLock}, + time::Duration, +}; use async_trait::async_trait; use bitcoin::block::Header as BlockHeader; -use futures::{SinkExt, sink::Sink, stream::Stream}; +use futures::{SinkExt, future, sink::Sink, stream::Stream}; use tokio::sync::{mpsc, oneshot, watch}; use tokio_stream::StreamExt; @@ -21,10 +24,16 @@ use crate::{ BoardPeripherals, HashTask, HashThread, HashThreadCapabilities, HashThreadError, HashThreadEvent, HashThreadStatus, Share, ThreadRemovalSignal, }, + thermal::FrequencyCommand, tracing::prelude::*, types::{Difficulty, HashRate}, }; +/// 1/4 of runtime step size: smaller steps during init prevent PLL +/// instability on cold chips. +const INITIALIZATION_RAMP_STEP_SIZE_MHZ: f32 = protocol::Frequency::CRYSTAL_MHZ / 4f32; +const FREQUENCY_WRITE_TIMEOUT: Duration = Duration::from_millis(750); + /// Tracks tasks sent to chip hardware, indexed by chip_job_id. /// /// BM13xx chips use 4-bit job IDs. This tracker maintains snapshots of @@ -86,6 +95,16 @@ enum ThreadCommand { Shutdown, } +/// Actor context for BM13xx thread. +/// +/// Groups channels and shared state to reduce function parameter count. +struct ThreadActorContext { + cmd_rx: mpsc::Receiver, + _evt_tx: mpsc::Sender, + removal_rx: watch::Receiver, + status: Arc>, +} + /// BM13xx HashThread implementation. /// /// Represents a chain of BM13xx chips as a schedulable worker. The thread @@ -120,12 +139,16 @@ impl BM13xxThread { /// * `chip_commands` - Sink for sending encoded commands to chips /// * `peripherals` - Hardware interfaces from board (enable, regulator, etc.) /// * `removal_rx` - Watch channel for board-triggered removal + /// * `frequency_command_rx` - Optional receiver for frequency adjustment commands + /// * `operating_frequency_mhz` - Target operating frequency for ASIC chips (MHz) pub fn new( name: String, chip_responses: R, chip_commands: W, peripherals: BoardPeripherals, removal_rx: watch::Receiver, + frequency_command_rx: Option>, + operating_frequency_mhz: f32, ) -> Self where R: Stream> + Unpin + Send + 'static, @@ -138,16 +161,21 @@ impl BM13xxThread { let status = Arc::new(RwLock::new(HashThreadStatus::default())); let status_clone = Arc::clone(&status); - // Spawn the actor task + let ctx = ThreadActorContext { + cmd_rx, + _evt_tx: evt_tx, + removal_rx, + status: status_clone, + }; + tokio::spawn(async move { bm13xx_thread_actor( - cmd_rx, - evt_tx, - removal_rx, - status_clone, + ctx, chip_responses, chip_commands, peripherals, + frequency_command_rx, + operating_frequency_mhz, ) .await; }); @@ -240,6 +268,7 @@ impl HashThread for BM13xxThread { async fn initialize_chip( chip_commands: &mut W, peripherals: &mut BoardPeripherals, + operating_frequency_mhz: f32, ) -> Result<(), HashThreadError> where W: Sink + Unpin, @@ -498,9 +527,16 @@ where HashThreadError::InitializationFailed(format!("Core final send failed: {:?}", e)) })?; - // Frequency ramping (56.25 MHz -> 525 MHz) - debug!("Ramping frequency from 56.25 MHz to 525 MHz"); - let frequency_steps = generate_frequency_ramp_steps(56.25, 525.0, 6.25); + // Ramp frequency gradually to prevent PLL instability during initialization + debug!( + "Ramping frequency from 56.25 MHz to {} MHz", + operating_frequency_mhz + ); + let frequency_steps = generate_frequency_ramp_steps( + 56.25, + operating_frequency_mhz, + INITIALIZATION_RAMP_STEP_SIZE_MHZ, + ); for (i, pll_config) in frequency_steps.iter().enumerate() { chip_commands @@ -678,6 +714,18 @@ fn calculate_pll_for_frequency(target_freq: f32) -> Option )) } +/// Receives a frequency command when thermal management is active and the chip is initialized. +/// Otherwise never completes (uses pending future). +async fn recv_frequency_command_if_ready( + frequency_command_rx: &mut Option>, + chip_initialized: bool, +) -> Option { + match (chip_initialized, frequency_command_rx) { + (true, Some(rx)) => rx.recv().await, + _ => future::pending().await, + } +} + /// Internal actor task for BM13xxThread. /// /// This runs as an independent Tokio task and handles: @@ -690,13 +738,12 @@ fn calculate_pll_for_frequency(target_freq: f32) -> Option /// Chip is disabled on startup to establish known state. Chip is enabled and /// configured when scheduler assigns first work. async fn bm13xx_thread_actor( - mut cmd_rx: mpsc::Receiver, - _evt_tx: mpsc::Sender, - mut removal_rx: watch::Receiver, - status: Arc>, + mut ctx: ThreadActorContext, mut chip_responses: R, mut chip_commands: W, mut peripherals: BoardPeripherals, + mut frequency_command_rx: Option>, + operating_frequency_mhz: f32, ) where R: Stream> + Unpin, W: Sink + Unpin, @@ -715,11 +762,13 @@ async fn bm13xx_thread_actor( let mut ntime_ticker = tokio::time::interval(tokio::time::Duration::from_secs(1)); ntime_ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + let mut current_frequency_mhz: f32 = operating_frequency_mhz; + loop { tokio::select! { // Removal signal (highest priority) - _ = removal_rx.changed() => { - let signal = removal_rx.borrow().clone(); // Clone to avoid holding borrow across await + _ = ctx.removal_rx.changed() => { + let signal = ctx.removal_rx.borrow().clone(); // Clone to avoid holding borrow across await match signal { ThreadRemovalSignal::Running => { // False alarm - still running @@ -727,7 +776,7 @@ async fn bm13xx_thread_actor( _reason => { // Update status { - let mut s = status.write().unwrap(); + let mut s = ctx.status.write().unwrap(); s.is_active = false; } @@ -738,7 +787,7 @@ async fn bm13xx_thread_actor( } // Commands from scheduler - Some(cmd) = cmd_rx.recv() => { + Some(cmd) = ctx.cmd_rx.recv() => { match cmd { ThreadCommand::UpdateTask { new_task, response_tx } => { if let Some(ref old) = current_task { @@ -753,7 +802,13 @@ async fn bm13xx_thread_actor( if !chip_initialized { trace!("Initializing chip on first assignment."); - if let Err(e) = initialize_chip(&mut chip_commands, &mut peripherals).await { + if let Err(e) = initialize_chip( + &mut chip_commands, + &mut peripherals, + operating_frequency_mhz, + ) + .await + { error!(error = %e, "Chip initialization failed"); response_tx.send(Err(e)).ok(); continue; @@ -784,7 +839,7 @@ async fn bm13xx_thread_actor( } { - let mut s = status.write().unwrap(); + let mut s = ctx.status.write().unwrap(); s.is_active = true; } @@ -804,7 +859,13 @@ async fn bm13xx_thread_actor( if !chip_initialized { trace!("Initializing chip on first assignment."); - if let Err(e) = initialize_chip(&mut chip_commands, &mut peripherals).await { + if let Err(e) = initialize_chip( + &mut chip_commands, + &mut peripherals, + operating_frequency_mhz, + ) + .await + { error!(error = %e, "Chip initialization failed"); response_tx.send(Err(e)).ok(); continue; @@ -838,7 +899,7 @@ async fn bm13xx_thread_actor( } { - let mut s = status.write().unwrap(); + let mut s = ctx.status.write().unwrap(); s.is_active = true; } @@ -851,7 +912,7 @@ async fn bm13xx_thread_actor( let old_task = current_task.take(); { - let mut s = status.write().unwrap(); + let mut s = ctx.status.write().unwrap(); s.is_active = false; } @@ -983,6 +1044,78 @@ async fn bm13xx_thread_actor( } } } + + // Handle thermal frequency adjustment commands + Some(cmd) = recv_frequency_command_if_ready(&mut frequency_command_rx, chip_initialized) => { + use protocol::{Command, Frequency, Register}; + + let old_frequency_mhz = current_frequency_mhz; + + debug!( + command = ?cmd, + current_freq_mhz = %current_frequency_mhz, + "Thermal frequency command received" + ); + + let new_frequency_mhz = match cmd { + FrequencyCommand::BumpUp => current_frequency_mhz + protocol::Frequency::CRYSTAL_MHZ, + FrequencyCommand::BumpDown => current_frequency_mhz - protocol::Frequency::CRYSTAL_MHZ, + }; + + let clamped_frequency_mhz = new_frequency_mhz + .clamp(protocol::Frequency::MIN_MHZ, protocol::Frequency::MAX_MHZ); + + match Frequency::from_mhz(clamped_frequency_mhz) { + Err(e) => { + warn!( + error = %e, + requested_freq = clamped_frequency_mhz, + "Invalid frequency requested" + ); + } + Ok(frequency) => { + let pll_config = frequency.calculate_pll(); + + trace!( + freq_mhz = %clamped_frequency_mhz, + pll = ?pll_config, + "Writing PLL divider register" + ); + + let timeout_result = tokio::time::timeout( + FREQUENCY_WRITE_TIMEOUT, + chip_commands.send(Command::WriteRegister { + broadcast: true, + chip_address: 0x00, + register: Register::PllDivider(pll_config), + }), + ) + .await; + + match timeout_result { + Ok(write_result) => match write_result { + Ok(()) => { + current_frequency_mhz = clamped_frequency_mhz; + debug!( + old_freq = old_frequency_mhz, + new_freq = current_frequency_mhz, + "Frequency adjusted" + ); + } + Err(e) => { + error!(error = ?e, "Failed to send PllDivider register write"); + } + }, + Err(_) => { + error!( + timeout_ms = %FREQUENCY_WRITE_TIMEOUT.as_millis(), + "Frequency register write timed out" + ); + } + } + } + } + } } } diff --git a/mujina-miner/src/board/bitaxe.rs b/mujina-miner/src/board/bitaxe.rs index 1c9bc11..73af2e7 100644 --- a/mujina-miner/src/board/bitaxe.rs +++ b/mujina-miner/src/board/bitaxe.rs @@ -4,15 +4,18 @@ use std::{ pin::Pin, sync::Arc, task::{Context, Poll}, - time::Duration, + time::{Duration, Instant}, }; use tokio::{ io::{AsyncRead, ReadBuf}, - sync::{Mutex, watch}, + sync::{Mutex, mpsc, oneshot, watch}, time, }; use tokio_stream::StreamExt; -use tokio_util::codec::{FramedRead, FramedWrite}; +use tokio_util::{ + codec::{FramedRead, FramedWrite}, + sync::CancellationToken, +}; use crate::{ api_client::types::{BoardState, Fan, PowerMeasurement, TemperatureSensor}, @@ -22,6 +25,7 @@ use crate::{ hash_thread::{BoardPeripherals, HashThread, ThreadRemovalSignal}, }, hw_trait::{ + HwError, gpio::{Gpio, GpioPin, PinValue}, i2c::I2c, }, @@ -36,6 +40,10 @@ use crate::{ emc2101::{Emc2101, Percent}, tps546::{Tps546, Tps546Config}, }, + thermal::{ + FanPIDController, FanSpeedCommand, FrequencyCommand, TemperatureFilter, ThermalConfig, + ThermalController, + }, tracing::prelude::*, transport::serial::{SerialControl, SerialReader, SerialStream, SerialWriter}, }; @@ -45,12 +53,53 @@ use super::{ pattern::{Match, StringMatch}, }; +const TEMPERATURE_READING_INTERVAL: Duration = Duration::from_secs(3); +const TEMPERATURE_READ_TIMEOUT: Duration = Duration::from_millis(750); +const TEMPERATURE_MOVING_AVG_WINDOW: u8 = 5; +const TEMPERATURE_NOISE_THRESHOLD_C: f32 = 15.0; + +const FAN_SPEED_WRITE_TIMEOUT: Duration = Duration::from_millis(750); +const THERMAL_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(1); + +const FAN_PID_PROPORTIONAL: f32 = 1.0; +const FAN_PID_INTEGRAL_INIT: f32 = 0.1; +const FAN_PID_INTEGRAL_COEFF: f32 = 0.1; +const FAN_PID_INTEGRAL_MIN: f32 = -10.0; +const FAN_PID_INTEGRAL_MAX: f32 = 10.0; + +const FREQUENCY_COMMAND_QUEUE_CAPACITY: u8 = 8; + /// Adapter implementing `AsicEnable` for Bitaxe's GPIO-based reset control. struct BitaxeAsicEnable { /// Reset pin (directly controls nRST on the BM1370) nrst_pin: BitaxeRawGpioPin, } +#[derive(Debug)] +enum FanWorkerCommand { + SetSpeed { + percent: Percent, + respond_to: oneshot::Sender>, + }, + ReadExternalTemp { + respond_to: oneshot::Sender>, + }, + GetFanSpeed { + respond_to: oneshot::Sender>, + }, + GetTachCount { + respond_to: oneshot::Sender>, + }, + GetRpm { + respond_to: oneshot::Sender>, + }, +} + +struct FanWorkerHandle { + tx: mpsc::Sender, + task: tokio::task::JoinHandle<()>, +} + #[async_trait] impl crate::asic::hash_thread::AsicEnable for BitaxeAsicEnable { async fn enable(&mut self) -> anyhow::Result<()> { @@ -122,8 +171,8 @@ pub struct BitaxeBoard { asic_nrst: Option, /// I2C bus controller i2c: BitaxeRawI2c, - /// Fan controller (board-controlled only, not shared with thread) - fan_controller: Option>, + /// Fan controller worker (serializes EMC2101 access) + fan_worker: Option, /// Voltage regulator (shared with thread, cached state) regulator: Option>>>, /// Writer for sending commands to chips (transferred to hash thread) @@ -141,12 +190,43 @@ pub struct BitaxeBoard { stats_task_handle: Option>, /// Serial number from USB device info serial_number: Option, + /// Frequency command channel (for thermal control) + frequency_command_tx: Option>, + frequency_command_rx: Option>, + /// Thermal controller cancellation token + thermal_cancellation: Option, + /// Thermal task handles for cleanup + thermal_task_handles: Vec>, + /// Thermal configuration + thermal_config: ThermalConfig, /// Channel for publishing board state to the API server. /// Taken by `spawn_stats_monitor` which publishes periodic snapshots. state_tx: Option>, } impl BitaxeBoard { + async fn fan_worker_request( + fan_tx: &mpsc::Sender, + timeout: Duration, + make_cmd: impl FnOnce(oneshot::Sender>) -> FanWorkerCommand, + ) -> crate::hw_trait::Result { + let (resp_tx, resp_rx) = oneshot::channel(); + if fan_tx.send(make_cmd(resp_tx)).await.is_err() { + return Err(HwError::Other("Fan worker channel closed".to_string())); + } + + match tokio::time::timeout(timeout, resp_rx).await { + Ok(recv_result) => match recv_result { + Ok(make_cmd_result) => make_cmd_result, + Err(_) => Err(HwError::Other("Fan worker response dropped".into())), + }, + Err(_) => Err(HwError::Timeout), + } + } + + fn fan_worker_tx(&self) -> Option> { + self.fan_worker.as_ref().map(|worker| worker.tx.clone()) + } /// GPIO pin number for ASIC reset control (active low) const ASIC_RESET_PIN: u8 = 0; @@ -189,11 +269,13 @@ impl BitaxeBoard { // Wrap the data reader with tracing let tracing_reader = TracingReader::new(data_reader, "Data"); + let (frequency_tx, frequency_rx) = mpsc::channel(FREQUENCY_COMMAND_QUEUE_CAPACITY as usize); + Ok(BitaxeBoard { control_channel, asic_nrst: None, i2c, - fan_controller: None, + fan_worker: None, regulator: None, data_writer: Some(FramedWrite::new(data_writer, bm13xx::FrameCodec)), data_reader: Some(FramedRead::new(tracing_reader, bm13xx::FrameCodec)), @@ -202,6 +284,11 @@ impl BitaxeBoard { thread_shutdown: None, stats_task_handle: None, serial_number, + frequency_command_tx: Some(frequency_tx), + frequency_command_rx: Some(frequency_rx), + thermal_cancellation: None, + thermal_task_handles: Vec::new(), + thermal_config: ThermalConfig::default(), state_tx: Some(state_tx), }) } @@ -469,17 +556,80 @@ impl BitaxeBoard { // Initialize the EMC2101 match fan.init().await { Ok(()) => { - // Set fan to full speed until closed-loop control is implemented - match fan.set_fan_speed(Percent::FULL).await { - Ok(()) => { - debug!("Fan speed set to 100%"); - } - Err(e) => { - warn!("Failed to set fan speed: {}", e); + let fan_read_timeout = TEMPERATURE_READ_TIMEOUT; + let fan_write_timeout = FAN_SPEED_WRITE_TIMEOUT; + let (fan_tx, mut fan_rx) = mpsc::channel(32); + + let task = tokio::spawn(async move { + let mut fan = fan; + while let Some(command) = fan_rx.recv().await { + match command { + FanWorkerCommand::SetSpeed { + percent, + respond_to, + } => { + trace!(speed_pct = %u8::from(percent), "Setting fan speed"); + let result = tokio::time::timeout( + fan_write_timeout, + fan.set_fan_speed(percent), + ) + .await; + let response = match result { + Ok(res) => res, + Err(_) => Err(HwError::Timeout), + }; + let _ = respond_to.send(response); + } + FanWorkerCommand::ReadExternalTemp { respond_to } => { + trace!("Reading external temperature"); + let result = tokio::time::timeout( + fan_read_timeout, + fan.get_external_temperature(), + ) + .await; + let response = match result { + Ok(res) => res, + Err(_) => Err(HwError::Timeout), + }; + let _ = respond_to.send(response); + } + FanWorkerCommand::GetFanSpeed { respond_to } => { + trace!("Reading fan speed"); + let result = + tokio::time::timeout(fan_read_timeout, fan.get_fan_speed()) + .await; + let response = match result { + Ok(res) => res, + Err(_) => Err(HwError::Timeout), + }; + let _ = respond_to.send(response); + } + FanWorkerCommand::GetTachCount { respond_to } => { + trace!("Reading TACH count"); + let result = + tokio::time::timeout(fan_read_timeout, fan.get_tach_count()) + .await; + let response = match result { + Ok(res) => res, + Err(_) => Err(HwError::Timeout), + }; + let _ = respond_to.send(response); + } + FanWorkerCommand::GetRpm { respond_to } => { + trace!("Reading fan RPM"); + let result = + tokio::time::timeout(fan_read_timeout, fan.get_rpm()).await; + let response = match result { + Ok(res) => res, + Err(_) => Err(HwError::Timeout), + }; + let _ = respond_to.send(response); + } + } } - } + }); - self.fan_controller = Some(fan); + self.fan_worker = Some(FanWorkerHandle { tx: fan_tx, task }); Ok(()) } Err(e) => { @@ -633,6 +783,9 @@ impl BitaxeBoard { self.init_fan_controller().await?; self.init_power_controller().await?; + // Setup thermal controller after fan controller is available + self.setup_thermal_controller()?; + tokio::time::sleep(Duration::from_millis(500)).await; // Phase 3: Release ASIC from reset for discovery @@ -689,17 +842,233 @@ impl BitaxeBoard { self.chip_infos.len() } - /// Spawn a task to periodically log and publish board telemetry. - fn spawn_stats_monitor(&mut self) { - // Clone data needed for the monitoring task - let i2c = self.i2c.clone(); + /// Get temperature from EMC2101 fan controller. + /// + /// # Returns + /// Temperature in Celsius, or an error if the fan controller is not available + /// or if the temperature read fails. + #[expect( + dead_code, + reason = "Public API method for external thermal control integration" + )] + pub async fn get_temperature(&mut self) -> Result { + let fan_tx = self.fan_worker_tx().ok_or_else(|| { + BoardError::HardwareControl("Fan controller not available".to_string()) + })?; + let timeout = TEMPERATURE_READ_TIMEOUT; + Self::fan_worker_request(&fan_tx, timeout, |respond_to| { + FanWorkerCommand::ReadExternalTemp { respond_to } + }) + .await + .map_err(|e| BoardError::HardwareControl(format!("Failed to read temperature: {}", e))) + } + + /// Set fan speed on EMC2101 fan controller. + /// + /// # Arguments + /// * `speed_percent` - Fan speed as a percentage (0-100) + /// + /// # Returns + /// An error if the fan controller is not available or if setting the fan speed fails. + #[expect( + dead_code, + reason = "Public API method for external thermal control integration" + )] + pub async fn set_fan_speed(&mut self, speed_percent: u8) -> Result<(), BoardError> { + let percent = Percent::try_from(speed_percent) + .map_err(|e| BoardError::HardwareControl(format!("Invalid fan speed: {}", e)))?; + let fan_tx = self.fan_worker_tx().ok_or_else(|| { + BoardError::HardwareControl("Fan controller not available".to_string()) + })?; + + let timeout = FAN_SPEED_WRITE_TIMEOUT; + Self::fan_worker_request(&fan_tx, timeout, |respond_to| FanWorkerCommand::SetSpeed { + percent, + respond_to, + }) + .await + .map_err(|e| BoardError::HardwareControl(format!("Failed to set fan speed: {}", e))) + } + + /// Get the frequency command channel sender. + /// + /// This channel can be used by thermal controllers to send frequency adjustment + /// commands to the board. + /// + /// # Returns + /// The sender for frequency commands, or None if thermal control is not available. + #[expect( + dead_code, + reason = "Public API method for external thermal control integration" + )] + pub fn get_frequency_command_tx(&self) -> Option> { + self.frequency_command_tx.clone() + } + + async fn run_temperature_reader( + fan_tx: mpsc::Sender, + temperature_tx: watch::Sender>, + cancellation: CancellationToken, + ) { + let mut interval = tokio::time::interval(TEMPERATURE_READING_INTERVAL); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + let mut filter = + TemperatureFilter::new(TEMPERATURE_MOVING_AVG_WINDOW, TEMPERATURE_NOISE_THRESHOLD_C); + + loop { + tokio::select! { + _ = cancellation.cancelled() => break, + _ = interval.tick() => { + let read_start = Instant::now(); + let read_result = Self::fan_worker_request( + &fan_tx, + TEMPERATURE_READ_TIMEOUT, + |respond_to| FanWorkerCommand::ReadExternalTemp { respond_to }, + ) + .await; + + match read_result { + Ok(temp) => { + if let Some(accepted) = filter.consider(temp) { + debug!(temp_c = %accepted, "Temperature reading"); + if let Err(e) = temperature_tx.send(Some(accepted)) { + warn!("Temperature watch channel closed: {}", e); + break; + } + } else { + debug!(temp_c = %temp, "Skipping noisy temperature reading"); + } + } + Err(e) => { + warn!( + elapsed_ms = %read_start.elapsed().as_millis(), + "Failed to read temperature: {}", + e + ); + } + } + } + } + } + } + + async fn run_fan_speed_listener( + fan_tx: mpsc::Sender, + mut fan_speed_rx: watch::Receiver, + cancellation: CancellationToken, + ) { + let mut last_speed: Option = None; + loop { + tokio::select! { + _ = cancellation.cancelled() => { + break; + } + result = fan_speed_rx.changed() => { + if result.is_err() { + debug!("Fan speed watch sender dropped"); + break; + } + + let FanSpeedCommand { speed_percent } = *fan_speed_rx.borrow(); + if last_speed == Some(speed_percent) { + continue; + } + + let percent = match Percent::try_from(speed_percent) { + Ok(p) => p, + Err(e) => { + warn!("Invalid fan speed percentage: {}", e); + continue; + } + }; + + debug!(speed_pct = %speed_percent, "Setting fan speed"); + let set_result = Self::fan_worker_request( + &fan_tx, + FAN_SPEED_WRITE_TIMEOUT, + |respond_to| FanWorkerCommand::SetSpeed { + percent, + respond_to, + }, + ) + .await; + + match set_result { + Ok(()) => { + last_speed = Some(speed_percent); + } + Err(e) => { + warn!("Failed to set fan speed: {}", e); + } + } + } + } + } + } + + fn setup_thermal_controller(&mut self) -> Result<(), BoardError> { + let Some(fan_tx) = self.fan_worker_tx() else { + debug!("Skipping thermal controller setup: fan controller not available"); + return Ok(()); + }; + let frequency_tx = self.frequency_command_tx.clone().ok_or_else(|| { + BoardError::HardwareControl("Frequency command channel not available".into()) + })?; + + let cancellation = CancellationToken::new(); + self.thermal_cancellation = Some(cancellation.clone()); + + let (temperature_tx, temperature_rx) = watch::channel(None::); + let (fan_speed_tx, fan_speed_rx) = watch::channel(FanSpeedCommand { speed_percent: 0 }); + + self.thermal_task_handles + .push(tokio::spawn(Self::run_temperature_reader( + fan_tx.clone(), + temperature_tx, + cancellation.clone(), + ))); + + self.thermal_task_handles + .push(tokio::spawn(Self::run_fan_speed_listener( + fan_tx, + fan_speed_rx, + cancellation.clone(), + ))); + + let controller = ThermalController::new( + self.thermal_config.clone(), + FanPIDController::new( + FAN_PID_PROPORTIONAL, + FAN_PID_INTEGRAL_INIT, + FAN_PID_INTEGRAL_COEFF, + FAN_PID_INTEGRAL_MIN, + FAN_PID_INTEGRAL_MAX, + ), + fan_speed_tx, + frequency_tx, + temperature_rx, + ); + self.thermal_task_handles.push(tokio::spawn(async move { + controller.run(cancellation).await; + })); + + debug!("Thermal controller setup complete"); + Ok(()) + } + + /// Spawn a task to periodically log management statistics + fn spawn_stats_monitor(&mut self) { // Clone the regulator Arc for stats monitoring let regulator = self .regulator .clone() .expect("Regulator must be initialized before spawning stats monitor"); + // Clone the fan worker sender for stats monitoring (if available) + let fan_tx = self.fan_worker_tx(); + let fan_read_timeout = TEMPERATURE_READ_TIMEOUT; + // Capture board info for logging let board_info = self.board_info(); let board_name = format!( @@ -720,40 +1089,91 @@ impl BitaxeBoard { let mut interval = tokio::time::interval(STATS_INTERVAL); interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); - // Create fan controller for the stats task - let mut fan_ctrl = Emc2101::new(i2c); + // Discard first tick (fires immediately, ADC readings may not be settled) + interval.tick().await; const LOG_INTERVAL: Duration = Duration::from_secs(30); let mut last_log = tokio::time::Instant::now(); - // Discard first tick (fires immediately, ADC readings may not be settled) - interval.tick().await; - loop { interval.tick().await; - // -- Read sensor values -- + // Read temperature (ASIC / external sensor) + let asic_temp: Option = if let Some(ref fan_tx) = fan_tx { + Self::fan_worker_request(fan_tx, fan_read_timeout, |respond_to| { + FanWorkerCommand::ReadExternalTemp { respond_to } + }) + .await + .ok() + } else { + None + }; - let asic_temp = fan_ctrl.get_external_temperature().await.ok(); - let fan_percent = fan_ctrl.get_fan_speed().await.ok().map(u8::from); - let fan_rpm = fan_ctrl.get_rpm().await.ok(); + // Read fan speed + let fan_percent: Option = if let Some(ref fan_tx) = fan_tx { + Self::fan_worker_request(fan_tx, fan_read_timeout, |respond_to| { + FanWorkerCommand::GetFanSpeed { respond_to } + }) + .await + .ok() + .map(u8::from) + } else { + None + }; - let (vin_mv, vout_mv, iout_ma, power_mw, vr_temp) = { - let mut reg = regulator.lock().await; - ( - reg.get_vin().await.ok(), - reg.get_vout().await.ok(), - reg.get_iout().await.ok(), - reg.get_power().await.ok(), - reg.get_temperature().await.ok(), - ) + // Read fan RPM (if TACH is connected) + let fan_rpm: Option = if let Some(ref fan_tx) = fan_tx { + match Self::fan_worker_request(fan_tx, fan_read_timeout, |respond_to| { + FanWorkerCommand::GetTachCount { respond_to } + }) + .await + { + Ok(count) => { + trace!("TACH count: 0x{:04x}", count); + Self::fan_worker_request(fan_tx, fan_read_timeout, |respond_to| { + FanWorkerCommand::GetRpm { respond_to } + }) + .await + .ok() + } + Err(e) => { + trace!("Failed to read TACH: {}", e); + None + } + } + } else { + None }; - if let Some(mv) = vout_mv { - let volts = mv as f32 / 1000.0; - if volts < 1.0 { - warn!("Core voltage low: {:.3}V", volts); + // Read power stats using the shared regulator + let vin_mv: Option = regulator.lock().await.get_vin().await.ok(); + + let vout_mv: Option = + regulator.lock().await.get_vout().await.ok().inspect(|&mv| { + let volts = mv as f32 / 1000.0; + if volts < 1.0 { + warn!("Core voltage low: {:.3}V", volts); + } + }); + + let iout_ma: Option = regulator.lock().await.get_iout().await.ok(); + + let power_mw: Option = regulator.lock().await.get_power().await.ok(); + + let vr_temp: Option = regulator.lock().await.get_temperature().await.ok(); + + // Check power status - critical faults will return error + if let Err(e) = regulator.lock().await.check_status().await { + error!("CRITICAL: Power controller fault detected: {}", e); + + // Try to clear the fault once + warn!("Attempting to clear power controller faults..."); + if let Err(clear_err) = regulator.lock().await.clear_faults().await { + error!("Failed to clear faults: {}", clear_err); } + + // Continue monitoring + continue; } // Check power status -- critical faults will return error @@ -856,6 +1276,26 @@ impl Board for BitaxeBoard { } } + // Cancel thermal tasks + if let Some(ref cancellation) = self.thermal_cancellation { + cancellation.cancel(); + } + + // Wait for thermal tasks to complete + let shutdown_timeout = THERMAL_SHUTDOWN_TIMEOUT; + for mut handle in self.thermal_task_handles.drain(..) { + if tokio::time::timeout(shutdown_timeout, &mut handle) + .await + .is_err() + { + warn!( + timeout_ms = %shutdown_timeout.as_millis(), + "Thermal task shutdown timed out; aborting" + ); + handle.abort(); + } + } + // Hold chips in reset self.hold_in_reset().await?; @@ -868,13 +1308,27 @@ impl Board for BitaxeBoard { } // Reduce fan speed (no more heat generation) - if let Some(ref mut fan) = self.fan_controller { + if let Some(ref fan_worker) = self.fan_worker { let shutdown_speed = Percent::new_clamped(25); - if let Err(e) = fan.set_fan_speed(shutdown_speed).await { - warn!("Failed to set fan speed: {}", e); + let speed_result = + Self::fan_worker_request(&fan_worker.tx, FAN_SPEED_WRITE_TIMEOUT, |respond_to| { + FanWorkerCommand::SetSpeed { + percent: shutdown_speed, + respond_to, + } + }) + .await; + + if let Err(e) = speed_result { + warn!("Failed to set fan speed during shutdown: {}", e); } } + // Stop fan worker + if let Some(worker) = self.fan_worker.take() { + worker.task.abort(); + } + // Cancel the statistics monitoring task if let Some(handle) = self.stats_task_handle.take() { handle.abort(); @@ -926,13 +1380,17 @@ impl Board for BitaxeBoard { None => "Bitaxe-Gamma".to_string(), }; - // Create BM13xxThread with streams and peripherals + let frequency_command_rx = self.frequency_command_rx.take(); + let operating_frequency_mhz = self.thermal_config.operating_frequency_mhz; + let thread = BM13xxThread::new( thread_name, data_reader, data_writer, peripherals, removal_rx, + frequency_command_rx, + operating_frequency_mhz, ); debug!("Created BM13xx hash thread from BitaxeBoard"); diff --git a/mujina-miner/src/lib.rs b/mujina-miner/src/lib.rs index 7513f55..98ff951 100644 --- a/mujina-miner/src/lib.rs +++ b/mujina-miner/src/lib.rs @@ -13,6 +13,7 @@ pub mod mgmt_protocol; pub mod peripheral; pub mod scheduler; pub mod stratum_v1; +pub mod thermal; pub mod tracing; pub mod transport; pub mod types; diff --git a/mujina-miner/src/thermal/config.rs b/mujina-miner/src/thermal/config.rs new file mode 100644 index 0000000..77d9f77 --- /dev/null +++ b/mujina-miner/src/thermal/config.rs @@ -0,0 +1,24 @@ +#[derive(Debug, Clone)] +pub struct ThermalConfig { + /// Fan PID target (°C). Adjust based on cooling capacity and + /// ambient conditions. + pub target_temperature_c: f32, + + /// Frequency throttling threshold (°C). Must be higher than + /// `target_temperature_c`. + pub max_temperature_c: f32, + + /// Target chip clock after initialization (MHz). Higher values + /// increase hashrate and power draw. + pub operating_frequency_mhz: f32, +} + +impl Default for ThermalConfig { + fn default() -> Self { + Self { + target_temperature_c: 74.0, + max_temperature_c: 85.0, + operating_frequency_mhz: 525.0, + } + } +} diff --git a/mujina-miner/src/thermal/controller.rs b/mujina-miner/src/thermal/controller.rs new file mode 100644 index 0000000..dc1efe8 --- /dev/null +++ b/mujina-miner/src/thermal/controller.rs @@ -0,0 +1,487 @@ +use std::time::{Duration, Instant}; +use tokio::sync::{mpsc, watch}; +use tokio_util::sync::CancellationToken; + +use super::config::ThermalConfig; +use super::fan_pid::FanPIDController; +use super::state::ThermalState; +use crate::tracing::prelude::*; + +const FAN_SPEED_NORMAL: u8 = 30; +const FAN_SPEED_COOLING: u8 = 50; +const FAN_SPEED_THROTTLING: u8 = 80; +const FAN_SPEED_CRITICAL: u8 = 100; + +const FAN_SPEED_MIN: f32 = 0.0; +const FAN_SPEED_MAX: f32 = 100.0; + +const TICK_DURATION: Duration = Duration::from_secs(5); + +/// Cooldown between frequency adjustments to let thermal changes settle. +const FREQUENCY_ADJUSTMENT_INTERVAL: Duration = Duration::from_secs(20); + +/// Deadband above target before frequency reduction triggers, to avoid +/// reacting to transient spikes. +const FREQUENCY_OVER_TARGET_MARGIN_C: f32 = 5.0; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct FanSpeedCommand { + pub speed_percent: u8, +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub enum FrequencyCommand { + BumpUp, + BumpDown, +} + +pub struct ThermalController { + config: ThermalConfig, + tick_duration: Duration, + fan_pid: FanPIDController, + fan_speed_tx: watch::Sender, + frequency_tx: mpsc::Sender, + temperature_rx: watch::Receiver>, + current_state: ThermalState, + last_tick_time: Option, + last_frequency_adjust: Option, +} + +impl ThermalController { + pub fn new( + config: ThermalConfig, + fan_pid: FanPIDController, + fan_speed_tx: watch::Sender, + frequency_tx: mpsc::Sender, + temperature_rx: watch::Receiver>, + ) -> Self { + Self { + config, + tick_duration: TICK_DURATION, + fan_pid, + fan_speed_tx, + frequency_tx, + temperature_rx, + current_state: ThermalState::NORMAL, + last_tick_time: None, + last_frequency_adjust: None, + } + } + + pub async fn run(mut self, cancellation: CancellationToken) { + let mut interval = tokio::time::interval(self.tick_duration); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + + loop { + tokio::select! { + _ = cancellation.cancelled() => { + break; + } + _ = interval.tick() => { + self.tick().await; + } + } + } + } + + pub fn current_state(&self) -> ThermalState { + self.current_state + } + + pub fn fan_pid_integral_sum(&self) -> f32 { + self.fan_pid.integral_sum + } + + pub fn fan_pid_integral_gain(&self) -> f32 { + self.fan_pid.integral_gain + } + + #[cfg(test)] + fn set_state(&mut self, state: ThermalState) { + self.current_state = state; + } + + #[cfg(test)] + fn set_fan_pid_integral_sum(&mut self, value: f32) { + self.fan_pid.integral_sum = value; + } + + #[cfg(test)] + fn set_tick_duration(&mut self, duration: Duration) { + self.tick_duration = duration; + } + + async fn tick(&mut self) { + let temperature = match *self.temperature_rx.borrow() { + Some(temp) => temp, + None => { + debug!("Thermal controller tick: no temperature reading available yet"); + return; + } + }; + + let now = Instant::now(); + let time_since_last_tick = self + .last_tick_time + .map(|last| now.duration_since(last)) + .unwrap_or(self.tick_duration); + self.last_tick_time = Some(now); + + let new_state = + ThermalState::from_temperature(temperature, self.current_state, &self.config); + let previous_state = self.current_state; + let state_changed = new_state != self.current_state; + + if state_changed { + info!( + previous_state = ?previous_state, + new_state = ?new_state, + "Thermal state changed" + ); + } + + self.current_state = new_state; + + self.adjust_fan_speed(temperature, time_since_last_tick, state_changed) + .await; + self.adjust_frequency(previous_state, state_changed, temperature, now) + .await; + } + + async fn adjust_fan_speed( + &mut self, + temperature: f32, + time_since_last_tick: Duration, + state_changed: bool, + ) { + let target_temperature = self.config.target_temperature_c; + let error = temperature - target_temperature; + + let freeze_integral = matches!( + self.current_state, + ThermalState::NORMAL | ThermalState::CRITICAL + ); + + let pid_output = self + .fan_pid + .update(error, time_since_last_tick, freeze_integral); + + if state_changed && matches!(self.current_state, ThermalState::NORMAL) { + self.fan_pid.reset(); + debug!("Fan PID reset on transition to NORMAL state"); + } + + let base_speed = match self.current_state { + ThermalState::NORMAL => FAN_SPEED_NORMAL, + ThermalState::COOLING => FAN_SPEED_COOLING, + ThermalState::THROTTLING => FAN_SPEED_THROTTLING, + ThermalState::CRITICAL => FAN_SPEED_CRITICAL, + }; + + let speed = (base_speed as f32 + pid_output).clamp(FAN_SPEED_MIN, FAN_SPEED_MAX) as u8; + + debug!( + temp_c = %temperature, + state = ?self.current_state, + target_c = %target_temperature, + error_c = %error, + pid_output = %pid_output, + base_speed_pct = %base_speed, + final_speed_pct = %speed, + integral_sum = %self.fan_pid.integral_sum, + freeze_integral = freeze_integral, + "Thermal control tick" + ); + + if self + .fan_speed_tx + .send(FanSpeedCommand { + speed_percent: speed, + }) + .is_err() + { + debug!("Fan speed command channel closed"); + } + } + + async fn adjust_frequency( + &mut self, + previous_state: ThermalState, + state_changed: bool, + temperature: f32, + now: Instant, + ) { + if !state_changed { + let over_target = + temperature >= self.config.target_temperature_c + FREQUENCY_OVER_TARGET_MARGIN_C; + let throttling = matches!( + self.current_state, + ThermalState::THROTTLING | ThermalState::CRITICAL + ); + + let cooldown_active = self + .last_frequency_adjust + .map(|last| now.duration_since(last) < FREQUENCY_ADJUSTMENT_INTERVAL) + .unwrap_or(false); + + if throttling && over_target && !cooldown_active { + let cmd = FrequencyCommand::BumpDown; + info!( + state = ?self.current_state, + temp_c = %temperature, + target_c = %self.config.target_temperature_c, + margin_c = %FREQUENCY_OVER_TARGET_MARGIN_C, + command = ?cmd, + "Thermal frequency adjustment (sustained overshoot)" + ); + if self.frequency_tx.send(cmd).await.is_err() { + debug!(command = ?cmd, "Frequency command channel closed"); + } + self.last_frequency_adjust = Some(now); + } + return; + } + + fn state_severity(state: ThermalState) -> u8 { + match state { + ThermalState::NORMAL => 0, + ThermalState::COOLING => 1, + ThermalState::THROTTLING => 2, + ThermalState::CRITICAL => 3, + } + } + + let previous_severity = state_severity(previous_state); + let current_severity = state_severity(self.current_state); + + let cmd = if current_severity > previous_severity { + FrequencyCommand::BumpDown + } else { + FrequencyCommand::BumpUp + }; + + info!( + previous_state = ?previous_state, + new_state = ?self.current_state, + command = ?cmd, + "Thermal frequency adjustment" + ); + + if self.frequency_tx.send(cmd).await.is_err() { + debug!(command = ?cmd, "Frequency command channel closed"); + } + + self.last_frequency_adjust = Some(now); + } +} + +#[cfg(test)] +mod tests { + use super::super::state::{HYSTERESIS_C, NORMAL_THRESHOLD_C}; + use super::*; + use tokio::sync::{mpsc, watch}; + + fn create_controller() -> ( + ThermalController, + watch::Receiver, + mpsc::Receiver, + watch::Sender>, + ) { + let (fan_tx, fan_rx) = watch::channel(FanSpeedCommand { speed_percent: 0 }); + let (freq_tx, freq_rx) = mpsc::channel(3); + let (temp_tx, temp_rx) = watch::channel(None::); + + let config = ThermalConfig::default(); + let fan_pid = FanPIDController::new(1.0, 0.1, 0.0, -10.0, 10.0); + + let controller = ThermalController::new(config, fan_pid, fan_tx, freq_tx, temp_rx); + + (controller, fan_rx, freq_rx, temp_tx) + } + + #[tokio::test] + async fn should_send_fan_speed_command_on_tick() { + let (mut controller, fan_rx, _freq_rx, temp_tx) = create_controller(); + let config = ThermalConfig::default(); + + temp_tx + .send(Some(config.target_temperature_c + 1.0)) + .unwrap(); + controller.tick().await; + + let command = *fan_rx.borrow(); + assert_eq!(command.speed_percent, 51); + } + + #[tokio::test] + async fn should_not_send_commands_when_no_temperature_received() { + let (mut controller, fan_rx, mut freq_rx, _temp_tx) = create_controller(); + + controller.tick().await; + + assert_eq!(fan_rx.borrow().speed_percent, 0); + assert!(freq_rx.try_recv().is_err()); + } + + #[tokio::test] + async fn should_send_frequency_command_on_state_change() { + let (mut controller, _fan_rx, mut freq_rx, temp_tx) = create_controller(); + + temp_tx.send(Some(75.0)).unwrap(); + controller.tick().await; + + let command = freq_rx.try_recv().unwrap(); + assert_eq!(command, FrequencyCommand::BumpDown); + } + + #[tokio::test] + async fn should_not_send_frequency_command_when_state_unchanged() { + let (mut controller, _fan_rx, mut freq_rx, temp_tx) = create_controller(); + + temp_tx.send(Some(NORMAL_THRESHOLD_C + 1.0)).unwrap(); + controller.tick().await; + + freq_rx.try_recv().unwrap(); + + temp_tx.send(Some(NORMAL_THRESHOLD_C + 2.0)).unwrap(); + controller.tick().await; + + assert!(freq_rx.try_recv().is_err()); + } + + #[tokio::test] + async fn should_reset_fan_pid_on_transition_to_normal() { + let (mut controller, _fan_rx, _freq_rx, temp_tx) = create_controller(); + + temp_tx.send(Some(65.0)).unwrap(); + controller.tick().await; + + controller.set_fan_pid_integral_sum(5.0); + + temp_tx.send(Some(45.0)).unwrap(); + controller.tick().await; + + assert_eq!(controller.fan_pid_integral_sum(), 0.0); + } + + #[tokio::test] + async fn should_set_fan_speed_based_on_thermal_state() { + let (mut controller, fan_rx, _freq_rx, temp_tx) = create_controller(); + let config = ThermalConfig::default(); + + temp_tx.send(Some(NORMAL_THRESHOLD_C - 1.0)).unwrap(); + controller.tick().await; + let normal_cmd = *fan_rx.borrow(); + assert_eq!(normal_cmd.speed_percent, 10); + + temp_tx.send(Some(NORMAL_THRESHOLD_C + 1.0)).unwrap(); + controller.tick().await; + let cooling_cmd = *fan_rx.borrow(); + assert_eq!(cooling_cmd.speed_percent, 32); + + temp_tx + .send(Some(config.target_temperature_c + 1.0)) + .unwrap(); + controller.tick().await; + let throttling_cmd = *fan_rx.borrow(); + assert_eq!(throttling_cmd.speed_percent, 81); + + temp_tx.send(Some(config.max_temperature_c + 1.0)).unwrap(); + controller.tick().await; + let critical_cmd = *fan_rx.borrow(); + assert_eq!(critical_cmd.speed_percent, 100); + } + + #[tokio::test] + async fn should_send_bump_down_when_state_becomes_more_severe() { + let (mut controller, _fan_rx, mut freq_rx, temp_tx) = create_controller(); + let config = ThermalConfig::default(); + + controller.set_state(ThermalState::NORMAL); + temp_tx.send(Some(NORMAL_THRESHOLD_C + 1.0)).unwrap(); + controller.tick().await; + let cooling_cmd = freq_rx.try_recv().unwrap(); + assert_eq!(cooling_cmd, FrequencyCommand::BumpDown); + + temp_tx + .send(Some(config.target_temperature_c + 1.0)) + .unwrap(); + controller.tick().await; + let throttling_cmd = freq_rx.try_recv().unwrap(); + assert_eq!(throttling_cmd, FrequencyCommand::BumpDown); + + temp_tx.send(Some(config.max_temperature_c + 1.0)).unwrap(); + controller.tick().await; + let critical_cmd = freq_rx.try_recv().unwrap(); + assert_eq!(critical_cmd, FrequencyCommand::BumpDown); + } + + #[tokio::test] + async fn should_send_bump_up_when_state_becomes_less_severe() { + let (mut controller, _fan_rx, mut freq_rx, temp_tx) = create_controller(); + let config = ThermalConfig::default(); + + controller.set_state(ThermalState::CRITICAL); + temp_tx + .send(Some(config.max_temperature_c - HYSTERESIS_C - 1.0)) + .unwrap(); + controller.tick().await; + let throttling_cmd = freq_rx.try_recv().unwrap(); + assert_eq!(throttling_cmd, FrequencyCommand::BumpUp); + + temp_tx + .send(Some(config.target_temperature_c - HYSTERESIS_C - 1.0)) + .unwrap(); + controller.tick().await; + let cooling_cmd = freq_rx.try_recv().unwrap(); + assert_eq!(cooling_cmd, FrequencyCommand::BumpUp); + + temp_tx + .send(Some(NORMAL_THRESHOLD_C - HYSTERESIS_C - 1.0)) + .unwrap(); + controller.tick().await; + let normal_cmd = freq_rx.try_recv().unwrap(); + assert_eq!(normal_cmd, FrequencyCommand::BumpUp); + } + + #[tokio::test] + async fn should_freeze_integral_in_normal_and_critical_states() { + let (mut controller, _fan_rx, _freq_rx, temp_tx) = create_controller(); + let config = ThermalConfig::default(); + + controller.set_fan_pid_integral_sum(5.0); + let initial_integral = controller.fan_pid_integral_sum(); + + temp_tx.send(Some(NORMAL_THRESHOLD_C - 1.0)).unwrap(); + controller.tick().await; + + assert!((controller.fan_pid_integral_sum() - initial_integral).abs() < 1e-3); + + temp_tx.send(Some(config.max_temperature_c + 1.0)).unwrap(); + controller.tick().await; + + assert!((controller.fan_pid_integral_sum() - initial_integral).abs() < 1e-3); + } + + #[tokio::test] + async fn should_accumulate_integral_in_cooling_and_throttling_states() { + let (mut controller, _fan_rx, _freq_rx, temp_tx) = create_controller(); + let config = ThermalConfig::default(); + + controller.set_fan_pid_integral_sum(0.0); + controller.set_tick_duration(Duration::from_millis(100)); + + temp_tx.send(Some(NORMAL_THRESHOLD_C + 1.0)).unwrap(); + controller.tick().await; + + let integral_after_cooling = controller.fan_pid_integral_sum(); + assert!(integral_after_cooling < 0.0); + + temp_tx + .send(Some(config.target_temperature_c + 1.0)) + .unwrap(); + controller.tick().await; + + assert!(controller.fan_pid_integral_sum() > integral_after_cooling); + } +} diff --git a/mujina-miner/src/thermal/fan_pid.rs b/mujina-miner/src/thermal/fan_pid.rs new file mode 100644 index 0000000..7fffdc5 --- /dev/null +++ b/mujina-miner/src/thermal/fan_pid.rs @@ -0,0 +1,100 @@ +use std::time::Duration; + +#[derive(Debug, Clone)] +pub struct FanPIDController { + pub proportional_gain: f32, + pub integral_sum: f32, + pub integral_gain: f32, + pub integral_min: f32, + pub integral_max: f32, +} + +impl FanPIDController { + pub fn new( + proportional_gain: f32, + integral_sum: f32, + integral_gain: f32, + integral_min: f32, + integral_max: f32, + ) -> Self { + Self { + proportional_gain, + integral_sum, + integral_gain, + integral_min, + integral_max, + } + } + + pub fn update( + &mut self, + error: f32, + time_since_last_update: Duration, + freeze_integral: bool, + ) -> f32 { + let dt_s = time_since_last_update.as_secs_f32(); + + if !freeze_integral { + self.integral_sum += error * dt_s; + self.integral_sum = self + .integral_sum + .clamp(self.integral_min, self.integral_max); + } + + self.proportional_gain * error + self.integral_gain * self.integral_sum + } + + pub fn reset(&mut self) { + self.integral_sum = 0.0; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn should_combine_proportional_and_integral_terms() { + let mut controller = FanPIDController::new(1.0, 0.5, 2.0, -10.0, 10.0); + + controller.integral_sum = 3.0; + let output = controller.update(5.0, Duration::from_secs(1), true); + assert_eq!(output, 11.0); + } + + #[test] + fn should_accumulate_integral_when_freeze_integral_is_false() { + let mut controller = FanPIDController::new(1.0, 0.5, 0.0, -10.0, 10.0); + + let initial_integral_sum = controller.integral_sum; + controller.update(2.0, Duration::from_secs(1), false); + assert_eq!(controller.integral_sum, initial_integral_sum + 2.0); + } + + #[test] + fn should_not_accumulate_integral_when_freeze_integral_is_true() { + let mut controller = FanPIDController::new(1.0, 0.5, 0.0, -10.0, 10.0); + + let initial_integral_sum = controller.integral_sum; + controller.update(2.0, Duration::from_secs(1), true); + assert_eq!(controller.integral_sum, initial_integral_sum); + } + + #[test] + fn should_clamp_integral_at_max_boundary() { + let mut controller = FanPIDController::new(1.0, 0.5, 0.0, -10.0, 10.0); + + controller.integral_sum = 9.0; + controller.update(5.0, Duration::from_secs(1), false); + assert_eq!(controller.integral_sum, 10.0); + } + + #[test] + fn should_clamp_integral_at_min_boundary() { + let mut controller = FanPIDController::new(1.0, 0.5, 0.0, -10.0, 10.0); + + controller.integral_sum = -9.0; + controller.update(-5.0, Duration::from_secs(1), false); + assert_eq!(controller.integral_sum, -10.0); + } +} diff --git a/mujina-miner/src/thermal/filter.rs b/mujina-miner/src/thermal/filter.rs new file mode 100644 index 0000000..a899ee3 --- /dev/null +++ b/mujina-miner/src/thermal/filter.rs @@ -0,0 +1,175 @@ +use std::collections::VecDeque; + +/// A sliding window filter for temperature readings that rejects noise. +/// +/// Maintains a moving average of recent readings and rejects readings that +/// deviate too much from this average, which helps filter out sensor noise +/// while still responding to genuine temperature changes. +#[derive(Debug, Clone)] +pub struct TemperatureFilter { + window: VecDeque, + window_size: u8, + max_deviation_c: f32, +} + +impl TemperatureFilter { + /// Creates a new temperature filter with the specified window size and maximum deviation. + /// + /// # Arguments + /// * `window_size` - Number of recent readings to maintain in the sliding window + /// * `max_deviation_c` - Maximum allowed deviation from the moving average (in °C) + pub fn new(window_size: u8, max_deviation_c: f32) -> Self { + Self { + window: VecDeque::with_capacity(window_size as usize), + window_size, + max_deviation_c, + } + } + + /// Considers a new reading; returns `Some(temp)` if accepted, `None` if rejected as noise. + /// + /// Readings are rejected if: + /// - They are outside the valid range (-20°C to 100°C) + /// - They deviate more than `max_deviation_c` from the current moving average + /// + /// Valid readings are added to the sliding window and returned. + pub fn consider(&mut self, temp: f32) -> Option { + if !(-20.0..=100.0).contains(&temp) { + return None; + } + + if !self.window.is_empty() { + let avg = self.window.iter().sum::() / self.window.len() as f32; + let deviation = (temp - avg).abs(); + if deviation > self.max_deviation_c { + return None; + } + } + + if self.window.len() == self.window_size as usize { + self.window.pop_front(); + } + + self.window.push_back(temp); + + Some(temp) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn should_accept_valid_reading_when_window_is_empty() { + let mut filter = TemperatureFilter::new(5, 5.0); + + let result = filter.consider(45.0); + + assert_eq!(result, Some(45.0)); + assert_eq!(filter.window.len(), 1); + } + + #[test] + fn should_reject_reading_below_valid_range() { + let mut filter = TemperatureFilter::new(5, 5.0); + + let result = filter.consider(-21.0); + + assert_eq!(result, None); + assert_eq!(filter.window.len(), 0); + } + + #[test] + fn should_reject_reading_above_valid_range() { + let mut filter = TemperatureFilter::new(5, 5.0); + + let result = filter.consider(101.0); + + assert_eq!(result, None); + assert_eq!(filter.window.len(), 0); + } + + #[test] + fn should_accept_reading_at_lower_bound() { + let mut filter = TemperatureFilter::new(5, 5.0); + + let result = filter.consider(-20.0); + + assert_eq!(result, Some(-20.0)); + } + + #[test] + fn should_accept_reading_at_upper_bound() { + let mut filter = TemperatureFilter::new(5, 5.0); + + let result = filter.consider(100.0); + + assert_eq!(result, Some(100.0)); + } + + #[test] + fn should_reject_reading_that_exceeds_max_deviation() { + let mut filter = TemperatureFilter::new(5, 5.0); + + filter.consider(50.0); + filter.consider(51.0); + filter.consider(52.0); + + let result = filter.consider(65.0); + + assert_eq!(result, None); + } + + #[test] + fn should_accept_reading_within_max_deviation() { + let mut filter = TemperatureFilter::new(5, 5.0); + + filter.consider(50.0); + filter.consider(51.0); + filter.consider(52.0); + + let result = filter.consider(54.0); + + assert_eq!(result, Some(54.0)); + } + + #[test] + fn should_maintain_sliding_window_size() { + let mut filter = TemperatureFilter::new(3, 5.0); + + filter.consider(50.0); + filter.consider(51.0); + filter.consider(52.0); + filter.consider(53.0); + + assert_eq!(filter.window.len(), 3); + assert_eq!(*filter.window.front().unwrap(), 51.0); + } + + #[test] + fn should_accept_readings_after_noise_rejection() { + let mut filter = TemperatureFilter::new(5, 5.0); + + filter.consider(50.0); + filter.consider(51.0); + filter.consider(52.0); + filter.consider(65.0); // Rejected + + let result = filter.consider(53.0); + + assert_eq!(result, Some(53.0)); + } + + #[test] + fn should_have_deterministic_behavior_with_known_values() { + let mut filter = TemperatureFilter::new(4, 2.0); + + assert_eq!(filter.consider(50.0), Some(50.0)); + assert_eq!(filter.consider(51.0), Some(51.0)); + assert_eq!(filter.consider(50.5), Some(50.5)); + assert_eq!(filter.consider(51.2), Some(51.2)); + assert_eq!(filter.consider(58.0), None); // Too much deviation + assert_eq!(filter.consider(52.0), Some(52.0)); + } +} diff --git a/mujina-miner/src/thermal/mod.rs b/mujina-miner/src/thermal/mod.rs new file mode 100644 index 0000000..a8f6b63 --- /dev/null +++ b/mujina-miner/src/thermal/mod.rs @@ -0,0 +1,11 @@ +mod config; +mod controller; +mod fan_pid; +mod filter; +mod state; + +pub use config::ThermalConfig; +pub use controller::{FanSpeedCommand, FrequencyCommand, ThermalController}; +pub use fan_pid::FanPIDController; +pub use filter::TemperatureFilter; +pub use state::ThermalState; diff --git a/mujina-miner/src/thermal/state.rs b/mujina-miner/src/thermal/state.rs new file mode 100644 index 0000000..09509d5 --- /dev/null +++ b/mujina-miner/src/thermal/state.rs @@ -0,0 +1,100 @@ +use super::config::ThermalConfig; + +pub(super) const NORMAL_THRESHOLD_C: f32 = 55.0; + +/// Prevents oscillation between thermal states on noisy readings. +pub(super) const HYSTERESIS_C: f32 = 2.0; + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum ThermalState { + NORMAL, + COOLING, + THROTTLING, + CRITICAL, +} + +impl ThermalState { + pub fn from_temperature(temp: f32, previous: ThermalState, config: &ThermalConfig) -> Self { + match previous { + ThermalState::NORMAL => { + if temp > NORMAL_THRESHOLD_C { + ThermalState::COOLING + } else { + ThermalState::NORMAL + } + } + ThermalState::COOLING => { + if temp <= NORMAL_THRESHOLD_C - HYSTERESIS_C { + ThermalState::NORMAL + } else if temp > config.target_temperature_c { + ThermalState::THROTTLING + } else { + ThermalState::COOLING + } + } + ThermalState::THROTTLING => { + if temp <= config.target_temperature_c - HYSTERESIS_C { + ThermalState::COOLING + } else if temp > config.max_temperature_c { + ThermalState::CRITICAL + } else { + ThermalState::THROTTLING + } + } + ThermalState::CRITICAL => { + if temp <= config.max_temperature_c - HYSTERESIS_C { + ThermalState::THROTTLING + } else { + ThermalState::CRITICAL + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::thermal::ThermalConfig; + + #[test] + fn should_be_normal_thermal_state() { + let config = ThermalConfig::default(); + let state = + ThermalState::from_temperature(NORMAL_THRESHOLD_C, ThermalState::NORMAL, &config); + assert_eq!(state, ThermalState::NORMAL); + } + + #[test] + fn should_be_cooling_thermal_state() { + let config = ThermalConfig::default(); + let state = ThermalState::from_temperature( + config.target_temperature_c, + ThermalState::COOLING, + &config, + ); + assert_eq!(state, ThermalState::COOLING); + } + + #[test] + fn should_be_throttling_thermal_state() { + let config = ThermalConfig::default(); + let state = ThermalState::from_temperature( + config.max_temperature_c, + ThermalState::THROTTLING, + &config, + ); + assert_eq!(state, ThermalState::THROTTLING); + } + + #[test] + fn should_be_critical_thermal_state() { + let config = ThermalConfig::default(); + let state = ThermalState::from_temperature( + config.max_temperature_c + 1.0, + ThermalState::CRITICAL, + &config, + ); + assert_eq!(state, ThermalState::CRITICAL); + } +} diff --git a/mujina-miner/src/types/difficulty.rs b/mujina-miner/src/types/difficulty.rs index 6e34fc5..4242141 100644 --- a/mujina-miner/src/types/difficulty.rs +++ b/mujina-miner/src/types/difficulty.rs @@ -243,8 +243,8 @@ mod tests { let diff_a = Difficulty::from(500_u64); let diff_b = Difficulty::from(500_u64); assert_eq!(diff_a, diff_b); - assert!(!(diff_a > diff_b)); - assert!(!(diff_a < diff_b)); + assert!(diff_a <= diff_b); + assert!(diff_a >= diff_b); } #[test]