From 712df2278f0e9ee0ec280cd23f142b5f2f6f251d Mon Sep 17 00:00:00 2001 From: archipelago Date: Tue, 30 Jun 2026 23:21:29 -0400 Subject: [PATCH] feat(mesh): Meshtastic provisioning robustness (backlog #12) Three fixes: 1. Modem-preset authoritative: parse_config_lora_region now also decodes modem_preset (field 2) alongside region, tracked as current_modem_preset. ensure_lora_region's "region already set, don't touch it" branch (correct, unchanged) now ALSO re-asserts LONG_FAST when a real observed preset has drifted -- previously modem_preset only ever got written when region was UNSET, so a radio with the right region but wrong preset was never fixed. Only acts on an actually-observed wrong value (never speculative), so it can't reboot-loop. 2. RX-stall watchdog: run_mesh_session now bails (triggering the existing auto-reconnect path) if no frame has been successfully received in 5 minutes -- the existing consecutive_write_failures counter is blind to a receive-only stall (writes can keep succeeding while inbound streaming is wedged). 3. Hot-swap detection: spawn_mesh_listener now compares self_node_id across session restarts and logs clearly when the physical radio itself changed (not just an ordinary reconnect of the same board). Per-session device state (contacts, current_region, etc.) was already naturally isolated per-session (fresh struct each reconnect) -- nothing else needed clearing. 107/107 mesh tests pass (2 new: modem_preset decode + the absent-field-defaults-to-LONG_FAST case). Co-Authored-By: Claude Sonnet 5 --- core/archipelago/src/mesh/listener/mod.rs | 38 +++++++++ core/archipelago/src/mesh/listener/session.rs | 23 +++++- core/archipelago/src/mesh/meshtastic.rs | 82 +++++++++++++++++-- 3 files changed, 131 insertions(+), 12 deletions(-) diff --git a/core/archipelago/src/mesh/listener/mod.rs b/core/archipelago/src/mesh/listener/mod.rs index fdf61eb6..1bec5e39 100644 --- a/core/archipelago/src/mesh/listener/mod.rs +++ b/core/archipelago/src/mesh/listener/mod.rs @@ -28,6 +28,16 @@ const ADVERT_INTERVAL: Duration = Duration::from_secs(60); /// How often to poll for queued messages when no push notifications. const SYNC_INTERVAL: Duration = Duration::from_secs(10); +/// Backlog #12 (provisioning robustness): if we haven't successfully received +/// ANY frame in this long, treat the serial link as stalled and force a +/// reconnect — the write-side `consecutive_write_failures` counter is blind +/// to a receive-only stall (writes can keep succeeding while the radio's +/// stopped streaming inbound, e.g. the FROM_RADIO_REBOOTED-without-recovery +/// case this same file already has a targeted fix for). 5 minutes is well +/// above the existing 60s advert / 10s sync cadence, so a healthy link never +/// trips it. +const RX_STALL_TIMEOUT: Duration = Duration::from_secs(300); + /// Maximum stored messages (circular buffer). const MAX_MESSAGES: usize = 100; @@ -409,6 +419,15 @@ pub fn spawn_mesh_listener( let mut shutdown = shutdown; let mut cmd_rx = cmd_rx; let mut reconnect_delay = RECONNECT_DELAY_INIT; + // Backlog #12 hot-swap re-binding: each run_mesh_session call already + // builds a fresh device struct (contacts/current_region/etc. all + // start empty), so per-device session state is naturally isolated + // across reconnects — there's no stale in-memory state to clear here. + // What's worth doing is detecting when the *physical radio itself* + // changed (a genuine hot-swap, not just the same radio reconnecting) + // so it's visible in logs rather than silently treated the same as + // an ordinary reconnect. + let mut last_self_node_id: Option = None; loop { if *shutdown.borrow() { info!("Mesh listener shutting down"); @@ -447,6 +466,25 @@ pub fn spawn_mesh_listener( } } + // Hot-swap detection: compare this session's self_node_id against + // the last one we saw. A change means the physical radio itself + // was swapped (not just a reconnect of the same board). + { + let current_self_node_id = state.status.read().await.self_node_id; + if let (Some(prev), Some(cur)) = (last_self_node_id, current_self_node_id) { + if prev != cur { + info!( + previous_node_id = prev, + new_node_id = cur, + "Local mesh radio identity changed — treating as a hot-swapped device" + ); + } + } + if current_self_node_id.is_some() { + last_self_node_id = current_self_node_id; + } + } + // Update status to disconnected { let mut status = state.status.write().await; diff --git a/core/archipelago/src/mesh/listener/session.rs b/core/archipelago/src/mesh/listener/session.rs index 41c69d6f..5b1f2f5a 100644 --- a/core/archipelago/src/mesh/listener/session.rs +++ b/core/archipelago/src/mesh/listener/session.rs @@ -6,12 +6,12 @@ use super::super::serial::MeshcoreDevice; use super::super::types::*; use super::{ dispatch, frames, MeshCommand, MeshState, ADVERT_INTERVAL, MAX_CONSECUTIVE_WRITE_FAILURES, - SYNC_INTERVAL, + RX_STALL_TIMEOUT, SYNC_INTERVAL, }; use anyhow::{Context, Result}; use std::path::Path; use std::sync::Arc; -use std::time::Duration; +use std::time::{Duration, Instant}; use tokio::sync::mpsc; use tracing::{debug, error, info, warn}; @@ -950,6 +950,11 @@ pub(super) async fn run_mesh_session( advert_timer.tick().await; // skip first immediate tick sync_timer.tick().await; let mut consecutive_write_failures: u32 = 0; + // Backlog #12 RX-stall watchdog — see RX_STALL_TIMEOUT's doc comment. + // Reset on the very first frame check too (not just successful reads), + // so a session that never receives anything still gets a full timeout + // window from startup rather than an immediately-stale clock. + let mut last_rx_at = Instant::now(); loop { // If too many consecutive writes have failed, the serial port is dead — @@ -964,14 +969,26 @@ pub(super) async fn run_mesh_session( consecutive_write_failures ); } + if last_rx_at.elapsed() >= RX_STALL_TIMEOUT { + error!( + stalled_for_secs = last_rx_at.elapsed().as_secs(), + "No mesh frames received for too long — triggering reconnection" + ); + anyhow::bail!( + "RX stalled for over {}s — forcing reconnect", + RX_STALL_TIMEOUT.as_secs() + ); + } tokio::select! { // Check for incoming frames frame_result = device.try_recv_frame() => { match frame_result { Ok(Some(frame)) => { - // Successful read resets the failure counter + // Successful read resets the failure counter and the + // RX-stall watchdog. consecutive_write_failures = 0; + last_rx_at = Instant::now(); // For meshtastic, the PKI-E2E status of this frame can't // ride the synthetic meshcore frame — snapshot the message // id high-water mark, dispatch, then stamp the E2E pill on diff --git a/core/archipelago/src/mesh/meshtastic.rs b/core/archipelago/src/mesh/meshtastic.rs index 1a33c814..2692e483 100644 --- a/core/archipelago/src/mesh/meshtastic.rs +++ b/core/archipelago/src/mesh/meshtastic.rs @@ -132,6 +132,14 @@ pub struct MeshtasticDevice { /// provision the operator-configured region — and to avoid a reboot loop by /// only writing when it actually differs. current_region: Option, + /// The radio's currently-configured LoRa modem preset, learned alongside + /// `current_region` from the same `Config.lora` block. Unlike region + /// (deliberately never overridden once set), this SHOULD be authoritative + /// -- two archy radios with mismatched presets (different spreading + /// factor/bandwidth) can't decode each other even on the correct region. + /// `None` until a Config frame is seen -- never write speculatively before + /// we've actually observed the radio's real value. + current_modem_preset: Option, /// The radio's current PRIMARY channel as `(name, psk)`, learned from the /// `Channel` blocks during `initialize`. Two radios only decode each other /// when their primary channel (name + psk → channel hash) matches, so archy @@ -184,6 +192,7 @@ impl MeshtasticDevice { contacts: HashMap::new(), peer_pubkeys: HashMap::new(), current_region: None, + current_modem_preset: None, current_primary_channel: None, current_secondary_channel: None, device_path: path.to_string(), @@ -348,7 +357,24 @@ impl MeshtasticDevice { "Respecting the radio's own LoRa region (not overriding with the configured one)" ); } - Ok(false) + // Region is untouched either way, but the modem preset IS + // authoritative (backlog #12): two archy radios on mismatched + // presets can't decode each other even on the same region. + // Only acts once we've actually observed a real (non-None) + // preset that's wrong — never speculative, so this can't + // reboot-loop (the write sets LONG_FAST, the next Config + // frame confirms it, and this branch goes quiet). + match self.current_modem_preset { + Some(p) if p != LORA_MODEM_PRESET_LONG_FAST as u32 => { + debug!( + device_preset = p, + "Modem preset drifted from LONG_FAST — re-provisioning (region unchanged)" + ); + self.set_lora_region(cur).await?; + Ok(true) + } + _ => Ok(false), + } } // Region is UNSET → a fresh radio is RF-silent and can't mesh at all. // Set the operator-configured region so it can transmit/receive. @@ -855,9 +881,10 @@ impl MeshtasticDevice { FROM_RADIO_CONFIG => { // Only the LoRa sub-config carries a region; other Config // variants (device/position/…) return None and are ignored. - if let Some(region) = parse_config_lora_region(value) { + if let Some((region, modem_preset)) = parse_config_lora_region(value) { self.current_region = Some(region); - debug!(region, "Meshtastic LoRa region from device config"); + self.current_modem_preset = Some(modem_preset); + debug!(region, modem_preset, "Meshtastic LoRa region/preset from device config"); } None } @@ -1182,7 +1209,12 @@ fn encode_heartbeat() -> Vec { /// code. Returns `Some(REGION_UNSET)` when the LoRa block is present but has no /// region field (a fresh radio), and `None` when this Config carries a /// non-LoRa variant (device/position/…) so the caller keeps the prior value. -fn parse_config_lora_region(data: &[u8]) -> Option { +/// Returns `(region, modem_preset)` from a `Config.lora` block. `modem_preset` +/// defaults to `LORA_MODEM_PRESET_LONG_FAST` when the field is absent — a +/// fresh/never-configured radio reports no preset field at all, and treating +/// that as "already correct" (rather than "unknown, needs fixing") avoids a +/// spurious reboot before the operator-region provisioning step even runs. +fn parse_config_lora_region(data: &[u8]) -> Option<(u32, u32)> { let mut idx = 0; while idx < data.len() { let (field, value, next) = next_field(data, idx)?; @@ -1191,16 +1223,17 @@ fn parse_config_lora_region(data: &[u8]) -> Option { if let FieldValue::Bytes(b) = value { let mut j = 0; let mut region = REGION_UNSET; + let mut modem_preset = LORA_MODEM_PRESET_LONG_FAST as u32; while j < b.len() { let (lf, lv, ln) = next_field(b, j)?; j = ln; - if lf == LORA_REGION_FIELD { - if let FieldValue::Varint(v) = lv { - region = v as u32; - } + match (lf, lv) { + (LORA_REGION_FIELD, FieldValue::Varint(v)) => region = v as u32, + (LORA_MODEM_PRESET_FIELD, FieldValue::Varint(v)) => modem_preset = v as u32, + _ => {} } } - return Some(region); + return Some((region, modem_preset)); } } } @@ -1803,6 +1836,37 @@ mod tests { assert!(parse_position_lat_lon(&position).is_none()); } + #[test] + fn parse_config_lora_region_decodes_region_and_modem_preset() { + let mut lora = Vec::new(); + encode_varint_field_into(LORA_REGION_FIELD, 3, &mut lora); // some real region code + encode_varint_field_into(LORA_MODEM_PRESET_FIELD, 4, &mut lora); // drifted preset + + let mut config = Vec::new(); + encode_len_field(CONFIG_LORA_FIELD, &lora, &mut config); + + let (region, preset) = parse_config_lora_region(&config).expect("lora config present"); + assert_eq!(region, 3); + assert_eq!(preset, 4); + } + + #[test] + fn parse_config_lora_region_defaults_preset_to_long_fast_when_absent() { + // A fresh/never-provisioned radio's Config.lora may carry a region + // with no modem_preset field at all -- must default to "already + // correct" (LONG_FAST), not "unknown/wrong", or ensure_lora_region + // would try to fix a value it never actually observed. + let mut lora = Vec::new(); + encode_varint_field_into(LORA_REGION_FIELD, 3, &mut lora); + + let mut config = Vec::new(); + encode_len_field(CONFIG_LORA_FIELD, &lora, &mut config); + + let (region, preset) = parse_config_lora_region(&config).expect("lora config present"); + assert_eq!(region, 3); + assert_eq!(preset, LORA_MODEM_PRESET_LONG_FAST as u32); + } + #[test] fn packet_to_inbound_frame_updates_contact_signal_and_position_without_a_chat_frame() { let from = 0x0000_4444;