feat(mesh): Meshtastic provisioning robustness (backlog #12)

Three fixes:
1. Modem-preset authoritative: parse_config_lora_region now also decodes
   modem_preset (field 2) alongside region, tracked as current_modem_preset.
   ensure_lora_region's "region already set, don't touch it" branch (correct,
   unchanged) now ALSO re-asserts LONG_FAST when a real observed preset has
   drifted -- previously modem_preset only ever got written when region was
   UNSET, so a radio with the right region but wrong preset was never fixed.
   Only acts on an actually-observed wrong value (never speculative), so it
   can't reboot-loop.
2. RX-stall watchdog: run_mesh_session now bails (triggering the existing
   auto-reconnect path) if no frame has been successfully received in 5
   minutes -- the existing consecutive_write_failures counter is blind to a
   receive-only stall (writes can keep succeeding while inbound streaming is
   wedged).
3. Hot-swap detection: spawn_mesh_listener now compares self_node_id across
   session restarts and logs clearly when the physical radio itself changed
   (not just an ordinary reconnect of the same board). Per-session device
   state (contacts, current_region, etc.) was already naturally isolated
   per-session (fresh struct each reconnect) -- nothing else needed clearing.

107/107 mesh tests pass (2 new: modem_preset decode + the
absent-field-defaults-to-LONG_FAST case).

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-30 23:21:29 -04:00
parent 494f272815
commit 712df2278f
3 changed files with 131 additions and 12 deletions

View File

@ -28,6 +28,16 @@ const ADVERT_INTERVAL: Duration = Duration::from_secs(60);
/// How often to poll for queued messages when no push notifications.
const SYNC_INTERVAL: Duration = Duration::from_secs(10);
/// Backlog #12 (provisioning robustness): if we haven't successfully received
/// ANY frame in this long, treat the serial link as stalled and force a
/// reconnect — the write-side `consecutive_write_failures` counter is blind
/// to a receive-only stall (writes can keep succeeding while the radio's
/// stopped streaming inbound, e.g. the FROM_RADIO_REBOOTED-without-recovery
/// case this same file already has a targeted fix for). 5 minutes is well
/// above the existing 60s advert / 10s sync cadence, so a healthy link never
/// trips it.
const RX_STALL_TIMEOUT: Duration = Duration::from_secs(300);
/// Maximum stored messages (circular buffer).
const MAX_MESSAGES: usize = 100;
@ -409,6 +419,15 @@ pub fn spawn_mesh_listener(
let mut shutdown = shutdown;
let mut cmd_rx = cmd_rx;
let mut reconnect_delay = RECONNECT_DELAY_INIT;
// Backlog #12 hot-swap re-binding: each run_mesh_session call already
// builds a fresh device struct (contacts/current_region/etc. all
// start empty), so per-device session state is naturally isolated
// across reconnects — there's no stale in-memory state to clear here.
// What's worth doing is detecting when the *physical radio itself*
// changed (a genuine hot-swap, not just the same radio reconnecting)
// so it's visible in logs rather than silently treated the same as
// an ordinary reconnect.
let mut last_self_node_id: Option<u32> = None;
loop {
if *shutdown.borrow() {
info!("Mesh listener shutting down");
@ -447,6 +466,25 @@ pub fn spawn_mesh_listener(
}
}
// Hot-swap detection: compare this session's self_node_id against
// the last one we saw. A change means the physical radio itself
// was swapped (not just a reconnect of the same board).
{
let current_self_node_id = state.status.read().await.self_node_id;
if let (Some(prev), Some(cur)) = (last_self_node_id, current_self_node_id) {
if prev != cur {
info!(
previous_node_id = prev,
new_node_id = cur,
"Local mesh radio identity changed — treating as a hot-swapped device"
);
}
}
if current_self_node_id.is_some() {
last_self_node_id = current_self_node_id;
}
}
// Update status to disconnected
{
let mut status = state.status.write().await;

View File

@ -6,12 +6,12 @@ use super::super::serial::MeshcoreDevice;
use super::super::types::*;
use super::{
dispatch, frames, MeshCommand, MeshState, ADVERT_INTERVAL, MAX_CONSECUTIVE_WRITE_FAILURES,
SYNC_INTERVAL,
RX_STALL_TIMEOUT, SYNC_INTERVAL,
};
use anyhow::{Context, Result};
use std::path::Path;
use std::sync::Arc;
use std::time::Duration;
use std::time::{Duration, Instant};
use tokio::sync::mpsc;
use tracing::{debug, error, info, warn};
@ -950,6 +950,11 @@ pub(super) async fn run_mesh_session(
advert_timer.tick().await; // skip first immediate tick
sync_timer.tick().await;
let mut consecutive_write_failures: u32 = 0;
// Backlog #12 RX-stall watchdog — see RX_STALL_TIMEOUT's doc comment.
// Reset on the very first frame check too (not just successful reads),
// so a session that never receives anything still gets a full timeout
// window from startup rather than an immediately-stale clock.
let mut last_rx_at = Instant::now();
loop {
// If too many consecutive writes have failed, the serial port is dead —
@ -964,14 +969,26 @@ pub(super) async fn run_mesh_session(
consecutive_write_failures
);
}
if last_rx_at.elapsed() >= RX_STALL_TIMEOUT {
error!(
stalled_for_secs = last_rx_at.elapsed().as_secs(),
"No mesh frames received for too long — triggering reconnection"
);
anyhow::bail!(
"RX stalled for over {}s — forcing reconnect",
RX_STALL_TIMEOUT.as_secs()
);
}
tokio::select! {
// Check for incoming frames
frame_result = device.try_recv_frame() => {
match frame_result {
Ok(Some(frame)) => {
// Successful read resets the failure counter
// Successful read resets the failure counter and the
// RX-stall watchdog.
consecutive_write_failures = 0;
last_rx_at = Instant::now();
// For meshtastic, the PKI-E2E status of this frame can't
// ride the synthetic meshcore frame — snapshot the message
// id high-water mark, dispatch, then stamp the E2E pill on

View File

@ -132,6 +132,14 @@ pub struct MeshtasticDevice {
/// provision the operator-configured region — and to avoid a reboot loop by
/// only writing when it actually differs.
current_region: Option<u32>,
/// The radio's currently-configured LoRa modem preset, learned alongside
/// `current_region` from the same `Config.lora` block. Unlike region
/// (deliberately never overridden once set), this SHOULD be authoritative
/// -- two archy radios with mismatched presets (different spreading
/// factor/bandwidth) can't decode each other even on the correct region.
/// `None` until a Config frame is seen -- never write speculatively before
/// we've actually observed the radio's real value.
current_modem_preset: Option<u32>,
/// The radio's current PRIMARY channel as `(name, psk)`, learned from the
/// `Channel` blocks during `initialize`. Two radios only decode each other
/// when their primary channel (name + psk → channel hash) matches, so archy
@ -184,6 +192,7 @@ impl MeshtasticDevice {
contacts: HashMap::new(),
peer_pubkeys: HashMap::new(),
current_region: None,
current_modem_preset: None,
current_primary_channel: None,
current_secondary_channel: None,
device_path: path.to_string(),
@ -348,7 +357,24 @@ impl MeshtasticDevice {
"Respecting the radio's own LoRa region (not overriding with the configured one)"
);
}
Ok(false)
// Region is untouched either way, but the modem preset IS
// authoritative (backlog #12): two archy radios on mismatched
// presets can't decode each other even on the same region.
// Only acts once we've actually observed a real (non-None)
// preset that's wrong — never speculative, so this can't
// reboot-loop (the write sets LONG_FAST, the next Config
// frame confirms it, and this branch goes quiet).
match self.current_modem_preset {
Some(p) if p != LORA_MODEM_PRESET_LONG_FAST as u32 => {
debug!(
device_preset = p,
"Modem preset drifted from LONG_FAST — re-provisioning (region unchanged)"
);
self.set_lora_region(cur).await?;
Ok(true)
}
_ => Ok(false),
}
}
// Region is UNSET → a fresh radio is RF-silent and can't mesh at all.
// Set the operator-configured region so it can transmit/receive.
@ -855,9 +881,10 @@ impl MeshtasticDevice {
FROM_RADIO_CONFIG => {
// Only the LoRa sub-config carries a region; other Config
// variants (device/position/…) return None and are ignored.
if let Some(region) = parse_config_lora_region(value) {
if let Some((region, modem_preset)) = parse_config_lora_region(value) {
self.current_region = Some(region);
debug!(region, "Meshtastic LoRa region from device config");
self.current_modem_preset = Some(modem_preset);
debug!(region, modem_preset, "Meshtastic LoRa region/preset from device config");
}
None
}
@ -1182,7 +1209,12 @@ fn encode_heartbeat() -> Vec<u8> {
/// code. Returns `Some(REGION_UNSET)` when the LoRa block is present but has no
/// region field (a fresh radio), and `None` when this Config carries a
/// non-LoRa variant (device/position/…) so the caller keeps the prior value.
fn parse_config_lora_region(data: &[u8]) -> Option<u32> {
/// Returns `(region, modem_preset)` from a `Config.lora` block. `modem_preset`
/// defaults to `LORA_MODEM_PRESET_LONG_FAST` when the field is absent — a
/// fresh/never-configured radio reports no preset field at all, and treating
/// that as "already correct" (rather than "unknown, needs fixing") avoids a
/// spurious reboot before the operator-region provisioning step even runs.
fn parse_config_lora_region(data: &[u8]) -> Option<(u32, u32)> {
let mut idx = 0;
while idx < data.len() {
let (field, value, next) = next_field(data, idx)?;
@ -1191,16 +1223,17 @@ fn parse_config_lora_region(data: &[u8]) -> Option<u32> {
if let FieldValue::Bytes(b) = value {
let mut j = 0;
let mut region = REGION_UNSET;
let mut modem_preset = LORA_MODEM_PRESET_LONG_FAST as u32;
while j < b.len() {
let (lf, lv, ln) = next_field(b, j)?;
j = ln;
if lf == LORA_REGION_FIELD {
if let FieldValue::Varint(v) = lv {
region = v as u32;
}
match (lf, lv) {
(LORA_REGION_FIELD, FieldValue::Varint(v)) => region = v as u32,
(LORA_MODEM_PRESET_FIELD, FieldValue::Varint(v)) => modem_preset = v as u32,
_ => {}
}
}
return Some(region);
return Some((region, modem_preset));
}
}
}
@ -1803,6 +1836,37 @@ mod tests {
assert!(parse_position_lat_lon(&position).is_none());
}
#[test]
fn parse_config_lora_region_decodes_region_and_modem_preset() {
let mut lora = Vec::new();
encode_varint_field_into(LORA_REGION_FIELD, 3, &mut lora); // some real region code
encode_varint_field_into(LORA_MODEM_PRESET_FIELD, 4, &mut lora); // drifted preset
let mut config = Vec::new();
encode_len_field(CONFIG_LORA_FIELD, &lora, &mut config);
let (region, preset) = parse_config_lora_region(&config).expect("lora config present");
assert_eq!(region, 3);
assert_eq!(preset, 4);
}
#[test]
fn parse_config_lora_region_defaults_preset_to_long_fast_when_absent() {
// A fresh/never-provisioned radio's Config.lora may carry a region
// with no modem_preset field at all -- must default to "already
// correct" (LONG_FAST), not "unknown/wrong", or ensure_lora_region
// would try to fix a value it never actually observed.
let mut lora = Vec::new();
encode_varint_field_into(LORA_REGION_FIELD, 3, &mut lora);
let mut config = Vec::new();
encode_len_field(CONFIG_LORA_FIELD, &lora, &mut config);
let (region, preset) = parse_config_lora_region(&config).expect("lora config present");
assert_eq!(region, 3);
assert_eq!(preset, LORA_MODEM_PRESET_LONG_FAST as u32);
}
#[test]
fn packet_to_inbound_frame_updates_contact_signal_and_position_without_a_chat_frame() {
let from = 0x0000_4444;