feat(mesh): Meshtastic provisioning robustness (backlog #12)
Three fixes: 1. Modem-preset authoritative: parse_config_lora_region now also decodes modem_preset (field 2) alongside region, tracked as current_modem_preset. ensure_lora_region's "region already set, don't touch it" branch (correct, unchanged) now ALSO re-asserts LONG_FAST when a real observed preset has drifted -- previously modem_preset only ever got written when region was UNSET, so a radio with the right region but wrong preset was never fixed. Only acts on an actually-observed wrong value (never speculative), so it can't reboot-loop. 2. RX-stall watchdog: run_mesh_session now bails (triggering the existing auto-reconnect path) if no frame has been successfully received in 5 minutes -- the existing consecutive_write_failures counter is blind to a receive-only stall (writes can keep succeeding while inbound streaming is wedged). 3. Hot-swap detection: spawn_mesh_listener now compares self_node_id across session restarts and logs clearly when the physical radio itself changed (not just an ordinary reconnect of the same board). Per-session device state (contacts, current_region, etc.) was already naturally isolated per-session (fresh struct each reconnect) -- nothing else needed clearing. 107/107 mesh tests pass (2 new: modem_preset decode + the absent-field-defaults-to-LONG_FAST case). Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
parent
494f272815
commit
712df2278f
@ -28,6 +28,16 @@ const ADVERT_INTERVAL: Duration = Duration::from_secs(60);
|
||||
/// How often to poll for queued messages when no push notifications.
|
||||
const SYNC_INTERVAL: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Backlog #12 (provisioning robustness): if we haven't successfully received
|
||||
/// ANY frame in this long, treat the serial link as stalled and force a
|
||||
/// reconnect — the write-side `consecutive_write_failures` counter is blind
|
||||
/// to a receive-only stall (writes can keep succeeding while the radio's
|
||||
/// stopped streaming inbound, e.g. the FROM_RADIO_REBOOTED-without-recovery
|
||||
/// case this same file already has a targeted fix for). 5 minutes is well
|
||||
/// above the existing 60s advert / 10s sync cadence, so a healthy link never
|
||||
/// trips it.
|
||||
const RX_STALL_TIMEOUT: Duration = Duration::from_secs(300);
|
||||
|
||||
/// Maximum stored messages (circular buffer).
|
||||
const MAX_MESSAGES: usize = 100;
|
||||
|
||||
@ -409,6 +419,15 @@ pub fn spawn_mesh_listener(
|
||||
let mut shutdown = shutdown;
|
||||
let mut cmd_rx = cmd_rx;
|
||||
let mut reconnect_delay = RECONNECT_DELAY_INIT;
|
||||
// Backlog #12 hot-swap re-binding: each run_mesh_session call already
|
||||
// builds a fresh device struct (contacts/current_region/etc. all
|
||||
// start empty), so per-device session state is naturally isolated
|
||||
// across reconnects — there's no stale in-memory state to clear here.
|
||||
// What's worth doing is detecting when the *physical radio itself*
|
||||
// changed (a genuine hot-swap, not just the same radio reconnecting)
|
||||
// so it's visible in logs rather than silently treated the same as
|
||||
// an ordinary reconnect.
|
||||
let mut last_self_node_id: Option<u32> = None;
|
||||
loop {
|
||||
if *shutdown.borrow() {
|
||||
info!("Mesh listener shutting down");
|
||||
@ -447,6 +466,25 @@ pub fn spawn_mesh_listener(
|
||||
}
|
||||
}
|
||||
|
||||
// Hot-swap detection: compare this session's self_node_id against
|
||||
// the last one we saw. A change means the physical radio itself
|
||||
// was swapped (not just a reconnect of the same board).
|
||||
{
|
||||
let current_self_node_id = state.status.read().await.self_node_id;
|
||||
if let (Some(prev), Some(cur)) = (last_self_node_id, current_self_node_id) {
|
||||
if prev != cur {
|
||||
info!(
|
||||
previous_node_id = prev,
|
||||
new_node_id = cur,
|
||||
"Local mesh radio identity changed — treating as a hot-swapped device"
|
||||
);
|
||||
}
|
||||
}
|
||||
if current_self_node_id.is_some() {
|
||||
last_self_node_id = current_self_node_id;
|
||||
}
|
||||
}
|
||||
|
||||
// Update status to disconnected
|
||||
{
|
||||
let mut status = state.status.write().await;
|
||||
|
||||
@ -6,12 +6,12 @@ use super::super::serial::MeshcoreDevice;
|
||||
use super::super::types::*;
|
||||
use super::{
|
||||
dispatch, frames, MeshCommand, MeshState, ADVERT_INTERVAL, MAX_CONSECUTIVE_WRITE_FAILURES,
|
||||
SYNC_INTERVAL,
|
||||
RX_STALL_TIMEOUT, SYNC_INTERVAL,
|
||||
};
|
||||
use anyhow::{Context, Result};
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
@ -950,6 +950,11 @@ pub(super) async fn run_mesh_session(
|
||||
advert_timer.tick().await; // skip first immediate tick
|
||||
sync_timer.tick().await;
|
||||
let mut consecutive_write_failures: u32 = 0;
|
||||
// Backlog #12 RX-stall watchdog — see RX_STALL_TIMEOUT's doc comment.
|
||||
// Reset on the very first frame check too (not just successful reads),
|
||||
// so a session that never receives anything still gets a full timeout
|
||||
// window from startup rather than an immediately-stale clock.
|
||||
let mut last_rx_at = Instant::now();
|
||||
|
||||
loop {
|
||||
// If too many consecutive writes have failed, the serial port is dead —
|
||||
@ -964,14 +969,26 @@ pub(super) async fn run_mesh_session(
|
||||
consecutive_write_failures
|
||||
);
|
||||
}
|
||||
if last_rx_at.elapsed() >= RX_STALL_TIMEOUT {
|
||||
error!(
|
||||
stalled_for_secs = last_rx_at.elapsed().as_secs(),
|
||||
"No mesh frames received for too long — triggering reconnection"
|
||||
);
|
||||
anyhow::bail!(
|
||||
"RX stalled for over {}s — forcing reconnect",
|
||||
RX_STALL_TIMEOUT.as_secs()
|
||||
);
|
||||
}
|
||||
|
||||
tokio::select! {
|
||||
// Check for incoming frames
|
||||
frame_result = device.try_recv_frame() => {
|
||||
match frame_result {
|
||||
Ok(Some(frame)) => {
|
||||
// Successful read resets the failure counter
|
||||
// Successful read resets the failure counter and the
|
||||
// RX-stall watchdog.
|
||||
consecutive_write_failures = 0;
|
||||
last_rx_at = Instant::now();
|
||||
// For meshtastic, the PKI-E2E status of this frame can't
|
||||
// ride the synthetic meshcore frame — snapshot the message
|
||||
// id high-water mark, dispatch, then stamp the E2E pill on
|
||||
|
||||
@ -132,6 +132,14 @@ pub struct MeshtasticDevice {
|
||||
/// provision the operator-configured region — and to avoid a reboot loop by
|
||||
/// only writing when it actually differs.
|
||||
current_region: Option<u32>,
|
||||
/// The radio's currently-configured LoRa modem preset, learned alongside
|
||||
/// `current_region` from the same `Config.lora` block. Unlike region
|
||||
/// (deliberately never overridden once set), this SHOULD be authoritative
|
||||
/// -- two archy radios with mismatched presets (different spreading
|
||||
/// factor/bandwidth) can't decode each other even on the correct region.
|
||||
/// `None` until a Config frame is seen -- never write speculatively before
|
||||
/// we've actually observed the radio's real value.
|
||||
current_modem_preset: Option<u32>,
|
||||
/// The radio's current PRIMARY channel as `(name, psk)`, learned from the
|
||||
/// `Channel` blocks during `initialize`. Two radios only decode each other
|
||||
/// when their primary channel (name + psk → channel hash) matches, so archy
|
||||
@ -184,6 +192,7 @@ impl MeshtasticDevice {
|
||||
contacts: HashMap::new(),
|
||||
peer_pubkeys: HashMap::new(),
|
||||
current_region: None,
|
||||
current_modem_preset: None,
|
||||
current_primary_channel: None,
|
||||
current_secondary_channel: None,
|
||||
device_path: path.to_string(),
|
||||
@ -348,7 +357,24 @@ impl MeshtasticDevice {
|
||||
"Respecting the radio's own LoRa region (not overriding with the configured one)"
|
||||
);
|
||||
}
|
||||
Ok(false)
|
||||
// Region is untouched either way, but the modem preset IS
|
||||
// authoritative (backlog #12): two archy radios on mismatched
|
||||
// presets can't decode each other even on the same region.
|
||||
// Only acts once we've actually observed a real (non-None)
|
||||
// preset that's wrong — never speculative, so this can't
|
||||
// reboot-loop (the write sets LONG_FAST, the next Config
|
||||
// frame confirms it, and this branch goes quiet).
|
||||
match self.current_modem_preset {
|
||||
Some(p) if p != LORA_MODEM_PRESET_LONG_FAST as u32 => {
|
||||
debug!(
|
||||
device_preset = p,
|
||||
"Modem preset drifted from LONG_FAST — re-provisioning (region unchanged)"
|
||||
);
|
||||
self.set_lora_region(cur).await?;
|
||||
Ok(true)
|
||||
}
|
||||
_ => Ok(false),
|
||||
}
|
||||
}
|
||||
// Region is UNSET → a fresh radio is RF-silent and can't mesh at all.
|
||||
// Set the operator-configured region so it can transmit/receive.
|
||||
@ -855,9 +881,10 @@ impl MeshtasticDevice {
|
||||
FROM_RADIO_CONFIG => {
|
||||
// Only the LoRa sub-config carries a region; other Config
|
||||
// variants (device/position/…) return None and are ignored.
|
||||
if let Some(region) = parse_config_lora_region(value) {
|
||||
if let Some((region, modem_preset)) = parse_config_lora_region(value) {
|
||||
self.current_region = Some(region);
|
||||
debug!(region, "Meshtastic LoRa region from device config");
|
||||
self.current_modem_preset = Some(modem_preset);
|
||||
debug!(region, modem_preset, "Meshtastic LoRa region/preset from device config");
|
||||
}
|
||||
None
|
||||
}
|
||||
@ -1182,7 +1209,12 @@ fn encode_heartbeat() -> Vec<u8> {
|
||||
/// code. Returns `Some(REGION_UNSET)` when the LoRa block is present but has no
|
||||
/// region field (a fresh radio), and `None` when this Config carries a
|
||||
/// non-LoRa variant (device/position/…) so the caller keeps the prior value.
|
||||
fn parse_config_lora_region(data: &[u8]) -> Option<u32> {
|
||||
/// Returns `(region, modem_preset)` from a `Config.lora` block. `modem_preset`
|
||||
/// defaults to `LORA_MODEM_PRESET_LONG_FAST` when the field is absent — a
|
||||
/// fresh/never-configured radio reports no preset field at all, and treating
|
||||
/// that as "already correct" (rather than "unknown, needs fixing") avoids a
|
||||
/// spurious reboot before the operator-region provisioning step even runs.
|
||||
fn parse_config_lora_region(data: &[u8]) -> Option<(u32, u32)> {
|
||||
let mut idx = 0;
|
||||
while idx < data.len() {
|
||||
let (field, value, next) = next_field(data, idx)?;
|
||||
@ -1191,16 +1223,17 @@ fn parse_config_lora_region(data: &[u8]) -> Option<u32> {
|
||||
if let FieldValue::Bytes(b) = value {
|
||||
let mut j = 0;
|
||||
let mut region = REGION_UNSET;
|
||||
let mut modem_preset = LORA_MODEM_PRESET_LONG_FAST as u32;
|
||||
while j < b.len() {
|
||||
let (lf, lv, ln) = next_field(b, j)?;
|
||||
j = ln;
|
||||
if lf == LORA_REGION_FIELD {
|
||||
if let FieldValue::Varint(v) = lv {
|
||||
region = v as u32;
|
||||
}
|
||||
match (lf, lv) {
|
||||
(LORA_REGION_FIELD, FieldValue::Varint(v)) => region = v as u32,
|
||||
(LORA_MODEM_PRESET_FIELD, FieldValue::Varint(v)) => modem_preset = v as u32,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
return Some(region);
|
||||
return Some((region, modem_preset));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1803,6 +1836,37 @@ mod tests {
|
||||
assert!(parse_position_lat_lon(&position).is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_config_lora_region_decodes_region_and_modem_preset() {
|
||||
let mut lora = Vec::new();
|
||||
encode_varint_field_into(LORA_REGION_FIELD, 3, &mut lora); // some real region code
|
||||
encode_varint_field_into(LORA_MODEM_PRESET_FIELD, 4, &mut lora); // drifted preset
|
||||
|
||||
let mut config = Vec::new();
|
||||
encode_len_field(CONFIG_LORA_FIELD, &lora, &mut config);
|
||||
|
||||
let (region, preset) = parse_config_lora_region(&config).expect("lora config present");
|
||||
assert_eq!(region, 3);
|
||||
assert_eq!(preset, 4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_config_lora_region_defaults_preset_to_long_fast_when_absent() {
|
||||
// A fresh/never-provisioned radio's Config.lora may carry a region
|
||||
// with no modem_preset field at all -- must default to "already
|
||||
// correct" (LONG_FAST), not "unknown/wrong", or ensure_lora_region
|
||||
// would try to fix a value it never actually observed.
|
||||
let mut lora = Vec::new();
|
||||
encode_varint_field_into(LORA_REGION_FIELD, 3, &mut lora);
|
||||
|
||||
let mut config = Vec::new();
|
||||
encode_len_field(CONFIG_LORA_FIELD, &lora, &mut config);
|
||||
|
||||
let (region, preset) = parse_config_lora_region(&config).expect("lora config present");
|
||||
assert_eq!(region, 3);
|
||||
assert_eq!(preset, LORA_MODEM_PRESET_LONG_FAST as u32);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn packet_to_inbound_frame_updates_contact_signal_and_position_without_a_chat_frame() {
|
||||
let from = 0x0000_4444;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user