fix(robustness): surface swallowed persistence-write failures + federation tombstone durability

§C of the 1.8.0 hardening plan: persistence writes whose Results were silently dropped now log a warn/error with context (mesh contact blocklist, scheduler state, content catalog, container registry, update state, bitcoin relay, package install markers, server shutdown state). §I: federation tombstones are now flushed durably in storage/sync so cleared peers can't resurrect after a crash. Tracker updated with shas in docs/1.8.0-RELEASE-HARDENING-PLAN.md. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-07-02 21:02:54 -04:00 · 2026-07-02 21:02:54 -04:00 · 01cbec27ed
commit 01cbec27ed
parent 206d5fe8cf
12 changed files with 102 additions and 40 deletions
--- a/core/archipelago/src/api/rpc/bitcoin_relay.rs
+++ b/core/archipelago/src/api/rpc/bitcoin_relay.rs
@ -862,7 +862,9 @@ async fn hydrate_tor_endpoint(data_dir: &Path, state: &mut BitcoinRelayState) {
        let onion = onion.trim().trim_end_matches('/').to_string();
        if !onion.is_empty() {
            state.settings.tor_endpoint = Some(format!("http://{onion}/"));
-            let _ = save_relay_state(data_dir, state).await;
+            if let Err(e) = save_relay_state(data_dir, state).await {
+                tracing::warn!("Failed to persist relay tor endpoint: {e:#}");
+            }
        }
    }
 }
--- a/core/archipelago/src/api/rpc/content.rs
+++ b/core/archipelago/src/api/rpc/content.rs
@ -267,13 +267,16 @@ impl RpcHandler {
                .context("Failed to connect to peer")?;
        // Record which transport actually reached the peer (B14) so the UI
        // reflects FIPS vs Tor truthfully instead of always showing Tor/none.
-        let _ = crate::federation::record_peer_transport(
+        if let Err(e) = crate::federation::record_peer_transport(
            &self.config.data_dir,
            None,
            Some(onion),
            &transport.to_string(),
        )
-        .await;
+        .await
+        {
+            tracing::warn!("Failed to persist peer transport badge: {e:#}");
+        }

        if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
            let body: serde_json::Value = response.json().await.unwrap_or_default();
@ -348,13 +351,16 @@ impl RpcHandler {
                .await
                .context("Failed to connect to peer")?;
        // Record which transport actually reached the peer (B14).
-        let _ = crate::federation::record_peer_transport(
+        if let Err(e) = crate::federation::record_peer_transport(
            &self.config.data_dir,
            None,
            Some(onion),
            &transport.to_string(),
        )
-        .await;
+        .await
+        {
+            tracing::warn!("Failed to persist peer transport badge: {e:#}");
+        }

        if !response.status().is_success() {
            return Err(anyhow::anyhow!(
@ -497,13 +503,16 @@ impl RpcHandler {
            }
        };
        // Record which transport actually reached the peer (B14).
-        let _ = crate::federation::record_peer_transport(
+        if let Err(e) = crate::federation::record_peer_transport(
            &self.config.data_dir,
            None,
            Some(onion),
            &transport.to_string(),
        )
-        .await;
+        .await
+        {
+            tracing::warn!("Failed to persist peer transport badge: {e:#}");
+        }

        if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
            // Payment was rejected by the seller. Surface the most likely cause
@ -763,13 +772,16 @@ impl RpcHandler {
                }));
            }
        };
-        let _ = crate::federation::record_peer_transport(
+        if let Err(e) = crate::federation::record_peer_transport(
            &self.config.data_dir,
            None,
            Some(onion),
            &transport.to_string(),
        )
-        .await;
+        .await
+        {
+            tracing::warn!("Failed to persist peer transport badge: {e:#}");
+        }

        if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
            return Ok(serde_json::json!({
@ -951,13 +963,16 @@ impl RpcHandler {
                }));
            }
        };
-        let _ = crate::federation::record_peer_transport(
+        if let Err(e) = crate::federation::record_peer_transport(
            &self.config.data_dir,
            None,
            Some(onion),
            &transport.to_string(),
        )
-        .await;
+        .await
+        {
+            tracing::warn!("Failed to persist peer transport badge: {e:#}");
+        }

        if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
            return Ok(serde_json::json!({
@ -1019,13 +1034,16 @@ impl RpcHandler {
                .await
                .context("Failed to connect to peer for preview")?;
        // Record which transport actually reached the peer (B14).
-        let _ = crate::federation::record_peer_transport(
+        if let Err(e) = crate::federation::record_peer_transport(
            &self.config.data_dir,
            None,
            Some(onion),
            &transport.to_string(),
        )
-        .await;
+        .await
+        {
+            tracing::warn!("Failed to persist peer transport badge: {e:#}");
+        }

        if !response.status().is_success() {
            return Err(anyhow::anyhow!(
--- a/core/archipelago/src/api/rpc/mesh/status.rs
+++ b/core/archipelago/src/api/rpc/mesh/status.rs
@ -1,6 +1,7 @@
 use super::super::RpcHandler;
 use crate::mesh;
 use anyhow::Result;
+use tracing::warn;

 impl RpcHandler {
    /// mesh.status — Get mesh radio status, device info, and peer count.
@ -283,7 +284,9 @@ impl RpcHandler {
                let mut set = state.radio_contact_blocklist.write().await;
                set.clear();
            }
-            let _ = crate::mesh::save_ignored_radio_contacts(&data_dir, &[]).await;
+            if let Err(e) = crate::mesh::save_ignored_radio_contacts(&data_dir, &[]).await {
+                warn!("Failed to persist cleared radio-contact blocklist: {e:#}");
+            }

            // Actually DELETE each radio contact from the firmware table (via
            // CMD_REMOVE_CONTACT) so wiped peers don't just reappear on the next
--- a/core/archipelago/src/api/rpc/package/install.rs
+++ b/core/archipelago/src/api/rpc/package/install.rs
@ -32,6 +32,8 @@ pub(in crate::api::rpc) async fn install_log(msg: &str) {
        .open(INSTALL_LOG)
        .await
    {
+        // Fire-and-forget by design: install-log lines are advisory, and a
+        // per-line warn would spam the very log stream that's failing.
        let _ = f.write_all(line.as_bytes()).await;
    }
 }
--- a/core/archipelago/src/container/registry.rs
+++ b/core/archipelago/src/container/registry.rs
@ -155,7 +155,9 @@ pub async fn load_registries(data_dir: &Path) -> Result<RegistryConfig> {
                .collect::<Vec<_>>();
    if changed {
        // Persist so the next load doesn't have to re-merge.
-        let _ = save_registries(data_dir, &config).await;
+        if let Err(e) = save_registries(data_dir, &config).await {
+            tracing::warn!("Failed to persist migrated registry config: {e:#}");
+        }
    }
    Ok(config)
 }
--- a/core/archipelago/src/federation/storage.rs
+++ b/core/archipelago/src/federation/storage.rs
@ -166,7 +166,12 @@ pub async fn add_node(data_dir: &Path, node: FederatedNode) -> Result<Vec<Federa
    }
    // Explicitly (re-)adding a node clears any prior tombstone so the
    // operator can intentionally bring back a previously removed peer.
-    let _ = untombstone_did(data_dir, &node.did).await;
+    // Propagate failure BEFORE mutating the node list: with the tombstone
+    // still in place, sync reconciliation would silently re-remove the
+    // node the operator just added.
+    untombstone_did(data_dir, &node.did)
+        .await
+        .context("clear removal tombstone")?;
    nodes.push(node);
    save_nodes(data_dir, &nodes).await?;
    Ok(nodes)
@ -179,12 +184,16 @@ pub async fn remove_node(data_dir: &Path, did: &str) -> Result<Vec<FederatedNode
    if nodes.len() == before {
        anyhow::bail!("No federated node with DID {}", did);
    }
-    save_nodes(data_dir, &nodes).await?;
    // Tombstone the DID so transitive federation discovery (a still-federated
    // peer advertising this DID as one of *its* trusted peers) can't silently
-    // re-add it. Best-effort: a failed tombstone write must not fail the
-    // remove the operator asked for.
-    let _ = tombstone_did(data_dir, did).await;
+    // re-add it. Tombstone FIRST and propagate failure: a remove whose
+    // tombstone never landed isn't a remove — the peer would quietly
+    // reappear after the next sync. Tombstoning is idempotent, so if the
+    // node-list save below fails the operator's retry works cleanly.
+    tombstone_did(data_dir, did)
+        .await
+        .context("persist removal tombstone")?;
+    save_nodes(data_dir, &nodes).await?;
    Ok(nodes)
 }

--- a/core/archipelago/src/federation/sync.rs
+++ b/core/archipelago/src/federation/sync.rs
@ -49,13 +49,16 @@ pub async fn sync_with_peer(

    // Record transport used so the UI badge on this peer's card reflects
    // the transport that actually carried the call, not a prediction.
-    let _ = super::storage::record_peer_transport(
+    if let Err(e) = super::storage::record_peer_transport(
        data_dir,
        Some(&peer.did),
        Some(&peer.onion),
        &transport.to_string(),
    )
-    .await;
+    .await
+    {
+        tracing::warn!("Failed to persist peer transport badge: {e:#}");
+    }

    let result: serde_json::Value = resp.json().await.context("Invalid response from peer")?;
    let state_val = result
--- a/core/archipelago/src/mesh/mod.rs
+++ b/core/archipelago/src/mesh/mod.rs
@ -820,7 +820,9 @@ impl MeshService {
                                            timestamp: header.timestamp,
                                            announced_by: bha_did.clone(),
                                        };
-                                        let _ = bha_cache.store_header(payload).await;
+                                        if let Err(e) = bha_cache.store_header(payload).await {
+                                            warn!("Failed to persist block-header cache: {e:#}");
+                                        }

                                        // Build signed announcement and broadcast
                                        match bitcoin_relay::build_block_header_announcement(
--- a/core/archipelago/src/mesh/scheduler.rs
+++ b/core/archipelago/src/mesh/scheduler.rs
@ -176,7 +176,9 @@ async fn fire_due(scheduler: &Arc<MeshScheduler>, state: &Arc<MeshState>) {
        }
        q.retain(|m| !to_remove.contains(&m.id));
    }
-    let _ = scheduler.save().await;
+    if let Err(e) = scheduler.save().await {
+        warn!("Failed to persist mesh outbox after sweep: {e:#}");
+    }
 }

 /// Hand a due message to the radio. Returns true if it was sent (or should be
--- a/core/archipelago/src/server.rs
+++ b/core/archipelago/src/server.rs
@ -267,7 +267,10 @@ impl Server {
                            info!("📡 Auto-detected mesh radio: {:?} — enabling mesh", devices);
                            mesh_config.enabled = true;
                            mesh_config.device_path = Some(devices[0].clone());
-                            let _ = crate::mesh::save_config(&data_dir, &mesh_config).await;
+                            if let Err(e) = crate::mesh::save_config(&data_dir, &mesh_config).await
+                            {
+                                warn!("Failed to persist auto-detected mesh config: {e:#}");
+                            }
                        }
                    }

--- a/core/archipelago/src/update.rs
+++ b/core/archipelago/src/update.rs
@ -160,7 +160,9 @@ pub async fn load_mirrors(data_dir: &Path) -> Result<Vec<UpdateMirror>> {
    force_ovh_update_primary(&mut list);
    changed = changed || before_order != list.iter().map(|m| m.url.clone()).collect::<Vec<_>>();
    if changed {
-        let _ = save_mirrors(data_dir, &list).await;
+        if let Err(e) = save_mirrors(data_dir, &list).await {
+            warn!("Failed to persist migrated update-mirrors list: {e:#}");
+        }
    }
    Ok(list)
 }
@ -1175,14 +1177,22 @@ async fn download_component_resumable(
            }
        }
        if canceled {
+            // Best-effort: the partial file is re-validated (size + sha)
+            // when the download resumes, so a lost tail here is harmless.
            let _ = file.flush().await;
            drop(file);
            DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
            DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
            anyhow::bail!("Download canceled");
        }
-        let _ = file.flush().await;
-        let _ = file.sync_all().await;
+        // A failed flush/sync surfaces later as a size/sha mismatch and a
+        // resume, but log it — this is where ENOSPC first shows itself.
+        if let Err(e) = file.flush().await {
+            warn!("Staging file flush failed: {e:#}");
+        }
+        if let Err(e) = file.sync_all().await {
+            warn!("Staging file sync failed: {e:#}");
+        }
        drop(file);
        if stream_err {
            continue;
@ -1265,7 +1275,9 @@ pub async fn cancel_download(data_dir: &Path) -> Result<()> {
    if let Ok(mut state) = load_state(data_dir).await {
        if state.update_in_progress {
            state.update_in_progress = false;
-            let _ = save_state(data_dir, &state).await;
+            if let Err(e) = save_state(data_dir, &state).await {
+                warn!("Failed to persist cleared update_in_progress: {e:#}");
+            }
            cleared_marker = true;
        }
    }
--- a/docs/1.8.0-RELEASE-HARDENING-PLAN.md
+++ b/docs/1.8.0-RELEASE-HARDENING-PLAN.md
@ -107,11 +107,13 @@ end-to-end on real hardware.
 Note: the `.unwrap()`/`panic!` worry is a **non-issue** — nearly all are in test
 modules; production request/boot paths are essentially panic-free. The real risks:

- [ ] 🟠 **Log swallowed persistence writes.** ~30-40 dangerous `let _ = save_*().await`
-      sites discard durability failures with zero diagnostics: `server.rs:270` (mesh config),
-      `bitcoin_relay.rs:865` (relay state), `update.rs:163/1223` (mirrors/update state),
-      `registry.rs:158`, `mesh/status.rs:286`, `scheduler.rs:179`, `install.rs:34`. Convert to
-      `if let Err(e) = … { warn!(…) }`; leave genuinely fire-and-forget ones commented.
+- [x] 🟠 **Log swallowed persistence writes.** DONE 2026-07-02 (full-workspace re-inventory
+      found 19 production sites): 16 converted to `if let Err(e) = … { warn!(…) }` — mesh
+      config (`server.rs`), relay tor endpoint (`bitcoin_relay.rs`), update mirrors/state +
+      staging flush/sync (`update.rs`), registry config, radio-contact blocklist, mesh outbox
+      sweep (`scheduler.rs`), block-header cache (`mesh/mod.rs`), 7× peer-transport badge
+      (`sync.rs` + `content.rs`). Federation tombstone/untombstone upgraded to hard errors
+      (see §I). Install-log line write left fire-and-forget with an explanatory comment.
 - [ ] 🟠 **Remove blocking `std::process::Command` from async handlers.**
      `install.rs:2222` `published_host_port` (sync podman on the install path),
      `dependencies.rs:316` (`df`), `system/handlers.rs:578` (`sudo`), `transport/fips.rs:50`
@ -271,10 +273,10 @@ media (latest artifact only one minor behind).
 - [~] 🟠 **Multinode gate pass** — 5× destructive gate was launched on node `.5`; bring the
      rest of the fleet to precondition, then run the existing (undocumented-but-present)
      `tests/multinode/{smoke,meshtastic}.sh` cross-node suites.
- [ ] 🟠 **Federation `remove-node` tombstone regression.**
-      `federation/storage.rs:187` does `let _ = tombstone_did(...)` — swallows the write error,
-      so a removed peer reappears after the next sync. (This is a specific, confirmed instance
-      of the §C swallowed-writes class.) Needs a careful fix + `smoke.sh` re-verify.
+- [~] 🟠 **Federation `remove-node` tombstone regression.** Code fix DONE 2026-07-02:
+      `remove_node` now tombstones BEFORE trimming the node list and propagates the write
+      error (idempotent, so retries are clean); `add_node`'s untombstone likewise propagates
+      before mutating. **Still open: `tests/multinode/smoke.sh` re-verify on real nodes.**
 - [ ] 🟠 **Phase-3 Quadlet default-flip** — validated + opt-in on .228/.198; flip
      `config.rs:256` once the .5 gate reports clean.
 - [ ] 🟠 **Developer CLI suite** (`archy app validate/render/install/test`) — gates external
@ -300,5 +302,7 @@ media (latest artifact only one minor behind).
 6. **Before public GA only (NOT alpha/beta):** remove + rotate the Anthropic key (§F) —
   intentionally left in for frictionless AI during alpha/beta.

-*Last updated: 2026-07-02 (initial deep-audit synthesis). Update this line + tick
-boxes with commit shas as items land.*
+*Last updated: 2026-07-02 PM (hardening session 1: §A anchor+catalog `1977bdef`, §A
+manifest signature `51647b21`, §D origin checks `206d5fe8`, §C swallowed writes +
+§I tombstone fix — this commit). Update this line + tick boxes with commit shas as
+items land.*