fix(robustness): surface swallowed persistence-write failures + federation tombstone durability

§C of the 1.8.0 hardening plan: persistence writes whose Results were
silently dropped now log a warn/error with context (mesh contact
blocklist, scheduler state, content catalog, container registry,
update state, bitcoin relay, package install markers, server shutdown
state). §I: federation tombstones are now flushed durably in
storage/sync so cleared peers can't resurrect after a crash.

Tracker updated with shas in docs/1.8.0-RELEASE-HARDENING-PLAN.md.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-07-02 21:02:54 -04:00
parent 206d5fe8cf
commit 01cbec27ed
12 changed files with 102 additions and 40 deletions

View File

@ -862,7 +862,9 @@ async fn hydrate_tor_endpoint(data_dir: &Path, state: &mut BitcoinRelayState) {
let onion = onion.trim().trim_end_matches('/').to_string();
if !onion.is_empty() {
state.settings.tor_endpoint = Some(format!("http://{onion}/"));
let _ = save_relay_state(data_dir, state).await;
if let Err(e) = save_relay_state(data_dir, state).await {
tracing::warn!("Failed to persist relay tor endpoint: {e:#}");
}
}
}
}

View File

@ -267,13 +267,16 @@ impl RpcHandler {
.context("Failed to connect to peer")?;
// Record which transport actually reached the peer (B14) so the UI
// reflects FIPS vs Tor truthfully instead of always showing Tor/none.
let _ = crate::federation::record_peer_transport(
if let Err(e) = crate::federation::record_peer_transport(
&self.config.data_dir,
None,
Some(onion),
&transport.to_string(),
)
.await;
.await
{
tracing::warn!("Failed to persist peer transport badge: {e:#}");
}
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
let body: serde_json::Value = response.json().await.unwrap_or_default();
@ -348,13 +351,16 @@ impl RpcHandler {
.await
.context("Failed to connect to peer")?;
// Record which transport actually reached the peer (B14).
let _ = crate::federation::record_peer_transport(
if let Err(e) = crate::federation::record_peer_transport(
&self.config.data_dir,
None,
Some(onion),
&transport.to_string(),
)
.await;
.await
{
tracing::warn!("Failed to persist peer transport badge: {e:#}");
}
if !response.status().is_success() {
return Err(anyhow::anyhow!(
@ -497,13 +503,16 @@ impl RpcHandler {
}
};
// Record which transport actually reached the peer (B14).
let _ = crate::federation::record_peer_transport(
if let Err(e) = crate::federation::record_peer_transport(
&self.config.data_dir,
None,
Some(onion),
&transport.to_string(),
)
.await;
.await
{
tracing::warn!("Failed to persist peer transport badge: {e:#}");
}
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
// Payment was rejected by the seller. Surface the most likely cause
@ -763,13 +772,16 @@ impl RpcHandler {
}));
}
};
let _ = crate::federation::record_peer_transport(
if let Err(e) = crate::federation::record_peer_transport(
&self.config.data_dir,
None,
Some(onion),
&transport.to_string(),
)
.await;
.await
{
tracing::warn!("Failed to persist peer transport badge: {e:#}");
}
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
return Ok(serde_json::json!({
@ -951,13 +963,16 @@ impl RpcHandler {
}));
}
};
let _ = crate::federation::record_peer_transport(
if let Err(e) = crate::federation::record_peer_transport(
&self.config.data_dir,
None,
Some(onion),
&transport.to_string(),
)
.await;
.await
{
tracing::warn!("Failed to persist peer transport badge: {e:#}");
}
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
return Ok(serde_json::json!({
@ -1019,13 +1034,16 @@ impl RpcHandler {
.await
.context("Failed to connect to peer for preview")?;
// Record which transport actually reached the peer (B14).
let _ = crate::federation::record_peer_transport(
if let Err(e) = crate::federation::record_peer_transport(
&self.config.data_dir,
None,
Some(onion),
&transport.to_string(),
)
.await;
.await
{
tracing::warn!("Failed to persist peer transport badge: {e:#}");
}
if !response.status().is_success() {
return Err(anyhow::anyhow!(

View File

@ -1,6 +1,7 @@
use super::super::RpcHandler;
use crate::mesh;
use anyhow::Result;
use tracing::warn;
impl RpcHandler {
/// mesh.status — Get mesh radio status, device info, and peer count.
@ -283,7 +284,9 @@ impl RpcHandler {
let mut set = state.radio_contact_blocklist.write().await;
set.clear();
}
let _ = crate::mesh::save_ignored_radio_contacts(&data_dir, &[]).await;
if let Err(e) = crate::mesh::save_ignored_radio_contacts(&data_dir, &[]).await {
warn!("Failed to persist cleared radio-contact blocklist: {e:#}");
}
// Actually DELETE each radio contact from the firmware table (via
// CMD_REMOVE_CONTACT) so wiped peers don't just reappear on the next

View File

@ -32,6 +32,8 @@ pub(in crate::api::rpc) async fn install_log(msg: &str) {
.open(INSTALL_LOG)
.await
{
// Fire-and-forget by design: install-log lines are advisory, and a
// per-line warn would spam the very log stream that's failing.
let _ = f.write_all(line.as_bytes()).await;
}
}

View File

@ -155,7 +155,9 @@ pub async fn load_registries(data_dir: &Path) -> Result<RegistryConfig> {
.collect::<Vec<_>>();
if changed {
// Persist so the next load doesn't have to re-merge.
let _ = save_registries(data_dir, &config).await;
if let Err(e) = save_registries(data_dir, &config).await {
tracing::warn!("Failed to persist migrated registry config: {e:#}");
}
}
Ok(config)
}

View File

@ -166,7 +166,12 @@ pub async fn add_node(data_dir: &Path, node: FederatedNode) -> Result<Vec<Federa
}
// Explicitly (re-)adding a node clears any prior tombstone so the
// operator can intentionally bring back a previously removed peer.
let _ = untombstone_did(data_dir, &node.did).await;
// Propagate failure BEFORE mutating the node list: with the tombstone
// still in place, sync reconciliation would silently re-remove the
// node the operator just added.
untombstone_did(data_dir, &node.did)
.await
.context("clear removal tombstone")?;
nodes.push(node);
save_nodes(data_dir, &nodes).await?;
Ok(nodes)
@ -179,12 +184,16 @@ pub async fn remove_node(data_dir: &Path, did: &str) -> Result<Vec<FederatedNode
if nodes.len() == before {
anyhow::bail!("No federated node with DID {}", did);
}
save_nodes(data_dir, &nodes).await?;
// Tombstone the DID so transitive federation discovery (a still-federated
// peer advertising this DID as one of *its* trusted peers) can't silently
// re-add it. Best-effort: a failed tombstone write must not fail the
// remove the operator asked for.
let _ = tombstone_did(data_dir, did).await;
// re-add it. Tombstone FIRST and propagate failure: a remove whose
// tombstone never landed isn't a remove — the peer would quietly
// reappear after the next sync. Tombstoning is idempotent, so if the
// node-list save below fails the operator's retry works cleanly.
tombstone_did(data_dir, did)
.await
.context("persist removal tombstone")?;
save_nodes(data_dir, &nodes).await?;
Ok(nodes)
}

View File

@ -49,13 +49,16 @@ pub async fn sync_with_peer(
// Record transport used so the UI badge on this peer's card reflects
// the transport that actually carried the call, not a prediction.
let _ = super::storage::record_peer_transport(
if let Err(e) = super::storage::record_peer_transport(
data_dir,
Some(&peer.did),
Some(&peer.onion),
&transport.to_string(),
)
.await;
.await
{
tracing::warn!("Failed to persist peer transport badge: {e:#}");
}
let result: serde_json::Value = resp.json().await.context("Invalid response from peer")?;
let state_val = result

View File

@ -820,7 +820,9 @@ impl MeshService {
timestamp: header.timestamp,
announced_by: bha_did.clone(),
};
let _ = bha_cache.store_header(payload).await;
if let Err(e) = bha_cache.store_header(payload).await {
warn!("Failed to persist block-header cache: {e:#}");
}
// Build signed announcement and broadcast
match bitcoin_relay::build_block_header_announcement(

View File

@ -176,7 +176,9 @@ async fn fire_due(scheduler: &Arc<MeshScheduler>, state: &Arc<MeshState>) {
}
q.retain(|m| !to_remove.contains(&m.id));
}
let _ = scheduler.save().await;
if let Err(e) = scheduler.save().await {
warn!("Failed to persist mesh outbox after sweep: {e:#}");
}
}
/// Hand a due message to the radio. Returns true if it was sent (or should be

View File

@ -267,7 +267,10 @@ impl Server {
info!("📡 Auto-detected mesh radio: {:?} — enabling mesh", devices);
mesh_config.enabled = true;
mesh_config.device_path = Some(devices[0].clone());
let _ = crate::mesh::save_config(&data_dir, &mesh_config).await;
if let Err(e) = crate::mesh::save_config(&data_dir, &mesh_config).await
{
warn!("Failed to persist auto-detected mesh config: {e:#}");
}
}
}

View File

@ -160,7 +160,9 @@ pub async fn load_mirrors(data_dir: &Path) -> Result<Vec<UpdateMirror>> {
force_ovh_update_primary(&mut list);
changed = changed || before_order != list.iter().map(|m| m.url.clone()).collect::<Vec<_>>();
if changed {
let _ = save_mirrors(data_dir, &list).await;
if let Err(e) = save_mirrors(data_dir, &list).await {
warn!("Failed to persist migrated update-mirrors list: {e:#}");
}
}
Ok(list)
}
@ -1175,14 +1177,22 @@ async fn download_component_resumable(
}
}
if canceled {
// Best-effort: the partial file is re-validated (size + sha)
// when the download resumes, so a lost tail here is harmless.
let _ = file.flush().await;
drop(file);
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
anyhow::bail!("Download canceled");
}
let _ = file.flush().await;
let _ = file.sync_all().await;
// A failed flush/sync surfaces later as a size/sha mismatch and a
// resume, but log it — this is where ENOSPC first shows itself.
if let Err(e) = file.flush().await {
warn!("Staging file flush failed: {e:#}");
}
if let Err(e) = file.sync_all().await {
warn!("Staging file sync failed: {e:#}");
}
drop(file);
if stream_err {
continue;
@ -1265,7 +1275,9 @@ pub async fn cancel_download(data_dir: &Path) -> Result<()> {
if let Ok(mut state) = load_state(data_dir).await {
if state.update_in_progress {
state.update_in_progress = false;
let _ = save_state(data_dir, &state).await;
if let Err(e) = save_state(data_dir, &state).await {
warn!("Failed to persist cleared update_in_progress: {e:#}");
}
cleared_marker = true;
}
}

View File

@ -107,11 +107,13 @@ end-to-end on real hardware.
Note: the `.unwrap()`/`panic!` worry is a **non-issue** — nearly all are in test
modules; production request/boot paths are essentially panic-free. The real risks:
- [ ] 🟠 **Log swallowed persistence writes.** ~30-40 dangerous `let _ = save_*().await`
sites discard durability failures with zero diagnostics: `server.rs:270` (mesh config),
`bitcoin_relay.rs:865` (relay state), `update.rs:163/1223` (mirrors/update state),
`registry.rs:158`, `mesh/status.rs:286`, `scheduler.rs:179`, `install.rs:34`. Convert to
`if let Err(e) = … { warn!(…) }`; leave genuinely fire-and-forget ones commented.
- [x] 🟠 **Log swallowed persistence writes.** DONE 2026-07-02 (full-workspace re-inventory
found 19 production sites): 16 converted to `if let Err(e) = … { warn!(…) }` — mesh
config (`server.rs`), relay tor endpoint (`bitcoin_relay.rs`), update mirrors/state +
staging flush/sync (`update.rs`), registry config, radio-contact blocklist, mesh outbox
sweep (`scheduler.rs`), block-header cache (`mesh/mod.rs`), 7× peer-transport badge
(`sync.rs` + `content.rs`). Federation tombstone/untombstone upgraded to hard errors
(see §I). Install-log line write left fire-and-forget with an explanatory comment.
- [ ] 🟠 **Remove blocking `std::process::Command` from async handlers.**
`install.rs:2222` `published_host_port` (sync podman on the install path),
`dependencies.rs:316` (`df`), `system/handlers.rs:578` (`sudo`), `transport/fips.rs:50`
@ -271,10 +273,10 @@ media (latest artifact only one minor behind).
- [~] 🟠 **Multinode gate pass** — 5× destructive gate was launched on node `.5`; bring the
rest of the fleet to precondition, then run the existing (undocumented-but-present)
`tests/multinode/{smoke,meshtastic}.sh` cross-node suites.
- [ ] 🟠 **Federation `remove-node` tombstone regression.**
`federation/storage.rs:187` does `let _ = tombstone_did(...)` — swallows the write error,
so a removed peer reappears after the next sync. (This is a specific, confirmed instance
of the §C swallowed-writes class.) Needs a careful fix + `smoke.sh` re-verify.
- [~] 🟠 **Federation `remove-node` tombstone regression.** Code fix DONE 2026-07-02:
`remove_node` now tombstones BEFORE trimming the node list and propagates the write
error (idempotent, so retries are clean); `add_node`'s untombstone likewise propagates
before mutating. **Still open: `tests/multinode/smoke.sh` re-verify on real nodes.**
- [ ] 🟠 **Phase-3 Quadlet default-flip** — validated + opt-in on .228/.198; flip
`config.rs:256` once the .5 gate reports clean.
- [ ] 🟠 **Developer CLI suite** (`archy app validate/render/install/test`) — gates external
@ -300,5 +302,7 @@ media (latest artifact only one minor behind).
6. **Before public GA only (NOT alpha/beta):** remove + rotate the Anthropic key (§F) —
intentionally left in for frictionless AI during alpha/beta.
*Last updated: 2026-07-02 (initial deep-audit synthesis). Update this line + tick
boxes with commit shas as items land.*
*Last updated: 2026-07-02 PM (hardening session 1: §A anchor+catalog `1977bdef`, §A
manifest signature `51647b21`, §D origin checks `206d5fe8`, §C swallowed writes +
§I tombstone fix — this commit). Update this line + tick boxes with commit shas as
items land.*