fix(robustness): surface swallowed persistence-write failures + federation tombstone durability
§C of the 1.8.0 hardening plan: persistence writes whose Results were silently dropped now log a warn/error with context (mesh contact blocklist, scheduler state, content catalog, container registry, update state, bitcoin relay, package install markers, server shutdown state). §I: federation tombstones are now flushed durably in storage/sync so cleared peers can't resurrect after a crash. Tracker updated with shas in docs/1.8.0-RELEASE-HARDENING-PLAN.md. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
parent
206d5fe8cf
commit
01cbec27ed
@ -862,7 +862,9 @@ async fn hydrate_tor_endpoint(data_dir: &Path, state: &mut BitcoinRelayState) {
|
||||
let onion = onion.trim().trim_end_matches('/').to_string();
|
||||
if !onion.is_empty() {
|
||||
state.settings.tor_endpoint = Some(format!("http://{onion}/"));
|
||||
let _ = save_relay_state(data_dir, state).await;
|
||||
if let Err(e) = save_relay_state(data_dir, state).await {
|
||||
tracing::warn!("Failed to persist relay tor endpoint: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -267,13 +267,16 @@ impl RpcHandler {
|
||||
.context("Failed to connect to peer")?;
|
||||
// Record which transport actually reached the peer (B14) so the UI
|
||||
// reflects FIPS vs Tor truthfully instead of always showing Tor/none.
|
||||
let _ = crate::federation::record_peer_transport(
|
||||
if let Err(e) = crate::federation::record_peer_transport(
|
||||
&self.config.data_dir,
|
||||
None,
|
||||
Some(onion),
|
||||
&transport.to_string(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to persist peer transport badge: {e:#}");
|
||||
}
|
||||
|
||||
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
|
||||
let body: serde_json::Value = response.json().await.unwrap_or_default();
|
||||
@ -348,13 +351,16 @@ impl RpcHandler {
|
||||
.await
|
||||
.context("Failed to connect to peer")?;
|
||||
// Record which transport actually reached the peer (B14).
|
||||
let _ = crate::federation::record_peer_transport(
|
||||
if let Err(e) = crate::federation::record_peer_transport(
|
||||
&self.config.data_dir,
|
||||
None,
|
||||
Some(onion),
|
||||
&transport.to_string(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to persist peer transport badge: {e:#}");
|
||||
}
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
@ -497,13 +503,16 @@ impl RpcHandler {
|
||||
}
|
||||
};
|
||||
// Record which transport actually reached the peer (B14).
|
||||
let _ = crate::federation::record_peer_transport(
|
||||
if let Err(e) = crate::federation::record_peer_transport(
|
||||
&self.config.data_dir,
|
||||
None,
|
||||
Some(onion),
|
||||
&transport.to_string(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to persist peer transport badge: {e:#}");
|
||||
}
|
||||
|
||||
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
|
||||
// Payment was rejected by the seller. Surface the most likely cause
|
||||
@ -763,13 +772,16 @@ impl RpcHandler {
|
||||
}));
|
||||
}
|
||||
};
|
||||
let _ = crate::federation::record_peer_transport(
|
||||
if let Err(e) = crate::federation::record_peer_transport(
|
||||
&self.config.data_dir,
|
||||
None,
|
||||
Some(onion),
|
||||
&transport.to_string(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to persist peer transport badge: {e:#}");
|
||||
}
|
||||
|
||||
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
|
||||
return Ok(serde_json::json!({
|
||||
@ -951,13 +963,16 @@ impl RpcHandler {
|
||||
}));
|
||||
}
|
||||
};
|
||||
let _ = crate::federation::record_peer_transport(
|
||||
if let Err(e) = crate::federation::record_peer_transport(
|
||||
&self.config.data_dir,
|
||||
None,
|
||||
Some(onion),
|
||||
&transport.to_string(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to persist peer transport badge: {e:#}");
|
||||
}
|
||||
|
||||
if response.status() == reqwest::StatusCode::PAYMENT_REQUIRED {
|
||||
return Ok(serde_json::json!({
|
||||
@ -1019,13 +1034,16 @@ impl RpcHandler {
|
||||
.await
|
||||
.context("Failed to connect to peer for preview")?;
|
||||
// Record which transport actually reached the peer (B14).
|
||||
let _ = crate::federation::record_peer_transport(
|
||||
if let Err(e) = crate::federation::record_peer_transport(
|
||||
&self.config.data_dir,
|
||||
None,
|
||||
Some(onion),
|
||||
&transport.to_string(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to persist peer transport badge: {e:#}");
|
||||
}
|
||||
|
||||
if !response.status().is_success() {
|
||||
return Err(anyhow::anyhow!(
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
use super::super::RpcHandler;
|
||||
use crate::mesh;
|
||||
use anyhow::Result;
|
||||
use tracing::warn;
|
||||
|
||||
impl RpcHandler {
|
||||
/// mesh.status — Get mesh radio status, device info, and peer count.
|
||||
@ -283,7 +284,9 @@ impl RpcHandler {
|
||||
let mut set = state.radio_contact_blocklist.write().await;
|
||||
set.clear();
|
||||
}
|
||||
let _ = crate::mesh::save_ignored_radio_contacts(&data_dir, &[]).await;
|
||||
if let Err(e) = crate::mesh::save_ignored_radio_contacts(&data_dir, &[]).await {
|
||||
warn!("Failed to persist cleared radio-contact blocklist: {e:#}");
|
||||
}
|
||||
|
||||
// Actually DELETE each radio contact from the firmware table (via
|
||||
// CMD_REMOVE_CONTACT) so wiped peers don't just reappear on the next
|
||||
|
||||
@ -32,6 +32,8 @@ pub(in crate::api::rpc) async fn install_log(msg: &str) {
|
||||
.open(INSTALL_LOG)
|
||||
.await
|
||||
{
|
||||
// Fire-and-forget by design: install-log lines are advisory, and a
|
||||
// per-line warn would spam the very log stream that's failing.
|
||||
let _ = f.write_all(line.as_bytes()).await;
|
||||
}
|
||||
}
|
||||
|
||||
@ -155,7 +155,9 @@ pub async fn load_registries(data_dir: &Path) -> Result<RegistryConfig> {
|
||||
.collect::<Vec<_>>();
|
||||
if changed {
|
||||
// Persist so the next load doesn't have to re-merge.
|
||||
let _ = save_registries(data_dir, &config).await;
|
||||
if let Err(e) = save_registries(data_dir, &config).await {
|
||||
tracing::warn!("Failed to persist migrated registry config: {e:#}");
|
||||
}
|
||||
}
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
@ -166,7 +166,12 @@ pub async fn add_node(data_dir: &Path, node: FederatedNode) -> Result<Vec<Federa
|
||||
}
|
||||
// Explicitly (re-)adding a node clears any prior tombstone so the
|
||||
// operator can intentionally bring back a previously removed peer.
|
||||
let _ = untombstone_did(data_dir, &node.did).await;
|
||||
// Propagate failure BEFORE mutating the node list: with the tombstone
|
||||
// still in place, sync reconciliation would silently re-remove the
|
||||
// node the operator just added.
|
||||
untombstone_did(data_dir, &node.did)
|
||||
.await
|
||||
.context("clear removal tombstone")?;
|
||||
nodes.push(node);
|
||||
save_nodes(data_dir, &nodes).await?;
|
||||
Ok(nodes)
|
||||
@ -179,12 +184,16 @@ pub async fn remove_node(data_dir: &Path, did: &str) -> Result<Vec<FederatedNode
|
||||
if nodes.len() == before {
|
||||
anyhow::bail!("No federated node with DID {}", did);
|
||||
}
|
||||
save_nodes(data_dir, &nodes).await?;
|
||||
// Tombstone the DID so transitive federation discovery (a still-federated
|
||||
// peer advertising this DID as one of *its* trusted peers) can't silently
|
||||
// re-add it. Best-effort: a failed tombstone write must not fail the
|
||||
// remove the operator asked for.
|
||||
let _ = tombstone_did(data_dir, did).await;
|
||||
// re-add it. Tombstone FIRST and propagate failure: a remove whose
|
||||
// tombstone never landed isn't a remove — the peer would quietly
|
||||
// reappear after the next sync. Tombstoning is idempotent, so if the
|
||||
// node-list save below fails the operator's retry works cleanly.
|
||||
tombstone_did(data_dir, did)
|
||||
.await
|
||||
.context("persist removal tombstone")?;
|
||||
save_nodes(data_dir, &nodes).await?;
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
|
||||
@ -49,13 +49,16 @@ pub async fn sync_with_peer(
|
||||
|
||||
// Record transport used so the UI badge on this peer's card reflects
|
||||
// the transport that actually carried the call, not a prediction.
|
||||
let _ = super::storage::record_peer_transport(
|
||||
if let Err(e) = super::storage::record_peer_transport(
|
||||
data_dir,
|
||||
Some(&peer.did),
|
||||
Some(&peer.onion),
|
||||
&transport.to_string(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
tracing::warn!("Failed to persist peer transport badge: {e:#}");
|
||||
}
|
||||
|
||||
let result: serde_json::Value = resp.json().await.context("Invalid response from peer")?;
|
||||
let state_val = result
|
||||
|
||||
@ -820,7 +820,9 @@ impl MeshService {
|
||||
timestamp: header.timestamp,
|
||||
announced_by: bha_did.clone(),
|
||||
};
|
||||
let _ = bha_cache.store_header(payload).await;
|
||||
if let Err(e) = bha_cache.store_header(payload).await {
|
||||
warn!("Failed to persist block-header cache: {e:#}");
|
||||
}
|
||||
|
||||
// Build signed announcement and broadcast
|
||||
match bitcoin_relay::build_block_header_announcement(
|
||||
|
||||
@ -176,7 +176,9 @@ async fn fire_due(scheduler: &Arc<MeshScheduler>, state: &Arc<MeshState>) {
|
||||
}
|
||||
q.retain(|m| !to_remove.contains(&m.id));
|
||||
}
|
||||
let _ = scheduler.save().await;
|
||||
if let Err(e) = scheduler.save().await {
|
||||
warn!("Failed to persist mesh outbox after sweep: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
/// Hand a due message to the radio. Returns true if it was sent (or should be
|
||||
|
||||
@ -267,7 +267,10 @@ impl Server {
|
||||
info!("📡 Auto-detected mesh radio: {:?} — enabling mesh", devices);
|
||||
mesh_config.enabled = true;
|
||||
mesh_config.device_path = Some(devices[0].clone());
|
||||
let _ = crate::mesh::save_config(&data_dir, &mesh_config).await;
|
||||
if let Err(e) = crate::mesh::save_config(&data_dir, &mesh_config).await
|
||||
{
|
||||
warn!("Failed to persist auto-detected mesh config: {e:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -160,7 +160,9 @@ pub async fn load_mirrors(data_dir: &Path) -> Result<Vec<UpdateMirror>> {
|
||||
force_ovh_update_primary(&mut list);
|
||||
changed = changed || before_order != list.iter().map(|m| m.url.clone()).collect::<Vec<_>>();
|
||||
if changed {
|
||||
let _ = save_mirrors(data_dir, &list).await;
|
||||
if let Err(e) = save_mirrors(data_dir, &list).await {
|
||||
warn!("Failed to persist migrated update-mirrors list: {e:#}");
|
||||
}
|
||||
}
|
||||
Ok(list)
|
||||
}
|
||||
@ -1175,14 +1177,22 @@ async fn download_component_resumable(
|
||||
}
|
||||
}
|
||||
if canceled {
|
||||
// Best-effort: the partial file is re-validated (size + sha)
|
||||
// when the download resumes, so a lost tail here is harmless.
|
||||
let _ = file.flush().await;
|
||||
drop(file);
|
||||
DOWNLOAD_TOTAL.store(0, Ordering::Relaxed);
|
||||
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
|
||||
anyhow::bail!("Download canceled");
|
||||
}
|
||||
let _ = file.flush().await;
|
||||
let _ = file.sync_all().await;
|
||||
// A failed flush/sync surfaces later as a size/sha mismatch and a
|
||||
// resume, but log it — this is where ENOSPC first shows itself.
|
||||
if let Err(e) = file.flush().await {
|
||||
warn!("Staging file flush failed: {e:#}");
|
||||
}
|
||||
if let Err(e) = file.sync_all().await {
|
||||
warn!("Staging file sync failed: {e:#}");
|
||||
}
|
||||
drop(file);
|
||||
if stream_err {
|
||||
continue;
|
||||
@ -1265,7 +1275,9 @@ pub async fn cancel_download(data_dir: &Path) -> Result<()> {
|
||||
if let Ok(mut state) = load_state(data_dir).await {
|
||||
if state.update_in_progress {
|
||||
state.update_in_progress = false;
|
||||
let _ = save_state(data_dir, &state).await;
|
||||
if let Err(e) = save_state(data_dir, &state).await {
|
||||
warn!("Failed to persist cleared update_in_progress: {e:#}");
|
||||
}
|
||||
cleared_marker = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -107,11 +107,13 @@ end-to-end on real hardware.
|
||||
Note: the `.unwrap()`/`panic!` worry is a **non-issue** — nearly all are in test
|
||||
modules; production request/boot paths are essentially panic-free. The real risks:
|
||||
|
||||
- [ ] 🟠 **Log swallowed persistence writes.** ~30-40 dangerous `let _ = save_*().await`
|
||||
sites discard durability failures with zero diagnostics: `server.rs:270` (mesh config),
|
||||
`bitcoin_relay.rs:865` (relay state), `update.rs:163/1223` (mirrors/update state),
|
||||
`registry.rs:158`, `mesh/status.rs:286`, `scheduler.rs:179`, `install.rs:34`. Convert to
|
||||
`if let Err(e) = … { warn!(…) }`; leave genuinely fire-and-forget ones commented.
|
||||
- [x] 🟠 **Log swallowed persistence writes.** DONE 2026-07-02 (full-workspace re-inventory
|
||||
found 19 production sites): 16 converted to `if let Err(e) = … { warn!(…) }` — mesh
|
||||
config (`server.rs`), relay tor endpoint (`bitcoin_relay.rs`), update mirrors/state +
|
||||
staging flush/sync (`update.rs`), registry config, radio-contact blocklist, mesh outbox
|
||||
sweep (`scheduler.rs`), block-header cache (`mesh/mod.rs`), 7× peer-transport badge
|
||||
(`sync.rs` + `content.rs`). Federation tombstone/untombstone upgraded to hard errors
|
||||
(see §I). Install-log line write left fire-and-forget with an explanatory comment.
|
||||
- [ ] 🟠 **Remove blocking `std::process::Command` from async handlers.**
|
||||
`install.rs:2222` `published_host_port` (sync podman on the install path),
|
||||
`dependencies.rs:316` (`df`), `system/handlers.rs:578` (`sudo`), `transport/fips.rs:50`
|
||||
@ -271,10 +273,10 @@ media (latest artifact only one minor behind).
|
||||
- [~] 🟠 **Multinode gate pass** — 5× destructive gate was launched on node `.5`; bring the
|
||||
rest of the fleet to precondition, then run the existing (undocumented-but-present)
|
||||
`tests/multinode/{smoke,meshtastic}.sh` cross-node suites.
|
||||
- [ ] 🟠 **Federation `remove-node` tombstone regression.**
|
||||
`federation/storage.rs:187` does `let _ = tombstone_did(...)` — swallows the write error,
|
||||
so a removed peer reappears after the next sync. (This is a specific, confirmed instance
|
||||
of the §C swallowed-writes class.) Needs a careful fix + `smoke.sh` re-verify.
|
||||
- [~] 🟠 **Federation `remove-node` tombstone regression.** Code fix DONE 2026-07-02:
|
||||
`remove_node` now tombstones BEFORE trimming the node list and propagates the write
|
||||
error (idempotent, so retries are clean); `add_node`'s untombstone likewise propagates
|
||||
before mutating. **Still open: `tests/multinode/smoke.sh` re-verify on real nodes.**
|
||||
- [ ] 🟠 **Phase-3 Quadlet default-flip** — validated + opt-in on .228/.198; flip
|
||||
`config.rs:256` once the .5 gate reports clean.
|
||||
- [ ] 🟠 **Developer CLI suite** (`archy app validate/render/install/test`) — gates external
|
||||
@ -300,5 +302,7 @@ media (latest artifact only one minor behind).
|
||||
6. **Before public GA only (NOT alpha/beta):** remove + rotate the Anthropic key (§F) —
|
||||
intentionally left in for frictionless AI during alpha/beta.
|
||||
|
||||
*Last updated: 2026-07-02 (initial deep-audit synthesis). Update this line + tick
|
||||
boxes with commit shas as items land.*
|
||||
*Last updated: 2026-07-02 PM (hardening session 1: §A anchor+catalog `1977bdef`, §A
|
||||
manifest signature `51647b21`, §D origin checks `206d5fe8`, §C swallowed writes +
|
||||
§I tombstone fix — this commit). Update this line + tick boxes with commit shas as
|
||||
items land.*
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user