1516 lines
63 KiB
Rust
Raw Normal View History

2026-01-24 22:59:20 +00:00
use crate::api::ApiHandler;
use crate::config::{Config, ContainerRuntime};
use crate::container::{
docker_packages, ContainerOrchestrator, DevContainerOrchestrator, DockerPackageScanner,
};
use crate::identity::{self, NodeIdentity};
use crate::monitoring::MetricsStore;
use crate::node_message;
use crate::nostr_discovery;
2026-03-12 12:56:59 +00:00
use crate::nostr_handshake;
use crate::peers;
use crate::state::StateManager;
2026-01-24 22:59:20 +00:00
use anyhow::Result;
use hyper::server::conn::Http;
use hyper::service::service_fn;
use std::collections::HashMap;
2026-01-24 22:59:20 +00:00
use std::net::SocketAddr;
2026-06-11 04:44:58 -04:00
use std::sync::atomic::{AtomicBool, Ordering};
2026-01-24 22:59:20 +00:00
use std::sync::Arc;
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
use std::time::{Duration, Instant};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
2026-01-24 22:59:20 +00:00
use tokio::net::TcpListener;
use tracing::{debug, error, info, warn};
2026-01-24 22:59:20 +00:00
/// Top-level server object bundling the node configuration, the node
/// identity, the API handler and the shared state manager.
pub struct Server {
// NOTE(review): the underscore-prefixed fields are presumably retained for
// ownership only and not read directly — confirm against the rest of the file.
_config: Config,
_identity: Arc<NodeIdentity>,
2026-01-24 22:59:20 +00:00
// Shared handle to the API handler.
api_handler: Arc<ApiHandler>,
_state_manager: Arc<StateManager>,
2026-01-24 22:59:20 +00:00
}
2026-06-11 04:44:58 -04:00
/// Scope guard around a shared "scan in progress" flag.
///
/// While a guard is alive the borrowed flag is `true`; dropping the guard
/// resets it to `false`.
struct ContainerScanGuard<'flag> {
    scanning: &'flag AtomicBool,
}

impl<'flag> ContainerScanGuard<'flag> {
    /// Attempts to flip `scanning` from `false` to `true`.
    ///
    /// Returns `Some(guard)` when this call performed the flip, or `None`
    /// when the flag was already `true` (in which case it is left untouched).
    fn try_acquire(scanning: &'flag AtomicBool) -> Option<Self> {
        let claimed =
            scanning.compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed);
        match claimed {
            Ok(_) => Some(Self { scanning }),
            Err(_) => None,
        }
    }
}

impl Drop for ContainerScanGuard<'_> {
    /// Clears the flag so a later `try_acquire` can succeed.
    fn drop(&mut self) {
        self.scanning.store(false, Ordering::Release);
    }
}
2026-01-24 22:59:20 +00:00
impl Server {
pub async fn new(
config: Config,
orchestrator: Option<Arc<dyn ContainerOrchestrator>>,
dev_orchestrator: Option<Arc<DevContainerOrchestrator>>,
) -> Result<Self> {
let state_manager = Arc::new(StateManager::new());
// Load node identity and set stable server_info.
// Detect seed-backed vs legacy vs fresh install.
let identity_dir = config.data_dir.join("identity");
let has_seed = crate::seed::seed_exists(&config.data_dir);
let has_node_key = NodeIdentity::key_exists(&identity_dir);
let identity = if has_node_key {
// Existing keys on disk (seed-derived or legacy random) — load them.
NodeIdentity::load_or_create(&identity_dir).await?
} else {
// Fresh install — create a temporary identity.
// Onboarding will overwrite this with seed-derived keys.
NodeIdentity::load_or_create(&identity_dir).await?
};
let (mut data, _) = state_manager.get_snapshot().await;
data.server_info.id = identity.node_id();
data.server_info.pubkey = identity.pubkey_hex();
data.server_info.seed_backed = has_seed;
// Load persisted server name
let name_file = config.data_dir.join("server-name");
if let Ok(name) = tokio::fs::read_to_string(&name_file).await {
let name = name.trim().to_string();
if !name.is_empty() {
data.server_info.name = Some(name);
}
}
data.server_info.tor_address = docker_packages::read_tor_address("archipelago").await;
if let Some(ref tor) = data.server_info.tor_address {
data.server_info.node_address = Some(identity.node_address(tor));
}
state_manager.update_data(data.clone()).await;
// Retry Tor address in background — Tor may not be ready at startup
if data.server_info.tor_address.is_none() {
let sm = state_manager.clone();
let pubkey = identity.pubkey_hex();
tokio::spawn(async move {
for delay in [5, 10, 20, 30, 60] {
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
if let Some(tor) = docker_packages::read_tor_address("archipelago").await {
let (mut d, _) = sm.get_snapshot().await;
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
let addr =
format!("archipelago://{}#{}", tor.trim_end_matches('/'), pubkey);
d.server_info.tor_address = Some(tor.clone());
d.server_info.node_address = Some(addr);
sm.update_data(d).await;
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
tracing::info!(
"Tor address discovered after startup: {}",
&tor[..20.min(tor.len())]
);
break;
}
}
});
}
// Load persisted messages (Archipelago channel)
node_message::init(&config.data_dir).await;
release(v1.7.35-alpha): rootless-netns self-heal + app update button + bitcoin-core 28.4 + Node DID unification - core/archipelago/src/bootstrap.rs (NEW): embed scripts/container-doctor.sh and image-recipe/configs/archipelago-doctor.{service,timer} via include_str! and sync to disk + enable the timer on every archipelago startup. Idempotent (content-hash compare), dev-box symlink guard keeps the git checkout untouched, best-effort (warn-only on failure) so bootstrap never blocks server readiness. Wired in main.rs as a background tokio task. - scripts/container-doctor.sh: add fix_rootless_netns_egress(). Detects when the rootless-netns has lost its pasta tap (container-to-container still works but outbound DNS/TCP fails) via an nsenter probe into aardvark-dns; with a two-probe 10s debounce to rule out transients and a host-precheck that bails out if the host itself is offline. When the rootless-netns is truly broken, does a graceful podman stop --all / start --all so pasta + aardvark-dns rebuild the netns from scratch. Bitcoin-knots and every other outbound container recover in one cycle. - core/archipelago/src/update.rs: host_sudo → pub(crate) so bootstrap.rs can reuse the existing systemd-run escape hatch. - apps/bitcoin-core/manifest.yml: bump app version 24.0.0 → 28.4.0 and image bitcoin/bitcoin:24.0 → bitcoin/bitcoin:28.4. Resources aligned with the real container-specs.sh large-disk tune (4 GiB memory cap, cpu_limit: 0 so bitcoind can run -par=auto across every core). - neode-ui/src/views/apps/AppCard.vue + Apps.vue: add an Update button + Updating spinner to every app card that has available-update set. Wires through serverStore.updatePackage(id) — the same RPC the detail view already calls. common.update / common.updating i18n keys added in en.json and es.json. - core/archipelago/src/identity_manager.rs: add create_from_signing_key() that mirrors an existing Ed25519 key as a manager-level identity with a deterministic id (`node-<pubkey16>`). 
Idempotent across restarts, gets the hex-SVG master avatar. - core/archipelago/src/server.rs: the auto-create path on first boot now mirrors the node's own signing_key (seed-derived on onboarded installs) as a "Node" identity instead of generating a random "Default" keypair. Once this ships, the DID on the Web5 DID Status card (via node.did RPC), the Node entry on the Identities page (via identity.list), and the DID used for peer-to-peer connects (via server_info.pubkey) all resolve to the same seed-derived pubkey. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 08:29:56 -04:00
// Auto-create the Node identity on fresh boot, mirroring the node's
// own signing key (seed-derived when onboarded, random otherwise).
// This keeps the DID shown on the Identities page, the DID Status
// card, and the DID used for peer-to-peer connects all aligned on
// one value — the seed-derived node DID. Idempotent: if the entry
// already exists from a prior boot, create_from_signing_key returns
// the existing record unchanged.
{
let im = crate::identity_manager::IdentityManager::new(&config.data_dir).await;
if let Ok(mgr) = im {
if let Ok((list, _)) = mgr.list().await {
if list.is_empty() {
release(v1.7.35-alpha): rootless-netns self-heal + app update button + bitcoin-core 28.4 + Node DID unification - core/archipelago/src/bootstrap.rs (NEW): embed scripts/container-doctor.sh and image-recipe/configs/archipelago-doctor.{service,timer} via include_str! and sync to disk + enable the timer on every archipelago startup. Idempotent (content-hash compare), dev-box symlink guard keeps the git checkout untouched, best-effort (warn-only on failure) so bootstrap never blocks server readiness. Wired in main.rs as a background tokio task. - scripts/container-doctor.sh: add fix_rootless_netns_egress(). Detects when the rootless-netns has lost its pasta tap (container-to-container still works but outbound DNS/TCP fails) via an nsenter probe into aardvark-dns; with a two-probe 10s debounce to rule out transients and a host-precheck that bails out if the host itself is offline. When the rootless-netns is truly broken, does a graceful podman stop --all / start --all so pasta + aardvark-dns rebuild the netns from scratch. Bitcoin-knots and every other outbound container recover in one cycle. - core/archipelago/src/update.rs: host_sudo → pub(crate) so bootstrap.rs can reuse the existing systemd-run escape hatch. - apps/bitcoin-core/manifest.yml: bump app version 24.0.0 → 28.4.0 and image bitcoin/bitcoin:24.0 → bitcoin/bitcoin:28.4. Resources aligned with the real container-specs.sh large-disk tune (4 GiB memory cap, cpu_limit: 0 so bitcoind can run -par=auto across every core). - neode-ui/src/views/apps/AppCard.vue + Apps.vue: add an Update button + Updating spinner to every app card that has available-update set. Wires through serverStore.updatePackage(id) — the same RPC the detail view already calls. common.update / common.updating i18n keys added in en.json and es.json. - core/archipelago/src/identity_manager.rs: add create_from_signing_key() that mirrors an existing Ed25519 key as a manager-level identity with a deterministic id (`node-<pubkey16>`). 
Idempotent across restarts, gets the hex-SVG master avatar. - core/archipelago/src/server.rs: the auto-create path on first boot now mirrors the node's own signing_key (seed-derived on onboarded installs) as a "Node" identity instead of generating a random "Default" keypair. Once this ships, the DID on the Web5 DID Status card (via node.did RPC), the Node entry on the Identities page (via identity.list), and the DID used for peer-to-peer connects (via server_info.pubkey) all resolve to the same seed-derived pubkey. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 08:29:56 -04:00
let signing_key = ed25519_dalek::SigningKey::from_bytes(
&identity.signing_key().to_bytes(),
);
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
match mgr
release(v1.7.35-alpha): rootless-netns self-heal + app update button + bitcoin-core 28.4 + Node DID unification - core/archipelago/src/bootstrap.rs (NEW): embed scripts/container-doctor.sh and image-recipe/configs/archipelago-doctor.{service,timer} via include_str! and sync to disk + enable the timer on every archipelago startup. Idempotent (content-hash compare), dev-box symlink guard keeps the git checkout untouched, best-effort (warn-only on failure) so bootstrap never blocks server readiness. Wired in main.rs as a background tokio task. - scripts/container-doctor.sh: add fix_rootless_netns_egress(). Detects when the rootless-netns has lost its pasta tap (container-to-container still works but outbound DNS/TCP fails) via an nsenter probe into aardvark-dns; with a two-probe 10s debounce to rule out transients and a host-precheck that bails out if the host itself is offline. When the rootless-netns is truly broken, does a graceful podman stop --all / start --all so pasta + aardvark-dns rebuild the netns from scratch. Bitcoin-knots and every other outbound container recover in one cycle. - core/archipelago/src/update.rs: host_sudo → pub(crate) so bootstrap.rs can reuse the existing systemd-run escape hatch. - apps/bitcoin-core/manifest.yml: bump app version 24.0.0 → 28.4.0 and image bitcoin/bitcoin:24.0 → bitcoin/bitcoin:28.4. Resources aligned with the real container-specs.sh large-disk tune (4 GiB memory cap, cpu_limit: 0 so bitcoind can run -par=auto across every core). - neode-ui/src/views/apps/AppCard.vue + Apps.vue: add an Update button + Updating spinner to every app card that has available-update set. Wires through serverStore.updatePackage(id) — the same RPC the detail view already calls. common.update / common.updating i18n keys added in en.json and es.json. - core/archipelago/src/identity_manager.rs: add create_from_signing_key() that mirrors an existing Ed25519 key as a manager-level identity with a deterministic id (`node-<pubkey16>`). 
Idempotent across restarts, gets the hex-SVG master avatar. - core/archipelago/src/server.rs: the auto-create path on first boot now mirrors the node's own signing_key (seed-derived on onboarded installs) as a "Node" identity instead of generating a random "Default" keypair. Once this ships, the DID on the Web5 DID Status card (via node.did RPC), the Node entry on the Identities page (via identity.list), and the DID used for peer-to-peer connects (via server_info.pubkey) all resolve to the same seed-derived pubkey. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 08:29:56 -04:00
.create_from_signing_key(
"Node".to_string(),
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
crate::identity_manager::IdentityPurpose::Personal,
release(v1.7.35-alpha): rootless-netns self-heal + app update button + bitcoin-core 28.4 + Node DID unification - core/archipelago/src/bootstrap.rs (NEW): embed scripts/container-doctor.sh and image-recipe/configs/archipelago-doctor.{service,timer} via include_str! and sync to disk + enable the timer on every archipelago startup. Idempotent (content-hash compare), dev-box symlink guard keeps the git checkout untouched, best-effort (warn-only on failure) so bootstrap never blocks server readiness. Wired in main.rs as a background tokio task. - scripts/container-doctor.sh: add fix_rootless_netns_egress(). Detects when the rootless-netns has lost its pasta tap (container-to-container still works but outbound DNS/TCP fails) via an nsenter probe into aardvark-dns; with a two-probe 10s debounce to rule out transients and a host-precheck that bails out if the host itself is offline. When the rootless-netns is truly broken, does a graceful podman stop --all / start --all so pasta + aardvark-dns rebuild the netns from scratch. Bitcoin-knots and every other outbound container recover in one cycle. - core/archipelago/src/update.rs: host_sudo → pub(crate) so bootstrap.rs can reuse the existing systemd-run escape hatch. - apps/bitcoin-core/manifest.yml: bump app version 24.0.0 → 28.4.0 and image bitcoin/bitcoin:24.0 → bitcoin/bitcoin:28.4. Resources aligned with the real container-specs.sh large-disk tune (4 GiB memory cap, cpu_limit: 0 so bitcoind can run -par=auto across every core). - neode-ui/src/views/apps/AppCard.vue + Apps.vue: add an Update button + Updating spinner to every app card that has available-update set. Wires through serverStore.updatePackage(id) — the same RPC the detail view already calls. common.update / common.updating i18n keys added in en.json and es.json. - core/archipelago/src/identity_manager.rs: add create_from_signing_key() that mirrors an existing Ed25519 key as a manager-level identity with a deterministic id (`node-<pubkey16>`). 
Idempotent across restarts, gets the hex-SVG master avatar. - core/archipelago/src/server.rs: the auto-create path on first boot now mirrors the node's own signing_key (seed-derived on onboarded installs) as a "Node" identity instead of generating a random "Default" keypair. Once this ships, the DID on the Web5 DID Status card (via node.did RPC), the Node entry on the Identities page (via identity.list), and the DID used for peer-to-peer connects (via server_info.pubkey) all resolve to the same seed-derived pubkey. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 08:29:56 -04:00
signing_key,
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
)
.await
{
Ok(record) => {
let _ = mgr.create_nostr_key(&record.id).await;
release(v1.7.35-alpha): rootless-netns self-heal + app update button + bitcoin-core 28.4 + Node DID unification - core/archipelago/src/bootstrap.rs (NEW): embed scripts/container-doctor.sh and image-recipe/configs/archipelago-doctor.{service,timer} via include_str! and sync to disk + enable the timer on every archipelago startup. Idempotent (content-hash compare), dev-box symlink guard keeps the git checkout untouched, best-effort (warn-only on failure) so bootstrap never blocks server readiness. Wired in main.rs as a background tokio task. - scripts/container-doctor.sh: add fix_rootless_netns_egress(). Detects when the rootless-netns has lost its pasta tap (container-to-container still works but outbound DNS/TCP fails) via an nsenter probe into aardvark-dns; with a two-probe 10s debounce to rule out transients and a host-precheck that bails out if the host itself is offline. When the rootless-netns is truly broken, does a graceful podman stop --all / start --all so pasta + aardvark-dns rebuild the netns from scratch. Bitcoin-knots and every other outbound container recover in one cycle. - core/archipelago/src/update.rs: host_sudo → pub(crate) so bootstrap.rs can reuse the existing systemd-run escape hatch. - apps/bitcoin-core/manifest.yml: bump app version 24.0.0 → 28.4.0 and image bitcoin/bitcoin:24.0 → bitcoin/bitcoin:28.4. Resources aligned with the real container-specs.sh large-disk tune (4 GiB memory cap, cpu_limit: 0 so bitcoind can run -par=auto across every core). - neode-ui/src/views/apps/AppCard.vue + Apps.vue: add an Update button + Updating spinner to every app card that has available-update set. Wires through serverStore.updatePackage(id) — the same RPC the detail view already calls. common.update / common.updating i18n keys added in en.json and es.json. - core/archipelago/src/identity_manager.rs: add create_from_signing_key() that mirrors an existing Ed25519 key as a manager-level identity with a deterministic id (`node-<pubkey16>`). 
Idempotent across restarts, gets the hex-SVG master avatar. - core/archipelago/src/server.rs: the auto-create path on first boot now mirrors the node's own signing_key (seed-derived on onboarded installs) as a "Node" identity instead of generating a random "Default" keypair. Once this ships, the DID on the Web5 DID Status card (via node.did RPC), the Node entry on the Identities page (via identity.list), and the DID used for peer-to-peer connects (via server_info.pubkey) all resolve to the same seed-derived pubkey. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 08:29:56 -04:00
tracing::info!(did = %record.did, "Auto-created Node identity mirroring node key");
}
Err(e) => tracing::debug!("Auto-identity creation (non-fatal): {}", e),
}
}
}
}
}
// DHT swarm-assist (Phase 3): build the iroh provider once at startup so
// release downloads can fetch from peers (origin always wins) and seed
// what they hold. Inert unless built with `iroh-swarm` AND swarm_enabled.
if let Err(e) = crate::swarm::init(
&config.data_dir,
&config.nostr_relays,
config.nostr_tor_proxy.as_deref(),
config.swarm_enabled,
)
.await
{
tracing::warn!("Swarm init (non-fatal, falling back to origin-only): {}", e);
}
// Revoke any previously published Nostr data (runs before publish so revocation is not overwritten)
let identity_dir = config.data_dir.join("identity");
let tor_proxy_revoke = config.nostr_tor_proxy.clone();
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
if let Err(e) =
nostr_discovery::revoke_if_needed(&identity_dir, tor_proxy_revoke.as_deref()).await
{
tracing::debug!("Nostr revoke (non-fatal): {}", e);
}
// Publish presence-only to Nostr (DID + Nostr pubkey, NO onion address).
// Onion addresses are exchanged privately via NIP-44 encrypted DMs.
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
if config.nostr_discovery_enabled && !config.nostr_relays.is_empty() {
let identity_dir = config.data_dir.join("identity");
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
let did =
identity::did_key_from_pubkey_hex(&data.server_info.pubkey).unwrap_or_default();
let version = data.server_info.version.clone();
let relays = config.nostr_relays.clone();
let tor_proxy = config.nostr_tor_proxy.clone();
tokio::spawn(async move {
if let Err(e) = nostr_handshake::publish_presence(
&identity_dir,
&did,
&version,
&relays,
tor_proxy.as_deref(),
)
.await
{
tracing::debug!("Nostr presence publish (non-fatal): {}", e);
}
});
}
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
info!(
"🔑 Node identity: {} (pubkey: {}...)",
identity.node_id(),
&identity.pubkey_hex()[..16.min(identity.pubkey_hex().len())]
);
let identity = Arc::new(identity);
// Create metrics store and spawn background collector
let metrics_store = Arc::new(MetricsStore::with_data_dir(config.data_dir.clone()).await);
let metrics_for_telemetry = metrics_store.clone();
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
crate::monitoring::spawn_metrics_collector(
metrics_store.clone(),
Some(state_manager.clone()),
Some(config.data_dir.clone()),
);
let api_handler = Arc::new(
ApiHandler::new(
config.clone(),
state_manager.clone(),
metrics_store,
orchestrator,
dev_orchestrator,
)
.await?,
);
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
2026-03-17 00:03:08 +00:00
// Initialize mesh networking service (always constructed; started only when the mesh config is enabled)
{
let data_dir = config.data_dir.clone();
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
let did =
identity::did_key_from_pubkey_hex(&data.server_info.pubkey).unwrap_or_default();
2026-03-17 00:03:08 +00:00
let pubkey_hex = identity.pubkey_hex();
let signing_key = identity.signing_key();
match crate::mesh::MeshService::new(&data_dir, signing_key, &did, &pubkey_hex).await {
Ok(mut mesh_service) => {
// Pass the human-readable server name for mesh adverts
mesh_service.set_server_name(data.server_info.name.clone());
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
let mut mesh_config = crate::mesh::load_config(&data_dir)
.await
.unwrap_or_default();
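// Auto-enable mesh when it is not currently enabled and a radio is detected.
// NOTE: the check is only `!mesh_config.enabled`, so this also applies to an
// existing config that has mesh disabled, not just to a missing config.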
if !mesh_config.enabled {
let devices = crate::mesh::detect_devices().await;
if !devices.is_empty() {
info!("📡 Auto-detected mesh radio: {:?} — enabling mesh", devices);
mesh_config.enabled = true;
mesh_config.device_path = Some(devices[0].clone());
let _ = crate::mesh::save_config(&data_dir, &mesh_config).await;
}
}
2026-03-17 00:03:08 +00:00
if mesh_config.enabled {
if let Err(e) = mesh_service.start() {
warn!("Mesh service start failed (non-fatal): {}", e);
} else {
info!("📡 Mesh networking started");
}
}
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
api_handler
.rpc_handler()
.set_mesh_service(mesh_service)
.await;
2026-03-17 00:03:08 +00:00
info!("📡 Mesh service initialized");
}
Err(e) => {
warn!("Mesh service init failed (non-fatal): {}", e);
}
}
}
// Initialize transport router (unified routing: mesh > lan > tor)
{
let data_dir = config.data_dir.clone();
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
let did =
identity::did_key_from_pubkey_hex(&data.server_info.pubkey).unwrap_or_default();
2026-03-17 00:03:08 +00:00
let pubkey_hex = identity.pubkey_hex();
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
let mesh_config = crate::mesh::load_config(&data_dir)
.await
.unwrap_or_default();
2026-03-17 00:03:08 +00:00
let mesh_only = mesh_config.mesh_only_mode.unwrap_or(false);
match crate::transport::PeerRegistry::load(&data_dir).await {
Ok(registry) => {
let registry = std::sync::Arc::new(registry);
let mut transports: Vec<Box<dyn crate::transport::NodeTransport>> = Vec::new();
// Tor transport (always register — availability checked dynamically)
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
transports.push(Box::new(crate::transport::tor::TorTransport::new(
&pubkey_hex,
)));
2026-03-17 00:03:08 +00:00
// Mesh transport (wraps the mesh service)
transports.push(Box::new(
crate::transport::mesh_transport::MeshTransport::new(
api_handler.rpc_handler().mesh_service_arc(),
),
));
// LAN transport (mDNS discovery)
let mut lan = crate::transport::lan::LanTransport::new(&did, &pubkey_hex, 5678);
match lan.start(registry.clone()) {
Ok(()) => info!("📡 LAN transport (mDNS) started"),
Err(e) => debug!("LAN transport init (non-fatal): {}", e),
}
transports.push(Box::new(lan));
let router = std::sync::Arc::new(crate::transport::TransportRouter::new(
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
transports, registry, mesh_only,
2026-03-17 00:03:08 +00:00
));
api_handler.rpc_handler().set_transport_router(router).await;
info!("📡 Transport router initialized (mesh_only={})", mesh_only);
}
Err(e) => {
warn!("Transport router init failed (non-fatal): {}", e);
}
}
}
// Register Archipelago DWN protocols (background, non-blocking)
{
let data_dir = config.data_dir.clone();
tokio::spawn(async move {
if let Err(e) = register_dwn_protocols(&data_dir).await {
debug!("DWN protocol registration (non-fatal): {}", e);
}
});
}
// Periodic Tor address refresh (runs regardless of dev_mode)
// Picks up hostname when Tor creates it after startup/rotation (30-60s delay)
{
let state = state_manager.clone();
let identity_clone = identity.clone();
tokio::spawn(async move {
let mut interval = tokio::time::interval(Duration::from_secs(30));
loop {
interval.tick().await;
if let Err(e) = refresh_tor_address(&state, identity_clone.as_ref()).await {
debug!("Tor address refresh (non-fatal): {}", e);
}
}
});
}
// Initialize container scanner — discovers installed apps from Podman/Docker
{
let scanner = create_docker_scanner(&config).await?;
let state = state_manager.clone();
let identity_clone = identity.clone();
2026-05-05 11:29:18 -04:00
let data_dir = config.data_dir.clone();
let scan_kick = api_handler.rpc_handler().scan_kick();
let scan_tick = api_handler.rpc_handler().scan_tick();
// Initial scan (delayed to let crash recovery finish first)
tokio::spawn(async move {
// Brief delay for containers to stabilize after boot
tokio::time::sleep(Duration::from_secs(3)).await;
info!("🐳 Scanning containers...");
// Tracks how many consecutive scans each container has been absent from.
// Prevents UI flapping when podman intermittently returns incomplete results.
let mut absence_tracker: HashMap<String, u32> = HashMap::new();
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
// Tracks when each container first entered a transitional state
// (Stopping / Starting / Restarting / ...). Used by the merge
// loop below to ignore podman's live state during a pending
// lifecycle op, and to break out if the spawned task dies
// without ever writing a final state.
let mut transitional_since: HashMap<String, Instant> = HashMap::new();
let mut scan_backoff_until: Option<Instant> = None;
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
if let Err(e) = scan_and_update_packages(
&scanner,
&state,
identity_clone.as_ref(),
2026-05-05 11:29:18 -04:00
&data_dir,
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
&mut absence_tracker,
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
&mut transitional_since,
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
)
.await
{
error!("Failed to scan containers: {}", e);
if is_podman_scan_timeout(&e) {
scan_backoff_until = Some(Instant::now() + Duration::from_secs(30));
warn!("Podman container scan timed out; backing off scans for 30s");
}
}
// Bump the scan-completion counter so any caller waiting on a
// kicked scan (install/update success path) can proceed.
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
// Periodic scan every 60 seconds (only broadcasts if state changed).
// Also wakes immediately when `scan_kick` fires — install/update
// success paths poke it so the fresh manifest (with populated
// interfaces) lands before they flip state to Running.
// Uses an in-flight guard to skip scans when a previous one is still running
let mut interval = tokio::time::interval(Duration::from_secs(60));
// Skip missed ticks instead of catching up — prevents burst of scans
// after a slow podman response (which causes DB lock storms)
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
2026-06-11 04:44:58 -04:00
let scanning = std::sync::Arc::new(AtomicBool::new(false));
loop {
tokio::select! {
_ = interval.tick() => {}
_ = scan_kick.notified() => {
debug!("Scan kicked by install/update success — running immediately");
}
}
if let Some(until) = scan_backoff_until {
if Instant::now() < until {
debug!("Skipping container scan — Podman scan backoff active");
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
continue;
}
}
2026-06-11 04:44:58 -04:00
let Some(_scan_guard) = ContainerScanGuard::try_acquire(&scanning) else {
debug!("Skipping container scan — previous scan still in progress");
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
continue;
2026-06-11 04:44:58 -04:00
};
let scan_result = scan_and_update_packages(
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
&scanner,
&state,
identity_clone.as_ref(),
2026-05-05 11:29:18 -04:00
&data_dir,
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
&mut absence_tracker,
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
&mut transitional_since,
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
)
2026-06-11 04:44:58 -04:00
.await;
if let Err(e) = scan_result {
error!("Failed to update containers: {}", e);
if is_podman_scan_timeout(&e) {
scan_backoff_until = Some(Instant::now() + Duration::from_secs(30));
warn!("Podman container scan timed out; backing off scans for 30s");
}
} else {
scan_backoff_until = None;
}
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
}
});
}
2026-01-24 22:59:20 +00:00
// Peer health monitoring: a detached background task that runs
// `check_peer_health` on a 5-minute interval. A tokio interval completes
// its first tick immediately, so the first check happens as soon as the
// task is scheduled.
{
    let data_dir = config.data_dir.clone();
    let state = state_manager.clone();
    tokio::spawn(async move {
        let mut ticker = tokio::time::interval(Duration::from_secs(300));
        loop {
            ticker.tick().await;
            // A failed check is only logged at debug level; the loop keeps going.
            match check_peer_health(&state, &data_dir).await {
                Ok(_) => {}
                Err(err) => debug!("Peer health check (non-fatal): {}", err),
            }
        }
    });
}
// FIPS seed-anchor apply loop. After an initial 30s delay (so the fips
// daemon has time to come up after onboarding), the configured seed
// anchors are loaded and re-applied to the running daemon every 5
// minutes (via `fipsctl connect`). Re-applying on a timer keeps mesh
// bootstrap resilient: anchors added by an operator in the UI get
// picked up, and a daemon restart or a flaky public anchor cannot
// strand the node.
{
    let data_dir = config.data_dir.clone();
    tokio::spawn(async move {
        tokio::time::sleep(Duration::from_secs(30)).await;
        let mut ticker = tokio::time::interval(Duration::from_secs(300));
        loop {
            ticker.tick().await;
            let anchors = match crate::fips::anchors::load(&data_dir).await {
                Ok(anchors) => anchors,
                Err(err) => {
                    tracing::debug!("Seed-anchor apply: load failed (non-fatal): {}", err);
                    continue;
                }
            };
            // No seed anchors configured yet: nothing to push this round.
            if anchors.is_empty() {
                continue;
            }
            // Best-effort: the outcome of `apply` is deliberately discarded.
            let _ = crate::fips::anchors::apply(&anchors).await;
        }
    });
}
// did:dht auto-refresh — re-publish DHT records every 2 hours
if config.nostr_discovery_enabled {
let data_dir = config.data_dir.clone();
tokio::spawn(async move {
let mut interval = tokio::time::interval(Duration::from_secs(7200));
loop {
interval.tick().await;
let identity_dir = data_dir.join("identity");
let node_key_path = identity_dir.join("node_key");
if !node_key_path.exists() {
continue;
}
match tokio::fs::read(&node_key_path).await {
Ok(key_bytes) if key_bytes.len() == 32 => {
let mut seed = [0u8; 32];
seed.copy_from_slice(&key_bytes);
let signing_key = ed25519_dalek::SigningKey::from_bytes(&seed);
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
match crate::network::did_dht::create_and_publish(&signing_key, &[])
.await
{
Ok(did) => tracing::info!(did = %did, "did:dht record refreshed"),
Err(e) => tracing::debug!("did:dht refresh (non-fatal): {}", e),
}
}
_ => {
tracing::debug!("did:dht refresh skipped: no valid node key");
}
}
}
});
}
// Periodic federation state sync. Every 30 minutes the task calls
// `federation::sync_with_peer` for every known peer whose trust level is
// not `Untrusted`, so `fips_npub` / transport badge / state updates
// propagate in the background instead of requiring a manual Sync click.
// Peers are staggered with a 5s pause after each one so we don't thunder
// the Tor SOCKS proxy. Sync itself already prefers FIPS.
{
    let state = state_manager.clone();
    let data_dir = config.data_dir.clone();
    tokio::spawn(async move {
        // Hold off for 60s after boot so onboarding can settle first.
        tokio::time::sleep(Duration::from_secs(60)).await;
        let mut ticker = tokio::time::interval(Duration::from_secs(1800));
        ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
        loop {
            ticker.tick().await;

            // Any failure while gathering prerequisites skips this round
            // silently; the next tick tries again.
            let nodes = match crate::federation::load_nodes(&data_dir).await {
                Ok(list) if !list.is_empty() => list,
                _ => continue,
            };
            let (snapshot, _) = state.get_snapshot().await;
            let local_did =
                match crate::identity::did_key_from_pubkey_hex(&snapshot.server_info.pubkey) {
                    Ok(did) => did,
                    Err(_) => continue,
                };
            let identity_dir = data_dir.join("identity");
            let node_identity =
                match crate::identity::NodeIdentity::load_or_create(&identity_dir).await {
                    Ok(loaded) => loaded,
                    Err(_) => continue,
                };

            let syncable = nodes
                .iter()
                .filter(|peer| peer.trust_level != crate::federation::TrustLevel::Untrusted);
            for node in syncable {
                let outcome = crate::federation::sync_with_peer(
                    &data_dir,
                    node,
                    &local_did,
                    |bytes| node_identity.sign(bytes),
                )
                .await;
                // Only the first 20 characters of the DID are logged.
                let short_did = || node.did.chars().take(20).collect::<String>();
                if let Err(err) = outcome {
                    debug!("Periodic federation sync with {}: {}", short_did(), err);
                } else {
                    debug!("Periodic federation sync ok: {}", short_did());
                }
                tokio::time::sleep(Duration::from_secs(5)).await;
            }
        }
    });
}
// Container health monitoring — auto-restart unhealthy containers
// Respects webhook config: skips when disabled or ContainerCrash not subscribed
crate::health_monitor::spawn_health_monitor(state_manager.clone(), config.data_dir.clone());
// Periodic telemetry reporter (every 15 min when opted in)
crate::monitoring::spawn_telemetry_reporter(
metrics_for_telemetry,
Some(state_manager.clone()),
config.data_dir.clone(),
);
fix(fips,iso): bulletproof FIPS from install — no Activate button needed Problems addressed (all observed on .198): * fips_key was written as raw 32 bytes; upstream fips daemon reads it with read_to_string() and bailed with "stream did not contain valid UTF-8", crashlooping indefinitely. * Activate button racy: user had to hit it, and it would keep failing silently because the daemon couldn't parse its own config. * FIPS schema drift (already fixed in 7d8a5864) put the config write path behind the same broken "Activate" flow, so the fix alone didn't help existing nodes. * Journal was on tmpfs — every reboot wiped install/onboarding history, making post-hoc debugging impossible. Changes: * identity.rs: write fips_key as bech32 nsec + newline. load_fips_keys now auto-migrates legacy 32-byte files to bech32 the first time it reads them, so OTA updates from v1.5.0-alpha self-heal without user action. * server.rs: post-onboarding auto-activate task runs on every archipelago startup. If fips_key exists it ensures /etc/fips/fips.yaml is schema-current and starts archipelago-fips.service. Pre-onboarding nodes stay quiet (guarded on fips_key_exists). * ISO build: un-mask archipelago-fips + archipelago-wg + wg-address — all use ConditionPathExists on their key files, so systemd silently skips them pre-onboarding (no MOTD [FAILED]). Only nostr-vpn stays masked (legacy service, superseded by upstream fips). * Journald made persistent via /var/log/journal + 500M cap, so install and first-boot logs survive reboots for diagnosis. After this, a fresh install + onboarding should bring FIPS up automatically with no user interaction. The UI "Activate" button can stay as an escape hatch (the RPC is still there) but is no longer on the critical path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 16:33:21 -04:00
// Post-onboarding auto-activation for archipelago-fips. Runs once
// at startup: if fips_key is on disk, install /etc/fips/fips.yaml
// (schema-refreshed) and start the service. This removes the
// need for a user-facing "Activate" button — the node comes up
// with FIPS running whenever the seed has been onboarded. Also
// self-heals legacy raw-byte fips.key files (load_fips_keys
// rewrites them as bech32 nsec the first time they're read).
// Pre-onboarding nodes: ConditionPathExists on the service unit
// + the `fips_key_exists` guard here keep this quiet.
{
let data_dir = config.data_dir.clone();
tokio::spawn(async move {
let identity_dir = data_dir.join("identity");
if !crate::identity::fips_key_exists(&identity_dir) {
tracing::debug!("FIPS auto-activate skipped: fips_key not on disk");
return;
}
// Trigger the migration path in load_fips_keys so old raw-byte
// key files are rewritten as bech32 before fips.yaml install.
if let Err(e) = crate::identity::load_fips_keys(&identity_dir).await {
tracing::warn!("FIPS key load/migrate failed: {}", e);
return;
}
// Check if the installed fips.yaml matches what we'd
// render now. If not, we need to restart the daemon after
// reinstalling so it picks up schema changes (e.g. the
// v1.7.25 re-addition of the TCP transport). Without this,
// OTA'd nodes would be stuck on the old UDP-only config
// until someone manually clicked Reconnect.
let expected = crate::fips::config::render_config_yaml();
let installed = tokio::fs::read_to_string("/etc/fips/fips.yaml").await.ok();
let config_changed = installed.as_deref() != Some(expected.as_str());
fix(fips,iso): bulletproof FIPS from install — no Activate button needed Problems addressed (all observed on .198): * fips_key was written as raw 32 bytes; upstream fips daemon reads it with read_to_string() and bailed with "stream did not contain valid UTF-8", crashlooping indefinitely. * Activate button racy: user had to hit it, and it would keep failing silently because the daemon couldn't parse its own config. * FIPS schema drift (already fixed in 7d8a5864) put the config write path behind the same broken "Activate" flow, so the fix alone didn't help existing nodes. * Journal was on tmpfs — every reboot wiped install/onboarding history, making post-hoc debugging impossible. Changes: * identity.rs: write fips_key as bech32 nsec + newline. load_fips_keys now auto-migrates legacy 32-byte files to bech32 the first time it reads them, so OTA updates from v1.5.0-alpha self-heal without user action. * server.rs: post-onboarding auto-activate task runs on every archipelago startup. If fips_key exists it ensures /etc/fips/fips.yaml is schema-current and starts archipelago-fips.service. Pre-onboarding nodes stay quiet (guarded on fips_key_exists). * ISO build: un-mask archipelago-fips + archipelago-wg + wg-address — all use ConditionPathExists on their key files, so systemd silently skips them pre-onboarding (no MOTD [FAILED]). Only nostr-vpn stays masked (legacy service, superseded by upstream fips). * Journald made persistent via /var/log/journal + 500M cap, so install and first-boot logs survive reboots for diagnosis. After this, a fresh install + onboarding should bring FIPS up automatically with no user interaction. The UI "Activate" button can stay as an escape hatch (the RPC is still there) but is no longer on the critical path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 16:33:21 -04:00
if let Err(e) = crate::fips::config::install(&identity_dir).await {
tracing::warn!("FIPS config install failed on startup: {}", e);
return;
}
if config_changed {
tracing::info!(
"FIPS config schema changed on disk — restarting daemon to pick up new transports"
);
// Restart whichever unit is actually supervising
// the daemon (archipelago-fips vs upstream fips).
let unit = crate::fips::service::active_unit().await;
if let Err(e) = crate::fips::service::restart(unit).await {
tracing::warn!(
"FIPS restart after config migration failed on {}: {} — user can retry via fips.reconnect",
unit,
e
);
}
}
fix(fips,iso): bulletproof FIPS from install — no Activate button needed Problems addressed (all observed on .198): * fips_key was written as raw 32 bytes; upstream fips daemon reads it with read_to_string() and bailed with "stream did not contain valid UTF-8", crashlooping indefinitely. * Activate button racy: user had to hit it, and it would keep failing silently because the daemon couldn't parse its own config. * FIPS schema drift (already fixed in 7d8a5864) put the config write path behind the same broken "Activate" flow, so the fix alone didn't help existing nodes. * Journal was on tmpfs — every reboot wiped install/onboarding history, making post-hoc debugging impossible. Changes: * identity.rs: write fips_key as bech32 nsec + newline. load_fips_keys now auto-migrates legacy 32-byte files to bech32 the first time it reads them, so OTA updates from v1.5.0-alpha self-heal without user action. * server.rs: post-onboarding auto-activate task runs on every archipelago startup. If fips_key exists it ensures /etc/fips/fips.yaml is schema-current and starts archipelago-fips.service. Pre-onboarding nodes stay quiet (guarded on fips_key_exists). * ISO build: un-mask archipelago-fips + archipelago-wg + wg-address — all use ConditionPathExists on their key files, so systemd silently skips them pre-onboarding (no MOTD [FAILED]). Only nostr-vpn stays masked (legacy service, superseded by upstream fips). * Journald made persistent via /var/log/journal + 500M cap, so install and first-boot logs survive reboots for diagnosis. After this, a fresh install + onboarding should bring FIPS up automatically with no user interaction. The UI "Activate" button can stay as an escape hatch (the RPC is still there) but is no longer on the critical path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 16:33:21 -04:00
if let Err(e) = crate::fips::service::activate(crate::fips::SERVICE_UNIT).await {
tracing::warn!(
"archipelago-fips activate failed on startup: {} — user can retry via fips.install RPC",
e
);
return;
}
tracing::info!("archipelago-fips auto-activated on startup");
});
}
2026-01-24 22:59:20 +00:00
Ok(Self {
_config: config,
_identity: identity,
2026-01-24 22:59:20 +00:00
api_handler,
_state_manager: state_manager,
2026-01-24 22:59:20 +00:00
})
}
/// Serve with a graceful shutdown signal.
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
///
/// `main_addr` is the primary listener (historically `127.0.0.1:5678`).
/// The main listener always comes up on `main_addr`. The FIPS peer
/// listener (path-filtered, bound to `fips0`'s ULA) is managed by a
/// late-binding task that polls every 30s: if fips0 isn't up at
/// startup (pre-onboarding install, legacy node pre-fips.install),
/// it keeps trying until the interface appears — no archipelago
/// restart required after the user activates FIPS.
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
///
/// When `shutdown` completes, both listeners stop accepting and drain
/// in-flight requests (bounded by `DRAIN_TIMEOUT`).
pub async fn serve_with_shutdown(
&self,
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
main_addr: SocketAddr,
shutdown: impl std::future::Future<Output = ()>,
) -> Result<()> {
let active_connections = Arc::new(tokio::sync::Semaphore::new(1024));
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
let (tx, rx_main) = tokio::sync::watch::channel(false);
let main_task = tokio::spawn(accept_loop(
self.api_handler.clone(),
TcpListener::bind(main_addr).await?,
active_connections.clone(),
false, // main listener: no path filter
rx_main,
main_addr,
));
// Peer listener: late-binding so we don't need an archipelago
// restart when fips0 comes up after onboarding.
let peer_task = tokio::spawn(peer_late_bind_loop(
self.api_handler.clone(),
active_connections.clone(),
tx.subscribe(),
));
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
shutdown.await;
info!("Shutdown signal received, draining connections...");
let _ = tx.send(true);
// Wait up to 5s for in-flight requests.
let drain_start = std::time::Instant::now();
let drain_timeout = std::time::Duration::from_secs(5);
while active_connections.available_permits() < 1024 {
if drain_start.elapsed() > drain_timeout {
warn!("Drain timeout reached, forcing shutdown");
break;
}
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
}
2026-01-24 22:59:20 +00:00
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
let _ = main_task.await;
let _ = peer_task.await;
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
info!("Shutdown complete");
Ok(())
}
}
/// Poll every 30s for `fips0`'s ULA; when it appears, bind the peer
/// listener and run the normal accept loop. If the bind fails (port
/// already taken, permissions), log and keep retrying. Returns on
/// shutdown. First tick fires immediately so the hot path for
/// already-up fips0 is still zero-cost.
///
/// Shutdown is signalled either by `true` arriving on `shutdown_rx` or by
/// the sending half being dropped; both end the loop.
async fn peer_late_bind_loop(
    handler: Arc<ApiHandler>,
    active_connections: Arc<tokio::sync::Semaphore>,
    mut shutdown_rx: tokio::sync::watch::Receiver<bool>,
) {
    let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
    // A late tick should push the schedule back rather than fire a burst
    // of catch-up probes.
    interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
    loop {
        tokio::select! {
            _ = interval.tick() => {
                // Interface not up (yet): try again on the next tick.
                let Some(ip) = crate::fips::iface::fips0_ula() else { continue };
                let addr = SocketAddr::new(
                    std::net::IpAddr::V6(ip),
                    crate::fips::dial::PEER_PORT,
                );
                let listener = match TcpListener::bind(addr).await {
                    Ok(l) => l,
                    Err(e) => {
                        warn!("FIPS peer listener bind {} failed: {} — retrying in 30s", addr, e);
                        continue;
                    }
                };
                info!("FIPS peer listener bound {}", addr);
                // Once bound, serve until shutdown fires. accept_loop
                // returns on shutdown, which also ends this outer loop.
                accept_loop(
                    handler,
                    listener,
                    active_connections,
                    true, // peer listener: apply path filter
                    shutdown_rx,
                    addr,
                )
                .await;
                return;
            }
            changed = shutdown_rx.changed() => {
                // `Err` means the sender is gone and no signal can ever
                // arrive. Treat it as shutdown: otherwise this branch
                // would complete immediately on every iteration and the
                // task would spin without ever exiting.
                if changed.is_err() || *shutdown_rx.borrow() {
                    return;
                }
            }
        }
    }
}
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
/// Whitelist of HTTP paths reachable via the peer-facing (FIPS) listener.
/// Every entry is an endpoint already protected by cryptographic auth
/// (ed25519 signature verification inside the handler, federation DID
/// headers checked by the content server, or JSON-RPC methods whose
/// handlers verify per-message signatures).
///
/// Anything not on this list returns 404 on the peer listener.
///
/// `path` is expected to be the raw request path (no query string).
/// Paths under `/content/` are accepted by prefix, except when any segment
/// is a parent-directory reference (`..`, including its percent-encoded
/// spellings such as `%2e%2e`): such a path would resolve outside
/// `/content/` if anything downstream normalises it, so it is rejected
/// here rather than relying on every handler to cope.
pub fn is_peer_allowed_path(path: &str) -> bool {
    /// True when `segment` is `..` in plain or percent-encoded form.
    fn is_parent_dir_segment(segment: &str) -> bool {
        ["..", "%2e.", ".%2e", "%2e%2e"]
            .iter()
            .any(|spelling| segment.eq_ignore_ascii_case(spelling))
    }

    // Exact matches
    let exact = matches!(
        path,
        "/health"
            | "/rpc/v1"
            | "/archipelago/node-message"
            | "/archipelago/mesh-typed"
            | "/dwn"
            | "/transport/inbox"
            // Content *catalog* — the peer-browse entry point. This is the
            // exact path `/content` (no trailing slash); the prefix match
            // below only covers `/content/<id>` item fetches, so without
            // this the catalog 404s over the mesh and `content.browse-peer`
            // fails with "Peer returned error: 404 Not Found" (and never
            // falls back to Tor, since a 404 is a successful HTTP exchange).
            | "/content"
    );
    if exact {
        return true;
    }

    // Prefix-matched content endpoints (peer file browse + fetch)
    match path.strip_prefix("/content/") {
        Some(rest) => !rest.split('/').any(is_parent_dir_segment),
        None => false,
    }
}
async fn accept_loop(
handler: Arc<ApiHandler>,
listener: TcpListener,
active_connections: Arc<tokio::sync::Semaphore>,
peer_only: bool,
mut shutdown_rx: tokio::sync::watch::Receiver<bool>,
local_addr: SocketAddr,
) {
loop {
tokio::select! {
result = listener.accept() => {
let (stream, peer_addr) = match result {
Ok(c) => c,
Err(e) => {
error!("{} accept error: {}", local_addr, e);
continue;
}
};
let handler = handler.clone();
let permit = active_connections.clone().acquire_owned().await;
tokio::spawn(async move {
let _permit = permit;
let service = service_fn(move |req: hyper::Request<hyper::Body>| {
let handler = handler.clone();
async move {
if peer_only && !is_peer_allowed_path(req.uri().path()) {
let resp = hyper::Response::builder()
.status(hyper::StatusCode::NOT_FOUND)
.body(hyper::Body::empty())
.expect("static response builds");
return Ok::<_, std::io::Error>(resp);
}
handler
.handle_request(req)
.await
.map_err(|e| std::io::Error::other(format!("{}", e)))
}
});
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
if let Err(e) = Http::new()
.http1_keep_alive(false)
.serve_connection(stream, service)
.with_upgrades()
.await
{
error!("Error serving connection from {}: {}", peer_addr, e);
2026-01-24 22:59:20 +00:00
}
feat(fips): peer dialing + dedicated fips0 listener with path whitelist Wires the FIPS transport end-to-end so peer-to-peer calls can reach other nodes over the mesh without going through Tor: - fips::dial — raw RFC 1035 DNS client (zero new deps) that queries the FIPS daemon's local resolver at 127.0.0.1:5354 for `<npub>.fips` AAAA records. Exposes peer_base_url(npub) → "http://[fd9d:…]:5679" plus a reqwest client factory for call-site migrations. - fips::iface — parses /proc/net/if_inet6 to find the ULA address on `fips0`. Runs under the archipelago service user without extra caps. - FipsTransport::is_available() — live probe of archipelago-fips and upstream fips.service via `systemctl is-active`, cached 10s so the send hot path doesn't thrash DBus. - FipsTransport::send() — resolve npub, POST TransportMessage JSON to the peer's /transport/inbox. Today /transport/inbox isn't wired on the receive side, so call-site migrations use dial::peer_base_url directly against the already-signed endpoints (/rpc/v1, /archipelago/node-message, /content/*). The inbox handler lands as part of the Settings/transport work. - server::serve_with_shutdown — takes an optional peer_addr and spawns a second listener bound specifically to the fips0 ULA on port 5679. The peer listener applies is_peer_allowed_path() — a whitelist of endpoints that already do per-request signature auth — and returns 404 for everything else. Shutdown cascades to both listeners via a watch channel; 5s drain window preserved. - main.rs — if fips0 has a ULA at startup, pass the peer SocketAddr to serve_with_shutdown; otherwise run the main listener only. Security: the peer listener is bound to the fips0 ULA directly, not wildcard, so it's unreachable from WAN IPv6. The path whitelist limits exposure to endpoints whose handlers verify ed25519 signatures or federation DID headers server-side. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:12:39 -04:00
});
}
_ = shutdown_rx.changed() => {
if *shutdown_rx.borrow() {
return;
2026-01-24 22:59:20 +00:00
}
}
2026-01-24 22:59:20 +00:00
}
}
}
async fn create_docker_scanner(config: &Config) -> Result<DockerPackageScanner> {
let user = std::env::var("USER").unwrap_or_else(|_| "archipelago".to_string());
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
let runtime: Arc<dyn archipelago_container::ContainerRuntime> = match &config.container_runtime
{
ContainerRuntime::Podman => {
Arc::new(archipelago_container::PodmanRuntime::new(user.clone()))
}
ContainerRuntime::Docker => {
Arc::new(archipelago_container::DockerRuntime::new(user.clone()))
}
ContainerRuntime::Auto => {
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
Arc::new(archipelago_container::AutoRuntime::new(user.clone()).await?)
}
};
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
Ok(DockerPackageScanner::new(runtime))
}
/// Re-read the Tor address via `docker_packages::read_tor_address` and,
/// when it differs from the one in the current state snapshot, store the
/// new value together with the node address derived from it.
///
/// Nothing is written when the address is unchanged. The update is logged
/// only when the new address is `Some`. In the code visible here every
/// path returns `Ok(())`.
async fn refresh_tor_address(state: &StateManager, identity: &NodeIdentity) -> Result<()> {
    let tor_addr = docker_packages::read_tor_address("archipelago").await;
    let (mut data, _) = state.get_snapshot().await;

    // Unchanged address: leave the stored state alone.
    if tor_addr == data.server_info.tor_address {
        return Ok(());
    }

    // The node address is derived from the Tor address, so both fields
    // are refreshed together (and cleared together when it is `None`).
    let node_address = tor_addr.as_ref().map(|t| identity.node_address(t));
    data.server_info.tor_address = tor_addr.clone();
    data.server_info.node_address = node_address;
    state.update_data(data).await;

    if let Some(addr) = &tor_addr {
        info!("🔒 Tor address updated: {}", addr);
    }
    Ok(())
}
/// Number of consecutive scans in which a container must be absent before
/// it is removed from state.
///
/// At the 30s scan interval this is 3 scans × 30s = 90 seconds of absence
/// before removal.
const CONTAINER_ABSENCE_THRESHOLD: u32 = 3;
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
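/// Maximum time a package entry may remain stuck in a transitional state
/// before the scan loop overrides it with podman's live state.
///
/// This is the default returned by `transitional_stuck_timeout` for every
/// transitional variant that does not use `INSTALLING_STUCK_TIMEOUT`. Once
/// it elapses we assume the spawned task died (panic, OOM, process restart
/// mid-stop) and fall back to the scanner's authoritative view.
///
/// NOTE(review): the original rationale sized this at 2× the longest
/// single-container stop timeout (bitcoin-core at 600s), i.e. 1200s. The
/// value is now 120s, which is shorter than that stop timeout — confirm
/// this is intended.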
2026-05-13 15:09:22 -04:00
const TRANSITIONAL_STUCK_TIMEOUT: Duration = Duration::from_secs(120);
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
2026-05-17 22:13:21 -04:00
/// Stuck-timeout (20 minutes) that `transitional_stuck_timeout` returns for
/// the `Installing`, `Starting` and `Restarting` states.
///
/// Multi-container installs can legitimately spend several minutes before the
/// primary user-facing container exists. BTCPay, for example, pulls/starts
/// Postgres and NBXplorer before `btcpay-server`; do not erase its installing
/// card just because the primary container is absent during that setup window.
const INSTALLING_STUCK_TIMEOUT: Duration = Duration::from_secs(20 * 60);
/// Returns how long a package may stay in the transitional `state` before the
/// scan loop treats it as stuck.
///
/// `Installing`, `Starting` and `Restarting` get `INSTALLING_STUCK_TIMEOUT`;
/// every other state gets `TRANSITIONAL_STUCK_TIMEOUT`.
fn transitional_stuck_timeout(state: &crate::data_model::PackageState) -> Duration {
    use crate::data_model::PackageState::*;
    match state {
        Installing | Starting | Restarting => INSTALLING_STUCK_TIMEOUT,
        _ => TRANSITIONAL_STUCK_TIMEOUT,
    }
}
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
/// Returns true if `state` is one of the transitional variants that a
/// `spawn_transitional`-style background task owns. While such a state is
/// set, the package scanner must not overwrite it with whatever podman
/// reports (see `merge_preserving_transitional`).
fn is_transitional(state: &crate::data_model::PackageState) -> bool {
use crate::data_model::PackageState::*;
matches!(
state,
Installing
| Stopping
| Starting
| Restarting
| Updating
| Removing
| CreatingBackup
| RestoringBackup
| BackingUp
)
}
fn absent_transitional_replacement(
state: &crate::data_model::PackageState,
) -> Option<crate::data_model::PackageState> {
match state {
// A stop operation is complete once the container record disappears.
// Do not leave the app card wedged in "Stopping..." just because the
// background task died or the backend restarted before it wrote back.
crate::data_model::PackageState::Stopping => Some(crate::data_model::PackageState::Stopped),
_ => None,
}
}
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
/// Merge a fresh scan entry `fresh` into `existing` while preserving
/// `existing.state` (which is transitional — the RPC spawn task owns it).
/// Non-state observability fields are taken from `fresh` so the UI still
/// sees live health / exit_code / lan_address readings during a transition.
fn merge_preserving_transitional(
existing: &crate::data_model::PackageDataEntry,
fresh: &crate::data_model::PackageDataEntry,
user_stop_requested: bool,
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
) -> crate::data_model::PackageDataEntry {
2026-05-05 11:29:18 -04:00
let state = match (&existing.state, &fresh.state) {
// A user-initiated stop must keep showing Stopping while podman still
// reports Running. Repair/restart transitions do not have a user-stop
// marker, so a fresh Running scan means the app recovered.
(crate::data_model::PackageState::Stopping, crate::data_model::PackageState::Running)
if !user_stop_requested =>
{
fresh.state.clone()
}
2026-05-05 11:29:18 -04:00
// Removing with a live running container is stale: uninstall either
// failed or Archipelago restarted before the spawned task could revert
// state. Let the scanner recover the UI immediately instead of
// keeping the app wedged in Removing for 20 minutes.
(crate::data_model::PackageState::Removing, crate::data_model::PackageState::Running) => {
fresh.state.clone()
}
_ => existing.state.clone(),
};
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
crate::data_model::PackageDataEntry {
2026-05-05 11:29:18 -04:00
state,
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
// install_progress and uninstall_stage are also owned by the
// initiating op (same reason as state) — keep them.
install_progress: existing.install_progress.clone(),
uninstall_stage: existing.uninstall_stage.clone(),
// Everything else comes from the fresh scan.
health: fresh.health.clone(),
exit_code: fresh.exit_code,
static_files: fresh.static_files.clone(),
manifest: fresh.manifest.clone(),
installed: fresh.installed.clone(),
available_update: fresh.available_update.clone(),
}
}
/// Reports whether `error`, rendered with the alternate (`{:#}`) formatter,
/// mentions both `podman ps` and `timed out`.
fn is_podman_scan_timeout(error: &anyhow::Error) -> bool {
    let rendered = format!("{:#}", error);
    ["podman ps", "timed out"]
        .iter()
        .all(|needle| rendered.contains(needle))
}
async fn scan_and_update_packages(
scanner: &DockerPackageScanner,
state: &StateManager,
identity: &NodeIdentity,
2026-05-05 11:29:18 -04:00
data_dir: &std::path::Path,
absence_tracker: &mut HashMap<String, u32>,
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
transitional_since: &mut HashMap<String, Instant>,
) -> Result<()> {
2026-05-05 11:29:18 -04:00
let mut packages = scanner.scan_containers().await?;
let user_stopped = crate::crash_recovery::load_user_stopped(data_dir).await;
for (id, pkg) in packages.iter_mut() {
if pkg.state == crate::data_model::PackageState::Exited && user_stopped.contains(id) {
pkg.state = crate::data_model::PackageState::Stopped;
pkg.exit_code = None;
}
}
normalize_reachable_package_health(&mut packages).await;
let (current_data, _) = state.get_snapshot().await;
let tor_addr = docker_packages::read_tor_address("archipelago").await;
let tor_changed = tor_addr != current_data.server_info.tor_address;
let first_scan = !current_data.server_info.status_info.containers_scanned;
// Check if update scheduler has found an available update
let update_available = crate::update::load_state(std::path::Path::new("/var/lib/archipelago"))
.await
.map(|s| s.available_update.is_some())
.unwrap_or(false);
let update_changed = update_available != current_data.server_info.status_info.updated;
// Empty scan result = podman failure or timeout, preserve existing state
if packages.is_empty() && !first_scan {
if tor_changed || update_changed {
let mut data = current_data;
data.server_info.tor_address = tor_addr.clone();
data.server_info.node_address = tor_addr.as_ref().map(|t| identity.node_address(t));
data.server_info.status_info.updated = update_available;
state.update_data(data).await;
}
return Ok(());
}
// Merge scan results with current state instead of full replacement.
// This prevents containers from vanishing when podman intermittently
// returns incomplete results under heavy load.
let mut merged = current_data.package_data.clone();
let mut changed = false;
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
// Update/add containers found in this scan.
//
// Transitional states (Stopping, Starting, Restarting, Installing,
// Updating, Removing, backup variants) are owned by the RPC spawn_task
// that initiated the operation — podman's live state during the op is
// meaningless ("running" during a graceful stop, "exited" during a
// restart, etc.) and must not be written back. See
// `merge_preserving_transitional` for the exact rule.
//
// Escape hatch: if a package has been in a transitional state for
// longer than the timeout `transitional_stuck_timeout` returns for that
// state, we assume the spawned task died without cleanup and let the
// scan override it.
let now = Instant::now();
for (id, pkg) in &packages {
absence_tracker.remove(id);
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
let existing = merged.get(id);
let overwrite = match existing {
Some(existing_entry) if is_transitional(&existing_entry.state) => {
let entered = *transitional_since.entry(id.clone()).or_insert(now);
2026-05-17 22:13:21 -04:00
let timeout = transitional_stuck_timeout(&existing_entry.state);
let stuck = now.duration_since(entered) > timeout;
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
if stuck {
warn!(
"Container {} stuck in {:?} for >{}s; overriding with scan state {:?}",
id,
existing_entry.state,
2026-05-17 22:13:21 -04:00
timeout.as_secs(),
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
pkg.state
);
transitional_since.remove(id);
true
} else {
// Keep existing transitional state, but merge non-state
// observability fields (health, exit_code, lan_address
// via installed) from the fresh scan so the UI still
// sees live readings.
let merged_entry = merge_preserving_transitional(
existing_entry,
pkg,
user_stopped.contains(id),
);
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
if existing.cloned() != Some(merged_entry.clone()) {
merged.insert(id.clone(), merged_entry);
changed = true;
}
false
}
}
Some(_) => {
// Not transitional: the side-table may hold a stale entry
// from a previous transition on this id; drop it.
transitional_since.remove(id);
existing != Some(pkg)
}
None => {
transitional_since.remove(id);
true
}
};
if overwrite && merged.get(id) != Some(pkg) {
merged.insert(id.clone(), pkg.clone());
changed = true;
}
}
// Track containers in state but missing from this scan.
// Only remove after CONTAINER_ABSENCE_THRESHOLD consecutive absent scans.
let current_ids: Vec<String> = merged.keys().cloned().collect();
for id in current_ids {
if !packages.contains_key(&id) {
chore: release v1.7.45-alpha Resilience-validated release. Three full sweeps of the new resilience harness against .228 confirm no shipstoppers. Big user-visible: - Bitcoin RPC auth durably correct via host-rendered nginx.conf bind-mount, replaces fragile post-start exec that failed under restricted-cap rootless podman ("crun: write cgroup.procs: Permission denied") - Multi-container stack installs (indeedhub, immich, btcpay, mempool) now emit phase events at every boundary so the progress bar advances - Apps no longer vanish from the dashboard mid-install (absent-scanner skips packages in transitional states) - Indeedhub fresh installs work end-to-end (was 8500+ restart loop): five missing env vars (DATABASE_PORT, QUEUE_HOST, QUEUE_PORT, S3_PRIVATE_BUCKET_NAME, AES_MASTER_SECRET) added to install code - Tailscale install fixed: --entrypoint string was being passed as a single shell-line arg; switched to custom_args array - Catalog cleaned of broken entries (dwn, endurain, ollama removed; nextcloud restored on docker.io) - Bitcoin Core update path uses correct image (was looking for nonexistent lfg2025/bitcoin:28.4) - ISO installs now allocate swap on the encrypted data partition Infra: - New resilience harness (scripts/resilience/) — black-box state-machine tester, every app × every transition. Run before each release. Sweep #3 final: PASS 107 / FAIL 12 / SKIP 14. The 12 fails are 1 cosmetic (homeassistant trusted_hosts), 8 harness/timing false-positives, and 3 non-shipstopper tracked items. Down from 23 in baseline sweep #1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 12:31:45 -04:00
// Don't evict packages mid-transition: Installing/Updating/Removing
// legitimately have no live container yet (image still pulling) or
// briefly (during recreate). The absence-eviction here was racing
// installs and removing apps from the UI 14s in. The transitional
// owner (spawn_task) is responsible for clearing state, not us.
if let Some(entry) = merged.get(&id) {
if is_transitional(&entry.state) {
if let Some(replacement) = absent_transitional_replacement(&entry.state) {
let mut updated = entry.clone();
updated.state = replacement;
updated.health = None;
updated.exit_code = None;
updated.install_progress = None;
updated.uninstall_stage = None;
merged.insert(id.clone(), updated);
transitional_since.remove(&id);
absence_tracker.remove(&id);
changed = true;
continue;
}
2026-05-05 11:29:18 -04:00
let entered = *transitional_since.entry(id.clone()).or_insert(now);
2026-05-17 22:13:21 -04:00
let timeout = transitional_stuck_timeout(&entry.state);
if now.duration_since(entered) > timeout {
2026-05-05 11:29:18 -04:00
warn!(
"Container {} stuck in {:?} and absent for >{}s; removing stale transitional state",
id,
entry.state,
2026-05-17 22:13:21 -04:00
timeout.as_secs()
2026-05-05 11:29:18 -04:00
);
merged.remove(&id);
transitional_since.remove(&id);
changed = true;
}
chore: release v1.7.45-alpha Resilience-validated release. Three full sweeps of the new resilience harness against .228 confirm no shipstoppers. Big user-visible: - Bitcoin RPC auth durably correct via host-rendered nginx.conf bind-mount, replaces fragile post-start exec that failed under restricted-cap rootless podman ("crun: write cgroup.procs: Permission denied") - Multi-container stack installs (indeedhub, immich, btcpay, mempool) now emit phase events at every boundary so the progress bar advances - Apps no longer vanish from the dashboard mid-install (absent-scanner skips packages in transitional states) - Indeedhub fresh installs work end-to-end (was 8500+ restart loop): five missing env vars (DATABASE_PORT, QUEUE_HOST, QUEUE_PORT, S3_PRIVATE_BUCKET_NAME, AES_MASTER_SECRET) added to install code - Tailscale install fixed: --entrypoint string was being passed as a single shell-line arg; switched to custom_args array - Catalog cleaned of broken entries (dwn, endurain, ollama removed; nextcloud restored on docker.io) - Bitcoin Core update path uses correct image (was looking for nonexistent lfg2025/bitcoin:28.4) - ISO installs now allocate swap on the encrypted data partition Infra: - New resilience harness (scripts/resilience/) — black-box state-machine tester, every app × every transition. Run before each release. Sweep #3 final: PASS 107 / FAIL 12 / SKIP 14. The 12 fails are 1 cosmetic (homeassistant trusted_hosts), 8 harness/timing false-positives, and 3 non-shipstopper tracked items. Down from 23 in baseline sweep #1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 12:31:45 -04:00
absence_tracker.remove(&id);
continue;
}
2026-05-13 15:09:22 -04:00
// Quadlet-generated units run containers with `--rm`, so a
// clean user stop removes the Podman record. Keep the package
// visible as Stopped while the user-stopped marker exists so
// package.start can recreate it via systemd/Quadlet.
if entry.state == crate::data_model::PackageState::Stopped
&& user_stopped.contains(&id)
{
absence_tracker.remove(&id);
continue;
}
chore: release v1.7.45-alpha Resilience-validated release. Three full sweeps of the new resilience harness against .228 confirm no shipstoppers. Big user-visible: - Bitcoin RPC auth durably correct via host-rendered nginx.conf bind-mount, replaces fragile post-start exec that failed under restricted-cap rootless podman ("crun: write cgroup.procs: Permission denied") - Multi-container stack installs (indeedhub, immich, btcpay, mempool) now emit phase events at every boundary so the progress bar advances - Apps no longer vanish from the dashboard mid-install (absent-scanner skips packages in transitional states) - Indeedhub fresh installs work end-to-end (was 8500+ restart loop): five missing env vars (DATABASE_PORT, QUEUE_HOST, QUEUE_PORT, S3_PRIVATE_BUCKET_NAME, AES_MASTER_SECRET) added to install code - Tailscale install fixed: --entrypoint string was being passed as a single shell-line arg; switched to custom_args array - Catalog cleaned of broken entries (dwn, endurain, ollama removed; nextcloud restored on docker.io) - Bitcoin Core update path uses correct image (was looking for nonexistent lfg2025/bitcoin:28.4) - ISO installs now allocate swap on the encrypted data partition Infra: - New resilience harness (scripts/resilience/) — black-box state-machine tester, every app × every transition. Run before each release. Sweep #3 final: PASS 107 / FAIL 12 / SKIP 14. The 12 fails are 1 cosmetic (homeassistant trusted_hosts), 8 harness/timing false-positives, and 3 non-shipstopper tracked items. Down from 23 in baseline sweep #1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 12:31:45 -04:00
}
let count = absence_tracker.entry(id.clone()).or_insert(0);
*count += 1;
if *count >= CONTAINER_ABSENCE_THRESHOLD {
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
debug!(
"Removing {} from state after {} consecutive absent scans",
id, count
);
merged.remove(&id);
absence_tracker.remove(&id);
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
transitional_since.remove(&id);
changed = true;
}
}
}
if changed || tor_changed || first_scan || update_changed {
let mut data = current_data;
data.package_data = merged;
data.server_info.tor_address = tor_addr.clone();
data.server_info.node_address = tor_addr.as_ref().map(|t| identity.node_address(t));
data.server_info.status_info.containers_scanned = true;
data.server_info.status_info.updated = update_available;
state.update_data(data).await;
chore(ci): rustfmt + clippy clean-up to unblock the Rust CI job The .github/workflows/ci.yml Rust job runs cargo fmt --check, clippy with -D warnings, and tests. All three were failing. This commit: - Applies rustfmt across the tree (the bulk of the diff — untouched since the last toolchain bump, so a wide sweep was unavoidable). - Fixes the correctness-level clippy errors: container/bitcoin_simulator.rs wildcard-in-or-pattern container/manifest.rs from_str rename to parse (reserved name) container/podman_client.rs .get(0) -> .first() container/runtime.rs manual += collapse archipelago/src/constants.rs doc-comment → module-doc api/rpc/package/install.rs stray /// comment above a non-item container/docker_packages.rs redundant field init streaming/advertisement.rs missing Metric import in tests tests/orchestration_tests.rs `vec!` in non-Vec contexts mesh/listener/dispatch.rs unused store_plain_message import api/rpc/tor/mod.rs and mesh/steganography.rs: push-after-new → vec! - Quiets wide legacy surfaces with crate-level allows in main.rs for stylistic lints (too_many_arguments, type_complexity, doc indent, enum variant prefix, wildcard-in-or, assertions-on-constants, drop_non_drop, unused_io_amount, ptr_arg) — these fired in dozens of places with no correctness payoff and have been churning every toolchain bump. - Tags intentional-dead-code helpers: wallet/ and streaming/ modules are WIP, mesh::send_chunked_payload and DM_V1_MARKER are kept for rollback compatibility, vpn::get_nostr_vpn_status is surface-area for a not-yet-landed RPC. cargo fmt --check, cargo clippy --all-targets --all-features -- -D warnings, and cargo test --all-features now all pass locally. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 17:23:46 -04:00
debug!(
"📦 State changed (packages={}, tor={}, first_scan={}, update={}), broadcasting update",
changed, tor_changed, first_scan, update_changed
);
}
Ok(())
}
/// Upgrade the reported health of running packages whose web port actually answers.
///
/// A package is considered only when its state is `Running` and its health
/// string is one of `"starting"`, `"unhealthy"` or `"1"`. The port to probe
/// comes from the `main` interface's LAN address when that URL carries a
/// port, otherwise from [`fallback_package_port`]. Packages with no
/// resolvable port are left untouched.
///
/// When [`frontend_port_http_ready`] succeeds, health is overwritten with
/// `"healthy"` and [`ensure_main_lan_address`] fills in a missing `main`
/// LAN address. Probes run one after another, in map iteration order.
async fn normalize_reachable_package_health(
    packages: &mut HashMap<String, crate::data_model::PackageDataEntry>,
) {
    for (pkg_id, entry) in packages.iter_mut() {
        let is_running = entry.state == crate::data_model::PackageState::Running;
        let health_is_suspect = matches!(
            entry.health.as_deref(),
            Some("starting" | "unhealthy" | "1")
        );
        if !(is_running && health_is_suspect) {
            continue;
        }

        // Prefer the port the package advertises; fall back to the
        // built-in table for apps that never publish a LAN address.
        let advertised_port = entry
            .installed
            .as_ref()
            .and_then(|inst| inst.interface_addresses.get("main"))
            .and_then(|addr| addr.lan_address.as_deref())
            .and_then(port_from_url);
        let port = match advertised_port.or_else(|| fallback_package_port(pkg_id)) {
            Some(found) => found,
            None => continue,
        };

        if !frontend_port_http_ready(port).await {
            continue;
        }
        debug!(app_id = %pkg_id, port, "normalizing reachable package health to healthy");
        entry.health = Some("healthy".to_string());
        ensure_main_lan_address(entry, port);
    }
}
/// Probe `127.0.0.1:port` with a plain HTTP `GET /` and report whether it looks alive.
///
/// Returns `true` only when all of the following hold:
/// - the TCP connect completes within 2 seconds,
/// - the request is written without error,
/// - a non-empty first read (up to 64 bytes) arrives within 2 seconds,
/// - that data begins with an HTTP/1.0 or HTTP/1.1 status line whose code
///   starts with `2` or `3`.
///
/// Any timeout, I/O error, empty read or other status yields `false`.
async fn frontend_port_http_ready(port: u16) -> bool {
    // Status-line prefixes accepted as "the frontend is serving".
    const READY_PREFIXES: [&str; 4] = ["HTTP/1.1 2", "HTTP/1.1 3", "HTTP/1.0 2", "HTTP/1.0 3"];

    let connect = tokio::net::TcpStream::connect(("127.0.0.1", port));
    let mut stream = match tokio::time::timeout(Duration::from_secs(2), connect).await {
        Ok(Ok(connected)) => connected,
        _ => return false,
    };

    let request = b"GET / HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n";
    if stream.write_all(request).await.is_err() {
        return false;
    }

    // Only the start of the status line matters, so a small buffer suffices.
    let mut buf = [0u8; 64];
    let received = match tokio::time::timeout(Duration::from_secs(2), stream.read(&mut buf)).await
    {
        Ok(Ok(len)) if len > 0 => len,
        _ => return false,
    };

    let head = String::from_utf8_lossy(&buf[..received]);
    READY_PREFIXES
        .iter()
        .any(|prefix| head.starts_with(*prefix))
}
/// Make sure an installed package has a LAN address on its `main` interface.
///
/// Does nothing when `pkg.installed` is `None`. Otherwise the `main`
/// interface entry is created if absent (empty Tor address, no LAN address),
/// and a missing LAN address is set to `http://localhost:{port}`. An
/// existing LAN address is never overwritten.
fn ensure_main_lan_address(pkg: &mut crate::data_model::PackageDataEntry, port: u16) {
    if let Some(installed) = pkg.installed.as_mut() {
        let main_iface = installed
            .interface_addresses
            .entry("main".to_string())
            .or_insert_with(|| crate::data_model::InterfaceAddress {
                tor_address: String::new(),
                lan_address: None,
            });
        main_iface
            .lan_address
            .get_or_insert_with(|| format!("http://localhost:{port}"));
    }
}
/// Look up the hard-coded frontend port for an app id.
///
/// Returns `None` for any id not in the table. Matching is exact and
/// case-sensitive.
fn fallback_package_port(app_id: &str) -> Option<u16> {
    // (app id, frontend port); `fedimint` and `fedimintd` share one port.
    const KNOWN_PORTS: [(&str, u16); 6] = [
        ("fedimint", 8175),
        ("fedimintd", 8175),
        ("filebrowser", 8083),
        ("indeedhub", 7778),
        ("nginx-proxy-manager", 8081),
        ("nostr-rs-relay", 18081),
    ];

    KNOWN_PORTS
        .iter()
        .find(|(name, _)| *name == app_id)
        .map(|&(_, port)| port)
}
/// Extract the explicit TCP port from a URL-like string.
///
/// Accepts both full URLs (`http://localhost:8083/path`) and bare
/// `host:port` strings. Returns `None` when no port is present or when it
/// does not parse as a `u16`.
///
/// The authority is taken to end at the first `/`, `?` or `#`, so a query
/// or fragment that directly follows the port (`http://host:8080?x=1`) does
/// not hide it. A `://` is only treated as the scheme separator when
/// nothing before it contains one of those delimiters; this keeps a URL
/// embedded in the path or query of a scheme-less input from being
/// mistaken for the scheme.
fn port_from_url(url: &str) -> Option<u16> {
    /// Characters that terminate the authority component of a URL.
    fn ends_authority(c: char) -> bool {
        matches!(c, '/' | '?' | '#')
    }

    let after_scheme = match url.split_once("://") {
        Some((scheme, rest)) if !scheme.contains(ends_authority) => rest,
        _ => url,
    };
    let authority = after_scheme
        .split(ends_authority)
        .next()
        .unwrap_or(after_scheme);

    // Split on the last ':' so bracketed IPv6 hosts and `user:pass@host:port`
    // still yield the trailing port.
    let (_, port) = authority.rsplit_once(':')?;
    port.parse().ok()
}
/// Register Archipelago DWN protocols on startup.
///
/// Opens the DWN store under `data_dir`, lists the protocols it already
/// holds, and registers every built-in Archipelago protocol URI that is not
/// present yet. Each new definition is created with empty `types` and
/// `structure` maps and the current UTC time as its registration date. An
/// info line is logged when at least one protocol was added.
///
/// # Errors
/// Propagates failures from opening the store, listing protocols, or
/// registering one. A failed registration aborts the remaining ones.
async fn register_dwn_protocols(data_dir: &std::path::Path) -> Result<()> {
    use crate::network::dwn_store::{DwnStore, ProtocolDefinition};

    // (protocol URI, `published` flag) for each protocol this node defines.
    const BUILTIN_PROTOCOLS: [(&str, bool); 4] = [
        ("https://archipelago.dev/protocols/node-identity/v1", true),
        ("https://archipelago.dev/protocols/file-catalog/v1", true),
        ("https://archipelago.dev/protocols/federation/v1", false),
        ("https://archipelago.dev/protocols/app-deploy/v1", false),
    ];

    let store = DwnStore::new(data_dir).await?;
    let stored_defs = store.list_protocols().await?;
    let known_uris: std::collections::HashSet<String> = stored_defs
        .iter()
        .map(|def| def.protocol.clone())
        .collect();

    let mut registered = 0;
    for &(uri, published) in BUILTIN_PROTOCOLS.iter() {
        if known_uris.contains(uri) {
            continue;
        }
        let definition = ProtocolDefinition {
            protocol: uri.to_string(),
            published,
            types: std::collections::HashMap::new(),
            structure: std::collections::HashMap::new(),
            date_registered: chrono::Utc::now().to_rfc3339(),
        };
        store.register_protocol(&definition).await?;
        registered += 1;
    }

    if registered > 0 {
        info!("📋 Registered {registered} DWN protocols");
    }
    Ok(())
}
/// Periodically check peer reachability and broadcast status changes.
async fn check_peer_health(state: &StateManager, data_dir: &std::path::Path) -> Result<()> {
let known_peers = peers::load_peers(data_dir).await.unwrap_or_default();
if known_peers.is_empty() {
return Ok(());
}
let mut new_health = std::collections::HashMap::new();
for peer in &known_peers {
feat(messaging,dwn,mesh): route peer messaging + DWN sync + blob fetch via FIPS first Migrates the remaining Tor-direct peer call sites to PeerRequest so FIPS is the default when the peer is federated and running the daemon: - node_message::send_to_peer / check_peer_reachable: gain a fips_npub parameter. Error messages updated to reference both transports. - Callers (api/rpc/network.rs, api/rpc/peers.rs, server health loop): look up fips_npub from federation storage by onion and pass it. - mesh::send_typed_wire_via_federation: the spawned background POST for the /archipelago/mesh-typed endpoint now uses PeerRequest with federation-resolved fips_npub. Signature domain unchanged. - api/rpc/mesh/typed_messages.rs fetch_blob_from_peer: blob URL rebuilt as (base_url, path_with_query) so PeerRequest can append the query string after swapping the host. Cap/exp/peer parameters are still signed over the content ref itself, so transport choice is invisible to the signature. - network/dwn_sync.rs sync_with_peers: per-peer fips_npub lookup before sync_single_peer; health/pull/push each dial through PeerRequest, so any DWN peer known to federation gets FIPS. Left Tor-only on purpose: - api/rpc/identity/handlers.rs handle_identity_resolve_peer_onion — resolving TO a DID, no anchor yet. - content.browse / preview calls to non-federated peers fall through to Tor naturally inside PeerRequest (no fips_npub → skip FIPS branch). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 01:36:04 -04:00
let fips_npub = crate::federation::fips_npub_for_onion(data_dir, &peer.onion).await;
let reachable = node_message::check_peer_reachable(&peer.onion, fips_npub.as_deref())
.await
.unwrap_or(false);
new_health.insert(peer.onion.clone(), reachable);
}
let (current_data, _) = state.get_snapshot().await;
if current_data.peer_health != new_health {
let mut data = current_data;
data.peer_health = new_health;
state.update_data(data).await;
debug!("🔗 Peer health updated, broadcasting changes");
}
Ok(())
}
fix(state): preserve transitional state across container scans The 30s package scan loop used to blindly overwrite every package entry from podman inspect. While a user-initiated Stop / Start / Restart was in flight, the RPC spawn task would flip the state to Stopping / Starting / Restarting, the next scan would see podman still reporting "running" (for the duration of the graceful stop, up to 600s for bitcoin-core), and clobber the transitional state back to Running. The dashboard would then flip Running -> Stopping -> Running -> Stopped, making it look like the stop had silently failed until it eventually completed. The merge loop now treats transitional variants (Stopping, Starting, Restarting, Installing, Updating, Removing, and the three backup variants) as owned by the RPC spawn task. For those variants, merge_preserving_transitional keeps the existing state while still taking live observability fields (health, exit_code, installed, lan_address, manifest, static_files, available_update) from the fresh scan so the UI continues to see live health readings. Adds an escape hatch via a per-scan transitional_since side table: if a package has been in a transitional state for more than 1200s (2x the longest graceful stop at 600s on bitcoin-core), the scan loop assumes the spawn task died without cleanup and overrides with podman's live state. Prevents a crashed background task from wedging a package in Stopping forever. Three unit tests cover the merge rule, the observability passthrough, and the transitional-variant classifier.
2026-04-23 05:15:13 -04:00
#[cfg(test)]
mod merge_tests {
    use super::*;
    use crate::data_model::{Description, Manifest, PackageDataEntry, PackageState, StaticFiles};

    /// Fixture: an LND manifest with every free-text field blank and every
    /// optional field unset.
    fn make_manifest() -> Manifest {
        let description = Description {
            short: String::new(),
            long: String::new(),
        };
        Manifest {
            id: String::from("lnd"),
            title: String::from("LND"),
            version: String::from("0.18.4"),
            description,
            release_notes: String::new(),
            license: String::new(),
            wrapper_repo: String::new(),
            upstream_repo: String::new(),
            support_site: String::new(),
            marketing_site: String::new(),
            donation_url: None,
            author: None,
            website: None,
            interfaces: None,
            tier: None,
        }
    }

    /// Fixture: static files with all fields blank.
    fn make_static() -> StaticFiles {
        StaticFiles {
            license: String::new(),
            instructions: String::new(),
            icon: String::new(),
        }
    }

    /// Fixture: a package entry in `state` with the given health string and
    /// every other optional field unset.
    fn make_entry(state: PackageState, health: Option<&str>) -> PackageDataEntry {
        PackageDataEntry {
            state,
            health: health.map(str::to_owned),
            exit_code: None,
            static_files: make_static(),
            manifest: make_manifest(),
            installed: None,
            install_progress: None,
            uninstall_stage: None,
            available_update: None,
        }
    }

    #[test]
    fn peer_path_filter_allows_content_catalog_and_items() {
        // Regression guard: the content catalog lives at exactly "/content"
        // (no trailing slash) and must be served on the peer (FIPS)
        // listener, otherwise `content.browse-peer` 404s over the mesh.
        // Individual items are fetched from "/content/<id>".
        assert!(is_peer_allowed_path("/content"), "catalog must be allowed");
        assert!(
            is_peer_allowed_path("/content/abc123"),
            "items must be allowed"
        );
        assert!(is_peer_allowed_path("/rpc/v1"));
        assert!(is_peer_allowed_path("/health"));

        // Anything off the allow-list is rejected, including paths that
        // merely share a prefix with an allowed one.
        assert!(!is_peer_allowed_path("/contention"), "must not prefix-leak");
        assert!(!is_peer_allowed_path("/"));
        assert!(!is_peer_allowed_path("/rpc/v2"));
    }

    #[test]
    fn preserves_transitional_state_on_merge() {
        // Stored entry: the user asked for a stop, so the state is Stopping.
        // Fresh scan: podman has not finished stopping and still says Running.
        // The merge must keep Stopping: the live view may not clobber a
        // transitional state owned by the RPC spawn task.
        let stored = make_entry(PackageState::Stopping, Some("healthy"));
        let scanned = make_entry(PackageState::Running, Some("starting"));

        let merged = merge_preserving_transitional(&stored, &scanned, true);

        assert_eq!(merged.state, PackageState::Stopping);
    }

    #[test]
    fn non_user_stopping_recovers_when_container_is_running() {
        // With the flag set to `false`, a Stopping entry yields to the
        // fresh Running observation.
        let stored = make_entry(PackageState::Stopping, Some("unknown"));
        let scanned = make_entry(PackageState::Running, Some("healthy"));

        let merged = merge_preserving_transitional(&stored, &scanned, false);

        assert_eq!(merged.state, PackageState::Running);
        assert_eq!(merged.health.as_deref(), Some("healthy"));
    }

    #[test]
    fn merges_fresh_observability_fields() {
        // Observability fields other than state (health, exit_code,
        // installed) must come from the fresh scan even while the state is
        // preserved, so the UI keeps showing live health/exit code during
        // a transition.
        let mut stored = make_entry(PackageState::Stopping, Some("healthy"));
        stored.exit_code = None;
        let mut scanned = make_entry(PackageState::Running, Some("unhealthy"));
        scanned.exit_code = Some(0);

        let merged = merge_preserving_transitional(&stored, &scanned, true);

        assert_eq!(merged.state, PackageState::Stopping);
        assert_eq!(merged.health.as_deref(), Some("unhealthy"));
        assert_eq!(merged.exit_code, Some(0));
    }

    #[test]
    fn stale_removing_recovers_when_container_is_running() {
        // With the flag set to `false`, a Removing entry yields to the
        // fresh Running observation.
        let stored = make_entry(PackageState::Removing, Some("unknown"));
        let scanned = make_entry(PackageState::Running, Some("healthy"));

        let merged = merge_preserving_transitional(&stored, &scanned, false);

        assert_eq!(merged.state, PackageState::Running);
        assert_eq!(merged.health.as_deref(), Some("healthy"));
    }

    #[test]
    fn is_transitional_covers_all_variants() {
        let transitional = [
            PackageState::Installing,
            PackageState::Stopping,
            PackageState::Starting,
            PackageState::Restarting,
            PackageState::Updating,
            PackageState::Removing,
            PackageState::CreatingBackup,
            PackageState::RestoringBackup,
            PackageState::BackingUp,
        ];
        let settled = [
            PackageState::Installed,
            PackageState::Stopped,
            PackageState::Exited,
            PackageState::Running,
        ];

        for state in &transitional {
            assert!(is_transitional(state), "{:?} should be transitional", state);
        }
        for state in &settled {
            assert!(
                !is_transitional(state),
                "{:?} should NOT be transitional",
                state
            );
        }
    }

    #[test]
    fn installing_uses_longer_stale_timeout_than_other_transitions() {
        let installing = transitional_stuck_timeout(&PackageState::Installing);
        let stopping = transitional_stuck_timeout(&PackageState::Stopping);

        assert!(installing > TRANSITIONAL_STUCK_TIMEOUT);
        assert_eq!(stopping, TRANSITIONAL_STUCK_TIMEOUT);
    }

    #[test]
    fn absent_stopping_transitions_to_stopped() {
        let replacement = absent_transitional_replacement(&PackageState::Stopping);
        assert_eq!(replacement, Some(PackageState::Stopped));
    }

    #[test]
    fn absent_installing_still_waits_for_owner() {
        let replacement = absent_transitional_replacement(&PackageState::Installing);
        assert_eq!(replacement, None);
    }
}