Phase 3 wiring (task #12): - NostrSeedDiscovery: async ProviderDiscovery that queries relays for signed seed adverts and parses endpoint ids (swarm/iroh_provider.rs, seed_advert.rs). - seed_and_advertise publish path; dep-free fetch/publish helpers reuse the node's Nostr identity (build_nostr_client/load_or_create_nostr_keys made pub(crate)). - swarm::init builds the IrohProvider once into a OnceLock runtime; providers() returns it; announce_held_blob() is called from update.rs after a release component passes both hash gates. - config swarm_enabled (ARCHIPELAGO_SWARM_ENABLED, default off); server.rs init. Paid swarm serving (Phase 4 step F): - swarm/paid.rs gates the iroh-blobs provider through streaming::gate, intercepting connect + GET (peer push hard-disabled). Free by default (content-download service disabled); denies unpaid peers when enabled; fails open on internal error so a payment fault never blocks distribution. Wired into IrohProvider::new. All iroh code behind the iroh-swarm feature; the default build is inert. Default build clean; --features iroh-swarm: 11/11 swarm tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1516 lines
63 KiB
Rust
1516 lines
63 KiB
Rust
use crate::api::ApiHandler;
|
||
use crate::config::{Config, ContainerRuntime};
|
||
use crate::container::{
|
||
docker_packages, ContainerOrchestrator, DevContainerOrchestrator, DockerPackageScanner,
|
||
};
|
||
use crate::identity::{self, NodeIdentity};
|
||
use crate::monitoring::MetricsStore;
|
||
use crate::node_message;
|
||
use crate::nostr_discovery;
|
||
use crate::nostr_handshake;
|
||
use crate::peers;
|
||
use crate::state::StateManager;
|
||
use anyhow::Result;
|
||
use hyper::server::conn::Http;
|
||
use hyper::service::service_fn;
|
||
use std::collections::HashMap;
|
||
use std::net::SocketAddr;
|
||
use std::sync::atomic::{AtomicBool, Ordering};
|
||
use std::sync::Arc;
|
||
use std::time::{Duration, Instant};
|
||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||
use tokio::net::TcpListener;
|
||
use tracing::{debug, error, info, warn};
|
||
|
||
pub struct Server {
|
||
_config: Config,
|
||
_identity: Arc<NodeIdentity>,
|
||
api_handler: Arc<ApiHandler>,
|
||
_state_manager: Arc<StateManager>,
|
||
}
|
||
|
||
struct ContainerScanGuard<'a> {
|
||
scanning: &'a AtomicBool,
|
||
}
|
||
|
||
impl<'a> ContainerScanGuard<'a> {
|
||
fn try_acquire(scanning: &'a AtomicBool) -> Option<Self> {
|
||
scanning
|
||
.compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed)
|
||
.ok()
|
||
.map(|_| Self { scanning })
|
||
}
|
||
}
|
||
|
||
impl Drop for ContainerScanGuard<'_> {
|
||
fn drop(&mut self) {
|
||
self.scanning.store(false, Ordering::Release);
|
||
}
|
||
}
|
||
|
||
impl Server {
|
||
pub async fn new(
|
||
config: Config,
|
||
orchestrator: Option<Arc<dyn ContainerOrchestrator>>,
|
||
dev_orchestrator: Option<Arc<DevContainerOrchestrator>>,
|
||
) -> Result<Self> {
|
||
let state_manager = Arc::new(StateManager::new());
|
||
|
||
// Load node identity and set stable server_info.
|
||
// Detect seed-backed vs legacy vs fresh install.
|
||
let identity_dir = config.data_dir.join("identity");
|
||
let has_seed = crate::seed::seed_exists(&config.data_dir);
|
||
let has_node_key = NodeIdentity::key_exists(&identity_dir);
|
||
|
||
let identity = if has_node_key {
|
||
// Existing keys on disk (seed-derived or legacy random) — load them.
|
||
NodeIdentity::load_or_create(&identity_dir).await?
|
||
} else {
|
||
// Fresh install — create a temporary identity.
|
||
// Onboarding will overwrite this with seed-derived keys.
|
||
NodeIdentity::load_or_create(&identity_dir).await?
|
||
};
|
||
|
||
let (mut data, _) = state_manager.get_snapshot().await;
|
||
data.server_info.id = identity.node_id();
|
||
data.server_info.pubkey = identity.pubkey_hex();
|
||
data.server_info.seed_backed = has_seed;
|
||
// Load persisted server name
|
||
let name_file = config.data_dir.join("server-name");
|
||
if let Ok(name) = tokio::fs::read_to_string(&name_file).await {
|
||
let name = name.trim().to_string();
|
||
if !name.is_empty() {
|
||
data.server_info.name = Some(name);
|
||
}
|
||
}
|
||
data.server_info.tor_address = docker_packages::read_tor_address("archipelago").await;
|
||
if let Some(ref tor) = data.server_info.tor_address {
|
||
data.server_info.node_address = Some(identity.node_address(tor));
|
||
}
|
||
state_manager.update_data(data.clone()).await;
|
||
|
||
// Retry Tor address in background — Tor may not be ready at startup
|
||
if data.server_info.tor_address.is_none() {
|
||
let sm = state_manager.clone();
|
||
let pubkey = identity.pubkey_hex();
|
||
tokio::spawn(async move {
|
||
for delay in [5, 10, 20, 30, 60] {
|
||
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
|
||
if let Some(tor) = docker_packages::read_tor_address("archipelago").await {
|
||
let (mut d, _) = sm.get_snapshot().await;
|
||
let addr =
|
||
format!("archipelago://{}#{}", tor.trim_end_matches('/'), pubkey);
|
||
d.server_info.tor_address = Some(tor.clone());
|
||
d.server_info.node_address = Some(addr);
|
||
sm.update_data(d).await;
|
||
tracing::info!(
|
||
"Tor address discovered after startup: {}",
|
||
&tor[..20.min(tor.len())]
|
||
);
|
||
break;
|
||
}
|
||
}
|
||
});
|
||
}
|
||
|
||
// Load persisted messages (Archipelago channel)
|
||
node_message::init(&config.data_dir).await;
|
||
|
||
// Auto-create the Node identity on fresh boot, mirroring the node's
|
||
// own signing key (seed-derived when onboarded, random otherwise).
|
||
// This keeps the DID shown on the Identities page, the DID Status
|
||
// card, and the DID used for peer-to-peer connects all aligned on
|
||
// one value — the seed-derived node DID. Idempotent: if the entry
|
||
// already exists from a prior boot, create_from_signing_key returns
|
||
// the existing record unchanged.
|
||
{
|
||
let im = crate::identity_manager::IdentityManager::new(&config.data_dir).await;
|
||
if let Ok(mgr) = im {
|
||
if let Ok((list, _)) = mgr.list().await {
|
||
if list.is_empty() {
|
||
let signing_key = ed25519_dalek::SigningKey::from_bytes(
|
||
&identity.signing_key().to_bytes(),
|
||
);
|
||
match mgr
|
||
.create_from_signing_key(
|
||
"Node".to_string(),
|
||
crate::identity_manager::IdentityPurpose::Personal,
|
||
signing_key,
|
||
)
|
||
.await
|
||
{
|
||
Ok(record) => {
|
||
let _ = mgr.create_nostr_key(&record.id).await;
|
||
tracing::info!(did = %record.did, "Auto-created Node identity mirroring node key");
|
||
}
|
||
Err(e) => tracing::debug!("Auto-identity creation (non-fatal): {}", e),
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// DHT swarm-assist (Phase 3): build the iroh provider once at startup so
|
||
// release downloads can fetch from peers (origin always wins) and seed
|
||
// what they hold. Inert unless built with `iroh-swarm` AND swarm_enabled.
|
||
if let Err(e) = crate::swarm::init(
|
||
&config.data_dir,
|
||
&config.nostr_relays,
|
||
config.nostr_tor_proxy.as_deref(),
|
||
config.swarm_enabled,
|
||
)
|
||
.await
|
||
{
|
||
tracing::warn!("Swarm init (non-fatal, falling back to origin-only): {}", e);
|
||
}
|
||
|
||
// Revoke any previously published Nostr data (runs before publish so revocation is not overwritten)
|
||
let identity_dir = config.data_dir.join("identity");
|
||
let tor_proxy_revoke = config.nostr_tor_proxy.clone();
|
||
if let Err(e) =
|
||
nostr_discovery::revoke_if_needed(&identity_dir, tor_proxy_revoke.as_deref()).await
|
||
{
|
||
tracing::debug!("Nostr revoke (non-fatal): {}", e);
|
||
}
|
||
|
||
// Publish presence-only to Nostr (DID + Nostr pubkey, NO onion address).
|
||
// Onion addresses are exchanged privately via NIP-44 encrypted DMs.
|
||
if config.nostr_discovery_enabled && !config.nostr_relays.is_empty() {
|
||
let identity_dir = config.data_dir.join("identity");
|
||
let did =
|
||
identity::did_key_from_pubkey_hex(&data.server_info.pubkey).unwrap_or_default();
|
||
let version = data.server_info.version.clone();
|
||
let relays = config.nostr_relays.clone();
|
||
let tor_proxy = config.nostr_tor_proxy.clone();
|
||
tokio::spawn(async move {
|
||
if let Err(e) = nostr_handshake::publish_presence(
|
||
&identity_dir,
|
||
&did,
|
||
&version,
|
||
&relays,
|
||
tor_proxy.as_deref(),
|
||
)
|
||
.await
|
||
{
|
||
tracing::debug!("Nostr presence publish (non-fatal): {}", e);
|
||
}
|
||
});
|
||
}
|
||
info!(
|
||
"🔑 Node identity: {} (pubkey: {}...)",
|
||
identity.node_id(),
|
||
&identity.pubkey_hex()[..16.min(identity.pubkey_hex().len())]
|
||
);
|
||
|
||
let identity = Arc::new(identity);
|
||
|
||
// Create metrics store and spawn background collector
|
||
let metrics_store = Arc::new(MetricsStore::with_data_dir(config.data_dir.clone()).await);
|
||
let metrics_for_telemetry = metrics_store.clone();
|
||
crate::monitoring::spawn_metrics_collector(
|
||
metrics_store.clone(),
|
||
Some(state_manager.clone()),
|
||
Some(config.data_dir.clone()),
|
||
);
|
||
|
||
let api_handler = Arc::new(
|
||
ApiHandler::new(
|
||
config.clone(),
|
||
state_manager.clone(),
|
||
metrics_store,
|
||
orchestrator,
|
||
dev_orchestrator,
|
||
)
|
||
.await?,
|
||
);
|
||
|
||
// Initialize mesh networking service (if config has enabled: true)
|
||
{
|
||
let data_dir = config.data_dir.clone();
|
||
let did =
|
||
identity::did_key_from_pubkey_hex(&data.server_info.pubkey).unwrap_or_default();
|
||
let pubkey_hex = identity.pubkey_hex();
|
||
let signing_key = identity.signing_key();
|
||
match crate::mesh::MeshService::new(&data_dir, signing_key, &did, &pubkey_hex).await {
|
||
Ok(mut mesh_service) => {
|
||
// Pass the human-readable server name for mesh adverts
|
||
mesh_service.set_server_name(data.server_info.name.clone());
|
||
let mut mesh_config = crate::mesh::load_config(&data_dir)
|
||
.await
|
||
.unwrap_or_default();
|
||
|
||
// Auto-enable mesh if a radio is detected and no config exists yet
|
||
if !mesh_config.enabled {
|
||
let devices = crate::mesh::detect_devices().await;
|
||
if !devices.is_empty() {
|
||
info!("📡 Auto-detected mesh radio: {:?} — enabling mesh", devices);
|
||
mesh_config.enabled = true;
|
||
mesh_config.device_path = Some(devices[0].clone());
|
||
let _ = crate::mesh::save_config(&data_dir, &mesh_config).await;
|
||
}
|
||
}
|
||
|
||
if mesh_config.enabled {
|
||
if let Err(e) = mesh_service.start() {
|
||
warn!("Mesh service start failed (non-fatal): {}", e);
|
||
} else {
|
||
info!("📡 Mesh networking started");
|
||
}
|
||
}
|
||
api_handler
|
||
.rpc_handler()
|
||
.set_mesh_service(mesh_service)
|
||
.await;
|
||
info!("📡 Mesh service initialized");
|
||
}
|
||
Err(e) => {
|
||
warn!("Mesh service init failed (non-fatal): {}", e);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Initialize transport router (unified routing: mesh > lan > tor)
|
||
{
|
||
let data_dir = config.data_dir.clone();
|
||
let did =
|
||
identity::did_key_from_pubkey_hex(&data.server_info.pubkey).unwrap_or_default();
|
||
let pubkey_hex = identity.pubkey_hex();
|
||
let mesh_config = crate::mesh::load_config(&data_dir)
|
||
.await
|
||
.unwrap_or_default();
|
||
let mesh_only = mesh_config.mesh_only_mode.unwrap_or(false);
|
||
|
||
match crate::transport::PeerRegistry::load(&data_dir).await {
|
||
Ok(registry) => {
|
||
let registry = std::sync::Arc::new(registry);
|
||
let mut transports: Vec<Box<dyn crate::transport::NodeTransport>> = Vec::new();
|
||
|
||
// Tor transport (always register — availability checked dynamically)
|
||
transports.push(Box::new(crate::transport::tor::TorTransport::new(
|
||
&pubkey_hex,
|
||
)));
|
||
|
||
// Mesh transport (wraps the mesh service)
|
||
transports.push(Box::new(
|
||
crate::transport::mesh_transport::MeshTransport::new(
|
||
api_handler.rpc_handler().mesh_service_arc(),
|
||
),
|
||
));
|
||
|
||
// LAN transport (mDNS discovery)
|
||
let mut lan = crate::transport::lan::LanTransport::new(&did, &pubkey_hex, 5678);
|
||
match lan.start(registry.clone()) {
|
||
Ok(()) => info!("📡 LAN transport (mDNS) started"),
|
||
Err(e) => debug!("LAN transport init (non-fatal): {}", e),
|
||
}
|
||
transports.push(Box::new(lan));
|
||
|
||
let router = std::sync::Arc::new(crate::transport::TransportRouter::new(
|
||
transports, registry, mesh_only,
|
||
));
|
||
api_handler.rpc_handler().set_transport_router(router).await;
|
||
info!("📡 Transport router initialized (mesh_only={})", mesh_only);
|
||
}
|
||
Err(e) => {
|
||
warn!("Transport router init failed (non-fatal): {}", e);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Register Archipelago DWN protocols (background, non-blocking)
|
||
{
|
||
let data_dir = config.data_dir.clone();
|
||
tokio::spawn(async move {
|
||
if let Err(e) = register_dwn_protocols(&data_dir).await {
|
||
debug!("DWN protocol registration (non-fatal): {}", e);
|
||
}
|
||
});
|
||
}
|
||
|
||
// Periodic Tor address refresh (runs regardless of dev_mode)
|
||
// Picks up hostname when Tor creates it after startup/rotation (30-60s delay)
|
||
{
|
||
let state = state_manager.clone();
|
||
let identity_clone = identity.clone();
|
||
tokio::spawn(async move {
|
||
let mut interval = tokio::time::interval(Duration::from_secs(30));
|
||
loop {
|
||
interval.tick().await;
|
||
if let Err(e) = refresh_tor_address(&state, identity_clone.as_ref()).await {
|
||
debug!("Tor address refresh (non-fatal): {}", e);
|
||
}
|
||
}
|
||
});
|
||
}
|
||
|
||
// Initialize container scanner — discovers installed apps from Podman/Docker
|
||
{
|
||
let scanner = create_docker_scanner(&config).await?;
|
||
let state = state_manager.clone();
|
||
let identity_clone = identity.clone();
|
||
let data_dir = config.data_dir.clone();
|
||
let scan_kick = api_handler.rpc_handler().scan_kick();
|
||
let scan_tick = api_handler.rpc_handler().scan_tick();
|
||
|
||
// Initial scan (delayed to let crash recovery finish first)
|
||
tokio::spawn(async move {
|
||
// Brief delay for containers to stabilize after boot
|
||
tokio::time::sleep(Duration::from_secs(3)).await;
|
||
info!("🐳 Scanning containers...");
|
||
// Tracks how many consecutive scans each container has been absent from.
|
||
// Prevents UI flapping when podman intermittently returns incomplete results.
|
||
let mut absence_tracker: HashMap<String, u32> = HashMap::new();
|
||
// Tracks when each container first entered a transitional state
|
||
// (Stopping / Starting / Restarting / ...). Used by the merge
|
||
// loop below to ignore podman's live state during a pending
|
||
// lifecycle op, and to break out if the spawned task dies
|
||
// without ever writing a final state.
|
||
let mut transitional_since: HashMap<String, Instant> = HashMap::new();
|
||
let mut scan_backoff_until: Option<Instant> = None;
|
||
if let Err(e) = scan_and_update_packages(
|
||
&scanner,
|
||
&state,
|
||
identity_clone.as_ref(),
|
||
&data_dir,
|
||
&mut absence_tracker,
|
||
&mut transitional_since,
|
||
)
|
||
.await
|
||
{
|
||
error!("Failed to scan containers: {}", e);
|
||
if is_podman_scan_timeout(&e) {
|
||
scan_backoff_until = Some(Instant::now() + Duration::from_secs(30));
|
||
warn!("Podman container scan timed out; backing off scans for 30s");
|
||
}
|
||
}
|
||
// Bump the scan-completion counter so any caller waiting on a
|
||
// kicked scan (install/update success path) can proceed.
|
||
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
|
||
|
||
// Periodic scan every 60 seconds (only broadcasts if state changed).
|
||
// Also wakes immediately when `scan_kick` fires — install/update
|
||
// success paths poke it so the fresh manifest (with populated
|
||
// interfaces) lands before they flip state to Running.
|
||
// Uses an in-flight guard to skip scans when a previous one is still running
|
||
let mut interval = tokio::time::interval(Duration::from_secs(60));
|
||
// Skip missed ticks instead of catching up — prevents burst of scans
|
||
// after a slow podman response (which causes DB lock storms)
|
||
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||
let scanning = std::sync::Arc::new(AtomicBool::new(false));
|
||
loop {
|
||
tokio::select! {
|
||
_ = interval.tick() => {}
|
||
_ = scan_kick.notified() => {
|
||
debug!("Scan kicked by install/update success — running immediately");
|
||
}
|
||
}
|
||
if let Some(until) = scan_backoff_until {
|
||
if Instant::now() < until {
|
||
debug!("Skipping container scan — Podman scan backoff active");
|
||
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
|
||
continue;
|
||
}
|
||
}
|
||
let Some(_scan_guard) = ContainerScanGuard::try_acquire(&scanning) else {
|
||
debug!("Skipping container scan — previous scan still in progress");
|
||
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
|
||
continue;
|
||
};
|
||
let scan_result = scan_and_update_packages(
|
||
&scanner,
|
||
&state,
|
||
identity_clone.as_ref(),
|
||
&data_dir,
|
||
&mut absence_tracker,
|
||
&mut transitional_since,
|
||
)
|
||
.await;
|
||
if let Err(e) = scan_result {
|
||
error!("Failed to update containers: {}", e);
|
||
if is_podman_scan_timeout(&e) {
|
||
scan_backoff_until = Some(Instant::now() + Duration::from_secs(30));
|
||
warn!("Podman container scan timed out; backing off scans for 30s");
|
||
}
|
||
} else {
|
||
scan_backoff_until = None;
|
||
}
|
||
scan_tick.send_modify(|n| *n = n.wrapping_add(1));
|
||
}
|
||
});
|
||
}
|
||
|
||
// Peer health monitoring — check every 5 minutes
|
||
{
|
||
let state = state_manager.clone();
|
||
let data_dir = config.data_dir.clone();
|
||
tokio::spawn(async move {
|
||
let mut interval = tokio::time::interval(Duration::from_secs(300));
|
||
loop {
|
||
interval.tick().await;
|
||
if let Err(e) = check_peer_health(&state, &data_dir).await {
|
||
debug!("Peer health check (non-fatal): {}", e);
|
||
}
|
||
}
|
||
});
|
||
}
|
||
|
||
// FIPS seed-anchor apply loop — every 5 minutes we re-push the
|
||
// configured seed anchors into the running fips daemon via
|
||
// `fipsctl connect`. This keeps the mesh bootstrap resilient:
|
||
// operators add cluster-local anchors in the UI, and a daemon
|
||
// restart or a flaky public anchor can't strand the node.
|
||
// First run is delayed 30s so fips has time to come up after
|
||
// onboarding before we start dialing.
|
||
{
|
||
let data_dir = config.data_dir.clone();
|
||
tokio::spawn(async move {
|
||
tokio::time::sleep(Duration::from_secs(30)).await;
|
||
let mut interval = tokio::time::interval(Duration::from_secs(300));
|
||
loop {
|
||
interval.tick().await;
|
||
match crate::fips::anchors::load(&data_dir).await {
|
||
Ok(list) if !list.is_empty() => {
|
||
let _ = crate::fips::anchors::apply(&list).await;
|
||
}
|
||
Ok(_) => { /* no seed anchors configured yet */ }
|
||
Err(e) => {
|
||
tracing::debug!("Seed-anchor apply: load failed (non-fatal): {}", e)
|
||
}
|
||
}
|
||
}
|
||
});
|
||
}
|
||
|
||
// did:dht auto-refresh — re-publish DHT records every 2 hours
|
||
if config.nostr_discovery_enabled {
|
||
let data_dir = config.data_dir.clone();
|
||
tokio::spawn(async move {
|
||
let mut interval = tokio::time::interval(Duration::from_secs(7200));
|
||
loop {
|
||
interval.tick().await;
|
||
let identity_dir = data_dir.join("identity");
|
||
let node_key_path = identity_dir.join("node_key");
|
||
if !node_key_path.exists() {
|
||
continue;
|
||
}
|
||
match tokio::fs::read(&node_key_path).await {
|
||
Ok(key_bytes) if key_bytes.len() == 32 => {
|
||
let mut seed = [0u8; 32];
|
||
seed.copy_from_slice(&key_bytes);
|
||
let signing_key = ed25519_dalek::SigningKey::from_bytes(&seed);
|
||
match crate::network::did_dht::create_and_publish(&signing_key, &[])
|
||
.await
|
||
{
|
||
Ok(did) => tracing::info!(did = %did, "did:dht record refreshed"),
|
||
Err(e) => tracing::debug!("did:dht refresh (non-fatal): {}", e),
|
||
}
|
||
}
|
||
_ => {
|
||
tracing::debug!("did:dht refresh skipped: no valid node key");
|
||
}
|
||
}
|
||
}
|
||
});
|
||
}
|
||
|
||
// Periodic federation state sync — every 30 min we call
|
||
// federation::sync_with_peer on each Trusted peer. Without this
|
||
// users had to manually click Sync for `fips_npub`/transport
|
||
// badge/state updates to propagate; now it happens in the
|
||
// background. Staggers peers with a 5s delay so we don't thunder
|
||
// the Tor SOCKS proxy. Sync itself already prefers FIPS.
|
||
{
|
||
let data_dir = config.data_dir.clone();
|
||
let state = state_manager.clone();
|
||
tokio::spawn(async move {
|
||
// First run 60s after boot to let onboarding settle.
|
||
tokio::time::sleep(Duration::from_secs(60)).await;
|
||
let mut interval = tokio::time::interval(Duration::from_secs(1800));
|
||
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
|
||
loop {
|
||
interval.tick().await;
|
||
let Ok(nodes) = crate::federation::load_nodes(&data_dir).await else {
|
||
continue;
|
||
};
|
||
if nodes.is_empty() {
|
||
continue;
|
||
}
|
||
let (data, _) = state.get_snapshot().await;
|
||
let Ok(local_did) =
|
||
crate::identity::did_key_from_pubkey_hex(&data.server_info.pubkey)
|
||
else {
|
||
continue;
|
||
};
|
||
let identity_dir = data_dir.join("identity");
|
||
let Ok(node_identity) =
|
||
crate::identity::NodeIdentity::load_or_create(&identity_dir).await
|
||
else {
|
||
continue;
|
||
};
|
||
|
||
for node in &nodes {
|
||
if node.trust_level == crate::federation::TrustLevel::Untrusted {
|
||
continue;
|
||
}
|
||
match crate::federation::sync_with_peer(
|
||
&data_dir,
|
||
node,
|
||
&local_did,
|
||
|bytes| node_identity.sign(bytes),
|
||
)
|
||
.await
|
||
{
|
||
Ok(_) => debug!(
|
||
"Periodic federation sync ok: {}",
|
||
node.did.chars().take(20).collect::<String>()
|
||
),
|
||
Err(e) => debug!(
|
||
"Periodic federation sync with {}: {}",
|
||
node.did.chars().take(20).collect::<String>(),
|
||
e
|
||
),
|
||
}
|
||
tokio::time::sleep(Duration::from_secs(5)).await;
|
||
}
|
||
}
|
||
});
|
||
}
|
||
|
||
// Container health monitoring — auto-restart unhealthy containers
|
||
// Respects webhook config: skips when disabled or ContainerCrash not subscribed
|
||
crate::health_monitor::spawn_health_monitor(state_manager.clone(), config.data_dir.clone());
|
||
|
||
// Periodic telemetry reporter (every 15 min when opted in)
|
||
crate::monitoring::spawn_telemetry_reporter(
|
||
metrics_for_telemetry,
|
||
Some(state_manager.clone()),
|
||
config.data_dir.clone(),
|
||
);
|
||
|
||
// Post-onboarding auto-activation for archipelago-fips. Runs once
|
||
// at startup: if fips_key is on disk, install /etc/fips/fips.yaml
|
||
// (schema-refreshed) and start the service. This removes the
|
||
// need for a user-facing "Activate" button — the node comes up
|
||
// with FIPS running whenever the seed has been onboarded. Also
|
||
// self-heals legacy raw-byte fips.key files (load_fips_keys
|
||
// rewrites them as bech32 nsec the first time they're read).
|
||
// Pre-onboarding nodes: ConditionPathExists on the service unit
|
||
// + the `fips_key_exists` guard here keep this quiet.
|
||
{
|
||
let data_dir = config.data_dir.clone();
|
||
tokio::spawn(async move {
|
||
let identity_dir = data_dir.join("identity");
|
||
if !crate::identity::fips_key_exists(&identity_dir) {
|
||
tracing::debug!("FIPS auto-activate skipped: fips_key not on disk");
|
||
return;
|
||
}
|
||
// Trigger the migration path in load_fips_keys so old raw-byte
|
||
// key files are rewritten as bech32 before fips.yaml install.
|
||
if let Err(e) = crate::identity::load_fips_keys(&identity_dir).await {
|
||
tracing::warn!("FIPS key load/migrate failed: {}", e);
|
||
return;
|
||
}
|
||
// Check if the installed fips.yaml matches what we'd
|
||
// render now. If not, we need to restart the daemon after
|
||
// reinstalling so it picks up schema changes (e.g. the
|
||
// v1.7.25 re-addition of the TCP transport). Without this,
|
||
// OTA'd nodes would be stuck on the old UDP-only config
|
||
// until someone manually clicked Reconnect.
|
||
let expected = crate::fips::config::render_config_yaml();
|
||
let installed = tokio::fs::read_to_string("/etc/fips/fips.yaml").await.ok();
|
||
let config_changed = installed.as_deref() != Some(expected.as_str());
|
||
|
||
if let Err(e) = crate::fips::config::install(&identity_dir).await {
|
||
tracing::warn!("FIPS config install failed on startup: {}", e);
|
||
return;
|
||
}
|
||
if config_changed {
|
||
tracing::info!(
|
||
"FIPS config schema changed on disk — restarting daemon to pick up new transports"
|
||
);
|
||
// Restart whichever unit is actually supervising
|
||
// the daemon (archipelago-fips vs upstream fips).
|
||
let unit = crate::fips::service::active_unit().await;
|
||
if let Err(e) = crate::fips::service::restart(unit).await {
|
||
tracing::warn!(
|
||
"FIPS restart after config migration failed on {}: {} — user can retry via fips.reconnect",
|
||
unit,
|
||
e
|
||
);
|
||
}
|
||
}
|
||
if let Err(e) = crate::fips::service::activate(crate::fips::SERVICE_UNIT).await {
|
||
tracing::warn!(
|
||
"archipelago-fips activate failed on startup: {} — user can retry via fips.install RPC",
|
||
e
|
||
);
|
||
return;
|
||
}
|
||
tracing::info!("archipelago-fips auto-activated on startup");
|
||
});
|
||
}
|
||
|
||
Ok(Self {
|
||
_config: config,
|
||
_identity: identity,
|
||
api_handler,
|
||
_state_manager: state_manager,
|
||
})
|
||
}
|
||
|
||
/// Serve with a graceful shutdown signal.
|
||
///
|
||
/// `main_addr` is the primary listener (historically `127.0.0.1:5678`).
|
||
/// The main listener always comes up on `main_addr`. The FIPS peer
|
||
/// listener (path-filtered, bound to `fips0`'s ULA) is managed by a
|
||
/// late-binding task that polls every 30s: if fips0 isn't up at
|
||
/// startup (pre-onboarding install, legacy node pre-fips.install),
|
||
/// it keeps trying until the interface appears — no archipelago
|
||
/// restart required after the user activates FIPS.
|
||
///
|
||
/// When `shutdown` completes, both listeners stop accepting and drain
|
||
/// in-flight requests (bounded by `DRAIN_TIMEOUT`).
|
||
pub async fn serve_with_shutdown(
|
||
&self,
|
||
main_addr: SocketAddr,
|
||
shutdown: impl std::future::Future<Output = ()>,
|
||
) -> Result<()> {
|
||
let active_connections = Arc::new(tokio::sync::Semaphore::new(1024));
|
||
let (tx, rx_main) = tokio::sync::watch::channel(false);
|
||
|
||
let main_task = tokio::spawn(accept_loop(
|
||
self.api_handler.clone(),
|
||
TcpListener::bind(main_addr).await?,
|
||
active_connections.clone(),
|
||
false, // main listener: no path filter
|
||
rx_main,
|
||
main_addr,
|
||
));
|
||
|
||
// Peer listener: late-binding so we don't need an archipelago
|
||
// restart when fips0 comes up after onboarding.
|
||
let peer_task = tokio::spawn(peer_late_bind_loop(
|
||
self.api_handler.clone(),
|
||
active_connections.clone(),
|
||
tx.subscribe(),
|
||
));
|
||
|
||
shutdown.await;
|
||
info!("Shutdown signal received, draining connections...");
|
||
let _ = tx.send(true);
|
||
|
||
// Wait up to 5s for in-flight requests.
|
||
let drain_start = std::time::Instant::now();
|
||
let drain_timeout = std::time::Duration::from_secs(5);
|
||
while active_connections.available_permits() < 1024 {
|
||
if drain_start.elapsed() > drain_timeout {
|
||
warn!("Drain timeout reached, forcing shutdown");
|
||
break;
|
||
}
|
||
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
|
||
}
|
||
|
||
let _ = main_task.await;
|
||
let _ = peer_task.await;
|
||
|
||
info!("Shutdown complete");
|
||
Ok(())
|
||
}
|
||
}
|
||
|
||
/// Poll every 30s for `fips0`'s ULA; when it appears, bind the peer
|
||
/// listener and run the normal accept loop. If the bind fails (port
|
||
/// already taken, permissions), log and keep retrying. Returns on
|
||
/// shutdown. First tick fires immediately so the hot path for
|
||
/// already-up fips0 is still zero-cost.
|
||
async fn peer_late_bind_loop(
|
||
handler: Arc<ApiHandler>,
|
||
active_connections: Arc<tokio::sync::Semaphore>,
|
||
mut shutdown_rx: tokio::sync::watch::Receiver<bool>,
|
||
) {
|
||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(30));
|
||
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
|
||
loop {
|
||
tokio::select! {
|
||
_ = interval.tick() => {
|
||
let Some(ip) = crate::fips::iface::fips0_ula() else { continue };
|
||
let addr = SocketAddr::new(
|
||
std::net::IpAddr::V6(ip),
|
||
crate::fips::dial::PEER_PORT,
|
||
);
|
||
let listener = match TcpListener::bind(addr).await {
|
||
Ok(l) => l,
|
||
Err(e) => {
|
||
warn!("FIPS peer listener bind {} failed: {} — retrying in 30s", addr, e);
|
||
continue;
|
||
}
|
||
};
|
||
info!("FIPS peer listener bound {}", addr);
|
||
// Once bound, serve until shutdown fires. accept_loop
|
||
// returns on shutdown, which also ends this outer loop.
|
||
accept_loop(
|
||
handler,
|
||
listener,
|
||
active_connections,
|
||
true, // peer listener: apply path filter
|
||
shutdown_rx,
|
||
addr,
|
||
)
|
||
.await;
|
||
return;
|
||
}
|
||
_ = shutdown_rx.changed() => {
|
||
if *shutdown_rx.borrow() { return; }
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Whitelist of HTTP paths reachable via the peer-facing (FIPS) listener.
|
||
/// Every entry is an endpoint already protected by cryptographic auth
|
||
/// (ed25519 signature verification inside the handler, federation DID
|
||
/// headers checked by the content server, or JSON-RPC methods whose
|
||
/// handlers verify per-message signatures).
|
||
///
|
||
/// Anything not on this list returns 404 on the peer listener.
|
||
pub fn is_peer_allowed_path(path: &str) -> bool {
|
||
// Exact matches
|
||
matches!(
|
||
path,
|
||
"/health"
|
||
| "/rpc/v1"
|
||
| "/archipelago/node-message"
|
||
| "/archipelago/mesh-typed"
|
||
| "/dwn"
|
||
| "/transport/inbox"
|
||
// Content *catalog* — the peer-browse entry point. This is the
|
||
// exact path `/content` (no trailing slash); the prefix match
|
||
// below only covers `/content/<id>` item fetches, so without
|
||
// this the catalog 404s over the mesh and `content.browse-peer`
|
||
// fails with "Peer returned error: 404 Not Found" (and never
|
||
// falls back to Tor, since a 404 is a successful HTTP exchange).
|
||
| "/content"
|
||
)
|
||
// Prefix-matched content endpoints (peer file browse + fetch)
|
||
|| path.starts_with("/content/")
|
||
}
|
||
|
||
async fn accept_loop(
|
||
handler: Arc<ApiHandler>,
|
||
listener: TcpListener,
|
||
active_connections: Arc<tokio::sync::Semaphore>,
|
||
peer_only: bool,
|
||
mut shutdown_rx: tokio::sync::watch::Receiver<bool>,
|
||
local_addr: SocketAddr,
|
||
) {
|
||
loop {
|
||
tokio::select! {
|
||
result = listener.accept() => {
|
||
let (stream, peer_addr) = match result {
|
||
Ok(c) => c,
|
||
Err(e) => {
|
||
error!("{} accept error: {}", local_addr, e);
|
||
continue;
|
||
}
|
||
};
|
||
let handler = handler.clone();
|
||
let permit = active_connections.clone().acquire_owned().await;
|
||
tokio::spawn(async move {
|
||
let _permit = permit;
|
||
let service = service_fn(move |req: hyper::Request<hyper::Body>| {
|
||
let handler = handler.clone();
|
||
async move {
|
||
if peer_only && !is_peer_allowed_path(req.uri().path()) {
|
||
let resp = hyper::Response::builder()
|
||
.status(hyper::StatusCode::NOT_FOUND)
|
||
.body(hyper::Body::empty())
|
||
.expect("static response builds");
|
||
return Ok::<_, std::io::Error>(resp);
|
||
}
|
||
handler
|
||
.handle_request(req)
|
||
.await
|
||
.map_err(|e| std::io::Error::other(format!("{}", e)))
|
||
}
|
||
});
|
||
if let Err(e) = Http::new()
|
||
.http1_keep_alive(false)
|
||
.serve_connection(stream, service)
|
||
.with_upgrades()
|
||
.await
|
||
{
|
||
error!("Error serving connection from {}: {}", peer_addr, e);
|
||
}
|
||
});
|
||
}
|
||
_ = shutdown_rx.changed() => {
|
||
if *shutdown_rx.borrow() {
|
||
return;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
async fn create_docker_scanner(config: &Config) -> Result<DockerPackageScanner> {
|
||
let user = std::env::var("USER").unwrap_or_else(|_| "archipelago".to_string());
|
||
|
||
let runtime: Arc<dyn archipelago_container::ContainerRuntime> = match &config.container_runtime
|
||
{
|
||
ContainerRuntime::Podman => {
|
||
Arc::new(archipelago_container::PodmanRuntime::new(user.clone()))
|
||
}
|
||
ContainerRuntime::Docker => {
|
||
Arc::new(archipelago_container::DockerRuntime::new(user.clone()))
|
||
}
|
||
ContainerRuntime::Auto => {
|
||
Arc::new(archipelago_container::AutoRuntime::new(user.clone()).await?)
|
||
}
|
||
};
|
||
|
||
Ok(DockerPackageScanner::new(runtime))
|
||
}
|
||
|
||
async fn refresh_tor_address(state: &StateManager, identity: &NodeIdentity) -> Result<()> {
|
||
let tor_addr = docker_packages::read_tor_address("archipelago").await;
|
||
let (current_data, _) = state.get_snapshot().await;
|
||
if tor_addr != current_data.server_info.tor_address {
|
||
let mut data = current_data;
|
||
data.server_info.tor_address = tor_addr.clone();
|
||
data.server_info.node_address = tor_addr.as_ref().map(|t| identity.node_address(t));
|
||
state.update_data(data).await;
|
||
if let Some(ref addr) = tor_addr {
|
||
info!("🔒 Tor address updated: {}", addr);
|
||
}
|
||
}
|
||
Ok(())
|
||
}
|
||
|
||
/// Number of consecutive absent scans before removing a container from state.
|
||
/// 3 scans × 30s = 90 seconds of absence before removal.
|
||
const CONTAINER_ABSENCE_THRESHOLD: u32 = 3;
|
||
|
||
/// Maximum time a package entry may remain stuck in a transitional state
|
||
/// before the scan loop overrides it with podman's live state.
|
||
///
|
||
/// Rationale: the longest single-container stop timeout is bitcoin-core at
|
||
/// 600s. 2× that gives the spawned task ample margin before we assume it
|
||
/// died (panic, OOM, process restart mid-stop) and fall back to the
|
||
/// scanner's authoritative view. Applies to all transitional variants.
|
||
const TRANSITIONAL_STUCK_TIMEOUT: Duration = Duration::from_secs(120);
|
||
|
||
/// Multi-container installs can legitimately spend several minutes before the
|
||
/// primary user-facing container exists. BTCPay, for example, pulls/starts
|
||
/// Postgres and NBXplorer before `btcpay-server`; do not erase its installing
|
||
/// card just because the primary container is absent during that setup window.
|
||
const INSTALLING_STUCK_TIMEOUT: Duration = Duration::from_secs(20 * 60);
|
||
|
||
fn transitional_stuck_timeout(state: &crate::data_model::PackageState) -> Duration {
|
||
use crate::data_model::PackageState::*;
|
||
match state {
|
||
Installing | Starting | Restarting => INSTALLING_STUCK_TIMEOUT,
|
||
_ => TRANSITIONAL_STUCK_TIMEOUT,
|
||
}
|
||
}
|
||
|
||
/// Returns true if `state` is one of the transitional variants that a
|
||
/// `spawn_transitional`-style background task owns. While such a state is
|
||
/// set, the package scanner must not overwrite it with whatever podman
|
||
/// reports (see `merge_preserving_transitional`).
|
||
fn is_transitional(state: &crate::data_model::PackageState) -> bool {
|
||
use crate::data_model::PackageState::*;
|
||
matches!(
|
||
state,
|
||
Installing
|
||
| Stopping
|
||
| Starting
|
||
| Restarting
|
||
| Updating
|
||
| Removing
|
||
| CreatingBackup
|
||
| RestoringBackup
|
||
| BackingUp
|
||
)
|
||
}
|
||
|
||
fn absent_transitional_replacement(
|
||
state: &crate::data_model::PackageState,
|
||
) -> Option<crate::data_model::PackageState> {
|
||
match state {
|
||
// A stop operation is complete once the container record disappears.
|
||
// Do not leave the app card wedged in "Stopping..." just because the
|
||
// background task died or the backend restarted before it wrote back.
|
||
crate::data_model::PackageState::Stopping => Some(crate::data_model::PackageState::Stopped),
|
||
_ => None,
|
||
}
|
||
}
|
||
|
||
/// Merge a fresh scan entry `fresh` into `existing` while preserving
|
||
/// `existing.state` (which is transitional — the RPC spawn task owns it).
|
||
/// Non-state observability fields are taken from `fresh` so the UI still
|
||
/// sees live health / exit_code / lan_address readings during a transition.
|
||
fn merge_preserving_transitional(
|
||
existing: &crate::data_model::PackageDataEntry,
|
||
fresh: &crate::data_model::PackageDataEntry,
|
||
user_stop_requested: bool,
|
||
) -> crate::data_model::PackageDataEntry {
|
||
let state = match (&existing.state, &fresh.state) {
|
||
// A user-initiated stop must keep showing Stopping while podman still
|
||
// reports Running. Repair/restart transitions do not have a user-stop
|
||
// marker, so a fresh Running scan means the app recovered.
|
||
(crate::data_model::PackageState::Stopping, crate::data_model::PackageState::Running)
|
||
if !user_stop_requested =>
|
||
{
|
||
fresh.state.clone()
|
||
}
|
||
// Removing with a live running container is stale: uninstall either
|
||
// failed or Archipelago restarted before the spawned task could revert
|
||
// state. Let the scanner recover the UI immediately instead of
|
||
// keeping the app wedged in Removing for 20 minutes.
|
||
(crate::data_model::PackageState::Removing, crate::data_model::PackageState::Running) => {
|
||
fresh.state.clone()
|
||
}
|
||
_ => existing.state.clone(),
|
||
};
|
||
|
||
crate::data_model::PackageDataEntry {
|
||
state,
|
||
// install_progress and uninstall_stage are also owned by the
|
||
// initiating op (same reason as state) — keep them.
|
||
install_progress: existing.install_progress.clone(),
|
||
uninstall_stage: existing.uninstall_stage.clone(),
|
||
// Everything else comes from the fresh scan.
|
||
health: fresh.health.clone(),
|
||
exit_code: fresh.exit_code,
|
||
static_files: fresh.static_files.clone(),
|
||
manifest: fresh.manifest.clone(),
|
||
installed: fresh.installed.clone(),
|
||
available_update: fresh.available_update.clone(),
|
||
}
|
||
}
|
||
|
||
fn is_podman_scan_timeout(error: &anyhow::Error) -> bool {
|
||
let msg = format!("{:#}", error);
|
||
msg.contains("podman ps") && msg.contains("timed out")
|
||
}
|
||
|
||
async fn scan_and_update_packages(
|
||
scanner: &DockerPackageScanner,
|
||
state: &StateManager,
|
||
identity: &NodeIdentity,
|
||
data_dir: &std::path::Path,
|
||
absence_tracker: &mut HashMap<String, u32>,
|
||
transitional_since: &mut HashMap<String, Instant>,
|
||
) -> Result<()> {
|
||
let mut packages = scanner.scan_containers().await?;
|
||
let user_stopped = crate::crash_recovery::load_user_stopped(data_dir).await;
|
||
for (id, pkg) in packages.iter_mut() {
|
||
if pkg.state == crate::data_model::PackageState::Exited && user_stopped.contains(id) {
|
||
pkg.state = crate::data_model::PackageState::Stopped;
|
||
pkg.exit_code = None;
|
||
}
|
||
}
|
||
normalize_reachable_package_health(&mut packages).await;
|
||
|
||
let (current_data, _) = state.get_snapshot().await;
|
||
let tor_addr = docker_packages::read_tor_address("archipelago").await;
|
||
let tor_changed = tor_addr != current_data.server_info.tor_address;
|
||
let first_scan = !current_data.server_info.status_info.containers_scanned;
|
||
|
||
// Check if update scheduler has found an available update
|
||
let update_available = crate::update::load_state(std::path::Path::new("/var/lib/archipelago"))
|
||
.await
|
||
.map(|s| s.available_update.is_some())
|
||
.unwrap_or(false);
|
||
let update_changed = update_available != current_data.server_info.status_info.updated;
|
||
|
||
// Empty scan result = podman failure or timeout, preserve existing state
|
||
if packages.is_empty() && !first_scan {
|
||
if tor_changed || update_changed {
|
||
let mut data = current_data;
|
||
data.server_info.tor_address = tor_addr.clone();
|
||
data.server_info.node_address = tor_addr.as_ref().map(|t| identity.node_address(t));
|
||
data.server_info.status_info.updated = update_available;
|
||
state.update_data(data).await;
|
||
}
|
||
return Ok(());
|
||
}
|
||
|
||
// Merge scan results with current state instead of full replacement.
|
||
// This prevents containers from vanishing when podman intermittently
|
||
// returns incomplete results under heavy load.
|
||
let mut merged = current_data.package_data.clone();
|
||
let mut changed = false;
|
||
|
||
// Update/add containers found in this scan.
|
||
//
|
||
// Transitional states (Stopping, Starting, Restarting, Installing,
|
||
// Updating, Removing, backup variants) are owned by the RPC spawn_task
|
||
// that initiated the operation — podman's live state during the op is
|
||
// meaningless ("running" during a graceful stop, "exited" during a
|
||
// restart, etc.) and must not be written back. See
|
||
// `merge_preserving_transitional` for the exact rule.
|
||
//
|
||
// Escape hatch: if a package has been in a transitional state for
|
||
// longer than TRANSITIONAL_STUCK_TIMEOUT we assume the spawned task
|
||
// died without cleanup and let the scan override it.
|
||
let now = Instant::now();
|
||
for (id, pkg) in &packages {
|
||
absence_tracker.remove(id);
|
||
let existing = merged.get(id);
|
||
let overwrite = match existing {
|
||
Some(existing_entry) if is_transitional(&existing_entry.state) => {
|
||
let entered = *transitional_since.entry(id.clone()).or_insert(now);
|
||
let timeout = transitional_stuck_timeout(&existing_entry.state);
|
||
let stuck = now.duration_since(entered) > timeout;
|
||
if stuck {
|
||
warn!(
|
||
"Container {} stuck in {:?} for >{}s; overriding with scan state {:?}",
|
||
id,
|
||
existing_entry.state,
|
||
timeout.as_secs(),
|
||
pkg.state
|
||
);
|
||
transitional_since.remove(id);
|
||
true
|
||
} else {
|
||
// Keep existing transitional state, but merge non-state
|
||
// observability fields (health, exit_code, lan_address
|
||
// via installed) from the fresh scan so the UI still
|
||
// sees live readings.
|
||
let merged_entry = merge_preserving_transitional(
|
||
existing_entry,
|
||
pkg,
|
||
user_stopped.contains(id),
|
||
);
|
||
if existing.cloned() != Some(merged_entry.clone()) {
|
||
merged.insert(id.clone(), merged_entry);
|
||
changed = true;
|
||
}
|
||
false
|
||
}
|
||
}
|
||
Some(_) => {
|
||
// Not transitional: the side-table may hold a stale entry
|
||
// from a previous transition on this id; drop it.
|
||
transitional_since.remove(id);
|
||
existing != Some(pkg)
|
||
}
|
||
None => {
|
||
transitional_since.remove(id);
|
||
true
|
||
}
|
||
};
|
||
if overwrite && merged.get(id) != Some(pkg) {
|
||
merged.insert(id.clone(), pkg.clone());
|
||
changed = true;
|
||
}
|
||
}
|
||
|
||
// Track containers in state but missing from this scan.
|
||
// Only remove after CONTAINER_ABSENCE_THRESHOLD consecutive absent scans.
|
||
let current_ids: Vec<String> = merged.keys().cloned().collect();
|
||
for id in current_ids {
|
||
if !packages.contains_key(&id) {
|
||
// Don't evict packages mid-transition: Installing/Updating/Removing
|
||
// legitimately have no live container yet (image still pulling) or
|
||
// briefly (during recreate). The absence-eviction here was racing
|
||
// installs and removing apps from the UI 14s in. The transitional
|
||
// owner (spawn_task) is responsible for clearing state, not us.
|
||
if let Some(entry) = merged.get(&id) {
|
||
if is_transitional(&entry.state) {
|
||
if let Some(replacement) = absent_transitional_replacement(&entry.state) {
|
||
let mut updated = entry.clone();
|
||
updated.state = replacement;
|
||
updated.health = None;
|
||
updated.exit_code = None;
|
||
updated.install_progress = None;
|
||
updated.uninstall_stage = None;
|
||
merged.insert(id.clone(), updated);
|
||
transitional_since.remove(&id);
|
||
absence_tracker.remove(&id);
|
||
changed = true;
|
||
continue;
|
||
}
|
||
let entered = *transitional_since.entry(id.clone()).or_insert(now);
|
||
let timeout = transitional_stuck_timeout(&entry.state);
|
||
if now.duration_since(entered) > timeout {
|
||
warn!(
|
||
"Container {} stuck in {:?} and absent for >{}s; removing stale transitional state",
|
||
id,
|
||
entry.state,
|
||
timeout.as_secs()
|
||
);
|
||
merged.remove(&id);
|
||
transitional_since.remove(&id);
|
||
changed = true;
|
||
}
|
||
absence_tracker.remove(&id);
|
||
continue;
|
||
}
|
||
// Quadlet-generated units run containers with `--rm`, so a
|
||
// clean user stop removes the Podman record. Keep the package
|
||
// visible as Stopped while the user-stopped marker exists so
|
||
// package.start can recreate it via systemd/Quadlet.
|
||
if entry.state == crate::data_model::PackageState::Stopped
|
||
&& user_stopped.contains(&id)
|
||
{
|
||
absence_tracker.remove(&id);
|
||
continue;
|
||
}
|
||
}
|
||
let count = absence_tracker.entry(id.clone()).or_insert(0);
|
||
*count += 1;
|
||
if *count >= CONTAINER_ABSENCE_THRESHOLD {
|
||
debug!(
|
||
"Removing {} from state after {} consecutive absent scans",
|
||
id, count
|
||
);
|
||
merged.remove(&id);
|
||
absence_tracker.remove(&id);
|
||
transitional_since.remove(&id);
|
||
changed = true;
|
||
}
|
||
}
|
||
}
|
||
|
||
if changed || tor_changed || first_scan || update_changed {
|
||
let mut data = current_data;
|
||
data.package_data = merged;
|
||
data.server_info.tor_address = tor_addr.clone();
|
||
data.server_info.node_address = tor_addr.as_ref().map(|t| identity.node_address(t));
|
||
data.server_info.status_info.containers_scanned = true;
|
||
data.server_info.status_info.updated = update_available;
|
||
state.update_data(data).await;
|
||
debug!(
|
||
"📦 State changed (packages={}, tor={}, first_scan={}, update={}), broadcasting update",
|
||
changed, tor_changed, first_scan, update_changed
|
||
);
|
||
}
|
||
|
||
Ok(())
|
||
}
|
||
|
||
async fn normalize_reachable_package_health(
|
||
packages: &mut HashMap<String, crate::data_model::PackageDataEntry>,
|
||
) {
|
||
for (id, pkg) in packages.iter_mut() {
|
||
if pkg.state != crate::data_model::PackageState::Running {
|
||
continue;
|
||
}
|
||
if !matches!(pkg.health.as_deref(), Some("starting" | "unhealthy" | "1")) {
|
||
continue;
|
||
}
|
||
let Some(port) = pkg
|
||
.installed
|
||
.as_ref()
|
||
.and_then(|i| i.interface_addresses.get("main"))
|
||
.and_then(|a| a.lan_address.as_deref())
|
||
.and_then(port_from_url)
|
||
.or_else(|| fallback_package_port(id))
|
||
else {
|
||
continue;
|
||
};
|
||
if frontend_port_http_ready(port).await {
|
||
debug!(app_id = %id, port, "normalizing reachable package health to healthy");
|
||
pkg.health = Some("healthy".to_string());
|
||
ensure_main_lan_address(pkg, port);
|
||
}
|
||
}
|
||
}
|
||
|
||
async fn frontend_port_http_ready(port: u16) -> bool {
|
||
let Ok(Ok(mut stream)) = tokio::time::timeout(
|
||
Duration::from_secs(2),
|
||
tokio::net::TcpStream::connect(("127.0.0.1", port)),
|
||
)
|
||
.await
|
||
else {
|
||
return false;
|
||
};
|
||
|
||
let request = b"GET / HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n";
|
||
if stream.write_all(request).await.is_err() {
|
||
return false;
|
||
}
|
||
|
||
let mut buf = [0u8; 64];
|
||
let Ok(Ok(n)) = tokio::time::timeout(Duration::from_secs(2), stream.read(&mut buf)).await
|
||
else {
|
||
return false;
|
||
};
|
||
if n == 0 {
|
||
return false;
|
||
}
|
||
|
||
let head = String::from_utf8_lossy(&buf[..n]);
|
||
head.starts_with("HTTP/1.1 2")
|
||
|| head.starts_with("HTTP/1.1 3")
|
||
|| head.starts_with("HTTP/1.0 2")
|
||
|| head.starts_with("HTTP/1.0 3")
|
||
}
|
||
|
||
fn ensure_main_lan_address(pkg: &mut crate::data_model::PackageDataEntry, port: u16) {
|
||
let Some(installed) = pkg.installed.as_mut() else {
|
||
return;
|
||
};
|
||
let main = installed
|
||
.interface_addresses
|
||
.entry("main".to_string())
|
||
.or_insert_with(|| crate::data_model::InterfaceAddress {
|
||
tor_address: String::new(),
|
||
lan_address: None,
|
||
});
|
||
if main.lan_address.is_none() {
|
||
main.lan_address = Some(format!("http://localhost:{port}"));
|
||
}
|
||
}
|
||
|
||
fn fallback_package_port(app_id: &str) -> Option<u16> {
|
||
match app_id {
|
||
"fedimint" | "fedimintd" => Some(8175),
|
||
"filebrowser" => Some(8083),
|
||
"indeedhub" => Some(7778),
|
||
"nginx-proxy-manager" => Some(8081),
|
||
"nostr-rs-relay" => Some(18081),
|
||
_ => None,
|
||
}
|
||
}
|
||
|
||
fn port_from_url(url: &str) -> Option<u16> {
|
||
let after_scheme = url.split_once("://").map(|(_, rest)| rest).unwrap_or(url);
|
||
let host_port = after_scheme.split('/').next().unwrap_or(after_scheme);
|
||
let port = host_port.rsplit_once(':')?.1;
|
||
port.parse::<u16>().ok()
|
||
}
|
||
|
||
/// Register Archipelago DWN protocols on startup.
|
||
async fn register_dwn_protocols(data_dir: &std::path::Path) -> Result<()> {
|
||
use crate::network::dwn_store::{DwnStore, ProtocolDefinition};
|
||
|
||
let protocols = [
|
||
("https://archipelago.dev/protocols/node-identity/v1", true),
|
||
("https://archipelago.dev/protocols/file-catalog/v1", true),
|
||
("https://archipelago.dev/protocols/federation/v1", false),
|
||
("https://archipelago.dev/protocols/app-deploy/v1", false),
|
||
];
|
||
|
||
let store = DwnStore::new(data_dir).await?;
|
||
let existing = store.list_protocols().await?;
|
||
let existing_uris: std::collections::HashSet<String> =
|
||
existing.iter().map(|p| p.protocol.clone()).collect();
|
||
|
||
let mut registered = 0;
|
||
for (uri, published) in &protocols {
|
||
if existing_uris.contains(*uri) {
|
||
continue;
|
||
}
|
||
let def = ProtocolDefinition {
|
||
protocol: uri.to_string(),
|
||
published: *published,
|
||
types: std::collections::HashMap::new(),
|
||
structure: std::collections::HashMap::new(),
|
||
date_registered: chrono::Utc::now().to_rfc3339(),
|
||
};
|
||
store.register_protocol(&def).await?;
|
||
registered += 1;
|
||
}
|
||
|
||
if registered > 0 {
|
||
info!("📋 Registered {registered} DWN protocols");
|
||
}
|
||
Ok(())
|
||
}
|
||
|
||
/// Periodically check peer reachability and broadcast status changes.
|
||
async fn check_peer_health(state: &StateManager, data_dir: &std::path::Path) -> Result<()> {
|
||
let known_peers = peers::load_peers(data_dir).await.unwrap_or_default();
|
||
if known_peers.is_empty() {
|
||
return Ok(());
|
||
}
|
||
|
||
let mut new_health = std::collections::HashMap::new();
|
||
for peer in &known_peers {
|
||
let fips_npub = crate::federation::fips_npub_for_onion(data_dir, &peer.onion).await;
|
||
let reachable = node_message::check_peer_reachable(&peer.onion, fips_npub.as_deref())
|
||
.await
|
||
.unwrap_or(false);
|
||
new_health.insert(peer.onion.clone(), reachable);
|
||
}
|
||
|
||
let (current_data, _) = state.get_snapshot().await;
|
||
if current_data.peer_health != new_health {
|
||
let mut data = current_data;
|
||
data.peer_health = new_health;
|
||
state.update_data(data).await;
|
||
debug!("🔗 Peer health updated, broadcasting changes");
|
||
}
|
||
|
||
Ok(())
|
||
}
|
||
|
||
#[cfg(test)]
|
||
mod merge_tests {
|
||
use super::*;
|
||
use crate::data_model::{Description, Manifest, PackageDataEntry, PackageState, StaticFiles};
|
||
|
||
fn make_manifest() -> Manifest {
|
||
Manifest {
|
||
id: "lnd".to_string(),
|
||
title: "LND".to_string(),
|
||
version: "0.18.4".to_string(),
|
||
description: Description {
|
||
short: "".to_string(),
|
||
long: "".to_string(),
|
||
},
|
||
release_notes: "".to_string(),
|
||
license: "".to_string(),
|
||
wrapper_repo: "".to_string(),
|
||
upstream_repo: "".to_string(),
|
||
support_site: "".to_string(),
|
||
marketing_site: "".to_string(),
|
||
donation_url: None,
|
||
author: None,
|
||
website: None,
|
||
interfaces: None,
|
||
tier: None,
|
||
}
|
||
}
|
||
|
||
fn make_static() -> StaticFiles {
|
||
StaticFiles {
|
||
license: "".to_string(),
|
||
instructions: "".to_string(),
|
||
icon: "".to_string(),
|
||
}
|
||
}
|
||
|
||
fn make_entry(state: PackageState, health: Option<&str>) -> PackageDataEntry {
|
||
PackageDataEntry {
|
||
state,
|
||
health: health.map(|s| s.to_string()),
|
||
exit_code: None,
|
||
static_files: make_static(),
|
||
manifest: make_manifest(),
|
||
installed: None,
|
||
install_progress: None,
|
||
uninstall_stage: None,
|
||
available_update: None,
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn peer_path_filter_allows_content_catalog_and_items() {
|
||
// Regression: the content *catalog* is exactly "/content" (no trailing
|
||
// slash). It must be reachable over the peer (FIPS) listener, else
|
||
// `content.browse-peer` 404s over the mesh. Item fetches are
|
||
// "/content/<id>".
|
||
assert!(is_peer_allowed_path("/content"), "catalog must be allowed");
|
||
assert!(
|
||
is_peer_allowed_path("/content/abc123"),
|
||
"items must be allowed"
|
||
);
|
||
assert!(is_peer_allowed_path("/rpc/v1"));
|
||
assert!(is_peer_allowed_path("/health"));
|
||
// Not on the allow-list → rejected (no broad surface over the mesh).
|
||
assert!(!is_peer_allowed_path("/contention"), "must not prefix-leak");
|
||
assert!(!is_peer_allowed_path("/"));
|
||
assert!(!is_peer_allowed_path("/rpc/v2"));
|
||
}
|
||
|
||
#[test]
|
||
fn preserves_transitional_state_on_merge() {
|
||
// existing: user initiated a stop, spawn_transitional set Stopping.
|
||
// fresh: podman hasn't finished the stop yet, still reports Running.
|
||
// Expected: merged state stays Stopping — podman's live view must
|
||
// not clobber the transitional state owned by the RPC spawn task.
|
||
let existing = make_entry(PackageState::Stopping, Some("healthy"));
|
||
let fresh = make_entry(PackageState::Running, Some("starting"));
|
||
let merged = merge_preserving_transitional(&existing, &fresh, true);
|
||
assert_eq!(merged.state, PackageState::Stopping);
|
||
}
|
||
|
||
#[test]
|
||
fn non_user_stopping_recovers_when_container_is_running() {
|
||
let existing = make_entry(PackageState::Stopping, Some("unknown"));
|
||
let fresh = make_entry(PackageState::Running, Some("healthy"));
|
||
let merged = merge_preserving_transitional(&existing, &fresh, false);
|
||
assert_eq!(merged.state, PackageState::Running);
|
||
assert_eq!(merged.health.as_deref(), Some("healthy"));
|
||
}
|
||
|
||
#[test]
|
||
fn merges_fresh_observability_fields() {
|
||
// Non-state observability fields (health, exit_code, installed)
|
||
// MUST come from the fresh scan even while state is preserved —
|
||
// the UI still shows live health/health during a transition.
|
||
let mut existing = make_entry(PackageState::Stopping, Some("healthy"));
|
||
existing.exit_code = None;
|
||
let mut fresh = make_entry(PackageState::Running, Some("unhealthy"));
|
||
fresh.exit_code = Some(0);
|
||
let merged = merge_preserving_transitional(&existing, &fresh, true);
|
||
assert_eq!(merged.state, PackageState::Stopping);
|
||
assert_eq!(merged.health.as_deref(), Some("unhealthy"));
|
||
assert_eq!(merged.exit_code, Some(0));
|
||
}
|
||
|
||
#[test]
|
||
fn stale_removing_recovers_when_container_is_running() {
|
||
let existing = make_entry(PackageState::Removing, Some("unknown"));
|
||
let fresh = make_entry(PackageState::Running, Some("healthy"));
|
||
let merged = merge_preserving_transitional(&existing, &fresh, false);
|
||
assert_eq!(merged.state, PackageState::Running);
|
||
assert_eq!(merged.health.as_deref(), Some("healthy"));
|
||
}
|
||
|
||
#[test]
|
||
fn is_transitional_covers_all_variants() {
|
||
for s in [
|
||
PackageState::Installing,
|
||
PackageState::Stopping,
|
||
PackageState::Starting,
|
||
PackageState::Restarting,
|
||
PackageState::Updating,
|
||
PackageState::Removing,
|
||
PackageState::CreatingBackup,
|
||
PackageState::RestoringBackup,
|
||
PackageState::BackingUp,
|
||
] {
|
||
assert!(is_transitional(&s), "{:?} should be transitional", s);
|
||
}
|
||
for s in [
|
||
PackageState::Installed,
|
||
PackageState::Stopped,
|
||
PackageState::Exited,
|
||
PackageState::Running,
|
||
] {
|
||
assert!(!is_transitional(&s), "{:?} should NOT be transitional", s);
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn installing_uses_longer_stale_timeout_than_other_transitions() {
|
||
assert!(transitional_stuck_timeout(&PackageState::Installing) > TRANSITIONAL_STUCK_TIMEOUT);
|
||
assert_eq!(
|
||
transitional_stuck_timeout(&PackageState::Stopping),
|
||
TRANSITIONAL_STUCK_TIMEOUT
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn absent_stopping_transitions_to_stopped() {
|
||
assert_eq!(
|
||
absent_transitional_replacement(&PackageState::Stopping),
|
||
Some(PackageState::Stopped)
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn absent_installing_still_waits_for_owner() {
|
||
assert_eq!(
|
||
absent_transitional_replacement(&PackageState::Installing),
|
||
None
|
||
);
|
||
}
|
||
}
|