// Archipelago Bitcoin Node OS - Native Backend // Pure Archipelago implementation, no StartOS dependencies // Crate-level clippy allowances. These are stylistic lints that fire on // large legacy surfaces and offer no correctness benefit to chase on every // PR — suppressing them crate-wide keeps CI gating on correctness issues // without drowning in cleanup noise every time a new toolchain tightens. #![allow( clippy::too_many_arguments, clippy::doc_lazy_continuation, clippy::type_complexity, clippy::enum_variant_names, clippy::wildcard_in_or_patterns, clippy::assertions_on_constants, clippy::drop_non_drop, clippy::unused_io_amount, clippy::ptr_arg )] use anyhow::{Context, Result}; use std::net::SocketAddr; use std::sync::Arc; use tokio::signal; use tokio::sync::Notify; use tracing::info; mod api; mod auth; mod avatar; mod backup; mod bitcoin_rpc; mod blobs; mod bootstrap; mod config; mod constants; mod container; mod content_server; mod crash_recovery; mod credentials; mod data_model; mod disk_monitor; mod electrs_status; mod federation; mod fips; mod health_monitor; mod identity; mod identity_manager; mod marketplace; mod mesh; mod monitoring; mod names; mod network; mod node_message; mod nostr_discovery; mod nostr_handshake; mod nostr_relays; mod peers; mod port_allocator; mod rate_limit; pub mod seed; mod server; mod session; mod settings; mod state; mod streaming; mod totp; mod transport; mod update; mod vpn; mod wallet; mod webhooks; use config::Config; use container::{ BootReconciler, ContainerOrchestrator, DevContainerOrchestrator, ProdContainerOrchestrator, RECONCILER_DEFAULT_INTERVAL, }; use server::Server; #[tokio::main] async fn main() -> Result<()> { let startup_start = std::time::Instant::now(); crash_recovery::init_start_time(); // Initialize tracing tracing_subscriber::fmt() .with_env_filter( tracing_subscriber::EnvFilter::try_from_default_env() .unwrap_or_else(|_| "archipelago=debug,info".into()), ) .init(); info!("Starting Archipelago Bitcoin Node OS"); // Self-heal web-ui permissions. The OTA updater in <=v1.7.38 left // /opt/archipelago/web-ui as drwx------ (700) after the atomic // swap — nginx (www-data) then returned 500/403 on every request // until someone shelled in and chmod'd it. Check on every boot // and repair if needed so a node auto-recovers after the next // service restart that follows a broken OTA. tokio::spawn(async move { use std::os::unix::fs::PermissionsExt; let web_ui = std::path::Path::new("/opt/archipelago/web-ui"); if let Ok(meta) = tokio::fs::metadata(web_ui).await { let mode = meta.permissions().mode() & 0o777; if mode & 0o005 != 0o005 { tracing::warn!( "web-ui perms {:o} not world-readable — self-healing", mode ); let _ = tokio::process::Command::new("sudo") .args([ "-n", "chmod", "-R", "u=rwX,go=rX", "/opt/archipelago/web-ui", ]) .status() .await; } } }); // Load configuration let config = Config::load().await?; info!("📁 Data directory: {}", config.data_dir.display()); // Load user transport preferences so peer-to-peer call sites can // consult them from any module without threading a handle through // deep async chains. Missing/corrupt file → default (Auto everywhere). if let Err(e) = settings::transport::init(&config.data_dir).await { tracing::warn!( "Failed to initialise transport preferences: {} — using defaults", e ); } // Write PID marker early so we can detect crashes on next startup crash_recovery::write_pid_marker(&config.data_dir).await?; // Crash recovery runs in background so health endpoint is available immediately { let data_dir = config.data_dir.clone(); tokio::spawn(async move { // Check if previous instance shut down cleanly match crash_recovery::check_for_crash(&data_dir).await { Ok(Some(containers)) => { info!( "🔧 Recovering {} containers from previous crash...", containers.len() ); let report = crash_recovery::recover_containers(&containers).await; info!( "🔧 Recovery complete: {}/{} containers restarted (failed: {:?})", report.recovered, report.total, report.failed ); } Ok(None) => {} Err(e) => { tracing::warn!("Crash recovery check failed: {}", e); } } // Start any stopped containers (handles clean reboot) // Skips user-stopped containers, uses tier ordering let boot_report = crash_recovery::start_stopped_containers(&data_dir).await; if boot_report.total > 0 { info!( "🔄 Boot startup: {}/{} containers started (failed: {:?})", boot_report.recovered, boot_report.total, boot_report.failed ); } // Signal to health monitor that boot recovery is done crash_recovery::mark_recovery_complete(); }); } // Construct the container orchestrator once. In prod mode we load the // on-disk app manifests, do an initial adoption pass, and spawn the // BootReconciler loop (Step 5/6 of the rust-orchestrator migration). // Dev mode uses the in-memory DevContainerOrchestrator and has no // reconciler (manifests are pushed via RPC, not discovered from disk). let shutdown_notify = Arc::new(Notify::new()); let (orchestrator, dev_orchestrator): ( Option>, Option>, ) = if config.dev_mode { let dev = Arc::new(DevContainerOrchestrator::new(config.clone()).await?); let trait_obj: Arc = dev.clone(); (Some(trait_obj), Some(dev)) } else { let prod = Arc::new(ProdContainerOrchestrator::new(config.clone()).await?); // Best-effort manifest load; a missing /opt/archipelago/apps is // logged inside load_manifests and not fatal. match prod.load_manifests().await { Ok(n) => info!("📦 Loaded {n} app manifest(s) from disk"), Err(e) => { tracing::error!(error = %e, "prod orchestrator: load_manifests failed at startup"); } } // Adoption pass: link existing podman containers back to their // manifests so the reconciler doesn't recreate them. match prod.adopt_existing().await { Ok(report) => { info!( "🔗 Adopted {} existing container(s): {:?}", report.adopted.len(), report.adopted ); } Err(e) => { tracing::warn!(error = %e, "prod orchestrator: adopt_existing failed (non-fatal)"); } } // Spawn the boot reconciler loop. Runs an initial reconcile // immediately, then re-checks every RECONCILER_DEFAULT_INTERVAL // until shutdown_notify fires. { let reconciler = BootReconciler::new( prod.clone(), RECONCILER_DEFAULT_INTERVAL, shutdown_notify.clone(), ); tokio::spawn(reconciler.run_forever()); info!( "🔄 Boot reconciler started (interval: {:?})", RECONCILER_DEFAULT_INTERVAL ); } let trait_obj: Arc = prod; (Some(trait_obj), None) }; // Ensure a default user exists so login works after install/onboarding. // In production, the default password is "password123" (shown during install). // In dev mode, the dev default password is used. // Don't auto-create default user — let onboarding flow handle password setup // via auth.setup RPC. The Login page detects is_setup=false and shows // "Create Password" form instead of login form. // Create server let server = Server::new(config.clone(), orchestrator, dev_orchestrator).await?; // Start server let addr: SocketAddr = format!("{}:{}", config.bind_host, config.bind_port) .parse() .context("Invalid bind address")?; // The FIPS peer listener is bound lazily by server::serve_with_shutdown // on a 30s poll of fips0 — so a post-onboarding fips.install brings it // online without needing an archipelago restart. // Post-OTA verification: if apply_update() wrote a pending-verify // marker right before the restart, probe the frontend now and auto- // rollback if it's broken. This is the guardrail that stops fleet- // wide breakage when an OTA lands a subtly-bad release (v1.7.38/39 // tarball-perms → nginx 500 was the trigger). Runs concurrently // with normal startup — doesn't delay the server coming up. { let data_dir = config.data_dir.clone(); tokio::spawn(async move { update::verify_pending_update(&data_dir).await; }); } // Spawn background update scheduler let update_data_dir = config.data_dir.clone(); tokio::spawn(async move { update::run_update_scheduler(update_data_dir).await; }); // Synchronize host-side doctor artifacts (script + systemd units) with // what's embedded in this binary. Runs in the background so it never // delays server readiness; best-effort, warnings only. tokio::spawn(bootstrap::ensure_doctor_installed()); // Spawn periodic container snapshot (for crash recovery) crash_recovery::spawn_snapshot_task(config.data_dir.clone()); // Spawn disk space monitor (warns at 85%, auto-cleans at 90%) disk_monitor::spawn_disk_monitor(config.data_dir.clone()); // Restore WireGuard peers into wg0 (kernel loses them on every reboot). { let data_dir = config.data_dir.clone(); tokio::spawn(async move { vpn::restore_wg_peers(&data_dir).await; }); } // Spawn ElectrumX status cache (refreshes every 15s, serves cached data to avoid race conditions) electrs_status::spawn_status_cache(); let startup_ms = startup_start.elapsed().as_millis(); info!( "Server listening on http://{} (startup: {}ms)", addr, startup_ms ); info!("RPC API: http://{}/rpc/v1", addr); info!("WebSocket: ws://{}/ws", addr); // Notify systemd that we're ready (Type=notify) // Note: first param `false` keeps NOTIFY_SOCKET so watchdog pings work let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]); // Spawn systemd watchdog ping (WatchdogSec=300, ping every 120s) tokio::spawn(async { let mut interval = tokio::time::interval(std::time::Duration::from_secs(120)); loop { interval.tick().await; let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]); } }); // Graceful shutdown: wait for SIGTERM or SIGINT let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate()) .context("Failed to register SIGTERM handler")?; let shutdown_notify_for_signal = shutdown_notify.clone(); let shutdown = async move { tokio::select! { _ = signal::ctrl_c() => { info!("Received SIGINT (Ctrl+C), initiating graceful shutdown..."); } _ = sigterm.recv() => { info!("Received SIGTERM, initiating graceful shutdown..."); } } // Signal the boot reconciler (and any other subscribers) to stop. // `notify_one` stores a permit if no task is currently parked on // `notified()`, so we don't race the reconciler's reconcile_all pass. shutdown_notify_for_signal.notify_one(); }; server.serve_with_shutdown(addr, shutdown).await?; // Clean shutdown: remove PID marker so next startup doesn't trigger recovery crash_recovery::remove_pid_marker(&config.data_dir).await; info!("Archipelago shut down cleanly"); // Hard-exit after logging. All business state is persisted by now // (connections drained, PID marker removed, disk flushes done via // tokio::fs awaits). Letting tokio try to drop the runtime instead // can stall for 15s+ on non-daemon OS threads we don't directly // own (mdns_sd daemon, reqwest resolver pool, etc.) — long enough // for systemd's TimeoutStopSec to SIGKILL us and mark the service // Failed, which makes an otherwise-successful update look like a // crash in `systemctl status`. std::process::exit(0); }