Dorian 3e121b525f feat: auto-start stopped containers on boot, add failure recovery tests
Added start_stopped_containers() to crash_recovery.rs that starts all
exited/created containers on backend startup, fixing the issue where
containers didn't come back after clean reboot (PID marker removed by
systemd stop). Created test-failure-recovery.sh covering 5 failure
scenarios: container crash, backend restart, Tor restart, full reboot,
and Tor traffic block (UPTIME-02).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-13 03:55:14 +00:00

147 lines
4.4 KiB
Rust

// Archipelago Bitcoin Node OS - Native Backend
// Pure Archipelago implementation, no StartOS dependencies
use anyhow::Result;
use std::net::SocketAddr;
use tracing::info;
use tokio::signal;
mod api;
mod auth;
mod backup;
mod config;
mod content_server;
mod crash_recovery;
mod credentials;
mod disk_monitor;
mod health_monitor;
mod electrs_status;
mod container;
mod port_allocator;
mod data_model;
mod federation;
mod identity;
mod identity_manager;
mod marketplace;
mod mesh;
mod monitoring;
mod node_message;
mod nostr_discovery;
mod nostr_handshake;
mod peers;
mod server;
mod session;
mod state;
mod totp;
mod wallet;
mod names;
mod network;
mod nostr_relays;
mod update;
mod vpn;
mod webhooks;
use auth::AuthManager;
use config::Config;
use server::Server;
/// Default dev password when auto-creating a user (matches mock-backend).
///
/// `main` passes this to `AuthManager::setup_user` only when `config.dev_mode`
/// is set and `is_setup()` reports that no user exists yet; the value is also
/// written to the log at that point.
const DEV_DEFAULT_PASSWORD: &str = "password123";
/// Entry point for the Archipelago backend.
///
/// Startup sequence, in order:
/// 1. Initialise tracing (filter taken from the environment, falling back to
///    `archipelago=debug,info`).
/// 2. Load the configuration.
/// 3. If `crash_recovery::check_for_crash` reports containers, recover them.
/// 4. Start any stopped containers, then write the PID marker.
/// 5. In dev mode, create a default user when none has been set up.
/// 6. Build the server, spawn the background tasks, and serve until SIGINT
///    or SIGTERM is received.
///
/// The PID marker is removed only after `serve_with_shutdown` returns `Ok`;
/// any earlier error return leaves it in place.
///
/// # Errors
///
/// Returns an error if configuration loading, the crash check, writing the
/// PID marker, dev-user setup, server construction, or serving fails, if
/// `bind_host:bind_port` is not a valid socket address, or if the SIGTERM
/// handler cannot be registered.
#[tokio::main]
async fn main() -> Result<()> {
    let startup_start = std::time::Instant::now();

    // Initialize tracing
    tracing_subscriber::fmt()
        .with_env_filter(
            tracing_subscriber::EnvFilter::try_from_default_env()
                .unwrap_or_else(|_| "archipelago=debug,info".into()),
        )
        .init();
    info!("Starting Archipelago Bitcoin Node OS");

    // Load configuration
    let config = Config::load().await?;
    info!("📁 Data directory: {}", config.data_dir.display());

    // Crash recovery: check if previous instance shut down cleanly
    if let Some(containers) = crash_recovery::check_for_crash(&config.data_dir).await? {
        info!("🔧 Recovering {} containers from previous crash...", containers.len());
        let report = crash_recovery::recover_containers(&containers).await;
        info!(
            "🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
            report.recovered, report.total, report.failed
        );
    }

    // Start any stopped containers (handles clean reboot where PID was removed)
    let boot_report = crash_recovery::start_stopped_containers().await;
    if boot_report.total > 0 {
        info!(
            "🔄 Boot startup: {}/{} containers started (failed: {:?})",
            boot_report.recovered, boot_report.total, boot_report.failed
        );
    }

    // Write PID marker so we can detect crashes on next startup
    crash_recovery::write_pid_marker(&config.data_dir).await?;

    // In dev mode, ensure a default user exists so login works without manual setup
    if config.dev_mode {
        let auth = AuthManager::new(config.data_dir.clone());
        if !auth.is_setup().await? {
            auth.setup_user(DEV_DEFAULT_PASSWORD).await?;
            info!("👤 Created default dev user (password: {})", DEV_DEFAULT_PASSWORD);
        }
    }

    // Create server
    let server = Server::new(config.clone()).await?;

    // Resolve the listen address. The host and port come from configuration,
    // so a malformed value is reported as an error instead of panicking.
    let bind_target = format!("{}:{}", config.bind_host, config.bind_port);
    let addr = bind_target.parse::<SocketAddr>().map_err(|err| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!("Invalid bind address '{}': {}", bind_target, err),
        )
    })?;

    // Spawn background update scheduler
    let update_data_dir = config.data_dir.clone();
    tokio::spawn(async move {
        update::run_update_scheduler(update_data_dir).await;
    });

    // Spawn periodic container snapshot (for crash recovery)
    crash_recovery::spawn_snapshot_task(config.data_dir.clone());

    // Spawn disk space monitor (warns at 85%, auto-cleans at 90%)
    disk_monitor::spawn_disk_monitor(config.data_dir.clone());

    let startup_ms = startup_start.elapsed().as_millis();
    info!("Server listening on http://{} (startup: {}ms)", addr, startup_ms);
    info!("RPC API: http://{}/rpc/v1", addr);
    info!("WebSocket: ws://{}/ws", addr);

    // Register the SIGTERM handler before serving so that a registration
    // failure is returned from `main` rather than panicking inside the
    // shutdown future once the server is already running.
    let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate()).map_err(
        |err| {
            std::io::Error::new(
                err.kind(),
                format!("Failed to register SIGTERM handler: {}", err),
            )
        },
    )?;

    // Graceful shutdown: wait for SIGTERM or SIGINT.
    // `async move` so the future owns the signal handler it polls.
    let shutdown = async move {
        tokio::select! {
            _ = signal::ctrl_c() => {
                info!("Received SIGINT (Ctrl+C), initiating graceful shutdown...");
            }
            _ = sigterm.recv() => {
                info!("Received SIGTERM, initiating graceful shutdown...");
            }
        }
    };
    server.serve_with_shutdown(addr, shutdown).await?;

    // Clean shutdown: remove PID marker so next startup doesn't trigger recovery
    crash_recovery::remove_pid_marker(&config.data_dir).await;
    info!("Archipelago shut down cleanly");
    Ok(())
}