347 lines
13 KiB
Rust
347 lines
13 KiB
Rust
// Archipelago Bitcoin Node OS - Native Backend
|
|
// Pure Archipelago implementation, no StartOS dependencies
|
|
|
|
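// Crate-level clippy allowances. Most of these are stylistic lints that fire
// on large legacy surfaces and offer no correctness benefit to chase on every
// PR — suppressing them crate-wide keeps CI gating on correctness issues
// without drowning in cleanup noise every time a new toolchain tightens.
// NOTE(review): `unused_io_amount` belongs to Clippy's `correctness` group (it
// flags ignored short reads/writes), so it is not purely stylistic — consider
// scoping that allowance to the call sites that actually need it.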
|
|
#![allow(
|
|
clippy::too_many_arguments,
|
|
clippy::doc_lazy_continuation,
|
|
clippy::type_complexity,
|
|
clippy::enum_variant_names,
|
|
clippy::wildcard_in_or_patterns,
|
|
clippy::assertions_on_constants,
|
|
clippy::drop_non_drop,
|
|
clippy::unused_io_amount,
|
|
clippy::ptr_arg
|
|
)]
|
|
|
|
use anyhow::{Context, Result};
|
|
use std::net::SocketAddr;
|
|
use std::sync::Arc;
|
|
use tokio::signal;
|
|
use tokio::sync::Notify;
|
|
use tracing::info;
|
|
|
|
mod api;
|
|
mod auth;
|
|
mod avatar;
|
|
mod backup;
|
|
mod bitcoin_rpc;
|
|
mod bitcoin_status;
|
|
mod blobs;
|
|
mod bootstrap;
|
|
mod config;
|
|
mod constants;
|
|
mod container;
|
|
mod content_server;
|
|
mod crash_recovery;
|
|
mod credentials;
|
|
mod data_model;
|
|
mod disk_monitor;
|
|
mod electrs_status;
|
|
mod federation;
|
|
mod fips;
|
|
mod health_monitor;
|
|
mod identity;
|
|
mod identity_manager;
|
|
mod marketplace;
|
|
mod mesh;
|
|
mod monitoring;
|
|
mod names;
|
|
mod network;
|
|
mod node_message;
|
|
mod nostr_discovery;
|
|
mod nostr_handshake;
|
|
mod nostr_relays;
|
|
mod peers;
|
|
mod port_allocator;
|
|
mod rate_limit;
|
|
pub mod seed;
|
|
mod server;
|
|
mod session;
|
|
mod settings;
|
|
mod state;
|
|
mod streaming;
|
|
mod totp;
|
|
mod transport;
|
|
mod update;
|
|
mod vpn;
|
|
mod wallet;
|
|
mod webhooks;
|
|
|
|
use config::Config;
|
|
use container::{
|
|
BootReconciler, ContainerOrchestrator, DevContainerOrchestrator, ProdContainerOrchestrator,
|
|
RECONCILER_DEFAULT_INTERVAL,
|
|
};
|
|
use server::Server;
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<()> {
|
|
let startup_start = std::time::Instant::now();
|
|
crash_recovery::init_start_time();
|
|
|
|
// Initialize tracing
|
|
tracing_subscriber::fmt()
|
|
.with_env_filter(
|
|
tracing_subscriber::EnvFilter::try_from_default_env()
|
|
.unwrap_or_else(|_| "archipelago=debug,info".into()),
|
|
)
|
|
.init();
|
|
|
|
info!("Starting Archipelago Bitcoin Node OS");
|
|
|
|
// Self-heal web-ui permissions. The OTA updater in <=v1.7.38 left
|
|
// /opt/archipelago/web-ui as drwx------ (700) after the atomic
|
|
// swap — nginx (www-data) then returned 500/403 on every request
|
|
// until someone shelled in and chmod'd it. Check on every boot
|
|
// and repair if needed so a node auto-recovers after the next
|
|
// service restart that follows a broken OTA.
|
|
tokio::spawn(async move {
|
|
use std::os::unix::fs::PermissionsExt;
|
|
let web_ui = std::path::Path::new("/opt/archipelago/web-ui");
|
|
if let Ok(meta) = tokio::fs::metadata(web_ui).await {
|
|
let mode = meta.permissions().mode() & 0o777;
|
|
if mode & 0o005 != 0o005 {
|
|
tracing::warn!("web-ui perms {:o} not world-readable — self-healing", mode);
|
|
let _ = tokio::process::Command::new("sudo")
|
|
.args([
|
|
"-n",
|
|
"chmod",
|
|
"-R",
|
|
"u=rwX,go=rX",
|
|
"/opt/archipelago/web-ui",
|
|
])
|
|
.status()
|
|
.await;
|
|
}
|
|
}
|
|
});
|
|
|
|
// Load configuration
|
|
let config = Config::load().await?;
|
|
info!("📁 Data directory: {}", config.data_dir.display());
|
|
|
|
// Load user transport preferences so peer-to-peer call sites can
|
|
// consult them from any module without threading a handle through
|
|
// deep async chains. Missing/corrupt file → default (Auto everywhere).
|
|
if let Err(e) = settings::transport::init(&config.data_dir).await {
|
|
tracing::warn!(
|
|
"Failed to initialise transport preferences: {} — using defaults",
|
|
e
|
|
);
|
|
}
|
|
|
|
// Write PID marker early so we can detect crashes on next startup
|
|
crash_recovery::write_pid_marker(&config.data_dir).await?;
|
|
|
|
// Run crash recovery before starting the manifest reconciler. Both paths
|
|
// mutate Podman; running them concurrently can corrupt transient runtime
|
|
// state and leave netavark/conmon unable to start containers.
|
|
match crash_recovery::check_for_crash(&config.data_dir).await {
|
|
Ok(Some(containers)) => {
|
|
info!(
|
|
"🔧 Recovering {} containers from previous crash...",
|
|
containers.len()
|
|
);
|
|
let report = crash_recovery::recover_containers(&containers).await;
|
|
info!(
|
|
"🔧 Recovery complete: {}/{} containers restarted (failed: {:?})",
|
|
report.recovered, report.total, report.failed
|
|
);
|
|
}
|
|
Ok(None) => {}
|
|
Err(e) => {
|
|
tracing::warn!("Crash recovery check failed: {}", e);
|
|
}
|
|
}
|
|
|
|
// Start any stopped containers (handles clean reboot). This remains
|
|
// synchronous for the same reason: no concurrent reconciler during Podman
|
|
// startup/recovery operations.
|
|
let boot_report = crash_recovery::start_stopped_containers(&config.data_dir).await;
|
|
if boot_report.total > 0 {
|
|
info!(
|
|
"🔄 Boot startup: {}/{} containers started (failed: {:?})",
|
|
boot_report.recovered, boot_report.total, boot_report.failed
|
|
);
|
|
}
|
|
crash_recovery::mark_recovery_complete();
|
|
|
|
// Construct the container orchestrator once. In prod mode we load the
|
|
// on-disk app manifests, do an initial adoption pass, and spawn the
|
|
// BootReconciler loop (Step 5/6 of the rust-orchestrator migration).
|
|
// Dev mode uses the in-memory DevContainerOrchestrator and has no
|
|
// reconciler (manifests are pushed via RPC, not discovered from disk).
|
|
let shutdown_notify = Arc::new(Notify::new());
|
|
let (orchestrator, dev_orchestrator): (
|
|
Option<Arc<dyn ContainerOrchestrator>>,
|
|
Option<Arc<DevContainerOrchestrator>>,
|
|
) = if config.dev_mode {
|
|
let dev = Arc::new(DevContainerOrchestrator::new(config.clone()).await?);
|
|
let trait_obj: Arc<dyn ContainerOrchestrator> = dev.clone();
|
|
(Some(trait_obj), Some(dev))
|
|
} else {
|
|
let prod = Arc::new(ProdContainerOrchestrator::new(config.clone()).await?);
|
|
// Best-effort manifest load; a missing /opt/archipelago/apps is
|
|
// logged inside load_manifests and not fatal.
|
|
match prod.load_manifests().await {
|
|
Ok(n) => info!("📦 Loaded {n} app manifest(s) from disk"),
|
|
Err(e) => {
|
|
tracing::error!(error = %e, "prod orchestrator: load_manifests failed at startup");
|
|
}
|
|
}
|
|
// Adoption pass: link existing podman containers back to their
|
|
// manifests so the reconciler doesn't recreate them.
|
|
match prod.adopt_existing().await {
|
|
Ok(report) => {
|
|
info!(
|
|
"🔗 Adopted {} existing container(s): {:?}",
|
|
report.adopted.len(),
|
|
report.adopted
|
|
);
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!(error = %e, "prod orchestrator: adopt_existing failed (non-fatal)");
|
|
}
|
|
}
|
|
// Spawn the boot reconciler loop. Runs an initial reconcile
|
|
// immediately, then re-checks every RECONCILER_DEFAULT_INTERVAL
|
|
// until shutdown_notify fires.
|
|
{
|
|
let reconciler = BootReconciler::new(
|
|
prod.clone(),
|
|
RECONCILER_DEFAULT_INTERVAL,
|
|
shutdown_notify.clone(),
|
|
);
|
|
tokio::spawn(reconciler.run_forever());
|
|
info!(
|
|
"🔄 Boot reconciler started (interval: {:?})",
|
|
RECONCILER_DEFAULT_INTERVAL
|
|
);
|
|
}
|
|
let trait_obj: Arc<dyn ContainerOrchestrator> = prod;
|
|
(Some(trait_obj), None)
|
|
};
|
|
|
|
// Ensure a default user exists so login works after install/onboarding.
|
|
// In production, the default password is "password123" (shown during install).
|
|
// In dev mode, the dev default password is used.
|
|
// Don't auto-create default user — let onboarding flow handle password setup
|
|
// via auth.setup RPC. The Login page detects is_setup=false and shows
|
|
// "Create Password" form instead of login form.
|
|
|
|
// Create server
|
|
let server = Server::new(config.clone(), orchestrator, dev_orchestrator).await?;
|
|
|
|
// Start server
|
|
let addr: SocketAddr = format!("{}:{}", config.bind_host, config.bind_port)
|
|
.parse()
|
|
.context("Invalid bind address")?;
|
|
|
|
// The FIPS peer listener is bound lazily by server::serve_with_shutdown
|
|
// on a 30s poll of fips0 — so a post-onboarding fips.install brings it
|
|
// online without needing an archipelago restart.
|
|
|
|
// Post-OTA verification: if apply_update() wrote a pending-verify
|
|
// marker right before the restart, probe the frontend now and auto-
|
|
// rollback if it's broken. This is the guardrail that stops fleet-
|
|
// wide breakage when an OTA lands a subtly-bad release (v1.7.38/39
|
|
// tarball-perms → nginx 500 was the trigger). Runs concurrently
|
|
// with normal startup — doesn't delay the server coming up.
|
|
{
|
|
let data_dir = config.data_dir.clone();
|
|
tokio::spawn(async move {
|
|
update::verify_pending_update(&data_dir).await;
|
|
});
|
|
}
|
|
|
|
// Spawn background update scheduler
|
|
let update_data_dir = config.data_dir.clone();
|
|
tokio::spawn(async move {
|
|
update::run_update_scheduler(update_data_dir).await;
|
|
});
|
|
|
|
// Synchronize host-side doctor artifacts (script + systemd units) with
|
|
// what's embedded in this binary. Runs in the background so it never
|
|
// delays server readiness; best-effort, warnings only.
|
|
tokio::spawn(bootstrap::ensure_doctor_installed());
|
|
|
|
// Spawn periodic container snapshot (for crash recovery)
|
|
crash_recovery::spawn_snapshot_task(config.data_dir.clone());
|
|
|
|
// Spawn disk space monitor (warns at 85%, auto-cleans at 90%)
|
|
disk_monitor::spawn_disk_monitor(config.data_dir.clone());
|
|
|
|
// Restore WireGuard peers into wg0 (kernel loses them on every reboot).
|
|
{
|
|
let data_dir = config.data_dir.clone();
|
|
tokio::spawn(async move {
|
|
vpn::restore_wg_peers(&data_dir).await;
|
|
});
|
|
}
|
|
|
|
// Spawn ElectrumX status cache (refreshes every 15s, serves cached data to avoid race conditions)
|
|
electrs_status::spawn_status_cache();
|
|
bitcoin_status::spawn_status_cache();
|
|
|
|
let startup_ms = startup_start.elapsed().as_millis();
|
|
info!(
|
|
"Server listening on http://{} (startup: {}ms)",
|
|
addr, startup_ms
|
|
);
|
|
info!("RPC API: http://{}/rpc/v1", addr);
|
|
info!("WebSocket: ws://{}/ws", addr);
|
|
|
|
// Notify systemd that we're ready (Type=notify)
|
|
// Note: first param `false` keeps NOTIFY_SOCKET so watchdog pings work
|
|
let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Ready]);
|
|
|
|
// Spawn systemd watchdog ping (WatchdogSec=300, ping every 120s)
|
|
tokio::spawn(async {
|
|
let mut interval = tokio::time::interval(std::time::Duration::from_secs(120));
|
|
loop {
|
|
interval.tick().await;
|
|
let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]);
|
|
}
|
|
});
|
|
|
|
// Graceful shutdown: wait for SIGTERM or SIGINT
|
|
let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
|
|
.context("Failed to register SIGTERM handler")?;
|
|
let shutdown_notify_for_signal = shutdown_notify.clone();
|
|
let shutdown = async move {
|
|
tokio::select! {
|
|
_ = signal::ctrl_c() => {
|
|
info!("Received SIGINT (Ctrl+C), initiating graceful shutdown...");
|
|
}
|
|
_ = sigterm.recv() => {
|
|
info!("Received SIGTERM, initiating graceful shutdown...");
|
|
}
|
|
}
|
|
// Signal the boot reconciler (and any other subscribers) to stop.
|
|
// `notify_one` stores a permit if no task is currently parked on
|
|
// `notified()`, so we don't race the reconciler's reconcile_all pass.
|
|
shutdown_notify_for_signal.notify_one();
|
|
};
|
|
|
|
server.serve_with_shutdown(addr, shutdown).await?;
|
|
|
|
// Clean shutdown: remove PID marker so next startup doesn't trigger recovery
|
|
crash_recovery::remove_pid_marker(&config.data_dir).await;
|
|
|
|
info!("Archipelago shut down cleanly");
|
|
|
|
// Hard-exit after logging. All business state is persisted by now
|
|
// (connections drained, PID marker removed, disk flushes done via
|
|
// tokio::fs awaits). Letting tokio try to drop the runtime instead
|
|
// can stall for 15s+ on non-daemon OS threads we don't directly
|
|
// own (mdns_sd daemon, reqwest resolver pool, etc.) — long enough
|
|
// for systemd's TimeoutStopSec to SIGKILL us and mark the service
|
|
// Failed, which makes an otherwise-successful update look like a
|
|
// crash in `systemctl status`.
|
|
std::process::exit(0);
|
|
}
|