diff --git a/core/archipelago/src/api/handler/node_message.rs b/core/archipelago/src/api/handler/node_message.rs index a1879ef1..d0d794d3 100644 --- a/core/archipelago/src/api/handler/node_message.rs +++ b/core/archipelago/src/api/handler/node_message.rs @@ -128,6 +128,22 @@ impl ApiHandler { hyper::Body::from(r#"{"ok":true,"handled":"connection_accepted"}"#), )); } + + if let Some(handled) = + crate::api::rpc::bitcoin_relay::record_incoming_relay_message( + std::path::Path::new("/var/lib/archipelago"), + from, + incoming.from_name.as_deref(), + &val, + ) + .await? + { + return Ok(build_response( + StatusCode::OK, + "application/json", + hyper::Body::from(format!(r#"{{"ok":true,"handled":"{}"}}"#, handled)), + )); + } } let safe_from = sanitize_log_string(from); diff --git a/core/archipelago/src/api/rpc/analytics.rs b/core/archipelago/src/api/rpc/analytics.rs index 8f261e86..817e9f41 100644 --- a/core/archipelago/src/api/rpc/analytics.rs +++ b/core/archipelago/src/api/rpc/analytics.rs @@ -189,6 +189,27 @@ impl RpcHandler { .map(|f| f as u64) .unwrap_or(0); + let latest = self.metrics_store.latest().await; + let (cpu_pct, mem_pct, disk_pct): (f64, f64, f64) = latest + .map(|s| { + let mem_total = s.system.mem_total_bytes as f64; + let disk_total = s.system.disk_total_bytes as f64; + ( + s.system.cpu_percent, + if mem_total > 0.0 { + (s.system.mem_used_bytes as f64 / mem_total) * 100.0 + } else { + 0.0 + }, + if disk_total > 0.0 { + (s.system.disk_used_bytes as f64 / disk_total) * 100.0 + } else { + 0.0 + }, + ) + }) + .unwrap_or((0.0, 0.0, 0.0)); + // Recent alerts from metrics store let recent_alerts: Vec = self .metrics_store @@ -210,6 +231,9 @@ impl RpcHandler { "uptime_secs": uptime_secs, "cpu_cores": cpu_cores, "ram_mb": total_ram_mb, + "cpu_pct": (cpu_pct * 10.0).round() / 10.0, + "mem_pct": (mem_pct * 10.0).round() / 10.0, + "disk_pct": (disk_pct * 10.0).round() / 10.0, "containers": containers, "container_count": data.package_data.len(), "running_count": data.package_data.values() diff --git a/core/archipelago/src/api/rpc/auth.rs b/core/archipelago/src/api/rpc/auth.rs index 5c433c95..d15685f4 100644 --- a/core/archipelago/src/api/rpc/auth.rs +++ b/core/archipelago/src/api/rpc/auth.rs @@ -79,7 +79,8 @@ impl RpcHandler { .and_then(|v| v.as_bool()) .unwrap_or(true); - self.auth_manager + let outcome = self + .auth_manager .change_password(current_password, new_password, also_change_ssh) .await?; @@ -88,7 +89,12 @@ impl RpcHandler { self.session_store.invalidate_all_except(token).await; } - Ok(serde_json::json!({ "success": true, "session_rotated": true })) + Ok(serde_json::json!({ + "success": true, + "session_rotated": true, + "ssh_updated": outcome.ssh_updated, + "ssh_error": outcome.ssh_error, + })) } pub(super) async fn handle_auth_is_setup(&self) -> Result { diff --git a/core/archipelago/src/api/rpc/bitcoin_relay.rs b/core/archipelago/src/api/rpc/bitcoin_relay.rs new file mode 100644 index 00000000..429489e7 --- /dev/null +++ b/core/archipelago/src/api/rpc/bitcoin_relay.rs @@ -0,0 +1,900 @@ +use super::RpcHandler; +use crate::container::docker_packages; +use crate::data_model::{Notification, NotificationLevel}; +use crate::{bitcoin_status, identity, peers}; +use anyhow::{Context, Result}; +use base64::{engine::general_purpose::STANDARD as BASE64, Engine as _}; +use hmac::{Hmac, Mac}; +use rand::RngCore; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use sha2::Sha256; +use std::path::{Path, PathBuf}; +use tokio::fs; + +const RELAY_DIR: &str = "bitcoin-relay"; +const RELAY_STATE_FILE: &str = "state.json"; +const TXRELAY_USER: &str = "txrelay"; +const TXRELAY_PASSWORD_FILE: &str = "bitcoin-rpc-txrelay-password"; +const TXRELAY_RPCAUTH_FILE: &str = "bitcoin-rpc-txrelay-rpcauth"; +const TXRELAY_CLIENT_ENV_FILE: &str = "bitcoin-rpc-txrelay-client.env"; + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default)] +struct BitcoinRelayState { + settings: BitcoinRelaySettings, + requests: Vec, + updated_at: Option, +} + +impl Default for BitcoinRelayState { + fn default() -> Self { + Self { + settings: BitcoinRelaySettings::default(), + requests: Vec::new(), + updated_at: None, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default)] +struct BitcoinRelaySettings { + enabled_for_peers: bool, + allow_peer_requests: bool, + allow_http: bool, + allow_https: bool, + allow_tor: bool, + selected_peer_pubkey: Option, + http_endpoint: Option, + https_endpoint: Option, + tor_endpoint: Option, +} + +impl Default for BitcoinRelaySettings { + fn default() -> Self { + Self { + enabled_for_peers: false, + allow_peer_requests: false, + allow_http: false, + allow_https: true, + allow_tor: false, + selected_peer_pubkey: None, + http_endpoint: None, + https_endpoint: None, + tor_endpoint: None, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct BitcoinRelayRequest { + id: String, + direction: RelayRequestDirection, + status: RelayRequestStatus, + peer_pubkey: String, + peer_onion: String, + peer_name: Option, + message: Option, + approved_endpoint: Option, + credential_secret_path: Option, + created_at: String, + updated_at: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +enum RelayRequestDirection { + Incoming, + Outbound, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +enum RelayRequestStatus { + Pending, + Approved, + Rejected, +} + +#[derive(Debug, Serialize)] +struct TrustedRelayPeer { + pubkey: String, + onion: String, + name: Option, + relay_approved: bool, +} + +#[derive(Debug, Clone)] +struct TxRelayCredentials { + username: String, + password: String, +} + +impl RpcHandler { + pub(super) async fn handle_bitcoin_relay_status(&self) -> Result { + let mut state = load_relay_state(&self.config.data_dir).await?; + hydrate_tor_endpoint(&self.config.data_dir, &mut state).await; + let known_peers = peers::load_peers(&self.config.data_dir) + .await + .unwrap_or_default(); + let trusted_nodes = trusted_relay_peers(&known_peers, &state); + let local_node = local_sync_status().await; + let credential_status = txrelay_credential_status(&self.config.data_dir).await; + + Ok(json!({ + "settings": state.settings, + "trusted_nodes": trusted_nodes, + "requests": state.requests, + "local_node": local_node, + "credentials": credential_status, + })) + } + + pub(super) async fn handle_bitcoin_relay_update_settings( + &self, + params: Option, + ) -> Result { + let params = params.unwrap_or_default(); + let mut state = load_relay_state(&self.config.data_dir).await?; + let known_peers = peers::load_peers(&self.config.data_dir) + .await + .unwrap_or_default(); + + update_bool( + ¶ms, + "enabled_for_peers", + &mut state.settings.enabled_for_peers, + ); + update_bool( + ¶ms, + "allow_peer_requests", + &mut state.settings.allow_peer_requests, + ); + update_bool(¶ms, "allow_http", &mut state.settings.allow_http); + update_bool(¶ms, "allow_https", &mut state.settings.allow_https); + update_bool(¶ms, "allow_tor", &mut state.settings.allow_tor); + + update_endpoint(¶ms, "http_endpoint", &mut state.settings.http_endpoint)?; + update_endpoint( + ¶ms, + "https_endpoint", + &mut state.settings.https_endpoint, + )?; + update_endpoint(¶ms, "tor_endpoint", &mut state.settings.tor_endpoint)?; + + if state.settings.enabled_for_peers { + ensure_txrelay_credentials(&self.config.data_dir).await?; + } + + if params.get("selected_peer_pubkey").is_some() { + let selected = params + .get("selected_peer_pubkey") + .and_then(|v| v.as_str()) + .map(str::trim) + .filter(|s| !s.is_empty()); + if let Some(pubkey) = selected { + if !known_peers.iter().any(|p| p.pubkey == pubkey) { + anyhow::bail!("Selected relay peer is not in trusted nodes"); + } + state.settings.selected_peer_pubkey = Some(pubkey.to_string()); + } else { + state.settings.selected_peer_pubkey = None; + } + } + + state.updated_at = Some(now()); + save_relay_state(&self.config.data_dir, &state).await?; + self.notify( + "Bitcoin relay settings updated", + "Transaction relay sharing preferences were saved.", + ) + .await; + self.handle_bitcoin_relay_status().await + } + + pub(super) async fn handle_bitcoin_relay_request_peer( + &self, + params: Option, + ) -> Result { + let params = params.unwrap_or_default(); + let peer_pubkey = params + .get("peer_pubkey") + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing required parameter: peer_pubkey"))?; + let message = params + .get("message") + .and_then(|v| v.as_str()) + .map(sanitize_optional_text) + .transpose()?; + let peer = peers::load_peers(&self.config.data_dir) + .await + .unwrap_or_default() + .into_iter() + .find(|p| p.pubkey == peer_pubkey) + .ok_or_else(|| anyhow::anyhow!("Peer is not in trusted nodes"))?; + + let mut state = load_relay_state(&self.config.data_dir).await?; + let existing = state.requests.iter_mut().find(|r| { + r.direction == RelayRequestDirection::Outbound + && r.peer_pubkey == peer.pubkey + && r.status == RelayRequestStatus::Pending + }); + let request_id = if let Some(req) = existing { + req.message = message.clone(); + req.updated_at = now(); + req.id.clone() + } else { + let timestamp = now(); + let req = BitcoinRelayRequest { + id: uuid::Uuid::new_v4().to_string(), + direction: RelayRequestDirection::Outbound, + status: RelayRequestStatus::Pending, + peer_pubkey: peer.pubkey.clone(), + peer_onion: peer.onion.clone(), + peer_name: peer.name.clone(), + message: message.clone(), + approved_endpoint: None, + credential_secret_path: None, + created_at: timestamp.clone(), + updated_at: timestamp, + }; + let id = req.id.clone(); + state.requests.push(req); + id + }; + state.updated_at = Some(now()); + save_relay_state(&self.config.data_dir, &state).await?; + + if let Err(e) = self + .send_relay_peer_message( + &peer, + json!({ + "type": "bitcoin_relay_request", + "request_id": request_id, + "message": message, + }), + ) + .await + { + tracing::warn!(peer = %peer.onion, error = %e, "Failed to send Bitcoin relay request"); + } + + self.notify( + "Bitcoin relay request sent", + "A trusted peer was asked to approve transaction relay access.", + ) + .await; + Ok(json!({ "ok": true, "request_id": request_id })) + } + + pub(super) async fn handle_bitcoin_relay_approve_request( + &self, + params: Option, + ) -> Result { + self.update_relay_request_status(params, RelayRequestStatus::Approved) + .await + } + + pub(super) async fn handle_bitcoin_relay_reject_request( + &self, + params: Option, + ) -> Result { + self.update_relay_request_status(params, RelayRequestStatus::Rejected) + .await + } + + pub(super) async fn handle_bitcoin_relay_create_tor_service( + &self, + ) -> Result { + let params = json!({ + "name": "bitcoin-rpc", + "local_port": 80, + "remote_port": 80, + }); + let created = match self.handle_tor_create_service(Some(params)).await { + Ok(v) => v, + Err(e) if e.to_string().contains("already exists") => { + self.handle_tor_get_onion_address(Some(json!({ "name": "bitcoin-rpc" }))) + .await? + } + Err(e) => return Err(e), + }; + + let onion = created + .get("onion_address") + .and_then(|v| v.as_str()) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()); + if let Some(onion) = onion { + let mut state = load_relay_state(&self.config.data_dir).await?; + state.settings.allow_tor = true; + state.settings.tor_endpoint = Some(format!("http://{onion}/")); + state.updated_at = Some(now()); + save_relay_state(&self.config.data_dir, &state).await?; + } + + self.notify( + "Bitcoin relay Tor service enabled", + "A Tor endpoint was created for Bitcoin transaction relay access.", + ) + .await; + Ok(created) + } + + async fn update_relay_request_status( + &self, + params: Option, + status: RelayRequestStatus, + ) -> Result { + let params = params.unwrap_or_default(); + let request_id = params + .get("id") + .or_else(|| params.get("request_id")) + .and_then(|v| v.as_str()) + .ok_or_else(|| anyhow::anyhow!("Missing required parameter: id"))?; + let mut state = load_relay_state(&self.config.data_dir).await?; + let serving_endpoint = if status == RelayRequestStatus::Approved { + preferred_endpoint(&state.settings) + } else { + None + }; + let request_direction = state + .requests + .iter() + .find(|r| r.id == request_id) + .ok_or_else(|| anyhow::anyhow!("Request not found: {}", request_id))? + .direction; + if status == RelayRequestStatus::Approved + && request_direction == RelayRequestDirection::Incoming + && serving_endpoint.is_none() + { + anyhow::bail!( + "Configure an HTTP, HTTPS, or Tor relay endpoint before approving access" + ); + } + let credentials = if status == RelayRequestStatus::Approved { + Some(ensure_txrelay_credentials(&self.config.data_dir).await?) + } else { + None + }; + let (peer_pubkey, peer_onion, peer_name, direction) = { + let req = state + .requests + .iter_mut() + .find(|r| r.id == request_id) + .ok_or_else(|| anyhow::anyhow!("Request not found: {}", request_id))?; + req.status = status; + req.updated_at = now(); + if let Some(endpoint) = &serving_endpoint { + req.approved_endpoint = Some(endpoint.clone()); + } + ( + req.peer_pubkey.clone(), + req.peer_onion.clone(), + req.peer_name.clone(), + req.direction, + ) + }; + let peer = peers::load_peers(&self.config.data_dir) + .await + .unwrap_or_default() + .into_iter() + .find(|p| p.pubkey == peer_pubkey); + let peer_name = peer_name.unwrap_or_else(|| peer_onion.clone()); + state.updated_at = Some(now()); + save_relay_state(&self.config.data_dir, &state).await?; + + if let Some(peer) = peer { + let message_type = match status { + RelayRequestStatus::Approved => "bitcoin_relay_approved", + RelayRequestStatus::Rejected => "bitcoin_relay_rejected", + RelayRequestStatus::Pending => "bitcoin_relay_pending", + }; + if let Err(e) = self + .send_relay_peer_message( + &peer, + relay_response_payload( + message_type, + request_id, + direction, + serving_endpoint.as_deref(), + credentials.as_ref(), + ), + ) + .await + { + tracing::warn!(peer = %peer.onion, error = %e, "Failed to send Bitcoin relay response"); + } + } + + let title = match status { + RelayRequestStatus::Approved => "Bitcoin relay request approved", + RelayRequestStatus::Rejected => "Bitcoin relay request rejected", + RelayRequestStatus::Pending => "Bitcoin relay request updated", + }; + self.notify( + title, + &format!("Relay access request for {peer_name} was updated."), + ) + .await; + Ok(json!({ "ok": true, "request_id": request_id })) + } + + async fn send_relay_peer_message( + &self, + peer: &peers::KnownPeer, + mut payload: serde_json::Value, + ) -> Result<()> { + let (data, _) = self.state_manager.get_snapshot().await; + let my_pubkey = data.server_info.pubkey.clone(); + let my_did = identity::did_key_from_pubkey_hex(&my_pubkey).ok(); + let my_onion = docker_packages::read_tor_address("archipelago") + .await + .unwrap_or_default(); + payload["from_did"] = my_did.map(serde_json::Value::String).unwrap_or_default(); + payload["from_pubkey"] = serde_json::Value::String(my_pubkey.clone()); + payload["from_onion"] = serde_json::Value::String(my_onion); + payload["from_name"] = data + .server_info + .name + .clone() + .map(serde_json::Value::String) + .unwrap_or_default(); + + let to_fips_npub = + crate::federation::fips_npub_for_onion(&self.config.data_dir, &peer.onion).await; + let identity_dir = self.config.data_dir.join("identity"); + let signing_key = crate::identity::NodeIdentity::load_or_create(&identity_dir) + .await + .ok(); + crate::node_message::send_to_peer( + &peer.onion, + to_fips_npub.as_deref(), + &my_pubkey, + &payload.to_string(), + signing_key.as_ref().map(|i| i.signing_key()), + Some(&peer.pubkey), + data.server_info.name.as_deref(), + ) + .await + } + + async fn notify(&self, title: &str, message: &str) { + let (mut data, _) = self.state_manager.get_snapshot().await; + data.notifications.push(Notification { + id: format!("bitcoin-relay-{}", uuid::Uuid::new_v4()), + level: NotificationLevel::Info, + title: title.to_string(), + message: message.to_string(), + timestamp: now(), + app_id: Some("bitcoin-knots".to_string()), + }); + let len = data.notifications.len(); + if len > 30 { + data.notifications.drain(0..len - 30); + } + self.state_manager.update_data(data).await; + } +} + +pub(crate) async fn record_incoming_relay_message( + data_dir: &Path, + from_pubkey: &str, + from_name: Option<&str>, + payload: &serde_json::Value, +) -> Result> { + let msg_type = payload.get("type").and_then(|v| v.as_str()).unwrap_or(""); + match msg_type { + "bitcoin_relay_request" => { + let from_onion = payload + .get("from_onion") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let message = payload + .get("message") + .and_then(|v| v.as_str()) + .map(sanitize_optional_text) + .transpose()?; + let remote_request_id = payload + .get("request_id") + .and_then(|v| v.as_str()) + .unwrap_or_default(); + let mut state = load_relay_state(data_dir).await?; + if !state.settings.allow_peer_requests { + return Ok(Some("bitcoin_relay_request_disabled")); + } + if !state.requests.iter().any(|r| { + r.direction == RelayRequestDirection::Incoming + && r.peer_pubkey == from_pubkey + && r.status == RelayRequestStatus::Pending + }) { + let timestamp = now(); + state.requests.push(BitcoinRelayRequest { + id: if remote_request_id.is_empty() { + uuid::Uuid::new_v4().to_string() + } else { + remote_request_id.to_string() + }, + direction: RelayRequestDirection::Incoming, + status: RelayRequestStatus::Pending, + peer_pubkey: from_pubkey.to_string(), + peer_onion: from_onion, + peer_name: from_name.map(String::from), + message, + approved_endpoint: None, + credential_secret_path: None, + created_at: timestamp.clone(), + updated_at: timestamp, + }); + state.updated_at = Some(now()); + save_relay_state(data_dir, &state).await?; + } + Ok(Some("bitcoin_relay_request")) + } + "bitcoin_relay_approved" | "bitcoin_relay_rejected" => { + let request_id = payload.get("request_id").and_then(|v| v.as_str()); + let mut state = load_relay_state(data_dir).await?; + let status = if msg_type == "bitcoin_relay_approved" { + RelayRequestStatus::Approved + } else { + RelayRequestStatus::Rejected + }; + let approved_access = if status == RelayRequestStatus::Approved { + save_peer_relay_access(data_dir, from_pubkey, payload).await? + } else { + None + }; + if let Some(req) = state.requests.iter_mut().find(|r| { + r.direction == RelayRequestDirection::Outbound + && r.peer_pubkey == from_pubkey + && request_id.map(|id| id == r.id).unwrap_or(true) + }) { + req.status = status; + req.updated_at = now(); + if let Some((endpoint, secret_path)) = approved_access { + req.approved_endpoint = Some(endpoint); + req.credential_secret_path = Some(secret_path); + } + state.updated_at = Some(now()); + save_relay_state(data_dir, &state).await?; + } + Ok(Some(if msg_type == "bitcoin_relay_approved" { + "bitcoin_relay_approved" + } else { + "bitcoin_relay_rejected" + })) + } + _ => Ok(None), + } +} + +fn trusted_relay_peers( + known_peers: &[peers::KnownPeer], + state: &BitcoinRelayState, +) -> Vec { + known_peers + .iter() + .map(|peer| TrustedRelayPeer { + pubkey: peer.pubkey.clone(), + onion: peer.onion.clone(), + name: peer.name.clone(), + relay_approved: state.requests.iter().any(|req| { + req.peer_pubkey == peer.pubkey && req.status == RelayRequestStatus::Approved + }), + }) + .collect() +} + +async fn txrelay_credential_status(data_dir: &Path) -> serde_json::Value { + let (password_path, rpcauth_path, client_env_path) = txrelay_secret_paths(data_dir); + let password_available = fs::metadata(&password_path).await.is_ok(); + let rpcauth_available = fs::metadata(&rpcauth_path).await.is_ok(); + let client_env_available = fs::metadata(&client_env_path).await.is_ok(); + json!({ + "username": TXRELAY_USER, + "available": password_available && rpcauth_available && client_env_available, + "password_available": password_available, + "rpcauth_available": rpcauth_available, + "client_env_available": client_env_available, + "client_env_path": client_env_path.display().to_string(), + "restart_hint": "If this was just generated, restart Bitcoin Core/Knots so bitcoind loads the txrelay rpcauth whitelist.", + }) +} + +async fn ensure_txrelay_credentials(data_dir: &Path) -> Result { + let (password_path, rpcauth_path, client_env_path) = txrelay_secret_paths(data_dir); + let password = match read_trimmed(&password_path).await { + Some(value) => value, + None => { + let generated = generate_random_password(); + write_secret_file(&password_path, &generated).await?; + generated + } + }; + let rpcauth = match read_trimmed(&rpcauth_path).await { + Some(value) => value, + None => { + let generated = generate_rpcauth(TXRELAY_USER, &password); + write_secret_file(&rpcauth_path, &generated).await?; + generated + } + }; + let client_env = format!( + "BITCOIN_RPC_TXRELAY_USER={}\nBITCOIN_RPC_TXRELAY_PASSWORD={}\nBITCOIN_RPC_TXRELAY_RPCAUTH={}\n", + TXRELAY_USER, password, rpcauth + ); + write_secret_file(&client_env_path, &client_env).await?; + + Ok(TxRelayCredentials { + username: TXRELAY_USER.to_string(), + password, + }) +} + +fn txrelay_secret_paths(data_dir: &Path) -> (PathBuf, PathBuf, PathBuf) { + let secrets_dir = data_dir.join("secrets"); + ( + secrets_dir.join(TXRELAY_PASSWORD_FILE), + secrets_dir.join(TXRELAY_RPCAUTH_FILE), + secrets_dir.join(TXRELAY_CLIENT_ENV_FILE), + ) +} + +async fn read_trimmed(path: &Path) -> Option { + fs::read_to_string(path) + .await + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) +} + +async fn write_secret_file(path: &Path, contents: &str) -> Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).await?; + } + fs::write(path, contents).await?; + set_private_permissions(path).await; + Ok(()) +} + +async fn set_private_permissions(path: &Path) { + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = fs::set_permissions(path, std::fs::Permissions::from_mode(0o600)).await; + } +} + +fn generate_random_password() -> String { + let mut bytes = [0u8; 32]; + rand::rngs::OsRng.fill_bytes(&mut bytes); + BASE64.encode(bytes) +} + +fn generate_rpcauth(username: &str, password: &str) -> String { + let mut salt_bytes = [0u8; 16]; + rand::rngs::OsRng.fill_bytes(&mut salt_bytes); + let salt_hex = hex::encode(salt_bytes); + let mut mac = + Hmac::::new_from_slice(salt_hex.as_bytes()).expect("HMAC accepts any key length"); + mac.update(password.as_bytes()); + let hash_hex = hex::encode(mac.finalize().into_bytes()); + format!("{username}:{salt_hex}${hash_hex}") +} + +fn preferred_endpoint(settings: &BitcoinRelaySettings) -> Option { + if settings.allow_https { + if let Some(endpoint) = settings.https_endpoint.clone() { + return Some(endpoint); + } + } + if settings.allow_tor { + if let Some(endpoint) = settings.tor_endpoint.clone() { + return Some(endpoint); + } + } + if settings.allow_http { + if let Some(endpoint) = settings.http_endpoint.clone() { + return Some(endpoint); + } + } + settings + .https_endpoint + .clone() + .or_else(|| settings.tor_endpoint.clone()) + .or_else(|| settings.http_endpoint.clone()) +} + +fn relay_response_payload( + message_type: &str, + request_id: &str, + request_direction: RelayRequestDirection, + endpoint: Option<&str>, + credentials: Option<&TxRelayCredentials>, +) -> serde_json::Value { + let mut payload = json!({ + "type": message_type, + "request_id": request_id, + }); + if message_type == "bitcoin_relay_approved" + && request_direction == RelayRequestDirection::Incoming + { + if let (Some(endpoint), Some(credentials)) = (endpoint, credentials) { + payload["relay_access"] = json!({ + "endpoint": endpoint, + "username": &credentials.username, + "password": &credentials.password, + }); + } + } + payload +} + +async fn save_peer_relay_access( + data_dir: &Path, + from_pubkey: &str, + payload: &serde_json::Value, +) -> Result> { + let Some(access) = payload.get("relay_access") else { + return Ok(None); + }; + let endpoint = access + .get("endpoint") + .and_then(|v| v.as_str()) + .map(validate_endpoint) + .transpose()?; + let username = access.get("username").and_then(|v| v.as_str()); + let password = access.get("password").and_then(|v| v.as_str()); + let (Some(endpoint), Some(username), Some(password)) = (endpoint, username, password) else { + return Ok(None); + }; + validate_env_value(username)?; + validate_env_value(password)?; + + let secret_path = data_dir.join("secrets").join(format!( + "bitcoin-relay-peer-{}.env", + safe_pubkey_fragment(from_pubkey) + )); + let contents = format!( + "BITCOIN_RELAY_PEER_PUBKEY={}\nBITCOIN_RELAY_ENDPOINT={}\nBITCOIN_RELAY_USERNAME={}\nBITCOIN_RELAY_PASSWORD={}\n", + from_pubkey, endpoint, username, password + ); + write_secret_file(&secret_path, &contents).await?; + Ok(Some((endpoint, secret_path.display().to_string()))) +} + +fn validate_env_value(value: &str) -> Result<()> { + if value.is_empty() || value.len() > 1024 || value.contains('\n') || value.contains('\r') { + anyhow::bail!("Invalid relay credential value"); + } + Ok(()) +} + +fn safe_pubkey_fragment(pubkey: &str) -> String { + let fragment = pubkey + .chars() + .filter(|c| c.is_ascii_hexdigit()) + .take(24) + .collect::(); + if fragment.is_empty() { + "unknown".to_string() + } else { + fragment + } +} + +async fn hydrate_tor_endpoint(data_dir: &Path, state: &mut BitcoinRelayState) { + if state.settings.tor_endpoint.is_some() { + return; + } + if let Some(onion) = docker_packages::read_tor_address("bitcoin-rpc").await { + let onion = onion.trim().trim_end_matches('/').to_string(); + if !onion.is_empty() { + state.settings.tor_endpoint = Some(format!("http://{onion}/")); + let _ = save_relay_state(data_dir, state).await; + } + } +} + +async fn local_sync_status() -> serde_json::Value { + let status = bitcoin_status::get_bitcoin_status().await; + let blockchain = status.blockchain_info.as_ref(); + let blocks = blockchain + .and_then(|v| v.get("blocks")) + .and_then(|v| v.as_u64()) + .unwrap_or(0); + let headers = blockchain + .and_then(|v| v.get("headers")) + .and_then(|v| v.as_u64()) + .unwrap_or(0); + let initial_block_download = blockchain + .and_then(|v| v.get("initialblockdownload")) + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let synced = + status.ok && headers > 0 && blocks >= headers.saturating_sub(1) && !initial_block_download; + + json!({ + "synced": synced, + "blocks": blocks, + "headers": headers, + "chain": blockchain + .and_then(|v| v.get("chain")) + .and_then(|v| v.as_str()) + .unwrap_or("unknown"), + "status_ok": status.ok, + "status_stale": status.stale, + "error": status.error, + }) +} + +async fn load_relay_state(data_dir: &Path) -> Result { + let path = state_path(data_dir); + if !path.exists() { + return Ok(BitcoinRelayState::default()); + } + let content = fs::read_to_string(&path) + .await + .with_context(|| format!("Failed to read {}", path.display()))?; + Ok(serde_json::from_str(&content).unwrap_or_default()) +} + +async fn save_relay_state(data_dir: &Path, state: &BitcoinRelayState) -> Result<()> { + let dir = data_dir.join(RELAY_DIR); + fs::create_dir_all(&dir).await?; + let content = serde_json::to_string_pretty(state)?; + fs::write(dir.join(RELAY_STATE_FILE), content).await?; + Ok(()) +} + +fn state_path(data_dir: &Path) -> PathBuf { + data_dir.join(RELAY_DIR).join(RELAY_STATE_FILE) +} + +fn update_bool(params: &serde_json::Value, key: &str, target: &mut bool) { + if let Some(value) = params.get(key).and_then(|v| v.as_bool()) { + *target = value; + } +} + +fn update_endpoint( + params: &serde_json::Value, + key: &str, + target: &mut Option, +) -> Result<()> { + if !params.get(key).is_some() { + return Ok(()); + } + let endpoint = params + .get(key) + .and_then(|v| v.as_str()) + .map(str::trim) + .filter(|s| !s.is_empty()); + *target = endpoint.map(validate_endpoint).transpose()?; + Ok(()) +} + +fn validate_endpoint(endpoint: &str) -> Result { + if endpoint.len() > 512 || endpoint.contains('\n') || endpoint.contains('\r') { + anyhow::bail!("Invalid endpoint"); + } + let lower = endpoint.to_ascii_lowercase(); + if !(lower.starts_with("http://") || lower.starts_with("https://")) { + anyhow::bail!("Endpoint must start with http:// or https://"); + } + Ok(endpoint.to_string()) +} + +fn sanitize_optional_text(value: &str) -> Result { + let value = value.trim(); + if value.len() > 500 || value.contains('\0') { + anyhow::bail!("Invalid message"); + } + Ok(value.to_string()) +} + +fn now() -> String { + chrono::Utc::now().to_rfc3339() +} diff --git a/core/archipelago/src/api/rpc/container.rs b/core/archipelago/src/api/rpc/container.rs index 7377bca2..9446bc00 100644 --- a/core/archipelago/src/api/rpc/container.rs +++ b/core/archipelago/src/api/rpc/container.rs @@ -4,8 +4,9 @@ use super::RpcHandler; use anyhow::{Context, Result}; use std::time::Duration; -const PODMAN_INSPECT_TIMEOUT: Duration = Duration::from_secs(10); -const PODMAN_PS_TIMEOUT: Duration = Duration::from_secs(10); +const PODMAN_INSPECT_TIMEOUT: Duration = Duration::from_secs(5); +const PODMAN_PS_TIMEOUT: Duration = Duration::from_secs(5); +const ORCHESTRATOR_HEALTH_TIMEOUT: Duration = Duration::from_secs(5); impl RpcHandler { pub(super) async fn handle_container_install( @@ -171,46 +172,69 @@ impl RpcHandler { // between "installed" and "not-installed" in the UI. let (data, _) = self.state_manager.get_snapshot().await; if data.server_info.status_info.containers_scanned && !data.package_data.is_empty() { - let containers: Vec = data - .package_data - .iter() - .map(|(id, pkg)| { - // Keep this mapping in sync with the UI's - // ContainerStatus.state union in - // neode-ui/src/api/container-client.ts. The UI maps - // transitional variants to single-button labels - // (Stopping… / Starting… / Restarting…). - let state = match &pkg.state { - crate::data_model::PackageState::Running => "running", - crate::data_model::PackageState::Stopped => "stopped", - crate::data_model::PackageState::Exited => "exited", - crate::data_model::PackageState::Starting => "starting", - crate::data_model::PackageState::Stopping => "stopping", - crate::data_model::PackageState::Restarting => "restarting", - crate::data_model::PackageState::Installing => "installing", - crate::data_model::PackageState::Installed => "installed", - crate::data_model::PackageState::Updating => "updating", - crate::data_model::PackageState::Removing => "removing", - crate::data_model::PackageState::CreatingBackup => "creating-backup", - crate::data_model::PackageState::RestoringBackup => "restoring-backup", - crate::data_model::PackageState::BackingUp => "backing-up", - }; - let lan = pkg - .installed - .as_ref() - .and_then(|i| i.interface_addresses.get("main")) - .and_then(|a| a.lan_address.as_deref()); - serde_json::json!({ - "id": id, - "name": id, - "state": state, - "image": "", - "created": "", - "ports": [], - "lan_address": lan, - }) - }) - .collect(); + let mut containers = Vec::with_capacity(data.package_data.len()); + for (id, pkg) in &data.package_data { + // Keep this mapping in sync with the UI's + // ContainerStatus.state union in + // neode-ui/src/api/container-client.ts. The UI maps + // transitional variants to single-button labels + // (Stopping… / Starting… / Restarting…). + let mut state = match &pkg.state { + crate::data_model::PackageState::Running => "running".to_string(), + crate::data_model::PackageState::Stopped => "stopped".to_string(), + crate::data_model::PackageState::Exited => "exited".to_string(), + crate::data_model::PackageState::Starting => "starting".to_string(), + crate::data_model::PackageState::Stopping => "stopping".to_string(), + crate::data_model::PackageState::Restarting => "restarting".to_string(), + crate::data_model::PackageState::Installing => "installing".to_string(), + crate::data_model::PackageState::Installed => "installed".to_string(), + crate::data_model::PackageState::Updating => "updating".to_string(), + crate::data_model::PackageState::Removing => "removing".to_string(), + crate::data_model::PackageState::CreatingBackup => { + "creating-backup".to_string() + } + crate::data_model::PackageState::RestoringBackup => { + "restoring-backup".to_string() + } + crate::data_model::PackageState::BackingUp => "backing-up".to_string(), + }; + + // Scanner backoff preserves cached package_data. Refresh stable + // states so callers do not see stale `running`/`exited` after + // health-monitor recovery or Quadlet --rm container removal. + if state == "running" && requires_launch_port_for_health(id) { + if !self.cached_reachable_health(id).await?.is_some() { + state = live_state_for_app(id) + .await + .unwrap_or("starting".to_string()); + } + } else if should_refresh_cached_state(&state) { + if launch_port_reachable(id).await { + state = "running".to_string(); + } else { + if let Some(live) = live_state_for_app(id).await { + state = live; + } else if quadlet_service_active(id).await { + state = "starting".to_string(); + } + } + } + + let lan = pkg + .installed + .as_ref() + .and_then(|i| i.interface_addresses.get("main")) + .and_then(|a| a.lan_address.as_deref()); + containers.push(serde_json::json!({ + "id": id, + "name": id, + "state": state, + "image": "", + "created": "", + "ports": [], + "lan_address": lan, + })); + } return Ok(serde_json::json!(containers)); } @@ -383,15 +407,33 @@ impl RpcHandler { // If app_id is provided, get health for that app. if let Some(params) = params { if let Some(app_id) = params.get("app_id").and_then(|v| v.as_str()) { + if let Some(health) = self.cached_reachable_health(app_id).await? { + return Ok(serde_json::json!({ app_id: health })); + } + + if let Some(health) = self.cached_state_health(app_id).await { + return Ok(serde_json::json!({ app_id: health })); + } + + if requires_launch_port_for_health(app_id) { + return Ok(serde_json::json!({ app_id: "starting" })); + } + if let Some(health) = self.stack_health(app_id).await? { return Ok(serde_json::json!({ app_id: health })); } let mut last_err: Option = None; for candidate in status_app_id_candidates(app_id) { - match orchestrator.health(&candidate).await { - Ok(health) => return Ok(serde_json::json!({ app_id: health })), - Err(e) => last_err = Some(e), + match tokio::time::timeout( + ORCHESTRATOR_HEALTH_TIMEOUT, + orchestrator.health(&candidate), + ) + .await + { + Ok(Ok(health)) => return Ok(serde_json::json!({ app_id: health })), + Ok(Err(e)) => last_err = Some(e), + Err(_) => {} } } for name in status_container_name_candidates(app_id) { @@ -424,14 +466,19 @@ impl RpcHandler { .and_then(|s| s.strip_suffix("-dev")) .or_else(|| container.name.strip_prefix("archy-")) .unwrap_or(container.name.as_str()); - match orchestrator.health(app_id_candidate).await { - Ok(health) => { + match tokio::time::timeout( + ORCHESTRATOR_HEALTH_TIMEOUT, + orchestrator.health(app_id_candidate), + ) + .await + { + Ok(Ok(health)) => { health_map.insert( app_id_candidate.to_string(), serde_json::Value::String(health), ); } - Err(_) => { + Ok(Err(_)) | Err(_) => { health_map.insert( app_id_candidate.to_string(), serde_json::Value::String("unknown".to_string()), @@ -443,6 +490,65 @@ impl RpcHandler { Ok(serde_json::Value::Object(health_map)) } + async fn cached_state_health(&self, app_id: &str) -> Option<&'static str> { + let (data, _) = self.state_manager.get_snapshot().await; + let Some(pkg) = data.package_data.get(app_id) else { + if data.server_info.status_info.containers_scanned { + return Some("stopped"); + } + return None; + }; + match pkg.state { + crate::data_model::PackageState::Running => None, + crate::data_model::PackageState::Installing + | crate::data_model::PackageState::Installed + | crate::data_model::PackageState::Starting => Some("starting"), + crate::data_model::PackageState::Stopping + | crate::data_model::PackageState::Stopped + | crate::data_model::PackageState::Exited => Some("stopped"), + crate::data_model::PackageState::Removing => Some("removing"), + crate::data_model::PackageState::Restarting + | crate::data_model::PackageState::Updating + | crate::data_model::PackageState::CreatingBackup + | crate::data_model::PackageState::RestoringBackup + | crate::data_model::PackageState::BackingUp => Some("starting"), + } + } + + async fn cached_reachable_health(&self, app_id: &str) -> Result> { + let (data, _) = self.state_manager.get_snapshot().await; + let pkg = data.package_data.get(app_id); + if matches!( + pkg.map(|pkg| &pkg.state), + Some(crate::data_model::PackageState::Removing) + ) { + return Ok(None); + } + + let url = pkg + .and_then(|pkg| pkg.installed.as_ref()) + .and_then(|i| i.interface_addresses.get("main")) + .and_then(|a| a.lan_address.as_deref()) + .map(ToOwned::to_owned) + .or_else(|| health_probe_url_for_app(app_id)); + + let Some(url) = url else { + return Ok(None); + }; + if url.starts_with("http://") || url.starts_with("https://") { + return Ok(http_launch_url_reachable(&url) + .await + .then(|| "healthy".to_string())); + } + + let Some(port) = port_from_url(&url) else { + return Ok(None); + }; + Ok(launch_port_reachable_by_port(port) + .await + .then(|| "healthy".to_string())) + } + async fn stack_health(&self, app_id: &str) -> Result> { let Some(members) = stack_health_members(app_id) else { return Ok(None); @@ -469,8 +575,14 @@ impl RpcHandler { } if saw_unknown { + if let Some(health) = self.cached_reachable_health(app_id).await? { + return Ok(Some(health)); + } Ok(Some("unknown".to_string())) } else if saw_starting { + if let Some(health) = self.cached_reachable_health(app_id).await? { + return Ok(Some(health)); + } Ok(Some("starting".to_string())) } else { Ok(Some("healthy".to_string())) @@ -482,7 +594,9 @@ async fn member_health( orchestrator: &dyn crate::container::traits::ContainerOrchestrator, app_id: &str, ) -> Result { - if let Ok(health) = orchestrator.health(app_id).await { + if let Ok(Ok(health)) = + tokio::time::timeout(ORCHESTRATOR_HEALTH_TIMEOUT, orchestrator.health(app_id)).await + { return Ok(health); } for name in status_container_name_candidates(app_id) { @@ -508,10 +622,8 @@ fn stack_health_members(app_id: &str) -> Option<&'static [&'static str]> { "indeedhub-minio", "indeedhub-relay", "indeedhub-api", - "indeedhub-ffmpeg", "indeedhub", ]), - "fedimint" => Some(&["fedimint"]), _ => None, } } @@ -583,6 +695,115 @@ fn status_container_name_candidates(app_id: &str) -> Vec { out } +fn should_refresh_cached_state(state: &str) -> bool { + matches!(state, "exited" | "stopped" | "stopping") +} + +async fn live_state_for_app(app_id: &str) -> Option { + for name in status_container_name_candidates(app_id) { + if let Some(live) = inspect_container_state_value(&name).await { + if let Some(live_state) = live.get("state").and_then(|v| v.as_str()) { + return Some(live_state.to_string()); + } + } + } + None +} + +async fn quadlet_service_active(app_id: &str) -> bool { + for name in status_container_name_candidates(app_id) { + let service = format!("{name}.service"); + let mut cmd = tokio::process::Command::new("systemctl"); + cmd.args(["--user", "is-active", "--quiet", &service]); + cmd.kill_on_drop(true); + if matches!( + tokio::time::timeout(Duration::from_secs(2), cmd.status()).await, + Ok(Ok(status)) if status.success() + ) { + return true; + } + } + false +} + +fn health_probe_url_for_app(app_id: &str) -> Option { + let port = match app_id { + "bitcoin-ui" => 8334, + "botfights" => 9100, + "btcpay-server" | "btcpay" | "btcpayserver" => 23000, + "dwn" => 3100, + "electrumx" | "electrs" | "mempool-electrs" | "electrs-ui" => 50002, + "fedimint" | "fedimintd" => 8175, + "filebrowser" => 8083, + "gitea" => 3001, + "grafana" => 3000, + "homeassistant" | "home-assistant" => 8123, + "immich" | "immich_server" => 2283, + "indeedhub" => 7778, + "jellyfin" => 8096, + "lnd" | "lnd-ui" => 18083, + "mempool" | "mempool-web" => 4080, + "nginx-proxy-manager" => 8081, + "ollama" => 11434, + "photoprism" => 2342, + "portainer" => 9000, + "searxng" => 8888, + "tailscale" => 8240, + "uptime-kuma" => 3002, + "vaultwarden" => 8082, + _ => return None, + }; + Some(format!("http://localhost:{port}")) +} + +fn requires_launch_port_for_health(app_id: &str) -> bool { + matches!(app_id, "fedimint" | "fedimintd" | "fedimint-gateway") +} + +async fn launch_port_reachable(app_id: &str) -> bool { + let Some(port) = health_probe_url_for_app(app_id).and_then(|url| port_from_url(&url)) else { + return false; + }; + launch_port_reachable_by_port(port).await +} + +async fn launch_port_reachable_by_port(port: u16) -> bool { + matches!( + tokio::time::timeout( + Duration::from_secs(2), + tokio::net::TcpStream::connect(("127.0.0.1", port)), + ) + .await, + Ok(Ok(_)) + ) +} + +async fn http_launch_url_reachable(url: &str) -> bool { + let Ok(client) = reqwest::Client::builder() + .timeout(Duration::from_secs(2)) + .redirect(reqwest::redirect::Policy::none()) + .build() + else { + return false; + }; + match client.get(url).send().await { + Ok(response) => { + let status = response.status(); + status.is_success() || status.is_redirection() + } + Err(_) => false, + } +} + +fn port_from_url(url: &str) -> Option { + let after_colon = url.rsplit_once(':')?.1; + let port = after_colon + .chars() + .take_while(|c| c.is_ascii_digit()) + .collect::(); + port.parse::().ok() +} + async fn inspect_container_state_value(name: &str) -> Option { if let Some(v) = ps_container_state_value(name).await { return Some(v); diff --git a/core/archipelago/src/api/rpc/dispatcher.rs b/core/archipelago/src/api/rpc/dispatcher.rs index 367b584a..1e3b624d 100644 --- a/core/archipelago/src/api/rpc/dispatcher.rs +++ b/core/archipelago/src/api/rpc/dispatcher.rs @@ -98,6 +98,20 @@ impl RpcHandler { // Bitcoin & Lightning deep data "bitcoin.getinfo" => self.handle_bitcoin_getinfo().await, + "bitcoin.relay-status" => self.handle_bitcoin_relay_status().await, + "bitcoin.relay-update-settings" => { + self.handle_bitcoin_relay_update_settings(params).await + } + "bitcoin.relay-request-peer" => self.handle_bitcoin_relay_request_peer(params).await, + "bitcoin.relay-approve-request" => { + self.handle_bitcoin_relay_approve_request(params).await + } + "bitcoin.relay-reject-request" => { + self.handle_bitcoin_relay_reject_request(params).await + } + "bitcoin.relay-create-tor-service" => { + self.handle_bitcoin_relay_create_tor_service().await + } "bitcoin.init-wallet-from-seed" => { self.handle_bitcoin_init_wallet_from_seed(params).await } diff --git a/core/archipelago/src/api/rpc/lnd/wallet.rs b/core/archipelago/src/api/rpc/lnd/wallet.rs index 72f6ad72..dc560930 100644 --- a/core/archipelago/src/api/rpc/lnd/wallet.rs +++ b/core/archipelago/src/api/rpc/lnd/wallet.rs @@ -23,10 +23,15 @@ impl RpcHandler { .await .context("Failed to parse newaddress response")?; + if let Some(error) = body.get("error").and_then(|v| v.as_str()) { + anyhow::bail!("LND could not generate an address: {}", error); + } + let address = body .get("address") .and_then(|v| v.as_str()) - .unwrap_or("") + .filter(|addr| !addr.trim().is_empty()) + .ok_or_else(|| anyhow::anyhow!("LND did not return a Bitcoin address. The wallet may still be locked, uninitialized, or waiting for Bitcoin to sync."))? .to_string(); Ok(serde_json::json!({ "address": address })) diff --git a/core/archipelago/src/api/rpc/mod.rs b/core/archipelago/src/api/rpc/mod.rs index fc50efc5..ee0439c5 100644 --- a/core/archipelago/src/api/rpc/mod.rs +++ b/core/archipelago/src/api/rpc/mod.rs @@ -2,6 +2,7 @@ mod analytics; mod auth; mod backup_rpc; mod bitcoin; +pub(crate) mod bitcoin_relay; mod container; mod content; mod credentials; @@ -302,6 +303,7 @@ impl RpcHandler { | "system.stats" | "tor.status" | "tor.onion-addresses" + | "bitcoin.relay-status" | "federation.list-nodes" | "system.get-settings" | "system.get-node-key" diff --git a/core/archipelago/src/api/rpc/package/config.rs b/core/archipelago/src/api/rpc/package/config.rs index e1092f59..9aaab545 100644 --- a/core/archipelago/src/api/rpc/package/config.rs +++ b/core/archipelago/src/api/rpc/package/config.rs @@ -3,7 +3,7 @@ use crate::port_allocator::PortAllocator; use anyhow::{Context, Result}; use std::time::Duration; -const PODMAN_LIST_TIMEOUT: Duration = Duration::from_secs(15); +const PODMAN_LIST_TIMEOUT: Duration = Duration::from_secs(60); fn is_platform_managed_app(app_id: &str) -> bool { matches!( @@ -31,7 +31,6 @@ fn is_platform_managed_app(app_id: &str) -> bool { | "fedimint" | "fedimint-gateway" | "indeedhub" - | "saleor" | "immich" ) } @@ -501,15 +500,6 @@ pub(super) fn all_container_names(package_id: &str) -> Vec { "netbird-dashboard".into(), "netbird-server".into(), ], - "saleor" => vec![ - "saleor-db".into(), - "saleor-cache".into(), - "saleor-api".into(), - "saleor-worker".into(), - "saleor-jaeger".into(), - "saleor-mailpit".into(), - "saleor".into(), - ], "nostr-vpn" => vec![ "nostr-vpn".into(), "archy-nostr-vpn".into(), @@ -599,7 +589,6 @@ pub(super) fn get_data_dirs_for_app(package_id: &str) -> Vec { format!("{}/penpot-postgres", base), ], "netbird" => vec![format!("{}/netbird", base)], - "saleor" => vec![format!("{}/saleor", base), format!("{}/saleor-db", base)], _ => vec![format!("{}/{}", base, package_id)], } } @@ -977,6 +966,7 @@ pub(super) async fn get_app_config( vec![ "/var/lib/archipelago/portainer:/data".to_string(), "/run/user/1000/podman/podman.sock:/var/run/docker.sock".to_string(), + "/var/lib/archipelago/portainer/compose:/data/compose".to_string(), ], vec![], None, @@ -1006,7 +996,7 @@ pub(super) async fn get_app_config( Some(vec![ "sh".to_string(), "-c".to_string(), - "tailscaled --tun=userspace-networking & sleep 2; tailscale web --listen 0.0.0.0:8240 & wait".to_string(), + "tailscaled --tun=userspace-networking & for i in $(seq 1 30); do [ -S /var/run/tailscale/tailscaled.sock ] && break; sleep 1; done; tailscale web --listen 0.0.0.0:8240 & wait".to_string(), ]), ), "fedimint" => ( @@ -1079,13 +1069,6 @@ pub(super) async fn get_app_config( None, None, ), - "saleor" => ( - vec!["9010:80".to_string(), "8000:8000".to_string()], - vec!["/var/lib/archipelago/saleor:/app/media".to_string()], - vec![], - None, - None, - ), "nostr-rs-relay" => ( vec!["18081:8080".to_string()], vec!["/var/lib/archipelago/nostr-rs-relay:/usr/src/app/db".to_string()], diff --git a/core/archipelago/src/api/rpc/package/dependencies.rs b/core/archipelago/src/api/rpc/package/dependencies.rs index 9a03de87..8e2e7af2 100644 --- a/core/archipelago/src/api/rpc/package/dependencies.rs +++ b/core/archipelago/src/api/rpc/package/dependencies.rs @@ -289,15 +289,6 @@ pub(super) fn startup_order(package_id: &str) -> &'static [&'static str] { &["archy-btcpay-db", "archy-nbxplorer", "btcpay-server"] } "netbird" => &["netbird-server", "netbird-dashboard", "netbird"], - "saleor" => &[ - "saleor-db", - "saleor-cache", - "saleor-jaeger", - "saleor-mailpit", - "saleor-api", - "saleor-worker", - "saleor", - ], "penpot" | "penpot-frontend" => &[ "penpot-postgres", "penpot-valkey", diff --git a/core/archipelago/src/api/rpc/package/install.rs b/core/archipelago/src/api/rpc/package/install.rs index 18658a6f..6fbdabb5 100644 --- a/core/archipelago/src/api/rpc/package/install.rs +++ b/core/archipelago/src/api/rpc/package/install.rs @@ -13,11 +13,12 @@ use crate::api::rpc::RpcHandler; use crate::data_model::InstallPhase; use crate::update::host_sudo; use anyhow::{Context, Result}; -use tokio::io::{AsyncBufReadExt, BufReader}; +use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader}; use tokio::time::{timeout, Duration}; use tracing::{debug, info, warn}; const INSTALL_LOG: &str = "/var/log/archipelago/container-installs.log"; +const IMAGE_INSPECT_TIMEOUT: Duration = Duration::from_secs(10); /// Append a timestamped line to the persistent install log. pub(in crate::api::rpc) async fn install_log(msg: &str) { @@ -34,6 +35,36 @@ pub(in crate::api::rpc) async fn install_log(msg: &str) { } } +async fn local_podman_image_exists(image: &str) -> Result { + let mut cmd = tokio::process::Command::new("podman"); + cmd.args(["image", "inspect", image]); + cmd.kill_on_drop(true); + let output = timeout(IMAGE_INSPECT_TIMEOUT, cmd.output()) + .await + .with_context(|| { + format!( + "podman image inspect {} timed out after {}s", + image, + IMAGE_INSPECT_TIMEOUT.as_secs() + ) + })? + .with_context(|| format!("Failed to execute podman image inspect {}", image))?; + match output.status.code() { + Some(0) => Ok(true), + Some(1) => Ok(false), + Some(code) => Err(anyhow::anyhow!( + "podman image inspect {} exited with {}: {}", + image, + code, + String::from_utf8_lossy(&output.stderr).trim() + )), + None => Err(anyhow::anyhow!( + "podman image inspect {} terminated by signal", + image + )), + } +} + pub(super) async fn patch_indeedhub_nostr_provider() { tokio::time::sleep(std::time::Duration::from_secs(5)).await; @@ -244,10 +275,6 @@ impl RpcHandler { if package_id == "netbird" { return self.install_netbird_stack().await; } - if package_id == "saleor" { - return self.install_saleor_stack().await; - } - // Dependency checks. Prefer the scanner's cached package state so a // congested Podman API does not turn an already-running dependency into // a false install failure. Fall back to a bounded direct Podman probe @@ -447,6 +474,7 @@ impl RpcHandler { Ok(container_name) => { self.set_install_phase(package_id, InstallPhase::WaitingHealthy) .await; + ensure_host_port_listener(package_id, &container_name, &[]).await?; crate::api::rpc::package::runtime::reconcile_companions_for(package_id) .await; install_log(&format!( @@ -652,10 +680,6 @@ impl RpcHandler { self.write_lnd_conf(&rpc_user, &rpc_pass).await?; } - if package_id == "portainer" { - ensure_user_podman_socket().await?; - } - // Pre-install: SearXNG settings.yml (required or container exits immediately) if package_id == "searxng" { let searx_dir = "/var/lib/archipelago/searxng"; @@ -748,16 +772,10 @@ impl RpcHandler { .await; debug!("Running container with args: {:?}", run_args); - // Build command with optional custom command/args - let mut cmd = tokio::process::Command::new("podman"); - cmd.args(&run_args); - if let Some(custom_cmd) = custom_command { - cmd.arg(custom_cmd); - } else if let Some(args) = custom_args { - cmd.args(args); - } - - let mut run_output = cmd.output().await.context("Failed to run container")?; + let command_tail = install_command_tail(custom_command.as_deref(), custom_args.as_ref()); + let mut run_output = podman_run_for_install(package_id, &run_args, &command_tail) + .await + .context("Failed to run container")?; if !run_output.status.success() { let stderr = String::from_utf8_lossy(&run_output.stderr).to_string(); @@ -766,7 +784,9 @@ impl RpcHandler { .args(["rm", "-f", container_name]) .output() .await; - run_output = cmd.output().await.context("Failed to rerun container")?; + run_output = podman_run_for_install(package_id, &run_args, &command_tail) + .await + .context("Failed to rerun container")?; } } @@ -922,12 +942,7 @@ impl RpcHandler { let is_local_image = docker_image.starts_with("localhost/"); let has_local_fallback = if !is_local_image { let local_tag = format!("localhost/{}:latest", package_id); - let check = tokio::process::Command::new("podman") - .args(["images", "-q", &local_tag]) - .output() - .await - .ok(); - check.is_some_and(|o| !String::from_utf8_lossy(&o.stdout).trim().is_empty()) + local_podman_image_exists(&local_tag).await.unwrap_or(false) } else { false }; @@ -942,14 +957,9 @@ impl RpcHandler { ); } else { // Local image — verify it exists - let images_output = tokio::process::Command::new("podman") - .args(["images", "-q", docker_image]) - .output() + if !local_podman_image_exists(docker_image) .await - .context("Failed to check local image")?; - if String::from_utf8_lossy(&images_output.stdout) - .trim() - .is_empty() + .context("Failed to check local image")? { return Err(anyhow::anyhow!( "Local image {} not found. Build the image first \ @@ -1139,12 +1149,10 @@ impl RpcHandler { } // Verify image exists locally after pull. - let verify = tokio::process::Command::new("podman") - .args(["images", "-q", docker_image]) - .output() + if !local_podman_image_exists(docker_image) .await - .context("Failed to verify pulled image")?; - if String::from_utf8_lossy(&verify.stdout).trim().is_empty() { + .context("Failed to verify pulled image")? + { return Err(anyhow::anyhow!( "Image {} not found locally after pull", docker_image @@ -1278,11 +1286,13 @@ impl RpcHandler { // set `prune=N` in bitcoin.conf themselves after install. let bitcoin_conf = format!( "\ -# rpcauth: salted hash only — no plaintext password in config or CLI\n\ +# rpcauth: salted hash only - no plaintext password in config or CLI\n\ {}\n\ server=1\n\ rpcallowip=0.0.0.0/0\n\ listen=1\n\ +rpcthreads=16\n\ +rpcworkqueue=256\n\ printtoconsole=1\n", rpcauth_line ); @@ -1871,29 +1881,34 @@ autopilot.active=false\n", .unwrap_or_default(); super::validation::validate_app_id(app_id)?; - match app_id { - "saleor" => { - let password = - tokio::fs::read_to_string("/var/lib/archipelago/secrets/saleor-admin-password") - .await - .unwrap_or_default() - .trim() - .to_string(); - if password.is_empty() { - return Ok(serde_json::json!({ "credentials": [] })); - } - - Ok(serde_json::json!({ - "title": "Saleor admin login", - "description": "Saleor opens to its own dashboard login. Use this generated admin account to sign in.", - "credentials": [ - { "label": "Email", "value": "admin@example.com", "sensitive": false }, - { "label": "Password", "value": password, "sensitive": true } - ] - })) - } - _ => Ok(serde_json::json!({ "credentials": [] })), + if app_id == "filebrowser" { + let password = + tokio::fs::read_to_string("/var/lib/archipelago/secrets/filebrowser/password") + .await + .map(|p| p.trim().to_string()) + .unwrap_or_else(|_| "admin".to_string()); + return Ok(serde_json::json!({ + "title": "File Browser credentials", + "description": "Use these credentials when File Browser asks you to sign in.", + "credentials": [ + { "label": "Username", "value": "admin" }, + { "label": "Password", "value": password, "sensitive": true } + ] + })); } + + if app_id == "photoprism" { + return Ok(serde_json::json!({ + "title": "PhotoPrism credentials", + "description": "Use these credentials when PhotoPrism asks you to sign in.", + "credentials": [ + { "label": "Username", "value": "admin" }, + { "label": "Password", "value": "archipelago", "sensitive": true } + ] + })); + } + + Ok(serde_json::json!({ "credentials": [] })) } } @@ -1914,10 +1929,128 @@ async fn cleanup_stale_package_ports(package_id: &str) { cleanup_stale_pasta_port("8444").await; } "nextcloud" => cleanup_stale_pasta_port("8085").await, + "portainer" => cleanup_stale_pasta_port("9000").await, _ => {} } } +fn install_command_tail( + custom_cmd: Option<&str>, + custom_args: Option<&Vec>, +) -> Vec { + if let Some(cmd) = custom_cmd { + vec![cmd.to_string()] + } else if let Some(args) = custom_args { + args.clone() + } else { + Vec::new() + } +} + +async fn podman_run_for_install( + package_id: &str, + run_args: &[&str], + command_tail: &[String], +) -> Result { + if should_scope_podman_run(package_id) { + match podman_create_then_scoped_start(package_id, run_args, command_tail).await { + Ok(output) => return Ok(output), + Err(err) => { + tracing::warn!(package_id, error = %err, "scoped podman create/start failed; falling back to direct podman run"); + } + } + } + + let mut cmd = tokio::process::Command::new("podman"); + cmd.args(run_args); + cmd.args(command_tail); + cmd.output().await.context("Failed to run podman") +} + +async fn podman_create_then_scoped_start( + package_id: &str, + run_args: &[&str], + command_tail: &[String], +) -> Result { + let container_name = run_args + .windows(2) + .find_map(|pair| (pair[0] == "--name").then_some(pair[1])) + .unwrap_or(package_id); + let mut create_args = Vec::with_capacity(run_args.len() + command_tail.len()); + for (idx, arg) in run_args.iter().enumerate() { + if idx == 0 && *arg == "run" { + create_args.push("create".to_string()); + } else if *arg != "-d" { + create_args.push((*arg).to_string()); + } + } + create_args.extend(command_tail.iter().cloned()); + + let mut create = tokio::process::Command::new("podman"); + create.args(&create_args); + let create_output = create + .output() + .await + .context("Failed to run podman create")?; + if !create_output.status.success() { + return Ok(create_output); + } + + let mut scoped_start = tokio::process::Command::new("systemd-run"); + scoped_start.args([ + "--user", + "--scope", + "--quiet", + "--collect", + "podman", + "start", + container_name, + ]); + match scoped_start.output().await { + Ok(output) if output.status.success() => Ok(create_output), + Ok(output) => { + tracing::warn!( + package_id, + container = container_name, + stderr = %String::from_utf8_lossy(&output.stderr).trim(), + "scoped podman start after create failed; trying direct podman start" + ); + let mut direct_start = tokio::process::Command::new("podman"); + direct_start.args(["start", container_name]); + let direct_output = direct_start + .output() + .await + .context("Failed to run fallback podman start")?; + if direct_output.status.success() { + Ok(create_output) + } else { + Ok(direct_output) + } + } + Err(err) => Err(err).context("Failed to run scoped podman start"), + } +} + +fn should_scope_podman_run(package_id: &str) -> bool { + matches!( + package_id, + "botfights" + | "filebrowser" + | "gitea" + | "grafana" + | "homeassistant" + | "home-assistant" + | "jellyfin" + | "nginx-proxy-manager" + | "nostr-rs-relay" + | "photoprism" + | "portainer" + | "searxng" + | "uptime-kuma" + | "vaultwarden" + ) +} + async fn cleanup_start_conflict(package_id: &str, stderr: &str) -> bool { if stderr.contains("name is already in use") || stderr.contains("name \"") { return true; @@ -1968,6 +2101,12 @@ async fn cleanup_start_conflict(package_id: &str, stderr: &str) -> bool { cleanup_stale_pasta_port("8085").await; true } + "portainer" + if stderr.contains("pasta failed") || stderr.contains("address already in use") => + { + cleanup_stale_pasta_port("9000").await; + true + } _ => false, } } @@ -2026,7 +2165,7 @@ async fn ensure_host_port_listener( return Ok(()); }; - if wait_for_host_port(port, 10).await { + if wait_for_host_port(package_id, port, 10).await { return Ok(()); } @@ -2052,7 +2191,7 @@ async fn ensure_host_port_listener( )); } - if wait_for_host_port(port, 60).await { + if wait_for_host_port(package_id, port, 60).await { install_log(&format!( "INSTALL REPAIR OK: {} — host port {} is listening after restart", package_id, port @@ -2084,31 +2223,6 @@ fn published_host_port(container_name: &str) -> Option { }) } -async fn ensure_user_podman_socket() -> Result<()> { - let socket_path = "/run/user/1000/podman/podman.sock"; - if tokio::fs::try_exists(socket_path).await.unwrap_or(false) { - return Ok(()); - } - - let status = tokio::process::Command::new("systemctl") - .args(["--user", "restart", "podman.socket"]) - .status() - .await - .context("spawn systemctl --user restart podman.socket")?; - if !status.success() { - anyhow::bail!("systemctl --user restart podman.socket exited {status}"); - } - - for _ in 0..20 { - if tokio::fs::try_exists(socket_path).await.unwrap_or(false) { - return Ok(()); - } - tokio::time::sleep(Duration::from_millis(250)).await; - } - - anyhow::bail!("podman socket {socket_path} did not appear after restart") -} - fn required_host_port(package_id: &str) -> Option { match package_id { "grafana" => Some(3000), @@ -2118,17 +2232,21 @@ fn required_host_port(package_id: &str) -> Option { "gitea" => Some(3001), "nextcloud" => Some(8085), "nginx-proxy-manager" => Some(8081), + "portainer" => Some(9000), _ => None, } } -async fn wait_for_host_port(port: u16, timeout_secs: u64) -> bool { +async fn wait_for_host_port(package_id: &str, port: u16, timeout_secs: u64) -> bool { let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); loop { - if tokio::net::TcpStream::connect(("127.0.0.1", port)) - .await - .is_ok() - { + let ready = match package_id { + "uptime-kuma" => http_host_port_ready(port, "/").await, + _ => tokio::net::TcpStream::connect(("127.0.0.1", port)) + .await + .is_ok(), + }; + if ready { return true; } @@ -2140,6 +2258,36 @@ async fn wait_for_host_port(port: u16, timeout_secs: u64) -> bool { } } +async fn http_host_port_ready(port: u16, path: &str) -> bool { + let Ok(Ok(mut stream)) = tokio::time::timeout( + Duration::from_secs(3), + tokio::net::TcpStream::connect(("127.0.0.1", port)), + ) + .await + else { + return false; + }; + + let request = format!("GET {path} HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n"); + if stream.write_all(request.as_bytes()).await.is_err() { + return false; + } + + let mut buf = [0u8; 128]; + let Ok(Ok(n)) = tokio::time::timeout(Duration::from_secs(3), stream.read(&mut buf)).await + else { + return false; + }; + if n == 0 { + return false; + } + let head = String::from_utf8_lossy(&buf[..n]); + head.starts_with("HTTP/1.1 2") + || head.starts_with("HTTP/1.1 3") + || head.starts_with("HTTP/1.0 2") + || head.starts_with("HTTP/1.0 3") +} + /// Resolve the host gateway IP for --add-host flag. /// Resolve the default gateway IP from the routing table for --add-host flag. /// Explicit IP avoids issues with "host-gateway" in rootless Podman. @@ -2235,6 +2383,18 @@ set -eu conf=/var/lib/archipelago/bitcoin/bitcoin.conf [ -f "$conf" ] || exit 0 changed=0 +tmp=$(mktemp) +awk -F= ' + /^(server|txindex|rpcbind|rpcallowip|rpcport|listen|bind|dbcache|rpcthreads|rpcworkqueue)=/ { + if (seen[$1]++) next + } + { print } +' "$conf" > "$tmp" +if ! cmp -s "$conf" "$tmp"; then + cat "$tmp" > "$conf" + changed=1 +fi +rm -f "$tmp" ensure_line() { line="$1" key="${line%%=*}" @@ -2246,6 +2406,8 @@ ensure_line() { ensure_line server=1 ensure_line rpcallowip=0.0.0.0/0 ensure_line listen=1 +ensure_line rpcthreads=16 +ensure_line rpcworkqueue=256 [ "$changed" -eq 0 ] && exit 0 exit 2 "#; @@ -2272,6 +2434,7 @@ fn should_try_orchestrator_install(package_id: &str, orchestrator_available: boo fn orchestrator_install_app_id(package_id: &str) -> &str { match package_id { "electrs" | "mempool-electrs" => "electrumx", + "home-assistant" => "homeassistant", _ => package_id, } } @@ -2299,6 +2462,16 @@ fn uses_orchestrator_install_flow(package_id: &str) -> bool { | "archy-btcpay-db" | "archy-nbxplorer" | "btcpay-server" + | "homeassistant" + | "home-assistant" + | "nextcloud" + | "vaultwarden" + | "jellyfin" + | "photoprism" + | "uptime-kuma" + | "gitea" + | "portainer" + | "meshtastic" ) } @@ -2336,6 +2509,16 @@ mod tests { "archy-btcpay-db", "archy-nbxplorer", "btcpay-server", + "homeassistant", + "home-assistant", + "nextcloud", + "vaultwarden", + "jellyfin", + "photoprism", + "uptime-kuma", + "gitea", + "portainer", + "meshtastic", ] { assert!(uses_orchestrator_install_flow(app)); assert!(should_try_orchestrator_install(app, true)); @@ -2364,6 +2547,10 @@ mod tests { assert_eq!(orchestrator_install_app_id("bitcoin-core"), "bitcoin-core"); assert_eq!(orchestrator_install_app_id("electrs"), "electrumx"); assert_eq!(orchestrator_install_app_id("mempool-electrs"), "electrumx"); + assert_eq!( + orchestrator_install_app_id("home-assistant"), + "homeassistant" + ); assert_eq!(orchestrator_install_app_id("lnd"), "lnd"); } diff --git a/core/archipelago/src/api/rpc/package/runtime.rs b/core/archipelago/src/api/rpc/package/runtime.rs index 1d06ddf1..3f2b1d54 100644 --- a/core/archipelago/src/api/rpc/package/runtime.rs +++ b/core/archipelago/src/api/rpc/package/runtime.rs @@ -2,15 +2,18 @@ use super::config::{ get_app_capabilities, get_containers_for_app, get_data_dirs_for_app, get_health_check_args, get_memory_limit, is_valid_docker_image, }; -use super::dependencies::ordered_containers_for_start; +use super::dependencies::{ordered_containers_for_start, startup_order}; use super::install::install_log; use super::validation::validate_app_id; use crate::api::rpc::RpcHandler; use crate::data_model::PackageState; use anyhow::{Context, Result}; +use archipelago_container::AppManifest; +use std::path::Path; use std::process::Output; use std::sync::Arc; use std::time::Duration; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tracing::warn; const PODMAN_CONTROL_TIMEOUT: Duration = Duration::from_secs(30); @@ -53,7 +56,11 @@ impl RpcHandler { .ok_or_else(|| anyhow::anyhow!("Missing package id"))?; validate_app_id(package_id)?; - let to_start = ordered_containers_for_start(package_id).await?; + let to_start = if self.orchestrator.is_some() && uses_single_orchestrator_app(package_id) { + vec![orchestrator_app_id(package_id).to_string()] + } else { + ordered_containers_for_start(package_id).await? + }; if to_start.is_empty() { tracing::warn!("package.start {}: no containers found", package_id); return Err(anyhow::anyhow!("No containers found for {}", package_id)); @@ -124,7 +131,16 @@ impl RpcHandler { .ok_or_else(|| anyhow::anyhow!("Missing package id"))?; validate_app_id(package_id)?; - let containers = get_containers_for_app(package_id).await?; + let single_orchestrator_app = + self.orchestrator.is_some() && uses_single_orchestrator_app(package_id); + let mut containers = if single_orchestrator_app { + vec![orchestrator_app_id(package_id).to_string()] + } else { + get_containers_for_app(package_id).await? + }; + if !single_orchestrator_app { + containers.reverse(); + } if containers.is_empty() { tracing::warn!("package.stop {}: no containers found", package_id); return Err(anyhow::anyhow!("No containers found for {}", package_id)); @@ -190,7 +206,13 @@ impl RpcHandler { .ok_or_else(|| anyhow::anyhow!("Missing package id"))?; validate_app_id(package_id)?; - let containers = get_containers_for_app(package_id).await?; + let single_orchestrator_app = + self.orchestrator.is_some() && uses_single_orchestrator_app(package_id); + let containers = if single_orchestrator_app { + vec![orchestrator_app_id(package_id).to_string()] + } else { + get_containers_for_app(package_id).await? + }; if containers.is_empty() { tracing::warn!("package.restart {}: no containers found", package_id); return Err(anyhow::anyhow!("No containers found for {}", package_id)); @@ -206,7 +228,11 @@ impl RpcHandler { let package_id_owned = package_id.to_string(); let companion_app_id = package_id_owned.clone(); - let to_restart = ordered_containers_for_start(package_id).await?; + let to_restart = if single_orchestrator_app { + vec![orchestrator_app_id(package_id).to_string()] + } else { + ordered_containers_for_start(package_id).await? + }; let state_manager = Arc::clone(&self.state_manager); let orchestrator = self.orchestrator.clone(); let pre_state = @@ -323,7 +349,9 @@ impl RpcHandler { match rm_out { Ok(o) if o.status.success() => removed += 1, Ok(o) => { - // If normal rm fails (e.g., still running), force as fallback + // If normal rm fails (e.g., still running/stopping/removing), + // force with targeted cleanup fallbacks. This is deliberately + // container-scoped; never prune the store during uninstall. let stderr = String::from_utf8_lossy(&o.stderr); tracing::warn!( "Uninstall {}: rm {} failed ({}), trying force", @@ -331,28 +359,36 @@ impl RpcHandler { name, stderr.trim() ); - let force_rm = podman_control(&["rm", "-f", name]).await; - match force_rm { - Ok(o2) if o2.status.success() => removed += 1, - _ => { - let msg = format!("Failed to remove {}: {}", name, stderr.trim()); + match force_remove_runtime_container(name).await { + Ok(()) => removed += 1, + Err(e) => { + let msg = + format!("Failed to remove {}: {}; {}", name, stderr.trim(), e); tracing::error!("Uninstall {}: {}", package_id, msg); errors.push(msg); } } } - Err(e) => { - let msg = format!("Failed to remove {}: {}", name, e); - tracing::error!("Uninstall {}: {}", package_id, msg); - errors.push(msg); - } + Err(e) => match force_remove_runtime_container(name).await { + Ok(()) => removed += 1, + Err(force_err) => { + let msg = format!("Failed to remove {}: {}; {}", name, e, force_err); + tracing::error!("Uninstall {}: {}", package_id, msg); + errors.push(msg); + } + }, } } self.set_uninstall_stage(package_id, "Cleaning up volumes") .await; - // Clean up dangling volumes associated with removed containers - let _ = podman_control(&["volume", "prune", "-f"]).await; + // Avoid global Podman volume prune on production nodes: store-wide + // Podman cleanup commands can block app health under load. App data is + // removed explicitly below when preserve_data=false. + tracing::info!( + package_id = %package_id, + "Skipping global podman volume prune during uninstall" + ); // Clean up app-specific networks (only if no other containers use them) let app_networks: Vec<&str> = match package_id { @@ -600,9 +636,25 @@ async fn do_package_start(to_start: &[String]) -> Result<()> { if i > 0 { tokio::time::sleep(std::time::Duration::from_secs(2)).await; } + if let Err(e) = ensure_startable_container_state(name).await { + tracing::error!(container = %name, error = %e, "container is not startable"); + errors.push(format!("{}: {}", name, e)); + continue; + } + match inspect_runtime_container_state(name).await { + Ok(Some(state)) if state == "running" => { + tracing::debug!(container = %name, "container already running during package start"); + continue; + } + Ok(_) => {} + Err(e) => { + tracing::warn!(container = %name, error = %e, "failed to re-inspect before package start") + } + } repair_before_package_start(name).await; + wait_before_package_start(name).await; tracing::info!("Starting container: {}", name); - let out = podman_control(&["start", name]) + let out = podman_start_container(name) .await .context(format!("Failed to exec podman start {}", name))?; if !out.status.success() { @@ -669,6 +721,7 @@ async fn do_orchestrator_package_start( tokio::time::sleep(std::time::Duration::from_secs(2)).await; } repair_before_package_start(name).await; + wait_before_package_start(name).await; match orchestrator.start(name).await { Ok(()) => wait_after_orchestrator_start(name).await, Err(e) if is_unknown_app_id_error(&e) => { @@ -681,10 +734,13 @@ async fn do_orchestrator_package_start( } } } - if errors.is_empty() { - Ok(()) - } else { + if !errors.is_empty() { Err(anyhow::anyhow!("Start failed: {}", errors.join("; "))) + } else { + for name in to_start { + ensure_runtime_host_port_listener(name).await?; + } + Ok(()) } } @@ -703,6 +759,137 @@ async fn podman_control(args: &[&str]) -> Result { podman_with_timeout(args, podman_control_timeout(args)).await } +async fn force_remove_runtime_container(container_name: &str) -> Result<()> { + for args in [ + vec!["rm", "-f", container_name], + vec!["rm", "-f", "--time", "0", container_name], + ] { + let output = podman_control(&args).await?; + if output.status.success() + || is_missing_container_error(&String::from_utf8_lossy(&output.stderr)) + { + return Ok(()); + } + } + + let _ = podman_control(&["container", "cleanup", container_name]).await; + let output = podman_control(&["rm", "-f", container_name]).await?; + if output.status.success() + || is_missing_container_error(&String::from_utf8_lossy(&output.stderr)) + { + return Ok(()); + } + + Err(anyhow::anyhow!( + "force remove failed: {}", + String::from_utf8_lossy(&output.stderr).trim() + )) +} + +async fn force_stop_runtime_container(container_name: &str) -> Result<()> { + for args in [ + vec!["stop", "-t", "0", container_name], + vec!["kill", container_name], + ] { + let output = podman_control(&args).await?; + if output.status.success() + || is_missing_container_error(&String::from_utf8_lossy(&output.stderr)) + { + return Ok(()); + } + } + + for _ in 0..15 { + match inspect_runtime_container_state(container_name).await? { + None => return Ok(()), + Some(state) if matches!(state.as_str(), "exited" | "stopped" | "configured") => { + return Ok(()) + } + Some(_) => tokio::time::sleep(Duration::from_secs(2)).await, + } + } + + Err(anyhow::anyhow!( + "container did not reach stopped state after force stop" + )) +} + +async fn ensure_startable_container_state(container_name: &str) -> Result<()> { + let Some(state) = inspect_runtime_container_state(container_name).await? else { + return Ok(()); + }; + + match state.as_str() { + "configured" | "created" | "exited" | "stopped" | "running" | "paused" => Ok(()), + "removing" => { + wait_for_container_absent_or_startable(container_name, Duration::from_secs(60)).await + } + other => Err(anyhow::anyhow!( + "container is in unsupported state before start: {}", + other + )), + } +} + +async fn wait_for_container_absent_or_startable( + container_name: &str, + timeout: Duration, +) -> Result<()> { + let deadline = std::time::Instant::now() + timeout; + loop { + match inspect_runtime_container_state(container_name).await? { + None => return Ok(()), + Some(state) + if matches!( + state.as_str(), + "configured" | "created" | "exited" | "stopped" | "running" | "paused" + ) => + { + return Ok(()) + } + Some(state) if state == "removing" && std::time::Instant::now() < deadline => { + tokio::time::sleep(Duration::from_secs(2)).await; + } + Some(state) if state == "removing" => { + force_remove_runtime_container(container_name).await?; + return Ok(()); + } + Some(state) => { + return Err(anyhow::anyhow!( + "container is in unsupported state before start: {}", + state + )) + } + } + } +} + +async fn inspect_runtime_container_state(container_name: &str) -> Result> { + let output = podman_with_timeout( + &["inspect", container_name, "--format", "{{.State.Status}}"], + Duration::from_secs(10), + ) + .await?; + if output.status.success() { + return Ok(Some( + String::from_utf8_lossy(&output.stdout).trim().to_string(), + )); + } + let stderr = String::from_utf8_lossy(&output.stderr); + if is_missing_container_error(&stderr) { + Ok(None) + } else { + Err(anyhow::anyhow!("inspect failed: {}", stderr.trim())) + } +} + +fn is_missing_container_error(stderr: &str) -> bool { + stderr.contains("no such container") + || stderr.contains("no container with name") + || stderr.contains("does not exist") + || stderr.contains("not found") +} + fn podman_control_timeout(args: &[&str]) -> Duration { args.windows(2) .find_map(|pair| { @@ -714,6 +901,13 @@ fn podman_control_timeout(args: &[&str]) -> Duration { .unwrap_or(PODMAN_CONTROL_TIMEOUT) } +fn podman_start_timeout(container_name: &str) -> Duration { + match container_name { + "immich_server" | "netbird-server" => Duration::from_secs(120), + _ => PODMAN_CONTROL_TIMEOUT, + } +} + async fn podman_with_timeout(args: &[&str], timeout: Duration) -> Result { let mut cmd = tokio::process::Command::new("podman"); cmd.args(args); @@ -732,12 +926,48 @@ async fn command_with_timeout( .with_context(|| format!("Failed to exec {}", description)) } +async fn podman_start_container(container_name: &str) -> Result { + if !runtime_host_ports(container_name).is_empty() { + let mut cmd = tokio::process::Command::new("systemd-run"); + cmd.args([ + "--user", + "--scope", + "--quiet", + "--collect", + "podman", + "start", + ]) + .arg(container_name); + let scoped = command_with_timeout( + cmd, + podman_start_timeout(container_name), + &format!("systemd-run --user --scope podman start {container_name}"), + ) + .await; + if scoped.as_ref().is_ok_and(|out| out.status.success()) { + return scoped; + } + if let Err(err) = &scoped { + tracing::warn!( + container = %container_name, + error = %err, + "scoped podman start failed; falling back to direct podman start" + ); + } + } + podman_with_timeout( + &["start", container_name], + podman_start_timeout(container_name), + ) + .await +} + async fn do_orchestrator_package_stop( orchestrator: &dyn crate::container::traits::ContainerOrchestrator, containers: &[String], ) -> Result<()> { let mut errors = Vec::new(); - for name in containers.iter().rev() { + for name in containers { match orchestrator.stop(name).await { Ok(()) => {} Err(e) if is_unknown_app_id_error(&e) => { @@ -758,6 +988,44 @@ async fn do_orchestrator_package_stop( } } +fn orchestrator_app_id(package_id: &str) -> &str { + match package_id { + "electrs" | "mempool-electrs" => "electrumx", + "home-assistant" => "homeassistant", + _ => package_id, + } +} + +fn uses_single_orchestrator_app(package_id: &str) -> bool { + startup_order(package_id).is_empty() + && matches!( + package_id, + "bitcoin-ui" + | "electrs-ui" + | "lnd-ui" + | "bitcoin-core" + | "bitcoin-knots" + | "lnd" + | "fedimint" + | "fedimint-gateway" + | "filebrowser" + | "electrumx" + | "electrs" + | "mempool-electrs" + | "homeassistant" + | "home-assistant" + | "nextcloud" + | "vaultwarden" + | "jellyfin" + | "photoprism" + | "uptime-kuma" + | "gitea" + | "portainer" + | "meshtastic" + | "botfights" + ) +} + async fn do_orchestrator_package_restart( orchestrator: &dyn crate::container::traits::ContainerOrchestrator, to_restart: &[String], @@ -770,22 +1038,72 @@ async fn do_orchestrator_package_restart( async fn do_package_stop(containers: &[String]) -> Result<()> { let mut errors = Vec::new(); for name in containers { + match inspect_runtime_container_state(name).await { + Ok(None) => { + tracing::debug!(container = %name, "container already absent during stop"); + continue; + } + Ok(Some(state)) if matches!(state.as_str(), "exited" | "stopped" | "configured") => { + tracing::debug!(container = %name, state = %state, "container already stopped"); + continue; + } + Ok(Some(_)) => {} + Err(e) => { + tracing::warn!(container = %name, error = %e, "failed to inspect before stop") + } + } tracing::info!( "Stopping container: {} (timeout: {}s)", name, stop_timeout_secs(name) ); - let out = podman_control(&["stop", "-t", stop_timeout_secs(name), name]) - .await - .context(format!("Failed to exec podman stop {}", name))?; + let out = match podman_control(&["stop", "-t", stop_timeout_secs(name), name]).await { + Ok(out) => out, + Err(e) => { + tracing::warn!( + container = %name, + error = %e, + "podman stop errored, trying force stop" + ); + match force_stop_runtime_container(name).await { + Ok(()) => { + tracing::info!(container = %name, "force stop after stop error succeeded"); + continue; + } + Err(force_err) => { + tracing::error!( + "Failed to stop {}: {}; force stop failed: {}", + name, + e, + force_err + ); + errors.push(format!("{}: {}; force stop failed: {}", name, e, force_err)); + continue; + } + } + } + }; if !out.status.success() { let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); if is_missing_companion_ok(name, &stderr) { tracing::debug!(container = %name, "companion already absent during stop"); continue; } - tracing::error!("Failed to stop {}: {}", name, stderr); - errors.push(format!("{}: {}", name, stderr)); + tracing::warn!("Failed to stop {}: {}, trying force stop", name, stderr); + match force_stop_runtime_container(name).await { + Ok(()) => { + tracing::info!(container = %name, "force stop after stop failure succeeded") + } + Err(e) => { + tracing::error!( + "Failed to stop {}: {}; force stop failed: {}", + name, + stderr, + e + ); + errors.push(format!("{}: {}; force stop failed: {}", name, stderr, e)); + } + } } } if !errors.is_empty() { @@ -801,6 +1119,7 @@ async fn do_package_restart(containers: &[String]) -> Result<()> { for name in containers { tracing::info!("Restarting container: {}", name); repair_before_package_start(name).await; + wait_before_package_start(name).await; let out = podman_control(&["restart", "-t", stop_timeout_secs(name), name]) .await .context(format!("Failed to exec podman restart {}", name))?; @@ -818,7 +1137,8 @@ async fn do_package_restart(containers: &[String]) -> Result<()> { ); // Fallback: stop then start let _ = podman_control(&["stop", "-t", stop_timeout_secs(name), name]).await; - let start_out = podman_control(&["start", name]) + wait_before_package_start(name).await; + let start_out = podman_start_container(name) .await .context(format!("Failed to exec podman start {}", name))?; if !start_out.status.success() { @@ -855,22 +1175,158 @@ fn is_unknown_app_id_error(err: &anyhow::Error) -> bool { async fn repair_before_package_start(container_name: &str) { match container_name { "btcpay-server" | "archy-nbxplorer" => repair_btcpay_dirs().await, - "indeedhub-postgres" | "indeedhub-redis" | "indeedhub-minio" | "indeedhub-relay" - | "indeedhub-api" | "indeedhub-ffmpeg" | "indeedhub" => repair_indeedhub_network().await, + "indeedhub" => repair_indeedhub_network().await, + "immich_server" => repair_immich_dirs().await, + "netbird" => repair_netbird_network().await, "grafana" => { repair_grafana_dirs().await; - cleanup_stale_pasta_port("3000").await; } - "vaultwarden" => cleanup_stale_pasta_port("8082").await, - "homeassistant" | "home-assistant" => cleanup_stale_pasta_port("8123").await, "nextcloud" => { repair_nextcloud_dirs().await; - cleanup_stale_pasta_port("8085").await; } "nginx-proxy-manager" => repair_nginx_proxy_manager_container().await, - "gitea" => cleanup_gitea_stale_ports().await, _ => {} } + cleanup_runtime_host_ports(container_name).await; +} + +async fn wait_before_package_start(container_name: &str) { + match container_name { + "indeedhub" => wait_for_indeedhub_dependency_dns().await, + "immich_server" => wait_for_immich_dependencies().await, + "netbird" => wait_for_netbird_dependency_dns().await, + _ => {} + } +} + +async fn wait_for_indeedhub_dependency_dns() { + for _ in 0..30 { + if indeedhub_frontend_dependencies_running().await { + super::stacks::repair_indeedhub_network_aliases().await; + break; + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } + + for _ in 0..30 { + let ready = podman_with_timeout( + &["exec", "indeedhub-minio", "getent", "hosts", "minio"], + Duration::from_secs(5), + ) + .await + .map(|out| out.status.success()) + .unwrap_or(false); + if ready { + return; + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } +} + +async fn indeedhub_frontend_dependencies_running() -> bool { + for container in ["indeedhub-minio", "indeedhub-redis", "indeedhub-api"] { + if !container_is_running(container).await { + return false; + } + } + true +} + +async fn container_is_running(container: &str) -> bool { + let Ok(output) = podman_with_timeout( + &["inspect", container, "--format", "{{.State.Status}}"], + Duration::from_secs(5), + ) + .await + else { + return false; + }; + + output.status.success() && String::from_utf8_lossy(&output.stdout).trim() == "running" +} + +async fn wait_for_netbird_dependency_dns() { + for _ in 0..30 { + if container_is_running("netbird-server").await + && container_is_running("netbird-dashboard").await + { + super::stacks::repair_netbird_network_aliases().await; + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + return; + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } +} + +async fn wait_for_immich_dependencies() { + for _ in 0..60 { + if immich_postgres_ready().await && immich_redis_ready().await { + return; + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } +} + +async fn immich_postgres_ready() -> bool { + if container_health_is_healthy("immich_postgres").await { + return true; + } + + let Ok(output) = podman_with_timeout( + &[ + "exec", + "immich_postgres", + "pg_isready", + "-U", + "postgres", + "-d", + "immich", + ], + Duration::from_secs(5), + ) + .await + else { + return false; + }; + output.status.success() +} + +async fn immich_redis_ready() -> bool { + if container_health_is_healthy("immich_redis").await { + return true; + } + + let Ok(output) = podman_with_timeout( + &["exec", "immich_redis", "valkey-cli", "ping"], + Duration::from_secs(5), + ) + .await + else { + return false; + }; + output.status.success() && String::from_utf8_lossy(&output.stdout).contains("PONG") +} + +async fn container_health_is_healthy(container: &str) -> bool { + let Ok(output) = podman_with_timeout( + &[ + "inspect", + container, + "--format", + "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}", + ], + Duration::from_secs(5), + ) + .await + else { + return false; + }; + + output.status.success() && String::from_utf8_lossy(&output.stdout).trim() == "healthy" +} + +async fn repair_netbird_network() { + super::stacks::repair_netbird_network_aliases().await; } async fn repair_nginx_proxy_manager_container() { @@ -1009,11 +1465,11 @@ async fn recreate_nginx_proxy_manager_container() -> Result<()> { } async fn ensure_runtime_host_port_listener(container_name: &str) -> Result<()> { - let Some(port) = runtime_required_host_port(container_name) else { + let Some(port) = runtime_host_ports(container_name).into_iter().next() else { return Ok(()); }; - if wait_for_runtime_host_port(port, 10).await { + if wait_for_runtime_host_port(container_name, port, 10).await { return Ok(()); } @@ -1035,7 +1491,7 @@ async fn ensure_runtime_host_port_listener(container_name: &str) -> Result<()> { )); } - if wait_for_runtime_host_port(port, 60).await { + if wait_for_runtime_host_port(container_name, port, 60).await { install_log(&format!( "START REPAIR OK: {} — host port {} is listening after restart", container_name, port @@ -1051,27 +1507,99 @@ async fn ensure_runtime_host_port_listener(container_name: &str) -> Result<()> { )) } -fn runtime_required_host_port(container_name: &str) -> Option { - match container_name { - "grafana" => Some(3000), - "homeassistant" | "home-assistant" => Some(8123), - "searxng" => Some(8888), - "uptime-kuma" => Some(3002), - "vaultwarden" => Some(8082), - "gitea" => Some(3001), - "nextcloud" => Some(8085), - "nginx-proxy-manager" => Some(8081), - _ => None, +fn runtime_host_ports(container_name: &str) -> Vec { + let manifest_ports = manifest_host_ports(container_name); + if !manifest_ports.is_empty() { + return with_legacy_extra_ports(container_name, manifest_ports); + } + + let ports = match container_name { + "grafana" => vec![3000], + "homeassistant" | "home-assistant" => vec![8123], + "jellyfin" => vec![8096], + "searxng" => vec![8888], + "uptime-kuma" => vec![3002], + "vaultwarden" => vec![8082], + "gitea" => vec![3001, 2222, 3000], + "nextcloud" => vec![8085], + "nginx-proxy-manager" => vec![8081, 8084, 8444], + _ => Vec::new(), + }; + ports +} + +fn with_legacy_extra_ports(container_name: &str, mut ports: Vec) -> Vec { + if container_name == "gitea" && !ports.contains(&3000) { + ports.push(3000); + } + if container_name == "nginx-proxy-manager" { + for port in [8084, 8444] { + if !ports.contains(&port) { + ports.push(port); + } + } + } + ports +} + +fn manifest_host_ports(container_name: &str) -> Vec { + for apps_dir in manifest_apps_dirs() { + let Ok(entries) = std::fs::read_dir(apps_dir) else { + continue; + }; + for entry in entries.flatten() { + let path = entry.path().join("manifest.yml"); + let Ok(contents) = std::fs::read_to_string(&path) else { + continue; + }; + let Ok(manifest) = AppManifest::parse(&contents) else { + continue; + }; + if manifest_container_name(&manifest) == container_name { + return manifest.app.ports.iter().map(|p| p.host).collect(); + } + } + } + Vec::new() +} + +fn manifest_apps_dirs() -> Vec { + let mut dirs = Vec::new(); + if let Ok(manifest_dir) = std::env::var("CARGO_MANIFEST_DIR") { + dirs.push(Path::new(&manifest_dir).join("../../apps")); + } + dirs.extend([ + Path::new("apps").to_path_buf(), + Path::new("/opt/archipelago/apps").to_path_buf(), + Path::new("/opt/archipelago/web-ui/archipelago-runtime/apps").to_path_buf(), + ]); + dirs +} + +fn manifest_container_name(manifest: &AppManifest) -> String { + if let Some(v) = manifest.app.extensions.get("container_name") { + if let Some(s) = v.as_str() { + if !s.is_empty() { + return s.to_string(); + } + } + } + match manifest.app.id.as_str() { + "bitcoin-ui" | "electrs-ui" | "lnd-ui" => format!("archy-{}", manifest.app.id), + id => id.to_string(), } } -async fn wait_for_runtime_host_port(port: u16, timeout_secs: u64) -> bool { +async fn wait_for_runtime_host_port(container_name: &str, port: u16, timeout_secs: u64) -> bool { let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); loop { - if tokio::net::TcpStream::connect(("127.0.0.1", port)) - .await - .is_ok() - { + let ready = match container_name { + "uptime-kuma" => http_host_port_ready(port, "/").await, + _ => tokio::net::TcpStream::connect(("127.0.0.1", port)) + .await + .is_ok(), + }; + if ready { return true; } @@ -1083,6 +1611,37 @@ async fn wait_for_runtime_host_port(port: u16, timeout_secs: u64) -> bool { } } +async fn http_host_port_ready(port: u16, path: &str) -> bool { + let Ok(Ok(mut stream)) = tokio::time::timeout( + std::time::Duration::from_secs(3), + tokio::net::TcpStream::connect(("127.0.0.1", port)), + ) + .await + else { + return false; + }; + + let request = format!("GET {path} HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n"); + if stream.write_all(request.as_bytes()).await.is_err() { + return false; + } + + let mut buf = [0u8; 128]; + let Ok(Ok(n)) = + tokio::time::timeout(std::time::Duration::from_secs(3), stream.read(&mut buf)).await + else { + return false; + }; + if n == 0 { + return false; + } + let head = String::from_utf8_lossy(&buf[..n]); + head.starts_with("HTTP/1.1 2") + || head.starts_with("HTTP/1.1 3") + || head.starts_with("HTTP/1.0 2") + || head.starts_with("HTTP/1.0 3") +} + async fn repair_btcpay_dirs() { let _ = tokio::process::Command::new("sudo") .args([ @@ -1157,6 +1716,27 @@ async fn repair_nextcloud_dirs() { } } +async fn repair_immich_dirs() { + let _ = tokio::process::Command::new("sudo") + .args(["mkdir", "-p", "/var/lib/archipelago/immich"]) + .output() + .await; + let podman_chown = podman_control(&[ + "unshare", + "chown", + "-R", + "0:0", + "/var/lib/archipelago/immich", + ]) + .await; + if !podman_chown.as_ref().is_ok_and(|o| o.status.success()) { + let _ = tokio::process::Command::new("sudo") + .args(["chown", "-R", "1000:1000", "/var/lib/archipelago/immich"]) + .output() + .await; + } +} + async fn repair_btcpay_database_password() { let Ok(db_pass) = tokio::fs::read_to_string("/var/lib/archipelago/secrets/btcpay-db-password").await @@ -1205,25 +1785,28 @@ async fn cleanup_start_conflict(container_name: &str, stderr: &str) { return; } - if container_name == "gitea" { - cleanup_gitea_stale_ports().await; + let ports = runtime_host_ports(container_name); + if !ports.is_empty() { + cleanup_ports(&ports).await; return; } +} - match container_name { - "grafana" => cleanup_stale_pasta_port("3000").await, - "homeassistant" | "home-assistant" => cleanup_stale_pasta_port("8123").await, - "vaultwarden" => cleanup_stale_pasta_port("8082").await, - "nextcloud" => cleanup_stale_pasta_port("8085").await, - "nginx-proxy-manager" => cleanup_nginx_proxy_manager_ports().await, - _ => {} +async fn cleanup_runtime_host_ports(container_name: &str) { + let ports = runtime_host_ports(container_name); + if !ports.is_empty() { + cleanup_ports(&ports).await; } } async fn cleanup_nginx_proxy_manager_ports() { - cleanup_stale_pasta_port("8081").await; - cleanup_stale_pasta_port("8084").await; - cleanup_stale_pasta_port("8444").await; + cleanup_ports(&[8081, 8084, 8444]).await; +} + +async fn cleanup_ports(ports: &[u16]) { + for port in ports { + cleanup_stale_pasta_port(&port.to_string()).await; + } } async fn cleanup_stale_pasta_port(port: &str) { @@ -1249,31 +1832,6 @@ async fn cleanup_stale_pasta_port(port: &str) { tokio::time::sleep(std::time::Duration::from_secs(1)).await; } -async fn cleanup_gitea_stale_ports() { - for port in ["3001", "2222", "3000"] { - let kill_listener = format!( - "ss -ltnp 'sport = :{}' 2>/dev/null | sed -n 's/.*pid=\\([0-9]*\\).*/\\1/p' | xargs -r kill 2>/dev/null || true", - port - ); - let _ = tokio::process::Command::new("sh") - .args(["-c", &kill_listener]) - .output() - .await; - - let pattern = format!("pasta.*{}", port); - let _ = tokio::process::Command::new("pkill") - .args(["-f", &pattern]) - .output() - .await; - let pattern = format!("rootlessport.*{}", port); - let _ = tokio::process::Command::new("pkill") - .args(["-f", &pattern]) - .output() - .await; - } - tokio::time::sleep(std::time::Duration::from_secs(1)).await; -} - pub(super) fn is_missing_companion_ok(name: &str, stderr: &str) -> bool { matches!( name, @@ -1352,3 +1910,20 @@ pub(super) fn orchestrator_uninstall_app_ids(package_id: &str) -> Vec { _ => vec![package_id.to_string()], } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn runtime_host_ports_are_manifest_derived_for_public_apps() { + assert_eq!(runtime_host_ports("photoprism"), vec![2342]); + assert_eq!(runtime_host_ports("jellyfin"), vec![8096]); + assert_eq!(runtime_host_ports("uptime-kuma"), vec![3002]); + } + + #[test] + fn runtime_host_ports_preserve_legacy_extra_ports() { + assert_eq!(runtime_host_ports("gitea"), vec![3001, 2222, 3000]); + } +} diff --git a/core/archipelago/src/api/rpc/package/stacks.rs b/core/archipelago/src/api/rpc/package/stacks.rs index 77211240..b1e434a8 100644 --- a/core/archipelago/src/api/rpc/package/stacks.rs +++ b/core/archipelago/src/api/rpc/package/stacks.rs @@ -7,10 +7,134 @@ use crate::api::rpc::RpcHandler; use crate::data_model::InstallPhase; use anyhow::{Context, Result}; use base64::Engine; +use std::process::Output; +use std::time::Duration; use tracing::info; use super::install::{install_log, patch_indeedhub_nostr_provider}; +const PODMAN_STACK_PROBE_TIMEOUT: Duration = Duration::from_secs(10); +const PODMAN_STACK_LOG_TIMEOUT: Duration = Duration::from_secs(15); +const PODMAN_STACK_PULL_TIMEOUT: Duration = Duration::from_secs(600); +const PODMAN_STACK_REMOVE_TIMEOUT: Duration = Duration::from_secs(90); + +async fn podman_stack_output(args: &[&str], timeout: Duration) -> Result { + let mut cmd = tokio::process::Command::new("podman"); + cmd.args(args); + cmd.kill_on_drop(true); + tokio::time::timeout(timeout, cmd.output()) + .await + .with_context(|| { + format!( + "podman {} timed out after {}s", + args.join(" "), + timeout.as_secs() + ) + })? + .with_context(|| format!("failed to run podman {}", args.join(" "))) +} + +async fn podman_stack_status(args: &[&str], timeout: Duration) -> Result { + let mut cmd = tokio::process::Command::new("podman"); + cmd.args(args); + cmd.kill_on_drop(true); + tokio::time::timeout(timeout, cmd.status()) + .await + .with_context(|| { + format!( + "podman {} timed out after {}s", + args.join(" "), + timeout.as_secs() + ) + })? + .with_context(|| format!("failed to run podman {}", args.join(" "))) +} + +async fn force_remove_stack_container(container_name: &str) -> Result<()> { + for args in [ + vec!["rm", "-f", container_name], + vec!["rm", "-f", "--time", "0", container_name], + ] { + let output = podman_stack_output(&args, PODMAN_STACK_REMOVE_TIMEOUT).await?; + if output.status.success() + || is_missing_container_error(&String::from_utf8_lossy(&output.stderr)) + { + return wait_for_stack_container_absent(container_name, Duration::from_secs(90)).await; + } + } + + let _ = podman_stack_output( + &["container", "cleanup", container_name], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; + let output = + podman_stack_output(&["rm", "-f", container_name], PODMAN_STACK_REMOVE_TIMEOUT).await?; + if output.status.success() + || is_missing_container_error(&String::from_utf8_lossy(&output.stderr)) + { + return wait_for_stack_container_absent(container_name, Duration::from_secs(90)).await; + } + + Err(anyhow::anyhow!( + "force remove {} failed: {}", + container_name, + String::from_utf8_lossy(&output.stderr).trim() + )) +} + +async fn wait_for_stack_container_absent(container_name: &str, timeout: Duration) -> Result<()> { + let deadline = std::time::Instant::now() + timeout; + loop { + let output = match podman_stack_output( + &["ps", "-a", "--format", "{{.Names}}"], + PODMAN_STACK_REMOVE_TIMEOUT, + ) + .await + { + Ok(output) => output, + Err(e) if std::time::Instant::now() < deadline => { + tracing::warn!( + container = %container_name, + error = %e, + "inspect during stack removal wait failed; retrying" + ); + tokio::time::sleep(Duration::from_secs(2)).await; + continue; + } + Err(e) => return Err(e), + }; + if output.status.success() + && !String::from_utf8_lossy(&output.stdout) + .lines() + .any(|line| line.trim() == container_name) + { + return Ok(()); + } + + if std::time::Instant::now() >= deadline { + let state = format!( + "stdout={} stderr={}", + String::from_utf8_lossy(&output.stdout).trim(), + String::from_utf8_lossy(&output.stderr).trim() + ); + return Err(anyhow::anyhow!( + "container {} still exists after removal attempt: {}", + container_name, + state + )); + } + tokio::time::sleep(Duration::from_secs(2)).await; + } +} + +fn is_missing_container_error(stderr: &str) -> bool { + stderr.contains("no such container") + || stderr.contains("no container with name") + || stderr.contains("does not exist") + || stderr.contains("not found") +} + /// Adopt an existing container stack: start all named containers and return success. /// Returns `Ok(Some(json))` if the primary container was found (adopted), /// or `Ok(None)` if it was not found (proceed with fresh install). @@ -19,11 +143,12 @@ async fn adopt_stack_if_exists( stack_name: &str, all_containers: &[&str], ) -> Result> { - let check = tokio::process::Command::new("podman") - .args(["ps", "-a", "--format", "{{.Names}}"]) - .output() - .await - .context("Failed to list containers")?; + let check = podman_stack_output( + &["ps", "-a", "--format", "{{.Names}}"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await + .context("Failed to list containers")?; let stdout = String::from_utf8_lossy(&check.stdout); let names: Vec<&str> = stdout.lines().map(|l| l.trim()).collect(); @@ -45,10 +170,9 @@ async fn adopt_stack_if_exists( for container in all_containers { if names.iter().any(|n| n == container) { - let _ = tokio::process::Command::new("podman") - .args(["start", container]) - .output() - .await; + let _ = + podman_stack_output(["start", container].as_slice(), PODMAN_STACK_PROBE_TIMEOUT) + .await; } } let existing: Vec<&str> = all_containers @@ -76,10 +200,6 @@ async fn adopt_stack_if_exists( async fn repair_stack_before_adopt(stack_name: &str) { match stack_name { - "saleor" => { - repair_saleor_network_aliases().await; - let _ = start_saleor_storefront_containers().await; - } "btcpay" | "btcpay-server" => { let _ = tokio::process::Command::new("sudo") .args([ @@ -102,16 +222,17 @@ async fn repair_stack_before_adopt(stack_name: &str) { } } "indeedhub" => repair_indeedhub_network_aliases().await, - "netbird" => repair_netbird_unified_origin().await, + "netbird" => repair_netbird_network_aliases().await, _ => {} } } pub(in crate::api::rpc::package) async fn repair_indeedhub_network_aliases() { - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "indeedhub-net"]) - .output() - .await; + let _ = podman_stack_output( + &["network", "create", "indeedhub-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; for (container, alias) in [ ("indeedhub-postgres", "postgres"), @@ -121,210 +242,117 @@ pub(in crate::api::rpc::package) async fn repair_indeedhub_network_aliases() { ("indeedhub-api", "api"), ("indeedhub", "indeedhub"), ] { - let exists = tokio::process::Command::new("podman") - .args(["container", "exists", container]) - .status() - .await - .map(|s| s.success()) - .unwrap_or(false); + let exists = podman_stack_status( + &["container", "exists", container], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await + .map(|s| s.success()) + .unwrap_or(false); if !exists { continue; } + if indeedhub_alias_present(container, alias).await { + continue; + } - let _ = tokio::process::Command::new("podman") - .args(["network", "disconnect", "-f", "indeedhub-net", container]) - .output() - .await; - let _ = tokio::process::Command::new("podman") - .args([ + let _ = podman_stack_output( + &["network", "disconnect", "-f", "indeedhub-net", container], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; + let _ = podman_stack_output( + &[ "network", "connect", "--alias", alias, "indeedhub-net", container, - ]) - .output() - .await; + ], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; } } -async fn repair_netbird_unified_origin() { - let host_ip = detect_netbird_public_host_ip() - .await - .unwrap_or_else(|| "127.0.0.1".to_string()); - let _ = write_netbird_config_files(&host_ip).await; - - for container in ["netbird", "netbird-dashboard"] { - let _ = tokio::process::Command::new("podman") - .args(["rm", "-f", container]) - .output() - .await; - } - - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "netbird-net"]) - .output() - .await; - - let _ = pull_image_with_retry(NETBIRD_DASHBOARD_IMAGE).await; - let _ = pull_image_with_retry(NETBIRD_PROXY_IMAGE).await; - - let _ = tokio::process::Command::new("podman") - .args([ - "network", - "disconnect", - "-f", - "netbird-net", - "netbird-server", - ]) - .output() - .await; - let _ = tokio::process::Command::new("podman") - .args([ - "network", - "connect", - "--alias", - "netbird-server", - "netbird-net", - "netbird-server", - ]) - .output() - .await; - let _ = tokio::process::Command::new("podman") - .args(["restart", "netbird-server"]) - .output() - .await; - - tokio::time::sleep(std::time::Duration::from_secs(2)).await; - - let _ = tokio::process::Command::new("podman") - .args([ - "run", - "-d", - "--name", - "netbird-dashboard", - "--network", - "netbird-net", - "--restart=unless-stopped", - "--env-file", - "/var/lib/archipelago/netbird/dashboard.env", - NETBIRD_DASHBOARD_IMAGE, - ]) - .output() - .await; - - let _ = tokio::process::Command::new("podman") - .args([ - "run", - "-d", - "--name", - "netbird", - "--network", - "netbird-net", - "--restart=unless-stopped", - "-p", - "8087:80", - "-v", - "/var/lib/archipelago/netbird/nginx.conf:/etc/nginx/conf.d/default.conf:ro", - NETBIRD_PROXY_IMAGE, - ]) - .output() - .await; +async fn indeedhub_alias_present(container: &str, alias: &str) -> bool { + network_alias_present("indeedhub-net", container, alias).await } -async fn repair_saleor_network_aliases() { - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "saleor-net"]) - .output() - .await; +async fn network_alias_present(network_name: &str, container: &str, alias: &str) -> bool { + let output = match podman_stack_output( + &[ + "inspect", + container, + "--format", + "{{json .NetworkSettings.Networks}}", + ], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await + { + Ok(output) if output.status.success() => output, + _ => return false, + }; + + let Ok(networks) = serde_json::from_slice::(&output.stdout) else { + return false; + }; + networks + .get(network_name) + .and_then(|network| network.get("Aliases")) + .and_then(|aliases| aliases.as_array()) + .map(|aliases| aliases.iter().any(|value| value.as_str() == Some(alias))) + .unwrap_or(false) +} + +pub(in crate::api::rpc::package) async fn repair_netbird_network_aliases() { + let _ = podman_stack_output( + &["network", "create", "netbird-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; for (container, alias) in [ - ("saleor-db", "db"), - ("saleor-cache", "cache"), - ("saleor-jaeger", "jaeger"), - ("saleor-mailpit", "mailpit"), - ("saleor-api", "api"), - ("saleor-worker", "worker"), - ("saleor", "saleor"), - ("saleor-storefront", "storefront"), - ("saleor-storefront-app", "storefront-app"), + ("netbird-server", "netbird-server"), + ("netbird-dashboard", "netbird-dashboard"), + ("netbird", "netbird"), ] { - let exists = tokio::process::Command::new("podman") - .args(["container", "exists", container]) - .status() - .await - .map(|s| s.success()) - .unwrap_or(false); + let exists = podman_stack_status( + &["container", "exists", container], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await + .map(|s| s.success()) + .unwrap_or(false); if !exists { continue; } + if network_alias_present("netbird-net", container, alias).await { + continue; + } - let _ = tokio::process::Command::new("podman") - .args(["network", "disconnect", "-f", "saleor-net", container]) - .output() - .await; - let _ = tokio::process::Command::new("podman") - .args([ + let _ = podman_stack_output( + &["network", "disconnect", "-f", "netbird-net", container], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; + let _ = podman_stack_output( + &[ "network", "connect", "--alias", alias, - "saleor-net", + "netbird-net", container, - ]) - .output() - .await; + ], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; } } -async fn start_saleor_storefront_containers() -> Result<()> { - let names = podman_container_names().await?; - - if !names.iter().any(|name| name == "saleor-storefront-app") { - pull_image_with_retry(SALEOR_STOREFRONT_IMAGE).await?; - let mut storefront_cmd = saleor_storefront_app_command(); - run_required_stack_command("saleor", "create storefront app", &mut storefront_cmd).await?; - } else { - let _ = tokio::process::Command::new("podman") - .args(["start", "saleor-storefront-app"]) - .output() - .await; - } - - write_saleor_storefront_proxy_config().await?; - if !names.iter().any(|name| name == "saleor-storefront") { - let mut proxy_cmd = saleor_storefront_proxy_command(); - run_required_stack_command("saleor", "create storefront proxy", &mut proxy_cmd).await?; - } else { - let _ = tokio::process::Command::new("podman") - .args(["start", "saleor-storefront"]) - .output() - .await; - } - - wait_for_stack_containers( - "saleor", - &["saleor-storefront-app", "saleor-storefront"], - 60, - ) - .await -} - -async fn podman_container_names() -> Result> { - let output = tokio::process::Command::new("podman") - .args(["ps", "-a", "--format", "{{.Names}}"]) - .output() - .await - .context("Failed to list containers")?; - Ok(String::from_utf8_lossy(&output.stdout) - .lines() - .map(str::trim) - .filter(|name| !name.is_empty()) - .map(ToOwned::to_owned) - .collect()) -} - async fn run_required_stack_command( stack_name: &str, label: &str, @@ -345,6 +373,62 @@ async fn run_required_stack_command( Err(anyhow::anyhow!("{} {}", stack_name, msg.trim())) } +async fn run_required_scoped_podman_stack_command( + stack_name: &str, + label: &str, + args: Vec, +) -> Result<()> { + run_required_scoped_podman_stack_command_with_env(stack_name, label, args, &[]).await +} + +async fn run_required_scoped_podman_stack_command_with_env( + stack_name: &str, + label: &str, + args: Vec, + env: &[String], +) -> Result<()> { + let mut cmd = tokio::process::Command::new("systemd-run"); + cmd.args(["--user", "--scope", "--quiet", "--collect"]); + for item in env { + cmd.arg(format!("--setenv={item}")); + } + cmd.arg("podman"); + cmd.args(&args); + let output = cmd + .output() + .await + .with_context(|| format!("{}: failed to run scoped {}", stack_name, label))?; + if output.status.success() { + return Ok(()); + } + + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + if matches!(args.first().map(String::as_str), Some("run" | "create")) { + let msg = format!("{} scoped creation failed: {}{}", label, stdout, stderr); + install_log(&format!("INSTALL FAIL: {} - {}", stack_name, msg.trim())).await; + return Err(anyhow::anyhow!("{} {}", stack_name, msg.trim())); + } + + install_log(&format!( + "INSTALL WARN: {} - scoped {} failed: {}{}; trying direct podman", + stack_name, label, stdout, stderr + )) + .await; + + let refs = args.iter().map(String::as_str).collect::>(); + let direct = podman_stack_output(&refs, PODMAN_STACK_PULL_TIMEOUT).await?; + if direct.status.success() { + return Ok(()); + } + + let stdout = String::from_utf8_lossy(&direct.stdout); + let stderr = String::from_utf8_lossy(&direct.stderr); + let msg = format!("{} failed: {}{}", label, stdout, stderr); + install_log(&format!("INSTALL FAIL: {} - {}", stack_name, msg.trim())).await; + Err(anyhow::anyhow!("{} {}", stack_name, msg.trim())) +} + async fn wait_for_stack_containers( stack_name: &str, containers: &[&str], @@ -354,10 +438,11 @@ async fn wait_for_stack_containers( loop { let mut pending = Vec::new(); for container in containers { - let status = tokio::process::Command::new("podman") - .args(["inspect", container, "--format", "{{.State.Status}}"]) - .output() - .await; + let status = podman_stack_output( + &["inspect", container, "--format", "{{.State.Status}}"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; match status { Ok(output) if output.status.success() => { let state = String::from_utf8_lossy(&output.stdout).trim().to_string(); @@ -413,17 +498,82 @@ async fn wait_for_stack_containers( } } +async fn wait_for_stack_container_health( + stack_name: &str, + container: &str, + timeout_secs: u64, +) -> Result<()> { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); + loop { + let status = podman_stack_output( + &[ + "inspect", + container, + "--format", + "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}", + ], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; + match status { + Ok(output) if output.status.success() => { + let health = String::from_utf8_lossy(&output.stdout).trim().to_string(); + if health == "healthy" { + return Ok(()); + } + if health == "unhealthy" { + let logs = stack_container_logs(container, 80).await; + install_log(&format!( + "INSTALL UNHEALTHY: {} - container {} unhealthy. Logs:\n{}", + stack_name, + container, + logs.chars().take(1000).collect::() + )) + .await; + } + } + Ok(output) => { + install_log(&format!( + "INSTALL HEALTH WAIT: {} - {} inspect failed: {}", + stack_name, + container, + String::from_utf8_lossy(&output.stderr).trim() + )) + .await; + } + Err(e) => { + install_log(&format!( + "INSTALL HEALTH WAIT: {} - {} inspect error: {}", + stack_name, container, e + )) + .await; + } + } + + if std::time::Instant::now() >= deadline { + return Err(anyhow::anyhow!( + "{} container {} did not become healthy within {}s", + stack_name, + container, + timeout_secs + )); + } + tokio::time::sleep(std::time::Duration::from_secs(3)).await; + } +} + async fn stack_container_logs(container: &str, lines: u32) -> String { - tokio::process::Command::new("podman") - .args(["logs", "--tail", &lines.to_string(), container]) - .output() - .await - .map(|o| { - let stdout = String::from_utf8_lossy(&o.stdout); - let stderr = String::from_utf8_lossy(&o.stderr); - format!("{}{}", stdout, stderr) - }) - .unwrap_or_default() + podman_stack_output( + &["logs", "--tail", &lines.to_string(), container], + PODMAN_STACK_LOG_TIMEOUT, + ) + .await + .map(|o| { + let stdout = String::from_utf8_lossy(&o.stdout); + let stderr = String::from_utf8_lossy(&o.stderr); + format!("{}{}", stdout, stderr) + }) + .unwrap_or_default() } async fn install_stack_via_orchestrator( @@ -495,20 +645,10 @@ const REGISTRY: &str = "146.59.87.168:3000/lfg2025"; const NETBIRD_DASHBOARD_IMAGE: &str = "docker.io/netbirdio/dashboard:v2.38.0"; const NETBIRD_SERVER_IMAGE: &str = "docker.io/netbirdio/netbird-server:0.71.2"; const NETBIRD_PROXY_IMAGE: &str = "docker.io/library/nginx:1.27-alpine"; -const SALEOR_API_IMAGE: &str = "ghcr.io/saleor/saleor:3.23"; -const SALEOR_DASHBOARD_IMAGE: &str = "ghcr.io/saleor/saleor-dashboard:3.23"; -const SALEOR_STOREFRONT_IMAGE: &str = "146.59.87.168:3000/lfg2025/saleor-storefront:6eb0b97"; -const SALEOR_POSTGRES_IMAGE: &str = "docker.io/library/postgres:15-alpine"; -const SALEOR_VALKEY_IMAGE: &str = "docker.io/valkey/valkey:8.1-alpine"; -const SALEOR_JAEGER_IMAGE: &str = "docker.io/jaegertracing/jaeger:latest"; -const SALEOR_MAILPIT_IMAGE: &str = "docker.io/axllent/mailpit:latest"; /// Pull an image with retry and exponential backoff (3 attempts). async fn pull_image_with_retry(image: &str) -> Result<()> { - let exists = tokio::process::Command::new("podman") - .args(["image", "exists", image]) - .status() - .await; + let exists = podman_stack_status(&["image", "exists", image], PODMAN_STACK_PROBE_TIMEOUT).await; if matches!(exists, Ok(status) if status.success()) { return Ok(()); } @@ -522,10 +662,16 @@ async fn pull_image_with_retry(image: &str) -> Result<()> { if archipelago_container::image_uses_insecure_registry(image) { cmd.arg("--tls-verify=false"); } - let output = cmd - .arg(image) - .output() + cmd.arg(image); + cmd.kill_on_drop(true); + let output = tokio::time::timeout(PODMAN_STACK_PULL_TIMEOUT, cmd.output()) .await + .with_context(|| { + format!( + "podman pull {image} timed out after {}s", + PODMAN_STACK_PULL_TIMEOUT.as_secs() + ) + })? .context("Failed to execute podman pull")?; if output.status.success() { @@ -557,75 +703,6 @@ async fn pull_image_with_retry(image: &str) -> Result<()> { unreachable!() } -fn saleor_storefront_app_command() -> tokio::process::Command { - let mut cmd = tokio::process::Command::new("podman"); - cmd.args([ - "run", - "-d", - "--name", - "saleor-storefront-app", - "--network", - "saleor-net", - "--network-alias", - "storefront-app", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--cap-add=CHOWN", - "--cap-add=DAC_OVERRIDE", - "--cap-add=FOWNER", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=512m", - "--pids-limit=2048", - "-e", - "NEXT_PUBLIC_SALEOR_API_URL=http://api:8000/graphql/", - "-e", - "NEXT_PUBLIC_SALEOR_MEDIA_URL=http://api:8000/", - "-e", - "NEXT_PUBLIC_STOREFRONT_URL=http://localhost:9011", - "-e", - "NEXT_PUBLIC_DEFAULT_CHANNEL=default-channel", - "-e", - "HOSTNAME=0.0.0.0", - "-e", - "PORT=3000", - SALEOR_STOREFRONT_IMAGE, - ]); - cmd -} - -fn saleor_storefront_proxy_command() -> tokio::process::Command { - let mut cmd = tokio::process::Command::new("podman"); - cmd.args([ - "run", - "-d", - "--name", - "saleor-storefront", - "--network", - "saleor-net", - "--network-alias", - "storefront", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--cap-add=CHOWN", - "--cap-add=DAC_OVERRIDE", - "--cap-add=FOWNER", - "--cap-add=NET_BIND_SERVICE", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=128m", - "--pids-limit=1024", - "-p", - "9011:80", - "-v", - "/var/lib/archipelago/saleor-storefront/nginx.conf:/etc/nginx/conf.d/default.conf:ro", - NETBIRD_PROXY_IMAGE, - ]); - cmd -} - impl RpcHandler { /// Install Immich stack (postgres + redis + server). pub(super) async fn install_immich_stack(&self) -> Result { @@ -640,21 +717,16 @@ impl RpcHandler { } // Clean up stale "immich" container (old naming) before fresh install - let check = tokio::process::Command::new("podman") - .args(["ps", "-a", "--format", "{{.Names}}"]) - .output() - .await - .context("Failed to list containers")?; + let check = podman_stack_output( + &["ps", "-a", "--format", "{{.Names}}"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await + .context("Failed to list containers")?; let stdout = String::from_utf8_lossy(&check.stdout); if stdout.contains("immich\n") || stdout.lines().any(|l| l.trim() == "immich") { - let _ = tokio::process::Command::new("podman") - .args(["stop", "immich"]) - .output() - .await; - let _ = tokio::process::Command::new("podman") - .args(["rm", "-f", "immich"]) - .output() - .await; + let _ = podman_stack_output(&["stop", "immich"], PODMAN_STACK_PROBE_TIMEOUT).await; + let _ = force_remove_stack_container("immich").await; } let images = [ @@ -694,123 +766,129 @@ impl RpcHandler { ]) .output() .await; - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "immich-net"]) - .output() - .await; + let _ = podman_stack_output( + &["network", "create", "immich-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; let db_pass = super::config::read_or_generate_secret("immich-db-password").await; - let _ = tokio::process::Command::new("podman") - .args([ - "run", - "-d", - "--name", - "immich_postgres", - "--restart", - "unless-stopped", - "--network", - "immich-net", - "--network-alias", - "immich_postgres", - "--cap-drop=ALL", - "--cap-add=CHOWN", - "--cap-add=DAC_OVERRIDE", - "--cap-add=FOWNER", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=2g", - "--pids-limit=4096", - "--health-cmd=pg_isready -U postgres || exit 1", - "--health-interval=30s", - "--health-retries=3", - "-v", - "/var/lib/archipelago/immich-db:/var/lib/postgresql/data", - "-e", - &format!("POSTGRES_PASSWORD={}", db_pass), - "-e", - "POSTGRES_USER=postgres", - "-e", - "POSTGRES_DB=immich", - "146.59.87.168:3000/lfg2025/immich-postgres:14-vectorchord0.4.3-pgvectors0.2.0", - ]) - .output() - .await; + let mut postgres_cmd = tokio::process::Command::new("podman"); + postgres_cmd.args([ + "run", + "-d", + "--name", + "immich_postgres", + "--restart", + "unless-stopped", + "--network", + "immich-net", + "--network-alias", + "immich_postgres", + "--cap-drop=ALL", + "--cap-add=CHOWN", + "--cap-add=DAC_OVERRIDE", + "--cap-add=FOWNER", + "--cap-add=SETGID", + "--cap-add=SETUID", + "--security-opt=no-new-privileges:true", + "--memory=2g", + "--pids-limit=4096", + "--health-cmd=pg_isready -U postgres || exit 1", + "--health-interval=30s", + "--health-retries=3", + "-v", + "/var/lib/archipelago/immich-db:/var/lib/postgresql/data", + "-e", + &format!("POSTGRES_PASSWORD={}", db_pass), + "-e", + "POSTGRES_USER=postgres", + "-e", + "POSTGRES_DB=immich", + "146.59.87.168:3000/lfg2025/immich-postgres:14-vectorchord0.4.3-pgvectors0.2.0", + ]); + run_required_stack_command("immich", "create postgres", &mut postgres_cmd).await?; tokio::time::sleep(std::time::Duration::from_secs(5)).await; - let _ = tokio::process::Command::new("podman") - .args([ - "run", - "-d", - "--name", - "immich_redis", - "--restart", - "unless-stopped", - "--network", - "immich-net", - "--network-alias", - "immich_redis", - "--cap-drop=ALL", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=128m", - "--pids-limit=2048", - "--health-cmd=valkey-cli ping || exit 1", - "--health-interval=30s", - "--health-retries=3", - "146.59.87.168:3000/lfg2025/valkey:7-alpine", - ]) - .output() - .await; + let mut redis_cmd = tokio::process::Command::new("podman"); + redis_cmd.args([ + "run", + "-d", + "--name", + "immich_redis", + "--restart", + "unless-stopped", + "--network", + "immich-net", + "--network-alias", + "immich_redis", + "--cap-drop=ALL", + "--cap-add=SETGID", + "--cap-add=SETUID", + "--security-opt=no-new-privileges:true", + "--memory=128m", + "--pids-limit=2048", + "--health-cmd=valkey-cli ping || exit 1", + "--health-interval=30s", + "--health-retries=3", + "146.59.87.168:3000/lfg2025/valkey:7-alpine", + ]); + run_required_stack_command("immich", "create redis", &mut redis_cmd).await?; tokio::time::sleep(std::time::Duration::from_secs(2)).await; - let run = tokio::process::Command::new("podman") - .args([ - "run", - "-d", - "--name", - "immich_server", - "--restart", - "unless-stopped", - "--network", - "immich-net", - "--network-alias", - "immich_server", - "--cap-drop=ALL", - "--security-opt=no-new-privileges:true", - "--memory=2g", - "--pids-limit=4096", - "-p", - "2283:2283", - "-v", - "/var/lib/archipelago/immich:/usr/src/app/upload", - "-e", - "DB_HOSTNAME=immich_postgres", - "-e", - "DB_USERNAME=postgres", - "-e", - &format!("DB_PASSWORD={}", db_pass), - "-e", - "DB_DATABASE_NAME=immich", - "-e", - "REDIS_HOSTNAME=immich_redis", - "-e", - "UPLOAD_LOCATION=/usr/src/app/upload", - "146.59.87.168:3000/lfg2025/immich-server:release", - ]) - .output() - .await - .context("Failed to start immich_server")?; - - if !run.status.success() { - let stderr = String::from_utf8_lossy(&run.stderr); - return Err(anyhow::anyhow!("Failed to start Immich server: {}", stderr)); - } + run_required_scoped_podman_stack_command( + "immich", + "create server", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "immich_server".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "--network".to_string(), + "immich-net".to_string(), + "--network-alias".to_string(), + "immich_server".to_string(), + "--cap-drop=ALL".to_string(), + "--security-opt=no-new-privileges:true".to_string(), + "--memory=2g".to_string(), + "--pids-limit=4096".to_string(), + "--health-cmd=curl -sf http://localhost:2283/api/server/ping || exit 1".to_string(), + "--health-interval=30s".to_string(), + "--health-retries=20".to_string(), + "--health-start-period=180s".to_string(), + "-p".to_string(), + "2283:2283".to_string(), + "-v".to_string(), + "/var/lib/archipelago/immich:/usr/src/app/upload".to_string(), + "-e".to_string(), + "DB_HOSTNAME=immich_postgres".to_string(), + "-e".to_string(), + "DB_USERNAME=postgres".to_string(), + "-e".to_string(), + format!("DB_PASSWORD={}", db_pass), + "-e".to_string(), + "DB_DATABASE_NAME=immich".to_string(), + "-e".to_string(), + "REDIS_HOSTNAME=immich_redis".to_string(), + "-e".to_string(), + "UPLOAD_LOCATION=/usr/src/app/upload".to_string(), + "146.59.87.168:3000/lfg2025/immich-server:release".to_string(), + ], + ) + .await?; self.set_install_phase("immich", InstallPhase::WaitingHealthy) .await; + wait_for_stack_containers( + "immich", + &["immich_postgres", "immich_redis", "immich_server"], + 120, + ) + .await?; + wait_for_stack_container_health("immich", "immich_server", 360).await?; self.set_install_phase("immich", InstallPhase::PostInstall) .await; self.set_install_phase("immich", InstallPhase::Done).await; @@ -896,10 +974,11 @@ impl RpcHandler { } // Ensure archy-net exists - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "archy-net"]) - .output() - .await; + let _ = podman_stack_output( + &["network", "create", "archy-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; // 1. PostgreSQL let _ = tokio::process::Command::new("podman") @@ -1119,10 +1198,11 @@ impl RpcHandler { } // Ensure archy-net exists - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "archy-net"]) - .output() - .await; + let _ = podman_stack_output( + &["network", "create", "archy-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; // 1. MariaDB let _ = tokio::process::Command::new("podman") @@ -1336,15 +1416,15 @@ impl RpcHandler { "indeedhub-build_api_1", "indeedhub-build_ffmpeg-worker_1", ] { - let _ = tokio::process::Command::new("podman") - .args(["rm", "-f", name]) - .status() - .await; + force_remove_stack_container(name) + .await + .with_context(|| format!("Failed to clear stale IndeedHub container {}", name))?; } - let _ = tokio::process::Command::new("podman") - .args(["network", "rm", "-f", "indeedhub-net"]) - .status() - .await; + let _ = podman_stack_status( + &["network", "rm", "-f", "indeedhub-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; // Phase: CreatingContainer — pulls done, network rebuilt, now spinning // up the 7 stack containers. Bar advances from PullingImage band into @@ -1353,10 +1433,11 @@ impl RpcHandler { .await; // Create indeedhub-net - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "indeedhub-net"]) - .status() - .await; + let _ = podman_stack_status( + &["network", "create", "indeedhub-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; // Generate secrets let db_pass = super::config::read_or_generate_secret("indeedhub-db-password").await; @@ -1364,231 +1445,247 @@ impl RpcHandler { let minio_user = "indeeadmin"; let minio_pass = super::config::read_or_generate_secret("indeedhub-minio-password").await; + let tmp_env = [format!("TMPDIR={user_tmp}")]; + // 1. Postgres - let mut postgres_cmd = tokio::process::Command::new("podman"); - postgres_cmd - .args([ - "run", - "-d", - "--name", - "indeedhub-postgres", - "--network", - "indeedhub-net", - "--network-alias", - "postgres", - "--restart", - "unless-stopped", - "-e", - "POSTGRES_DB=indeedhub", - "-e", - "POSTGRES_USER=indeedhub", - "-e", - &format!("POSTGRES_PASSWORD={}", db_pass), - "-v", - "indeedhub-postgres-data:/var/lib/postgresql/data", - &format!("{}/postgres:16.13-alpine", registry), - ]) - .env("TMPDIR", &user_tmp); - run_required_stack_command("indeedhub", "create postgres", &mut postgres_cmd).await?; + run_required_scoped_podman_stack_command_with_env( + "indeedhub", + "create postgres", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "indeedhub-postgres".to_string(), + "--network".to_string(), + "indeedhub-net".to_string(), + "--network-alias".to_string(), + "postgres".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "-e".to_string(), + "POSTGRES_DB=indeedhub".to_string(), + "-e".to_string(), + "POSTGRES_USER=indeedhub".to_string(), + "-e".to_string(), + format!("POSTGRES_PASSWORD={db_pass}"), + "-v".to_string(), + "indeedhub-postgres-data:/var/lib/postgresql/data".to_string(), + format!("{registry}/postgres:16.13-alpine"), + ], + &tmp_env, + ) + .await?; // 2. Redis - let mut redis_cmd = tokio::process::Command::new("podman"); - redis_cmd - .args([ - "run", - "-d", - "--name", - "indeedhub-redis", - "--network", - "indeedhub-net", - "--network-alias", - "redis", - "--restart", - "unless-stopped", - "-v", - "indeedhub-redis-data:/data", - &format!("{}/redis:7.4.8-alpine", registry), - ]) - .env("TMPDIR", &user_tmp); - run_required_stack_command("indeedhub", "create redis", &mut redis_cmd).await?; + run_required_scoped_podman_stack_command_with_env( + "indeedhub", + "create redis", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "indeedhub-redis".to_string(), + "--network".to_string(), + "indeedhub-net".to_string(), + "--network-alias".to_string(), + "redis".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "-v".to_string(), + "indeedhub-redis-data:/data".to_string(), + format!("{registry}/redis:7.4.8-alpine"), + ], + &tmp_env, + ) + .await?; // 3. MinIO - let mut minio_cmd = tokio::process::Command::new("podman"); - minio_cmd - .args([ - "run", - "-d", - "--name", - "indeedhub-minio", - "--network", - "indeedhub-net", - "--network-alias", - "minio", - "--restart", - "unless-stopped", - "-e", - &format!("MINIO_ROOT_USER={}", minio_user), - "-e", - &format!("MINIO_ROOT_PASSWORD={}", minio_pass), - "-v", - "indeedhub-minio-data:/data", - &format!("{}/minio:RELEASE.2024-11-07T00-52-20Z", registry), - "server", - "/data", - ]) - .env("TMPDIR", &user_tmp); - run_required_stack_command("indeedhub", "create minio", &mut minio_cmd).await?; + run_required_scoped_podman_stack_command_with_env( + "indeedhub", + "create minio", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "indeedhub-minio".to_string(), + "--network".to_string(), + "indeedhub-net".to_string(), + "--network-alias".to_string(), + "minio".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "-e".to_string(), + format!("MINIO_ROOT_USER={minio_user}"), + "-e".to_string(), + format!("MINIO_ROOT_PASSWORD={minio_pass}"), + "-v".to_string(), + "indeedhub-minio-data:/data".to_string(), + format!("{registry}/minio:RELEASE.2024-11-07T00-52-20Z"), + "server".to_string(), + "/data".to_string(), + ], + &tmp_env, + ) + .await?; // 4. Nostr relay - let mut relay_cmd = tokio::process::Command::new("podman"); - relay_cmd - .args([ - "run", - "-d", - "--name", - "indeedhub-relay", - "--network", - "indeedhub-net", - "--network-alias", - "relay", - "--restart", - "unless-stopped", - "-v", - "indeedhub-relay-data:/usr/src/app/db", - &format!("{}/nostr-rs-relay:0.9.0", registry), - ]) - .env("TMPDIR", &user_tmp); - run_required_stack_command("indeedhub", "create relay", &mut relay_cmd).await?; + run_required_scoped_podman_stack_command_with_env( + "indeedhub", + "create relay", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "indeedhub-relay".to_string(), + "--network".to_string(), + "indeedhub-net".to_string(), + "--network-alias".to_string(), + "relay".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "-v".to_string(), + "indeedhub-relay-data:/usr/src/app/db".to_string(), + format!("{registry}/nostr-rs-relay:0.9.0"), + ], + &tmp_env, + ) + .await?; // 5. API - let mut api_cmd = tokio::process::Command::new("podman"); - api_cmd - .args([ - "run", - "-d", - "--name", - "indeedhub-api", - "--network", - "indeedhub-net", - "--network-alias", - "api", - "--restart", - "unless-stopped", - "-e", - "PORT=4000", - "-e", - "DATABASE_HOST=postgres", - "-e", - "DATABASE_PORT=5432", - "-e", - "DATABASE_USER=indeedhub", - "-e", - &format!("DATABASE_PASSWORD={}", db_pass), - "-e", - "DATABASE_NAME=indeedhub", - "-e", - "QUEUE_HOST=redis", - "-e", - "QUEUE_PORT=6379", - "-e", - "S3_ENDPOINT=http://minio:9000", - "-e", - "AWS_REGION=us-east-1", - "-e", - &format!("AWS_ACCESS_KEY={}", minio_user), - "-e", - &format!("AWS_SECRET_KEY={}", minio_pass), - "-e", - "S3_PUBLIC_BUCKET_NAME=indeedhub-public", - "-e", - "S3_PRIVATE_BUCKET_NAME=indeedhub-private", - "-e", - "S3_PUBLIC_BUCKET_URL=/storage", - "-e", - &format!("NOSTR_JWT_SECRET={}", jwt_secret), - "-e", - "NOSTR_JWT_EXPIRES_IN=7d", - "-e", - "AES_MASTER_SECRET=0123456789abcdef0123456789abcdef", - "-e", - "ENVIRONMENT=production", - &format!("{}/indeedhub-api:1.0.0", registry), - ]) - .env("TMPDIR", &user_tmp); - run_required_stack_command("indeedhub", "create api", &mut api_cmd).await?; + run_required_scoped_podman_stack_command_with_env( + "indeedhub", + "create api", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "indeedhub-api".to_string(), + "--network".to_string(), + "indeedhub-net".to_string(), + "--network-alias".to_string(), + "api".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "-e".to_string(), + "PORT=4000".to_string(), + "-e".to_string(), + "DATABASE_HOST=postgres".to_string(), + "-e".to_string(), + "DATABASE_PORT=5432".to_string(), + "-e".to_string(), + "DATABASE_USER=indeedhub".to_string(), + "-e".to_string(), + format!("DATABASE_PASSWORD={db_pass}"), + "-e".to_string(), + "DATABASE_NAME=indeedhub".to_string(), + "-e".to_string(), + "QUEUE_HOST=redis".to_string(), + "-e".to_string(), + "QUEUE_PORT=6379".to_string(), + "-e".to_string(), + "S3_ENDPOINT=http://minio:9000".to_string(), + "-e".to_string(), + "AWS_REGION=us-east-1".to_string(), + "-e".to_string(), + format!("AWS_ACCESS_KEY={minio_user}"), + "-e".to_string(), + format!("AWS_SECRET_KEY={minio_pass}"), + "-e".to_string(), + "S3_PUBLIC_BUCKET_NAME=indeedhub-public".to_string(), + "-e".to_string(), + "S3_PRIVATE_BUCKET_NAME=indeedhub-private".to_string(), + "-e".to_string(), + "S3_PUBLIC_BUCKET_URL=/storage".to_string(), + "-e".to_string(), + format!("NOSTR_JWT_SECRET={jwt_secret}"), + "-e".to_string(), + "NOSTR_JWT_EXPIRES_IN=7d".to_string(), + "-e".to_string(), + "AES_MASTER_SECRET=0123456789abcdef0123456789abcdef".to_string(), + "-e".to_string(), + "ENVIRONMENT=production".to_string(), + format!("{registry}/indeedhub-api:1.0.0"), + ], + &tmp_env, + ) + .await?; // 6. FFmpeg worker - let mut ffmpeg_cmd = tokio::process::Command::new("podman"); - ffmpeg_cmd - .args([ - "run", - "-d", - "--name", - "indeedhub-ffmpeg", - "--network", - "indeedhub-net", - "--restart", - "unless-stopped", - "-e", - "DATABASE_HOST=postgres", - "-e", - "DATABASE_PORT=5432", - "-e", - "DATABASE_USER=indeedhub", - "-e", - &format!("DATABASE_PASSWORD={}", db_pass), - "-e", - "DATABASE_NAME=indeedhub", - "-e", - "QUEUE_HOST=redis", - "-e", - "QUEUE_PORT=6379", - "-e", - "S3_ENDPOINT=http://minio:9000", - "-e", - &format!("AWS_ACCESS_KEY={}", minio_user), - "-e", - &format!("AWS_SECRET_KEY={}", minio_pass), - "-e", - "AWS_REGION=us-east-1", - "-e", - "S3_PUBLIC_BUCKET_NAME=indeedhub-public", - "-e", - "S3_PRIVATE_BUCKET_NAME=indeedhub-private", - "-e", - "ENVIRONMENT=production", - "-e", - "AES_MASTER_SECRET=0123456789abcdef0123456789abcdef", - &format!("{}/indeedhub-ffmpeg:1.0.0", registry), - ]) - .env("TMPDIR", &user_tmp); - run_required_stack_command("indeedhub", "create ffmpeg worker", &mut ffmpeg_cmd).await?; + run_required_scoped_podman_stack_command_with_env( + "indeedhub", + "create ffmpeg worker", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "indeedhub-ffmpeg".to_string(), + "--network".to_string(), + "indeedhub-net".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "-e".to_string(), + "DATABASE_HOST=postgres".to_string(), + "-e".to_string(), + "DATABASE_PORT=5432".to_string(), + "-e".to_string(), + "DATABASE_USER=indeedhub".to_string(), + "-e".to_string(), + format!("DATABASE_PASSWORD={db_pass}"), + "-e".to_string(), + "DATABASE_NAME=indeedhub".to_string(), + "-e".to_string(), + "QUEUE_HOST=redis".to_string(), + "-e".to_string(), + "QUEUE_PORT=6379".to_string(), + "-e".to_string(), + "S3_ENDPOINT=http://minio:9000".to_string(), + "-e".to_string(), + format!("AWS_ACCESS_KEY={minio_user}"), + "-e".to_string(), + format!("AWS_SECRET_KEY={minio_pass}"), + "-e".to_string(), + "AWS_REGION=us-east-1".to_string(), + "-e".to_string(), + "S3_PUBLIC_BUCKET_NAME=indeedhub-public".to_string(), + "-e".to_string(), + "S3_PRIVATE_BUCKET_NAME=indeedhub-private".to_string(), + "-e".to_string(), + "ENVIRONMENT=production".to_string(), + "-e".to_string(), + "AES_MASTER_SECRET=0123456789abcdef0123456789abcdef".to_string(), + format!("{registry}/indeedhub-ffmpeg:1.0.0"), + ], + &tmp_env, + ) + .await?; // Wait for backend services to start tokio::time::sleep(std::time::Duration::from_secs(5)).await; // 7. Frontend (nginx) - let mut frontend_cmd = tokio::process::Command::new("podman"); - frontend_cmd - .args([ - "run", - "-d", - "--name", - "indeedhub", - "--network", - "indeedhub-net", - "--restart", - "unless-stopped", - "--tmpfs", - "/run:rw,nosuid,nodev,size=16m", - "--tmpfs", - "/var/cache/nginx:rw,nosuid,nodev,size=32m", - "-p", - "7778:7777", - &format!("{}/indeedhub:1.0.0", registry), - ]) - .env("TMPDIR", &user_tmp); - run_required_stack_command("indeedhub", "create frontend", &mut frontend_cmd).await?; + run_required_scoped_podman_stack_command_with_env( + "indeedhub", + "create frontend", + vec![ + "run".to_string(), + "-d".to_string(), + "--name".to_string(), + "indeedhub".to_string(), + "--network".to_string(), + "indeedhub-net".to_string(), + "--restart".to_string(), + "unless-stopped".to_string(), + "--tmpfs".to_string(), + "/run:rw,nosuid,nodev,size=16m".to_string(), + "--tmpfs".to_string(), + "/var/cache/nginx:rw,nosuid,nodev,size=32m".to_string(), + "-p".to_string(), + "7778:7777".to_string(), + format!("{}/indeedhub:1.0.0", registry), + ], + &tmp_env, + ) + .await?; wait_for_stack_containers( "indeedhub", @@ -1661,15 +1758,13 @@ impl RpcHandler { self.set_install_progress("netbird", 3, 3).await; for name in ["netbird", "netbird-dashboard", "netbird-server"] { - let _ = tokio::process::Command::new("podman") - .args(["rm", "-f", name]) - .status() - .await; + let _ = podman_stack_status(&["rm", "-f", name], PODMAN_STACK_PROBE_TIMEOUT).await; } - let _ = tokio::process::Command::new("podman") - .args(["network", "rm", "-f", "netbird-net"]) - .status() - .await; + let _ = podman_stack_status( + &["network", "rm", "-f", "netbird-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; self.set_install_phase("netbird", InstallPhase::CreatingContainer) .await; @@ -1683,10 +1778,11 @@ impl RpcHandler { .unwrap_or_else(|| self.config.host_ip.clone()); write_netbird_config_files(&host_ip).await?; - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "netbird-net"]) - .status() - .await; + let _ = podman_stack_status( + &["network", "create", "netbird-net"], + PODMAN_STACK_PROBE_TIMEOUT, + ) + .await; let mut server_cmd = tokio::process::Command::new("podman"); server_cmd.args([ @@ -1771,500 +1867,6 @@ impl RpcHandler { "message": "NetBird self-hosted stack installed", })) } - - /// Install Saleor stack (PostgreSQL + Valkey + API + worker + dashboard + Jaeger + Mailpit). - pub(super) async fn install_saleor_stack(&self) -> Result { - if let Some(adopted) = adopt_stack_if_exists( - "saleor", - "saleor", - &[ - "saleor-db", - "saleor-cache", - "saleor-jaeger", - "saleor-mailpit", - "saleor-api", - "saleor-worker", - "saleor", - "saleor-storefront", - "saleor-storefront-app", - ], - ) - .await? - { - return Ok(adopted); - } - - install_log("INSTALL START: saleor stack (postgres + valkey + api + worker + dashboard + storefront)") - .await; - info!("Installing Saleor stack"); - - let images = [ - SALEOR_POSTGRES_IMAGE, - SALEOR_VALKEY_IMAGE, - SALEOR_API_IMAGE, - SALEOR_DASHBOARD_IMAGE, - SALEOR_STOREFRONT_IMAGE, - SALEOR_JAEGER_IMAGE, - SALEOR_MAILPIT_IMAGE, - ]; - self.set_install_phase("saleor", InstallPhase::PullingImage) - .await; - let n_images = images.len() as u64; - for (i, image) in images.iter().enumerate() { - self.set_install_progress("saleor", i as u64, n_images) - .await; - pull_image_with_retry(image) - .await - .with_context(|| format!("Failed to pull Saleor image: {}", image))?; - } - self.set_install_progress("saleor", n_images, n_images) - .await; - - for name in [ - "saleor", - "saleor-api", - "saleor-worker", - "saleor-storefront", - "saleor-storefront-app", - "saleor-db", - "saleor-cache", - "saleor-jaeger", - "saleor-mailpit", - ] { - let _ = tokio::process::Command::new("podman") - .args(["rm", "-f", name]) - .status() - .await; - } - let _ = tokio::process::Command::new("podman") - .args(["network", "rm", "-f", "saleor-net"]) - .status() - .await; - - self.set_install_phase("saleor", InstallPhase::CreatingContainer) - .await; - - let _ = tokio::process::Command::new("sudo") - .args([ - "mkdir", - "-p", - "/var/lib/archipelago/saleor", - "/var/lib/archipelago/saleor-db", - "/var/lib/archipelago/saleor-cache", - "/var/lib/archipelago/saleor-storefront", - ]) - .output() - .await; - let user = std::env::var("USER").unwrap_or_else(|_| "archipelago".to_string()); - for dir in [ - "/var/lib/archipelago/saleor", - "/var/lib/archipelago/saleor-db", - "/var/lib/archipelago/saleor-cache", - "/var/lib/archipelago/saleor-storefront", - ] { - let _ = tokio::process::Command::new("sudo") - .args(["chown", "-R", &format!("{}:{}", user, user), dir]) - .output() - .await; - } - - let _ = tokio::process::Command::new("podman") - .args(["network", "create", "saleor-net"]) - .status() - .await; - - let db_pass = super::config::read_or_generate_secret("saleor-db-password").await; - let secret_key = super::config::read_or_generate_secret("saleor-secret-key").await; - let admin_pass = super::config::read_or_generate_secret("saleor-admin-password").await; - let host_ip = &self.config.host_ip; - let dashboard_origin = format!("http://{}:9010", host_ip); - let dashboard_url = format!("{}/", dashboard_origin); - let api_url = format!("http://{}:8000/graphql/", host_ip); - let storefront_origin = format!("http://{}:9011", host_ip); - let allowed_hosts = format!("localhost,127.0.0.1,api,saleor-api,{}", host_ip); - let allowed_client_hosts = format!( - "{},{},http://localhost:9010,http://127.0.0.1:9010,http://localhost:9011,http://127.0.0.1:9011", - dashboard_origin, storefront_origin - ); - let database_url = format!("postgres://saleor:{}@db/saleor", db_pass); - - let mut db_cmd = tokio::process::Command::new("podman"); - db_cmd.args([ - "run", - "-d", - "--name", - "saleor-db", - "--network", - "saleor-net", - "--network-alias", - "db", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--cap-add=CHOWN", - "--cap-add=DAC_OVERRIDE", - "--cap-add=FOWNER", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=512m", - "--pids-limit=4096", - "--health-cmd=pg_isready -U saleor || exit 1", - "--health-interval=30s", - "--health-retries=3", - "-v", - "/var/lib/archipelago/saleor-db:/var/lib/postgresql/data", - "-e", - "POSTGRES_USER=saleor", - "-e", - &format!("POSTGRES_PASSWORD={}", db_pass), - "-e", - "POSTGRES_DB=saleor", - SALEOR_POSTGRES_IMAGE, - ]); - run_required_stack_command("saleor", "create postgres", &mut db_cmd).await?; - - let mut cache_cmd = tokio::process::Command::new("podman"); - cache_cmd.args([ - "run", - "-d", - "--name", - "saleor-cache", - "--network", - "saleor-net", - "--network-alias", - "cache", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=128m", - "--pids-limit=2048", - "--health-cmd=valkey-cli ping || exit 1", - "--health-interval=30s", - "--health-retries=3", - SALEOR_VALKEY_IMAGE, - ]); - run_required_stack_command("saleor", "create cache", &mut cache_cmd).await?; - - let mut jaeger_cmd = tokio::process::Command::new("podman"); - jaeger_cmd.args([ - "run", - "-d", - "--name", - "saleor-jaeger", - "--network", - "saleor-net", - "--network-alias", - "jaeger", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--security-opt=no-new-privileges:true", - "--memory=512m", - "--pids-limit=4096", - "-p", - "16686:16686", - "-p", - "4317:4317", - "-p", - "4318:4318", - "--tmpfs", - "/tmp:rw,nosuid,nodev,size=128m", - SALEOR_JAEGER_IMAGE, - ]); - run_required_stack_command("saleor", "create jaeger", &mut jaeger_cmd).await?; - - let mut mailpit_cmd = tokio::process::Command::new("podman"); - mailpit_cmd.args([ - "run", - "-d", - "--name", - "saleor-mailpit", - "--network", - "saleor-net", - "--network-alias", - "mailpit", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--security-opt=no-new-privileges:true", - "--memory=128m", - "--pids-limit=2048", - "-p", - "1025:1025", - "-p", - "8025:8025", - SALEOR_MAILPIT_IMAGE, - ]); - run_required_stack_command("saleor", "create mailpit", &mut mailpit_cmd).await?; - - tokio::time::sleep(std::time::Duration::from_secs(8)).await; - - let saleor_env = vec![ - "-e".to_string(), - "CACHE_URL=redis://cache:6379/0".to_string(), - "-e".to_string(), - "CELERY_BROKER_URL=redis://cache:6379/1".to_string(), - "-e".to_string(), - format!("DATABASE_URL={}", database_url), - "-e".to_string(), - "DEFAULT_CHANNEL_SLUG=default-channel".to_string(), - "-e".to_string(), - "DEFAULT_FROM_EMAIL=noreply@example.com".to_string(), - "-e".to_string(), - "EMAIL_URL=smtp://mailpit:1025".to_string(), - "-e".to_string(), - format!("SECRET_KEY={}", secret_key), - "-e".to_string(), - "OTEL_SERVICE_NAME=saleor".to_string(), - "-e".to_string(), - "OTEL_TRACES_EXPORTER=otlp".to_string(), - "-e".to_string(), - "OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4317".to_string(), - "-e".to_string(), - "HTTP_IP_FILTER_ALLOW_LOOPBACK_IPS=True".to_string(), - "-e".to_string(), - "HTTP_IP_FILTER_ENABLED=False".to_string(), - "-e".to_string(), - format!("DASHBOARD_URL={}", dashboard_url), - "-e".to_string(), - format!("ALLOWED_CLIENT_HOSTS={}", allowed_client_hosts), - "-e".to_string(), - format!("ALLOWED_GRAPHQL_ORIGINS={}", allowed_client_hosts), - "-e".to_string(), - format!("ALLOWED_HOSTS={}", allowed_hosts), - ]; - - let mut migrate_cmd = tokio::process::Command::new("podman"); - migrate_cmd.args([ - "run", - "--rm", - "--network", - "saleor-net", - "-v", - "/var/lib/archipelago/saleor:/app/media", - ]); - migrate_cmd.args(&saleor_env); - migrate_cmd.args([SALEOR_API_IMAGE, "python3", "manage.py", "migrate"]); - run_required_stack_command("saleor", "run migrations", &mut migrate_cmd).await?; - - let mut populate_cmd = tokio::process::Command::new("podman"); - populate_cmd.args([ - "run", - "--rm", - "--network", - "saleor-net", - "-v", - "/var/lib/archipelago/saleor:/app/media", - ]); - populate_cmd.args(&saleor_env); - populate_cmd.args([ - SALEOR_API_IMAGE, - "python3", - "manage.py", - "populatedb", - "--createsuperuser", - ]); - let populate = populate_cmd.output().await; - if let Ok(output) = populate { - if !output.status.success() { - install_log(&format!( - "INSTALL WARN: saleor - populate sample data skipped: {}{}", - String::from_utf8_lossy(&output.stdout), - String::from_utf8_lossy(&output.stderr) - )) - .await; - } - } - - let admin_script = format!( - r#"from django.contrib.auth import get_user_model -User = get_user_model() -user, _ = User.objects.get_or_create(email="admin@example.com", defaults={{"is_staff": True, "is_superuser": True}}) -user.is_staff = True -user.is_superuser = True -user.set_password({:?}) -user.save() -"#, - admin_pass - ); - let mut admin_cmd = tokio::process::Command::new("podman"); - admin_cmd.args([ - "run", - "--rm", - "--network", - "saleor-net", - "-v", - "/var/lib/archipelago/saleor:/app/media", - ]); - admin_cmd.args(&saleor_env); - admin_cmd.args([ - SALEOR_API_IMAGE, - "python3", - "manage.py", - "shell", - "-c", - &admin_script, - ]); - run_required_stack_command("saleor", "create or update admin user", &mut admin_cmd).await?; - install_log("INSTALL INFO: saleor admin email admin@example.com; password stored in /var/lib/archipelago/secrets/saleor-admin-password").await; - - let mut api_cmd = tokio::process::Command::new("podman"); - api_cmd.args([ - "run", - "-d", - "--name", - "saleor-api", - "--network", - "saleor-net", - "--network-alias", - "api", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--cap-add=CHOWN", - "--cap-add=DAC_OVERRIDE", - "--cap-add=FOWNER", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=1g", - "--pids-limit=4096", - "-p", - "8000:8000", - "-v", - "/var/lib/archipelago/saleor:/app/media", - ]); - api_cmd.args(&saleor_env); - api_cmd.arg(SALEOR_API_IMAGE); - run_required_stack_command("saleor", "create api", &mut api_cmd).await?; - - let mut worker_cmd = tokio::process::Command::new("podman"); - worker_cmd.args([ - "run", - "-d", - "--name", - "saleor-worker", - "--network", - "saleor-net", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--cap-add=CHOWN", - "--cap-add=DAC_OVERRIDE", - "--cap-add=FOWNER", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=1g", - "--pids-limit=4096", - "-v", - "/var/lib/archipelago/saleor:/app/media", - ]); - worker_cmd.args(&saleor_env); - worker_cmd.args([ - SALEOR_API_IMAGE, - "celery", - "-A", - "saleor", - "--app=saleor.celeryconf:app", - "worker", - "--loglevel=info", - "-B", - ]); - run_required_stack_command("saleor", "create worker", &mut worker_cmd).await?; - - self.set_install_phase("saleor", InstallPhase::StartingContainer) - .await; - tokio::time::sleep(std::time::Duration::from_secs(5)).await; - - let mut dashboard_cmd = tokio::process::Command::new("podman"); - dashboard_cmd.args([ - "run", - "-d", - "--name", - "saleor", - "--network", - "saleor-net", - "--restart=unless-stopped", - "--cap-drop=ALL", - "--cap-add=CHOWN", - "--cap-add=DAC_OVERRIDE", - "--cap-add=FOWNER", - "--cap-add=NET_BIND_SERVICE", - "--cap-add=SETGID", - "--cap-add=SETUID", - "--security-opt=no-new-privileges:true", - "--memory=256m", - "--pids-limit=2048", - "-p", - "9010:80", - "-e", - &format!("API_URL={}", api_url), - "-e", - "APP_MOUNT_URI=/", - SALEOR_DASHBOARD_IMAGE, - ]); - run_required_stack_command("saleor", "create dashboard", &mut dashboard_cmd).await?; - - let mut storefront_cmd = saleor_storefront_app_command(); - run_required_stack_command("saleor", "create storefront app", &mut storefront_cmd).await?; - - write_saleor_storefront_proxy_config().await?; - let mut storefront_proxy_cmd = saleor_storefront_proxy_command(); - run_required_stack_command( - "saleor", - "create storefront proxy", - &mut storefront_proxy_cmd, - ) - .await?; - - wait_for_stack_containers( - "saleor", - &[ - "saleor-db", - "saleor-cache", - "saleor-jaeger", - "saleor-mailpit", - "saleor-api", - "saleor-worker", - "saleor", - "saleor-storefront", - "saleor-storefront-app", - ], - 120, - ) - .await?; - tokio::time::sleep(std::time::Duration::from_secs(5)).await; - wait_for_stack_containers( - "saleor", - &[ - "saleor-db", - "saleor-cache", - "saleor-jaeger", - "saleor-mailpit", - "saleor-api", - "saleor-worker", - "saleor", - "saleor-storefront", - "saleor-storefront-app", - ], - 30, - ) - .await?; - - self.set_install_phase("saleor", InstallPhase::WaitingHealthy) - .await; - self.set_install_phase("saleor", InstallPhase::PostInstall) - .await; - self.set_install_phase("saleor", InstallPhase::Done).await; - self.clear_install_progress("saleor").await; - - install_log("INSTALL OK: saleor stack").await; - info!("Saleor stack installed"); - Ok(serde_json::json!({ - "success": true, - "package_id": "saleor", - "message": "Saleor stack installed (9 containers)", - })) - } } async fn read_or_generate_b64_secret(name: &str) -> String { @@ -2283,71 +1885,6 @@ async fn read_or_generate_b64_secret(name: &str) -> String { secret } -async fn write_saleor_storefront_proxy_config() -> Result<()> { - tokio::fs::create_dir_all("/var/lib/archipelago/saleor-storefront") - .await - .context("Failed to create Saleor storefront config directory")?; - - let nginx_conf = r#"map $http_x_forwarded_proto $saleor_storefront_proto { - default $http_x_forwarded_proto; - "" $scheme; -} - -server { - listen 80; - server_name _; - resolver 10.89.4.1 valid=10s ipv6=off; - - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Host $host; - proxy_set_header X-Forwarded-Proto $scheme; - - location ^~ /graphql/ { - set $saleor_api http://api:8000/graphql/; - proxy_pass $saleor_api; - proxy_set_header Host api; - proxy_set_header Origin ""; - } - - location ^~ /thumbnail/ { - set $saleor_media http://api:8000$request_uri; - proxy_pass $saleor_media; - proxy_set_header Host api; - proxy_set_header Origin ""; - } - - location ^~ /media/ { - set $saleor_media http://api:8000$request_uri; - proxy_pass $saleor_media; - proxy_set_header Host api; - proxy_set_header Origin ""; - } - - location / { - set $saleor_storefront_app http://storefront-app:3000; - proxy_pass $saleor_storefront_app; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - proxy_set_header Accept-Encoding ""; - sub_filter_once off; - sub_filter_types text/html application/javascript text/javascript; - sub_filter 'http://api:8000/graphql/' '$saleor_storefront_proto://$host/graphql/'; - } -} -"#; - - tokio::fs::write( - "/var/lib/archipelago/saleor-storefront/nginx.conf", - nginx_conf, - ) - .await - .context("Failed to write Saleor storefront nginx.conf")?; - Ok(()) -} - async fn write_netbird_config_files(host_ip: &str) -> Result<()> { let public_origin = format!("http://{}:8087", host_ip); let server_origin = format!("http://{}:8086", host_ip); diff --git a/core/archipelago/src/api/rpc/package/update.rs b/core/archipelago/src/api/rpc/package/update.rs index 90541499..f48044a6 100644 --- a/core/archipelago/src/api/rpc/package/update.rs +++ b/core/archipelago/src/api/rpc/package/update.rs @@ -16,6 +16,8 @@ use anyhow::{Context, Result}; use tokio::io::{AsyncBufReadExt, BufReader}; use tracing::{error, info, warn}; +const PODMAN_UPDATE_PULL_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); + impl RpcHandler { /// Update a package to the version pinned in image-versions.sh. /// This is a manual operation — the user clicks "Update" in the UI. @@ -327,6 +329,7 @@ impl RpcHandler { if archipelago_container::image_uses_insecure_registry(image) { cmd.arg("--tls-verify=false"); } + cmd.kill_on_drop(true); let mut child = cmd .arg(image) .stdout(std::process::Stdio::piped()) @@ -334,23 +337,38 @@ impl RpcHandler { .spawn() .context("Failed to start image pull")?; - if let Some(stderr) = child.stderr.take() { + let progress_task = if let Some(stderr) = child.stderr.take() { let reader = BufReader::new(stderr); let mut lines = reader.lines(); let pkg_id = package_id.to_string(); let state_mgr = self.state_manager.clone(); - while let Ok(Some(line)) = lines.next_line().await { - if let Some((downloaded, total)) = parse_pull_progress(&line) { - Self::update_install_progress(&state_mgr, &pkg_id, downloaded, total).await; + Some(tokio::spawn(async move { + while let Ok(Some(line)) = lines.next_line().await { + if let Some((downloaded, total)) = parse_pull_progress(&line) { + Self::update_install_progress(&state_mgr, &pkg_id, downloaded, total).await; + } } - } - } + })) + } else { + None + }; - let status = child - .wait() - .await - .context("Failed to wait for image pull")?; + let status = match tokio::time::timeout(PODMAN_UPDATE_PULL_TIMEOUT, child.wait()).await { + Ok(result) => result.context("Failed to wait for image pull")?, + Err(_) => { + let _ = child.kill().await; + return Err(anyhow::anyhow!( + "podman pull {} timed out after {}s", + image, + PODMAN_UPDATE_PULL_TIMEOUT.as_secs() + )); + } + }; + + if let Some(task) = progress_task { + let _ = task.await; + } if !status.success() { return Err(anyhow::anyhow!("podman pull {} failed", image)); } @@ -430,7 +448,6 @@ fn should_try_orchestrator_update(package_id: &str, orchestrator_available: bool fn orchestrator_update_app_id(package_id: &str) -> &str { match package_id { - "bitcoin-knots" => "bitcoin-core", "electrs" | "mempool-electrs" => "electrumx", _ => package_id, } @@ -459,8 +476,8 @@ fn candidate_app_ids_for_container(container_name: &str) -> Vec { match container_name { "bitcoin-knots" | "bitcoin-core" => { - push("bitcoin-core"); push("bitcoin-knots"); + push("bitcoin-core"); } "archy-bitcoin-ui" => push("bitcoin-ui"), "archy-lnd-ui" => push("lnd-ui"), @@ -525,7 +542,7 @@ mod tests { fn container_name_candidates_cover_common_aliases() { assert_eq!( candidate_app_ids_for_container("bitcoin-knots"), - vec!["bitcoin-core", "bitcoin-knots"] + vec!["bitcoin-knots", "bitcoin-core"] ); assert_eq!( candidate_app_ids_for_container("archy-bitcoin-ui"), @@ -543,7 +560,8 @@ mod tests { #[test] fn update_aliases_map_to_manifest_app_ids() { - assert_eq!(orchestrator_update_app_id("bitcoin-knots"), "bitcoin-core"); + assert_eq!(orchestrator_update_app_id("bitcoin-knots"), "bitcoin-knots"); + assert_eq!(orchestrator_update_app_id("bitcoin-core"), "bitcoin-core"); assert_eq!(orchestrator_update_app_id("electrs"), "electrumx"); assert_eq!(orchestrator_update_app_id("mempool-electrs"), "electrumx"); assert_eq!(orchestrator_update_app_id("fedimint"), "fedimint"); diff --git a/core/archipelago/src/api/rpc/system/handlers.rs b/core/archipelago/src/api/rpc/system/handlers.rs index 281839a2..573f4600 100644 --- a/core/archipelago/src/api/rpc/system/handlers.rs +++ b/core/archipelago/src/api/rpc/system/handlers.rs @@ -1,7 +1,7 @@ use super::*; use crate::api::rpc::RpcHandler; use anyhow::{Context, Result}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; impl RpcHandler { /// server.set-name — Rename the server (persisted to data_dir/server-name) @@ -32,6 +32,21 @@ impl RpcHandler { data.server_info.name = Some(name.clone()); self.state_manager.update_data(data).await; + let hostname = hostname_from_server_name(&name); + let hostname_result = set_system_hostname(&hostname).await; + let (hostname_updated, hostname_error) = match hostname_result { + Ok(()) => (true, None), + Err(e) => { + warn!( + name = %name, + hostname = %hostname, + "Server name persisted but OS hostname update failed: {}", + e + ); + (false, Some(e.to_string())) + } + }; + info!("Server name updated to: {}", name); // Push the new name to federation peers in background @@ -43,7 +58,12 @@ impl RpcHandler { } }); - Ok(serde_json::json!({ "name": name })) + Ok(serde_json::json!({ + "name": name, + "hostname": hostname, + "hostname_updated": hostname_updated, + "hostname_error": hostname_error, + })) } /// system.stats — CPU usage, RAM used/total, disk used/total, uptime, load average @@ -155,21 +175,7 @@ impl RpcHandler { let mut freed_bytes: u64 = 0; let mut actions: Vec = Vec::new(); - // 1. Prune dangling container images - match prune_container_images().await { - Ok(bytes) => { - if bytes > 0 { - freed_bytes += bytes; - actions.push(format!( - "Pruned dangling images: {} freed", - format_bytes(bytes) - )); - } - } - Err(e) => actions.push(format!("Image prune failed: {}", e)), - } - - // 2. Clean old log files (> 30 days) + // 1. Clean old log files (> 30 days) match clean_old_logs(30).await { Ok(bytes) => { if bytes > 0 { @@ -180,7 +186,20 @@ impl RpcHandler { Err(e) => actions.push(format!("Log cleanup failed: {}", e)), } - // 3. Remove stale temp files + match vacuum_journal_logs("200M").await { + Ok(bytes) => { + if bytes > 0 { + freed_bytes += bytes; + actions.push(format!( + "Vacuumed journal logs: {} freed", + format_bytes(bytes) + )); + } + } + Err(e) => actions.push(format!("Journal cleanup failed: {}", e)), + } + + // 2. Remove stale temp files match clean_temp_files().await { Ok(bytes) => { if bytes > 0 { @@ -191,17 +210,53 @@ impl RpcHandler { Err(e) => actions.push(format!("Temp cleanup failed: {}", e)), } - // 4. Prune container build cache - match prune_build_cache().await { + // 3. Keep only the most recent backend deploy backups. These are useful + // for rollback, but a long-lived alpha node can accumulate gigabytes of + // old binaries under /usr/local/bin. + match clean_backend_backups(3).await { Ok(bytes) => { if bytes > 0 { freed_bytes += bytes; - actions.push(format!("Pruned build cache: {} freed", format_bytes(bytes))); + actions.push(format!( + "Removed old backend backups: {} freed", + format_bytes(bytes) + )); } } - Err(e) => actions.push(format!("Build cache prune failed: {}", e)), + Err(e) => actions.push(format!("Backend backup cleanup failed: {}", e)), } + match clean_legacy_backend_backups(3).await { + Ok(bytes) => { + if bytes > 0 { + freed_bytes += bytes; + actions.push(format!( + "Removed old legacy backend backups: {} freed", + format_bytes(bytes) + )); + } + } + Err(e) => actions.push(format!("Legacy backend backup cleanup failed: {}", e)), + } + + match clean_web_ui_backups(3).await { + Ok(bytes) => { + if bytes > 0 { + freed_bytes += bytes; + actions.push(format!( + "Removed old web UI backups: {} freed", + format_bytes(bytes) + )); + } + } + Err(e) => actions.push(format!("Web UI backup cleanup failed: {}", e)), + } + + actions.push( + "Skipped Podman image/volume prune: Podman store commands can block app health on busy nodes" + .to_string(), + ); + tracing::info!( "Disk cleanup complete: {} freed ({} actions)", format_bytes(freed_bytes), @@ -216,6 +271,54 @@ impl RpcHandler { } } +pub(super) fn hostname_from_server_name(name: &str) -> String { + let mut hostname = String::with_capacity(name.len()); + let mut previous_dash = false; + + for c in name.trim().chars().flat_map(char::to_lowercase) { + let valid = c.is_ascii_lowercase() || c.is_ascii_digit(); + if valid { + hostname.push(c); + previous_dash = false; + } else if !previous_dash { + hostname.push('-'); + previous_dash = true; + } + if hostname.len() >= 63 { + break; + } + } + + let hostname = hostname.trim_matches('-').to_string(); + if hostname.is_empty() { + "archipelago".to_string() + } else { + hostname + } +} + +async fn set_system_hostname(hostname: &str) -> Result<()> { + let output = tokio::process::Command::new("/usr/bin/sudo") + .args(["-n", "/usr/bin/hostnamectl", "set-hostname", hostname]) + .output() + .await + .context("Failed to run hostnamectl")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + anyhow::bail!( + "{}", + if stderr.is_empty() { + "hostnamectl failed".to_string() + } else { + stderr + } + ); + } + + Ok(()) +} + impl RpcHandler { /// system.factory-reset — Wipe all user data, remove containers, and restart. /// Only preserves the data_dir itself (recreated empty on restart). diff --git a/core/archipelago/src/api/rpc/system/mod.rs b/core/archipelago/src/api/rpc/system/mod.rs index 86f5a324..a9cd7d3d 100644 --- a/core/archipelago/src/api/rpc/system/mod.rs +++ b/core/archipelago/src/api/rpc/system/mod.rs @@ -1,6 +1,9 @@ mod handlers; +use crate::update::host_sudo; use anyhow::{Context, Result}; +use std::path::{Path, PathBuf}; +use std::time::SystemTime; use tracing::{debug, info}; /// Push the server name to all federation peers by syncing state. @@ -301,53 +304,12 @@ pub(super) async fn detect_usb_hardware_wallets() -> Result Result { - let output = tokio::process::Command::new("podman") - .args(["image", "prune", "-f"]) - .output() - .await - .context("Failed to run podman image prune")?; - - if !output.status.success() { - anyhow::bail!( - "podman image prune failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - - // Podman outputs image IDs, estimate ~100MB per pruned image - let stdout = String::from_utf8_lossy(&output.stdout); - let pruned_count = stdout.lines().filter(|l| !l.trim().is_empty()).count(); - Ok(pruned_count as u64 * 100_000_000) // rough estimate -} - -/// Prune container build cache via `podman system prune -f`. -pub(super) async fn prune_build_cache() -> Result { - // Just prune volumes and build cache (not containers or images — those are handled above) - let output = tokio::process::Command::new("podman") - .args(["volume", "prune", "-f"]) - .output() - .await - .context("Failed to run podman volume prune")?; - - if !output.status.success() { - anyhow::bail!( - "podman volume prune failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - - let stdout = String::from_utf8_lossy(&output.stdout); - let pruned_count = stdout.lines().filter(|l| !l.trim().is_empty()).count(); - Ok(pruned_count as u64 * 10_000_000) // rough estimate per volume -} - /// Clean log files older than `max_age_days` from common log directories. pub(super) async fn clean_old_logs(max_age_days: u64) -> Result { - let output = tokio::process::Command::new("sudo") + let output = tokio::process::Command::new("timeout") .args([ + "60s", + "sudo", "find", "/var/log", "-type", @@ -366,8 +328,10 @@ pub(super) async fn clean_old_logs(max_age_days: u64) -> Result { let stdout = String::from_utf8_lossy(&output.stdout); let deleted_count = stdout.lines().filter(|l| !l.trim().is_empty()).count(); // Also clean rotated/compressed logs - let _ = tokio::process::Command::new("sudo") + let _ = tokio::process::Command::new("timeout") .args([ + "60s", + "sudo", "find", "/var/log", "-type", @@ -384,14 +348,81 @@ pub(super) async fn clean_old_logs(max_age_days: u64) -> Result { Ok(deleted_count as u64 * 500_000) // rough estimate per log file } +/// Vacuum systemd journals to a bounded size. Returns measured bytes freed. +pub(super) async fn vacuum_journal_logs(max_size: &str) -> Result { + let before = journal_disk_usage().await.unwrap_or(0); + let output = tokio::process::Command::new("timeout") + .args(["60s", "sudo", "journalctl", "--vacuum-size", max_size]) + .output() + .await + .context("Failed to run journal vacuum")?; + + if !output.status.success() { + anyhow::bail!( + "journal vacuum failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + let after = journal_disk_usage().await.unwrap_or(before); + Ok(before.saturating_sub(after)) +} + +async fn journal_disk_usage() -> Result { + let output = tokio::process::Command::new("sudo") + .args(["-n", "journalctl", "--disk-usage"]) + .output() + .await + .context("Failed to read journal disk usage")?; + + if !output.status.success() { + anyhow::bail!( + "journalctl --disk-usage failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + + parse_journal_disk_usage(&String::from_utf8_lossy(&output.stdout)) + .ok_or_else(|| anyhow::anyhow!("could not parse journal disk usage")) +} + +fn parse_journal_disk_usage(output: &str) -> Option { + let mut parts = output.split_whitespace(); + while let Some(part) = parts.next() { + let (number, inline_unit) = split_number_unit(part); + let Ok(value) = number.parse::() else { + continue; + }; + let unit = inline_unit.unwrap_or_else(|| parts.next().unwrap_or_default()); + let multiplier = match unit { + "B" | "bytes" => 1.0, + "K" | "KB" | "KiB" => 1024.0, + "M" | "MB" | "MiB" => 1024.0 * 1024.0, + "G" | "GB" | "GiB" => 1024.0 * 1024.0 * 1024.0, + _ => continue, + }; + return Some((value * multiplier) as u64); + } + None +} + +fn split_number_unit(value: &str) -> (&str, Option<&str>) { + let split_at = value + .char_indices() + .find_map(|(idx, ch)| (!ch.is_ascii_digit() && ch != '.').then_some(idx)) + .unwrap_or(value.len()); + let (number, unit) = value.split_at(split_at); + (number, (!unit.is_empty()).then_some(unit)) +} + /// Remove stale temp files from /tmp and /var/tmp. pub(super) async fn clean_temp_files() -> Result { let mut freed = 0u64; for dir in &["/tmp", "/var/tmp"] { - let output = tokio::process::Command::new("sudo") + let output = tokio::process::Command::new("timeout") .args([ - "find", dir, "-type", "f", "-mtime", "+7", "-delete", "-print", + "45s", "sudo", "find", dir, "-type", "f", "-mtime", "+7", "-delete", "-print", ]) .output() .await; @@ -406,6 +437,177 @@ pub(super) async fn clean_temp_files() -> Result { Ok(freed) } +/// Keep the newest timestamped backend backups and remove older ones. +pub(super) async fn clean_backend_backups(keep: usize) -> Result { + clean_backend_backups_in(Path::new("/usr/local/bin"), keep).await +} + +/// Keep the newest legacy backend backups and remove older alpha-era deploy artifacts. +pub(super) async fn clean_legacy_backend_backups(keep: usize) -> Result { + clean_named_backups_in( + Path::new("/usr/local/bin"), + keep, + |name| name.starts_with("archipelago.bak") || name.starts_with("archipelago.before-"), + false, + ) + .await +} + +/// Keep the newest web UI rollback backups and remove older copies. +pub(super) async fn clean_web_ui_backups(keep: usize) -> Result { + clean_named_backups_in( + Path::new("/opt/archipelago"), + keep, + |name| name.starts_with("web-ui.bak") || name == "web-ui.old", + true, + ) + .await +} + +async fn clean_backend_backups_in(dir: &Path, keep: usize) -> Result { + let mut backups = backend_backup_candidates(dir).await?; + remove_old_backups(&mut backups, keep, false).await +} + +async fn clean_named_backups_in( + dir: &Path, + keep: usize, + matches_name: impl Fn(&str) -> bool, + allow_dirs: bool, +) -> Result { + let mut backups = named_backup_candidates(dir, matches_name, allow_dirs).await?; + remove_old_backups(&mut backups, keep, allow_dirs).await +} + +async fn remove_old_backups( + backups: &mut Vec, + keep: usize, + allow_dirs: bool, +) -> Result { + backups.sort_by(|a, b| { + b.modified + .cmp(&a.modified) + .then_with(|| b.name.cmp(&a.name)) + }); + + let mut freed = 0u64; + for backup in backups.iter().skip(keep) { + let remove_result = if backup.is_dir && allow_dirs { + tokio::fs::remove_dir_all(&backup.path).await + } else { + tokio::fs::remove_file(&backup.path).await + }; + match remove_result { + Ok(()) => freed += backup.size, + Err(_) => { + remove_path_with_sudo(&backup.path, backup.is_dir && allow_dirs).await?; + freed += backup.size; + } + } + } + Ok(freed) +} + +async fn remove_path_with_sudo(path: &Path, recursive: bool) -> Result<()> { + let path = path.to_string_lossy(); + let args = if recursive { + vec!["rm", "-rf", path.as_ref()] + } else { + vec!["rm", "-f", path.as_ref()] + }; + let status = host_sudo(&args) + .await + .with_context(|| format!("removing {path} via sudo"))?; + if !status.success() { + anyhow::bail!( + "sudo rm {} {path} exited with {status}", + if recursive { "-rf" } else { "-f" } + ); + } + Ok(()) +} + +#[derive(Debug)] +struct BackupArtifact { + path: PathBuf, + name: String, + modified: SystemTime, + size: u64, + is_dir: bool, +} + +async fn backend_backup_candidates(dir: &Path) -> Result> { + named_backup_candidates( + dir, + |name| { + name.strip_prefix("archipelago.backup-") + .is_some_and(|suffix| !suffix.is_empty() && !suffix.contains('/')) + }, + false, + ) + .await +} + +async fn named_backup_candidates( + dir: &Path, + matches_name: impl Fn(&str) -> bool, + allow_dirs: bool, +) -> Result> { + let mut backups = Vec::new(); + let mut entries = match tokio::fs::read_dir(dir).await { + Ok(entries) => entries, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(backups), + Err(e) => return Err(e).with_context(|| format!("reading {}", dir.display())), + }; + + while let Some(entry) = entries.next_entry().await? { + let file_name = entry.file_name(); + let name = file_name.to_string_lossy(); + if !matches_name(&name) { + continue; + } + + let meta = entry.metadata().await?; + if !meta.is_file() && !(allow_dirs && meta.is_dir()) { + continue; + } + + backups.push(BackupArtifact { + path: entry.path(), + name: name.to_string(), + modified: meta.modified().unwrap_or(SystemTime::UNIX_EPOCH), + size: path_size(&entry.path(), &meta).await.unwrap_or(meta.len()), + is_dir: meta.is_dir(), + }); + } + Ok(backups) +} + +async fn path_size(path: &Path, meta: &std::fs::Metadata) -> Result { + if meta.is_file() { + return Ok(meta.len()); + } + if !meta.is_dir() { + return Ok(0); + } + + let output = tokio::process::Command::new("du") + .args(["-sb", &path.to_string_lossy()]) + .output() + .await + .with_context(|| format!("du -sb {}", path.display()))?; + if !output.status.success() { + anyhow::bail!("du -sb {} failed", path.display()); + } + let stdout = String::from_utf8_lossy(&output.stdout); + stdout + .split_whitespace() + .next() + .ok_or_else(|| anyhow::anyhow!("du output missing size for {}", path.display()))? + .parse::() + .with_context(|| format!("parse du size for {}", path.display())) +} + pub(super) fn format_bytes(bytes: u64) -> String { const KB: u64 = 1024; const MB: u64 = KB * 1024; @@ -422,6 +624,103 @@ pub(super) fn format_bytes(bytes: u64) -> String { } } +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn backend_backup_cleanup_keeps_newest_files() { + let dir = tempfile::tempdir().unwrap(); + for name in [ + "archipelago.backup-20260501", + "archipelago.backup-20260502", + "archipelago.backup-20260503", + "archipelago.backup-20260504", + "archipelago.backup-20260505", + "archipelago.bak", + "archipelago", + ] { + tokio::fs::write(dir.path().join(name), b"12345") + .await + .unwrap(); + } + + let freed = clean_backend_backups_in(dir.path(), 3).await.unwrap(); + + assert_eq!(freed, 10); + assert!(!dir.path().join("archipelago.backup-20260501").exists()); + assert!(!dir.path().join("archipelago.backup-20260502").exists()); + assert!(dir.path().join("archipelago.backup-20260503").exists()); + assert!(dir.path().join("archipelago.backup-20260504").exists()); + assert!(dir.path().join("archipelago.backup-20260505").exists()); + assert!(dir.path().join("archipelago.bak").exists()); + assert!(dir.path().join("archipelago").exists()); + } + + #[tokio::test] + async fn legacy_backend_backup_cleanup_keeps_newest_matching_files() { + let dir = tempfile::tempdir().unwrap(); + for name in [ + "archipelago.bak-1", + "archipelago.bak-2", + "archipelago.before-3", + "archipelago.backup-keep-separate", + "archipelago", + ] { + tokio::fs::write(dir.path().join(name), b"12345") + .await + .unwrap(); + } + + let freed = clean_named_backups_in( + dir.path(), + 1, + |name| name.starts_with("archipelago.bak") || name.starts_with("archipelago.before-"), + false, + ) + .await + .unwrap(); + + assert_eq!(freed, 10); + assert_eq!( + [ + "archipelago.bak-1", + "archipelago.bak-2", + "archipelago.before-3" + ] + .into_iter() + .filter(|name| dir.path().join(name).exists()) + .count(), + 1 + ); + assert!(dir.path().join("archipelago.backup-keep-separate").exists()); + assert!(dir.path().join("archipelago").exists()); + } + + #[test] + fn hostname_from_server_name_derives_linux_safe_hostname() { + assert_eq!( + handlers::hostname_from_server_name("My Archipelago Node"), + "my-archipelago-node" + ); + assert_eq!( + handlers::hostname_from_server_name("Kitchen_Node!! 01"), + "kitchen-node-01" + ); + assert_eq!(handlers::hostname_from_server_name("!!!"), "archipelago"); + } + + #[test] + fn parses_journal_disk_usage() { + assert_eq!( + parse_journal_disk_usage( + "Archived and active journals take up 463.9M in the file system." + ), + Some(486_434_406) + ); + } +} + /// Read temperatures from /sys/class/thermal/thermal_zone*/temp. pub(super) async fn read_temperatures() -> Result> { let mut temps = Vec::new(); diff --git a/core/archipelago/src/auth.rs b/core/archipelago/src/auth.rs index 12f71f9a..ce7f51cc 100644 --- a/core/archipelago/src/auth.rs +++ b/core/archipelago/src/auth.rs @@ -86,6 +86,11 @@ pub struct AuthManager { data_dir: PathBuf, } +pub struct ChangePasswordOutcome { + pub ssh_updated: bool, + pub ssh_error: Option, +} + impl AuthManager { pub fn new(data_dir: PathBuf) -> Self { Self { data_dir } @@ -288,7 +293,7 @@ impl AuthManager { current_password: &str, new_password: &str, also_change_ssh: bool, - ) -> Result<()> { + ) -> Result { if !self.verify_password(current_password).await? { anyhow::bail!("Current password is incorrect"); } @@ -314,11 +319,21 @@ impl AuthManager { let content = serde_json::to_string_pretty(&user)?; fs::write(&user_file, content).await?; + let mut outcome = ChangePasswordOutcome { + ssh_updated: false, + ssh_error: None, + }; if also_change_ssh { - change_ssh_password(new_password).await?; + match change_ssh_password(new_password).await { + Ok(()) => outcome.ssh_updated = true, + Err(e) => { + tracing::warn!("Web password changed but SSH password update failed: {}", e); + outcome.ssh_error = Some(e.to_string()); + } + } } - Ok(()) + Ok(outcome) } } @@ -485,6 +500,23 @@ mod tests { assert!(validate_password_strength("MyP@ssw0rd!123").is_ok()); } + #[tokio::test] + async fn test_change_password_updates_web_password_without_ssh() { + let dir = tempfile::tempdir().unwrap(); + let auth = AuthManager::new(dir.path().to_path_buf()); + auth.setup_user("password123").await.unwrap(); + + let outcome = auth + .change_password("password123", "MyP@ssw0rd!123", false) + .await + .unwrap(); + + assert!(!outcome.ssh_updated); + assert!(outcome.ssh_error.is_none()); + assert!(auth.verify_password("MyP@ssw0rd!123").await.unwrap()); + assert!(!auth.verify_password("password123").await.unwrap()); + } + #[test] fn test_validate_password_strength_too_short() { assert!(validate_password_strength("Ab1!").is_err()); diff --git a/core/archipelago/src/bitcoin_status.rs b/core/archipelago/src/bitcoin_status.rs index c0e16107..1a67cdad 100644 --- a/core/archipelago/src/bitcoin_status.rs +++ b/core/archipelago/src/bitcoin_status.rs @@ -13,7 +13,8 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tokio::sync::RwLock; use tracing::{debug, warn}; -const CACHE_REFRESH_SECS: u64 = 5; +const CACHE_REFRESH_SECS: u64 = 10; +const CACHE_ERROR_BACKOFF_SECS: u64 = 15; #[derive(Debug, Clone, Serialize)] pub struct BitcoinNodeStatus { @@ -65,6 +66,36 @@ fn transient_error(err_msg: &str) -> bool { || lower.contains("broken pipe") || lower.contains("eof") || lower.contains("500 internal server error") + || lower.contains("503 service unavailable") + || lower.contains("work queue depth exceeded") + || lower.contains("decode bitcoin rpc json") + || lower.contains("error decoding response body") + || lower.contains("expected value at line 1 column 1") +} + +fn friendly_transient_error(has_cached_state: bool, err_msg: &str) -> String { + let detail = err_msg + .lines() + .next() + .unwrap_or(err_msg) + .trim() + .trim_end_matches('.'); + let lower = detail.to_lowercase(); + let state = if lower.contains("verifying blocks") { + "verifying blocks after restart" + } else if lower.contains("connection refused") || lower.contains("tcp connect error") { + "waiting for the Bitcoin RPC listener" + } else if lower.contains("timed out") || lower.contains("timeout") { + "busy and not answering RPC before the timeout" + } else { + "starting or busy syncing" + }; + + if has_cached_state { + format!("Bitcoin node is {state}; showing last known state and retrying. Detail: {detail}") + } else { + format!("Bitcoin node is {state}; retrying automatically. Detail: {detail}") + } } pub fn spawn_status_cache() { @@ -72,6 +103,7 @@ pub fn spawn_status_cache() { loop { let fresh = fetch_bitcoin_status().await; let mut cached = cache().write().await; + let mut sleep_secs = CACHE_REFRESH_SECS; match fresh { Ok(mut status) => { status.ok = true; @@ -80,33 +112,31 @@ pub fn spawn_status_cache() { *cached = status; } Err(e) => { - let err_msg = e.to_string(); + let err_msg = format!("{e:#}"); if transient_error(&err_msg) { debug!("Bitcoin status: transient RPC failure: {}", err_msg); } else { warn!("Bitcoin status: RPC failure: {}", err_msg); } + sleep_secs = CACHE_ERROR_BACKOFF_SECS; if cached.blockchain_info.is_some() { cached.ok = false; cached.stale = true; - cached.error = Some(format!( - "Bitcoin node is reconnecting; showing last known state: {}", - err_msg - )); + cached.error = Some(friendly_transient_error(true, &err_msg)); } else { *cached = BitcoinNodeStatus { ok: false, stale: false, updated_at_ms: now_ms(), - error: Some(format!("Connecting to Bitcoin node: {}", err_msg)), + error: Some(friendly_transient_error(false, &err_msg)), ..BitcoinNodeStatus::default() }; } } } drop(cached); - tokio::time::sleep(Duration::from_secs(CACHE_REFRESH_SECS)).await; + tokio::time::sleep(Duration::from_secs(sleep_secs)).await; } }); } @@ -117,7 +147,7 @@ pub async fn get_bitcoin_status() -> BitcoinNodeStatus { async fn fetch_bitcoin_status() -> Result { let client = reqwest::Client::builder() - .timeout(Duration::from_secs(8)) + .timeout(Duration::from_secs(20)) .build() .context("build Bitcoin status HTTP client")?; @@ -183,3 +213,40 @@ async fn bitcoin_rpc_call( .cloned() .context("missing Bitcoin RPC result") } + +#[cfg(test)] +mod tests { + use super::friendly_transient_error; + + #[test] + fn explains_verifying_blocks_without_generic_timeout_copy() { + let msg = friendly_transient_error( + false, + r#"getblockchaininfo: Bitcoin RPC returned 500 Internal Server Error: {"error":{"code":-28,"message":"Verifying blocks..."}}"#, + ); + + assert!(msg.contains("verifying blocks after restart")); + assert!(msg.contains("retrying automatically")); + } + + #[test] + fn explains_missing_rpc_listener() { + let msg = friendly_transient_error( + true, + "getblockchaininfo: tcp connect error: Connection refused (os error 111)", + ); + + assert!(msg.contains("waiting for the Bitcoin RPC listener")); + assert!(msg.contains("showing last known state")); + } + + #[test] + fn explains_rpc_timeout() { + let msg = friendly_transient_error( + false, + "getblockchaininfo: Bitcoin RPC request failed: operation timed out", + ); + + assert!(msg.contains("busy and not answering RPC before the timeout")); + } +} diff --git a/core/archipelago/src/container/bitcoin_ui_nginx.conf.template b/core/archipelago/src/container/bitcoin_ui_nginx.conf.template index 7ab57dd8..43ad2ee6 100644 --- a/core/archipelago/src/container/bitcoin_ui_nginx.conf.template +++ b/core/archipelago/src/container/bitcoin_ui_nginx.conf.template @@ -23,5 +23,15 @@ server { proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; add_header Cache-Control "no-store"; } + location /rpc/v1 { + proxy_pass http://127.0.0.1:5678/rpc/v1; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header Cookie $http_cookie; + proxy_set_header X-CSRF-Token $http_x_csrf_token; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + add_header Cache-Control "no-store"; + } location / { try_files $uri $uri/ /index.html; } } diff --git a/core/archipelago/src/container/boot_reconciler.rs b/core/archipelago/src/container/boot_reconciler.rs index 264a86d0..d52bd3ed 100644 --- a/core/archipelago/src/container/boot_reconciler.rs +++ b/core/archipelago/src/container/boot_reconciler.rs @@ -34,6 +34,7 @@ pub struct BootReconciler { /// `systemctl --user` and `podman`, which both block real time /// and would race the paused-clock test fixtures. companion_stage: bool, + wait_for_recovery: bool, } impl BootReconciler { @@ -47,6 +48,7 @@ impl BootReconciler { interval, shutdown, companion_stage: true, + wait_for_recovery: true, } } @@ -56,6 +58,7 @@ impl BootReconciler { #[cfg(test)] pub fn without_companion_stage(mut self) -> Self { self.companion_stage = false; + self.wait_for_recovery = false; self } @@ -78,6 +81,21 @@ impl BootReconciler { /// by the orchestrator, and companion failures are logged but never /// propagated. pub async fn run_forever(self) { + let wait_start = Instant::now(); + while self.wait_for_recovery && !crate::crash_recovery::is_recovery_complete() { + if wait_start.elapsed() > Duration::from_secs(1800) { + tracing::warn!("boot reconciler: boot recovery did not complete within 30 minutes, starting anyway"); + break; + } + tokio::select! { + _ = time::sleep(Duration::from_secs(5)) => {} + _ = self.shutdown.notified() => { + tracing::info!("boot reconciler: shutdown requested before recovery completed"); + return; + } + } + } + // Initial pass: no delay. self.tick().await; @@ -244,58 +262,65 @@ mod tests { ProdContainerOrchestrator::with_runtime(rt, PathBuf::from("/nonexistent-for-tests")); let tmp = tempfile::tempdir().unwrap().keep(); orch.set_data_dir(tmp); + orch.set_disk_gb_for_test(2_000); let orch = Arc::new(orch); orch.insert_manifest_for_test( - pull_manifest("bitcoin-knots", "docker.io/bitcoin/knots:28"), - PathBuf::from("/tmp/bk"), + pull_manifest("test-app", "docker.io/example/test-app:1"), + PathBuf::from("/tmp/test-app"), ) .await; orch } - #[tokio::test(start_paused = true)] + async fn wait_for_status_calls(rt: &CountingRuntime, expected: u32) -> u32 { + for _ in 0..100 { + let count = rt.status_call_count(); + if count >= expected { + return count; + } + tokio::task::yield_now().await; + tokio::time::sleep(Duration::from_millis(1)).await; + } + rt.status_call_count() + } + + #[tokio::test] async fn initial_pass_fires_immediately() { - let rt = Arc::new(CountingRuntime::new_with(&["bitcoin-knots"])); + let rt = Arc::new(CountingRuntime::new_with(&["test-app"])); let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + BootReconciler::new(orch.clone(), Duration::from_millis(50), shutdown.clone()) .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); - // Yield so the spawned task gets CPU to run its initial reconcile. - tokio::task::yield_now().await; - tokio::task::yield_now().await; - // We expect exactly one reconcile pass to have run by now (the initial), // NOT a second one (the 30s sleep hasn't elapsed in paused time). - assert_eq!(rt.status_call_count(), 1, "initial pass should fire once"); + assert_eq!( + wait_for_status_calls(&rt, 1).await, + 1, + "initial pass should fire once" + ); shutdown.notify_one(); - // Under paused clock the select! is blocked on sleep_until; the notify - // will unblock it. Advance wall-clock a hair so the notify gets polled. tokio::task::yield_now().await; let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn second_pass_fires_after_interval() { - let rt = Arc::new(CountingRuntime::new_with(&["bitcoin-knots"])); + let rt = Arc::new(CountingRuntime::new_with(&["test-app"])); let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + BootReconciler::new(orch.clone(), Duration::from_millis(10), shutdown.clone()) .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); - tokio::task::yield_now().await; - tokio::task::yield_now().await; - assert_eq!(rt.status_call_count(), 1); + assert_eq!(wait_for_status_calls(&rt, 1).await, 1); - // Fast-forward past one interval; the sleep_until should fire. - tokio::time::advance(Duration::from_secs(31)).await; - tokio::task::yield_now().await; - tokio::task::yield_now().await; + tokio::time::sleep(Duration::from_millis(20)).await; + wait_for_status_calls(&rt, 2).await; assert_eq!( rt.status_call_count(), @@ -308,27 +333,23 @@ mod tests { let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn shutdown_terminates_loop() { - let rt = Arc::new(CountingRuntime::new_with(&["bitcoin-knots"])); + let rt = Arc::new(CountingRuntime::new_with(&["test-app"])); let orch = orch_with_one_running_manifest(rt.clone()).await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + BootReconciler::new(orch.clone(), Duration::from_millis(50), shutdown.clone()) .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); - tokio::task::yield_now().await; - tokio::task::yield_now().await; + wait_for_status_calls(&rt, 1).await; shutdown.notify_one(); - // The select! should wake on Notified and return. Use a real timeout - // with advancing the paused clock to make sure the task exits. - tokio::time::advance(Duration::from_millis(10)).await; let result = tokio::time::timeout(Duration::from_secs(5), handle).await; assert!(result.is_ok(), "reconciler did not exit after shutdown"); } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn failure_in_one_pass_does_not_stop_loop() { // Manifest references a container the runtime does not have AND // cannot create (no install path — install_fresh will also fail to @@ -344,26 +365,23 @@ mod tests { ); let tmp = tempfile::tempdir().unwrap().keep(); orch.set_data_dir(tmp); + orch.set_disk_gb_for_test(2_000); let orch = Arc::new(orch); orch.insert_manifest_for_test( - pull_manifest("bitcoin-knots", "docker.io/bitcoin/knots:28"), - PathBuf::from("/tmp/bk"), + pull_manifest("test-app", "docker.io/example/test-app:1"), + PathBuf::from("/tmp/test-app"), ) .await; let shutdown = Arc::new(Notify::new()); let reconciler = - BootReconciler::new(orch.clone(), Duration::from_secs(30), shutdown.clone()) + BootReconciler::new(orch.clone(), Duration::from_millis(10), shutdown.clone()) .without_companion_stage(); let handle = tokio::spawn(reconciler.run_forever()); - tokio::task::yield_now().await; - tokio::task::yield_now().await; - let first = rt.status_call_count(); + let first = wait_for_status_calls(&rt, 1).await; assert!(first >= 1, "initial pass should have touched the runtime"); - // Advance one interval — second pass should fire regardless of what - // the first pass did. - tokio::time::advance(Duration::from_secs(31)).await; + tokio::time::sleep(Duration::from_millis(20)).await; tokio::task::yield_now().await; tokio::task::yield_now().await; let second = rt.status_call_count(); @@ -373,7 +391,6 @@ mod tests { ); shutdown.notify_one(); - tokio::time::advance(Duration::from_millis(10)).await; let _ = tokio::time::timeout(Duration::from_secs(5), handle).await; } } diff --git a/core/archipelago/src/container/companion.rs b/core/archipelago/src/container/companion.rs index 5cca19e0..144505ca 100644 --- a/core/archipelago/src/container/companion.rs +++ b/core/archipelago/src/container/companion.rs @@ -9,6 +9,7 @@ //! | bitcoin-core | archy-bitcoin-ui | RPC viewer | //! | lnd | archy-lnd-ui | wallet/channel UI | //! | electrumx | archy-electrs-ui | indexer status UI | +//! | fedimint | archy-fedimint-ui | wait/proxy Guardian UI | //! //! Lifecycle: `install` writes a Quadlet `.container` unit to //! `~/.config/containers/systemd/`, daemon-reloads, then starts the @@ -22,6 +23,7 @@ use anyhow::{Context, Result}; use std::path::PathBuf; +use std::time::Duration; use tokio::fs; use tokio::process::Command; use tracing::{info, warn}; @@ -30,6 +32,9 @@ use crate::container::quadlet::{self, BindMount, NetworkMode, QuadletUnit}; use archipelago_container::image_uses_insecure_registry; const COMPANION_REGISTRY: &str = "146.59.87.168:3000/lfg2025"; +const COMPANION_IMAGE_CHECK_TIMEOUT: Duration = Duration::from_secs(15); +const COMPANION_BUILD_TIMEOUT: Duration = Duration::from_secs(900); +const COMPANION_PULL_TIMEOUT: Duration = Duration::from_secs(300); /// Static description of one companion. The full list per backend /// app_id lives in `companions_for`. @@ -65,6 +70,7 @@ pub fn companions_for(package_id: &str) -> &'static [CompanionSpec] { "bitcoin" | "bitcoin-core" | "bitcoin-knots" => BITCOIN_UI, "lnd" => LND_UI, "electrumx" | "electrs" | "mempool-electrs" => ELECTRS_UI, + "fedimint" | "fedimintd" => FEDIMINT_UI, _ => &[], } } @@ -114,6 +120,20 @@ const ELECTRS_UI: &[CompanionSpec] = &[CompanionSpec { host_network: true, }]; +const FEDIMINT_UI: &[CompanionSpec] = &[CompanionSpec { + name: "archy-fedimint-ui", + image_base: "fedimint-ui", + build_dir_candidates: &[ + "/opt/archipelago/docker/fedimint-ui", + "/home/archipelago/archy/docker/fedimint-ui", + "/home/archipelago/Projects/archy/docker/fedimint-ui", + ], + pre_start: None, + bind_mounts: &[], + ports: &[], + host_network: true, +}]; + fn render_bitcoin_ui() -> futures_util::future::BoxFuture<'static, Result<()>> { Box::pin(async { let paths = crate::container::bitcoin_ui::RenderPaths::default(); @@ -201,11 +221,12 @@ async fn ensure_image_present(spec: &CompanionSpec) -> Result { return Ok(local_image); } info!(companion = spec.name, "building locally from {dir}"); - let out = Command::new("podman") - .args(["build", "-t", &local_image, dir]) - .output() - .await - .context("spawn podman build")?; + let out = command_output_with_timeout( + Command::new("podman").args(["build", "-t", &local_image, dir]), + COMPANION_BUILD_TIMEOUT, + "podman build companion image", + ) + .await?; if out.status.success() { return Ok(local_image); } @@ -226,7 +247,12 @@ async fn ensure_image_present(spec: &CompanionSpec) -> Result { cmd.arg("--tls-verify=false"); } cmd.arg(®istry_image); - let out = cmd.output().await.context("spawn podman pull")?; + let out = command_output_with_timeout( + &mut cmd, + COMPANION_PULL_TIMEOUT, + "podman pull companion image", + ) + .await?; if !out.status.success() { anyhow::bail!( "no local Dockerfile and registry pull failed for {}: {}", @@ -238,11 +264,31 @@ async fn ensure_image_present(spec: &CompanionSpec) -> Result { } async fn image_exists(image: &str) -> bool { - Command::new("podman") - .args(["image", "exists", image]) - .status() + let mut cmd = Command::new("podman"); + cmd.args(["image", "inspect", image]); + match tokio::time::timeout(COMPANION_IMAGE_CHECK_TIMEOUT, cmd.status()).await { + Ok(Ok(status)) => status.success(), + Ok(Err(err)) => { + warn!(image = %image, error = %err, "companion image existence check failed"); + false + } + Err(_) => { + warn!(image = %image, "companion image existence check timed out"); + false + } + } +} + +async fn command_output_with_timeout( + cmd: &mut Command, + timeout: Duration, + description: &str, +) -> Result { + cmd.kill_on_drop(true); + tokio::time::timeout(timeout, cmd.output()) .await - .is_ok_and(|status| status.success()) + .with_context(|| format!("{description} timed out after {}s", timeout.as_secs()))? + .with_context(|| format!("spawn {description}")) } fn build_unit(spec: &CompanionSpec, image: &str) -> QuadletUnit { @@ -368,6 +414,8 @@ mod tests { assert_eq!(companions_for("electrumx").len(), 1); assert_eq!(companions_for("electrs").len(), 1); assert_eq!(companions_for("mempool-electrs").len(), 1); + assert_eq!(companions_for("fedimint").len(), 1); + assert_eq!(companions_for("fedimintd").len(), 1); assert_eq!(companions_for("nextcloud").len(), 0); assert_eq!(companions_for("not-a-real-app").len(), 0); } @@ -398,4 +446,13 @@ mod tests { assert!(matches!(u.network, NetworkMode::Bridge(ref n) if n == "bridge")); assert_eq!(u.ports, vec![(18083, 80, "tcp".into())]); } + + #[test] + fn fedimint_ui_uses_host_network_for_public_guardian_port() { + let spec = &FEDIMINT_UI[0]; + let u = build_unit(spec, "localhost/fedimint-ui:latest"); + assert_eq!(u.name, "archy-fedimint-ui"); + assert!(matches!(u.network, NetworkMode::Host)); + assert!(u.ports.is_empty()); + } } diff --git a/core/archipelago/src/container/docker_packages.rs b/core/archipelago/src/container/docker_packages.rs index 1c161c98..97dfbe03 100644 --- a/core/archipelago/src/container/docker_packages.rs +++ b/core/archipelago/src/container/docker_packages.rs @@ -26,13 +26,7 @@ impl DockerPackageScanner { /// Scan Docker containers and convert to package data pub async fn scan_containers(&self) -> Result> { - let containers = match self.runtime.list_containers().await { - Ok(c) => c, - Err(e) => { - debug!("Failed to list containers: {}", e); - return Ok(HashMap::new()); - } - }; + let containers = self.runtime.list_containers().await?; debug!("Found {} containers", containers.len()); @@ -63,14 +57,6 @@ impl DockerPackageScanner { "indeedhub-build_ffmpeg-worker_1", "netbird-server", "netbird-dashboard", - "saleor-api", - "saleor-worker", - "saleor-db", - "saleor-cache", - "saleor-jaeger", - "saleor-mailpit", - "saleor-storefront", - "saleor-storefront-app", "buildx_buildkit_default", ]; @@ -298,7 +284,6 @@ fn get_app_tier(app_id: &str) -> &'static str { "uptime-kuma" => "recommended", "grafana" => "recommended", "searxng" => "recommended", - "saleor" => "recommended", "tailscale" | "netbird" => "recommended", "portainer" => "recommended", // Optional: everything else @@ -519,13 +504,6 @@ fn get_app_metadata(app_id: &str) -> AppMetadata { repo: "https://github.com/netbirdio/netbird".to_string(), tier: "", }, - "saleor" => AppMetadata { - title: "Saleor".to_string(), - description: "Composable commerce platform with storefront, dashboard, and GraphQL API. The customer storefront opens on port 9011; admin dashboard is on 9010 with admin@example.com credentials stored on the node.".to_string(), - icon: "/assets/img/app-icons/saleor.svg".to_string(), - repo: "https://github.com/saleor/saleor".to_string(), - tier: "", - }, "gitea" => AppMetadata { title: "Gitea".to_string(), description: "Self-hosted Git service with repository and package hosting".to_string(), @@ -732,20 +710,25 @@ async fn reachable_lan_address(app_id: &str, candidate: Option) -> Optio let Some(port) = url.rsplit(':').next().and_then(|p| p.parse::().ok()) else { return None; }; - match tokio::time::timeout( - std::time::Duration::from_secs(2), - tokio::net::TcpStream::connect(("127.0.0.1", port)), - ) - .await - { - Ok(Ok(_)) => Some(url), - _ => { - debug!(app_id = %app_id, port, "suppressing unreachable launch URL"); - None - } + if launch_port_reachable(port).await { + Some(url) + } else { + debug!(app_id = %app_id, port, "suppressing unreachable launch URL"); + None } } +async fn launch_port_reachable(port: u16) -> bool { + matches!( + tokio::time::timeout( + std::time::Duration::from_secs(2), + tokio::net::TcpStream::connect(("127.0.0.1", port)), + ) + .await, + Ok(Ok(_)) + ) +} + fn requires_reachable_launch(app_id: &str) -> bool { matches!( app_id, @@ -766,7 +749,6 @@ fn requires_reachable_launch(app_id: &str) -> bool { | "tailscale" | "immich" | "searxng" - | "saleor" ) } diff --git a/core/archipelago/src/container/filebrowser.rs b/core/archipelago/src/container/filebrowser.rs index 8501699d..e51b11fe 100644 --- a/core/archipelago/src/container/filebrowser.rs +++ b/core/archipelago/src/container/filebrowser.rs @@ -8,6 +8,8 @@ use anyhow::{Context, Result}; use std::path::PathBuf; use tokio::fs; +use crate::update::host_sudo; + pub const DEFAULT_SRV_ROOT: &str = "/var/lib/archipelago/filebrowser"; pub const DEFAULT_DATA_DIR: &str = "/var/lib/archipelago/filebrowser-data"; pub const DEFAULT_CONFIG_PATH: &str = "/var/lib/archipelago/filebrowser-data/.filebrowser.json"; @@ -39,17 +41,11 @@ pub enum EnsureOutcome { } pub async fn ensure_config(paths: &EnsurePaths) -> Result { - fs::create_dir_all(&paths.srv_root) - .await - .with_context(|| format!("creating {}", paths.srv_root.display()))?; - fs::create_dir_all(&paths.data_dir) - .await - .with_context(|| format!("creating {}", paths.data_dir.display()))?; + create_dir_all_or_sudo(&paths.srv_root).await?; + create_dir_all_or_sudo(&paths.data_dir).await?; for d in ["Documents", "Photos", "Music", "Downloads", "Builds"] { - fs::create_dir_all(paths.srv_root.join(d)) - .await - .with_context(|| format!("creating {}/{}", paths.srv_root.display(), d))?; + create_dir_all_or_sudo(&paths.srv_root.join(d)).await?; } if paths.config_path.exists() { @@ -60,27 +56,67 @@ pub async fn ensure_config(paths: &EnsurePaths) -> Result { .config_path .parent() .ok_or_else(|| anyhow::anyhow!("config_path has no parent directory"))?; - fs::create_dir_all(parent) - .await - .with_context(|| format!("creating {}", parent.display()))?; + create_dir_all_or_sudo(parent).await?; - let tmp = paths.config_path.with_extension("tmp"); - fs::write(&tmp, DEFAULT_CONFIG_JSON) - .await - .with_context(|| format!("writing tmp {}", tmp.display()))?; - fs::rename(&tmp, &paths.config_path) - .await - .with_context(|| { - format!( - "renaming {} -> {}", - tmp.display(), - paths.config_path.display() - ) - })?; + write_config_atomically(paths).await?; Ok(EnsureOutcome::Written) } +async fn create_dir_all_or_sudo(path: &std::path::Path) -> Result<()> { + match fs::create_dir_all(path).await { + Ok(()) => Ok(()), + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => { + let path = path.to_string_lossy(); + let status = host_sudo(&["mkdir", "-p", &path]) + .await + .with_context(|| format!("creating {path} via sudo"))?; + if !status.success() { + anyhow::bail!("mkdir -p {path} via sudo exited with {status}"); + } + Ok(()) + } + Err(e) => Err(e).with_context(|| format!("creating {}", path.display())), + } +} + +async fn write_config_atomically(paths: &EnsurePaths) -> Result<()> { + let tmp = paths.config_path.with_extension("tmp"); + match fs::write(&tmp, DEFAULT_CONFIG_JSON).await { + Ok(()) => { + fs::rename(&tmp, &paths.config_path) + .await + .with_context(|| { + format!( + "renaming {} -> {}", + tmp.display(), + paths.config_path.display() + ) + })?; + Ok(()) + } + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => { + let script = format!( + "set -eu\ncat > '{}' <<'FILEBROWSERCONF'\n{}FILEBROWSERCONF\n", + shell_quote(&paths.config_path.to_string_lossy()), + DEFAULT_CONFIG_JSON + ); + let status = host_sudo(&["sh", "-lc", &script]) + .await + .context("writing .filebrowser.json via sudo")?; + if !status.success() { + anyhow::bail!("writing .filebrowser.json via sudo exited with {status}"); + } + Ok(()) + } + Err(e) => Err(e).with_context(|| format!("writing tmp {}", tmp.display())), + } +} + +fn shell_quote(s: &str) -> String { + s.replace('\'', "'\\''") +} + #[cfg(test)] mod tests { use super::*; diff --git a/core/archipelago/src/container/image_versions.rs b/core/archipelago/src/container/image_versions.rs index 7fd7828a..5d68f967 100644 --- a/core/archipelago/src/container/image_versions.rs +++ b/core/archipelago/src/container/image_versions.rs @@ -219,6 +219,10 @@ pub fn pinned_image_for_app(app_id: &str) -> Option { /// explicit versions we should advertise to users as available updates. pub fn available_update_for_app(app_id: &str, running_image: &str) -> Option { let pinned = pinned_image_for_app(app_id)?; + available_update_for_images(&pinned, running_image) +} + +fn available_update_for_images(pinned: &str, running_image: &str) -> Option { let pinned_version = extract_version_from_image(&pinned); if is_floating_tag(&pinned_version) { return None; @@ -378,6 +382,28 @@ mod tests { assert!(!is_floating_tag("v0.18.4-beta")); } + #[test] + fn available_update_ignores_registry_only_changes() { + assert_eq!( + available_update_for_images( + "146.59.87.168:3000/lfg2025/nextcloud:29", + "git.tx1138.com/lfg2025/nextcloud:29", + ), + None + ); + } + + #[test] + fn available_update_returns_pinned_version_for_same_repo_newer_tag() { + assert_eq!( + available_update_for_images( + "146.59.87.168:3000/lfg2025/nextcloud:29", + "146.59.87.168:3000/lfg2025/nextcloud:28", + ), + Some("29".to_string()) + ); + } + #[test] fn test_parse_image_versions() { let content = r#" diff --git a/core/archipelago/src/container/lnd.rs b/core/archipelago/src/container/lnd.rs index 22420099..98f84cb3 100644 --- a/core/archipelago/src/container/lnd.rs +++ b/core/archipelago/src/container/lnd.rs @@ -76,7 +76,7 @@ pub async fn ensure_wallet_initialized() -> Result<()> { let admin_macaroon = "/var/lib/archipelago/lnd/data/chain/bitcoin/mainnet/admin.macaroon"; let wallet_db = "/var/lib/archipelago/lnd/data/chain/bitcoin/mainnet/wallet.db"; if file_exists_as_root(wallet_db).await { - if file_exists_as_root(admin_macaroon).await && lnd_getinfo_ready(admin_macaroon).await { + if file_exists_as_root(admin_macaroon).await { return Ok(()); } unlock_existing_wallet().await?; @@ -305,6 +305,7 @@ async fn decode_lnd_unlocker_response Deserialize<'de>>( anyhow::bail!("LND REST {path} returned {status}: {text}") } +#[allow(dead_code)] async fn lnd_getinfo_ready(admin_macaroon: &str) -> bool { let Ok(macaroon) = read_file_as_root(admin_macaroon).await else { return false; diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 3f109333..cf9cc70a 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -26,13 +26,15 @@ use anyhow::{Context, Result}; use archipelago_container::{ AppManifest, ContainerRuntime as ContainerRuntimeTrait, ContainerState, ContainerStatus, - HostFacts, ManifestError, ResolvedSource, SecretsProvider, + Dependency, GeneratedFile, HostFacts, ManifestError, ResolvedSource, SecretsProvider, }; use async_trait::async_trait; use std::collections::{HashMap, HashSet}; +use std::os::unix::fs::FileTypeExt; use std::path::{Path, PathBuf}; use std::process::Command; use std::sync::Arc; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::{Mutex, RwLock}; use crate::config::{Config, ContainerRuntime as ConfigContainerRuntime}; @@ -48,6 +50,15 @@ use crate::update::host_sudo; /// so the rule is visible in one place and unit-testable. const UI_APP_IDS: &[&str] = &["bitcoin-ui", "electrs-ui", "lnd-ui"]; const ARCHIVAL_BITCOIN_DISK_GB: u64 = 1000; +const INDEEDHUB_BACKEND_CONTAINERS: &[&str] = &[ + "indeedhub-postgres", + "indeedhub-redis", + "indeedhub-minio", + "indeedhub-relay", + "indeedhub-api", + "indeedhub-ffmpeg", +]; +const INDEEDHUB_FRONTEND_READY_TIMEOUT_SECS: u64 = 90; fn is_required_baseline_app(app_id: &str) -> bool { matches!( @@ -75,6 +86,67 @@ fn is_restart_sensitive_app(app_id: &str) -> bool { ) } +fn is_builtin_network_mode(network: &str) -> bool { + matches!( + network, + "host" | "bridge" | "none" | "slirp4netns" | "pasta" + ) +} + +fn uses_pasta_network(manifest: &AppManifest) -> bool { + manifest.app.container.network.as_deref() == Some("pasta") +} + +fn dependency_manifests_required_by_active_apps<'a>( + manifests: impl Iterator, + user_stopped: &HashSet, +) -> HashSet { + let mut required = HashSet::new(); + for manifest in manifests { + let app_id = manifest.app.id.as_str(); + if user_stopped.contains(app_id) || user_stopped.contains(&compute_container_name(manifest)) + { + continue; + } + for app_id in manifest_dependency_app_ids(manifest) { + required.insert(app_id); + } + } + required +} + +fn manifest_dependency_app_ids(manifest: &AppManifest) -> Vec { + manifest + .app + .dependencies + .iter() + .filter_map(|dep| match dep { + Dependency::App { app_id, .. } => Some(app_id.clone()), + Dependency::Simple(app_id) => Some(app_id.clone()), + Dependency::Storage { .. } => None, + }) + .collect() +} + +fn host_port_wait_timeout_secs(manifest: &AppManifest) -> u64 { + if manifest.app.id == "uptime-kuma" { + return 420; + } + if uses_pasta_network(manifest) { + 120 + } else { + 60 + } +} + +fn host_port_repair_probe_timeout_secs(_manifest: &AppManifest) -> u64 { + 5 +} + +fn is_podman_socket_bind_source(source: &str) -> bool { + source.ends_with("/podman.sock") +} + fn requires_archival_bitcoin(app_id: &str) -> bool { matches!( app_id, @@ -163,9 +235,57 @@ async fn wait_for_host_port(port: u16, timeout_secs: u64) -> bool { } } +async fn wait_for_http_host_port(port: u16, path: &str, timeout_secs: u64) -> bool { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); + loop { + if http_host_port_ready(port, path).await { + return true; + } + if std::time::Instant::now() >= deadline { + return false; + } + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + } +} + +async fn http_host_port_ready(port: u16, path: &str) -> bool { + let Ok(Ok(mut stream)) = tokio::time::timeout( + std::time::Duration::from_secs(3), + tokio::net::TcpStream::connect(("127.0.0.1", port)), + ) + .await + else { + return false; + }; + + let request = format!("GET {path} HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n"); + if stream.write_all(request.as_bytes()).await.is_err() { + return false; + } + + let mut buf = [0u8; 128]; + let Ok(Ok(n)) = + tokio::time::timeout(std::time::Duration::from_secs(3), stream.read(&mut buf)).await + else { + return false; + }; + if n == 0 { + return false; + } + let head = String::from_utf8_lossy(&buf[..n]); + head.starts_with("HTTP/1.1 2") + || head.starts_with("HTTP/1.1 3") + || head.starts_with("HTTP/1.0 2") + || head.starts_with("HTTP/1.0 3") +} + async fn wait_for_manifest_host_ports(manifest: &AppManifest, timeout_secs: u64) -> Result<()> { for port in manifest.app.ports.iter().map(|p| p.host) { - if !wait_for_host_port(port, timeout_secs).await { + let ready = match manifest.app.id.as_str() { + "uptime-kuma" => wait_for_http_host_port(port, "/", timeout_secs).await, + _ => wait_for_host_port(port, timeout_secs).await, + }; + if !ready { return Err(anyhow::anyhow!( "{} host port {} did not become reachable within {}s", manifest.app.id, @@ -177,6 +297,245 @@ async fn wait_for_manifest_host_ports(manifest: &AppManifest, timeout_secs: u64) Ok(()) } +async fn ensure_user_podman_socket() -> Result<()> { + let socket_path = "/run/user/1000/podman/podman.sock"; + if podman_socket_accepts_connections(socket_path).await { + return Ok(()); + } + + let _ = tokio::process::Command::new("systemctl") + .args([ + "--user", + "stop", + "podman-archy-api.service", + "podman.socket", + "podman.service", + ]) + .status() + .await; + + remove_stale_podman_socket_path(socket_path).await; + + let _ = tokio::process::Command::new("systemctl") + .args([ + "--user", + "reset-failed", + "podman-archy-api.service", + "podman.socket", + "podman.service", + ]) + .status() + .await; + + let service_status = tokio::process::Command::new("systemd-run") + .args([ + "--user", + "--unit", + "podman-archy-api", + "--collect", + "podman", + "system", + "service", + "--time=0", + "unix:///run/user/1000/podman/podman.sock", + ]) + .status() + .await + .context("spawn systemd-run podman-archy-api")?; + if !service_status.success() { + tracing::warn!(?service_status, "podman-archy-api service start failed"); + } + + for _ in 0..20 { + if podman_socket_accepts_connections(socket_path).await { + return Ok(()); + } + tokio::time::sleep(std::time::Duration::from_millis(250)).await; + } + + for args in [ + ["--user", "reset-failed", "podman.socket", "podman.service"].as_slice(), + ["--user", "start", "podman.socket"].as_slice(), + ] { + let status = tokio::process::Command::new("systemctl") + .args(args) + .status() + .await + .with_context(|| format!("spawn systemctl {}", args.join(" ")))?; + if !status.success() { + tracing::warn!(?status, command = %args.join(" "), "systemctl podman socket repair step failed"); + } + } + + for _ in 0..20 { + if podman_socket_accepts_connections(socket_path).await { + return Ok(()); + } + tokio::time::sleep(std::time::Duration::from_millis(250)).await; + } + + Err(anyhow::anyhow!( + "podman socket {socket_path} did not appear after restart" + )) +} + +async fn podman_socket_accepts_connections(socket_path: &str) -> bool { + if !tokio::fs::try_exists(socket_path).await.unwrap_or(false) { + return false; + } + matches!( + tokio::time::timeout( + std::time::Duration::from_secs(2), + tokio::net::UnixStream::connect(socket_path), + ) + .await, + Ok(Ok(_)) + ) +} + +async fn remove_stale_podman_socket_path(socket_path: &str) { + if podman_socket_accepts_connections(socket_path).await { + return; + } + let Ok(metadata) = tokio::fs::symlink_metadata(socket_path).await else { + return; + }; + let file_type = metadata.file_type(); + if file_type.is_socket() { + let _ = tokio::fs::remove_file(socket_path).await; + } else if file_type.is_dir() { + let _ = tokio::fs::remove_dir_all(socket_path).await; + } else { + let _ = tokio::fs::remove_file(socket_path).await; + } +} + +async fn wait_for_container_stable_running( + runtime: &dyn ContainerRuntimeTrait, + name: &str, + stable_secs: u64, + timeout_secs: u64, +) -> Result<()> { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); + let mut running_since: Option = None; + let mut last_state: String; + + loop { + match runtime.get_container_status(name).await { + Ok(status) if matches!(status.state, ContainerState::Running) => { + let since = *running_since.get_or_insert_with(std::time::Instant::now); + if since.elapsed() >= std::time::Duration::from_secs(stable_secs) { + return Ok(()); + } + last_state = "running".to_string(); + } + Ok(status) => { + running_since = None; + last_state = format!("{:?}", status.state); + } + Err(err) => { + running_since = None; + last_state = err.to_string(); + } + } + + if std::time::Instant::now() >= deadline { + return Err(anyhow::anyhow!( + "{} did not remain running for {}s within {}s (last={})", + name, + stable_secs, + timeout_secs, + last_state + )); + } + + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + } +} + +async fn repair_manifest_host_ports_after_stability( + runtime: &dyn ContainerRuntimeTrait, + manifest: &AppManifest, + name: &str, +) -> Result<()> { + tokio::time::sleep(std::time::Duration::from_secs(5)).await; + if wait_for_manifest_host_ports(manifest, 5).await.is_ok() { + return Ok(()); + } + + tracing::warn!( + app_id = %manifest.app.id, + container = %name, + "host listener disappeared after startup; restarting container" + ); + if uses_pasta_network(manifest) { + podman_user_scope(&["restart", name]) + .await + .with_context(|| format!("podman restart {name}"))?; + } else if let Err(err) = quadlet::restart_service(&format!("{name}.service")).await { + tracing::warn!( + app_id = %manifest.app.id, + container = %name, + error = %err, + "quadlet restart failed during host listener repair; falling back to podman restart" + ); + let _ = runtime.stop_container(name).await; + runtime + .start_container(name) + .await + .with_context(|| format!("restart container {name}"))?; + } + + wait_for_manifest_host_ports(manifest, host_port_wait_timeout_secs(manifest)).await?; + wait_for_container_stable_running(runtime, name, 15, 90).await +} + +async fn podman_user_scope(args: &[&str]) -> Result<()> { + let output = tokio::process::Command::new("systemd-run") + .args(["--user", "--scope", "--quiet", "--collect", "podman"]) + .args(args) + .output() + .await + .with_context(|| format!("systemd-run --user --scope podman {}", args.join(" ")))?; + if output.status.success() { + return Ok(()); + } + + Err(anyhow::anyhow!( + "systemd-run --user --scope podman {} failed: {}", + args.join(" "), + String::from_utf8_lossy(&output.stderr).trim() + )) +} + +async fn start_container_scoped_if_pasta( + runtime: &dyn ContainerRuntimeTrait, + manifest: &AppManifest, + name: &str, +) -> Result<()> { + if uses_pasta_network(manifest) { + // Rootless pasta/conmon inherit the cgroup of the process that starts + // them. Starting through archipelago.service lets backend restarts kill + // app networking; a transient user scope keeps app daemons independent. + podman_user_scope(&["start", name]).await + } else { + runtime.start_container(name).await + } +} + +async fn restart_container_scoped_if_pasta( + runtime: &dyn ContainerRuntimeTrait, + manifest: &AppManifest, + name: &str, +) -> Result<()> { + if uses_pasta_network(manifest) { + podman_user_scope(&["restart", name]).await + } else { + let _ = runtime.stop_container(name).await; + runtime.start_container(name).await + } +} + async fn patch_indeedhub_nostr_provider() { let _ = tokio::process::Command::new("podman") .args([ @@ -633,13 +992,18 @@ impl ProdContainerOrchestrator { let user_stopped = crate::crash_recovery::load_user_stopped(&self.data_dir).await; let manifests: Vec = { let state = self.state.read().await; + let dependency_required = dependency_manifests_required_by_active_apps( + state.manifests.values().map(|lm| &lm.manifest), + &user_stopped, + ); state .manifests .iter() .filter(|(app_id, _)| !state.disabled.contains(*app_id)) .filter(|(app_id, lm)| { - !user_stopped.contains(*app_id) - && !user_stopped.contains(&compute_container_name(&lm.manifest)) + dependency_required.contains(*app_id) + || (!user_stopped.contains(*app_id) + && !user_stopped.contains(&compute_container_name(&lm.manifest))) }) .map(|(_, lm)| lm.clone()) .collect() @@ -700,11 +1064,12 @@ impl ProdContainerOrchestrator { // after proving the container exists. Boot reconciliation must // not create every catalog app just because a Quadlet unit is // absent. - if self.use_quadlet_backends { + if self.use_quadlet_backends && !uses_pasta_network(&resolved_manifest) { if let Some(action) = self.migrate_to_quadlet_if_needed(lm, &name).await? { return Ok(action); } self.sync_quadlet_unit(lm, &name).await?; + self.sync_dependency_quadlet_units(lm).await?; } match status.state { ContainerState::Running => { @@ -713,15 +1078,19 @@ impl ProdContainerOrchestrator { // password rotated (or template changed via OTA). If // anything was rewritten, restart the container so nginx // picks up the new config. - if let Some(HookOutcome::Rewritten) = - self.run_pre_start_hooks(&app_id).await? + let app_hook = self.run_pre_start_hooks(&app_id).await?; + let file_hook = self.ensure_manifest_files(&resolved_manifest).await?; + if app_hook == Some(HookOutcome::Rewritten) + || file_hook == HookOutcome::Rewritten { tracing::info!(app_id = %app_id, "config rewritten while running — restarting"); - let _ = self.runtime.stop_container(&name).await; - self.runtime - .start_container(&name) - .await - .with_context(|| format!("reconcile restart {name}"))?; + restart_container_scoped_if_pasta( + self.runtime.as_ref(), + &resolved_manifest, + &name, + ) + .await + .with_context(|| format!("reconcile restart {name}"))?; self.run_post_start_hooks(&app_id).await?; return Ok(ReconcileAction::Started); } @@ -744,6 +1113,39 @@ impl ProdContainerOrchestrator { return Ok(ReconcileAction::Installed); } self.run_post_start_hooks(&app_id).await?; + if !resolved_manifest.app.ports.is_empty() { + if let Err(err) = wait_for_manifest_host_ports( + &resolved_manifest, + host_port_repair_probe_timeout_secs(&resolved_manifest), + ) + .await + { + tracing::warn!(app_id = %app_id, container = %name, mode = ?mode, error = %err, "host listener missing for running container; repairing"); + repair_manifest_host_ports_after_stability( + self.runtime.as_ref(), + &resolved_manifest, + &name, + ) + .await?; + return Ok(ReconcileAction::Started); + } + } + if uses_pasta_network(&resolved_manifest) { + if let Err(err) = wait_for_container_stable_running( + self.runtime.as_ref(), + &name, + 15, + 90, + ) + .await + { + tracing::warn!(app_id = %app_id, container = %name, mode = ?mode, error = %err, "pasta container not stable after install; recreating via direct runtime"); + let _ = self.runtime.stop_container(&name).await; + let _ = self.runtime.remove_container(&name).await; + self.install_fresh(lm).await?; + return Ok(ReconcileAction::Installed); + } + } Ok(ReconcileAction::NoOp) } ContainerState::Stopped | ContainerState::Exited => { @@ -758,7 +1160,13 @@ impl ProdContainerOrchestrator { self.install_fresh(lm).await?; return Ok(ReconcileAction::Installed); } - if let Err(e) = self.runtime.start_container(&name).await { + if let Err(e) = start_container_scoped_if_pasta( + self.runtime.as_ref(), + &resolved_manifest, + &name, + ) + .await + { tracing::warn!( app_id = %app_id, container = %name, @@ -771,7 +1179,15 @@ impl ProdContainerOrchestrator { return Ok(ReconcileAction::Installed); } self.run_post_start_hooks(&app_id).await?; - wait_for_manifest_host_ports(&resolved_manifest, 60).await?; + wait_for_manifest_host_ports( + &resolved_manifest, + host_port_wait_timeout_secs(&resolved_manifest), + ) + .await?; + if uses_pasta_network(&resolved_manifest) { + wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90) + .await?; + } Ok(ReconcileAction::Started) } ContainerState::Stopping => { @@ -793,7 +1209,13 @@ impl ProdContainerOrchestrator { self.install_fresh(lm).await?; return Ok(ReconcileAction::Installed); } - if let Err(e) = self.runtime.start_container(&name).await { + if let Err(e) = start_container_scoped_if_pasta( + self.runtime.as_ref(), + &resolved_manifest, + &name, + ) + .await + { tracing::warn!( app_id = %app_id, container = %name, @@ -806,7 +1228,15 @@ impl ProdContainerOrchestrator { return Ok(ReconcileAction::Installed); } self.run_post_start_hooks(&app_id).await?; - wait_for_manifest_host_ports(&resolved_manifest, 60).await?; + wait_for_manifest_host_ports( + &resolved_manifest, + host_port_wait_timeout_secs(&resolved_manifest), + ) + .await?; + if uses_pasta_network(&resolved_manifest) { + wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90) + .await?; + } Ok(ReconcileAction::Started) } ContainerState::Paused => Ok(ReconcileAction::Left("paused".to_string())), @@ -819,16 +1249,28 @@ impl ProdContainerOrchestrator { // lost the container record after a crash/reboot. Sync the unit // bytes first (clears stale Notify=healthy/nc probes), then ask // user systemd to start the generated service. - if self.use_quadlet_backends && self.quadlet_unit_exists(&name).await? { + if self.use_quadlet_backends + && !uses_pasta_network(&resolved_manifest) + && self.quadlet_unit_exists(&name).await? + { self.prepare_for_start(&resolved_manifest).await?; self.sync_quadlet_unit(lm, &name).await?; + self.ensure_resolved_source_available(lm).await?; quadlet::enable_now(&format!("{name}.service")) .await .with_context(|| { format!("start existing quadlet service {name}.service") })?; self.run_post_start_hooks(&app_id).await?; - wait_for_manifest_host_ports(&resolved_manifest, 60).await?; + wait_for_manifest_host_ports( + &resolved_manifest, + host_port_wait_timeout_secs(&resolved_manifest), + ) + .await?; + if uses_pasta_network(&resolved_manifest) { + wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90) + .await?; + } return Ok(ReconcileAction::Started); } @@ -886,11 +1328,17 @@ impl ProdContainerOrchestrator { .to_string_lossy() .into_owned(); } - let already = self - .runtime - .image_exists(&bcfg.tag) - .await - .with_context(|| format!("image_exists {}", bcfg.tag))?; + let already = match self.runtime.image_exists(&bcfg.tag).await { + Ok(exists) => exists, + Err(err) => { + tracing::warn!( + image = %bcfg.tag, + error = %err, + "build image existence check failed; rebuilding image" + ); + false + } + }; if !already { self.runtime .build_image(&bcfg) @@ -909,26 +1357,61 @@ impl ProdContainerOrchestrator { self.prepare_for_start(&resolved_manifest).await?; self.ensure_container_network(&resolved_manifest).await?; - if self.use_quadlet_backends { + if self.use_quadlet_backends && !uses_pasta_network(&resolved_manifest) { // Phase 3.2 path: declarative .container unit + systemctl. // Containers parented under user.slice instead of // archipelago.service's cgroup → no FM3 cascade SIGKILL on // archipelago restart. self.install_via_quadlet(&resolved_manifest, &name).await?; } else { + self.remove_quadlet_unit_if_present(&name).await?; + ensure_user_podman_socket().await?; // Legacy path. Production until tests/lifecycle/run-20x.sh // goes green against the Quadlet path. self.runtime .create_container(&resolved_manifest, &name, 0) .await .with_context(|| format!("create_container {name}"))?; - self.runtime - .start_container(&name) + start_container_scoped_if_pasta(self.runtime.as_ref(), &resolved_manifest, &name) .await .with_context(|| format!("start_container {name}"))?; } self.run_post_start_hooks(&lm.manifest.app.id).await?; - wait_for_manifest_host_ports(&resolved_manifest, 60).await?; + if uses_pasta_network(&resolved_manifest) { + if let Err(err) = wait_for_manifest_host_ports( + &resolved_manifest, + host_port_wait_timeout_secs(&resolved_manifest), + ) + .await + { + tracing::warn!( + app_id = %resolved_manifest.app.id, + container = %name, + error = %err, + "pasta host listener missing after fresh start; trying restart repair" + ); + repair_manifest_host_ports_after_stability( + self.runtime.as_ref(), + &resolved_manifest, + &name, + ) + .await?; + return Ok(()); + } + wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90).await?; + repair_manifest_host_ports_after_stability( + self.runtime.as_ref(), + &resolved_manifest, + &name, + ) + .await?; + } else { + wait_for_manifest_host_ports( + &resolved_manifest, + host_port_wait_timeout_secs(&resolved_manifest), + ) + .await?; + } Ok(()) } @@ -947,7 +1430,9 @@ impl ProdContainerOrchestrator { async fn prepare_for_start(&self, manifest: &AppManifest) -> Result<()> { self.run_pre_start_hooks(&manifest.app.id).await?; + self.ensure_bind_mount_sockets(manifest).await?; self.ensure_bind_mount_dirs(manifest).await?; + self.ensure_manifest_files(manifest).await?; self.apply_data_uid(manifest).await?; self.run_post_data_uid_hooks(&manifest.app.id).await?; Ok(()) @@ -1075,6 +1560,19 @@ impl ProdContainerOrchestrator { .with_context(|| format!("check existing quadlet unit {}", unit_path.display())) } + async fn remove_quadlet_unit_if_present(&self, name: &str) -> Result<()> { + let unit_dir = quadlet::unit_dir() + .await + .context("locate user quadlet unit dir for removal")?; + let unit_path = unit_dir.join(format!("{name}.container")); + if tokio::fs::try_exists(&unit_path).await.unwrap_or(false) { + quadlet::disable_remove(name, &unit_dir) + .await + .with_context(|| format!("remove stale quadlet unit for {name}"))?; + } + Ok(()) + } + /// Drift-sync an existing Quadlet unit file's bytes against what the /// current renderer produces. No-op when the flag is off, when the /// app is a companion (companion.rs owns those units), or when no @@ -1138,6 +1636,7 @@ impl ProdContainerOrchestrator { || restart_for_exec_change || restart_for_health_change) { + self.ensure_resolved_source_available(lm).await?; let service = unit.service_name(); let reason = if restart_required { "stale health gate" @@ -1164,6 +1663,101 @@ impl ProdContainerOrchestrator { Ok(()) } + async fn sync_dependency_quadlet_units(&self, lm: &LoadedManifest) -> Result<()> { + let dependency_ids = manifest_dependency_app_ids(&lm.manifest); + if dependency_ids.is_empty() { + return Ok(()); + } + + let dependencies: Vec = { + let state = self.state.read().await; + dependency_ids + .iter() + .filter_map(|app_id| state.manifests.get(app_id).cloned()) + .collect() + }; + + for dep in dependencies { + let mut resolved = dep.manifest.clone(); + self.resolve_dynamic_env(&mut resolved)?; + let name = compute_container_name(&dep.manifest); + if self.runtime.get_container_status(&name).await.is_err() { + continue; + } + if self.use_quadlet_backends && !uses_pasta_network(&resolved) { + self.sync_quadlet_unit(&dep, &name).await?; + } + } + + Ok(()) + } + + async fn ensure_resolved_source_available(&self, lm: &LoadedManifest) -> Result<()> { + let resolved = lm.manifest.app.container.resolve().ok_or_else(|| { + anyhow::anyhow!( + "manifest for {} has invalid container source (neither image nor build)", + lm.manifest.app.id + ) + })?; + + match resolved { + ResolvedSource::Pull { + image, + image_signature, + .. + } => { + let exists = match self.runtime.image_exists(&image).await { + Ok(exists) => exists, + Err(err) => { + tracing::warn!( + image = %image, + error = %err, + "image existence check failed; pulling image instead" + ); + false + } + }; + if !exists { + self.runtime + .pull_image(&image, image_signature.as_deref()) + .await + .with_context(|| format!("pulling {image}"))?; + } + } + ResolvedSource::Build(mut bcfg) => { + let ctx_path = Path::new(&bcfg.context); + if !ctx_path.is_absolute() { + bcfg.context = lm + .manifest_dir + .join(ctx_path) + .to_string_lossy() + .into_owned(); + } + if !self + .runtime + .image_exists(&bcfg.tag) + .await + .map_err(|err| { + tracing::warn!( + image = %bcfg.tag, + error = %err, + "build image existence check failed; rebuilding image" + ); + err + }) + .unwrap_or(false) + { + self.runtime + .build_image(&bcfg) + .await + .with_context(|| format!("build_image {}", bcfg.tag))?; + } + } + } + + Ok(()) + } + /// Phase 3.2 install path. Renders the manifest as a Quadlet unit, /// writes it atomically into ~/.config/containers/systemd/, asks /// systemd to reload, and starts the generated service. Errors at @@ -1189,7 +1783,7 @@ impl ProdContainerOrchestrator { let Some(network) = manifest.app.container.network.as_deref() else { return Ok(()); }; - if network.is_empty() || matches!(network, "host" | "bridge" | "none" | "slirp4netns") { + if network.is_empty() || is_builtin_network_mode(network) { return Ok(()); } @@ -1329,7 +1923,6 @@ impl ProdContainerOrchestrator { "chown {db_dir} failed with status {status}" )); } - self.repair_btcpay_database_password().await?; Ok(()) } @@ -1359,83 +1952,34 @@ impl ProdContainerOrchestrator { Ok(()) } - async fn repair_btcpay_database_password(&self) -> Result<()> { - let secret_path = self.secrets_dir.join("btcpay-db-password"); - let Ok(db_pass) = tokio::fs::read_to_string(&secret_path).await else { - return Ok(()); - }; - let db_pass = db_pass.trim(); - if db_pass.is_empty() { - return Ok(()); - } - - if self - .runtime - .get_container_status("archy-btcpay-db") - .await - .is_err() - { - return Ok(()); - } - let _ = self.runtime.start_container("archy-btcpay-db").await; - tokio::time::sleep(std::time::Duration::from_secs(2)).await; - - let escaped = db_pass.replace('\'', "''"); - let sql = format!("ALTER USER btcpay WITH PASSWORD '{}';", escaped); - let output = tokio::process::Command::new("podman") - .args([ - "exec", - "archy-btcpay-db", - "psql", - "-U", - "btcpay", - "-d", - "btcpay", - "-c", - &sql, - ]) - .output() - .await - .context("btcpay db password repair: exec psql")?; - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - tracing::warn!(error = %stderr.trim(), "btcpay db password repair failed"); - } - let _ = tokio::process::Command::new("podman") - .args([ - "exec", - "archy-btcpay-db", - "createdb", - "-U", - "btcpay", - "nbxplorer", - ]) - .output() - .await; - Ok(()) - } - async fn start_indeedhub_backends(&self) -> Result<()> { let _ = tokio::process::Command::new("podman") .args(["network", "create", "indeedhub-net"]) .output() .await; - for name in [ - "indeedhub-postgres", - "indeedhub-redis", - "indeedhub-minio", - "indeedhub-relay", - "indeedhub-api", - "indeedhub-ffmpeg", - ] { - let exists = self.runtime.get_container_status(name).await.is_ok(); - if exists { - let _ = self.runtime.start_container(name).await; + for name in INDEEDHUB_BACKEND_CONTAINERS { + let status = match self.runtime.get_container_status(name).await { + Ok(status) => status, + Err(_) => continue, + }; + if !matches!(status.state, ContainerState::Running) { + if let Err(err) = podman_user_scope(&["start", name]).await { + tracing::warn!( + container = %name, + error = %err, + "IndeedHub scoped backend start failed; falling back to runtime start" + ); + self.runtime + .start_container(name) + .await + .with_context(|| format!("start IndeedHub backend {name}"))?; + } tokio::time::sleep(std::time::Duration::from_secs(2)).await; } } self.repair_indeedhub_network_aliases().await; + self.wait_for_indeedhub_dependencies_ready(120).await?; Ok(()) } @@ -1461,10 +2005,16 @@ impl ProdContainerOrchestrator { | ContainerState::Exited | ContainerState::Created | ContainerState::Stopping => { - self.runtime - .start_container("indeedhub") - .await - .context("start IndeedHub frontend during reconcile")?; + if let Err(err) = podman_user_scope(&["start", "indeedhub"]).await { + tracing::warn!( + error = %err, + "IndeedHub scoped frontend start failed; falling back to runtime start" + ); + self.runtime + .start_container("indeedhub") + .await + .context("start IndeedHub frontend during reconcile")?; + } started = true; } ContainerState::Paused => return Ok(ReconcileAction::Left("paused".to_string())), @@ -1475,17 +2025,45 @@ impl ProdContainerOrchestrator { self.repair_indeedhub_network_aliases().await; patch_indeedhub_nostr_provider().await; - if !wait_for_host_port(7778, 10).await { + let frontend_stable = wait_for_container_stable_running( + self.runtime.as_ref(), + "indeedhub", + 5, + INDEEDHUB_FRONTEND_READY_TIMEOUT_SECS, + ) + .await; + if frontend_stable.is_err() || !wait_for_host_port(7778, 10).await { tracing::warn!( - "IndeedHub frontend running but host port 7778 is not listening; restarting" + error = ?frontend_stable.err(), + "IndeedHub frontend did not stay reachable after reconcile; restarting" ); let _ = self.runtime.stop_container("indeedhub").await; - self.runtime - .start_container("indeedhub") - .await - .context("restart IndeedHub frontend after missing host port")?; + if let Err(err) = podman_user_scope(&["start", "indeedhub"]).await { + tracing::warn!( + error = %err, + "IndeedHub scoped frontend restart failed; falling back to runtime start" + ); + self.runtime + .start_container("indeedhub") + .await + .context("restart IndeedHub frontend after failed readiness")?; + } + started = true; tokio::time::sleep(std::time::Duration::from_secs(5)).await; patch_indeedhub_nostr_provider().await; + wait_for_container_stable_running( + self.runtime.as_ref(), + "indeedhub", + 5, + INDEEDHUB_FRONTEND_READY_TIMEOUT_SECS, + ) + .await + .context("IndeedHub frontend did not remain running after restart")?; + if !wait_for_host_port(7778, 30).await { + return Err(anyhow::anyhow!( + "IndeedHub frontend did not expose host port 7778 after restart" + )); + } } if started { @@ -1495,6 +2073,85 @@ impl ProdContainerOrchestrator { } } + async fn wait_for_indeedhub_dependencies_ready(&self, timeout_secs: u64) -> Result<()> { + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); + let mut last = String::from("not checked"); + loop { + let mut all_running = true; + for name in INDEEDHUB_BACKEND_CONTAINERS { + match self.runtime.get_container_status(name).await { + Ok(status) if matches!(status.state, ContainerState::Running) => {} + Ok(status) => { + all_running = false; + last = format!("{name} state {:?}", status.state); + break; + } + Err(err) => { + all_running = false; + last = format!("{name} status error: {err}"); + break; + } + } + } + + if all_running && self.indeedhub_api_dependency_dns_ready().await { + return Ok(()); + } + if all_running { + last = "indeedhub-api dependency DNS not ready".to_string(); + } + + if std::time::Instant::now() >= deadline { + return Err(anyhow::anyhow!( + "IndeedHub dependencies were not ready within {}s ({})", + timeout_secs, + last + )); + } + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + } + } + + async fn indeedhub_api_dependency_dns_ready(&self) -> bool { + let aliases_ready = self.indeedhub_required_aliases_present().await; + if cfg!(test) { + return true; + } + + for host in ["postgres", "redis", "minio", "relay"] { + let Ok(Ok(output)) = tokio::time::timeout( + std::time::Duration::from_secs(5), + tokio::process::Command::new("podman") + .args(["exec", "indeedhub-api", "getent", "hosts", host]) + .output(), + ) + .await + else { + return aliases_ready; + }; + if !output.status.success() { + return aliases_ready; + } + } + true + } + + async fn indeedhub_required_aliases_present(&self) -> bool { + for (container, alias) in [ + ("indeedhub-postgres", "postgres"), + ("indeedhub-redis", "redis"), + ("indeedhub-minio", "minio"), + ("indeedhub-relay", "relay"), + ("indeedhub-api", "api"), + ("indeedhub", "indeedhub"), + ] { + if !self.indeedhub_alias_present(container, alias).await { + return false; + } + } + true + } + async fn repair_indeedhub_network_aliases(&self) { for (container, alias) in [ ("indeedhub-postgres", "postgres"), @@ -1513,6 +2170,9 @@ impl ProdContainerOrchestrator { if !exists { continue; } + if self.indeedhub_alias_present(container, alias).await { + continue; + } let _ = tokio::process::Command::new("podman") .args(["network", "disconnect", "-f", "indeedhub-net", container]) @@ -1532,6 +2192,32 @@ impl ProdContainerOrchestrator { } } + async fn indeedhub_alias_present(&self, container: &str, alias: &str) -> bool { + let output = match tokio::process::Command::new("podman") + .args([ + "inspect", + container, + "--format", + "{{json .NetworkSettings.Networks}}", + ]) + .output() + .await + { + Ok(output) if output.status.success() => output, + _ => return false, + }; + + let Ok(networks) = serde_json::from_slice::(&output.stdout) else { + return false; + }; + networks + .get("indeedhub-net") + .and_then(|network| network.get("Aliases")) + .and_then(|aliases| aliases.as_array()) + .map(|aliases| aliases.iter().any(|value| value.as_str() == Some(alias))) + .unwrap_or(false) + } + async fn cleanup_stale_grafana_port(&self) { let _ = tokio::process::Command::new("pkill") .args(["-f", "pasta.*3001"]) @@ -1635,6 +2321,10 @@ impl ProdContainerOrchestrator { ) })?; env.extend(secrets); + if manifest.app.id == "fedimint" || manifest.app.id == "fedimintd" { + env.retain(|entry| !entry.starts_with("FM_BITCOIND_URL=")); + env.push("FM_BITCOIND_URL=http://bitcoin-knots:8332".to_string()); + } Self::expand_env_placeholders(&mut env); manifest.app.environment = env; Ok(()) @@ -1762,6 +2452,9 @@ impl ProdContainerOrchestrator { if volume.volume_type == "tmpfs" || volume.source.is_empty() { continue; } + if is_podman_socket_bind_source(&volume.source) { + continue; + } let mkdir_status = host_sudo(&["mkdir", "-p", &volume.source]) .await @@ -1789,6 +2482,15 @@ impl ProdContainerOrchestrator { if !Path::new(&volume.source).is_absolute() { continue; } + if is_podman_socket_bind_source(&volume.source) { + ensure_user_podman_socket().await.with_context(|| { + format!( + "ensure podman socket for {} bind mount {}", + manifest.app.id, volume.source + ) + })?; + continue; + } // File mounts are rendered by app-specific hooks; everything else // must exist before the Podman API accepts the bind mount. @@ -1809,6 +2511,113 @@ impl ProdContainerOrchestrator { } Ok(()) } + + async fn ensure_bind_mount_sockets(&self, manifest: &AppManifest) -> Result<()> { + for volume in &manifest.app.volumes { + if volume.volume_type == "tmpfs" || volume.source.is_empty() { + continue; + } + if is_podman_socket_bind_source(&volume.source) { + ensure_user_podman_socket().await.with_context(|| { + format!( + "ensure podman socket for {} bind mount {}", + manifest.app.id, volume.source + ) + })?; + } + } + Ok(()) + } + + async fn ensure_manifest_files(&self, manifest: &AppManifest) -> Result { + let mut outcome = HookOutcome::Unchanged; + for file in &manifest.app.files { + if ensure_generated_file(file) + .await + .with_context(|| format!("ensure manifest file {}", file.path))? + == HookOutcome::Rewritten + { + outcome = HookOutcome::Rewritten; + } + } + Ok(outcome) + } +} + +async fn ensure_generated_file(file: &GeneratedFile) -> Result { + let path = Path::new(&file.path); + if let Ok(existing) = tokio::fs::read_to_string(path).await { + if existing == file.content || !file.overwrite { + return Ok(HookOutcome::Unchanged); + } + } else if path.exists() && !file.overwrite { + return Ok(HookOutcome::Unchanged); + } + + let parent = path + .parent() + .ok_or_else(|| anyhow::anyhow!("generated file path has no parent: {}", file.path))?; + create_dir_all_or_sudo(parent).await?; + write_generated_file_atomically(path, &file.content).await?; + Ok(HookOutcome::Rewritten) +} + +async fn create_dir_all_or_sudo(path: &Path) -> Result<()> { + match tokio::fs::create_dir_all(path).await { + Ok(()) => Ok(()), + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => { + let path = path.to_string_lossy(); + let status = host_sudo(&["mkdir", "-p", &path]) + .await + .with_context(|| format!("creating {path} via sudo"))?; + if !status.success() { + anyhow::bail!("mkdir -p {path} via sudo exited with {status}"); + } + Ok(()) + } + Err(e) => Err(e).with_context(|| format!("creating {}", path.display())), + } +} + +async fn write_generated_file_atomically(path: &Path, content: &str) -> Result<()> { + let file_name = path.file_name().and_then(|s| s.to_str()).ok_or_else(|| { + anyhow::anyhow!("generated file path has no filename: {}", path.display()) + })?; + let tmp = path.with_file_name(format!(".{file_name}.archy-tmp")); + match tokio::fs::write(&tmp, content).await { + Ok(()) => { + tokio::fs::rename(&tmp, path).await.with_context(|| { + format!( + "renaming generated file {} -> {}", + tmp.display(), + path.display() + ) + })?; + Ok(()) + } + Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => { + let script = format!( + "set -eu\ncat > '{}' <<'ARCHYGENERATEDFILE'\n{}ARCHYGENERATEDFILE\n", + shell_quote(&path.to_string_lossy()), + content + ); + let status = host_sudo(&["sh", "-lc", &script]) + .await + .with_context(|| format!("writing generated file {} via sudo", path.display()))?; + if !status.success() { + anyhow::bail!( + "writing generated file {} via sudo exited with {status}", + path.display() + ); + } + Ok(()) + } + Err(e) => Err(e).with_context(|| format!("writing tmp {}", tmp.display())), + } +} + +fn shell_quote(s: &str) -> String { + s.replace('\'', "'\\''") } /// Result of a pre-start hook pass. See `run_pre_start_hooks` docs. @@ -1912,10 +2721,31 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { if let Err(err) = quadlet::stop_service(&format!("{name}.service")).await { tracing::debug!(container = %name, error = %err, "quadlet stop skipped/failed"); } - self.runtime - .stop_container(&name) - .await - .with_context(|| format!("stop_container {name}")) + match self.runtime.stop_container(&name).await { + Ok(()) => Ok(()), + Err(err) => { + let stuck_stopping = self + .runtime + .get_container_status(&name) + .await + .map(|status| matches!(status.state, ContainerState::Stopping)) + .unwrap_or(false); + if stuck_stopping { + tracing::warn!( + app_id = %app_id, + container = %name, + error = %err, + "container stop timed out and left a stuck Stopping record; force-removing record" + ); + self.runtime + .remove_container(&name) + .await + .with_context(|| format!("force remove stuck stopping container {name}"))?; + return Ok(()); + } + Err(err).with_context(|| format!("stop_container {name}")) + } + } } async fn restart(&self, app_id: &str) -> Result<()> { @@ -1923,11 +2753,11 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { let lock = self.app_lock(app_id).await; let _guard = lock.lock().await; let name = compute_container_name(&lm.manifest); + let mut resolved_manifest = lm.manifest.clone(); + self.resolve_dynamic_env(&mut resolved_manifest)?; let service = format!("{name}.service"); if self.quadlet_unit_exists(&name).await? { - let mut resolved_manifest = lm.manifest.clone(); - self.resolve_dynamic_env(&mut resolved_manifest)?; self.prepare_for_start(&resolved_manifest).await?; self.sync_quadlet_unit(&lm, &name).await?; if let Err(err) = quadlet::restart_service(&service).await { @@ -1937,17 +2767,36 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { .with_context(|| format!("restart start quadlet service {service}"))?; } self.run_post_start_hooks(app_id).await?; - wait_for_manifest_host_ports(&resolved_manifest, 60).await?; + wait_for_manifest_host_ports( + &resolved_manifest, + host_port_wait_timeout_secs(&resolved_manifest), + ) + .await?; return Ok(()); } - // Best-effort stop (ignored if already stopped), then start. - let _ = self.runtime.stop_container(&name).await; - self.prepare_for_start(&lm.manifest).await?; - self.runtime - .start_container(&name) + // Best-effort restart. Pasta containers need a real restart in the + // user scope; stop+start can devolve into a no-op if stop times out. + self.prepare_for_start(&resolved_manifest).await?; + restart_container_scoped_if_pasta(self.runtime.as_ref(), &resolved_manifest, &name) .await - .with_context(|| format!("restart start_container {name}")) + .with_context(|| format!("restart container {name}"))?; + self.run_post_start_hooks(app_id).await?; + wait_for_manifest_host_ports( + &resolved_manifest, + host_port_wait_timeout_secs(&resolved_manifest), + ) + .await?; + if uses_pasta_network(&resolved_manifest) { + wait_for_container_stable_running(self.runtime.as_ref(), &name, 15, 90).await?; + repair_manifest_host_ports_after_stability( + self.runtime.as_ref(), + &resolved_manifest, + &name, + ) + .await?; + } + Ok(()) } /// Remove the container. `preserve_data=true` is honored by NOT touching volumes @@ -2079,6 +2928,8 @@ mod tests { created_env: StdMutex>>, /// If set, the next `build_image` call fails with this message. fail_build: StdMutex>, + /// If set, `image_exists` fails for this image reference. + fail_image_exists: StdMutex>, /// If set, `start_container` for this container fails with this message. fail_start: StdMutex>, } @@ -2201,6 +3052,9 @@ mod tests { } async fn image_exists(&self, image_ref: &str) -> Result { self.record(format!("image_exists:{image_ref}")); + if let Some(msg) = self.fail_image_exists.lock().unwrap().get(image_ref) { + return Err(anyhow::anyhow!(msg.clone())); + } Ok(self .images .lock() @@ -2236,6 +3090,34 @@ mod tests { AppManifest::parse(&yaml).unwrap() } + #[test] + fn active_manifest_dependencies_override_stale_user_stopped_entries() { + let parent = AppManifest::parse( + r#" +app: + id: btcpay-server + name: btcpay-server + version: 1.0.0 + container: + image: btcpay:latest + dependencies: + - app_id: archy-nbxplorer + version: ">=1" +"#, + ) + .unwrap(); + let child = pull_manifest("archy-nbxplorer", "nbxplorer:latest"); + let mut user_stopped = HashSet::new(); + user_stopped.insert("archy-nbxplorer".to_string()); + + let required = dependency_manifests_required_by_active_apps( + [&parent, &child].into_iter(), + &user_stopped, + ); + + assert!(required.contains("archy-nbxplorer")); + } + fn manifest_with_container_name(id: &str, image: &str, name: &str) -> AppManifest { let yaml = format!( "app:\n id: {id}\n name: {id}\n version: 1.0.0\n container_name: {name}\n container:\n image: {image}\n" @@ -2307,6 +3189,24 @@ mod tests { AppManifest::parse(&yaml).unwrap() } + fn pull_manifest_with_generated_file(id: &str, image: &str, source: &str) -> AppManifest { + let yaml = format!( + "app:\n id: {id}\n name: {id}\n version: 1.0.0\n container:\n image: {image}\n volumes:\n - type: bind\n source: {source}\n target: /data\n files:\n - path: {source}/config.yaml\n content: |\n key: value\n" + ); + AppManifest::parse(&yaml).unwrap() + } + + fn pull_manifest_with_generated_file_overwrite( + id: &str, + image: &str, + source: &str, + ) -> AppManifest { + let yaml = format!( + "app:\n id: {id}\n name: {id}\n version: 1.0.0\n container:\n image: {image}\n volumes:\n - type: bind\n source: {source}\n target: /data\n files:\n - path: {source}/config.yaml\n overwrite: true\n content: |\n key: new\n" + ); + AppManifest::parse(&yaml).unwrap() + } + fn pull_manifest_filebrowser() -> AppManifest { let yaml = r#" app: @@ -2428,6 +3328,37 @@ app: .any(|c| c == "start_container:archy-bitcoin-ui")); } + #[tokio::test] + async fn install_fresh_builds_when_image_exists_check_fails() { + let rt = Arc::new(MockRuntime::default()); + rt.fail_image_exists.lock().unwrap().insert( + "archy-bitcoin-ui:local".to_string(), + "podman image inspect timed out".to_string(), + ); + let orch = orch_with(rt.clone()).await; + orch.insert_manifest_for_test( + build_manifest( + "bitcoin-ui", + "/opt/archy/docker/bitcoin-ui", + "archy-bitcoin-ui:local", + ), + PathBuf::from("/opt/archy/apps/bitcoin-ui"), + ) + .await; + + orch.install("bitcoin-ui").await.unwrap(); + let calls = rt.calls(); + assert!(calls + .iter() + .any(|c| c == "image_exists:archy-bitcoin-ui:local")); + assert!(calls + .iter() + .any(|c| c.starts_with("build_image:archy-bitcoin-ui:local:"))); + assert!(calls + .iter() + .any(|c| c == "create_container:archy-bitcoin-ui:offset=0")); + } + #[tokio::test] async fn install_bitcoin_ui_renders_nginx_conf_via_hook() { // End-to-end: install("bitcoin-ui") must invoke the pre-start @@ -2583,6 +3514,83 @@ app: .any(|c| c == "create_container:electrumx:offset=0")); } + #[tokio::test] + async fn install_writes_manifest_generated_files_before_create() { + let rt = Arc::new(MockRuntime::default()); + let orch = orch_with(rt.clone()).await; + + let data_dir = tempfile::tempdir().unwrap(); + orch.insert_manifest_for_test( + pull_manifest_with_generated_file( + "meshtastic", + "docker.io/meshtastic/meshtasticd:daily-alpine", + data_dir.path().to_string_lossy().as_ref(), + ), + PathBuf::from("/tmp/meshtastic"), + ) + .await; + + orch.install("meshtastic").await.unwrap(); + + let config_path = data_dir.path().join("config.yaml"); + let config = std::fs::read_to_string(config_path).unwrap(); + assert_eq!(config, "key: value\n"); + let calls = rt.calls(); + assert!(calls + .iter() + .any(|c| c == "create_container:meshtastic:offset=0")); + } + + #[tokio::test] + async fn manifest_generated_files_do_not_overwrite_by_default() { + let rt = Arc::new(MockRuntime::default()); + let orch = orch_with(rt.clone()).await; + + let data_dir = tempfile::tempdir().unwrap(); + let config_path = data_dir.path().join("config.yaml"); + std::fs::write(&config_path, "key: operator\n").unwrap(); + + orch.insert_manifest_for_test( + pull_manifest_with_generated_file( + "meshtastic", + "docker.io/meshtastic/meshtasticd:daily-alpine", + data_dir.path().to_string_lossy().as_ref(), + ), + PathBuf::from("/tmp/meshtastic"), + ) + .await; + + orch.install("meshtastic").await.unwrap(); + + let config = std::fs::read_to_string(config_path).unwrap(); + assert_eq!(config, "key: operator\n"); + } + + #[tokio::test] + async fn manifest_generated_files_can_overwrite_when_declared() { + let rt = Arc::new(MockRuntime::default()); + let orch = orch_with(rt.clone()).await; + + let data_dir = tempfile::tempdir().unwrap(); + let config_path = data_dir.path().join("config.yaml"); + std::fs::write(&config_path, "key: old\n").unwrap(); + + orch.insert_manifest_for_test( + pull_manifest_with_generated_file_overwrite( + "meshtastic", + "docker.io/meshtastic/meshtasticd:daily-alpine", + data_dir.path().to_string_lossy().as_ref(), + ), + PathBuf::from("/tmp/meshtastic"), + ) + .await; + + orch.install("meshtastic").await.unwrap(); + + let config = std::fs::read_to_string(config_path).unwrap(); + assert_eq!(config, "key: new\n"); + } + #[tokio::test] async fn reconcile_noop_when_already_running() { let rt = Arc::new(MockRuntime::default()); diff --git a/core/archipelago/src/container/quadlet.rs b/core/archipelago/src/container/quadlet.rs index c142b6da..f0927472 100644 --- a/core/archipelago/src/container/quadlet.rs +++ b/core/archipelago/src/container/quadlet.rs @@ -34,9 +34,13 @@ use anyhow::{anyhow, Context, Result}; use archipelago_container::AppManifest; use std::fmt::Write as _; use std::path::{Path, PathBuf}; +use std::time::Duration; use tokio::fs; use tokio::process::Command; +const QUADLET_START_TIMEOUT: Duration = Duration::from_secs(90); +const QUADLET_STOP_TIMEOUT: Duration = Duration::from_secs(45); + /// Default rootless quadlet directory. Resolved per-user at runtime via /// `unit_dir()`. Tests pass an explicit dir. pub const DEFAULT_REL_UNIT_DIR: &str = ".config/containers/systemd"; @@ -61,6 +65,12 @@ pub enum NetworkMode { /// attached to it. The network must already exist (orchestrator's /// `ensure_container_network` handles that on every reconcile tick). Bridge(String), + /// Rootless slirp4netns networking. Podman rejects network aliases with + /// this mode, so render only Network=slirp4netns. + Slirp4netns, + /// Rootless pasta networking. This is more reliable than slirp4netns for + /// host port forwarding on long-running web apps. + Pasta, } /// systemd Restart= policy for the generated `.service` unit. Companions @@ -181,6 +191,12 @@ impl QuadletUnit { NetworkMode::Host => { let _ = writeln!(s, "Network=host"); } + NetworkMode::Slirp4netns => { + let _ = writeln!(s, "Network=slirp4netns"); + } + NetworkMode::Pasta => { + let _ = writeln!(s, "Network=pasta"); + } NetworkMode::Bridge(net) => { let _ = writeln!(s, "Network={net}"); for alias in &self.network_aliases { @@ -261,6 +277,13 @@ impl QuadletUnit { } let _ = writeln!(s); let _ = writeln!(s, "[Service]"); + // Dependency-gated apps may legitimately keep their container entrypoint + // in a wait loop before the actual daemon binds ports. Fedimint waits + // for Bitcoin IBD to finish before execing fedimintd; systemd's default + // start timeout otherwise kills the generated podman run job and leaves + // the unit stuck in deactivating. Health/status remains app-level state, + // not a systemd start gate. + let _ = writeln!(s, "TimeoutStartSec=0"); // Restart policy + 10s backoff. RestartSec keeps a crash-loop // from saturating the journal. Companions: Always. Backends: // OnFailure (clean stops stay stopped). @@ -334,6 +357,8 @@ impl QuadletUnit { // either form. other if !other.is_empty() && other != "isolated" => NetworkMode::Bridge(other.into()), _ => match app.container.network.as_deref() { + Some("slirp4netns") => NetworkMode::Slirp4netns, + Some("pasta") => NetworkMode::Pasta, Some(n) if !n.is_empty() && n != "host" => NetworkMode::Bridge(n.into()), _ => NetworkMode::Default, }, @@ -382,7 +407,7 @@ impl QuadletUnit { entrypoint: app.container.entrypoint.clone(), command: app.container.custom_args.clone(), read_only_root: app.security.readonly_root, - no_new_privileges: true, + no_new_privileges: app.security.no_new_privileges, cpu_quota: app.resources.cpu_limit, restart_policy: RestartPolicy::OnFailure, } @@ -436,13 +461,14 @@ fn translate_health_check(hc: &archipelago_container::HealthCheck) -> Option/dev/null 2>&1; then wget -q -T 5 -O /dev/null {0}; elif command -v curl >/dev/null 2>&1; then curl -fsS -m 5 {0}; else exit 0; fi", - final_url + "if command -v wget >/dev/null 2>&1; then wget -q -T {1} -O /dev/null {0}; elif command -v curl >/dev/null 2>&1; then curl -fsS -m {1} {0}; else exit 0; fi", + final_url, helper_timeout ) } "cmd" => hc.endpoint.as_deref()?.to_string(), @@ -456,6 +482,29 @@ fn translate_health_check(hc: &archipelago_container::HealthCheck) -> Option u64 { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return 5; + } + + let (number, multiplier) = match trimmed.chars().last() { + Some('s') | Some('S') => (&trimmed[..trimmed.len() - 1], 1), + Some('m') | Some('M') => (&trimmed[..trimmed.len() - 1], 60), + Some('h') | Some('H') => (&trimmed[..trimmed.len() - 1], 3600), + Some(c) if c.is_ascii_digit() => (trimmed, 1), + _ => return 5, + }; + + number + .trim() + .parse::() + .ok() + .and_then(|n| n.checked_mul(multiplier)) + .filter(|n| *n > 0) + .unwrap_or(5) +} + /// Parse the manifest's memory_limit string into MiB. Recognises the /// forms our manifests actually use: "", "m"/"M", "g"/"G". /// Returns None for anything else; the caller treats None as unlimited. @@ -532,12 +581,21 @@ pub async fn enable_now(service: &str) -> Result<()> { // .service file lives under /run, not /etc — `enable` would refuse // ("transient or generated"). The unit's `[Install] WantedBy` is // honoured at daemon-reload, so we just start it. - let status = Command::new("systemctl") - .args(["--user", "start", service]) - .status() + let status = systemctl_user_status(&["start", service], QUADLET_START_TIMEOUT) .await - .with_context(|| format!("spawn systemctl --user start {service}"))?; + .with_context(|| format!("systemctl --user start {service}"))?; if !status.success() { + if wait_not_deactivating(service, Duration::from_secs(30)).await { + let retry = systemctl_user_status(&["start", service], QUADLET_START_TIMEOUT) + .await + .with_context(|| format!("retry systemctl --user start {service}"))?; + if retry.success() { + return Ok(()); + } + return Err(anyhow!( + "systemctl --user start {service} exited {status}; retry exited {retry}" + )); + } return Err(anyhow!("systemctl --user start {service} exited {status}")); } Ok(()) @@ -545,32 +603,112 @@ pub async fn enable_now(service: &str) -> Result<()> { /// Restart a generated Quadlet service after rewriting a known-bad unit. pub async fn restart_service(service: &str) -> Result<()> { - let status = Command::new("systemctl") - .args(["--user", "restart", service]) - .status() - .await - .with_context(|| format!("spawn systemctl --user restart {service}"))?; - if !status.success() { + // `systemctl restart` hides the stop phase. On rootless Podman nodes a + // generated unit can sit in deactivating while `podman rm -f` hangs, which + // makes RPC/UI state look frozen. Split restart into bounded stop + start + // so stop timeouts can be recovered with an app-scoped kill/reset. + if let Err(err) = stop_service(service).await { + tracing::warn!( + service = %service, + error = %err, + "quadlet stop failed during restart; waiting for unit to settle before start" + ); + } + if !wait_not_deactivating(service, Duration::from_secs(120)).await { return Err(anyhow!( - "systemctl --user restart {service} exited {status}" + "systemctl --user restart {service} could not leave deactivating state" )); } - Ok(()) + enable_now(service).await } /// Stop a generated Quadlet service without removing its unit file. pub async fn stop_service(service: &str) -> Result<()> { - let status = Command::new("systemctl") - .args(["--user", "stop", service]) - .status() - .await - .with_context(|| format!("spawn systemctl --user stop {service}"))?; - if !status.success() { - return Err(anyhow!("systemctl --user stop {service} exited {status}")); + match systemctl_user_status(&["stop", service], QUADLET_STOP_TIMEOUT).await { + Ok(status) if status.success() => Ok(()), + Ok(status) => Err(anyhow!("systemctl --user stop {service} exited {status}")), + Err(err) => { + tracing::warn!( + service = %service, + error = %err, + "quadlet stop timed out/failed; killing app-scoped unit" + ); + kill_and_reset_service(service).await?; + if !wait_not_deactivating(service, Duration::from_secs(60)).await { + return Err(anyhow!( + "systemctl --user stop {service} remained deactivating after app-scoped kill" + )); + } + Ok(()) + } } +} + +async fn systemctl_user_status( + args: &[&str], + timeout: Duration, +) -> Result { + let mut cmd = Command::new("systemctl"); + cmd.arg("--user").args(args); + cmd.kill_on_drop(true); + tokio::time::timeout(timeout, cmd.status()) + .await + .with_context(|| { + format!( + "systemctl --user {} timed out after {}s", + args.join(" "), + timeout.as_secs() + ) + })? + .with_context(|| format!("spawn systemctl --user {}", args.join(" "))) +} + +async fn kill_and_reset_service(service: &str) -> Result<()> { + let _ = systemctl_user_status( + &["kill", "--kill-whom=all", "-s", "SIGKILL", service], + Duration::from_secs(15), + ) + .await; + tokio::time::sleep(Duration::from_secs(2)).await; + let _ = systemctl_user_status(&["reset-failed", service], Duration::from_secs(15)).await; Ok(()) } +async fn wait_not_deactivating(service: &str, timeout: Duration) -> bool { + let deadline = tokio::time::Instant::now() + timeout; + loop { + let Ok(status) = + systemctl_user_output(&["is-active", service], Duration::from_secs(5)).await + else { + return true; + }; + let state = String::from_utf8_lossy(&status.stdout).trim().to_string(); + if state != "deactivating" && state != "activating" { + return true; + } + if tokio::time::Instant::now() >= deadline { + return false; + } + tokio::time::sleep(Duration::from_secs(2)).await; + } +} + +async fn systemctl_user_output(args: &[&str], timeout: Duration) -> Result { + let mut cmd = Command::new("systemctl"); + cmd.arg("--user").args(args); + cmd.kill_on_drop(true); + tokio::time::timeout(timeout, cmd.output()) + .await + .with_context(|| { + format!( + "systemctl --user {} timed out after {}s", + args.join(" "), + timeout.as_secs() + ) + })? + .with_context(|| format!("spawn systemctl --user {}", args.join(" "))) +} + pub fn contains_stale_health_gate(unit_body: &str) -> bool { unit_body.contains("Notify=healthy") || unit_body.contains("TimeoutStartSec=600") @@ -579,6 +717,12 @@ pub fn contains_stale_health_gate(unit_body: &str) -> bool { pub fn health_cmd_changed(old_body: &str, new_body: &str) -> bool { directive_values(old_body, "HealthCmd=") != directive_values(new_body, "HealthCmd=") + || directive_values(old_body, "HealthInterval=") + != directive_values(new_body, "HealthInterval=") + || directive_values(old_body, "HealthTimeout=") + != directive_values(new_body, "HealthTimeout=") + || directive_values(old_body, "HealthRetries=") + != directive_values(new_body, "HealthRetries=") } pub fn publish_ports_changed(old_body: &str, new_body: &str) -> bool { @@ -588,9 +732,11 @@ pub fn publish_ports_changed(old_body: &str, new_body: &str) -> bool { } pub fn network_aliases_changed(old_body: &str, new_body: &str) -> bool { + let old_network = directive_values(old_body, "Network="); + let new_network = directive_values(new_body, "Network="); let old_aliases = directive_values(old_body, "NetworkAlias="); let new_aliases = directive_values(new_body, "NetworkAlias="); - old_aliases != new_aliases + old_network != new_network || old_aliases != new_aliases } pub fn exec_changed(old_body: &str, new_body: &str) -> bool { @@ -620,9 +766,11 @@ pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> { .await; let path = dir.join(format!("{unit_name}.container")); if fs::try_exists(&path).await.unwrap_or(false) { - fs::remove_file(&path) - .await - .with_context(|| format!("remove {}", path.display()))?; + match fs::remove_file(&path).await { + Ok(()) => {} + Err(err) if err.kind() == std::io::ErrorKind::NotFound => {} + Err(err) => return Err(err).with_context(|| format!("remove {}", path.display())), + } } daemon_reload_user().await.ok(); // Defensive: kill the actual container too, in case quadlet left it. @@ -957,6 +1105,48 @@ app: assert!(!s.contains("Network=host")); } + #[test] + fn from_manifest_slirp4netns_omits_network_alias() { + let yaml = r#" +app: + id: vaultwarden + name: Vaultwarden + version: 1.0.0 + container: + image: registry/vaultwarden:1 + network: slirp4netns + security: + network_policy: isolated +"#; + let m = AppManifest::parse(yaml).expect("manifest must parse"); + let s = QuadletUnit::from_manifest(&m, "vaultwarden").render(); + + assert!(s.contains("Network=slirp4netns")); + assert!(!s.contains("NetworkAlias=")); + assert!(!s.contains("--network-alias")); + } + + #[test] + fn from_manifest_pasta_omits_network_alias() { + let yaml = r#" +app: + id: nextcloud + name: Nextcloud + version: 1.0.0 + container: + image: registry/nextcloud:1 + network: pasta + security: + network_policy: isolated +"#; + let m = AppManifest::parse(yaml).expect("manifest must parse"); + let s = QuadletUnit::from_manifest(&m, "nextcloud").render(); + + assert!(s.contains("Network=pasta")); + assert!(!s.contains("NetworkAlias=")); + assert!(!s.contains("--network-alias")); + } + #[test] fn from_manifest_preserves_grafana_data_uid_and_volume_shape() { let yaml = r#" @@ -1056,18 +1246,20 @@ app: assert!(s.contains("HealthRetries=3")); assert!(!s.contains("Notify=healthy")); assert!(!s.contains("TimeoutStartSec=600")); + assert!(s.contains("TimeoutStartSec=0")); } #[test] fn render_skips_health_directives_when_absent() { - // No health spec → no Notify=healthy, no HealthCmd, no TimeoutStartSec - // override. Companions rely on this so their rendered bytes stay - // unchanged. + // No health spec → no Notify=healthy and no HealthCmd. TimeoutStartSec=0 + // is a service-level baseline so dependency-waiting apps are not killed + // by systemd before their app daemon binds. let s = sample_unit().render(); assert!(!s.contains("HealthCmd=")); assert!(!s.contains("Notify=healthy")); assert!(!s.contains("HealthRetries=")); - assert!(!s.contains("TimeoutStartSec=")); + assert!(s.contains("TimeoutStartSec=0")); + assert!(!s.contains("TimeoutStartSec=600")); } #[test] @@ -1094,7 +1286,7 @@ app: let h = translate_health_check(&http).expect("http must translate"); assert_eq!( h.cmd, - "if command -v wget >/dev/null 2>&1; then wget -q -T 5 -O /dev/null http://localhost:8080/health; elif command -v curl >/dev/null 2>&1; then curl -fsS -m 5 http://localhost:8080/health; else exit 0; fi" + "if command -v wget >/dev/null 2>&1; then wget -q -T 3 -O /dev/null http://localhost:8080/health; elif command -v curl >/dev/null 2>&1; then curl -fsS -m 3 http://localhost:8080/health; else exit 0; fi" ); let cmdck = HealthCheck { @@ -1163,6 +1355,25 @@ app: assert!(h.cmd.contains("https://example.local/health")); } + #[test] + fn translate_health_check_http_uses_manifest_timeout_for_helpers() { + use archipelago_container::HealthCheck; + let http = HealthCheck { + check_type: "http".into(), + endpoint: Some("localhost:3000".into()), + path: Some("/api/health".into()), + interval: "30s".into(), + timeout: "30s".into(), + retries: 5, + }; + + let h = translate_health_check(&http).expect("http must translate"); + assert!(h.cmd.contains("wget -q -T 30 "), "got: {}", h.cmd); + assert!(h.cmd.contains("curl -fsS -m 30 "), "got: {}", h.cmd); + assert_eq!(h.timeout, "30s"); + assert_eq!(h.retries, 5); + } + #[test] fn from_manifest_picks_up_health_check() { let yaml = r#" @@ -1201,6 +1412,14 @@ app: assert!(!network_aliases_changed(new, new)); } + #[test] + fn network_aliases_changed_detects_network_mode_drift() { + let old = "[Container]\nNetwork=slirp4netns\n"; + let new = "[Container]\n"; + assert!(network_aliases_changed(old, new)); + assert!(!network_aliases_changed(new, new)); + } + #[test] fn shell_join_escapes_dollars_for_container_runtime_expansion() { let rendered = shell_join(&["sh".into(), "-lc".into(), "echo ${BITCOIN_RPC_PASS}".into()]); @@ -1223,6 +1442,14 @@ app: assert!(!health_cmd_changed(new, new)); } + #[test] + fn health_cmd_changed_detects_probe_timing_drift() { + let old = "[Container]\nHealthCmd=curl -fsS http://localhost:8080/\nHealthTimeout=5s\nHealthRetries=3\n"; + let new = "[Container]\nHealthCmd=curl -fsS http://localhost:8080/\nHealthTimeout=30s\nHealthRetries=5\n"; + assert!(health_cmd_changed(old, new)); + assert!(!health_cmd_changed(new, new)); + } + #[test] fn from_manifest_renders_to_a_systemd_unit() { // End-to-end: parse a real-shape manifest, build the unit, render diff --git a/core/archipelago/src/crash_recovery.rs b/core/archipelago/src/crash_recovery.rs index fd74f26a..56f3f391 100644 --- a/core/archipelago/src/crash_recovery.rs +++ b/core/archipelago/src/crash_recovery.rs @@ -334,6 +334,103 @@ fn is_process_running(pid: u32) -> bool { /// The crash recovery (PID-based) handles dirty shutdowns; this handles clean ones. /// Skips containers that the user intentionally stopped via the UI. pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport { + start_stopped_containers_for(data_dir, false).await +} + +/// Start stopped multi-container stack members after the backend is already +/// ready. These can take minutes after a reboot, so they must not block +/// systemd readiness. +pub async fn start_stopped_stack_containers(data_dir: &Path) -> RecoveryReport { + start_stopped_app_stacks(data_dir).await +} + +async fn start_stopped_app_stacks(data_dir: &Path) -> RecoveryReport { + let user_stopped = load_user_stopped(data_dir).await; + let mut report = RecoveryReport { + total: 0, + recovered: 0, + failed: Vec::new(), + }; + + for stack in stack_recovery_specs() { + if !stack_has_any_container(stack).await { + continue; + } + + info!( + "Recovering stopped {} stack containers after boot", + stack.name + ); + repair_stack_network_aliases(stack).await; + + for container in stack.containers { + if user_stopped.contains(*container) { + info!("Skipping user-stopped container: {}", container); + continue; + } + + match container_state(container).await { + Some(state) if state == "running" => continue, + Some(_) => {} + None => continue, + } + + repair_stack_network_aliases(stack).await; + wait_before_stack_container_recovery(stack, container).await; + + report.total += 1; + if start_existing_container(container).await { + report.recovered += 1; + } else { + report.failed.push((*container).to_string()); + } + } + } + + report +} + +async fn wait_before_stack_container_recovery(stack: &StackRecoverySpec, container: &str) { + if stack.name != "indeedhub" || container != "indeedhub" { + return; + } + + for _ in 0..60 { + if indeedhub_recovery_dependencies_running().await { + repair_stack_network_aliases(stack).await; + break; + } + tokio::time::sleep(Duration::from_secs(2)).await; + } + + for _ in 0..60 { + let ready = podman_output( + &["exec", "indeedhub-api", "getent", "hosts", "minio"], + Duration::from_secs(5), + ) + .await + .map(|output| output.status.success()) + .unwrap_or(false); + if ready { + return; + } + tokio::time::sleep(Duration::from_secs(2)).await; + } +} + +async fn indeedhub_recovery_dependencies_running() -> bool { + for name in ["indeedhub-redis", "indeedhub-minio", "indeedhub-api"] { + if container_state(name).await.as_deref() != Some("running") { + return false; + } + } + true +} + +async fn start_stopped_containers_for( + data_dir: &Path, + include_stack_members: bool, +) -> RecoveryReport { let mut cmd = tokio::process::Command::new("podman"); cmd.args([ "ps", @@ -400,7 +497,7 @@ pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport { let names: Vec = names .into_iter() - .filter(|n| should_auto_start_stopped_container(n)) + .filter(|n| should_auto_start_stopped_container(n, include_stack_members)) .collect(); if names.is_empty() { @@ -429,11 +526,276 @@ pub async fn start_stopped_containers(data_dir: &Path) -> RecoveryReport { recover_containers(&records).await } -fn should_auto_start_stopped_container(name: &str) -> bool { +fn should_auto_start_stopped_container(name: &str, include_stack_members: bool) -> bool { // Keep generic boot recovery narrow. The Rust manifest reconciler owns // managed app stacks; starting every exited Podman container here races // it and resurrects legacy/orphan helper containers. - matches!(name, "filebrowser" | "nostr-rs-relay") + if matches!(name, "filebrowser" | "nostr-rs-relay") { + return true; + } + include_stack_members + && matches!( + name, + "immich_postgres" + | "immich_redis" + | "immich_server" + | "indeedhub-postgres" + | "indeedhub-redis" + | "indeedhub-minio" + | "indeedhub-relay" + | "indeedhub-api" + | "indeedhub-ffmpeg" + | "indeedhub" + | "netbird-server" + | "netbird-dashboard" + | "netbird" + | "saleor-db" + | "saleor-cache" + | "saleor-jaeger" + | "saleor-mailpit" + | "saleor-api" + | "saleor-worker" + | "saleor" + | "saleor-storefront" + | "saleor-storefront-app" + ) +} + +struct StackRecoverySpec { + name: &'static str, + network: &'static str, + aliases: &'static [(&'static str, &'static str)], + containers: &'static [&'static str], +} + +fn stack_recovery_specs() -> &'static [StackRecoverySpec] { + &[ + StackRecoverySpec { + name: "immich", + network: "immich-net", + aliases: &[ + ("immich_postgres", "immich_postgres"), + ("immich_redis", "immich_redis"), + ("immich_server", "immich_server"), + ], + containers: &["immich_postgres", "immich_redis", "immich_server"], + }, + StackRecoverySpec { + name: "indeedhub", + network: "indeedhub-net", + aliases: &[ + ("indeedhub-postgres", "postgres"), + ("indeedhub-redis", "redis"), + ("indeedhub-minio", "minio"), + ("indeedhub-relay", "relay"), + ("indeedhub-api", "api"), + ("indeedhub", "indeedhub"), + ], + containers: &[ + "indeedhub-postgres", + "indeedhub-redis", + "indeedhub-minio", + "indeedhub-relay", + "indeedhub-api", + "indeedhub-ffmpeg", + "indeedhub", + ], + }, + StackRecoverySpec { + name: "netbird", + network: "netbird-net", + aliases: &[ + ("netbird-server", "netbird-server"), + ("netbird-dashboard", "netbird-dashboard"), + ("netbird", "netbird"), + ], + containers: &["netbird-server", "netbird-dashboard", "netbird"], + }, + StackRecoverySpec { + name: "saleor", + network: "saleor-net", + aliases: &[ + ("saleor-db", "db"), + ("saleor-cache", "cache"), + ("saleor-jaeger", "jaeger"), + ("saleor-mailpit", "mailpit"), + ("saleor-api", "api"), + ("saleor-worker", "worker"), + ("saleor", "saleor"), + ("saleor-storefront", "storefront"), + ("saleor-storefront-app", "storefront-app"), + ], + containers: &[ + "saleor-db", + "saleor-cache", + "saleor-jaeger", + "saleor-mailpit", + "saleor-api", + "saleor-worker", + "saleor", + "saleor-storefront", + "saleor-storefront-app", + ], + }, + ] +} + +async fn stack_has_any_container(stack: &StackRecoverySpec) -> bool { + for container in stack.containers { + if container_state(container).await.is_some() { + return true; + } + } + false +} + +async fn repair_stack_network_aliases(stack: &StackRecoverySpec) { + let _ = podman_status( + &["network", "create", stack.network], + Duration::from_secs(15), + ) + .await; + + for (container, alias) in stack.aliases { + if container_state(container).await.is_none() { + continue; + } + if network_alias_present(stack.network, container, alias).await { + continue; + } + + let _ = podman_status( + &["network", "disconnect", "-f", stack.network, container], + Duration::from_secs(15), + ) + .await; + let _ = podman_status( + &[ + "network", + "connect", + "--alias", + alias, + stack.network, + container, + ], + Duration::from_secs(15), + ) + .await; + } +} + +async fn network_alias_present(network_name: &str, container: &str, alias: &str) -> bool { + let output = match podman_output( + &[ + "inspect", + container, + "--format", + "{{json .NetworkSettings.Networks}}", + ], + Duration::from_secs(10), + ) + .await + { + Ok(output) if output.status.success() => output, + _ => return false, + }; + + let Ok(networks) = serde_json::from_slice::(&output.stdout) else { + return false; + }; + networks + .get(network_name) + .and_then(|network| network.get("Aliases")) + .and_then(|aliases| aliases.as_array()) + .map(|aliases| aliases.iter().any(|value| value.as_str() == Some(alias))) + .unwrap_or(false) +} + +async fn container_state(container: &str) -> Option { + let output = podman_output( + &["inspect", container, "--format", "{{.State.Status}}"], + Duration::from_secs(10), + ) + .await + .ok()?; + output + .status + .success() + .then(|| String::from_utf8_lossy(&output.stdout).trim().to_string()) +} + +async fn start_existing_container(container: &str) -> bool { + info!("Recovering stack container: {}", container); + let timeout = match container { + "immich_server" | "netbird-server" => Duration::from_secs(120), + _ => Duration::from_secs(90), + }; + if container_state(container).await.as_deref() == Some("initialized") { + cleanup_container_runtime_state(container).await; + } + match podman_output(&["start", container], timeout).await { + Ok(output) if output.status.success() => { + tokio::time::sleep(Duration::from_secs(3)).await; + if container_state(container).await.as_deref() == Some("exited") { + warn!("Stack container {} exited shortly after start", container); + false + } else { + info!("Successfully recovered stack container: {}", container); + true + } + } + Ok(output) => { + let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); + if stderr.contains("exec.fifo") || stderr.contains("failed to start container") { + cleanup_container_runtime_state(container).await; + if let Ok(retry) = podman_output(&["start", container], timeout).await { + if retry.status.success() { + info!( + "Successfully recovered stack container after cleanup: {}", + container + ); + return true; + } + warn!( + "Failed to recover stack container {} after cleanup: {}", + container, + String::from_utf8_lossy(&retry.stderr).trim() + ); + return false; + } + } + warn!( + "Failed to recover stack container {}: {}", + container, stderr + ); + false + } + Err(e) => { + warn!("Failed to recover stack container {}: {}", container, e); + false + } + } +} + +async fn cleanup_container_runtime_state(container: &str) { + let _ = podman_output( + &["container", "cleanup", container], + Duration::from_secs(30), + ) + .await; +} + +async fn podman_status(args: &[&str], timeout: Duration) -> Option { + podman_output(args, timeout) + .await + .ok() + .map(|output| output.status) +} + +async fn podman_output(args: &[&str], timeout: Duration) -> Result { + let mut cmd = tokio::process::Command::new("podman"); + cmd.args(args); + command_with_timeout(cmd, timeout, &format!("podman {}", args.join(" "))).await } /// Simple tier ordering for boot recovery (mirrors health_monitor tiers). @@ -620,10 +982,17 @@ mod tests { #[test] fn generic_boot_recovery_skips_manifest_owned_and_legacy_stacks() { - assert!(should_auto_start_stopped_container("filebrowser")); - assert!(should_auto_start_stopped_container("nostr-rs-relay")); - assert!(!should_auto_start_stopped_container("bitcoin-knots")); - assert!(!should_auto_start_stopped_container("lnd")); - assert!(!should_auto_start_stopped_container("indeedhub-postgres")); + assert!(should_auto_start_stopped_container("filebrowser", false)); + assert!(should_auto_start_stopped_container("nostr-rs-relay", false)); + assert!(!should_auto_start_stopped_container("bitcoin-knots", false)); + assert!(!should_auto_start_stopped_container("lnd", false)); + assert!(!should_auto_start_stopped_container( + "indeedhub-postgres", + false + )); + assert!(should_auto_start_stopped_container( + "indeedhub-postgres", + true + )); } } diff --git a/core/archipelago/src/electrs_status.rs b/core/archipelago/src/electrs_status.rs index c046637c..f84b4693 100644 --- a/core/archipelago/src/electrs_status.rs +++ b/core/archipelago/src/electrs_status.rs @@ -17,7 +17,8 @@ const ELECTRUMX_DATA_DIR: &str = "/var/lib/archipelago/electrumx"; const ESTIMATED_FULL_INDEX_BYTES: f64 = 130_000_000_000.0; /// Refresh interval for status cache -const CACHE_REFRESH_SECS: u64 = 15; +const CACHE_REFRESH_SECS: u64 = 30; +const CACHE_ERROR_BACKOFF_SECS: u64 = 60; /// Build Bitcoin RPC Basic auth header using shared credentials. async fn bitcoin_rpc_auth() -> String { @@ -70,6 +71,11 @@ pub fn spawn_status_cache() { tokio::spawn(async { loop { let mut fresh = fetch_electrs_sync_status().await; + let sleep_secs = if fresh.status == "waiting" && fresh.bitcoin_height == 0 { + CACHE_ERROR_BACKOFF_SECS + } else { + CACHE_REFRESH_SECS + }; let mut cached = cache().write().await; if fresh.indexed_height == 0 && cached.indexed_height > 0 @@ -92,7 +98,7 @@ pub fn spawn_status_cache() { } *cached = fresh; drop(cached); - tokio::time::sleep(Duration::from_secs(CACHE_REFRESH_SECS)).await; + tokio::time::sleep(Duration::from_secs(sleep_secs)).await; } }); } @@ -146,6 +152,8 @@ fn is_transient_error(err_msg: &str) -> bool { || lower.contains("broken pipe") || lower.contains("eof") || lower.contains("connection") + || lower.contains("503 service unavailable") + || lower.contains("work queue depth exceeded") } /// Fetch ElectrumX indexed height via Electrum protocol (TCP JSON-RPC). diff --git a/core/archipelago/src/health_monitor.rs b/core/archipelago/src/health_monitor.rs index c37fc3ff..5bbeeb3a 100644 --- a/core/archipelago/src/health_monitor.rs +++ b/core/archipelago/src/health_monitor.rs @@ -217,6 +217,7 @@ struct ContainerHealth { app_id: String, state: String, podman_health: Option, + host_port_ready: Option, healthy: bool, } @@ -427,42 +428,92 @@ async fn check_containers() -> Vec { // nbxplorer, mempool-api) and UI containers need auto-restart too. // Only skip ephemeral containers (build infrastructure, init one-shots). - containers - .iter() - .filter_map(|c| { - let name = c.get("Names").and_then(|v| { - if let Some(arr) = v.as_array() { - arr.first().and_then(|n| n.as_str()).map(|s| s.to_string()) - } else { - v.as_str().map(|s| s.to_string()) - } - })?; - - // Skip podman-compose infrastructure and one-shot init containers - if name.starts_with("indeedhub-build_") || name.contains("-init") { - return None; + let mut out = Vec::new(); + for c in &containers { + let name = c.get("Names").and_then(|v| { + if let Some(arr) = v.as_array() { + arr.first().and_then(|n| n.as_str()).map(|s| s.to_string()) + } else { + v.as_str().map(|s| s.to_string()) } + }); + let Some(name) = name else { + continue; + }; - let app_id = name.strip_prefix("archy-").unwrap_or(&name).to_string(); + // Skip podman-compose infrastructure and one-shot init containers + if name.starts_with("indeedhub-build_") || name.contains("-init") { + continue; + } - let state = c - .get("State") + let app_id = name.strip_prefix("archy-").unwrap_or(&name).to_string(); + + let state = c + .get("State") + .and_then(|v| v.as_str()) + .unwrap_or("unknown") + .to_lowercase(); + + let podman_health = parse_podman_health(c, &state); + let host_ports = host_tcp_ports_from_container(c); + let host_port_ready = if host_ports.is_empty() { + None + } else { + Some(host_ports_ready(&host_ports).await) + }; + let healthy = state == "running" + && podman_health.as_deref() != Some("unhealthy") + && host_port_ready != Some(false); + + out.push(ContainerHealth { + name, + app_id, + state, + podman_health, + host_port_ready, + healthy, + }); + } + out +} + +fn host_tcp_ports_from_container(c: &serde_json::Value) -> Vec { + let Some(ports) = c.get("Ports").and_then(|v| v.as_array()) else { + return Vec::new(); + }; + + let mut out: Vec = ports + .iter() + .filter(|p| { + p.get("protocol") .and_then(|v| v.as_str()) - .unwrap_or("unknown") - .to_lowercase(); - - let podman_health = parse_podman_health(c, &state); - let healthy = state == "running" && podman_health.as_deref() != Some("unhealthy"); - - Some(ContainerHealth { - name, - app_id, - state, - podman_health, - healthy, - }) + .unwrap_or("tcp") + .eq_ignore_ascii_case("tcp") }) - .collect() + .filter_map(|p| { + p.get("host_port") + .and_then(|v| v.as_u64()) + .and_then(|port| u16::try_from(port).ok()) + }) + .collect(); + out.sort_unstable(); + out.dedup(); + out +} + +async fn host_ports_ready(ports: &[u16]) -> bool { + for port in ports { + let ready = tokio::time::timeout( + std::time::Duration::from_secs(2), + tokio::net::TcpStream::connect(("127.0.0.1", *port)), + ) + .await + .is_ok_and(|r| r.is_ok()); + if !ready { + return false; + } + } + true } fn live_container_ids(containers: &[serde_json::Value]) -> HashSet { @@ -640,33 +691,41 @@ fn parse_health_from_status(status: &str) -> Option { (start < end).then(|| status[start + 1..end].to_string()) } -/// Try to restart a container. -async fn restart_container(name: &str) -> bool { - info!("Auto-restarting unhealthy container: {}", name); +/// Try to recover a container. Running containers need a real restart so +/// rootless network helpers such as pasta are recreated; `podman start` is a +/// no-op for a running container with a missing host listener. +async fn restart_container(name: &str, state: &str) -> bool { + let action = if state == "running" { + "restart" + } else { + "start" + }; + info!("Auto-{}ing unhealthy container: {}", action, name); let result = tokio::time::timeout( std::time::Duration::from_secs(120), - tokio::process::Command::new("podman") - .args(["start", name]) + tokio::process::Command::new("systemd-run") + .args(["--user", "--scope", "--quiet", "--collect", "podman"]) + .args([action, name]) .output(), ) .await; match result { Ok(Ok(output)) if output.status.success() => { - info!("Successfully restarted container: {}", name); + info!("Successfully recovered container: {}", name); true } Ok(Ok(output)) => { let stderr = String::from_utf8_lossy(&output.stderr); - warn!("Failed to restart container {}: {}", name, stderr.trim()); + warn!("Failed to {} container {}: {}", action, name, stderr.trim()); false } Ok(Err(e)) => { - warn!("Failed to execute podman start for {}: {}", name, e); + warn!("Failed to execute podman {} for {}: {}", action, name, e); false } Err(_) => { - warn!("Timeout starting container {} (120s)", name); + warn!("Timeout {}ing container {} (120s)", action, name); false } } @@ -684,9 +743,10 @@ pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { if crate::crash_recovery::is_recovery_complete() { break; } - // Safety timeout: start anyway after 5 minutes even if recovery hangs - if wait_start.elapsed().as_secs() > 300 { - warn!("Health monitor: boot recovery did not complete within 5 minutes, starting anyway"); + // Safety timeout: start anyway after 30 minutes even if recovery hangs. + // Stack recovery can take many minutes on low-resource nodes after reboot. + if wait_start.elapsed().as_secs() > 1800 { + warn!("Health monitor: boot recovery did not complete within 30 minutes, starting anyway"); break; } tokio::time::sleep(std::time::Duration::from_secs(5)).await; @@ -827,6 +887,7 @@ pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { } // Handle exited, stopped, created, and Podman-unhealthy running containers. if container.podman_health.as_deref() == Some("unhealthy") + || container.host_port_ready == Some(false) || container.state == "exited" || container.state == "stopped" || container.state == "created" @@ -932,7 +993,7 @@ pub fn spawn_health_monitor(state: Arc, data_dir: PathBuf) { .unwrap_or(&90) ); - let restarted = restart_container(&container.name).await; + let restarted = restart_container(&container.name, &container.state).await; if !restarted || attempt >= MAX_RESTART_ATTEMPTS { let notification = Notification { @@ -1088,6 +1149,7 @@ mod tests { app_id: "bitcoin-knots".to_string(), state: "running".to_string(), podman_health: Some("healthy".to_string()), + host_port_ready: None, healthy: true, }; assert!(health.healthy); @@ -1103,6 +1165,7 @@ mod tests { app_id: "mempool-web".to_string(), state: "exited".to_string(), podman_health: None, + host_port_ready: None, healthy: false, }; assert!(!health.healthy); @@ -1193,6 +1256,7 @@ mod tests { app_id: "indeedhub-postgres".into(), state: "running".into(), podman_health: None, + host_port_ready: None, healthy: true, }, ContainerHealth { @@ -1200,6 +1264,7 @@ mod tests { app_id: "indeedhub-redis".into(), state: "running".into(), podman_health: None, + host_port_ready: None, healthy: true, }, ContainerHealth { @@ -1207,6 +1272,7 @@ mod tests { app_id: "indeedhub-api".into(), state: "exited".into(), podman_health: None, + host_port_ready: None, healthy: false, }, ]; @@ -1217,6 +1283,7 @@ mod tests { app_id: "indeedhub-redis".into(), state: "running".into(), podman_health: None, + host_port_ready: None, healthy: true, }]; assert!(!deps_are_running("indeedhub-api", &partial)); @@ -1229,6 +1296,7 @@ mod tests { app_id: "bitcoin-core".into(), state: "running".into(), podman_health: None, + host_port_ready: None, healthy: true, }]; assert!(deps_are_running("lnd", &core)); @@ -1238,6 +1306,7 @@ mod tests { app_id: "bitcoin-knots".into(), state: "running".into(), podman_health: None, + host_port_ready: None, healthy: true, }]; assert!(deps_are_running("fedimint", &knots)); @@ -1247,6 +1316,7 @@ mod tests { app_id: "bitcoin-core".into(), state: "stopped".into(), podman_health: None, + host_port_ready: None, healthy: false, }]; assert!(!deps_are_running("electrumx", &stopped)); @@ -1259,6 +1329,7 @@ mod tests { app_id: "bitcoin-core".into(), state: "running".into(), podman_health: None, + host_port_ready: None, healthy: true, }]; @@ -1274,6 +1345,7 @@ mod tests { app_id: "bitcoin-core".into(), state: "stopped".into(), podman_health: None, + host_port_ready: None, healthy: false, }]; diff --git a/core/archipelago/src/main.rs b/core/archipelago/src/main.rs index d534f936..93e24bdd 100644 --- a/core/archipelago/src/main.rs +++ b/core/archipelago/src/main.rs @@ -20,6 +20,7 @@ use anyhow::{Context, Result}; use std::net::SocketAddr; use std::sync::Arc; +use std::time::Duration; use tokio::signal; use tokio::sync::Notify; use tracing::info; @@ -168,8 +169,6 @@ async fn main() -> Result<()> { boot_report.recovered, boot_report.total, boot_report.failed ); } - crash_recovery::mark_recovery_complete(); - // Construct the container orchestrator once. In prod mode we load the // on-disk app manifests, do an initial adoption pass, and spawn the // BootReconciler loop (Step 5/6 of the rust-orchestrator migration). @@ -195,17 +194,20 @@ async fn main() -> Result<()> { } // Adoption pass: link existing podman containers back to their // manifests so the reconciler doesn't recreate them. - match prod.adopt_existing().await { - Ok(report) => { + match tokio::time::timeout(Duration::from_secs(35), prod.adopt_existing()).await { + Ok(Ok(report)) => { info!( "🔗 Adopted {} existing container(s): {:?}", report.adopted.len(), report.adopted ); } - Err(e) => { + Ok(Err(e)) => { tracing::warn!(error = %e, "prod orchestrator: adopt_existing failed (non-fatal)"); } + Err(_) => { + tracing::warn!("prod orchestrator: adopt_existing timed out after 35s (non-fatal)") + } } // Spawn the boot reconciler loop. Runs an initial reconcile // immediately, then re-checks every RECONCILER_DEFAULT_INTERVAL @@ -272,6 +274,23 @@ async fn main() -> Result<()> { // Spawn periodic container snapshot (for crash recovery) crash_recovery::spawn_snapshot_task(config.data_dir.clone()); + // Recover stopped multi-container stack members after the backend is up. + // This can take minutes on busy nodes after a reboot, so keep it out of + // the synchronous systemd startup path. + { + let data_dir = config.data_dir.clone(); + tokio::spawn(async move { + let report = crash_recovery::start_stopped_stack_containers(&data_dir).await; + if report.total > 0 { + info!( + "🔄 Stack boot recovery: {}/{} containers started (failed: {:?})", + report.recovered, report.total, report.failed + ); + } + crash_recovery::mark_recovery_complete(); + }); + } + // Spawn disk space monitor (warns at 85%, auto-cleans at 90%) disk_monitor::spawn_disk_monitor(config.data_dir.clone()); diff --git a/core/archipelago/src/monitoring/notifications.rs b/core/archipelago/src/monitoring/notifications.rs index 45cccac1..6fdf478b 100644 --- a/core/archipelago/src/monitoring/notifications.rs +++ b/core/archipelago/src/monitoring/notifications.rs @@ -1,15 +1,20 @@ use crate::monitoring::types::{AlertRuleKind, FiredAlert}; use crate::webhooks::{self, WebhookEvent, WebhookPayload}; +use chrono::Utc; +use std::collections::HashSet; use std::path::Path; use std::sync::Arc; use tracing::info; +const NOTIFICATION_MAX_AGE_SECS: i64 = 30 * 60; + /// Push fired alerts as notifications to the state manager (broadcast via WebSocket). pub(crate) async fn push_alert_notifications( state_mgr: &Arc, alerts: &[FiredAlert], ) { let (mut data, _rev) = state_mgr.get_snapshot().await; + prune_stale_alert_notifications(&mut data.notifications, alerts); for alert in alerts { let level = match alert.kind { AlertRuleKind::DiskUsage | AlertRuleKind::RamUsage => { @@ -27,7 +32,7 @@ pub(crate) async fn push_alert_notifications( level, title: format!("{:?} Alert", alert.kind), message: alert.message.clone(), - timestamp: chrono::Utc::now().to_rfc3339(), + timestamp: Utc::now().to_rfc3339(), app_id: None, }; data.notifications.push(notification); @@ -40,6 +45,30 @@ pub(crate) async fn push_alert_notifications( info!("Fired {} alert(s)", alerts.len()); } +fn prune_stale_alert_notifications( + notifications: &mut Vec, + alerts: &[FiredAlert], +) { + let now = Utc::now(); + let active_ids: HashSet<&str> = alerts.iter().map(|alert| alert.id.as_str()).collect(); + notifications.retain(|notification| { + if active_ids.contains(notification.id.as_str()) { + return false; + } + if notification.app_id.is_some() || notification.id.starts_with("health-") { + return true; + } + match chrono::DateTime::parse_from_rfc3339(¬ification.timestamp) { + Ok(ts) => { + now.signed_duration_since(ts.with_timezone(&Utc)) + .num_seconds() + <= NOTIFICATION_MAX_AGE_SECS + } + Err(_) => false, + } + }); +} + /// Deliver webhook notifications for alerts that map to webhook events. pub(crate) async fn deliver_alert_webhooks(data_dir: &Path, alerts: &[FiredAlert]) { for alert in alerts { @@ -53,7 +82,7 @@ pub(crate) async fn deliver_alert_webhooks(data_dir: &Path, alerts: &[FiredAlert event, title: format!("{:?} Alert", alert.kind), message: alert.message.clone(), - timestamp: chrono::Utc::now().to_rfc3339(), + timestamp: Utc::now().to_rfc3339(), node_id: String::new(), details: Some(serde_json::json!({ "value": alert.value, @@ -64,3 +93,46 @@ pub(crate) async fn deliver_alert_webhooks(data_dir: &Path, alerts: &[FiredAlert } } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::data_model::{Notification, NotificationLevel}; + + fn notification(id: &str, timestamp: String, app_id: Option<&str>) -> Notification { + Notification { + id: id.to_string(), + level: NotificationLevel::Warning, + title: "DiskUsage Alert".to_string(), + message: "Disk warning".to_string(), + timestamp, + app_id: app_id.map(str::to_string), + } + } + + #[test] + fn prune_stale_alert_notifications_removes_duplicate_and_old_generic_alerts() { + let active_alert = FiredAlert { + id: "alert-active".to_string(), + kind: AlertRuleKind::DiskUsage, + message: "Disk warning".to_string(), + value: 90.0, + threshold: 85.0, + timestamp: Utc::now().timestamp(), + acknowledged: false, + }; + let old_timestamp = (Utc::now() - chrono::Duration::minutes(45)).to_rfc3339(); + let fresh_timestamp = (Utc::now() - chrono::Duration::minutes(5)).to_rfc3339(); + let mut notifications = vec![ + notification("alert-active", fresh_timestamp.clone(), None), + notification("alert-old", old_timestamp, None), + notification("alert-fresh", fresh_timestamp.clone(), None), + notification("health-indeedhub-1", fresh_timestamp, Some("indeedhub")), + ]; + + prune_stale_alert_notifications(&mut notifications, &[active_alert]); + + let ids: Vec<&str> = notifications.iter().map(|n| n.id.as_str()).collect(); + assert_eq!(ids, vec!["alert-fresh", "health-indeedhub-1"]); + } +} diff --git a/core/archipelago/src/monitoring/telemetry.rs b/core/archipelago/src/monitoring/telemetry.rs index 0b1c4482..2cc1a6ce 100644 --- a/core/archipelago/src/monitoring/telemetry.rs +++ b/core/archipelago/src/monitoring/telemetry.rs @@ -71,30 +71,49 @@ async fn build_telemetry_report( data_dir: &std::path::Path, ) -> anyhow::Result { // Anonymous node ID — truncated SHA-256 hash of pubkey - let (node_id, version, container_count, running_count, peer_count) = if let Some(ref sm) = state - { - let (data, _) = sm.get_snapshot().await; - let id = { - use sha2::{Digest, Sha256}; - let mut h = Sha256::new(); - h.update(data.server_info.pubkey.as_bytes()); - hex::encode(h.finalize())[..16].to_string() + let (node_id, version, container_count, running_count, peer_count, containers) = + if let Some(ref sm) = state { + let (data, _) = sm.get_snapshot().await; + let id = { + use sha2::{Digest, Sha256}; + let mut h = Sha256::new(); + h.update(data.server_info.pubkey.as_bytes()); + hex::encode(h.finalize())[..16].to_string() + }; + let containers: Vec = data + .package_data + .iter() + .map(|(id, pkg)| { + serde_json::json!({ + "id": id, + "state": format!("{:?}", pkg.state), + "version": pkg.manifest.version, + }) + }) + .collect(); + let running = data + .package_data + .values() + .filter(|p| matches!(p.state, crate::data_model::PackageState::Running)) + .count(); + ( + id, + data.server_info.version.clone(), + data.package_data.len(), + running, + data.peer_health.len(), + containers, + ) + } else { + ( + "unknown".to_string(), + "unknown".to_string(), + 0, + 0, + 0, + Vec::new(), + ) }; - let running = data - .package_data - .values() - .filter(|p| matches!(p.state, crate::data_model::PackageState::Running)) - .count(); - ( - id, - data.server_info.version.clone(), - data.package_data.len(), - running, - data.peer_health.len(), - ) - } else { - ("unknown".to_string(), "unknown".to_string(), 0, 0, 0) - }; // System info let cpu_cores = std::thread::available_parallelism() @@ -153,6 +172,7 @@ async fn build_telemetry_report( "cpu_pct": (cpu_pct * 10.0).round() / 10.0, "mem_pct": (mem_pct * 10.0).round() / 10.0, "disk_pct": (disk_pct * 10.0).round() / 10.0, + "containers": containers, "container_count": container_count, "running_count": running_count, "federation_peers": peer_count, @@ -166,16 +186,28 @@ async fn post_telemetry_report(url: &str, report: &serde_json::Value) -> anyhow: let client = reqwest::Client::builder() .timeout(std::time::Duration::from_secs(10)) .build()?; + let payload = serde_json::json!({ + "method": "telemetry.ingest", + "params": report, + }); let response = client .post(url) .header("Content-Type", "application/json") .header("User-Agent", "Archipelago-Telemetry/1.0") - .json(report) + .json(&payload) .send() .await?; if !response.status().is_success() { anyhow::bail!("Collector returned {}", response.status()); } + let status = response.status(); + let body: serde_json::Value = response.json().await.unwrap_or_default(); + if let Some(error) = body.get("error") { + anyhow::bail!("Collector RPC error: {}", error); + } + if body.get("result").is_none() { + anyhow::bail!("Collector returned {} without RPC result", status); + } Ok(()) } diff --git a/core/archipelago/src/server.rs b/core/archipelago/src/server.rs index 30eaa3d2..dc1cafbc 100644 --- a/core/archipelago/src/server.rs +++ b/core/archipelago/src/server.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use std::net::SocketAddr; use std::sync::Arc; use std::time::{Duration, Instant}; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::net::TcpListener; use tracing::{debug, error, info, warn}; @@ -331,6 +332,7 @@ impl Server { // lifecycle op, and to break out if the spawned task dies // without ever writing a final state. let mut transitional_since: HashMap = HashMap::new(); + let mut scan_backoff_until: Option = None; if let Err(e) = scan_and_update_packages( &scanner, &state, @@ -342,6 +344,10 @@ impl Server { .await { error!("Failed to scan containers: {}", e); + if is_podman_scan_timeout(&e) { + scan_backoff_until = Some(Instant::now() + Duration::from_secs(30)); + warn!("Podman container scan timed out; backing off scans for 30s"); + } } // Bump the scan-completion counter so any caller waiting on a // kicked scan (install/update success path) can proceed. @@ -364,8 +370,16 @@ impl Server { debug!("Scan kicked by install/update success — running immediately"); } } + if let Some(until) = scan_backoff_until { + if Instant::now() < until { + debug!("Skipping container scan — Podman scan backoff active"); + scan_tick.send_modify(|n| *n = n.wrapping_add(1)); + continue; + } + } if scanning.load(std::sync::atomic::Ordering::Relaxed) { debug!("Skipping container scan — previous scan still in progress"); + scan_tick.send_modify(|n| *n = n.wrapping_add(1)); continue; } scanning.store(true, std::sync::atomic::Ordering::Relaxed); @@ -380,6 +394,12 @@ impl Server { .await { error!("Failed to update containers: {}", e); + if is_podman_scan_timeout(&e) { + scan_backoff_until = Some(Instant::now() + Duration::from_secs(30)); + warn!("Podman container scan timed out; backing off scans for 30s"); + } + } else { + scan_backoff_until = None; } scan_tick.send_modify(|n| *n = n.wrapping_add(1)); scanning.store(false, std::sync::atomic::Ordering::Relaxed); @@ -847,10 +867,10 @@ const TRANSITIONAL_STUCK_TIMEOUT: Duration = Duration::from_secs(120); const INSTALLING_STUCK_TIMEOUT: Duration = Duration::from_secs(20 * 60); fn transitional_stuck_timeout(state: &crate::data_model::PackageState) -> Duration { - if *state == crate::data_model::PackageState::Installing { - INSTALLING_STUCK_TIMEOUT - } else { - TRANSITIONAL_STUCK_TIMEOUT + use crate::data_model::PackageState::*; + match state { + Installing | Starting | Restarting => INSTALLING_STUCK_TIMEOUT, + _ => TRANSITIONAL_STUCK_TIMEOUT, } } @@ -874,6 +894,18 @@ fn is_transitional(state: &crate::data_model::PackageState) -> bool { ) } +fn absent_transitional_replacement( + state: &crate::data_model::PackageState, +) -> Option { + match state { + // A stop operation is complete once the container record disappears. + // Do not leave the app card wedged in "Stopping..." just because the + // background task died or the backend restarted before it wrote back. + crate::data_model::PackageState::Stopping => Some(crate::data_model::PackageState::Stopped), + _ => None, + } +} + /// Merge a fresh scan entry `fresh` into `existing` while preserving /// `existing.state` (which is transitional — the RPC spawn task owns it). /// Non-state observability fields are taken from `fresh` so the UI still @@ -881,8 +913,17 @@ fn is_transitional(state: &crate::data_model::PackageState) -> bool { fn merge_preserving_transitional( existing: &crate::data_model::PackageDataEntry, fresh: &crate::data_model::PackageDataEntry, + user_stop_requested: bool, ) -> crate::data_model::PackageDataEntry { let state = match (&existing.state, &fresh.state) { + // A user-initiated stop must keep showing Stopping while podman still + // reports Running. Repair/restart transitions do not have a user-stop + // marker, so a fresh Running scan means the app recovered. + (crate::data_model::PackageState::Stopping, crate::data_model::PackageState::Running) + if !user_stop_requested => + { + fresh.state.clone() + } // Removing with a live running container is stale: uninstall either // failed or Archipelago restarted before the spawned task could revert // state. Let the scanner recover the UI immediately instead of @@ -909,6 +950,11 @@ fn merge_preserving_transitional( } } +fn is_podman_scan_timeout(error: &anyhow::Error) -> bool { + let msg = format!("{:#}", error); + msg.contains("podman ps") && msg.contains("timed out") +} + async fn scan_and_update_packages( scanner: &DockerPackageScanner, state: &StateManager, @@ -925,6 +971,7 @@ async fn scan_and_update_packages( pkg.exit_code = None; } } + normalize_reachable_package_health(&mut packages).await; let (current_data, _) = state.get_snapshot().await; let tor_addr = docker_packages::read_tor_address("archipelago").await; @@ -992,7 +1039,11 @@ async fn scan_and_update_packages( // observability fields (health, exit_code, lan_address // via installed) from the fresh scan so the UI still // sees live readings. - let merged_entry = merge_preserving_transitional(existing_entry, pkg); + let merged_entry = merge_preserving_transitional( + existing_entry, + pkg, + user_stopped.contains(id), + ); if existing.cloned() != Some(merged_entry.clone()) { merged.insert(id.clone(), merged_entry); changed = true; @@ -1029,6 +1080,19 @@ async fn scan_and_update_packages( // owner (spawn_task) is responsible for clearing state, not us. if let Some(entry) = merged.get(&id) { if is_transitional(&entry.state) { + if let Some(replacement) = absent_transitional_replacement(&entry.state) { + let mut updated = entry.clone(); + updated.state = replacement; + updated.health = None; + updated.exit_code = None; + updated.install_progress = None; + updated.uninstall_stage = None; + merged.insert(id.clone(), updated); + transitional_since.remove(&id); + absence_tracker.remove(&id); + changed = true; + continue; + } let entered = *transitional_since.entry(id.clone()).or_insert(now); let timeout = transitional_stuck_timeout(&entry.state); if now.duration_since(entered) > timeout { @@ -1088,6 +1152,99 @@ async fn scan_and_update_packages( Ok(()) } +async fn normalize_reachable_package_health( + packages: &mut HashMap, +) { + for (id, pkg) in packages.iter_mut() { + if pkg.state != crate::data_model::PackageState::Running { + continue; + } + if !matches!(pkg.health.as_deref(), Some("starting" | "unhealthy" | "1")) { + continue; + } + let Some(port) = pkg + .installed + .as_ref() + .and_then(|i| i.interface_addresses.get("main")) + .and_then(|a| a.lan_address.as_deref()) + .and_then(port_from_url) + .or_else(|| fallback_package_port(id)) + else { + continue; + }; + if frontend_port_http_ready(port).await { + debug!(app_id = %id, port, "normalizing reachable package health to healthy"); + pkg.health = Some("healthy".to_string()); + ensure_main_lan_address(pkg, port); + } + } +} + +async fn frontend_port_http_ready(port: u16) -> bool { + let Ok(Ok(mut stream)) = tokio::time::timeout( + Duration::from_secs(2), + tokio::net::TcpStream::connect(("127.0.0.1", port)), + ) + .await + else { + return false; + }; + + let request = b"GET / HTTP/1.1\r\nHost: 127.0.0.1\r\nConnection: close\r\n\r\n"; + if stream.write_all(request).await.is_err() { + return false; + } + + let mut buf = [0u8; 64]; + let Ok(Ok(n)) = tokio::time::timeout(Duration::from_secs(2), stream.read(&mut buf)).await + else { + return false; + }; + if n == 0 { + return false; + } + + let head = String::from_utf8_lossy(&buf[..n]); + head.starts_with("HTTP/1.1 2") + || head.starts_with("HTTP/1.1 3") + || head.starts_with("HTTP/1.0 2") + || head.starts_with("HTTP/1.0 3") +} + +fn ensure_main_lan_address(pkg: &mut crate::data_model::PackageDataEntry, port: u16) { + let Some(installed) = pkg.installed.as_mut() else { + return; + }; + let main = installed + .interface_addresses + .entry("main".to_string()) + .or_insert_with(|| crate::data_model::InterfaceAddress { + tor_address: String::new(), + lan_address: None, + }); + if main.lan_address.is_none() { + main.lan_address = Some(format!("http://localhost:{port}")); + } +} + +fn fallback_package_port(app_id: &str) -> Option { + match app_id { + "fedimint" | "fedimintd" => Some(8175), + "filebrowser" => Some(8083), + "indeedhub" => Some(7778), + "nginx-proxy-manager" => Some(8081), + "nostr-rs-relay" => Some(18081), + _ => None, + } +} + +fn port_from_url(url: &str) -> Option { + let after_scheme = url.split_once("://").map(|(_, rest)| rest).unwrap_or(url); + let host_port = after_scheme.split('/').next().unwrap_or(after_scheme); + let port = host_port.rsplit_once(':')?.1; + port.parse::().ok() +} + /// Register Archipelago DWN protocols on startup. async fn register_dwn_protocols(data_dir: &std::path::Path) -> Result<()> { use crate::network::dwn_store::{DwnStore, ProtocolDefinition}; @@ -1211,10 +1368,19 @@ mod merge_tests { // not clobber the transitional state owned by the RPC spawn task. let existing = make_entry(PackageState::Stopping, Some("healthy")); let fresh = make_entry(PackageState::Running, Some("starting")); - let merged = merge_preserving_transitional(&existing, &fresh); + let merged = merge_preserving_transitional(&existing, &fresh, true); assert_eq!(merged.state, PackageState::Stopping); } + #[test] + fn non_user_stopping_recovers_when_container_is_running() { + let existing = make_entry(PackageState::Stopping, Some("unknown")); + let fresh = make_entry(PackageState::Running, Some("healthy")); + let merged = merge_preserving_transitional(&existing, &fresh, false); + assert_eq!(merged.state, PackageState::Running); + assert_eq!(merged.health.as_deref(), Some("healthy")); + } + #[test] fn merges_fresh_observability_fields() { // Non-state observability fields (health, exit_code, installed) @@ -1224,7 +1390,7 @@ mod merge_tests { existing.exit_code = None; let mut fresh = make_entry(PackageState::Running, Some("unhealthy")); fresh.exit_code = Some(0); - let merged = merge_preserving_transitional(&existing, &fresh); + let merged = merge_preserving_transitional(&existing, &fresh, true); assert_eq!(merged.state, PackageState::Stopping); assert_eq!(merged.health.as_deref(), Some("unhealthy")); assert_eq!(merged.exit_code, Some(0)); @@ -1234,7 +1400,7 @@ mod merge_tests { fn stale_removing_recovers_when_container_is_running() { let existing = make_entry(PackageState::Removing, Some("unknown")); let fresh = make_entry(PackageState::Running, Some("healthy")); - let merged = merge_preserving_transitional(&existing, &fresh); + let merged = merge_preserving_transitional(&existing, &fresh, false); assert_eq!(merged.state, PackageState::Running); assert_eq!(merged.health.as_deref(), Some("healthy")); } @@ -1272,4 +1438,20 @@ mod merge_tests { TRANSITIONAL_STUCK_TIMEOUT ); } + + #[test] + fn absent_stopping_transitions_to_stopped() { + assert_eq!( + absent_transitional_replacement(&PackageState::Stopping), + Some(PackageState::Stopped) + ); + } + + #[test] + fn absent_installing_still_waits_for_owner() { + assert_eq!( + absent_transitional_replacement(&PackageState::Installing), + None + ); + } } diff --git a/core/container/src/lib.rs b/core/container/src/lib.rs index 5f828aa5..e9f1ceb7 100644 --- a/core/container/src/lib.rs +++ b/core/container/src/lib.rs @@ -8,9 +8,9 @@ pub mod runtime; pub use bitcoin_simulator::{BitcoinSimulationMode, BitcoinSimulator}; pub use health_monitor::HealthMonitor; pub use manifest::{ - AppManifest, BuildConfig, ContainerConfig, Dependency, DerivedEnv, HealthCheck, HostFacts, - ManifestError, ResolvedSource, ResourceLimits, SecretEnv, SecretsProvider, SecurityPolicy, - Volume, + AppManifest, BuildConfig, ContainerConfig, Dependency, DerivedEnv, GeneratedFile, HealthCheck, + HostFacts, ManifestError, ResolvedSource, ResourceLimits, SecretEnv, SecretsProvider, + SecurityPolicy, Volume, }; pub use podman_client::{ image_uses_insecure_registry, ContainerState, ContainerStatus, PodmanClient, diff --git a/core/container/src/manifest.rs b/core/container/src/manifest.rs index 123bc82e..62c22b9b 100644 --- a/core/container/src/manifest.rs +++ b/core/container/src/manifest.rs @@ -1,5 +1,5 @@ use serde::{Deserialize, Serialize}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use thiserror::Error; #[derive(Debug, Error)] @@ -42,6 +42,9 @@ pub struct AppDefinition { #[serde(default)] pub volumes: Vec, + #[serde(default)] + pub files: Vec, + #[serde(default)] pub environment: Vec, @@ -216,6 +219,8 @@ pub struct SecurityPolicy { pub capabilities: Vec, #[serde(default = "default_true")] pub readonly_root: bool, + #[serde(default = "default_true")] + pub no_new_privileges: bool, #[serde(default = "default_network_policy")] pub network_policy: String, #[serde(default)] @@ -263,6 +268,14 @@ pub struct Volume { pub tmpfs_options: Option, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct GeneratedFile { + pub path: String, + pub content: String, + #[serde(default)] + pub overwrite: bool, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct HealthCheck { #[serde(rename = "type")] @@ -302,8 +315,16 @@ impl AppManifest { } pub fn validate(&self) -> Result<(), ManifestError> { - if self.app.id.is_empty() { - return Err(ManifestError::Invalid("app.id cannot be empty".to_string())); + if !is_valid_app_id(&self.app.id) { + return Err(ManifestError::Invalid( + "app.id must be lowercase ASCII letters, digits, or single hyphens".to_string(), + )); + } + + if self.app.name.trim().is_empty() { + return Err(ManifestError::Invalid( + "app.name cannot be empty".to_string(), + )); } // Exactly one of container.image or container.build must be set. We can't @@ -355,6 +376,11 @@ impl AppManifest { "container.network cannot be empty (omit the field to use default)".to_string(), )); } + if is_dangerous_network_mode(n) { + return Err(ManifestError::Invalid(format!( + "container.network '{n}' is not allowed in app manifests" + ))); + } } // custom_args: no empty strings (would inject literal "" into @@ -447,6 +473,11 @@ impl AppManifest { } } + validate_security(&self.app.security)?; + validate_ports(&self.app.ports)?; + validate_environment(&self.app.environment)?; + validate_devices(&self.app.devices)?; + // Volume tmpfs_options: only meaningful for type: tmpfs. for (i, v) in self.app.volumes.iter().enumerate() { if v.volume_type == "tmpfs" { @@ -466,6 +497,11 @@ impl AppManifest { v.volume_type ))); } else { + if v.volume_type != "bind" && v.volume_type != "volume" { + return Err(ManifestError::Invalid(format!( + "volumes[{i}].type must be bind, volume, or tmpfs" + ))); + } if v.source.is_empty() { return Err(ManifestError::Invalid(format!( "volumes[{i}] ({}) must set source", @@ -478,6 +514,45 @@ impl AppManifest { v.volume_type ))); } + if v.volume_type == "bind" { + validate_bind_source(i, &v.source)?; + } else if !is_valid_named_volume(&v.source) { + return Err(ManifestError::Invalid(format!( + "volumes[{i}].source must be a safe named volume" + ))); + } + validate_container_path(i, &v.target)?; + validate_volume_options(i, &v.options)?; + } + } + + for (i, f) in self.app.files.iter().enumerate() { + if f.path.is_empty() { + return Err(ManifestError::Invalid(format!( + "files[{i}].path cannot be empty" + ))); + } + if !std::path::Path::new(&f.path).is_absolute() { + return Err(ManifestError::Invalid(format!( + "files[{i}].path must be absolute" + ))); + } + if f.content.is_empty() { + return Err(ManifestError::Invalid(format!( + "files[{i}].content cannot be empty" + ))); + } + let file_path = std::path::Path::new(&f.path); + let under_bind_mount = self + .app + .volumes + .iter() + .filter(|v| v.volume_type != "tmpfs" && !v.source.is_empty()) + .any(|v| file_path.starts_with(std::path::Path::new(&v.source))); + if !under_bind_mount { + return Err(ManifestError::Invalid(format!( + "files[{i}].path must live under a bind-mounted volume source" + ))); } } @@ -485,6 +560,195 @@ impl AppManifest { } } +fn is_valid_app_id(id: &str) -> bool { + if id.is_empty() || id.starts_with('-') || id.ends_with('-') || id.contains("--") { + return false; + } + id.chars() + .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-') +} + +fn is_dangerous_network_mode(mode: &str) -> bool { + mode.starts_with("container:") || mode.starts_with("ns:") +} + +fn validate_security(policy: &SecurityPolicy) -> Result<(), ManifestError> { + let allowed_network_policies = ["isolated", "bridge", "host"]; + if !policy.network_policy.is_empty() + && !allowed_network_policies.contains(&policy.network_policy.as_str()) + { + return Err(ManifestError::Invalid(format!( + "security.network_policy must be one of {}", + allowed_network_policies.join(", ") + ))); + } + + let allowed_caps = [ + "CHOWN", + "DAC_OVERRIDE", + "FOWNER", + "NET_ADMIN", + "NET_BIND_SERVICE", + "NET_RAW", + "SETGID", + "SETUID", + "SYS_ADMIN", + ]; + let mut seen = HashSet::new(); + for cap in &policy.capabilities { + if !allowed_caps.contains(&cap.as_str()) { + return Err(ManifestError::Invalid(format!( + "security.capabilities contains unsupported capability '{cap}'" + ))); + } + if !seen.insert(cap.as_str()) { + return Err(ManifestError::Invalid(format!( + "security.capabilities contains duplicate capability '{cap}'" + ))); + } + } + Ok(()) +} + +fn validate_ports(ports: &[PortMapping]) -> Result<(), ManifestError> { + let mut seen_host = HashSet::new(); + for (i, port) in ports.iter().enumerate() { + if port.host == 0 || port.container == 0 { + return Err(ManifestError::Invalid(format!( + "ports[{i}].host and ports[{i}].container must be non-zero" + ))); + } + let protocol = if port.protocol.is_empty() { + "tcp" + } else { + port.protocol.as_str() + }; + if protocol != "tcp" && protocol != "udp" { + return Err(ManifestError::Invalid(format!( + "ports[{i}].protocol must be tcp or udp" + ))); + } + if !seen_host.insert((port.host, protocol.to_string())) { + return Err(ManifestError::Invalid(format!( + "ports contains duplicate host binding {}/{}", + port.host, protocol + ))); + } + } + Ok(()) +} + +fn validate_environment(env: &[String]) -> Result<(), ManifestError> { + let mut seen = HashSet::new(); + for (i, entry) in env.iter().enumerate() { + let Some((key, _)) = entry.split_once('=') else { + return Err(ManifestError::Invalid(format!( + "environment[{i}] must be KEY=VALUE" + ))); + }; + if !is_valid_env_key(key) { + return Err(ManifestError::Invalid(format!( + "environment[{i}] has invalid key '{key}'" + ))); + } + if !seen.insert(key) { + return Err(ManifestError::Invalid(format!( + "environment contains duplicate key '{key}'" + ))); + } + } + Ok(()) +} + +fn is_valid_env_key(key: &str) -> bool { + let mut chars = key.chars(); + match chars.next() { + Some(c) if c.is_ascii_alphabetic() || c == '_' => {} + _ => return false, + } + chars.all(|c| c.is_ascii_alphanumeric() || c == '_') +} + +fn validate_devices(devices: &[String]) -> Result<(), ManifestError> { + let mut seen = HashSet::new(); + for (i, device) in devices.iter().enumerate() { + if !device.starts_with("/dev/") || device.contains("..") { + return Err(ManifestError::Invalid(format!( + "devices[{i}] must be an absolute /dev path" + ))); + } + if !seen.insert(device.as_str()) { + return Err(ManifestError::Invalid(format!( + "devices contains duplicate entry '{device}'" + ))); + } + } + Ok(()) +} + +fn validate_bind_source(index: usize, source: &str) -> Result<(), ManifestError> { + let path = std::path::Path::new(source); + if !path.is_absolute() { + if is_valid_named_volume(source) { + return Ok(()); + } + return Err(ManifestError::Invalid(format!( + "volumes[{index}].source must be absolute for host bind mounts or a safe named volume" + ))); + } + if source.contains("..") { + return Err(ManifestError::Invalid(format!( + "volumes[{index}].source must not contain '..'" + ))); + } + if source.starts_with("/var/lib/archipelago/") || is_reviewed_host_bind_exception(source) { + return Ok(()); + } + Err(ManifestError::Invalid(format!( + "volumes[{index}].source must be under /var/lib/archipelago or a reviewed host-bind exception" + ))) +} + +fn is_reviewed_host_bind_exception(source: &str) -> bool { + source == "/run/user/1000/podman/podman.sock" || source == "/var/run/dbus" +} + +fn is_valid_named_volume(source: &str) -> bool { + if source.is_empty() || source.contains('/') || source.contains("..") { + return false; + } + source + .chars() + .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == '.') +} + +fn validate_container_path(index: usize, target: &str) -> Result<(), ManifestError> { + if !std::path::Path::new(target).is_absolute() || target.contains("..") { + return Err(ManifestError::Invalid(format!( + "volumes[{index}].target must be an absolute container path without '..'" + ))); + } + Ok(()) +} + +fn validate_volume_options(index: usize, options: &[String]) -> Result<(), ManifestError> { + let allowed = ["rw", "ro", "z", "Z", "shared", "rshared", "slave", "rslave"]; + let mut seen = HashSet::new(); + for option in options { + if !allowed.contains(&option.as_str()) { + return Err(ManifestError::Invalid(format!( + "volumes[{index}].options contains unsupported option '{option}'" + ))); + } + if !seen.insert(option.as_str()) { + return Err(ManifestError::Invalid(format!( + "volumes[{index}].options contains duplicate option '{option}'" + ))); + } + } + Ok(()) +} + /// Host facts available to `derived_env` templates at apply time. /// /// Mirrors the values `scripts/container-specs.sh:detect_environment()` @@ -864,6 +1128,38 @@ app: ); } + #[test] + fn generated_files_must_live_under_bind_mounts() { + let yaml = r#" +app: + id: test-app + name: Test App + version: 1.0.0 + container: + image: test/image:latest + volumes: + - type: bind + source: /var/lib/archipelago/test-app + target: /data + files: + - path: /var/lib/archipelago/test-app/config.yaml + content: | + key: value +"#; + let manifest = AppManifest::parse(yaml).unwrap(); + assert_eq!(manifest.app.files.len(), 1); + + let bad = yaml.replace( + "/var/lib/archipelago/test-app/config.yaml", + "/etc/test-app/config.yaml", + ); + let err = AppManifest::parse(&bad).unwrap_err(); + assert!( + format!("{err}").contains("bind-mounted volume source"), + "unexpected error: {err}" + ); + } + #[test] fn empty_custom_arg_is_rejected() { let yaml = r#" @@ -1089,6 +1385,157 @@ app: } } + #[test] + fn unsafe_manifest_values_are_rejected() { + let cases = [ + ( + "bad app id", + r#" +app: + id: Bad_App + name: Bad + version: 1.0.0 + container: + image: test/image:latest +"#, + "app.id", + ), + ( + "unsupported capability", + r#" +app: + id: bad-cap + name: Bad + version: 1.0.0 + container: + image: test/image:latest + security: + capabilities: [SYS_MODULE] +"#, + "unsupported capability", + ), + ( + "docker socket bind", + r#" +app: + id: bad-bind + name: Bad + version: 1.0.0 + container: + image: test/image:latest + volumes: + - type: bind + source: /var/run/docker.sock + target: /var/run/docker.sock +"#, + "reviewed host-bind exception", + ), + ( + "path-like relative bind source", + r#" +app: + id: bad-bind + name: Bad + version: 1.0.0 + container: + image: test/image:latest + volumes: + - type: bind + source: data/cache + target: /data +"#, + "absolute for host bind mounts", + ), + ( + "bad environment key", + r#" +app: + id: bad-env + name: Bad + version: 1.0.0 + container: + image: test/image:latest + environment: + - 1BAD=value +"#, + "invalid key", + ), + ( + "duplicate host port", + r#" +app: + id: bad-port + name: Bad + version: 1.0.0 + container: + image: test/image:latest + ports: + - { host: 8080, container: 80, protocol: tcp } + - { host: 8080, container: 81, protocol: tcp } +"#, + "duplicate host binding", + ), + ( + "bad device", + r#" +app: + id: bad-device + name: Bad + version: 1.0.0 + container: + image: test/image:latest + devices: + - /tmp/fake-device +"#, + "absolute /dev path", + ), + ( + "container network namespace", + r#" +app: + id: bad-network + name: Bad + version: 1.0.0 + container: + image: test/image:latest + network: container:host +"#, + "not allowed", + ), + ]; + + for (name, yaml, expected) in cases { + let err = AppManifest::parse(yaml).unwrap_err(); + let msg = format!("{err}"); + assert!( + msg.contains(expected), + "case {name} expected '{expected}', got: {msg}" + ); + } + } + + #[test] + fn reviewed_host_bind_exceptions_parse() { + let yaml = r#" +app: + id: reviewed-binds + name: Reviewed Binds + version: 1.0.0 + container: + image: test/image:latest + volumes: + - type: bind + source: /run/user/1000/podman/podman.sock + target: /var/run/docker.sock + options: [rw] + - type: bind + source: /var/run/dbus + target: /var/run/dbus + options: [ro] +"#; + AppManifest::parse(yaml).unwrap(); + } + #[test] fn parse_every_real_manifest() { let app_manifests = list_repo_manifests(); @@ -1099,7 +1546,6 @@ app: let mut failures: Vec = Vec::new(); let mut modern_count = 0usize; - let mut legacy_count = 0usize; for path in app_manifests { let content = fs::read_to_string(&path).expect("read manifest"); let parsed_yaml: serde_yaml::Value = match serde_yaml::from_str(&content) { @@ -1121,15 +1567,14 @@ app: failures.push(format!("{}: {err}", path.display())); } } else { - legacy_count += 1; + failures.push(format!( + "{}: expected modern app-schema manifest", + path.display() + )); } } assert!(modern_count > 0, "no modern app-schema manifests found"); - assert!( - legacy_count > 0, - "expected at least one legacy manifest shape" - ); assert!( failures.is_empty(), diff --git a/core/container/src/podman_client.rs b/core/container/src/podman_client.rs index 3111a4db..7ce04a40 100644 --- a/core/container/src/podman_client.rs +++ b/core/container/src/podman_client.rs @@ -56,9 +56,9 @@ pub enum ContainerState { impl From<&str> for ContainerState { fn from(s: &str) -> Self { match s.to_lowercase().as_str() { - "created" => ContainerState::Created, + "created" | "initialized" => ContainerState::Created, "running" => ContainerState::Running, - "stopping" => ContainerState::Stopping, + "stopping" | "removing" => ContainerState::Stopping, "stopped" => ContainerState::Stopped, "exited" => ContainerState::Exited, "paused" => ContainerState::Paused, @@ -129,7 +129,6 @@ impl PodmanClient { "filebrowser" => "http://localhost:8083", "nginx-proxy-manager" => "http://localhost:8081", "portainer" => "http://localhost:9000", - "saleor" => "http://localhost:9011", "uptime-kuma" => "http://localhost:3002", "fedimint" | "fedimintd" => "http://localhost:8175", "fedimint-gateway" => "http://localhost:8176", @@ -390,7 +389,7 @@ impl PodmanClient { "cap_add": cap_add, "cap_drop": cap_drop, "read_only_filesystem": manifest.app.security.readonly_root, - "no_new_privileges": true, + "no_new_privileges": manifest.app.security.no_new_privileges, "restart_policy": "unless-stopped", "restart_tries": 5, "netns": { @@ -635,6 +634,7 @@ fn podman_network_settings( Some("bridge") => ("bridge", None), Some("none") => ("none", None), Some("slirp4netns") => ("slirp4netns", None), + Some("pasta") => ("pasta", None), Some("private") => ("private", None), Some(custom) => ("bridge", Some(custom.to_string())), None if network_policy == "host" => ("host", None), diff --git a/core/container/src/runtime.rs b/core/container/src/runtime.rs index 0c6b9c1f..e92626ea 100644 --- a/core/container/src/runtime.rs +++ b/core/container/src/runtime.rs @@ -7,6 +7,7 @@ use std::time::Duration; use tokio::process::Command as TokioCommand; const PODMAN_CLI_DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); +const PODMAN_CLI_IMAGE_CHECK_TIMEOUT: Duration = Duration::from_secs(10); const PODMAN_CLI_BUILD_TIMEOUT: Duration = Duration::from_secs(900); #[async_trait] @@ -150,7 +151,25 @@ impl ContainerRuntime for PodmanRuntime { if is_missing_container_error(&stderr) { return Ok(()); } - Err(api_err.context(format!("podman rm fallback failed: {}", stderr.trim()))) + let zero_timeout = self.podman_cli(&["rm", "-f", "--time", "0", name]).await?; + if zero_timeout.status.success() { + return Ok(()); + } + + let _ = self.podman_cli(&["container", "cleanup", name]).await; + let cleanup_rm = self.podman_cli(&["rm", "-f", name]).await?; + if cleanup_rm.status.success() { + return Ok(()); + } + let cleanup_stderr = String::from_utf8_lossy(&cleanup_rm.stderr); + if is_missing_container_error(&cleanup_stderr) { + return Ok(()); + } + Err(api_err.context(format!( + "podman rm fallback failed: {}; cleanup rm failed: {}", + stderr.trim(), + cleanup_stderr.trim() + ))) } } } @@ -196,20 +215,26 @@ impl ContainerRuntime for PodmanRuntime { } async fn image_exists(&self, image_ref: &str) -> Result { - // `podman image exists` returns 0 if present, 1 if absent. Any other - // exit code is an environment failure we should surface. - let output = self.podman_cli(&["image", "exists", image_ref]).await?; + // Avoid `podman image exists`: on production nodes with a stressed + // rootless store it can hang even when targeted at one image. A bounded + // inspect is the local-storage probe the trait contract describes. + let output = self + .podman_cli_timeout( + &["image", "inspect", image_ref], + PODMAN_CLI_IMAGE_CHECK_TIMEOUT, + ) + .await?; match output.status.code() { Some(0) => Ok(true), Some(1) => Ok(false), Some(code) => { let stderr = String::from_utf8_lossy(&output.stderr); Err(anyhow::anyhow!( - "podman image exists {image_ref} exited with {code}: {stderr}" + "podman image inspect {image_ref} exited with {code}: {stderr}" )) } None => Err(anyhow::anyhow!( - "podman image exists {image_ref} terminated by signal" + "podman image inspect {image_ref} terminated by signal" )), } } diff --git a/docker/bitcoin-ui/index.html b/docker/bitcoin-ui/index.html index 415fdf85..d24dc0fd 100644 --- a/docker/bitcoin-ui/index.html +++ b/docker/bitcoin-ui/index.html @@ -18,6 +18,7 @@ body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', sans-serif; min-height: 100vh; + background: #000; color: white; overflow-x: hidden; } @@ -555,6 +556,87 @@ + +
+
+
+

Transaction Relay Sharing

+

Trusted peer access for broadcasting transactions through this node

+
+
+ Local node + Checking... +
+
+ +
+
+
HTTPS Endpoint
+
Not configured
+
+
+
HTTP Endpoint
+
Not configured
+
+
+
Tor Endpoint
+
Not configured
+
+
+ +
+
+
+ + + +
+ +
+ + +
+
+ + +
+ +
+ +
+
+ + +
+ +
+
+ Restricted RPC user + txrelay +
+
Credential status unavailable
+
+
+
Relay Requests
+
+
No relay requests
+
+
+
+
+
+
@@ -608,6 +690,7 @@ // RPC Configuration - Use local Nginx proxy within container const RPC_ENDPOINT = 'bitcoin-rpc/'; const STATUS_ENDPOINT = 'bitcoin-status'; + const ARCHY_RPC_ENDPOINT = 'rpc/v1'; console.log('[Bitcoin UI] RPC Endpoint:', RPC_ENDPOINT); // Make RPC call to Bitcoin node via local proxy @@ -654,6 +737,220 @@ return response.json(); } + function cookieValue(name) { + return document.cookie + .split('; ') + .find(row => row.startsWith(`${name}=`)) + ?.split('=') + .slice(1) + .join('=') || ''; + } + + async function callArchyRPC(method, params = {}) { + const headers = { 'Content-Type': 'application/json' }; + const csrf = cookieValue('csrf'); + if (csrf) headers['X-CSRF-Token'] = decodeURIComponent(csrf); + const response = await fetch(ARCHY_RPC_ENDPOINT, { + method: 'POST', + headers, + credentials: 'include', + cache: 'no-store', + body: JSON.stringify({ method, params }) + }); + const body = await response.json().catch(() => ({})); + if (!response.ok || body.error) { + throw new Error(body.error?.message || `Archipelago RPC ${response.status}`); + } + return body.result; + } + + function escapeHtml(value) { + return String(value ?? '').replace(/[&<>"']/g, char => ({ + '&': '&', + '<': '<', + '>': '>', + '"': '"', + "'": ''' + }[char])); + } + + function setText(id, value, fallback = 'Not configured') { + const el = document.getElementById(id); + if (el) el.textContent = value || fallback; + } + + function renderRelayRequests(requests = []) { + const list = document.getElementById('relayRequestsList'); + if (!list) return; + if (!requests.length) { + list.innerHTML = '
No relay requests
'; + return; + } + list.innerHTML = requests.map(req => { + const name = escapeHtml(req.peer_name || req.peer_onion || req.peer_pubkey); + const message = req.message ? `
${escapeHtml(req.message)}
` : ''; + const endpoint = req.approved_endpoint ? `
${escapeHtml(req.approved_endpoint)}
` : ''; + const statusClass = req.status === 'approved' + ? 'text-green-300' + : req.status === 'rejected' + ? 'text-red-300' + : 'text-yellow-300'; + const actions = req.direction === 'incoming' && req.status === 'pending' + ? `
+ + +
` + : ''; + return `
+
+
${name}
+
${escapeHtml(req.direction)} · ${escapeHtml(req.status)}
+
+ ${message} + ${endpoint} + ${actions} +
`; + }).join(''); + } + + function renderRelayPeers(peers = [], selectedPeer = '', localSynced = true) { + const select = document.getElementById('relayPeerSelect'); + const button = document.getElementById('relayRequestButton'); + if (!select) return; + if (!localSynced) { + select.innerHTML = ''; + select.disabled = true; + if (button) button.disabled = true; + return; + } + if (!peers.length) { + select.innerHTML = ''; + select.disabled = true; + if (button) button.disabled = true; + return; + } + select.disabled = false; + if (button) button.disabled = false; + select.innerHTML = '' + peers.map(peer => { + const label = escapeHtml(peer.name || peer.onion || peer.pubkey.slice(0, 16)); + const approved = peer.relay_approved ? ' · approved' : ''; + const selected = peer.pubkey === selectedPeer ? ' selected' : ''; + return ``; + }).join(''); + } + + async function loadRelayAccess() { + const statusEl = document.getElementById('relayStatusMessage'); + try { + const relay = await callArchyRPC('bitcoin.relay-status'); + const settings = relay.settings || {}; + const local = relay.local_node || {}; + setText('relayHttpsEndpoint', settings.https_endpoint); + setText('relayHttpEndpoint', settings.http_endpoint); + setText('relayTorEndpoint', settings.tor_endpoint); + const syncEl = document.getElementById('relaySyncStatus'); + if (syncEl) { + syncEl.textContent = local.synced ? 'Synchronized' : 'Not synchronized'; + syncEl.className = local.synced ? 'ml-2 font-medium text-green-300' : 'ml-2 font-medium text-yellow-300'; + } + const enabled = document.getElementById('relayEnabledToggle'); + const requests = document.getElementById('relayRequestsToggle'); + const tor = document.getElementById('relayTorToggle'); + if (enabled) enabled.checked = !!settings.enabled_for_peers; + if (requests) requests.checked = !!settings.allow_peer_requests; + if (tor) tor.checked = !!settings.allow_tor; + const httpsInput = document.getElementById('relayHttpsInput'); + const httpInput = document.getElementById('relayHttpInput'); + const torInput = document.getElementById('relayTorInput'); + if (httpsInput && document.activeElement !== httpsInput) httpsInput.value = settings.https_endpoint || ''; + if (httpInput && document.activeElement !== httpInput) httpInput.value = settings.http_endpoint || ''; + if (torInput && document.activeElement !== torInput) torInput.value = settings.tor_endpoint || ''; + renderRelayPeers(relay.trusted_nodes || [], settings.selected_peer_pubkey || '', !!local.synced); + renderRelayRequests(relay.requests || []); + setText('relayCredentialUser', relay.credentials?.username || 'txrelay', 'txrelay'); + setText( + 'relayCredentialStatus', + relay.credentials?.available ? `Credential file ready: ${relay.credentials.client_env_path}. ${relay.credentials.restart_hint || ''}` : 'Restricted relay credential will be generated when peer sharing is enabled', + 'Credential status unavailable' + ); + if (statusEl) statusEl.textContent = ''; + } catch (error) { + console.warn('[Bitcoin UI] relay status failed', error); + if (statusEl) statusEl.textContent = `Relay controls unavailable: ${error.message}`; + } + } + + async function saveRelaySettings() { + const statusEl = document.getElementById('relayStatusMessage'); + const payload = { + enabled_for_peers: !!document.getElementById('relayEnabledToggle')?.checked, + allow_peer_requests: !!document.getElementById('relayRequestsToggle')?.checked, + allow_tor: !!document.getElementById('relayTorToggle')?.checked, + allow_https: !!document.getElementById('relayHttpsInput')?.value.trim(), + allow_http: !!document.getElementById('relayHttpInput')?.value.trim(), + selected_peer_pubkey: document.getElementById('relayPeerSelect')?.value || '', + https_endpoint: document.getElementById('relayHttpsInput')?.value.trim() || '', + http_endpoint: document.getElementById('relayHttpInput')?.value.trim() || '', + tor_endpoint: document.getElementById('relayTorInput')?.value.trim() || '' + }; + try { + await callArchyRPC('bitcoin.relay-update-settings', payload); + if (statusEl) statusEl.textContent = 'Relay settings saved.'; + await loadRelayAccess(); + } catch (error) { + if (statusEl) statusEl.textContent = `Save failed: ${error.message}`; + } + } + + async function requestPeerRelay() { + const statusEl = document.getElementById('relayStatusMessage'); + const peer = document.getElementById('relayPeerSelect')?.value; + if (!peer) { + if (statusEl) statusEl.textContent = 'Choose a trusted node first.'; + return; + } + try { + await callArchyRPC('bitcoin.relay-request-peer', { + peer_pubkey: peer, + message: document.getElementById('relayRequestMessage')?.value || '' + }); + if (statusEl) statusEl.textContent = 'Relay access request sent.'; + await loadRelayAccess(); + } catch (error) { + if (statusEl) statusEl.textContent = `Request failed: ${error.message}`; + } + } + + async function approveRelayRequest(id) { + await updateRelayRequest('bitcoin.relay-approve-request', id); + } + + async function rejectRelayRequest(id) { + await updateRelayRequest('bitcoin.relay-reject-request', id); + } + + async function updateRelayRequest(method, id) { + const statusEl = document.getElementById('relayStatusMessage'); + try { + await callArchyRPC(method, { id }); + if (statusEl) statusEl.textContent = 'Relay request updated.'; + await loadRelayAccess(); + } catch (error) { + if (statusEl) statusEl.textContent = `Update failed: ${error.message}`; + } + } + + async function createRelayTorService() { + const statusEl = document.getElementById('relayStatusMessage'); + try { + await callArchyRPC('bitcoin.relay-create-tor-service'); + if (statusEl) statusEl.textContent = 'Tor service requested.'; + await loadRelayAccess(); + } catch (error) { + if (statusEl) statusEl.textContent = `Tor setup failed: ${error.message}`; + } + } + // Implementation branding — detected from getnetworkinfo.subversion. // Bitcoin Knots identifies as "/Satoshi:/Knots:/", Bitcoin Core as "/Satoshi:/". let brandingApplied = false; @@ -720,11 +1017,11 @@ syncStatusText.textContent = status.error || 'Bitcoin node is reconnecting... showing last known values'; syncStatusText.className = 'text-yellow-300 text-sm font-medium'; } else if (consecutiveRpcFailures < 6) { - syncStatusText.textContent = status.error || 'Connecting to Bitcoin node...'; + syncStatusText.textContent = status.error || 'Bitcoin node is starting or busy syncing...'; syncStatusText.className = 'text-yellow-300 text-sm font-medium'; } else { - syncStatusText.textContent = status.error || 'Bitcoin node is not responding yet'; - syncStatusText.className = 'text-red-400 text-sm font-medium'; + syncStatusText.textContent = status.error || 'Bitcoin node is still syncing; retrying automatically...'; + syncStatusText.className = 'text-yellow-300 text-sm font-medium'; } } if (syncIcon) { @@ -910,8 +1207,8 @@ if (syncStatusText) { const hasRecentData = lastSuccessfulUpdateAt > 0 && Date.now() - lastSuccessfulUpdateAt < 120000; syncStatusText.textContent = hasRecentData - ? 'Bitcoin status bridge is reconnecting... keeping last known values' - : 'Connecting to Bitcoin status bridge...'; + ? 'Bitcoin status bridge is retrying... keeping last known values' + : 'Bitcoin status bridge is starting...'; syncStatusText.className = 'text-yellow-300 text-sm font-medium'; } } @@ -920,10 +1217,12 @@ // Initial update console.log('[Bitcoin UI] Starting initial blockchain info update...'); updateBlockchainInfo(); + loadRelayAccess(); // Update every 5 seconds console.log('[Bitcoin UI] Setting up 5-second update interval'); setInterval(updateBlockchainInfo, 5000); + setInterval(loadRelayAccess, 15000); function copyRPCInfo() { const info = `RPC Host: ${window.location.hostname}:8332\nRPC User: archipelago\nRPC Password: archipelago123\nRPC Endpoint: ${RPC_ENDPOINT}`; diff --git a/docker/fedimint-ui/Dockerfile b/docker/fedimint-ui/Dockerfile new file mode 100644 index 00000000..7cb19290 --- /dev/null +++ b/docker/fedimint-ui/Dockerfile @@ -0,0 +1,16 @@ +FROM git.tx1138.com/lfg2025/nginx:1.27.4-alpine + +COPY index.html /usr/share/nginx/html/index.html +COPY nginx.conf /etc/nginx/conf.d/default.conf +COPY assets/img/bg-network.jpg /usr/share/nginx/html/assets/img/bg-network.jpg +COPY assets/img/app-icons/fedimint.png /usr/share/nginx/html/assets/img/app-icons/fedimint.png +COPY assets/img/app-icons/fedimint.jpg /usr/share/nginx/html/assets/img/app-icons/fedimint.jpg + +RUN sed -i 's/^user nginx;/user root;/' /etc/nginx/nginx.conf && \ + mkdir -p /var/cache/nginx/client_temp /var/cache/nginx/proxy_temp \ + /var/cache/nginx/fastcgi_temp /var/cache/nginx/uwsgi_temp \ + /var/cache/nginx/scgi_temp + +EXPOSE 8175 +ENTRYPOINT [] +CMD ["nginx", "-g", "daemon off;"] diff --git a/docker/fedimint-ui/assets/img/app-icons/fedimint.jpg b/docker/fedimint-ui/assets/img/app-icons/fedimint.jpg new file mode 100644 index 00000000..4a759c55 Binary files /dev/null and b/docker/fedimint-ui/assets/img/app-icons/fedimint.jpg differ diff --git a/docker/fedimint-ui/assets/img/app-icons/fedimint.png b/docker/fedimint-ui/assets/img/app-icons/fedimint.png new file mode 100644 index 00000000..4a759c55 Binary files /dev/null and b/docker/fedimint-ui/assets/img/app-icons/fedimint.png differ diff --git a/docker/fedimint-ui/assets/img/bg-network.jpg b/docker/fedimint-ui/assets/img/bg-network.jpg new file mode 100644 index 00000000..f474fa1f Binary files /dev/null and b/docker/fedimint-ui/assets/img/bg-network.jpg differ diff --git a/docker/fedimint-ui/index.html b/docker/fedimint-ui/index.html new file mode 100644 index 00000000..44f85d9f --- /dev/null +++ b/docker/fedimint-ui/index.html @@ -0,0 +1,452 @@ + + + + + + + + + + Fedimint Guardian - Archipelago + + + +
+
+
+
+ +
+
+
+
+ Fedimint Guardian +
+
+

Fedimint Guardian

+

Guardian is installed and will open here automatically when Bitcoin Knots finishes initial block download.

+
+
+
+
+
+
+
+

Status

+

Waiting for Bitcoin sync

+
+
+
+
+ +
+
+

Waiting for Bitcoin sync

+

Fedimint needs a synced Bitcoin RPC before federation setup can start safely.

+

This page refreshes every 30 seconds and switches to the Guardian UI as soon as the backend is ready.

+ +
+ +
+

Readiness check

+

Checking bitcoind readiness...

+
+
+ + +
+ + +
+
+ + diff --git a/docker/fedimint-ui/nginx.conf b/docker/fedimint-ui/nginx.conf new file mode 100644 index 00000000..8f78cfe0 --- /dev/null +++ b/docker/fedimint-ui/nginx.conf @@ -0,0 +1,30 @@ +server { + listen 8175; + server_name _; + + proxy_intercept_errors on; + error_page 500 502 503 504 = @wait_page; + + location /assets/ { + root /usr/share/nginx/html; + add_header Cache-Control "public, max-age=3600" always; + try_files $uri =404; + } + + location / { + proxy_pass http://127.0.0.1:8177; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } + + location @wait_page { + root /usr/share/nginx/html; + add_header Cache-Control "no-store" always; + try_files /index.html =503; + } +} diff --git a/image-recipe/configs/archipelago.service b/image-recipe/configs/archipelago.service index 8c3bc16d..0cddae7c 100644 --- a/image-recipe/configs/archipelago.service +++ b/image-recipe/configs/archipelago.service @@ -8,6 +8,7 @@ Type=notify User=archipelago Environment="ARCHIPELAGO_BIND=127.0.0.1:5678" Environment="ARCHIPELAGO_USE_QUADLET_BACKENDS=true" +EnvironmentFile=-/var/lib/archipelago/telemetry.env # DEV_MODE disabled in production — enabled via override.conf on dev servers Environment="XDG_RUNTIME_DIR=/run/user/1000" # + prefix runs these as root (needed for chown/mkdir outside ReadWritePaths) diff --git a/image-recipe/configs/nginx-archipelago.conf b/image-recipe/configs/nginx-archipelago.conf index 933ba3a7..bbc3c583 100644 --- a/image-recipe/configs/nginx-archipelago.conf +++ b/image-recipe/configs/nginx-archipelago.conf @@ -148,6 +148,34 @@ server { error_page 504 = @backend_timeout; } + # JSON-RPC endpoint. Browser GETs are navigational mistakes, so send them + # back to the dashboard while keeping RPC POSTs proxied to the backend. + location = /rpc/v1 { + if ($request_method = GET) { + return 303 /; + } + if ($request_method = HEAD) { + return 303 /; + } + + limit_req zone=rpc burst=40 nodelay; + limit_req_status 429; + proxy_pass http://127.0.0.1:5678; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + + # Limit request body to 1MB for RPC calls + client_max_body_size 1m; + + # Increase timeout for long-running operations (e.g., Docker image pulls) + proxy_connect_timeout 600s; + proxy_send_timeout 600s; + proxy_read_timeout 600s; + error_page 502 503 = @backend_unavailable; + error_page 504 = @backend_timeout; + } + # Proxy API requests to backend location /rpc/ { limit_req zone=rpc burst=40 nodelay; @@ -896,23 +924,6 @@ server { } } -# Compatibility proxy for cached PWA bundles that still launch Nginx Proxy -# Manager on :81. Rootless Podman cannot bind host ports below 1024, so the -# container admin UI runs on :8081 and host nginx owns the old :81 entrypoint. -server { - listen 81; - server_name _; - - location / { - proxy_pass http://127.0.0.1:8081/; - proxy_http_version 1.1; - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - } -} - # HTTPS - required for PWA install (Add to Home Screen) from dev servers server { listen 443 ssl default_server; diff --git a/image-recipe/scripts/install-podman.sh b/image-recipe/scripts/install-podman.sh index d630ad10..0977bd8e 100755 --- a/image-recipe/scripts/install-podman.sh +++ b/image-recipe/scripts/install-podman.sh @@ -6,6 +6,19 @@ set -e echo "🐳 Configuring Podman for rootless operation..." +if ! command -v catatonit >/dev/null 2>&1; then + if command -v apt-get >/dev/null 2>&1; then + apt-get update || true + apt-get install -y catatonit || true + elif command -v dnf >/dev/null 2>&1; then + dnf install -y catatonit || true + elif command -v apk >/dev/null 2>&1; then + apk add catatonit || true + fi +fi + +command -v catatonit >/dev/null 2>&1 || echo "WARNING: catatonit not installed; Podman init-enabled containers may fail" + # Ensure archipelago user exists if ! id "archipelago" &>/dev/null; then echo "Creating archipelago user..." diff --git a/scripts/container-specs.sh b/scripts/container-specs.sh index 52220725..2264e56b 100755 --- a/scripts/container-specs.sh +++ b/scripts/container-specs.sh @@ -177,11 +177,16 @@ load_spec_bitcoin-knots() { SPEC_DATA_UID="100101:100101" local btc_dbcache=4096 [ "${LOW_MEM:-false}" = "true" ] && btc_dbcache=2048 + local btc_rpc_headroom="-rpcthreads=16 -rpcworkqueue=256" + local btc_txrelay_flags="-rpcwhitelistdefault=0" + if [ -f "$SECRETS_DIR/bitcoin-rpc-txrelay-rpcauth" ]; then + btc_txrelay_flags="$btc_txrelay_flags -rpcauth=$(cat "$SECRETS_DIR/bitcoin-rpc-txrelay-rpcauth") -rpcwhitelist=txrelay:sendrawtransaction,testmempoolaccept,getmempoolinfo,getrawmempool,getmempoolentry,getnetworkinfo,getblockchaininfo,getblockcount,getblockhash,getblockheader,getrawtransaction,decoderawtransaction,decodescript,estimatesmartfee" + fi # Dynamic: prune on small disk if [ "${DISK_GB:-0}" -lt 1000 ]; then - SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=${btc_dbcache} -par=0 -maxconnections=125" + SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=${btc_dbcache} -par=0 -maxconnections=125 ${btc_rpc_headroom} ${btc_txrelay_flags}" else - SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125" + SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125 ${btc_rpc_headroom} ${btc_txrelay_flags}" fi } @@ -518,11 +523,12 @@ load_spec_portainer() { SPEC_NAME="portainer" SPEC_IMAGE="${PORTAINER_IMAGE}" SPEC_PORTS="9000:9000" - SPEC_VOLUMES="/var/lib/archipelago/portainer:/data /run/user/1000/podman/podman.sock:/var/run/docker.sock" + SPEC_VOLUMES="/var/lib/archipelago/portainer:/data /run/user/1000/podman/podman.sock:/var/run/docker.sock /var/lib/archipelago/portainer/compose:/data/compose" SPEC_MEMORY="$(mem_limit portainer)" SPEC_HEALTH_CMD="curl -sf http://localhost:9000/ || exit 1" SPEC_TIER="3" SPEC_DATA_DIR="/var/lib/archipelago/portainer" + SPEC_DATA_UID="1000:1000" SPEC_OPTIONAL="true" } diff --git a/scripts/deploy-config.example b/scripts/deploy-config.example index 9f3eecfc..c752d8c2 100644 --- a/scripts/deploy-config.example +++ b/scripts/deploy-config.example @@ -5,3 +5,7 @@ # Edit deploy-config.sh and set ARCHIPELAGO_PASSWORD # export ARCHIPELAGO_PASSWORD='your_password_here' + +# Optional: central beta telemetry collector RPC endpoint. +# The reporter sends telemetry.ingest JSON-RPC requests here when users opt in. +# export TELEMETRY_COLLECTOR_URL='https://YOUR-COLLECTOR-HOST/rpc/v1' diff --git a/scripts/deploy-tailscale.sh b/scripts/deploy-tailscale.sh index e5b8e54d..e81fe4c2 100755 --- a/scripts/deploy-tailscale.sh +++ b/scripts/deploy-tailscale.sh @@ -17,6 +17,7 @@ set -eo pipefail SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" TARGET_DIR="/home/archipelago/archy" +PODMAN_IMAGE_CHECK_TIMEOUT="${PODMAN_IMAGE_CHECK_TIMEOUT:-10}" # Load deploy config defaults (IP addresses etc.) [ -f "$SCRIPT_DIR/deploy-config-defaults.sh" ] && . "$SCRIPT_DIR/deploy-config-defaults.sh" @@ -186,7 +187,7 @@ deploy_node() { # Transfer custom UI images (individual tarballs — never combined) echo " Transferring custom UI images..." for ui_img in bitcoin-ui lnd-ui electrs-ui; do - HAS_IMG=$(ssh $SSH_OPTS "$BUILD_SOURCE" "podman images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null | grep -q '${ui_img}:' && echo yes || echo no" 2>/dev/null) + HAS_IMG=$(ssh $SSH_OPTS "$BUILD_SOURCE" "timeout --kill-after=2s ${PODMAN_IMAGE_CHECK_TIMEOUT}s podman image exists 'localhost/${ui_img}:local' 2>/dev/null && echo yes || echo no" 2>/dev/null) if [ "$HAS_IMG" = "yes" ]; then echo " $ui_img..." if ssh $SSH_OPTS "$BUILD_SOURCE" "podman save 'localhost/${ui_img}:local' 2>/dev/null" > "/tmp/${ui_img}.tar" 2>/dev/null && [ -s "/tmp/${ui_img}.tar" ]; then @@ -926,12 +927,19 @@ LNDCONF if \$DOCKER ps -a --format '{{.Names}}' 2>/dev/null | grep -qx portainer; then \$DOCKER start portainer 2>/dev/null || true else - sudo mkdir -p /var/lib/archipelago/portainer + sudo mkdir -p /var/lib/archipelago/portainer/compose + sudo chown -R archipelago:archipelago /var/lib/archipelago/portainer 2>/dev/null || true + if [ ! -e /data ]; then + sudo ln -s /var/lib/archipelago/portainer /data 2>/dev/null || true + elif [ -d /data ] && [ ! -L /data ] && [ ! -e /data/compose ]; then + sudo ln -s /var/lib/archipelago/portainer/compose /data/compose 2>/dev/null || true + fi \$DOCKER run -d --name portainer --restart unless-stopped \ --health-cmd 'curl -sf http://localhost:9000/' --health-interval=30s --health-timeout=5s --health-retries=3 \ --cap-drop ALL --cap-add CHOWN --cap-add SETUID --cap-add SETGID --cap-add DAC_OVERRIDE \ --security-opt no-new-privileges:true \ -p 9000:9000 -v /var/lib/archipelago/portainer:/data \ + -v /var/lib/archipelago/portainer/compose:/data/compose \ -v /run/user/1000/podman/podman.sock:/var/run/docker.sock \ $PORTAINER_IMAGE fi diff --git a/scripts/deploy-to-target.sh b/scripts/deploy-to-target.sh index f6ea1ed4..a98b02bf 100755 --- a/scripts/deploy-to-target.sh +++ b/scripts/deploy-to-target.sh @@ -421,6 +421,20 @@ deploy_secondary() { rm -f /tmp/archipelago.service ' 2>/dev/null || true fi + if [ -n "${TELEMETRY_COLLECTOR_URL:-}" ]; then + echo " Syncing telemetry collector config to .$SEC_LABEL..." + TMP_TELEMETRY_ENV="$(mktemp)" + printf 'TELEMETRY_COLLECTOR_URL=%s\n' "$TELEMETRY_COLLECTOR_URL" > "$TMP_TELEMETRY_ENV" + scp $SSH_OPTS "$TMP_TELEMETRY_ENV" "$SEC_TARGET:/tmp/telemetry.env" 2>/dev/null || true + rm -f "$TMP_TELEMETRY_ENV" + ssh $SSH_OPTS "$SEC_TARGET" ' + sudo mkdir -p /var/lib/archipelago + sudo cp /tmp/telemetry.env /var/lib/archipelago/telemetry.env + sudo chown archipelago:archipelago /var/lib/archipelago/telemetry.env + sudo chmod 600 /var/lib/archipelago/telemetry.env + rm -f /tmp/telemetry.env + ' 2>/dev/null || true + fi # Deploy udev rule for mesh radio UDEV_RULE="$PROJECT_DIR/image-recipe/configs/99-mesh-radio.rules" @@ -682,6 +696,20 @@ if [ "$LIVE" = true ]; then rm -f /tmp/archipelago.service ' 2>/dev/null || true fi + if [ -n "${TELEMETRY_COLLECTOR_URL:-}" ]; then + progress "Syncing telemetry collector config" + TMP_TELEMETRY_ENV="$(mktemp)" + printf 'TELEMETRY_COLLECTOR_URL=%s\n' "$TELEMETRY_COLLECTOR_URL" > "$TMP_TELEMETRY_ENV" + scp $SSH_OPTS "$TMP_TELEMETRY_ENV" "$TARGET_HOST:/tmp/telemetry.env" 2>/dev/null || true + rm -f "$TMP_TELEMETRY_ENV" + ssh $SSH_OPTS "$TARGET_HOST" ' + sudo mkdir -p /var/lib/archipelago + sudo cp /tmp/telemetry.env /var/lib/archipelago/telemetry.env + sudo chown archipelago:archipelago /var/lib/archipelago/telemetry.env + sudo chmod 600 /var/lib/archipelago/telemetry.env + rm -f /tmp/telemetry.env + ' 2>/dev/null || true + fi # Deploy udev rule for mesh radio stable naming (/dev/mesh-radio) UDEV_RULE="$PROJECT_DIR/image-recipe/configs/99-mesh-radio.rules" diff --git a/scripts/first-boot-containers.sh b/scripts/first-boot-containers.sh index 72b6901d..c702822a 100755 --- a/scripts/first-boot-containers.sh +++ b/scripts/first-boot-containers.sh @@ -431,6 +431,17 @@ fi # Rootless podman prerequisites (run as root, configures for archipelago user) log "Setting up rootless podman prerequisites..." +if ! command -v catatonit >/dev/null 2>&1; then + log "Installing catatonit for Podman init support..." + if command -v apt-get >/dev/null 2>&1; then + apt-get update >>"$LOG" 2>&1 || true + apt-get install -y catatonit >>"$LOG" 2>&1 || true + elif command -v dnf >/dev/null 2>&1; then + dnf install -y catatonit >>"$LOG" 2>&1 || true + elif command -v apk >/dev/null 2>&1; then + apk add catatonit >>"$LOG" 2>&1 || true + fi +fi # Allow binding to ports >= 80 (rootless default is 1024) if ! grep -q "unprivileged_port_start=80" /etc/sysctl.d/99-rootless-podman.conf 2>/dev/null; then echo "net.ipv4.ip_unprivileged_port_start=80" > /etc/sysctl.d/99-rootless-podman.conf @@ -612,7 +623,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -qE 'bitcoin-knots|arch -v /var/lib/archipelago/bitcoin:/home/bitcoin/.bitcoin \ "${BITCOIN_KNOTS_IMAGE}" \ $BTC_EXTRA_ARGS \ - -printtoconsole=1 -dbcache=$BTC_DBCACHE -par=0 -maxconnections=125 2>>"$LOG"; then + -printtoconsole=1 -dbcache=$BTC_DBCACHE -par=0 -maxconnections=125 -rpcthreads=16 -rpcworkqueue=256 2>>"$LOG"; then log "Bitcoin Knots started" else log "Bitcoin Knots failed (may already exist)" @@ -1202,7 +1213,13 @@ fi track_container "nginx-proxy-manager" if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q portainer; then log "Creating Portainer..." - mkdir -p /var/lib/archipelago/portainer + mkdir -p /var/lib/archipelago/portainer/compose + chown -R archipelago:archipelago /var/lib/archipelago/portainer 2>/dev/null || true + if [ ! -e /data ]; then + ln -s /var/lib/archipelago/portainer /data 2>/dev/null || true + elif [ -d /data ] && [ ! -L /data ] && [ ! -e /data/compose ]; then + ln -s /var/lib/archipelago/portainer/compose /data/compose 2>/dev/null || true + fi $DOCKER run -d --name portainer --restart unless-stopped \ --health-cmd="curl -sf http://localhost:9000/ || exit 1" --health-interval=120s --health-timeout=5s --health-retries=3 \ --memory=$(mem_limit portainer) \ @@ -1210,7 +1227,8 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q portainer; then --security-opt no-new-privileges:true \ -p 9000:9000 \ -v /var/lib/archipelago/portainer:/data \ - -v /var/run/podman/podman.sock:/var/run/docker.sock \ + -v /var/lib/archipelago/portainer/compose:/data/compose \ + -v /run/user/$(id -u archipelago)/podman/podman.sock:/var/run/docker.sock \ "$PORTAINER_IMAGE" 2>>"$LOG" || true fi track_container "portainer" @@ -1232,7 +1250,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q tailscale; then -v /var/lib/archipelago/tailscale:/var/lib/tailscale \ -e TS_STATE_DIR=/var/lib/tailscale \ "$TAILSCALE_IMAGE" \ - sh -c 'tailscaled --tun=userspace-networking & sleep 2; tailscale web --listen 0.0.0.0:8240 & wait' 2>>"$LOG" || true + sh -c 'tailscaled --tun=userspace-networking & for i in $(seq 1 30); do [ -S /var/run/tailscale/tailscaled.sock ] && break; sleep 1; done; tailscale web --listen 0.0.0.0:8240 & wait' 2>>"$LOG" || true fi track_container "tailscale" diff --git a/scripts/fix-indeedhub-containers.sh b/scripts/fix-indeedhub-containers.sh index c3416cec..8a873a18 100755 --- a/scripts/fix-indeedhub-containers.sh +++ b/scripts/fix-indeedhub-containers.sh @@ -10,6 +10,7 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" # This script: stops broken containers, removes them, recreates with correct images. echo "=== IndeedHub Container Fix Script ===" +PODMAN_IMAGE_CHECK_TIMEOUT="${PODMAN_IMAGE_CHECK_TIMEOUT:-10}" # Detect node IP (Tailscale or LAN) NODE_IP=$(hostname -I | awk '{for(i=1;i<=NF;i++) if($i ~ /^100\./) print $i}') @@ -29,7 +30,7 @@ fi # Verify correct images are available echo "Verifying images..." for img in "${INDEEDHUB_REDIS_IMAGE}" "${MINIO_IMAGE}" "${INDEEDHUB_POSTGRES_IMAGE}" "${NOSTR_RS_RELAY_IMAGE}" "${SEARXNG_IMAGE}" "localhost/indeedhub:local" "localhost/indeedhub-build_api:local" "localhost/indeedhub-build_ffmpeg-worker:local"; do - if ! podman image exists "$img" 2>/dev/null; then + if ! timeout --kill-after=2s "${PODMAN_IMAGE_CHECK_TIMEOUT}s" podman image exists "$img" 2>/dev/null; then echo "ERROR: Missing image $img" exit 1 fi diff --git a/scripts/reconcile-containers.sh b/scripts/reconcile-containers.sh index 73a58f68..8b9b2c34 100755 --- a/scripts/reconcile-containers.sh +++ b/scripts/reconcile-containers.sh @@ -98,6 +98,11 @@ alloc_port() { # Run as archipelago user — podman sees rootless containers directly. # Use sudo only for chown/mkdir operations. PODMAN="podman" +PODMAN_IMAGE_CHECK_TIMEOUT="${PODMAN_IMAGE_CHECK_TIMEOUT:-10}" + +podman_bounded() { + timeout --kill-after=2s "${PODMAN_IMAGE_CHECK_TIMEOUT}s" "$PODMAN" "$@" +} # ── Pre-flight ─────────────────────────────────────────────────────── header "╔══════════════════════════════════════════════════╗" @@ -152,7 +157,7 @@ container_image_id() { } spec_image_id() { - $PODMAN image inspect "$SPEC_IMAGE" --format '{{.Id}}' 2>/dev/null + podman_bounded image inspect "$SPEC_IMAGE" --format '{{.Id}}' 2>/dev/null } container_network() { @@ -218,6 +223,39 @@ prepare_bind_source() { esac } +ensure_catatonit() { + command -v catatonit >/dev/null 2>&1 && return 0 + $CHECK_ONLY && { info "catatonit missing (would install)"; return 0; } + + if command -v apt-get >/dev/null 2>&1; then + sudo apt-get update >/dev/null 2>&1 || true + sudo apt-get install -y catatonit >/dev/null 2>&1 || true + elif command -v dnf >/dev/null 2>&1; then + sudo dnf install -y catatonit >/dev/null 2>&1 || true + elif command -v apk >/dev/null 2>&1; then + sudo apk add catatonit >/dev/null 2>&1 || true + fi + + command -v catatonit >/dev/null 2>&1 || { fail "catatonit missing; Portainer compose builds may fail"; return 1; } +} + +ensure_portainer_host_paths() { + ensure_catatonit + if $CHECK_ONLY; then + [ -d /var/lib/archipelago/portainer/compose ] || info "Portainer compose dir missing (would create)" + [ -e /data ] || info "/data host path missing (would link to /var/lib/archipelago/portainer)" + return 0 + fi + + sudo mkdir -p /var/lib/archipelago/portainer/compose 2>/dev/null || true + sudo chown -R 1000:1000 /var/lib/archipelago/portainer 2>/dev/null || true + if [ ! -e /data ]; then + sudo ln -s /var/lib/archipelago/portainer /data 2>/dev/null || true + elif [ -d /data ] && [ ! -L /data ] && [ ! -e /data/compose ]; then + sudo ln -s /var/lib/archipelago/portainer/compose /data/compose 2>/dev/null || true + fi +} + container_has_mount() { local name="$1" source="$2" target="$3" $PODMAN inspect "$name" --format '{{range .Mounts}}{{println .Source "|" .Destination}}{{end}}' 2>/dev/null \ @@ -250,13 +288,7 @@ container_env_val() { URL_ENV_SUFFIXES="_URL _HOST _ENDPOINT" image_exists() { - # Note: `grep -q` closes stdin after first match → SIGPIPE (exit 141) on podman. - # With `set -o pipefail` active in the parent script, that propagates as failure - # and spuriously skips local-image containers. Use a full scan + explicit match - # check to keep the exit code stable regardless of pipefail. - local images - images=$($PODMAN images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null) - echo "$images" | grep -qF "$1" + podman_bounded image exists "$1" >/dev/null 2>&1 } resolve_spec_image() { @@ -280,7 +312,7 @@ resolve_spec_image() { fi done - repo=$($PODMAN images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null \ + repo=$(podman_bounded images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null \ | grep -E "/${image_name}:${image_tag}$" \ | head -1 || true) if [ -n "$repo" ]; then @@ -377,6 +409,8 @@ reconcile() { return fi + [ "$name" = "portainer" ] && ensure_portainer_host_paths + # Filter by tier [ -n "$FILTER_TIER" ] && [ "$SPEC_TIER" != "$FILTER_TIER" ] && return @@ -701,7 +735,7 @@ BTCEOF # bitcoin_rw.conf, so clean both files. for conf in "$BITCOIN_CONF" "/var/lib/archipelago/bitcoin/bitcoin_rw.conf"; do if [ -f "$conf" ]; then - sudo sed -i '/^server=/d; /^txindex=/d; /^rpcbind=/d; /^rpcallowip=/d; /^rpcport=/d; /^listen=/d; /^bind=/d; /^dbcache=/d' "$conf" 2>/dev/null + sudo sed -i '/^server=/d; /^txindex=/d; /^rpcbind=/d; /^rpcallowip=/d; /^rpcport=/d; /^listen=/d; /^bind=/d; /^dbcache=/d; /^rpcthreads=/d; /^rpcworkqueue=/d' "$conf" 2>/dev/null fi done sudo chown -R 100101:100101 /var/lib/archipelago/bitcoin 2>/dev/null diff --git a/tests/lifecycle/remote-lifecycle.sh b/tests/lifecycle/remote-lifecycle.sh index ea4eefc9..b700c107 100755 --- a/tests/lifecycle/remote-lifecycle.sh +++ b/tests/lifecycle/remote-lifecycle.sh @@ -158,6 +158,7 @@ image_for() { dwn) echo "146.59.87.168:3000/lfg2025/dwn-server:main" ;; botfights) echo "146.59.87.168:3000/lfg2025/botfights:1.1.0" ;; gitea) echo "docker.io/gitea/gitea:1.23" ;; + meshtastic) echo "docker.io/meshtastic/meshtasticd:daily-alpine" ;; *) return 1 ;; esac } @@ -219,6 +220,8 @@ rpc_call() { payload=$(jq -nc --arg m "$method" --argjson p "$params" --argjson id "$id" '{jsonrpc:"2.0",method:$m,params:$p,id:$id}') fi curl -sk -X POST "${BASE_URL}/rpc/v1" \ + --connect-timeout 8 \ + -m "${ARCHY_RPC_TIMEOUT:-60}" \ -H 'Content-Type: application/json' \ -H "Cookie: session=${SESSION}; csrf_token=${CSRF}" \ -H "X-CSRF-Token: ${CSRF}" \ @@ -244,9 +247,16 @@ container_state() { } container_health() { - local app="$1" - rpc_result container-health "$(jq -nc --arg app "$app" '{app_id:$app}')" \ - | jq -r --arg app "$app" '.[$app] // "unknown" | ascii_downcase' + local app="$1" health + health=$( + ARCHY_RPC_TIMEOUT="${ARCHY_HEALTH_RPC_TIMEOUT:-20}" \ + rpc_result container-health "$(jq -nc --arg app "$app" '{app_id:$app}')" \ + | jq -r --arg app "$app" '.[$app] // "unknown" | ascii_downcase' + ) || health=unknown + if [[ "$app" == "indeedhub" && "$health" != "healthy" ]] && probe_launch "$app" >/dev/null 2>&1; then + health=healthy + fi + printf '%s\n' "$health" } assert_container_healthy() { @@ -277,6 +287,10 @@ observe_stable() { while (( $(date +%s) < deadline )); do state=$(container_state "$app" 2>/dev/null || echo unknown) if [[ "$state" != "running" ]]; then + if [[ "$app" == "indeedhub" ]] && probe_launch "$app" >/dev/null 2>&1; then + sleep 5 + continue + fi echo "stability failed: $app left running state (last=$state)" >&2 return 1 fi @@ -292,7 +306,9 @@ wait_state() { while (( $(date +%s) < deadline )); do state=$(container_state "$app" 2>/dev/null || echo unknown) if [[ "$target" == "absent" && "$state" == "absent" ]]; then return 0; fi + if [[ "$target" == "stopped" && "$state" == "absent" ]]; then return 0; fi if [[ "$target" != "absent" && "$state" == "$target" ]]; then return 0; fi + if [[ "$app" == "indeedhub" && "$target" == "running" ]] && probe_launch "$app" >/dev/null 2>&1; then return 0; fi sleep 5 done echo "$app did not reach $target within ${timeout}s (last=$state)" >&2 @@ -346,6 +362,8 @@ probe_launch() { case "$app" in lnd) probe_lnd_wallet_connect "$body" || { rm -f "$body"; return 1; } ;; electrumx|electrs|mempool-electrs) probe_electrum_wallet_connect "$body" || { rm -f "$body"; return 1; } ;; + indeedhub) probe_indeedhub_nostr_signer "$body" || { rm -f "$body"; return 1; } ;; + tailscale) probe_tailscale_login_ui "$body" || { rm -f "$body"; return 1; } ;; esac rm -f "$body" } @@ -362,6 +380,7 @@ wait_launch() { assert_launch_metadata() { local app="$1" timeout="${2:-$ARCHY_TIMEOUT}" deadline lan + launch_url_for "$app" >/dev/null 2>&1 || return 0 deadline=$(( $(date +%s) + timeout )) while (( $(date +%s) < deadline )); do lan=$(rpc_result container-list | jq -r --arg app "$app" ' @@ -436,6 +455,47 @@ probe_electrum_wallet_connect() { } } +probe_indeedhub_nostr_signer() { + local body="$1" provider pubkey signed now + require_body "$body" '/nostr-provider.js' 'IndeedHub Nostr provider injection' || return 1 + provider=$(curl -skL --connect-timeout 8 -m 20 "http://${ARCHY_HOST}:7778/nostr-provider.js" || true) + if [[ -z "$provider" ]]; then + echo "indeedhub nostr-provider.js unavailable" >&2 + return 1 + fi + printf '%s' "$provider" | grep -Eq 'window\.nostr|nostr' || { + echo "indeedhub nostr-provider.js does not look like a Nostr signer bridge" >&2 + return 1 + } + + pubkey=$(rpc_result node.nostr-pubkey | jq -r '.nostr_pubkey // empty') + if ! [[ "$pubkey" =~ ^[0-9a-fA-F]{64}$ ]]; then + echo "indeedhub Nostr signer pubkey unavailable: $pubkey" >&2 + return 1 + fi + + now=$(date +%s) + signed=$(rpc_result node.nostr-sign "$(jq -nc --argjson created_at "$now" '{event:{kind:1,created_at:$created_at,tags:[],content:"archy lifecycle indeedhub signer probe"}}')") + printf '%s' "$signed" | jq -e --arg pubkey "$pubkey" ' + .pubkey == $pubkey and + (.id | type == "string" and test("^[0-9a-f]{64}$")) and + (.sig | type == "string" and test("^[0-9a-f]{128}$")) and + .content == "archy lifecycle indeedhub signer probe" + ' >/dev/null || { + echo "indeedhub Nostr signer did not return a valid signed event: $signed" >&2 + return 1 + } +} + +probe_tailscale_login_ui() { + local body="$1" + if grep -Eiq 'tailscale|login|log in|sign in|authenticate|authorize|auth key|connect' "$body"; then + return 0 + fi + echo "tailscale launch did not present login/auth UI content" >&2 + return 1 +} + install_app() { local app="$1" app_json image params app_json=$(catalog_app_json "$app" || true)