diff --git a/core/archipelago/src/api/rpc/package/stacks.rs b/core/archipelago/src/api/rpc/package/stacks.rs index 0c010098..75974076 100644 --- a/core/archipelago/src/api/rpc/package/stacks.rs +++ b/core/archipelago/src/api/rpc/package/stacks.rs @@ -6,7 +6,6 @@ use crate::api::rpc::RpcHandler; use crate::data_model::InstallPhase; use anyhow::{Context, Result}; -use base64::Engine; use std::process::Output; use std::time::Duration; use tracing::info; @@ -725,10 +724,6 @@ fn indeedhub_stack_app_ids() -> &'static [&'static str] { const REGISTRY: &str = "146.59.87.168:3000/lfg2025"; -const NETBIRD_DASHBOARD_IMAGE: &str = "docker.io/netbirdio/dashboard:v2.38.0"; -const NETBIRD_SERVER_IMAGE: &str = "docker.io/netbirdio/netbird-server:0.71.2"; -const NETBIRD_PROXY_IMAGE: &str = "docker.io/library/nginx:1.27-alpine"; - /// Pull an image with retry and exponential backoff (3 attempts). async fn pull_image_with_retry(image: &str) -> Result<()> { let exists = podman_stack_status(&["image", "exists", image], PODMAN_STACK_PROBE_TIMEOUT).await; @@ -1846,9 +1841,13 @@ impl RpcHandler { // host facts + the netbird-net gateway. The manifests use the exact live // container names, so on an existing node this ADOPTS the running stack // rather than recreating it (the sqlite store + base64 keys are - // preserved — ensure_generated_secrets no-ops on existing files). Falls - // back to the legacy installer below only when the orchestrator doesn't - // know these app_ids (manifests not yet deployed to the node). + // preserved — ensure_generated_secrets no-ops on existing files). + // + // #20 ph4: the legacy hardcoded `podman run` installer was DELETED — the + // signed catalog always ships apps/netbird-*/manifest.yml, so there is no + // in-Rust fallback. If the orchestrator doesn't know these app_ids and no + // running stack exists to adopt, install errors rather than silently + // diverging from the manifest contract. if let Some(orchestrated) = install_stack_via_orchestrator(self, "netbird", netbird_stack_app_ids()).await? { @@ -1865,491 +1864,12 @@ impl RpcHandler { return Ok(adopted); } - install_log("INSTALL START: netbird stack (dashboard + server)").await; - info!("Installing self-hosted NetBird stack"); - - self.set_install_phase("netbird", InstallPhase::PullingImage) - .await; - for (i, image) in [ - NETBIRD_DASHBOARD_IMAGE, - NETBIRD_SERVER_IMAGE, - NETBIRD_PROXY_IMAGE, - ] - .iter() - .enumerate() - { - self.set_install_progress("netbird", i as u64, 3).await; - pull_image_with_retry(image) - .await - .with_context(|| format!("Failed to pull NetBird image: {}", image))?; - } - self.set_install_progress("netbird", 3, 3).await; - - for name in ["netbird", "netbird-dashboard", "netbird-server"] { - let _ = podman_stack_status(&["rm", "-f", name], PODMAN_STACK_PROBE_TIMEOUT).await; - } - let _ = podman_stack_status( - &["network", "rm", "-f", "netbird-net"], - PODMAN_STACK_PROBE_TIMEOUT, + anyhow::bail!( + "netbird manifests not available on this node — the signed catalog must provide apps/netbird-*/manifest.yml (legacy hardcoded installer removed in #20 ph4)" ) - .await; - - self.set_install_phase("netbird", InstallPhase::CreatingContainer) - .await; - - tokio::fs::create_dir_all("/var/lib/archipelago/netbird/data") - .await - .context("Failed to create NetBird data directory")?; - - let host_ip = detect_netbird_public_host_ip() - .await - .unwrap_or_else(|| self.config.host_ip.clone()); - - // Create the network FIRST so we can read back the gateway it was - // assigned — that gateway is Podman's aardvark DNS, which the proxy's - // nginx needs as an explicit `resolver` to re-resolve container names - // (issue #15: without it nginx caches a container IP and 502s forever - // once that IP changes on restart/reboot). - let _ = podman_stack_status( - &["network", "create", "netbird-net"], - PODMAN_STACK_PROBE_TIMEOUT, - ) - .await; - - let resolver_ip = netbird_net_resolver_ip().await; - write_netbird_config_files(&host_ip, &self.config.host_ip, &resolver_ip).await?; - ensure_netbird_tls_cert(&host_ip).await?; - - let mut server_cmd = tokio::process::Command::new("podman"); - server_cmd.args([ - "run", - "-d", - "--name", - "netbird-server", - "--network", - "netbird-net", - "--network-alias", - "netbird-server", - "--restart=unless-stopped", - "-p", - "8086:80", - "-p", - "3478:3478/udp", - "-v", - "/var/lib/archipelago/netbird/data:/var/lib/netbird", - "-v", - "/var/lib/archipelago/netbird/config.yaml:/etc/netbird/config.yaml:ro", - NETBIRD_SERVER_IMAGE, - "--config", - "/etc/netbird/config.yaml", - ]); - run_required_stack_command("netbird", "create server", &mut server_cmd).await?; - - self.set_install_phase("netbird", InstallPhase::StartingContainer) - .await; - tokio::time::sleep(std::time::Duration::from_secs(5)).await; - - let mut dashboard_cmd = tokio::process::Command::new("podman"); - dashboard_cmd.args([ - "run", - "-d", - "--name", - "netbird-dashboard", - "--network", - "netbird-net", - // Explicit alias so the proxy can always resolve `netbird-dashboard` - // via Podman DNS — don't rely on implicit container-name aliasing. - "--network-alias", - "netbird-dashboard", - "--restart=unless-stopped", - "--env-file", - "/var/lib/archipelago/netbird/dashboard.env", - NETBIRD_DASHBOARD_IMAGE, - ]); - run_required_stack_command("netbird", "create dashboard", &mut dashboard_cmd).await?; - - let mut proxy_cmd = tokio::process::Command::new("podman"); - proxy_cmd.args([ - "run", - "-d", - "--name", - "netbird", - "--network", - "netbird-net", - "--restart=unless-stopped", - // 8087 publishes the TLS listener — netbird's dashboard requires a - // secure context (window.crypto.subtle / OIDC PKCE), issue #15. - "-p", - "8087:443", - "-v", - "/var/lib/archipelago/netbird/nginx.conf:/etc/nginx/conf.d/default.conf:ro", - "-v", - "/var/lib/archipelago/netbird/tls.crt:/etc/nginx/tls.crt:ro", - "-v", - "/var/lib/archipelago/netbird/tls.key:/etc/nginx/tls.key:ro", - NETBIRD_PROXY_IMAGE, - ]); - run_required_stack_command("netbird", "create unified proxy", &mut proxy_cmd).await?; - - wait_for_stack_containers( - "netbird", - &["netbird-server", "netbird-dashboard", "netbird"], - 60, - ) - .await?; - - self.set_install_phase("netbird", InstallPhase::WaitingHealthy) - .await; - // Containers being "running" is NOT the same as the embedded OIDC - // provider being ready (#10). The dashboard SPA opens right after install - // and, if it loads before /oauth2/.well-known is served, caches a bad - // auth state — the user appears logged-in but can't log out until it - // self-corrects. Wait (best-effort) for OIDC discovery to answer before - // we report Done, so the first dashboard load sees a ready provider. - wait_for_netbird_oidc_ready(Duration::from_secs(60)).await; - - self.set_install_phase("netbird", InstallPhase::PostInstall) - .await; - self.set_install_phase("netbird", InstallPhase::Done).await; - self.clear_install_progress("netbird").await; - - install_log("INSTALL OK: netbird stack").await; - info!("NetBird stack installed"); - Ok(serde_json::json!({ - "success": true, - "package_id": "netbird", - "message": "NetBird self-hosted stack installed", - })) } } -/// Best-effort wait for NetBird's embedded OIDC provider to start serving its -/// discovery document. The management server publishes 8086:80 on the host and -/// is the issuer at `/oauth2`, so its `.well-known/openid-configuration` is the -/// signal that the dashboard's login/logout flow will work. Polls until a 2xx -/// or the timeout — NEVER fails the install (the stack is already running; this -/// only narrows the post-install race window in #10). -async fn wait_for_netbird_oidc_ready(timeout: Duration) { - let url = "http://127.0.0.1:8086/oauth2/.well-known/openid-configuration"; - let client = match reqwest::Client::builder() - .timeout(Duration::from_secs(5)) - .build() - { - Ok(c) => c, - Err(_) => return, - }; - let deadline = tokio::time::Instant::now() + timeout; - loop { - if let Ok(resp) = client.get(url).send().await { - if resp.status().is_success() { - info!("NetBird OIDC discovery is ready"); - return; - } - } - if tokio::time::Instant::now() >= deadline { - info!("NetBird OIDC discovery not ready within timeout — proceeding anyway"); - return; - } - tokio::time::sleep(Duration::from_secs(2)).await; - } -} - -async fn read_or_generate_b64_secret(name: &str) -> String { - let path = format!("/var/lib/archipelago/secrets/{}", name); - if let Ok(val) = tokio::fs::read_to_string(&path).await { - let trimmed = val.trim().to_string(); - if !trimmed.is_empty() { - return trimmed; - } - } - let mut buf = [0u8; 32]; - rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf); - let secret = base64::engine::general_purpose::STANDARD.encode(buf); - let _ = tokio::fs::create_dir_all("/var/lib/archipelago/secrets").await; - let _ = tokio::fs::write(&path, &secret).await; - secret -} - -/// Read the gateway of the `netbird-net` bridge. Podman runs its aardvark DNS -/// resolver on this address, so nginx can use it as an explicit `resolver` to -/// re-resolve container names at request time. Falls back to Podman's usual -/// first-pool gateway if the inspect fails (best effort — config is rewritten -/// on every (re)install). -async fn netbird_net_resolver_ip() -> String { - let out = tokio::process::Command::new("podman") - .args([ - "network", - "inspect", - "netbird-net", - "--format", - "{{range .Subnets}}{{.Gateway}}{{end}}", - ]) - .output() - .await; - if let Ok(o) = out { - let gw = String::from_utf8_lossy(&o.stdout).trim().to_string(); - if !gw.is_empty() && gw.parse::().is_ok() { - return gw; - } - } - "10.89.0.1".to_string() -} - -/// Generate a self-signed TLS cert for the netbird proxy if absent. The -/// dashboard needs a secure context (window.crypto.subtle / OIDC PKCE), so the -/// proxy serves HTTPS; a self-signed cert is sufficient (the user accepts it -/// once when opening netbird in a tab). SAN covers the LAN IP plus -/// localhost/127.0.0.1 so it's valid however the box is reached locally. -async fn ensure_netbird_tls_cert(host_ip: &str) -> Result<()> { - let dir = "/var/lib/archipelago/netbird"; - let crt = format!("{dir}/tls.crt"); - let key = format!("{dir}/tls.key"); - if tokio::fs::metadata(&crt).await.is_ok() && tokio::fs::metadata(&key).await.is_ok() { - return Ok(()); - } - let _ = tokio::fs::create_dir_all(dir).await; - let san = format!("subjectAltName=IP:{host_ip},IP:127.0.0.1,DNS:localhost"); - let status = tokio::process::Command::new("openssl") - .args([ - "req", - "-x509", - "-newkey", - "rsa:2048", - "-nodes", - "-keyout", - &key, - "-out", - &crt, - "-days", - "3650", - "-subj", - &format!("/CN={host_ip}"), - "-addext", - &san, - ]) - .status() - .await - .context("failed to run openssl for netbird TLS cert")?; - if !status.success() { - anyhow::bail!("openssl failed to generate netbird TLS cert"); - } - Ok(()) -} - -async fn write_netbird_config_files(host_ip: &str, lan_ip: &str, resolver_ip: &str) -> Result<()> { - // netbird's dashboard uses window.crypto.subtle (OIDC PKCE), which browsers - // only expose in a SECURE context — so the proxy serves HTTPS and every - // origin here is https (issue #15: over plain http the dashboard threw - // "window.crypto.subtle is unavailable" and never reached login). - let public_origin = format!("https://{}:8087", host_ip); - let server_origin = format!("http://{}:8086", host_ip); - // A single box is reached via several addresses. Allow the OIDC login flow - // to redirect back to whichever origin the user actually used, otherwise - // post-login lands on the wrong host and the dashboard shows - // "Unauthenticated" (issue #15). The browser-side CORS is handled in the - // nginx proxy; this covers the redirect-URI allow-list. - let lan_origin = format!("https://{}:8087", lan_ip); - let mut redirect_origins = vec![public_origin.clone()]; - if lan_origin != public_origin { - redirect_origins.push(lan_origin); - } - let dashboard_redirect_uris = redirect_origins - .iter() - .flat_map(|o| { - [ - format!(" - \"{o}/nb-auth\""), - format!(" - \"{o}/nb-silent-auth\""), - ] - }) - .collect::>() - .join("\n"); - let dashboard_logout_uris = redirect_origins - .iter() - .map(|o| format!(" - \"{o}/\"")) - .collect::>() - .join("\n"); - let relay_secret = read_or_generate_b64_secret("netbird-relay-auth-secret").await; - let encryption_key = read_or_generate_b64_secret("netbird-store-encryption-key").await; - let config = format!( - r#"server: - listenAddress: ":80" - exposedAddress: "{public_origin}" - stunPorts: - - 3478 - metricsPort: 9090 - healthcheckAddress: ":9000" - logLevel: "info" - logFile: "console" - authSecret: "{relay_secret}" - dataDir: "/var/lib/netbird" - auth: - issuer: "{public_origin}/oauth2" - localAuthDisabled: false - signKeyRefreshEnabled: false - dashboardRedirectURIs: -{dashboard_redirect_uris} - dashboardPostLogoutRedirectURIs: -{dashboard_logout_uris} - cliRedirectURIs: - - "http://localhost:53000/" - store: - engine: "sqlite" - encryptionKey: "{encryption_key}" -"# - ); - tokio::fs::write("/var/lib/archipelago/netbird/config.yaml", config) - .await - .context("Failed to write NetBird config.yaml")?; - - let dashboard_env = format!( - r#"NETBIRD_MGMT_API_ENDPOINT={public_origin} -NETBIRD_MGMT_GRPC_API_ENDPOINT={public_origin} -AUTH_AUDIENCE=netbird-dashboard -AUTH_CLIENT_ID=netbird-dashboard -AUTH_CLIENT_SECRET= -AUTH_AUTHORITY={public_origin}/oauth2 -USE_AUTH0=false -AUTH_SUPPORTED_SCOPES=openid profile email groups -AUTH_REDIRECT_URI=/nb-auth -AUTH_SILENT_REDIRECT_URI=/nb-silent-auth -NETBIRD_TOKEN_SOURCE=idToken -NGINX_SSL_PORT=443 -LETSENCRYPT_DOMAIN=none -"# - ); - tokio::fs::write("/var/lib/archipelago/netbird/dashboard.env", dashboard_env) - .await - .context("Failed to write NetBird dashboard.env")?; - - let nginx_conf = format!( - r#"server {{ - listen 443 ssl; - server_name _; - - # netbird's dashboard needs a secure context (window.crypto.subtle for OIDC - # PKCE), so the proxy terminates TLS with a self-signed cert (issue #15). - ssl_certificate /etc/nginx/tls.crt; - ssl_certificate_key /etc/nginx/tls.key; - - # Rootless Podman can hand a container a new IP across restarts/reboots. - # nginx resolves a literal upstream name ONCE at startup and caches it, so - # after the IP moves every request 502s with "host unreachable" (issue #15, - # observed live on .198: nginx pinned to a dead netbird-dashboard IP). Fix: - # point `resolver` at the netbird-net gateway (Podman's aardvark DNS) and - # use VARIABLE upstreams, which forces nginx to re-resolve the container - # names at request time. Everything is reached container-to-container by - # name so nothing depends on host-published ports either. - resolver {resolver_ip} valid=10s ipv6=off; - - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - proxy_http_version 1.1; - - location ~ ^/(relay|ws-proxy/) {{ - set $nb_server netbird-server; - proxy_pass http://$nb_server:80; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - proxy_read_timeout 1d; - }} - - location ~ ^/(api|oauth2)(/|$) {{ - # The dashboard is a SPA whose API/OIDC base URL is baked at build time - # to one host:port. A single box is reached via several addresses (LAN - # IP, Tailscale 100.x, hostname), so those fetches are cross-origin and - # the browser blocks them with no Access-Control-Allow-Origin (issue - # #15, observed live on .198). Reflect the caller's Origin so the - # self-hosted management/OIDC API is reachable from any of them, and - # answer the CORS preflight here. - if ($request_method = OPTIONS) {{ - add_header Access-Control-Allow-Origin $http_origin always; - add_header Access-Control-Allow-Credentials true always; - add_header Access-Control-Allow-Methods "GET, POST, PUT, PATCH, DELETE, OPTIONS" always; - add_header Access-Control-Allow-Headers "Authorization, Content-Type, Accept" always; - add_header Access-Control-Max-Age 86400 always; - add_header Content-Length 0; - return 204; - }} - add_header Access-Control-Allow-Origin $http_origin always; - add_header Access-Control-Allow-Credentials true always; - add_header Access-Control-Allow-Methods "GET, POST, PUT, PATCH, DELETE, OPTIONS" always; - add_header Access-Control-Allow-Headers "Authorization, Content-Type, Accept" always; - set $nb_server netbird-server; - proxy_pass http://$nb_server:80; - }} - - location ~ ^/(signalexchange\.SignalExchange|management\.ManagementService|management\.ProxyService)/ {{ - set $nb_server netbird-server; - grpc_pass grpc://$nb_server:80; - grpc_read_timeout 1d; - grpc_send_timeout 1d; - }} - - # OIDC callback routes are client-side SPA routes with NO prebuilt page in - # the dashboard bundle, so proxying them straight through 404s — which - # crashes the dashboard's auth init and shows "Unauthenticated" with dead - # buttons (issue #15, confirmed live on .198: /nb-auth + /nb-silent-auth - # returned 404). Serve the dashboard's index.html at these paths (URL - # unchanged) so react-oidc boots and completes the login / silent-SSO. - location ~ ^/(nb-auth|nb-silent-auth) {{ - set $nb_dashboard netbird-dashboard; - rewrite ^.*$ /index.html break; - proxy_pass http://$nb_dashboard:80; - }} - - location / {{ - set $nb_dashboard netbird-dashboard; - proxy_pass http://$nb_dashboard:80; - }} -}} - -# Direct server remains available for diagnostics at {server_origin}. -"# - ); - tokio::fs::write("/var/lib/archipelago/netbird/nginx.conf", nginx_conf) - .await - .context("Failed to write NetBird nginx.conf")?; - - Ok(()) -} - -async fn detect_netbird_public_host_ip() -> Option { - let output = tokio::process::Command::new("hostname") - .args(["-I"]) - .output() - .await - .ok()?; - let stdout = String::from_utf8_lossy(&output.stdout); - let ips: Vec<&str> = stdout - .split_whitespace() - .filter(|s| s.contains('.')) - .collect(); - - // Prefer the LAN address as the canonical origin — that's what users browse - // to on the local network. Baking the Tailscale 100.x address here broke - // LAN access with cross-origin/redirect mismatches (issue #15). Tailscale - // (100.64.0.0/10 CGNAT) is only a fallback for nodes with no LAN IP. - let is_private_lan = |ip: &str| { - ip.starts_with("192.168.") - || ip.starts_with("10.") - || (ip.starts_with("172.") - && ip - .split('.') - .nth(1) - .and_then(|o| o.parse::().ok()) - .map(|o| (16..=31).contains(&o)) - .unwrap_or(false)) - }; - if let Some(lan) = ips.iter().find(|ip| is_private_lan(ip)) { - return Some(lan.to_string()); - } - ips.iter() - .find(|ip| ip.starts_with("100.")) - .map(|s| s.to_string()) -} - #[cfg(test)] mod tests { use super::{btcpay_stack_app_ids, mempool_stack_app_ids}; diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 31f521be..db01442a 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -2964,7 +2964,8 @@ impl ProdContainerOrchestrator { } /// The gateway IP of the app's podman network — aardvark's DNS resolver - /// address. Mirrors the legacy `netbird_net_resolver_ip`; falls back to + /// address. (Generalised from the old per-app netbird resolver helper, + /// deleted in #20 ph4.) Falls back to /// podman's usual first-pool gateway if the inspect can't be parsed (the /// network was just ensured to exist, so this is a belt-and-braces default). async fn network_gateway(&self, manifest: &AppManifest) -> Result { @@ -3004,8 +3005,8 @@ impl ProdContainerOrchestrator { /// entry whose crt+key already exist (idempotent / data-preserving). CN and /// SAN templates are rendered against host facts; when omitted they default /// to the node's host IP plus `127.0.0.1`/`localhost` so the cert is valid - /// however the box is reached locally. Mirrors the legacy - /// `ensure_netbird_tls_cert` (rsa:2048, 10-year, no per-app Rust). + /// however the box is reached locally. (Generalised from the old per-app + /// netbird TLS helper, deleted in #20 ph4: rsa:2048, 10-year, no per-app Rust.) async fn ensure_manifest_certs(&self, manifest: &AppManifest) -> Result<()> { let facts = self.detect_host_facts(); let render = |s: &str| {