From a483fe4baa4d6028b7ddf4b0555ea439a4cf1d90 Mon Sep 17 00:00:00 2001 From: archipelago Date: Sun, 14 Jun 2026 03:35:19 -0400 Subject: [PATCH] fix: derive launch port from URL authority, not naive rsplit reachable_lan_address() parsed the launch port with url.rsplit(':') which yields "8096/" for manifest interfaces.main URLs that carry a path (http://localhost:8096/). That fails to parse and silently drops a perfectly reachable launch URL, so apps like jellyfin, btcpay-server, fedimint, gitea, nextcloud and portainer showed running with no launch link in the UI. New launch_url_port() reads digits after the final colon (mirroring port_from_url in the RPC layer) and tolerates a trailing path. Adds regression tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- core/Cargo.lock | 2 +- .../src/container/docker_packages.rs | 42 +++++++- docs/MIGRATION_STATUS_REPORT.md | 97 ++++++++++++++++++- 3 files changed, 138 insertions(+), 3 deletions(-) diff --git a/core/Cargo.lock b/core/Cargo.lock index 142233d4..f6b13fbd 100644 --- a/core/Cargo.lock +++ b/core/Cargo.lock @@ -80,7 +80,7 @@ checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "archipelago" -version = "1.7.89-alpha" +version = "1.7.90-alpha" dependencies = [ "anyhow", "archipelago-container", diff --git a/core/archipelago/src/container/docker_packages.rs b/core/archipelago/src/container/docker_packages.rs index 2931a393..7a0c87fa 100644 --- a/core/archipelago/src/container/docker_packages.rs +++ b/core/archipelago/src/container/docker_packages.rs @@ -699,7 +699,7 @@ async fn reachable_lan_address(app_id: &str, candidate: Option) -> Optio if !requires_reachable_launch(app_id) { return Some(url); } - let Some(port) = url.rsplit(':').next().and_then(|p| p.parse::().ok()) else { + let Some(port) = launch_url_port(&url) else { return None; }; if launch_port_reachable(port).await { @@ -710,6 +710,23 @@ async fn reachable_lan_address(app_id: &str, candidate: Option) -> Optio } } +/// Extract the TCP port from a launch URL's authority. +/// +/// The candidate URL can carry a path when it comes from a manifest +/// `interfaces.main` declaration (e.g. `http://localhost:8096/`). A naive +/// `rsplit(':')` then yields `"8096/"`, which fails to parse and silently +/// drops a reachable launch URL. Reading digits after the final colon mirrors +/// `port_from_url` in the RPC layer and tolerates a trailing path. +fn launch_url_port(url: &str) -> Option { + let after_colon = url.rsplit_once(':')?.1; + after_colon + .chars() + .take_while(|c| c.is_ascii_digit()) + .collect::() + .parse::() + .ok() +} + async fn launch_port_reachable(port: u16) -> bool { matches!( tokio::time::timeout( @@ -788,3 +805,26 @@ fn package_state_str(state: &PackageState) -> &str { PackageState::Updating => "updating", } } + +#[cfg(test)] +mod launch_url_port_tests { + use super::launch_url_port; + + #[test] + fn parses_port_with_trailing_path() { + // Regression: manifest interfaces.main yields a path-suffixed URL. + // The old rsplit(':') parse produced "8096/" and dropped the URL. + assert_eq!(launch_url_port("http://localhost:8096/"), Some(8096)); + assert_eq!(launch_url_port("http://localhost:8175/admin"), Some(8175)); + } + + #[test] + fn parses_bare_authority_port() { + assert_eq!(launch_url_port("http://localhost:8083"), Some(8083)); + } + + #[test] + fn rejects_url_without_port() { + assert_eq!(launch_url_port("http://localhost/"), None); + } +} diff --git a/docs/MIGRATION_STATUS_REPORT.md b/docs/MIGRATION_STATUS_REPORT.md index 342398f1..ae0477d4 100644 --- a/docs/MIGRATION_STATUS_REPORT.md +++ b/docs/MIGRATION_STATUS_REPORT.md @@ -1,6 +1,101 @@ # Migration Status Report -Last updated: 2026-06-11 +Last updated: 2026-06-14 + +## Validation node (ACTIVE) + +As of 2026-06-14 the app-migration lifecycle validation moves from `.198` (remote, OVH) to +**`.116` — the local dev node (`archi-thinkpad`, `192.168.1.116`)** because it is the machine +this session runs on, so the harness drives it over loopback instead of SSH (much faster, no +network latency). A separate agent owns OS-level fixes + its own test harness; this track owns +the **app-packaging migration** lifecycle validation only. + +How to drive the harness against `.116` (local): + +```bash +ARCHY_HOST=127.0.0.1 ARCHY_SCHEME=http ARCHY_PASSWORD='ThisIsWeb54321@' \ + ARCHY_APPS='meshtastic,jellyfin,filebrowser,uptime-kuma' \ + tests/lifecycle/remote-lifecycle.sh # focused, audit-only (non-destructive) +``` + +- `.116` serves nginx on **:80 only** (443 is tailscale's) → use `ARCHY_SCHEME=http`, `ARCHY_HOST=127.0.0.1`. +- Local node is healthy: `update_state.json.current_version == 1.7.90-alpha`, `update_in_progress=false` + (the OTA self-heal that was a follow-up gap in PROGRESS_MEMORY is now confirmed resolved on .116). +- Login password for `.116`: `ThisIsWeb54321@` (verified against `auth.login`). Note: auth.login + has a login rate-limiter — avoid rapid repeated attempts. +- `.198` results below remain the prior baseline; new results are tagged `[.116]`. + +### [.116] audit log (newest first) + +- **2026-06-14 — focused audit `meshtastic,jellyfin,filebrowser,uptime-kuma` (audit-only, non-destructive):** + harness exit 1, FAILED checks: 1. + - `filebrowser` — running, pass (also passed a standalone single-app smoke run). + - `uptime-kuma` — running, pass. + - `meshtastic` — `state=absent`. Not installed on `.116` (was installed/validated on `.198`). + Not a regression; just node state. To exercise meshtastic here, install it first (it needs + `/dev/ttyUSB0`, which `.116` may not have) or drop it from the focused set on this node. + - `jellyfin` — **running but FAILED: "launch metadata missing: jellyfin has no lan_address".** + **ROOT-CAUSED 2026-06-14 — real, current bug in the working tree (a regression).** See + "FINDING F1" below. + +### [.116] FINDING F1 — manifest launch URLs with a path are silently dropped (OPEN, fix pending) + +**Symptom:** `jellyfin` is `running` and genuinely serving (`curl 127.0.0.1:8096/` → 302), but +`container-list` reports `lan_address: null`, so the UI/harness sees no launch URL. + +**Root cause:** `core/archipelago/src/container/docker_packages.rs::reachable_lan_address()` parses +the port out of the candidate URL with `url.rsplit(':').next()`. When the candidate comes from the +manifest `interfaces.main` (via `PodmanClient::lan_address_for` → +`core/container/src/podman_client.rs::manifest_primary_interface_url`), the URL **includes the +manifest `path`** — e.g. jellyfin → `http://localhost:8096/`. Then `rsplit(':').next()` yields +`"8096/"`, which **fails to `parse::()`**, so the function hits its `else { return None }` +branch and drops a perfectly reachable launch URL. (Diagnostic tell: the dropped-at-parse path +emits **no** log, whereas a genuine unreachable port logs "suppressing unreachable launch URL". +jellyfin has no such log; uptime-kuma — whose candidate `…:3002` has no path — does.) + +**Why it's a regression:** the old `extract_lan_address(ports)` produced `http://localhost:PORT` +(no path), which parsed fine. The newer manifest-interface feature appends the declared `path`, +so any app routed through `lan_address_for` now yields `…:PORT/` and trips the parser. + +**Blast radius (apps in `requires_reachable_launch` whose `interfaces.main.path` = `/`):** +`botfights`, `btcpay-server`, `fedimint`, `jellyfin`, `gitea`, `nextcloud`, `portainer`. +(`filebrowser`/`nextcloud`/`nginx-proxy-manager`/`vaultwarden` are in `uses_allocated_launch_port` +so they hit `extract_lan_address` first and dodge it; `grafana`/`mempool`/`uptime-kuma`/`searxng` +have no manifest `interfaces.main` path.) On `.198` this likely went unnoticed because those apps +weren't all running during the launch-metadata assertion, or predated the interfaces.main addition. + +**Fix (IMPLEMENTED in working tree, uncommitted):** +`docker_packages.rs::reachable_lan_address` now parses the port via a new `launch_url_port()` +helper that reads digits after the final colon (`take_while(is_ascii_digit)`), mirroring the +RPC-layer `port_from_url`, so `http://localhost:8096/` → `Some(8096)`. Added unit tests +(`launch_url_port_tests`) covering the trailing-path regression, the bare-authority case, and a +no-port reject. The existing `lan_address_prefers_manifest_main_interface` test only exercised +`lan_address_for` (which always returned `…:8175/`) and never the `reachable_lan_address` wrapper, +which is why the bug slipped through. + +**Unit validation: GREEN (2026-06-14).** `cargo test -p archipelago --bin archipelago launch_url_port` +→ 3 passed / 0 failed (trailing-path, bare-authority, no-port-reject); crate compiles clean. + +**Coordination note (shared tree):** the repo is on branch `fix/wallet-receive-portdrift-secrets` +at commit `bb808df8` (= the deployed 1.7.90-alpha). A parallel agent has uncommitted changes here +(lnd `wallet.rs`, `bitcoin_relay.rs`, `prod_orchestrator.rs`, electrumx manifest, neode-ui, new +bats). To validate F1 in isolation (and NOT deploy their in-flight work onto the live node, nor +disturb their tree), the live-validation build is done in a detached git worktree at +`/home/archipelago/archy-f1` = clean `bb808df8` + only the F1 `docker_packages.rs` change. Build: +`cd /home/archipelago/archy-f1/core && TMPDIR=/home/archipelago/.buildtmp cargo build --release -p archipelago` +(`.116`'s `/tmp` is a 7.7G tmpfs that runs 100% full → the ring crate's C compile fails with +"No space left on device"; redirect `TMPDIR` to `/` which has ~399G). After validation the +worktree is removed (`git worktree remove`). NOTE: sideloading replaces the OTA-managed +`/usr/local/bin/archipelago` with a local 1.7.90-alpha+F1 build until the next OTA — back up the +current binary first (`/usr/local/bin/archipelago.pre-f1.bak`). + +**Live validation status — PENDING, GATED ON USER OK.** Proving F1 on `.116` requires a backend +rebuild + redeploy + `systemctl restart archipelago`. Per `feedback_no_systemctl_deploy_until_quadlet` +that restart SIGKILLs every container in the service cgroup (the boot reconciler re-adopts them, but +it is disruptive on this live node that runs the user's real bitcoin/lnd/apps). So: unit test first +(non-disruptive), then ask before the redeploy. Do NOT run the prod binary directly to "check a +version" — `/usr/local/bin/archipelago ` boots a whole second node instance (learned the +hard way 2026-06-14; it exited without leaving a stray, but don't repeat). ## Goal