fix(quadlet): http:// double-prefix + companion migration race
Two bugs surfaced by the first real-node validation of Phase 3.2-3.4
on .228 (2026-05-02), both caught before flipping the default.
Bug 1 — translate_health_check double-prefixed http://. Manifests in
the wild carry the scheme inside the endpoint string ("http://localhost:8175"),
and we were prepending another http:// unconditionally. Result on .228:
every backend HealthCmd read `curl -fsS -m 5 http://http://localhost...`,
every probe failed, fedimint hit a 14-restart loop. Now we accept either
form and skip appending hc.path when the endpoint already carries one.
Regression test asserts no double-prefix and that an in-endpoint path
is honoured.
Bug 2 — Phase 3.3 migration ran for UI companions (bitcoin-ui /
electrs-ui / lnd-ui) that have shipped via Quadlet since v1.7.41.
Migration tore down the running companion + raced companion.rs render,
producing "Phase 3.3: re-install archy-bitcoin-ui via Quadlet" reconcile
errors and leaving archy-bitcoin-ui down. Companions now short-circuit
out of migrate_to_quadlet_if_needed before any IO. Also: when try_exists
returns Err for an unrelated reason (permissions, EIO), we now skip
migration instead of treating "I can't tell" as "go ahead and migrate" —
migrating on top of a possibly-existing unit is destructive.
What this does not fix yet:
* the orchestrator's reconciler iterating every manifest in
/opt/archipelago/apps/, not just installed apps. Pre-existing
behavior (also affects the legacy path) — separate scope.
* fedimint /data UID mismatch surfaced when Quadlet started fedimint
fresh. Likely orthogonal — defer.
* no rollback when install_via_quadlet fails after a remove_container.
Tracked as Phase 3.3.1 — defer.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bd96c0475d
commit
384f12de7a
@ -532,13 +532,37 @@ impl ProdContainerOrchestrator {
|
|||||||
lm: &LoadedManifest,
|
lm: &LoadedManifest,
|
||||||
name: &str,
|
name: &str,
|
||||||
) -> Result<Option<ReconcileAction>> {
|
) -> Result<Option<ReconcileAction>> {
|
||||||
|
// Skip companion apps — bitcoin-ui / electrs-ui / lnd-ui have shipped
|
||||||
|
// via Quadlet since v1.7.41 (companion.rs renders the unit). Running
|
||||||
|
// migration for them races companion rendering: when migration ran
|
||||||
|
// for archy-bitcoin-ui on .228 2026-05-02 it tore down the running
|
||||||
|
// companion + tried to start a duplicate unit and failed. Companions
|
||||||
|
// are managed by their own path, never the legacy podman-create one,
|
||||||
|
// so there's nothing here to migrate.
|
||||||
|
let app_id = lm.manifest.app.id.as_str();
|
||||||
|
if UI_APP_IDS.contains(&app_id) {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
|
||||||
let unit_dir = quadlet::unit_dir()
|
let unit_dir = quadlet::unit_dir()
|
||||||
.await
|
.await
|
||||||
.context("locate user quadlet unit dir for migration check")?;
|
.context("locate user quadlet unit dir for migration check")?;
|
||||||
let unit_path = unit_dir.join(format!("{name}.container"));
|
let unit_path = unit_dir.join(format!("{name}.container"));
|
||||||
if tokio::fs::try_exists(&unit_path).await.unwrap_or(false) {
|
// Treat any IO error from try_exists as "the unit might be there" —
|
||||||
// Already on the Quadlet path — nothing to migrate.
|
// migrating on top of an existing unit is destructive (we'd podman rm
|
||||||
return Ok(None);
|
// the running container then fight the existing service over startup
|
||||||
|
// ordering), so be cautious when we can't read the answer.
|
||||||
|
match tokio::fs::try_exists(&unit_path).await {
|
||||||
|
Ok(true) => return Ok(None),
|
||||||
|
Ok(false) => {} // proceed
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!(
|
||||||
|
path = %unit_path.display(),
|
||||||
|
error = %e,
|
||||||
|
"Phase 3.3: skipping migration — couldn't determine if Quadlet unit already exists"
|
||||||
|
);
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// No unit on disk. If a pre-Phase-3 container exists for this
|
// No unit on disk. If a pre-Phase-3 container exists for this
|
||||||
|
|||||||
@ -365,8 +365,14 @@ impl QuadletUnit {
|
|||||||
///
|
///
|
||||||
/// Supported shapes:
|
/// Supported shapes:
|
||||||
/// - type: tcp, endpoint: "host:port" → `nc -z host port`
|
/// - type: tcp, endpoint: "host:port" → `nc -z host port`
|
||||||
/// - type: http, endpoint: "host:port", path → `curl -fsS http://host:port<path>`
|
/// - type: http, endpoint: "host:port" or "http(s)://host:port", path → curl
|
||||||
/// - type: cmd, endpoint: "<shell command>" → `<shell command>` verbatim
|
/// - type: cmd, endpoint: "<shell command>" → `<shell command>` verbatim
|
||||||
|
///
|
||||||
|
/// For type=http we accept the endpoint with or without scheme; manifests
|
||||||
|
/// in the wild use both forms (`localhost:8175` and
|
||||||
|
/// `http://localhost:8175/`). Earlier we blindly prepended `http://` even
|
||||||
|
/// when one was already there, producing `http://http://...` HealthCmds
|
||||||
|
/// that pasted on .228 2026-05-02 and failed every probe.
|
||||||
fn translate_health_check(
|
fn translate_health_check(
|
||||||
hc: &archipelago_container::HealthCheck,
|
hc: &archipelago_container::HealthCheck,
|
||||||
) -> Option<HealthSpec> {
|
) -> Option<HealthSpec> {
|
||||||
@ -379,11 +385,34 @@ fn translate_health_check(
|
|||||||
format!("nc -z {host} {port}")
|
format!("nc -z {host} {port}")
|
||||||
}
|
}
|
||||||
"http" => {
|
"http" => {
|
||||||
let endpoint = hc.endpoint.as_deref()?;
|
let endpoint = hc.endpoint.as_deref()?.trim();
|
||||||
let path = hc.path.as_deref().unwrap_or("/");
|
// Accept either bare host:port or a full URL. If endpoint
|
||||||
|
// already includes a scheme we use it as-is; otherwise we
|
||||||
|
// prepend http://. This keeps existing http://foo manifests
|
||||||
|
// working and stops the http://http:// double-prefix bug.
|
||||||
|
let url = if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
|
||||||
|
endpoint.to_string()
|
||||||
|
} else {
|
||||||
|
format!("http://{endpoint}")
|
||||||
|
};
|
||||||
|
// If the endpoint already carried a path component, honour it
|
||||||
|
// and ignore hc.path (manifests that bake the path into the
|
||||||
|
// endpoint don't expect to merge a separate path field).
|
||||||
|
// Otherwise append hc.path (default "/").
|
||||||
|
let already_has_path = url
|
||||||
|
.splitn(4, '/')
|
||||||
|
.nth(3)
|
||||||
|
.map(|p| !p.is_empty())
|
||||||
|
.unwrap_or(false);
|
||||||
|
let final_url = if already_has_path {
|
||||||
|
url
|
||||||
|
} else {
|
||||||
|
let path = hc.path.as_deref().unwrap_or("/");
|
||||||
|
format!("{url}{path}")
|
||||||
|
};
|
||||||
// -fsS: fail on non-2xx, silent except on error, show errors.
|
// -fsS: fail on non-2xx, silent except on error, show errors.
|
||||||
// -m 5: per-request timeout matches the default manifest timeout.
|
// -m 5: per-request timeout matches the default manifest timeout.
|
||||||
format!("curl -fsS -m 5 http://{endpoint}{path}")
|
format!("curl -fsS -m 5 {final_url}")
|
||||||
}
|
}
|
||||||
"cmd" => hc.endpoint.as_deref()?.to_string(),
|
"cmd" => hc.endpoint.as_deref()?.to_string(),
|
||||||
_ => return None,
|
_ => return None,
|
||||||
@ -921,6 +950,38 @@ app:
|
|||||||
assert!(translate_health_check(&badtcp).is_none());
|
assert!(translate_health_check(&badtcp).is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn translate_health_check_http_does_not_double_prefix_scheme() {
|
||||||
|
// Regression: on .228 2026-05-02 we shipped HealthCmds reading
|
||||||
|
// `curl -fsS -m 5 http://http://localhost:8175/` because manifests
|
||||||
|
// in the wild carry the scheme inside the endpoint string. Every
|
||||||
|
// probe failed and the unit looped. Now we accept either form.
|
||||||
|
use archipelago_container::HealthCheck;
|
||||||
|
let with_scheme = HealthCheck {
|
||||||
|
check_type: "http".into(),
|
||||||
|
endpoint: Some("http://localhost:8175".into()),
|
||||||
|
path: Some("/".into()),
|
||||||
|
interval: "30s".into(),
|
||||||
|
timeout: "5s".into(),
|
||||||
|
retries: 3,
|
||||||
|
};
|
||||||
|
let h = translate_health_check(&with_scheme).expect("with-scheme must translate");
|
||||||
|
assert_eq!(h.cmd, "curl -fsS -m 5 http://localhost:8175/");
|
||||||
|
assert!(!h.cmd.contains("http://http://"), "got: {}", h.cmd);
|
||||||
|
|
||||||
|
let with_https = HealthCheck {
|
||||||
|
check_type: "http".into(),
|
||||||
|
endpoint: Some("https://example.local/health".into()),
|
||||||
|
path: None,
|
||||||
|
interval: "30s".into(),
|
||||||
|
timeout: "5s".into(),
|
||||||
|
retries: 3,
|
||||||
|
};
|
||||||
|
let h = translate_health_check(&with_https).expect("https must translate");
|
||||||
|
// Endpoint already has /health → don't append the default "/".
|
||||||
|
assert_eq!(h.cmd, "curl -fsS -m 5 https://example.local/health");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn from_manifest_picks_up_health_check() {
|
fn from_manifest_picks_up_health_check() {
|
||||||
let yaml = r#"
|
let yaml = r#"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user