diff --git a/apps/immich-postgres/manifest.yml b/apps/immich-postgres/manifest.yml index df27126e..6ba96b2a 100644 --- a/apps/immich-postgres/manifest.yml +++ b/apps/immich-postgres/manifest.yml @@ -4,15 +4,20 @@ app: version: "14-vectorchord0.4.3-pgvectors0.2.0" description: Postgres (pgvecto.rs / vectorchord) backend for Immich. - # The Immich server connects via DB_HOSTNAME=immich_postgres, so the container - # name (and thus its archy-net alias) must be the underscore form. - extensions: - container_name: immich_postgres + # No container_name override: the container is named by app_id (immich-postgres), + # which is also its archy-net alias and the server's DB_HOSTNAME. (Overriding the + # name diverges from the orchestrator's app_id-based naming and spawns duplicate + # containers — mirror the btcpay stack, which names members by app_id.) container: image: 146.59.87.168:3000/lfg2025/immich-postgres:14-vectorchord0.4.3-pgvectors0.2.0 pull_policy: if-not-present network: archy-net + # postgres drops to its own uid (container 999 → host 100998 under rootless), + # so the data dir must be owned by that mapped uid — mirrors archy-btcpay-db. + # Verified on .228: the live immich-db is owned by 100998. Without this a FRESH + # install's dir would be service-user-owned and postgres would fail with EACCES. + data_uid: "100998:100998" generated_secrets: - name: immich-db-password kind: hex32 diff --git a/apps/immich-redis/manifest.yml b/apps/immich-redis/manifest.yml index ef5657f4..fabb69db 100644 --- a/apps/immich-redis/manifest.yml +++ b/apps/immich-redis/manifest.yml @@ -4,9 +4,7 @@ app: version: "7-alpine" description: Valkey (Redis-compatible) cache for Immich. - # Immich server connects via REDIS_HOSTNAME=immich_redis — alias must match. - extensions: - container_name: immich_redis + # Named by app_id (immich-redis) = archy-net alias = server's REDIS_HOSTNAME. 
container: image: 146.59.87.168:3000/lfg2025/valkey:7-alpine diff --git a/apps/immich-server/manifest.yml b/apps/immich-server/manifest.yml index 884e6ee5..23596ba4 100644 --- a/apps/immich-server/manifest.yml +++ b/apps/immich-server/manifest.yml @@ -1,11 +1,11 @@ app: id: immich-server name: Immich - version: "release" + version: "2.7.4" description: Self-hosted photo and video backup with mobile apps and search. - extensions: - container_name: immich_server + # Named by app_id (immich-server); connects to its siblings by their app_id + # aliases on archy-net (see DB_HOSTNAME / REDIS_HOSTNAME below). container: image: 146.59.87.168:3000/lfg2025/immich-server:release @@ -41,10 +41,10 @@ app: options: [rw] environment: - - DB_HOSTNAME=immich_postgres + - DB_HOSTNAME=immich-postgres - DB_USERNAME=postgres - DB_DATABASE_NAME=immich - - REDIS_HOSTNAME=immich_redis + - REDIS_HOSTNAME=immich-redis - UPLOAD_LOCATION=/usr/src/app/upload health_check: diff --git a/core/archipelago/src/api/rpc/package/stacks.rs b/core/archipelago/src/api/rpc/package/stacks.rs index e3106d17..89c8dba4 100644 --- a/core/archipelago/src/api/rpc/package/stacks.rs +++ b/core/archipelago/src/api/rpc/package/stacks.rs @@ -620,16 +620,25 @@ async fn install_stack_via_orchestrator( )) .await; + let mut installed = 0usize; for app_id in app_ids { match orchestrator.install(app_id).await { Ok(container_name) => { + installed += 1; install_log(&format!( "INSTALL ORCH: {} stack — app {} installed as {}", stack_name, app_id, container_name )) .await; } - Err(e) if e.to_string().contains("unknown app_id") => { + Err(e) if e.to_string().contains("unknown app_id") && installed == 0 => { + // None of the stack's manifests are known — the orchestrator + // can't render this stack at all, so defer to the legacy + // installer. 
Only safe when NOTHING was installed yet: once an + // earlier member is up, falling back would let the legacy path + // double-create containers on the same data dir (observed + // corrupting an immich postgres cluster — two postmasters, one + // PGDATA). A partial set means a deploy bug, not a legacy node. install_log(&format!( "INSTALL ORCH SKIP: {} stack — app {} unknown, falling back to legacy stack installer", stack_name, app_id @@ -637,6 +646,17 @@ async fn install_stack_via_orchestrator( .await; return Ok(None); } + Err(e) if e.to_string().contains("unknown app_id") => { + install_log(&format!( + "INSTALL ORCH FAIL: {} stack — app {} unknown AFTER {} installed; refusing legacy fallback (would double-create on shared data)", + stack_name, app_id, installed + )) + .await; + return Err(e.context(format!( + "orchestrator stack install {} aborted: app {} has no manifest but {} member(s) already installed — deploy all stack manifests", + stack_name, app_id, installed + ))); + } Err(e) => { install_log(&format!( "INSTALL ORCH FAIL: {} stack — app {} failed: {}", @@ -668,6 +688,11 @@ fn mempool_stack_app_ids() -> &'static [&'static str] { &["archy-mempool-db", "mempool-api", "archy-mempool-web"] } +fn immich_stack_app_ids() -> &'static [&'static str] { + // Install order = dependency order: db + cache before the server. + &["immich-postgres", "immich-redis", "immich-server"] +} + const REGISTRY: &str = "146.59.87.168:3000/lfg2025"; const NETBIRD_DASHBOARD_IMAGE: &str = "docker.io/netbirdio/dashboard:v2.38.0"; @@ -734,6 +759,17 @@ async fn pull_image_with_retry(image: &str) -> Result<()> { impl RpcHandler { /// Install Immich stack (postgres + redis + server). pub(super) async fn install_immich_stack(&self) -> Result { + // Manifest-driven path (workstream B/C): render the stack from + // apps/immich-*/manifest.yml via the orchestrator (rootless Quadlet + // units, generated_secrets, reboot-survivable). 
Falls back to the legacy + // installer below only when the orchestrator doesn't know these app_ids + // (manifests not yet deployed). See docs/PRODUCTION-MASTER-PLAN.md. + if let Some(orchestrated) = + install_stack_via_orchestrator(self, "immich", immich_stack_app_ids()).await? + { + return Ok(orchestrated); + } + if let Some(adopted) = adopt_stack_if_exists( "immich_server", "immich", diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 48609098..0d281ed8 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -3778,10 +3778,14 @@ app: if !mf.exists() { continue; } - let m = match AppManifest::from_file(&mf) { - Ok(m) => m, - Err(_) => continue, // a malformed disk manifest is a separate concern - }; + // Every shipped manifest MUST be valid. load_manifests() silently + // skips malformed ones in prod, which once let an invalid app.version + // ("release", no digit) ship — the app then vanished from the + // orchestrator and a stack install half-fell-back to the legacy path. + // Fail loudly here instead. + let m = AppManifest::from_file(&mf).unwrap_or_else(|e| { + panic!("shipped manifest {} must be valid: {e}", mf.display()) + }); let id = m.app.id.clone(); let is_build = m.app.container.build.is_some(); let value = serde_json::to_value(&m).expect("manifest serializes to JSON"); diff --git a/docs/PRODUCTION-MASTER-PLAN.md b/docs/PRODUCTION-MASTER-PLAN.md index c47d2d5d..b3ae8416 100644 --- a/docs/PRODUCTION-MASTER-PLAN.md +++ b/docs/PRODUCTION-MASTER-PLAN.md @@ -63,7 +63,7 @@ real nodes. Until then, this plan is the priority. 
| # | Workstream | Detail doc | Status | |---|-----------|-----------|--------| | A | **Manifest-driven app platform** — packaging contract, single/multi-container runtime, routing, controlled hooks, dev tooling (6 phases, security model, migration rules) | `APP-PACKAGING-MIGRATION-PLAN.md` | mostly done; immich + multi-container polish remain | -| B | **Registry-distributed manifests** — catalog carries full signed manifest; orchestrator installs from registry; disk = migration fallback | `registry-manifest-design.md` | **design done — implementing phase 1** | +| B | **Registry-distributed manifests** — catalog carries full signed manifest; orchestrator installs from registry; disk = migration fallback | `registry-manifest-design.md` | **phases 1+2 done** (node consume + opt-in publisher embed); not yet flipped on for the fleet | | C | **Developer-ready external registry** — 3rd-party DID-signed manifests, decentralized Nostr discovery (NIP-78 kind 30078) + trust score, `archy app …` tooling | `marketplace-protocol.md`, `app-developer-guide.md` | design exists; tooling + trust UX pending | | D | **Distribution backbone** — signed catalog, BLAKE3 content-addressing, iroh swarm (origin-always-wins) | `dht-distribution-design.md` | phases 0–2 code-complete (worktree) | | E | **Production test gate** — 20× lifecycle on .228 + .198, per-app L1/L2 matrix | `tests/lifecycle/TESTING.md`, `bulletproof-containers.md` | **never green — exit criterion** |