From 57a013bc668beb52435adce1697b4ed5747c3c15 Mon Sep 17 00:00:00 2001 From: archipelago Date: Mon, 22 Jun 2026 18:12:41 -0400 Subject: [PATCH] =?UTF-8?q?test(gate):=20make=205=C3=97=20the=20canonical?= =?UTF-8?q?=20gate,=20drop=2020x=20naming?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename run-20x.sh → run-gate.sh, default ARCHY_ITERATIONS 20→5, and scrub 20× references across CLAUDE.md, the master plan, TESTING.md, app-registry status, the orchestrator/config doc-comments, and the bats suites. Also add a minimal fail() helper to mempool.bats so guard failures report cleanly. Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 5 +- core/archipelago/src/config.rs | 4 +- .../src/container/prod_orchestrator.rs | 4 +- docs/PRODUCTION-MASTER-PLAN.md | 17 +- docs/app-registry-status-2026-06-21.md | 6 +- docs/bitcoin-multi-version-design.md | 215 ++++++++++++++++++ docs/demo-deployment-design.md | 169 ++++++++++++++ docs/multinode-testing-plan.md | 2 +- scripts/create-release.sh | 2 +- tests/lifecycle/TESTING.md | 16 +- tests/lifecycle/bats/electrumx.bats | 2 +- tests/lifecycle/bats/lnd.bats | 2 +- tests/lifecycle/bats/mempool.bats | 5 + tests/lifecycle/bats/ui-coverage.bats | 2 +- tests/lifecycle/{run-20x.sh => run-gate.sh} | 14 +- tests/lifecycle/setup-teardown.sh | 2 +- 16 files changed, 427 insertions(+), 40 deletions(-) create mode 100644 docs/bitcoin-multi-version-design.md create mode 100644 docs/demo-deployment-design.md rename tests/lifecycle/{run-20x.sh => run-gate.sh} (89%) diff --git a/CLAUDE.md b/CLAUDE.md index b0e57be2..bf1d98d3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -42,10 +42,9 @@ Detailed sub-plans (all linked from the master): ## Production test gate (definition of done) -`tests/lifecycle/run-20x.sh` green across install / UI / stop / start / restart / +`tests/lifecycle/run-gate.sh` green across install / UI / stop / start / restart / reinstall / reboot-survive / archipelago-restart-survive 
/ uninstall — **5× on -.228** (`ARCHY_ITERATIONS=5`; temporarily reduced from 20× — restore to 20× before -the final ship). **Run the gate ON the node** (it uses local podman/systemctl/bitcoin +.228** (`ARCHY_ITERATIONS=5`). **Run the gate ON the node** (it uses local podman/systemctl/bitcoin probes), not via RPC from another host. Until green, the master plan is the priority. **Multinode testing (.198 + the rest of the fleet) is a SEPARATE plan** — `docs/multinode-testing-plan.md` — not part of this single-node gate criterion. diff --git a/core/archipelago/src/config.rs b/core/archipelago/src/config.rs index 1b19b1ff..dea0a949 100644 --- a/core/archipelago/src/config.rs +++ b/core/archipelago/src/config.rs @@ -66,7 +66,7 @@ pub struct Config { /// through Quadlet (`.container` units in ~/.config/containers/systemd /// + systemctl --user start) instead of `podman create + start`. Default /// off so the legacy path stays the production path until the harness - /// at tests/lifecycle/run-20x.sh has gone green against the new path + /// at tests/lifecycle/run-gate.sh has gone green against the new path /// on .228 + .198. See `project_v1_7_52_phase3_quadlet_design`. #[serde(default)] pub use_quadlet_backends: bool, @@ -487,7 +487,7 @@ mod tests { #[test] fn test_config_use_quadlet_backends_defaults_off() { - // Phase 3.2 of v1.7.52 — the new path stays gated until the 20× + // Phase 3.2 of v1.7.52 — the new path stays gated until the 5× // harness goes green on .228 and .198. Flipping this default // ahead of that would route every backend install through code // we haven't fleet-validated yet. 
diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index a45cee5d..2223a020 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -894,7 +894,7 @@ pub struct ProdContainerOrchestrator { /// Quadlet `.container` unit and starts it via systemctl --user /// instead of shelling out to `podman create + start`. Default /// false so the legacy path remains the production path until the - /// 20× lifecycle harness goes green against the new path. + /// 5× lifecycle harness goes green against the new path. use_quadlet_backends: bool, #[cfg(test)] test_disk_gb: Option, @@ -1738,7 +1738,7 @@ impl ProdContainerOrchestrator { } else { self.remove_quadlet_unit_if_present(&name).await?; ensure_user_podman_socket().await?; - // Legacy path. Production until tests/lifecycle/run-20x.sh + // Legacy path. Production until tests/lifecycle/run-gate.sh // goes green against the Quadlet path. self.runtime .create_container(&resolved_manifest, &name, 0) diff --git a/docs/PRODUCTION-MASTER-PLAN.md b/docs/PRODUCTION-MASTER-PLAN.md index 42b25d71..bbc561ec 100644 --- a/docs/PRODUCTION-MASTER-PLAN.md +++ b/docs/PRODUCTION-MASTER-PLAN.md @@ -57,7 +57,7 @@ real nodes. Until then, this plan is the priority. - **The 4 companions** (`archy-bitcoin-ui`, `-lnd-ui`, `-electrs-ui`, `-fedimint-ui`) build from `docker/` contexts via `companion.rs`, not the manifest registry — a later phase folds them in. -- **No app has passed the formal production gate (5× for now, was 20×).** That is the blocker. +- **No app has passed the formal production gate.** That is the blocker. ## 4. Workstreams (each links its authoritative detail doc) @@ -67,7 +67,7 @@ real nodes. Until then, this plan is the priority. 
| B | **Registry-distributed manifests** — catalog carries full signed manifest; orchestrator installs from registry; disk = migration fallback | `registry-manifest-design.md` | **phases 1+2 done** (node consume + opt-in publisher embed); not yet flipped on for the fleet | | C | **Developer-ready external registry** — 3rd-party DID-signed manifests, decentralized Nostr discovery (NIP-78 kind 30078) + trust score, `archy app …` tooling | `marketplace-protocol.md`, `app-developer-guide.md` | design exists; tooling + trust UX pending | | D | **Distribution backbone** — signed catalog, BLAKE3 content-addressing, iroh swarm (origin-always-wins) | `dht-distribution-design.md` | phases 0–2 code-complete (worktree) | -| E | **Production test gate** — 5× lifecycle on **.228** (for now; was 20×), per-app L1/L2 matrix; multinode is split out → `multinode-testing-plan.md` | `tests/lifecycle/TESTING.md`, `bulletproof-containers.md` | **.228 GREEN (110/110); 5× in progress** | +| E | **Production test gate** — 5× lifecycle on **.228**, per-app L1/L2 matrix; multinode is split out → `multinode-testing-plan.md` | `tests/lifecycle/TESTING.md`, `bulletproof-containers.md` | **.228 GREEN (110/110); 5× in progress** | **Orchestrator architecture** (foundation for A/B): `rust-orchestrator-migration.md` (ProdContainerOrchestrator, BootReconciler 30s level-triggered reconcile, adoption @@ -76,11 +76,10 @@ modes FM1–FM6 + the desired-state-first reconciler that fixes them). ## 5. Production test gate (exit criterion) -An app is **production-ready** only when `tests/lifecycle/run-20x.sh` is green +An app is **production-ready** only when `tests/lifecycle/run-gate.sh` is green across the full matrix — install / UI-reachable / stop / start / restart / reinstall / **reboot-survive** / **archipelago-restart-survive** / uninstall — -**5× on .228** (`ARCHY_ITERATIONS=5`; temporarily reduced from 20× — restore to -20× before the final ship). 
**The gate runs ON the node** (it uses local +**5× on .228** (`ARCHY_ITERATIONS=5`). **The gate runs ON the node** (it uses local podman/systemctl/bitcoin probes; running it via RPC from another host silently tests the runner). **Multinode / fleet verification (.198 + others) is a SEPARATE plan — `docs/multinode-testing-plan.md` — NOT part of this single-node criterion.** @@ -101,7 +100,7 @@ proxies; L3 survival ◐; ~30 apps have zero automated coverage. data_uid 100998. Canonical app_id `immich` (title+icon). *(9e6c5370, d5ef4573)* 4. ✅ **Reboot-survival** — podman-restart.service enabled (startup, fleet-wide) for the podman-`--restart` path. *(f160e0c4)* -5. ◧ **E** — 5× gate on **.228** (`ARCHY_ITERATIONS=5`, was 20×). .228 is GREEN +5. ◧ **E** — 5× gate on **.228** (`ARCHY_ITERATIONS=5`). .228 is GREEN 1× (110/110); the 5× run is in progress. This is now the SINGLE-NODE criterion. 6. ◻ Demote this banner once the 5× is green. @@ -177,7 +176,7 @@ gate is now single-node (.228); multinode is split out (`docs/multinode-testing- ``` sshpass -p archipelago ssh archipelago@192.168.1.228 \ 'grep -E "iteration [0-9]+: (PASS|FAIL)|RESULTS|passed:|failed:" /tmp/gate-5x2.log; \ - echo "running pid: $(pgrep -f run-20x.sh$ || echo DONE)"; grep "^not ok" /tmp/gate-5x2.log | sort -u' + echo "running pid: $(pgrep -f run-gate.sh$ || echo DONE)"; grep "^not ok" /tmp/gate-5x2.log | sort -u' ``` - Log: `/tmp/gate-5x2.log` on .228 · launched `nohup` (pid was 4042141) · `ARCHY_ITERATIONS=5 ARCHY_ALLOW_DESTRUCTIVE=1`, run **ON the node** from `/tmp/lifecycle-run/tests/lifecycle` @@ -220,7 +219,7 @@ coverage (~30 apps unwritten); the mobile app-launch UX (§8 Roadmap P1). Multin Manifest-driven lifecycle hooks + the IndeedHub stack migration are **complete and live-verified on BOTH .228 and .198** (adoption + fresh-create + post_install hook exec, stable under load). 15 commits this session: `4c1a4e59`..`e2a012d0`. Working -tree clean. 
The release lifecycle gate is temporarily **5×** (was 20×; `ARCHY_ITERATIONS=5`). +tree clean. The release lifecycle gate is **5×** (`ARCHY_ITERATIONS=5`). **Shipped (all on `main`, newest first):** - `e2a012d0` indeedhub frontend health → `tcp:7777` (was http GET `/`; the http check @@ -400,7 +399,7 @@ bug is purely "container never stops", not "state not reported". ### MY-SESSION ERRATA (own it on resume) - I ran the gate with `ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1`, which is **NOT** the canonical gate (that - is `ARCHY_ALLOW_DESTRUCTIVE=1` only — stop/start/restart, no uninstall/reinstall; see run-20x.sh + is `ARCHY_ALLOW_DESTRUCTIVE=1` only — stop/start/restart, no uninstall/reinstall; see run-gate.sh "Suggested release-gate invocation"). Cascade ran uninstall/reinstall on every app and, when I killed the run mid-iteration, left bitcoin-knots/electrumx/btcpay/fedimint/immich uninstalled or stranded. **I fully restored .228** (reinstalled bitcoin-knots with the correct image diff --git a/docs/app-registry-status-2026-06-21.md b/docs/app-registry-status-2026-06-21.md index dbac9f00..5cc84b0d 100644 --- a/docs/app-registry-status-2026-06-21.md +++ b/docs/app-registry-status-2026-06-21.md @@ -103,10 +103,10 @@ Notes: ## 4. Test-gate reality -**No app has passed the formal release gate.** The gate is `run-20x.sh` green +**No app has passed the formal release gate.** The gate is `run-gate.sh` green across the full lifecycle matrix (install / UI reachable / stop / start / restart / reinstall / reboot-survive / archipelago-restart-survive / uninstall), -**20× on .228 AND .198**. All 8 release-gate checkboxes in +**5× on .228 AND .198**. All 8 release-gate checkboxes in `tests/lifecycle/TESTING.md` are **unchecked (☐)**. What exists today: @@ -132,7 +132,7 @@ failure): `bitcoin-receive.bats`, `port-drift.bats`, `secret-completeness.bats`. 1. **immich** is the last legacy (in-cgroup) app — migrate to Quadlet to finish Pillar 1. 2. 
**grafana / strfry** Quadlet units stuck *activating* with no container — investigate. (onlyoffice removed 2026-06-21.) 3. **fedimint-gateway / fedimint-clientd** (this session) now run but have no lifecycle test coverage. -4. The formal **20× release gate has never been green** — it is the blocker for the v1.7.52 tag. +4. The formal **5× release gate has never been green** — it is the blocker for the v1.7.52 tag. --- diff --git a/docs/bitcoin-multi-version-design.md b/docs/bitcoin-multi-version-design.md new file mode 100644 index 00000000..a3e4c0f9 --- /dev/null +++ b/docs/bitcoin-multi-version-design.md @@ -0,0 +1,215 @@ +# Bitcoin Multi-Version Support — Design + +**Status:** design (2026-06-22) +**Goal:** let a user choose *which* version of Bitcoin Core / Bitcoin Knots to +install (latest pre-selected, older versions in a dropdown), and later switch +versions or opt into auto-update — all manifest/catalog-driven, all served from +**our signed registry**, rootless, with **zero data loss** across version +changes. + +See also: [`docs/registry-manifest-design.md`](registry-manifest-design.md) +(catalog distribution + signing this builds on), +[`docs/PRODUCTION-MASTER-PLAN.md`](PRODUCTION-MASTER-PLAN.md) (gate that must be +green first), `MEMORY → project_decoupled_app_updates`, +`MEMORY → project_manifest_driven_north_star`. + +> **Scheduling:** this is net-new scope. It lands **after** the production test +> gate (`tests/lifecycle/run-gate.sh`) is green on `.228` + `.198`. The data- +> preservation invariant (downgrade vs. chainstate) is the highest risk here. + +--- + +## 1. 
Where we are today + +### Image source / build +| Thing | Today | +|-------|-------| +| `apps/bitcoin-core/Dockerfile` | `FROM bitcoin/bitcoin:24.0` — a **community** image, **stale** (manifest says 28.4), no project-official Docker image exists | +| `apps/bitcoin-knots/` | **no Dockerfile** — `:latest` is built/pushed by hand | +| Registry | `scripts/image-versions.sh` → `ARCHY_REGISTRY="146.59.87.168:3000/lfg2025"`; only `BITCOIN_KNOTS_IMAGE=…/bitcoin-knots:latest` pinned, no Core pin | +| Tags in registry | **one tag per image**. No historical versions. | + +### Version pinning +- `apps/bitcoin-core/manifest.yml` → `…/bitcoin:28.4` (pinned). +- `apps/bitcoin-knots/manifest.yml` → `…/bitcoin-knots:latest` (**floating** — a + liability for reproducibility and for "switch back to the version I had"). +- `core/archipelago/src/container/app_catalog.rs` + `app-catalog/catalog.json`: + signed, hourly-fetched, carries `version` (badge text) + `image`. + `catalog_image_override()` overrides the manifest image **only if same-repo**. + `available_update_for_app()` already ignores floating tags for update + detection. + +### Install path +- `prod_orchestrator.rs::install_fresh()` resolves the image as + **manifest image → catalog override → pull**. There is **no per-install + version parameter** — `orchestrator.install(app_id)` takes only the id. +- RPC `package.install` (`api/rpc/package/install.rs`) *accepts* `dockerImage` / + `version` params but for orchestrator-managed apps (bitcoin-core / bitcoin-knots + are allowlisted) it **ignores them** and lets the orchestrator resolve. +- **Conflict guard** (`prod_orchestrator.rs` ~1306–1325): core and knots may not + run simultaneously. Must be preserved by everything below. + +### UI +- Install is **one-click, no modal** (`MarketplaceAppDetails.vue::installApp()`). +- Update badge + "Update to X" already exist (`appDetails/AppHeroSection.vue`, + RPC `package.update`). 
+- **No** Bitcoin-specific settings panel; all apps share `AppSidebar.vue`. +- Per-app config persisted **only at install time** as `containerConfig` → + `/var/lib/archipelago/app-configs/.json`. **No post-install set-config RPC.** + +--- + +## 2. Source-of-truth decision: official upstream → our registry + +We use the **official releases** as upstream provenance, but nodes only ever pull +from our registry. Nodes do **not** fetch bitcoin.org / GitHub at install time — +that would break rootless/offline installs and the signed-registry trust model, +and neither project publishes an official Docker image anyway. + +**Official sources (verified):** + +| Impl | Index | Per-version asset pattern | +|------|-------|---------------------------| +| Bitcoin Core | [bitcoincore.org/en/releases](https://bitcoincore.org/en/releases/) · [github bitcoin/bitcoin](https://github.com/bitcoin/bitcoin/releases) | `https://bitcoincore.org/bin/bitcoin-core-/bitcoin--x86_64-linux-gnu.tar.gz` + `SHA256SUMS` + `SHA256SUMS.asc` | +| Bitcoin Knots | [github bitcoinknots/bitcoin](https://github.com/bitcoinknots/bitcoin/releases) · [bitcoinknots.org/files](https://bitcoinknots.org/) | `https://bitcoinknots.org/files/.x//bitcoin--x86_64-linux-gnu.tar.gz` (`` e.g. `29.3.knots20260508`) | + +Both ship **signed binary tarballs** with multi-builder Guix attestations +(`SHA256SUMS.asc`). The build pipeline verifies these **once, at build**; our DHT +Phase 0 registry signature then carries provenance to the fleet. + +> Knots version strings embed a build date (`29.3.knots20260508`). Treat the full +> string as the tag; surface a friendly `29.3` + date in the UI. + +--- + +## 3. Design + +### Phase 0 — Reproducible, verified image pipeline *(prerequisite)* + +New `scripts/build-bitcoin-image.sh ` that, per version: + +1. Downloads the official tarball + `SHA256SUMS(.asc)` (GitHub release assets are + an identical mirror → fallback). +2. Verifies SHA256 **and** the Guix/builder GPG signatures. 
**Fail closed.** +3. Builds a minimal **rootless** image: pin a small base, unpack + `bitcoind`/`bitcoin-cli`. Keep the existing entrypoint probe + (`command -v bitcoind || find /opt -path '*/bin/bitcoind'`) so per-version + layout differences don't break startup. +4. Tags + pushes `:` **and** updates the default pin (`:latest` / + `:28.4`-style) to the registry. + +**Curate, don't mirror everything.** Publish a bounded set (proposal: current + +last ~3 majors), e.g. Core `31.0, 30.0, 29.3, 28.4, 27.2` and Knots +`29.3.knots…, 28.1.knots…, 27.1.knots…`. **`log` / document dropped versions** — +silent truncation reads as "all versions supported" when it isn't. + +Also fixes existing debt: replaces the stale community `FROM bitcoin/bitcoin:24.0` +and gives Knots a real Dockerfile + non-floating tags. + +### Phase 1 — Version catalog (signed, registry-distributed) + +Extend `AppCatalogEntry` (forward-compatible — no `deny_unknown_fields`, old nodes +ignore it): + +```jsonc +"bitcoin-core": { + "version": "31.0", // default / latest (existing field) + "image": "…/bitcoin:31.0", // existing + "versions": [ // NEW + { "version": "31.0", "image": "…/bitcoin:31.0", "default": true }, + { "version": "30.0", "image": "…/bitcoin:30.0" }, + { "version": "28.4", "image": "…/bitcoin:28.4", "deprecated": true, "eol": "2026-...." } + ] +} +``` + +Published to `releases/app-catalog.json`, signed by the existing release-root +mechanism. This is the **single source of truth** the UI reads for "what can I +install / switch to," and third-party-registry apps inherit the capability for +free. `version`/`image` stay as the default for back-compat. + +### Phase 2 — Install-time version selection + +- **Orchestrator:** add `install_with_image(app_id, Option)` (or an + optional arg on `install`). When a tag is supplied, **validate same-repo** + against the manifest (reuse `image_without_registry_or_tag()`), then override in + `install_fresh()`. Default path unchanged. 
Preserve the core/knots conflict + guard. +- **RPC:** thread the selected version/image from `package.install` into the + orchestrator for the allowlisted apps (the param is already received — just not + forwarded). +- **UI:** the first **install modal** in the app — latest pre-selected, dropdown + of `versions[]`, deprecated/EOL badges on old entries. On confirm, pass the + chosen version to `package.install`. + +### Phase 3 — In-app version switch + auto-update toggle + +- **UI:** a Bitcoin **"Version & Updates"** card (conditional in `AppSidebar.vue` + for `bitcoin-core` / `bitcoin-knots`): current version, a switch dropdown, and + an **auto-update-to-latest** toggle. +- **Switch = controlled re-pull/recreate** reusing the `package.update` + machinery but targeting an arbitrary (incl. older) tag → effectively + `package.set-version`. +- **Persistence:** new `package.set-config` RPC writing the existing + `app-configs/.json` (`{ pinnedVersion, autoUpdate }`). +- **Auto-update:** the existing hourly catalog check, when `autoUpdate:true`, + triggers `package.update` to the catalog default. A pinned version **suppresses + the update badge**. + +--- + +## 4. Invariants & safety rails + +- **Rootless only.** Pipeline images and run path stay rootless; no Docker-socket, + no privileged. +- **No data loss across version change.** Preserve `/var/lib/archipelago/bitcoin`, + secrets (`bitcoin-rpc-password`, `…-rpcauth`), ports, and the adoption container + name on every install / switch / update. +- **⚠️ Downgrade vs. chainstate (highest risk).** Bitcoin Core refuses to start on + a chainstate written by a *newer* version unless reindexed (expensive, or data + loss on a pruned node). The UI **must** warn loudly on downgrade; the + orchestrator should gate/confirm it and never silently wipe. Pruned nodes can't + simply `-reindex`. +- **Core ⇄ Knots switch** stays governed by the existing conflict guard; treat an + impl switch as distinct from a version switch. 
+- **Floating tags** (`latest`) are never advertised as a selectable "version" and + never counted as an available update (already handled by + `available_update_for_app`). +- **Verify on a real node** (`.228` then `.198`) and pass `run-20x` before any + tag. + +--- + +## 5. Files / seams (no code yet) + +| Concern | File | +|---------|------| +| Image build/push | new `scripts/build-bitcoin-image.sh`; `apps/bitcoin-core/Dockerfile`; new `apps/bitcoin-knots/Dockerfile`; `scripts/image-versions.sh` | +| Catalog schema | `core/archipelago/src/container/app_catalog.rs`; `releases/app-catalog.json` (+ `app-catalog/catalog.json`) | +| Install override | `core/archipelago/src/container/prod_orchestrator.rs` (`install` / `install_fresh`); `api/rpc/package/install.rs`; `api/rpc/dispatcher.rs` | +| Switch / set-config RPC | `api/rpc/package/update.rs`; new `package.set-config` handler; `app-configs/.json` | +| Install modal | `neode-ui/src/views/MarketplaceAppDetails.vue`; new `…/marketplace/AppInstallModal.vue` | +| Version & Updates card | `neode-ui/src/views/appDetails/AppSidebar.vue`; `neode-ui/src/api/rpc-client.ts`; `neode-ui/src/types/api.ts` | + +--- + +## 6. Open questions + +1. **Curated version set** — how many majors back do we host, and storage budget + on the registry? +2. **Multi-arch** — fleet is x86_64 today; do any nodes need arm64 images? +3. **Pruned-node downgrade policy** — block outright, or allow with an explicit + "this will require re-sync / may lose pruned data" confirmation? +4. **Auto-update default** — off (opt-in) for a consensus-critical app like + Bitcoin? (Recommended: **off**, explicit opt-in.) +5. **Knots date-suffix UX** — how to display `29.3.knots20260508` cleanly. 
+ +--- + +## Sources + +- [Bitcoin Core releases](https://bitcoincore.org/en/releases/) +- [bitcoin/bitcoin releases](https://github.com/bitcoin/bitcoin/releases) +- [bitcoinknots/bitcoin releases](https://github.com/bitcoinknots/bitcoin/releases) +- [Bitcoin Knots](https://bitcoinknots.org/) +- [bitcoin.org version history](https://bitcoin.org/en/version-history) diff --git a/docs/demo-deployment-design.md b/docs/demo-deployment-design.md new file mode 100644 index 00000000..d0e6a4c1 --- /dev/null +++ b/docs/demo-deployment-design.md @@ -0,0 +1,169 @@ +# Public Demo Deployment — Design + +**Status:** design (2026-06-22) +**Goal:** a public, click-to-play demo of the Archipelago UI that **auto-tracks +the real code** yet stays **separated** from the private monorepo and its +secrets/backend. Deployed via **Portainer**, mock-data driven, with working file +storage and a testnet-flavored Bitcoin sandbox so visitors can play freely. + +See also: `neode-ui/mock-backend.js` (existing mock), `docker-compose.demo.yml` +(existing demo stack), `MEMORY → reference_neode_ui_dev_testing`, +`MEMORY → reference_ovh_168_mirror` (Portainer/registry host). + +--- + +## 1. What already exists (the 70%) + +The demo is mostly built. 
Inventory: + +| Asset | Path | State | +|-------|------|-------| +| Mock backend (Node/Express + ws) | `neode-ui/mock-backend.js` (~3,862 lines) | 95+ JSON-RPC methods: auth, package lifecycle, Bitcoin/LND wallet, mesh, federation, identity, monitoring, mock filebrowser | +| Mock data | `mockData` / `walletState` / `MOCK_FILES` in `mock-backend.js` | rich; 10 pre-installed apps, 30+ marketplace apps, wallet balances, seeded files (Music/Documents/Photos/Videos) | +| Demo compose | `docker-compose.demo.yml` | `neode-backend` (mock, `:5959`) + `neode-web` (nginx, `:4848`); header already says "Deploy via Portainer" | +| Backend image | `neode-ui/Dockerfile.backend` | Node 22 Alpine → `node mock-backend.js` | +| Web image | `neode-ui/Dockerfile.web` | multi-stage `vite build` → nginx | +| Demo nginx | `neode-ui/docker/nginx-demo.conf` | proxies `/rpc/v1`, `/ws`, `/app/*` to the mock backend | +| Precedent | `indee-demo` Portainer stack | separate stack referencing a **pre-built image** — the pattern we extend | + +**Gaps for a *public* (not dev) demo:** state is global (visitors collide), +uploads are no-ops, Bitcoin block height is hardcoded, no CI image pipeline, no +separated public deploy repo. + +--- + +## 2. Architecture: source in monorepo, demo ships as images, public repo is thin + +The tension — "must update as I update the real code" **and** "sort of +separated" — is resolved by separating at the **deploy layer, not the source +layer**. + +``` + monorepo (private — single source of truth) + neode-ui/ + mock-backend.js + │ push to main + ▼ + CI: build archy-demo-web + archy-demo-backend + │ push :demo / :latest + ▼ + registry (146.59.87.168:3000 / vps2) + │ Portainer webhook / re-pull + ▼ + archy-demo (public repo — tiny) + docker-compose.yml ──referencing pre-built images──▶ Portainer ▶ demo. 
+ .env.example +``` + +- **Single source of truth = the monorepo.** `neode-ui/` and `mock-backend.js` + stay where they are, so the demo tracks real code automatically — no fork to + sync, no drift. +- **Separation = the public repo never holds source.** `archy-demo` contains only + a `docker-compose.yml` (image refs) + `.env.example` + README. No Rust backend, + no secrets, no UI source. Safe to make public. +- **Auto-update flow:** edit code → push → CI rebuilds demo images → Portainer + redeploys. The public compose file is touched rarely (only when service shape + changes). + +**Why not a true fork / `git subtree split`?** It works but needs a sync job +*and* re-exposes UI source publicly. The image pipeline gives stronger +separation (zero source leak) **and** zero manual sync. (Decided 2026-06-22.) + +--- + +## 3. Work items + +### 3.1 CI image pipeline +- On push to `main` (path filter: `neode-ui/**`), build: + - `archy-demo-backend` from `neode-ui/Dockerfile.backend` + - `archy-demo-web` from `neode-ui/Dockerfile.web` (`build:docker`) +- Tag `:demo` + `:`, push to the registry. +- Trigger Portainer redeploy (stack webhook) on success. + +### 3.2 Public `archy-demo` repo +- `docker-compose.yml` mirroring `docker-compose.demo.yml` but **`image:` + references instead of `build:`** (pull `:demo`, no build context). +- `.env.example` (`ANTHROPIC_API_KEY`, `VITE_DEV_MODE=existing`, session TTL, + upload quota). +- README: one-paragraph "deploy in Portainer → web editor paste / deploy from + repo," access on `:4848`. +- No source. This is the only public surface. + +### 3.3 Multi-user: per-session sandbox (reset on idle) ⟵ *decided* +The biggest code change. Today `mockData` / `walletState` / `MOCK_FILES` are +**global singletons** → visitors corrupt each other's view. +- Issue a `demo-session` cookie on first hit (the mock already sets a session on + login; extend it to anonymous visitors). 
+- Key state by session id: `sessions[sid] = { mockData, walletState, files }`, + each **deep-cloned from a pristine seed** on creation. +- Reap on idle (e.g. 30 min no activity) + hard cap concurrent sessions; on reap, + free memory + temp dir. +- RPC dispatch + WS patches resolve the per-session state instead of the global. +- Keeps the demo a true playground: install/uninstall/spend freely, reset by + reconnecting. + +### 3.4 File storage: persisted per session ⟵ *decided* +Today filebrowser upload/delete/rename are 200-OK no-ops. +- Back each session with a temp dir (e.g. `/tmp/demo//`), seeded from + `MOCK_FILES`. +- Make `POST/DELETE/PATCH /app/filebrowser/api/resources/*` and `GET …/raw/*` + read/write that dir. Enforce a per-session quota (e.g. 50 MB) and reject + oversize/odd MIME. +- Cleaned when the session is reaped — no standing public writable volume, no real + filebrowser container to harden. + +### 3.5 Bitcoin: testnet-flavored mock ⟵ *decided* +- Relabel wallet/chain as **testnet/signet**: `tb1q…` addresses, "testnet" chain + in `bitcoin.getinfo`, scripted-but-plausible block height + confirmations. +- Keep `dev.faucet` as the in-UI "get test sats" button (instant, free). +- No real `bitcoind` → no sync, no disk, no public RPC attack surface. +- *Future upgrade path:* swap to a real signet node + LND in the stack if we ever + want movable real test sats (out of scope now). + +### 3.6 Mock containers / app lifecycle +- The mock already simulates `package.install/uninstall/start/stop/restart` + asynchronously. For the demo, **force simulation mode** (never touch a real + Docker socket — rootless/safe and host-independent). Confirm no path in + `mock-backend.js` reaches for a real runtime when `DEMO=1`. + +### 3.7 Mock-data refresh +- Update `mockData` static apps + marketplace to current app set/versions, refresh + wallet figures, seeded mesh messages, and files so the demo feels current. This + is ongoing and rides the same image pipeline. 
+ +--- + +## 4. Invariants / guardrails (public exposure) + +- **No real secrets, no real backend, no real Docker socket** in the demo image or + public repo. Mock password stays a known demo credential, clearly labeled. +- **Per-session isolation** is a hard requirement before going public — without it + the demo is unusable for strangers. +- **Resource caps:** session count, per-session memory + upload quota, idle reap; + the box can't be DoS'd into OOM by upload spam or session churn. +- **`ANTHROPIC_API_KEY`** (chat) is injected via Portainer env, never committed; + rate-limit / budget-cap demo chat usage. +- **Read-only registry creds** for the Portainer host to pull `:demo`. + +--- + +## 5. Files / seams + +| Concern | Where | +|---------|-------| +| Per-session state, file persistence, testnet labels, sim-mode | `neode-ui/mock-backend.js` | +| Build contexts (reused as-is) | `neode-ui/Dockerfile.backend`, `neode-ui/Dockerfile.web`, `neode-ui/docker/nginx-demo.conf` | +| Demo stack (in-repo, dev) | `docker-compose.demo.yml` (keep `build:`) | +| Public stack (new repo) | `archy-demo/docker-compose.yml` (`image:` refs), `.env.example`, README | +| CI pipeline | new workflow (path filter `neode-ui/**` → build + push `:demo` → Portainer webhook) | + +--- + +## 6. Open questions + +1. **Demo host** — which Portainer instance (OVH `.168`? a dedicated VPS)? Public + DNS + TLS for `demo.`? +2. **Registry for `:demo` images** — `146.59.87.168:3000` vs vps2; public-pull or + creds baked into Portainer? +3. **Session TTL + concurrency cap** — concrete numbers (30 min / N sessions / 50 MB)? +4. **Chat in the demo** — enable Claude chat (needs key + budget cap) or stub it? +5. **Sync cadence** — rebuild `:demo` on every `neode-ui/**` push, or nightly? 
diff --git a/docs/multinode-testing-plan.md b/docs/multinode-testing-plan.md index 6bb92abb..bbda64b3 100644 --- a/docs/multinode-testing-plan.md +++ b/docs/multinode-testing-plan.md @@ -29,7 +29,7 @@ sudo curl -fsSL -o /usr/local/bin/jq \ mkdir -p /tmp/lifecycle-run && tar xzf /tmp/tests.tgz -C /tmp/lifecycle-run cd /tmp/lifecycle-run/tests/lifecycle ARCHY_HOST=127.0.0.1 ARCHY_SCHEME=https ARCHY_PASSWORD= \ - ARCHY_ALLOW_DESTRUCTIVE=1 ARCHY_ITERATIONS=5 nohup ./run-20x.sh > /tmp/gate.log 2>&1 & + ARCHY_ALLOW_DESTRUCTIVE=1 ARCHY_ITERATIONS=5 nohup ./run-gate.sh > /tmp/gate.log 2>&1 & ``` ## Per-node preconditions (learned on .228) diff --git a/scripts/create-release.sh b/scripts/create-release.sh index da012f39..16472da0 100755 --- a/scripts/create-release.sh +++ b/scripts/create-release.sh @@ -80,7 +80,7 @@ fi # runs the release gate harness (cargo fmt/check, catalog drift, vitest, and # the focused cargo suites — incl. the receive/port-drift/secret regressions). # Skipped on --dry-run, or set SKIP_RELEASE_TESTS=1 to bypass in an emergency. -# The lifecycle bats harness (tests/lifecycle/run-20x.sh) still runs separately +# The lifecycle bats harness (tests/lifecycle/run-gate.sh) still runs separately # against live nodes — see tests/lifecycle/TESTING.md. if ! $DRY_RUN; then if [ "${SKIP_RELEASE_TESTS:-0}" = "1" ]; then diff --git a/tests/lifecycle/TESTING.md b/tests/lifecycle/TESTING.md index e10d768b..769a819c 100644 --- a/tests/lifecycle/TESTING.md +++ b/tests/lifecycle/TESTING.md @@ -27,7 +27,7 @@ The migration's aim, restated as **five pillars** (every app must satisfy all fi 3. **Lifecycle bulletproof** — every app passes the full matrix (install / UI reachable / stop / start / restart / reinstall / reboot-survive / archipelago-restart-survive / uninstall) **5× green on .228** — run ON the node - (`ARCHY_ITERATIONS=5`; temporarily reduced from 20×, restore before final ship). + (`ARCHY_ITERATIONS=5`). 
(Multinode / fleet → `docs/multinode-testing-plan.md`, separate.) before any release. 4. **Data-driven apps** — install/uninstall needs only the app's manifest + @@ -41,7 +41,7 @@ The migration's aim, restated as **five pillars** (every app must satisfy all fi owned by the service user. Security is king. **Per-app definition of done:** all five pillars hold → lifecycle matrix 5× -(for now; was 20×) green on .228 (run ON the node) → catalog/registry updated (`app-catalog/catalog.json` +green on .228 (run ON the node) → catalog/registry updated (`app-catalog/catalog.json` + `releases/app-catalog.json`, rebuilt image pushed to the mirror) → tracker cell ticked. Only then move to the next app. (Fleet/multinode verification is a separate pass → `docs/multinode-testing-plan.md`.) @@ -80,7 +80,7 @@ cost hours of resync. archipelago` → `cp` binary → `start`. 4. Validate: install fedimint-gateway → assert `fedimint-gateway-hash` (0600, archipelago-owned) + `.pw` generated → container starts healthy. -5. Run `tests/lifecycle/run-20x.sh` for the gateway (do NOT touch knots/electrumx/lnd). +5. Run `tests/lifecycle/run-gate.sh` for the gateway (do NOT touch knots/electrumx/lnd). 6. Frontend fixes (separate from binary): see icon/rename below; rebuild neode-ui, ship `dist + catalog.json + assets` to `/opt/archipelago/web-ui` (chown 1000:1000). @@ -168,7 +168,7 @@ v1.7.52 tags. Three production failures shipped on v1.7.90-alpha despite the existing harness, because nothing exercised the receive path, port-mapping drift, or secret completeness on a live node. 
New suites close those gaps (all run on the archy -host, read-only, so they join `run.sh`/`run-20x.sh` automatically): +host, read-only, so they join `run.sh`/`run-gate.sh` automatically): | Suite | Failure it guards | Asserts | |---|---|---| @@ -196,9 +196,9 @@ ARCHY_PASSWORD=password123 tests/lifecycle/run.sh # Full + destructive (for the verification fleet): ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 tests/lifecycle/run.sh -# 5× release-gate run (for now; was 20× — restore before final ship): +# 5× release-gate run: ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 ARCHY_ITERATIONS=5 \ - tests/lifecycle/run-20x.sh + tests/lifecycle/run-gate.sh ``` To exercise the Phase 3.2 Quadlet-backend path on a target node without @@ -228,7 +228,7 @@ Goal: minimum-viable container subsystem. | `core/container/src/bitcoin_simulator.rs` | 219 | 0 | -219 | ○ couples with dev_orchestrator | | `core/container/src/port_manager.rs` | 175 | 0 | -175 | ○ couples with dev_orchestrator | | `core/archipelago/src/api/rpc/package/install.rs::install_bitcoincoin_rpc_repair` | ~150 | 0 | -150 | ◐ pending fold into orchestrator pre-start | -| imperative `install_fresh` in prod_orchestrator | ~120 | 0 | -120 | ◐ Phase 3.2 wired behind `use_quadlet_backends` flag (default off); 3.3 in-place migration ✅; 3.4 health-gated startup (`Notify=healthy`) ✅ + `TimeoutStartSec=600` race fix ✅; 3.4a unit drift-sync each reconcile ✅; flip default after 20× green | +| imperative `install_fresh` in prod_orchestrator | ~120 | 0 | -120 | ◐ Phase 3.2 wired behind `use_quadlet_backends` flag (default off); 3.3 in-place migration ✅; 3.4 health-gated startup (`Notify=healthy`) ✅ + `TimeoutStartSec=600` race fix ✅; 3.4a unit drift-sync each reconcile ✅; flip default after 5× green | **Today: -270 LoC committed. Outstanding deletes possible: ~1,616 LoC** (if Phase 3 ships fully + dev_mode resolved). @@ -251,7 +251,7 @@ We don't have a performance harness yet. 
Add as L6 lands: v1.7.52 ships only when ALL of: 1. ☐ Bitcoin-stops fix verified live on a fresh node (tests/lifecycle/bats/bitcoin-knots.bats fully ● after a cold install) -2. ☐ `ARCHY_ITERATIONS=5 tests/lifecycle/run-20x.sh` returns 0 **run ON .228** (5× for now; full suite, ARCHY_ALLOW_DESTRUCTIVE=1) — 1× is GREEN (110/110), 5× in progress +2. ☐ `ARCHY_ITERATIONS=5 tests/lifecycle/run-gate.sh` returns 0 **run ON .228** (5×; full suite, ARCHY_ALLOW_DESTRUCTIVE=1) — 1× is GREEN (110/110), 5× in progress 3. ☐ Multinode/fleet (.198 + others) — tracked separately in `docs/multinode-testing-plan.md`, NOT a v1.7.52 single-node gate item 4. ☐ The L3 `backend-survives-archipelago-restart` suite passes (= Phase 3 Quadlet shipped for backends) 5. ☐ Cargo: 0 warnings, 0 unused, all tests green (sustained ✓ since 1c0df95f) diff --git a/tests/lifecycle/bats/electrumx.bats b/tests/lifecycle/bats/electrumx.bats index 63f30390..b3591cce 100644 --- a/tests/lifecycle/bats/electrumx.bats +++ b/tests/lifecycle/bats/electrumx.bats @@ -3,7 +3,7 @@ # # Lifecycle tests for the electrumx package (containers are named # `electrumx` + `archy-electrs-ui`). Mirrors bitcoin-knots.bats / -# lnd.bats so the 20× release-gate run exercises electrumx through +# lnd.bats so the 5× release-gate run exercises electrumx through # the same state matrix. # # Tiers: diff --git a/tests/lifecycle/bats/lnd.bats b/tests/lifecycle/bats/lnd.bats index da66e0ca..6cd5fcd3 100644 --- a/tests/lifecycle/bats/lnd.bats +++ b/tests/lifecycle/bats/lnd.bats @@ -2,7 +2,7 @@ # tests/lifecycle/bats/lnd.bats # # Lifecycle tests for the lnd package. Mirrors bitcoin-knots.bats so the -# 20× release-gate run exercises lnd through the same state matrix. +# 5× release-gate run exercises lnd through the same state matrix. 
# # Tiers: # - Read-only (always runs): presence, state-reporting consistency, RPC reachable diff --git a/tests/lifecycle/bats/mempool.bats b/tests/lifecycle/bats/mempool.bats index 2e69144b..a345016b 100644 --- a/tests/lifecycle/bats/mempool.bats +++ b/tests/lifecycle/bats/mempool.bats @@ -14,6 +14,11 @@ load '../lib/rpc.bash' +# bats-assert is not loaded in this suite (only rpc.bash), so provide a minimal +# `fail` so the `|| fail "..."` guards below report a real assertion failure +# instead of an undefined-command status 127 that masks the actual reason. +fail() { echo "$@" >&2; return 1; } + setup_file() { : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}" export ARCHY_FORCE_LOGIN=1 diff --git a/tests/lifecycle/bats/ui-coverage.bats b/tests/lifecycle/bats/ui-coverage.bats index 1ef5db46..3a5a336b 100644 --- a/tests/lifecycle/bats/ui-coverage.bats +++ b/tests/lifecycle/bats/ui-coverage.bats @@ -15,7 +15,7 @@ # - container down → skip (clean dependency report, no false-fail) # - container up → URL MUST return 200 with non-empty body # -# Looped 20× via tests/lifecycle/run-20x.sh. +# Looped 5× via tests/lifecycle/run-gate.sh. load '../lib/rpc.bash' load '../lib/ui-probes.bash' diff --git a/tests/lifecycle/run-20x.sh b/tests/lifecycle/run-gate.sh similarity index 89% rename from tests/lifecycle/run-20x.sh rename to tests/lifecycle/run-gate.sh index 97091fb2..9d67bb91 100755 --- a/tests/lifecycle/run-20x.sh +++ b/tests/lifecycle/run-gate.sh @@ -1,32 +1,32 @@ #!/usr/bin/env bash -# tests/lifecycle/run-20x.sh — loop the lifecycle harness N times. +# tests/lifecycle/run-gate.sh — loop the lifecycle harness N times (default 5×, the release gate). # # Each iteration: setup-teardown → run.sh (with the same args you'd pass # to run.sh) → setup-teardown. Tallies pass/fail per iteration and prints a # summary at the end. Returns non-zero if any iteration failed. 
# # Env: -# ARCHY_ITERATIONS (default: 20) +# ARCHY_ITERATIONS (default: 5) # ARCHY_FAIL_FAST=1 stop on first failed iteration # plus everything run.sh / lib/rpc.bash respects # (ARCHY_PASSWORD, ARCHY_HOST, ARCHY_SCHEME, ARCHY_ALLOW_DESTRUCTIVE, # ARCHY_ALLOW_CASCADE_DESTRUCTIVE, ARCHY_ALLOW_NOAUTH) # # Usage: -# tests/lifecycle/run-20x.sh # 20× full bats/ suite -# ARCHY_ITERATIONS=5 tests/lifecycle/run-20x.sh # 5× full suite -# tests/lifecycle/run-20x.sh bitcoin-knots # 20× a single suite +# tests/lifecycle/run-gate.sh # 5× full bats/ suite +# ARCHY_ITERATIONS=10 tests/lifecycle/run-gate.sh # 10× full suite (override the default) +# tests/lifecycle/run-gate.sh bitcoin-knots # 5× a single suite # # Suggested release-gate invocation: # ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 \ -# tests/lifecycle/run-20x.sh +# tests/lifecycle/run-gate.sh set -euo pipefail HERE="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" cd "$HERE" -ITER="${ARCHY_ITERATIONS:-20}" +ITER="${ARCHY_ITERATIONS:-5}" if ! [[ "$ITER" =~ ^[1-9][0-9]*$ ]]; then echo "ARCHY_ITERATIONS must be a positive integer, got: $ITER" >&2 exit 2 diff --git a/tests/lifecycle/setup-teardown.sh b/tests/lifecycle/setup-teardown.sh index e42876d0..9ba7064a 100755 --- a/tests/lifecycle/setup-teardown.sh +++ b/tests/lifecycle/setup-teardown.sh @@ -2,7 +2,7 @@ # tests/lifecycle/setup-teardown.sh # # Cleanup helper used between lifecycle test iterations. Run before AND after -# a full bats pass (run-20x.sh handles this). Idempotent — safe to run any +# a full bats pass (run-gate.sh handles this). Idempotent — safe to run any # time, on any host. # # Removes: