diff --git a/app-catalog/catalog.json b/app-catalog/catalog.json index f6b4f8ee..7d321c10 100644 --- a/app-catalog/catalog.json +++ b/app-catalog/catalog.json @@ -274,7 +274,7 @@ "author": "Fedimint", "category": "money", "tier": "core", - "dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.0", + "dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.1", "repoUrl": "https://github.com/minmoto/fmcd" }, { diff --git a/apps/fedimint-clientd/manifest.yml b/apps/fedimint-clientd/manifest.yml index bfa7ffe1..764a1ac0 100644 --- a/apps/fedimint-clientd/manifest.yml +++ b/apps/fedimint-clientd/manifest.yml @@ -9,7 +9,7 @@ app: # 0.8.2 — iroh-capable). No usable upstream image exists, so we build + push # this to the node registry. Pin the tag to match the REST shapes coded in # core/archipelago/src/wallet/fedimint_client.rs (validated against 0.8.2). - image: 146.59.87.168:3000/lfg2025/fmcd:0.8.0 + image: 146.59.87.168:3000/lfg2025/fmcd:0.8.1 pull_policy: if-not-present network: archy-net # No entrypoint override: the image's resilient `fmcd-run` launcher loops @@ -33,7 +33,12 @@ app: - storage: 2Gi resources: - cpu_limit: 1 + # fmcd's embedded iroh networking can hot-loop on relay/hole-punch retries + # on NAT'd nodes that reach the federation neither directly nor via iroh's + # public relays, pegging its whole allotment. Cap it low so a stuck instance + # can't starve the node (steady-state is <3% of a core; joins are brief); + # the fmcd-run watchdog additionally restarts a sustained-hot process. + cpu_limit: 0.25 memory_limit: 1Gi disk_limit: 2Gi diff --git a/docker/fmcd/fmcd-run b/docker/fmcd/fmcd-run index b5d92792..92b3053b 100644 --- a/docker/fmcd/fmcd-run +++ b/docker/fmcd/fmcd-run @@ -1,5 +1,5 @@ #!/bin/sh -# Resilient launcher for fmcd. +# Resilient launcher for fmcd, with a stuck-CPU watchdog. # # fmcd requires >=1 federation to boot — if the default federation is # unreachable at first boot it exits non-zero. 
Rather than let the container
@@ -9,9 +9,101 @@
 #
 # All config comes from FMCD_* env (FMCD_ADDR, FMCD_MODE, FMCD_DATA_DIR,
 # FMCD_INVITE_CODE, FMCD_PASSWORD), so fmcd needs no CLI args here.
+#
+# WATCHDOG: on NAT'd nodes that can reach the iroh federation neither directly
+# nor via iroh's public relays, fmcd's embedded iroh networking enters a
+# relay/hole-punch reconnect hot-loop that pegs its entire CPU allotment
+# indefinitely (observed: ~1 core sustained for 4 days on a Tailscale node,
+# while LAN nodes that reach the guardian directly stay <3%). fmcd exposes no
+# iroh/relay knobs, but a restart demonstrably clears the stuck iroh state
+# (a fresh process idles at <1%). So we sample fmcd's own CPU usage and, if it
+# stays near its full allotment for a sustained window, restart it. Real work
+# (federation joins, ecash ops) is bursty and measured in seconds — it never
+# flat-pegs a core for many consecutive minutes — so the threshold below does
+# not fire on legitimate load.
 set -u
+
+# Clock ticks per second: the unit of the utime/stime counters that cpu_ticks
+# reads. Falls back to 100 when getconf cannot report it.
+CLK=$(getconf CLK_TCK 2>/dev/null || echo 100)
+# Watchdog tuning, overridable through FMCD_WATCH_*. The WATCH_CORES default is
+# 72% of the 0.25-core cpu_limit in apps/fedimint-clientd/manifest.yml; revisit
+# it whenever that limit changes.
+WATCH_SAMPLE="${FMCD_WATCH_SAMPLE:-60}"   # seconds between CPU samples
+WATCH_CORES="${FMCD_WATCH_CORES:-0.18}"   # cores; "hot" if usage exceeds this
+WATCH_HITS="${FMCD_WATCH_HITS:-15}"       # consecutive hot samples -> restart (~15 min)
+
+# Total CPU ticks (utime+stime, fields 14+15 of /proc/PID/stat) for $1; 0 if gone.
+cpu_ticks() {
+  awk '{print $14 + $15}' "/proc/$1/stat" 2>/dev/null || echo 0
+}
+
+# Watch fmcd ($1). Once fmcd has been hot for WATCH_HITS consecutive samples,
+# send it SIGTERM (and SIGKILL 5s later), then return; the launcher's `wait`
+# on fmcd then completes and the main loop restarts it. Returns quietly if
+# fmcd dies on its own.
+watchdog() {
+  pid="$1"
+  hot=0  # consecutive hot samples seen so far
+  prev=$(cpu_ticks "$pid")
+  while kill -0 "$pid" 2>/dev/null; do
+    sleep "$WATCH_SAMPLE"
+    cur=$(cpu_ticks "$pid")
+    # Average cores over the sample = tick delta / ticks-per-second / seconds.
+    # The delta is clamped at 0 (cur reads 0 once fmcd is gone).
+    cores=$(awk -v c="$cur" -v p="$prev" -v clk="$CLK" -v s="$WATCH_SAMPLE" \
+      'BEGIN{ d=c-p; if (d<0) d=0; printf "%.3f", d/clk/s }')
+    prev="$cur"
+    # sh has no floating-point test, so the comparison is done in awk.
+    if [ "$(awk -v c="$cores" -v t="$WATCH_CORES" 'BEGIN{print (c>t)?1:0}')" = "1" ]; then
+      hot=$((hot + 1))
+      echo "[fmcd-run] watchdog: fmcd hot (${cores} cores) ${hot}/${WATCH_HITS}" >&2
+      if [ "$hot" -ge "$WATCH_HITS" ]; then
+        echo "[fmcd-run] watchdog: fmcd stuck high-CPU — restarting to clear iroh state" >&2
+        kill -TERM "$pid" 2>/dev/null
+        sleep 5  # grace period before escalating to SIGKILL
+        kill -KILL "$pid" 2>/dev/null
+        return 0
+      fi
+    else
+      hot=0  # streak broken; start counting again
+    fi
+  done
+  return 0
+}
+
+# Stop handler. Stops the watchdog, forwards the container's stop signal to
+# fmcd, then waits for fmcd to exit before the launcher exits so fmcd can
+# finish shutting down (if this launcher is the container's init, its exit
+# ends the container). FMCD_PID/WD_PID are read when the trap fires, so the
+# handler always targets the current children; both are empty while none run.
+FMCD_PID=
+WD_PID=
+stop_children() {
+  [ -n "$WD_PID" ] && kill -TERM "$WD_PID" 2>/dev/null
+  if [ -n "$FMCD_PID" ]; then
+    kill -TERM "$FMCD_PID" 2>/dev/null
+    wait "$FMCD_PID" 2>/dev/null
+  fi
+  exit 0
+}
+trap stop_children TERM INT
+
 while true; do
-  fmcd || true
-  echo "[fmcd-run] fmcd exited (federation unreachable?); retrying in 30s" >&2
-  sleep 30
+  fmcd &
+  FMCD_PID=$!
+  watchdog "$FMCD_PID" &
+  WD_PID=$!
+  wait "$FMCD_PID" 2>/dev/null
+  kill -TERM "$WD_PID" 2>/dev/null
+  wait "$WD_PID" 2>/dev/null
+  # Both children are gone; leave the stop handler nothing stale to signal.
+  FMCD_PID=
+  WD_PID=
+  echo "[fmcd-run] fmcd exited (federation unreachable or watchdog restart); retrying in 30s" >&2
+  # Back off in the background and `wait` on it: sh postpones a trap until the
+  # running foreground command ends, but interrupts `wait` immediately, so a
+  # stop signal during the back-off is handled right away.
+  sleep 30 &
+  wait "$!" 2>/dev/null
 done
diff --git a/neode-ui/public/catalog.json b/neode-ui/public/catalog.json
index f6b4f8ee..7d321c10 100644
--- a/neode-ui/public/catalog.json
+++ b/neode-ui/public/catalog.json
@@ -274,7 +274,7 @@
       "author": "Fedimint",
       "category": "money",
       "tier": "core",
-      "dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.0",
+      "dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.1",
       "repoUrl": "https://github.com/minmoto/fmcd"
     },
     {
diff --git a/releases/app-catalog.json b/releases/app-catalog.json
index 790862a6..b37ef09e 100644
--- a/releases/app-catalog.json
+++ b/releases/app-catalog.json
@@ -1,6 +1,6 @@
 {
   "schema": 1,
-  "updated": "2026-06-24",
+  "updated": "2026-06-28",
   "apps": {
     "adguardhome": {
       "version": "v0.107.55",
@@ -1219,7 +1219,7 @@
       "version": "0.8.0",
       "description": "Fedimint ecash client daemon (fmcd). Lets the node hold Fedimint ecash and join federations; the wallet talks to it over a local REST API.",
       "container": {
-        "image": "146.59.87.168:3000/lfg2025/fmcd:0.8.0",
+        "image": "146.59.87.168:3000/lfg2025/fmcd:0.8.1",
         "pull_policy": "if-not-present",
         "network": "archy-net",
         "generated_secrets": [
@@ -1242,7 +1242,7 @@
         }
       ],
       "resources": {
-        "cpu_limit": 1,
+        "cpu_limit": 0.25,
         "memory_limit": "1Gi",
         "disk_limit": "2Gi"
       },
diff --git a/scripts/image-versions.sh b/scripts/image-versions.sh
index 6a14bb84..77d8da39 100644
--- a/scripts/image-versions.sh
+++ b/scripts/image-versions.sh
@@ -61,7 +61,7 @@ FEDIMINT_GATEWAY_IMAGE="$ARCHY_REGISTRY/gatewayd:v0.10.0"
 # from minmoto/fmcd. NOT yet added to the bundled CONTAINER_IMAGES list / first-
 # boot auto-create: bundling fleet-wide needs a fleet-reachable default
 # federation first (the interim default is node-local). See docs/dual-ecash-design.md.
-FMCD_IMAGE="$ARCHY_REGISTRY/fmcd:0.8.0" +FMCD_IMAGE="$ARCHY_REGISTRY/fmcd:0.8.1" # Media REDIS_IMAGE="$ARCHY_REGISTRY/redis:7.4.8"