fix(fmcd): cap CPU + watchdog-restart the iroh relay hot-loop
On NAT'd nodes that can reach the iroh federation neither directly nor via iroh's public relays, fmcd's embedded iroh networking enters a relay/hole-punch reconnect hot-loop that pegs its entire CPU allotment indefinitely (observed ~1 core sustained for 4 days on a Tailscale node, while LAN nodes that reach the guardian directly stay <3%). fmcd 0.8.0 exposes no iroh/relay knobs, so: - fmcd-run now samples fmcd's own CPU and restarts it when it stays near its allotment for ~15 min (a restart demonstrably clears the stuck iroh state; real work is bursty and never flat-pegs a core for minutes). - Lower cpu_limit 1 -> 0.25 core so a stuck instance can't starve the node (steady-state is <3% of a core; joins are brief). Ships as fmcd:0.8.1 (launcher-only rebuild, same fmcd binary). Bumped the image pin + cpu_limit in the manifest, image-versions.sh, the embedded catalog manifest (releases/app-catalog.json), and the UI catalogs. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4519dbf04f
commit
6734947c3e
@ -274,7 +274,7 @@
|
|||||||
"author": "Fedimint",
|
"author": "Fedimint",
|
||||||
"category": "money",
|
"category": "money",
|
||||||
"tier": "core",
|
"tier": "core",
|
||||||
"dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.0",
|
"dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.1",
|
||||||
"repoUrl": "https://github.com/minmoto/fmcd"
|
"repoUrl": "https://github.com/minmoto/fmcd"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@ -9,7 +9,7 @@ app:
|
|||||||
# 0.8.2 — iroh-capable). No usable upstream image exists, so we build + push
|
# 0.8.2 — iroh-capable). No usable upstream image exists, so we build + push
|
||||||
# this to the node registry. Pin the tag to match the REST shapes coded in
|
# this to the node registry. Pin the tag to match the REST shapes coded in
|
||||||
# core/archipelago/src/wallet/fedimint_client.rs (validated against 0.8.2).
|
# core/archipelago/src/wallet/fedimint_client.rs (validated against 0.8.2).
|
||||||
image: 146.59.87.168:3000/lfg2025/fmcd:0.8.0
|
image: 146.59.87.168:3000/lfg2025/fmcd:0.8.1
|
||||||
pull_policy: if-not-present
|
pull_policy: if-not-present
|
||||||
network: archy-net
|
network: archy-net
|
||||||
# No entrypoint override: the image's resilient `fmcd-run` launcher loops
|
# No entrypoint override: the image's resilient `fmcd-run` launcher loops
|
||||||
@ -33,7 +33,12 @@ app:
|
|||||||
- storage: 2Gi
|
- storage: 2Gi
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
cpu_limit: 1
|
# fmcd's embedded iroh networking can hot-loop on relay/hole-punch retries
|
||||||
|
# on NAT'd nodes that reach the federation neither directly nor via iroh's
|
||||||
|
# public relays, pegging its whole allotment. Cap it low so a stuck instance
|
||||||
|
# can't starve the node (steady-state is <3% of a core; joins are brief);
|
||||||
|
# the fmcd-run watchdog additionally restarts a sustained-hot process.
|
||||||
|
cpu_limit: 0.25
|
||||||
memory_limit: 1Gi
|
memory_limit: 1Gi
|
||||||
disk_limit: 2Gi
|
disk_limit: 2Gi
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,5 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# Resilient launcher for fmcd.
|
# Resilient launcher for fmcd, with a stuck-CPU watchdog.
|
||||||
#
|
#
|
||||||
# fmcd requires >=1 federation to boot — if the default federation is
|
# fmcd requires >=1 federation to boot — if the default federation is
|
||||||
# unreachable at first boot it exits non-zero. Rather than let the container
|
# unreachable at first boot it exits non-zero. Rather than let the container
|
||||||
@ -9,9 +9,72 @@
|
|||||||
#
|
#
|
||||||
# All config comes from FMCD_* env (FMCD_ADDR, FMCD_MODE, FMCD_DATA_DIR,
|
# All config comes from FMCD_* env (FMCD_ADDR, FMCD_MODE, FMCD_DATA_DIR,
|
||||||
# FMCD_INVITE_CODE, FMCD_PASSWORD), so fmcd needs no CLI args here.
|
# FMCD_INVITE_CODE, FMCD_PASSWORD), so fmcd needs no CLI args here.
|
||||||
|
#
|
||||||
|
# WATCHDOG: on NAT'd nodes that can reach the iroh federation neither directly
|
||||||
|
# nor via iroh's public relays, fmcd's embedded iroh networking enters a
|
||||||
|
# relay/hole-punch reconnect hot-loop that pegs its entire CPU allotment
|
||||||
|
# indefinitely (observed: ~1 core sustained for 4 days on a Tailscale node,
|
||||||
|
# while LAN nodes that reach the guardian directly stay <3%). fmcd exposes no
|
||||||
|
# iroh/relay knobs, but a restart demonstrably clears the stuck iroh state
|
||||||
|
# (a fresh process idles at <1%). So we sample fmcd's own CPU usage and, if it
|
||||||
|
# stays near its full allotment for a sustained window, restart it. Real work
|
||||||
|
# (federation joins, ecash ops) is bursty and measured in seconds — it never
|
||||||
|
# flat-pegs a core for many consecutive minutes — so the threshold below does
|
||||||
|
# not fire on legitimate load.
|
||||||
set -u
|
set -u
|
||||||
|
|
||||||
|
# Echo $1 when it is a usable positive number; otherwise warn on stderr and
# echo the fallback $2.
#   $1  value to validate
#   $2  fallback used when $1 is rejected
#   $3  setting name, only used in the warning
#   $4  "int" to require a whole number; omitted/other also allows decimals
# Rejected: empty, anything with characters other than digits and ".", more
# than one ".", a lone ".", and zero in any spelling (0, 0.0, 00).
# Why: a zero or non-numeric sample interval makes `sleep` return (or fail)
# immediately, which would turn the watchdog loop itself into a busy loop, and
# a non-integer hit count makes the `-ge` test error out so it never fires.
watch_num() {
  case "$1" in
    '' | *[!0-9.]* | *.*.* | .) ;;
    *[1-9]*)
      case "${4:-}:$1" in
        int:*.*) ;;
        *)
          echo "$1"
          return 0
          ;;
      esac
      ;;
  esac
  echo "[fmcd-run] ignoring invalid $3='$1'; using $2" >&2
  echo "$2"
}

# Kernel clock ticks per second, used to turn /proc tick counts into seconds.
# Falls back to 100 when getconf is missing or prints something unusable.
CLK=$(getconf CLK_TCK 2>/dev/null || echo 100)
case "$CLK" in
  '' | *[!0-9]* | 0) CLK=100 ;;
esac

# seconds between CPU samples
WATCH_SAMPLE=$(watch_num "${FMCD_WATCH_SAMPLE:-60}" 60 FMCD_WATCH_SAMPLE)
# cores; "hot" if usage exceeds this
WATCH_CORES=$(watch_num "${FMCD_WATCH_CORES:-0.18}" 0.18 FMCD_WATCH_CORES)
# consecutive hot samples -> restart (~15 min with the defaults)
WATCH_HITS=$(watch_num "${FMCD_WATCH_HITS:-15}" 15 FMCD_WATCH_HITS int)
|
||||||
|
|
||||||
|
# Print the total CPU ticks (utime + stime) used so far by process $1, or 0
# when the process is gone or its stat file cannot be read.
#
# /proc/PID/stat reads "pid (comm) state ppid ... utime stime ...", with utime
# and stime as fields 14 and 15. comm is the executable name and may itself
# contain spaces or parentheses, so plain whitespace splitting can shift every
# later field. Everything up to the LAST ") " is removed first; the remainder
# starts at field 3 (state), which puts utime/stime at positions 12 and 13.
cpu_ticks() {
  _ticks=$(awk '{ sub(/^.*\) /, ""); print $12 + $13; exit }' \
    "/proc/$1/stat" 2>/dev/null) || _ticks=
  # Empty output (unreadable file, process vanished mid-read) counts as 0.
  echo "${_ticks:-0}"
}
|
||||||
|
|
||||||
|
# watchdog PID — supervise one fmcd process for a stuck-CPU condition.
#
# Every WATCH_SAMPLE seconds the tick count that cpu_ticks reports for PID is
# compared with the previous reading and converted to "cores used"
# (tick delta / CLK / WATCH_SAMPLE). A reading above WATCH_CORES extends the
# hot streak; any other reading resets it. Once the streak reaches WATCH_HITS
# the process is sent SIGTERM, given 5 seconds, then sent SIGKILL, and the
# function returns. If PID disappears first the loop simply ends.
# Always returns 0; progress is logged to stderr.
watchdog() {
  pid="$1"
  prev=$(cpu_ticks "$pid")
  hot=0

  while kill -0 "$pid" 2>/dev/null; do
    sleep "$WATCH_SAMPLE"
    cur=$(cpu_ticks "$pid")

    # Average cores over the sample window; a negative delta is clamped to 0.
    cores=$(awk -v c="$cur" -v p="$prev" -v clk="$CLK" -v s="$WATCH_SAMPLE" \
      'BEGIN{ d=c-p; if (d<0) d=0; printf "%.3f", d/clk/s }')
    prev="$cur"

    # Below-threshold sample: streak broken, go straight to the next sample.
    case "$(awk -v c="$cores" -v t="$WATCH_CORES" 'BEGIN{print (c>t)?1:0}')" in
      1) hot=$((hot + 1)) ;;
      *)
        hot=0
        continue
        ;;
    esac

    printf '%s\n' "[fmcd-run] watchdog: fmcd hot (${cores} cores) ${hot}/${WATCH_HITS}" >&2

    # Not hot for long enough yet.
    [ "$hot" -ge "$WATCH_HITS" ] || continue

    printf '%s\n' "[fmcd-run] watchdog: fmcd stuck high-CPU — restarting to clear iroh state" >&2
    kill -TERM "$pid" 2>/dev/null
    sleep 5
    kill -KILL "$pid" 2>/dev/null
    return 0
  done

  return 0
}
|
||||||
|
|
||||||
|
# PID of the fmcd child currently being supervised; empty until one is started.
# It is read when the handler runs, so the handler always targets the current
# child.
FMCD_PID=

# Stop handler for TERM/INT: forward the stop to fmcd, then stay alive until
# fmcd has actually exited before exiting 0.
# Why wait: exiting right after sending TERM gives fmcd no time to shut down
# if this launcher's exit ends the container (when the launcher is the
# container's init process, its exit kills every remaining process).
# The kill/wait pair is skipped while no child has been started yet.
stop_fmcd() {
  if [ -n "$FMCD_PID" ]; then
    kill -TERM "$FMCD_PID" 2>/dev/null
    wait "$FMCD_PID" 2>/dev/null
  fi
  exit 0
}
trap stop_fmcd TERM INT
|
||||||
|
|
||||||
while true; do
|
while true; do
|
||||||
fmcd || true
|
fmcd &
|
||||||
echo "[fmcd-run] fmcd exited (federation unreachable?); retrying in 30s" >&2
|
FMCD_PID=$!
|
||||||
|
watchdog "$FMCD_PID" &
|
||||||
|
WD_PID=$!
|
||||||
|
wait "$FMCD_PID" 2>/dev/null
|
||||||
|
kill -TERM "$WD_PID" 2>/dev/null
|
||||||
|
wait "$WD_PID" 2>/dev/null
|
||||||
|
echo "[fmcd-run] fmcd exited (federation unreachable or watchdog restart); retrying in 30s" >&2
|
||||||
sleep 30
|
sleep 30
|
||||||
done
|
done
|
||||||
|
|||||||
@ -274,7 +274,7 @@
|
|||||||
"author": "Fedimint",
|
"author": "Fedimint",
|
||||||
"category": "money",
|
"category": "money",
|
||||||
"tier": "core",
|
"tier": "core",
|
||||||
"dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.0",
|
"dockerImage": "146.59.87.168:3000/lfg2025/fmcd:0.8.1",
|
||||||
"repoUrl": "https://github.com/minmoto/fmcd"
|
"repoUrl": "https://github.com/minmoto/fmcd"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"schema": 1,
|
"schema": 1,
|
||||||
"updated": "2026-06-24",
|
"updated": "2026-06-28",
|
||||||
"apps": {
|
"apps": {
|
||||||
"adguardhome": {
|
"adguardhome": {
|
||||||
"version": "v0.107.55",
|
"version": "v0.107.55",
|
||||||
@ -1219,7 +1219,7 @@
|
|||||||
"version": "0.8.0",
|
"version": "0.8.0",
|
||||||
"description": "Fedimint ecash client daemon (fmcd). Lets the node hold Fedimint ecash and join federations; the wallet talks to it over a local REST API.",
|
"description": "Fedimint ecash client daemon (fmcd). Lets the node hold Fedimint ecash and join federations; the wallet talks to it over a local REST API.",
|
||||||
"container": {
|
"container": {
|
||||||
"image": "146.59.87.168:3000/lfg2025/fmcd:0.8.0",
|
"image": "146.59.87.168:3000/lfg2025/fmcd:0.8.1",
|
||||||
"pull_policy": "if-not-present",
|
"pull_policy": "if-not-present",
|
||||||
"network": "archy-net",
|
"network": "archy-net",
|
||||||
"generated_secrets": [
|
"generated_secrets": [
|
||||||
@ -1242,7 +1242,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"resources": {
|
"resources": {
|
||||||
"cpu_limit": 1,
|
"cpu_limit": 0.25,
|
||||||
"memory_limit": "1Gi",
|
"memory_limit": "1Gi",
|
||||||
"disk_limit": "2Gi"
|
"disk_limit": "2Gi"
|
||||||
},
|
},
|
||||||
|
|||||||
@ -61,7 +61,7 @@ FEDIMINT_GATEWAY_IMAGE="$ARCHY_REGISTRY/gatewayd:v0.10.0"
|
|||||||
# from minmoto/fmcd. NOT yet added to the bundled CONTAINER_IMAGES list / first-
|
# from minmoto/fmcd. NOT yet added to the bundled CONTAINER_IMAGES list / first-
|
||||||
# boot auto-create: bundling fleet-wide needs a fleet-reachable default
|
# boot auto-create: bundling fleet-wide needs a fleet-reachable default
|
||||||
# federation first (the interim default is node-local). See docs/dual-ecash-design.md.
|
# federation first (the interim default is node-local). See docs/dual-ecash-design.md.
|
||||||
FMCD_IMAGE="$ARCHY_REGISTRY/fmcd:0.8.0"
|
FMCD_IMAGE="$ARCHY_REGISTRY/fmcd:0.8.1"
|
||||||
|
|
||||||
# Media
|
# Media
|
||||||
REDIS_IMAGE="$ARCHY_REGISTRY/redis:7.4.8"
|
REDIS_IMAGE="$ARCHY_REGISTRY/redis:7.4.8"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user