fix(fips,iso): bulletproof FIPS from install — no Activate button needed

Problems addressed (all observed on .198):
  * fips_key was written as raw 32 bytes; upstream fips daemon reads it
    with read_to_string() and bailed with "stream did not contain valid
    UTF-8", crashlooping indefinitely.
  * Activate button racy: user had to hit it, and it would keep failing
    silently because the daemon couldn't parse its own config.
  * FIPS schema drift was already fixed in 7d8a5864, but the config
    write path sat behind the same broken "Activate" flow, so that fix
    alone never reached existing nodes.
  * Journal was on tmpfs — every reboot wiped install/onboarding history,
    making post-hoc debugging impossible.

Changes:
  * identity.rs: write fips_key as bech32 nsec + newline. load_fips_keys
    now auto-migrates legacy 32-byte files to bech32 the first time it
    reads them, so OTA updates from v1.5.0-alpha self-heal without user
    action.
  * server.rs: post-onboarding auto-activate task runs on every
    archipelago startup. If fips_key exists it ensures /etc/fips/fips.yaml
    is schema-current and starts archipelago-fips.service. Pre-onboarding
    nodes stay quiet (guarded on fips_key_exists).
  * ISO build: un-mask archipelago-fips, archipelago-wg and
    archipelago-wg-address — all use ConditionPathExists on their key
    files, so systemd silently skips them pre-onboarding (no MOTD
    [FAILED]). Only nostr-vpn stays masked (legacy service, superseded
    by upstream fips).
  * Journald made persistent via /var/log/journal + 500M cap, so
    install and first-boot logs survive reboots for diagnosis.

After this, a fresh install + onboarding should bring FIPS up automatically
with no user interaction. The UI "Activate" button can stay as an escape
hatch (the RPC is still there) but is no longer on the critical path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian 2026-04-19 16:33:21 -04:00
parent 00a86e6ecf
commit d9411c3325
3 changed files with 118 additions and 33 deletions

View File

@ -200,7 +200,16 @@ async fn write_fips_key_from_seed(
let key_path = identity_dir.join(FIPS_KEY_FILE);
let pub_path = identity_dir.join(FIPS_KEY_PUB_FILE);
fs::write(&key_path, keys.secret_key().to_secret_bytes())
// fips daemon reads the key with `fs::read_to_string` and expects a
// bech32 nsec line — raw 32-byte secret bytes fail its UTF-8 check
// ("failed to read config file /etc/fips/fips.key: stream did not
// contain valid UTF-8"). Write the bech32 form with a trailing
// newline so both archipelago and fips load it cleanly.
let nsec = keys
.secret_key()
.to_bech32()
.context("Failed to encode FIPS nsec")?;
fs::write(&key_path, format!("{nsec}\n"))
.await
.context("Failed to write FIPS key")?;
#[cfg(unix)]
@ -210,11 +219,11 @@ async fn write_fips_key_from_seed(
.await
.context("Failed to set FIPS key permissions")?;
}
fs::write(&pub_path, keys.public_key().to_bytes())
let npub = keys.public_key().to_bech32().unwrap_or_default();
fs::write(&pub_path, format!("{npub}\n"))
.await
.context("Failed to write FIPS public key")?;
let npub = keys.public_key().to_bech32().unwrap_or_default();
tracing::info!(
"Derived FIPS mesh key from seed (npub: {}...)",
npub.chars().take(20).collect::<String>()
@ -235,15 +244,50 @@ pub fn fips_key_exists(identity_dir: &Path) -> bool {
#[allow(dead_code)]
pub async fn load_fips_keys(identity_dir: &Path) -> Result<Option<nostr_sdk::Keys>> {
let key_path = identity_dir.join(FIPS_KEY_FILE);
match fs::read(&key_path).await {
Ok(bytes) => {
// Read as raw bytes so we can detect and migrate both formats:
// - v1.6+: bech32 nsec text (what upstream fips expects)
// - <=v1.5: raw 32-byte secret (incompatible with upstream fips)
// When we find the legacy format, rewrite the file in bech32 in place
// so archipelago-fips.service stops crashlooping after an OTA update
// from a release that shipped the old format.
let bytes = match fs::read(&key_path).await {
Ok(b) => b,
Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
Err(e) => return Err(e).context("Failed to read FIPS key"),
};
// Try bech32 first.
if let Ok(text) = std::str::from_utf8(&bytes) {
if let Ok(secret) = nostr_sdk::SecretKey::parse(text.trim()) {
return Ok(Some(nostr_sdk::Keys::new(secret)));
}
}
// Fall through: treat as legacy raw bytes and migrate.
if bytes.len() == 32 {
let secret = nostr_sdk::SecretKey::from_slice(&bytes)
.map_err(|e| anyhow::anyhow!("Corrupt FIPS key on disk: {}", e))?;
Ok(Some(nostr_sdk::Keys::new(secret)))
let nsec = secret
.to_bech32()
.map_err(|e| anyhow::anyhow!("Failed to encode migrated nsec: {}", e))?;
fs::write(&key_path, format!("{nsec}\n"))
.await
.context("Failed to rewrite FIPS key in bech32 format")?;
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
fs::set_permissions(&key_path, std::fs::Permissions::from_mode(0o600))
.await
.context("Failed to re-set FIPS key permissions after migration")?;
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
Err(e) => Err(e).context("Failed to read FIPS key"),
tracing::info!("Migrated legacy raw-bytes FIPS key to bech32 nsec text");
return Ok(Some(nostr_sdk::Keys::new(secret)));
}
anyhow::bail!(
"Corrupt FIPS key on disk (not bech32 nsec and not 32 raw bytes, size={})",
bytes.len()
)
}
/// Return the FIPS npub (bech32) if the key has been materialised.

View File

@ -459,6 +459,44 @@ impl Server {
config.data_dir.clone(),
);
// Post-onboarding auto-activation for archipelago-fips. Runs once
// at startup: if fips_key is on disk, install /etc/fips/fips.yaml
// (schema-refreshed) and start the service. This removes the
// need for a user-facing "Activate" button — the node comes up
// with FIPS running whenever the seed has been onboarded. Also
// self-heals legacy raw-byte fips.key files (load_fips_keys
// rewrites them as bech32 nsec the first time they're read).
// Pre-onboarding nodes: ConditionPathExists on the service unit
// + the `fips_key_exists` guard here keep this quiet.
{
let data_dir = config.data_dir.clone();
tokio::spawn(async move {
let identity_dir = data_dir.join("identity");
if !crate::identity::fips_key_exists(&identity_dir) {
tracing::debug!("FIPS auto-activate skipped: fips_key not on disk");
return;
}
// Trigger the migration path in load_fips_keys so old raw-byte
// key files are rewritten as bech32 before fips.yaml install.
if let Err(e) = crate::identity::load_fips_keys(&identity_dir).await {
tracing::warn!("FIPS key load/migrate failed: {}", e);
return;
}
if let Err(e) = crate::fips::config::install(&identity_dir).await {
tracing::warn!("FIPS config install failed on startup: {}", e);
return;
}
if let Err(e) = crate::fips::service::activate(crate::fips::SERVICE_UNIT).await {
tracing::warn!(
"archipelago-fips activate failed on startup: {} — user can retry via fips.install RPC",
e
);
return;
}
tracing::info!("archipelago-fips auto-activated on startup");
});
}
Ok(Self {
_config: config,
_identity: identity,

View File

@ -453,30 +453,24 @@ RUN systemctl enable NetworkManager || true && \
systemctl enable archipelago-reconcile.timer || true && \
systemctl enable archipelago-tor-helper.path || true && \
systemctl enable nostr-relay || true
# archipelago-wg + wg-address: enabled by first-boot after WG key is generated
# nostr-vpn: enabled by first-boot after Nostr identity is generated
# (env file doesn't exist until onboarding, so pre-enabling causes crash-loop)
# archipelago-fips: masked by default; archipelago backend unmasks +
# starts it via `fips.install` RPC once the seed-derived fips_key is on
# disk and the fips daemon package is installed. Pre-onboarding the node
# stays dark on FIPS so no traffic leaves an ephemeral identity.
RUN systemctl mask archipelago-fips.service || true
# archipelago-fips.service + archipelago-wg.service + archipelago-wg-address.service
# stay installed and enabled. They all use `ConditionPathExists=` on their
# respective seed-derived key files, so on a fresh pre-onboarding boot
# systemd quietly skips them with no [FAILED] in the MOTD. Once the user
# completes the seed onboarding flow, archipelago writes the key files,
# the archipelago backend calls `systemctl start archipelago-fips.service`
# (see server.rs post-onboarding auto-activate block) and the WG setup
# path runs `archipelago-wg setup` directly. No masking, no user-facing
# "Activate" button — install → onboard → FIPS + WG are just running.
RUN systemctl enable archipelago-fips.service || true
# Same rationale for nostr-vpn and wireguard helpers — their env files
# don't exist until onboarding completes, so leaving these "enabled"
# (the default from WantedBy=multi-user.target) produces a red
# [FAILED] in the boot MOTD every reboot. Mask by replacing each
# .service with a /dev/null symlink — plain `systemctl mask` refuses
# to clobber the real files we just COPY'd in, so the previous
# attempt left the services installable via dependency chains
# (nostr-relay has Before=nostr-vpn, which pulls it in). Explicit
# rm + ln -sf creates the proper masked state. The onboarding flow
# removes the symlink and drops in a configured service when env
# files are in place.
RUN for svc in nostr-vpn archipelago-wg archipelago-wg-address; do \\
rm -f /etc/systemd/system/\$svc.service; \\
ln -sf /dev/null /etc/systemd/system/\$svc.service; \\
done
# nostr-vpn is the legacy nostr-tunnel service — deprecated in favour of
# the upstream FIPS daemon. It still crash-loops on boot if left enabled
# (env file doesn't exist until onboarding) so we mask it outright.
# `systemctl mask` alone doesn't stick because the real .service file is
# already in place — explicit rm + /dev/null symlink is what sticks.
RUN rm -f /etc/systemd/system/nostr-vpn.service && \\
ln -sf /dev/null /etc/systemd/system/nostr-vpn.service
# Remove policy-rc.d so services can start on first boot
RUN rm -f /usr/sbin/policy-rc.d
@ -489,6 +483,15 @@ RUN mkdir -p /var/lib/archipelago/data /var/lib/archipelago/config /var/lib/arch
cp /etc/archipelago/nostr-relay-config.toml /var/lib/archipelago/nostr-relay/config.toml && \
chown -R archipelago:archipelago /var/lib/archipelago /opt/archipelago
# Persist the systemd journal across reboots — without /var/log/journal,
# journald keeps its logs on tmpfs (/run/log/journal) and everything from
# before the current boot is lost. We need the full history to diagnose
# first-boot / install / onboarding issues after the fact. The size cap
# keeps the journal from eating the disk.
RUN mkdir -p /var/log/journal && \
systemd-tmpfiles --create --prefix /var/log/journal 2>/dev/null || true && \
install -d -m 0755 /etc/systemd/journald.conf.d && \
printf '[Journal]\nStorage=persistent\nSystemMaxUse=500M\nRuntimeMaxUse=100M\nForwardToSyslog=no\n' > /etc/systemd/journald.conf.d/10-archipelago-persistent.conf
# Clean up
RUN apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*