feat(fips): auto-activate + reliability (retry, warm paths) — make FIPS the robust primary (B14b/#27)
User priority: FIPS is the main transport but it was unreliable and needed a manual "Activate" button.

Improvements (all in the FIPS dial/supervisor):
- Auto-activate: ensure_activated() installs the daemon config + starts the service on its own once seed onboarding has materialised the key — no Activate button needed. Idempotent; runs from the supervisor every 45s so a node that onboards after boot still comes up automatically.
- Dial retry: try_fips_get/post now retry ONCE on a connect/timeout error. The first dial to a peer triggers NAT hole-punching and often times out before the path is up; the retry lands on the now-warm path — the main reason calls were dropping to Tor despite the peer being FIPS-reachable.
- More patient connect_timeout (5s→8s) so a reachable-but-cold peer isn't abandoned to Tor while hole-punching completes.
- Path warmer: spawn_fips_supervisor() keeps hole-punched paths to known federation peers warm (every 45s, concurrent), so on-demand dials are fast and land on FIPS.
- Confirmed the daemon config already enables BOTH udp + tcp transports (render_config_yaml), so FIPS already uses TCP where UDP is blocked; the Tor fallback was path-establishment, addressed above.

cargo check + fmt clean. Backend — needs a binary rebuild+deploy to validate on .116/.198 (watch last_transport flip fips, and FIPS coming up with no button).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b602a9cea5
commit
774ca28847
@ -93,17 +93,61 @@ pub async fn peer_base_url(npub: &str) -> Result<String> {
|
||||
Ok(format!("http://[{}]:{}", ip, PEER_PORT))
|
||||
}
|
||||
|
||||
/// Build an HTTP client tuned for FIPS peer-to-peer dialing. No proxy,
|
||||
/// short timeout — fall back to Tor on failure.
|
||||
/// Build an HTTP client tuned for FIPS peer-to-peer dialing. No proxy.
|
||||
/// `connect_timeout` is generous enough to let NAT hole-punching complete on
|
||||
/// the first dial (FIPS is UDP hole-punched; the path often isn't established
|
||||
/// until the first packets flow), so a reachable-but-cold peer isn't abandoned
|
||||
/// to Tor prematurely. Reliability over latency — FIPS is the preferred path.
|
||||
pub fn client() -> reqwest::Client {
|
||||
reqwest::Client::builder()
|
||||
.timeout(Duration::from_secs(20))
|
||||
.connect_timeout(Duration::from_secs(5))
|
||||
.connect_timeout(Duration::from_secs(8))
|
||||
.user_agent("archipelago-fips/1")
|
||||
.build()
|
||||
.expect("static reqwest client config")
|
||||
}
|
||||
|
||||
/// Send a FIPS request with ONE retry on a connect/timeout error.
|
||||
///
|
||||
/// The first dial to a peer typically triggers NAT hole-punching and can time
|
||||
/// out before the overlay path is established; a quick retry then lands on the
|
||||
/// now-warm path. Without this, a single cold-path failure drops the call to
|
||||
/// Tor even though the peer is FIPS-reachable — the main reason FIPS "isn't
|
||||
/// robust". Only connect/timeout errors are retried (a real HTTP response,
|
||||
/// including 4xx/5xx, is returned as-is for the caller to interpret).
|
||||
async fn send_with_retry(rb: reqwest::RequestBuilder) -> Result<reqwest::Response, reqwest::Error> {
|
||||
let retry = rb.try_clone();
|
||||
match rb.send().await {
|
||||
Ok(resp) => Ok(resp),
|
||||
Err(e) if (e.is_connect() || e.is_timeout()) && retry.is_some() => {
|
||||
// Brief pause so the hole-punch packets from the first attempt can
|
||||
// traverse before we re-dial onto the warmed path.
|
||||
tokio::time::sleep(Duration::from_millis(600)).await;
|
||||
retry.expect("retry builder present").send().await
|
||||
}
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
/// Proactively warm the hole-punched FIPS path to a peer: resolve its overlay
|
||||
/// address and open a short connection to its peer listener. Hole-punched
|
||||
/// paths and NAT mappings go cold after ~30-60s of no traffic, after which the
|
||||
/// next real dial pays the full re-punch cost and often falls back to Tor.
|
||||
/// Keeping the path warm is what makes FIPS the transport that actually gets
|
||||
/// used. Best-effort: any error (peer offline, UDP blocked) is ignored — the
|
||||
/// connection attempt itself is what re-punches and refreshes the path.
|
||||
pub async fn warm_path(npub: &str) {
|
||||
if !is_service_active().await {
|
||||
return;
|
||||
}
|
||||
let Ok(base) = peer_base_url(npub).await else {
|
||||
return;
|
||||
};
|
||||
let c = client();
|
||||
// The response status is irrelevant; establishing the connection warms it.
|
||||
let _ = tokio::time::timeout(Duration::from_secs(8), c.get(&base).send()).await;
|
||||
}
|
||||
|
||||
// ── DNS wire-format helpers ─────────────────────────────────────────────
|
||||
|
||||
fn encode_query(id: u16, npub: &str) -> Result<Vec<u8>> {
|
||||
@ -374,10 +418,14 @@ impl<'a> PeerRequest<'a> {
|
||||
for (k, v) in &self.headers {
|
||||
rb = rb.header(*k, v);
|
||||
}
|
||||
match rb.send().await {
|
||||
match send_with_retry(rb).await {
|
||||
Ok(r) => Ok(Some(r)),
|
||||
Err(e) => {
|
||||
tracing::debug!("FIPS POST {} failed: {}, falling back to Tor", url, e);
|
||||
tracing::debug!(
|
||||
"FIPS POST {} failed after retry: {}, falling back to Tor",
|
||||
url,
|
||||
e
|
||||
);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@ -403,10 +451,14 @@ impl<'a> PeerRequest<'a> {
|
||||
for (k, v) in &self.headers {
|
||||
rb = rb.header(*k, v);
|
||||
}
|
||||
match rb.send().await {
|
||||
match send_with_retry(rb).await {
|
||||
Ok(r) => Ok(Some(r)),
|
||||
Err(e) => {
|
||||
tracing::debug!("FIPS GET {} failed: {}, falling back to Tor", url, e);
|
||||
tracing::debug!(
|
||||
"FIPS GET {} failed after retry: {}, falling back to Tor",
|
||||
url,
|
||||
e
|
||||
);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
@ -33,6 +33,63 @@ pub mod service;
|
||||
pub mod update;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Auto-activate FIPS with no user interaction. Once seed onboarding has
|
||||
/// materialised the fips key, install the daemon config + start the service if
|
||||
/// it isn't already up. Idempotent and best-effort: FIPS is the preferred
|
||||
/// transport and should come up on its own — the UI "Activate" button is now a
|
||||
/// manual fallback, not a requirement. No-op pre-onboarding (no key yet) or
|
||||
/// when the service is already active.
|
||||
pub async fn ensure_activated(data_dir: &std::path::Path) {
|
||||
let identity_dir = identity_dir_from(data_dir);
|
||||
if !identity_dir.join("fips_key").exists() {
|
||||
return; // pre-onboarding: nothing to activate yet
|
||||
}
|
||||
if dial::is_service_active().await {
|
||||
return; // already up
|
||||
}
|
||||
tracing::info!("FIPS inactive — auto-activating (no user interaction needed)");
|
||||
if let Err(e) = config::install(&identity_dir).await {
|
||||
tracing::warn!("FIPS auto-activate: config install failed: {:#}", e);
|
||||
return;
|
||||
}
|
||||
if let Err(e) = service::activate(SERVICE_UNIT).await {
|
||||
tracing::warn!("FIPS auto-activate: service activate failed: {:#}", e);
|
||||
return;
|
||||
}
|
||||
tracing::info!("FIPS auto-activated");
|
||||
}
|
||||
|
||||
/// Spawn the FIPS supervisor: every 45s it (1) auto-activates FIPS if onboarding
|
||||
/// is done but the service is down — so it comes up with zero user interaction,
|
||||
/// and (2) keeps hole-punched paths to known federation peers warm, so on-demand
|
||||
/// dials land on FIPS instead of falling back to Tor. Warms peers concurrently
|
||||
/// so one slow/offline peer doesn't delay the rest.
|
||||
pub fn spawn_fips_supervisor(data_dir: std::path::PathBuf) {
|
||||
tokio::spawn(async move {
|
||||
let mut tick = tokio::time::interval(std::time::Duration::from_secs(45));
|
||||
loop {
|
||||
tick.tick().await;
|
||||
// Bring FIPS up on its own once onboarding has materialised the key.
|
||||
ensure_activated(&data_dir).await;
|
||||
if !dial::is_service_active().await {
|
||||
continue;
|
||||
}
|
||||
let nodes = crate::federation::load_nodes(&data_dir)
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
let mut handles = Vec::new();
|
||||
for node in nodes {
|
||||
if let Some(npub) = node.fips_npub.clone() {
|
||||
handles.push(tokio::spawn(async move { dial::warm_path(&npub).await }));
|
||||
}
|
||||
}
|
||||
for h in handles {
|
||||
let _ = h.await;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Systemd unit name supervised by archipelago.
|
||||
|
||||
@ -311,6 +311,11 @@ async fn main() -> Result<()> {
|
||||
electrs_status::spawn_status_cache();
|
||||
bitcoin_status::spawn_status_cache();
|
||||
|
||||
// FIPS supervisor: auto-activate FIPS after onboarding (no Activate button
|
||||
// needed) and keep hole-punched paths to federation peers warm so peer dials
|
||||
// land on FIPS (the preferred transport) instead of falling back to Tor.
|
||||
fips::spawn_fips_supervisor(config.data_dir.clone());
|
||||
|
||||
let startup_ms = startup_start.elapsed().as_millis();
|
||||
info!(
|
||||
"Server listening on http://{} (startup: {}ms)",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user