fix: image pull timeout actually triggers fallback

The previous timeout handler returned ExitStatus::default(), which reports
success on Linux, so the fallback never triggered. The timeout path now
kills the process, awaits its exit, and forces the fallback path.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian 2026-04-12 10:08:22 -04:00
parent 96ca70e7a4
commit bcf7ac1839

View File

@@ -612,19 +612,27 @@ impl RpcHandler {
} }
// Timeout primary pull after 60s — if registry is down, fail fast to fallback // Timeout primary pull after 60s — if registry is down, fail fast to fallback
let status = tokio::time::timeout( let timed_out;
let status = match tokio::time::timeout(
std::time::Duration::from_secs(60), std::time::Duration::from_secs(60),
child.wait(), child.wait(),
) )
.await .await
.unwrap_or_else(|_| { {
// Timeout: kill the stuck process Ok(result) => {
let _ = child.kill(); timed_out = false;
tracing::warn!("Image pull timed out after 60s: {}", docker_image); result.context("Failed to wait for image pull")?
Ok(std::process::ExitStatus::default()) }
}) Err(_) => {
.context("Failed to wait for image pull")?; // Timeout: kill the stuck process
if !status.success() { tracing::warn!("Image pull timed out after 60s: {}", docker_image);
let _ = child.kill().await;
timed_out = true;
// Wait for process to actually exit after kill
child.wait().await.context("Failed to wait after kill")?
}
};
if timed_out || !status.success() {
// Try all configured fallback registries dynamically // Try all configured fallback registries dynamically
match crate::container::registry::pull_from_registries( match crate::container::registry::pull_from_registries(
&self.config.data_dir, &self.config.data_dir,