fix: pull timeout covers entire operation, swap registry priority

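The 60s timeout now wraps both the stderr progress reader and the wait.
Previously it wrapped only the wait, which ran after the stderr reader
loop, so a hung pull blocked in the reader and was never killed.
23.182.128.160:3000 is now the primary registry because git.tx1138.com
is unreachable; git.tx1138.com remains as a lower-priority fallback.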

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian 2026-04-12 10:18:24 -04:00
parent 877b3e4168
commit 2d11f262dd
3 changed files with 37 additions and 35 deletions

View File

@ -597,42 +597,44 @@ impl RpcHandler {
.spawn()
.context("Failed to start image pull")?;
if let Some(stderr) = child.stderr.take() {
let reader = BufReader::new(stderr);
let mut lines = reader.lines();
let pkg_id = package_id.to_string();
let state_mgr = self.state_manager.clone();
while let Ok(Some(line)) = lines.next_line().await {
if let Some((downloaded, total)) = parse_pull_progress(&line) {
Self::update_install_progress(&state_mgr, &pkg_id, downloaded, total)
.await;
}
}
}
// Timeout primary pull after 60s — if registry is down, fail fast to fallback
let timed_out;
let status = match tokio::time::timeout(
// Wrap the entire pull (stderr progress + wait) in a 60s timeout.
// If the registry is unreachable, the pull hangs on DNS/TCP and the
// stderr reader never returns — so the timeout must cover everything.
let pull_result = tokio::time::timeout(
std::time::Duration::from_secs(60),
child.wait(),
async {
if let Some(stderr) = child.stderr.take() {
let reader = BufReader::new(stderr);
let mut lines = reader.lines();
let pkg_id = package_id.to_string();
let state_mgr = self.state_manager.clone();
while let Ok(Some(line)) = lines.next_line().await {
if let Some((downloaded, total)) = parse_pull_progress(&line) {
Self::update_install_progress(&state_mgr, &pkg_id, downloaded, total)
.await;
}
}
}
child.wait().await
},
)
.await
{
Ok(result) => {
timed_out = false;
result.context("Failed to wait for image pull")?
.await;
let primary_failed = match pull_result {
Ok(Ok(status)) => !status.success(),
Ok(Err(e)) => {
tracing::warn!("Image pull process error: {}", e);
true
}
Err(_) => {
// Timeout: kill the stuck process
tracing::warn!("Image pull timed out after 60s: {}", docker_image);
let _ = child.kill().await;
timed_out = true;
// Wait for process to actually exit after kill
child.wait().await.context("Failed to wait after kill")?
let _ = child.wait().await; // reap zombie
true
}
};
if timed_out || !status.success() {
if primary_failed {
// Try all configured fallback registries dynamically
match crate::container::registry::pull_from_registries(
&self.config.data_dir,

View File

@ -44,16 +44,16 @@ impl Default for RegistryConfig {
Self {
registries: vec![
Registry {
url: "git.tx1138.com/lfg2025".to_string(),
url: "23.182.128.160:3000/lfg2025".to_string(),
name: "Archipelago Primary".to_string(),
tls_verify: true,
tls_verify: false,
enabled: true,
priority: 0,
},
Registry {
url: "23.182.128.160:3000/lfg2025".to_string(),
name: "Archipelago Fallback".to_string(),
tls_verify: false,
url: "git.tx1138.com/lfg2025".to_string(),
name: "Archipelago Legacy".to_string(),
tls_verify: true,
enabled: true,
priority: 10,
},

View File

@ -2132,8 +2132,8 @@ mkdir -p /mnt/target/var/lib/archipelago/config
cat > /mnt/target/var/lib/archipelago/config/registries.json <<'DYNREG'
{
"registries": [
{"url": "git.tx1138.com/lfg2025", "name": "Archipelago Primary", "tls_verify": true, "enabled": true, "priority": 0},
{"url": "23.182.128.160:3000/lfg2025", "name": "Archipelago Fallback", "tls_verify": false, "enabled": true, "priority": 10}
{"url": "23.182.128.160:3000/lfg2025", "name": "Archipelago Primary", "tls_verify": false, "enabled": true, "priority": 0},
{"url": "git.tx1138.com/lfg2025", "name": "Archipelago Legacy", "tls_verify": true, "enabled": true, "priority": 10}
]
}
DYNREG