Skip to content

Commit 1073a40

Browse files
committed
Reduce CI retry amplification and harden relay-flaky tests
1 parent 470ba26 commit 1073a40

3 files changed

Lines changed: 55 additions & 17 deletions

File tree

.github/workflows/lint_and_test.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ jobs:
3232
- name: Run tests with retries
3333
env:
3434
RUST_MIN_STACK: 8388608
35-
# Retries configured in nextest.toml (no CLI override - P2P tests get 15 retries)
35+
# Retries are configured per test in nextest.toml; no --retries flag is passed on the command line.
3636
# --no-fail-fast: run all tests even when some fail, so we see full results
3737
run: cargo nextest run --test-threads=1 --no-fail-fast
38-

nextest.toml

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,13 @@ test-threads = 1
1414
# Mark tests slow after 60s (informational)
1515
slow-timeout = "60s"
1616

17-
# P2P tests need many retries for flaky peer discovery - keep retrying until they pass
17+
# P2P tests are flaky in CI, but very high retry counts can stretch failures to >1h.
18+
# Keep retries moderate so truly broken runs fail fast.
1819
[[profile.default.overrides]]
19-
filter = 'test(test_replicate_group) | test(test_refresh_joined_group) | test(test_join_group)'
20-
retries = { backoff = "exponential", count = 15, delay = "15s", max-delay = "90s", jitter = true }
20+
filter = 'test(test_replicate_group) | test(test_join_group)'
21+
retries = { backoff = "exponential", count = 4, delay = "8s", max-delay = "45s", jitter = true }
22+
23+
[[profile.default.overrides]]
24+
filter = 'test(test_refresh_joined_group)'
25+
retries = { backoff = "exponential", count = 2, delay = "8s", max-delay = "30s", jitter = true }
26+
test-timeout = "300s"

src/lib.rs

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -198,13 +198,46 @@ mod tests {
198198

199199
assert_eq!(resp.groups.len(), 0);
200200

201-
let req = test::TestRequest::post()
202-
.uri("/api/groups")
203-
.set_json(RequestName {
204-
name: "example".to_string(),
205-
})
206-
.to_request();
207-
let group: SnowbirdGroup = test::call_and_read_body_json(&app, req).await;
201+
// Group creation can fail transiently in CI when relay discovery is unstable.
202+
// Retry only this step so basic_test is robust but still fails fast on real errors.
203+
let mut group_opt: Option<SnowbirdGroup> = None;
204+
let mut last_create_group_error = String::new();
205+
for attempt in 1..=6 {
206+
let req = test::TestRequest::post()
207+
.uri("/api/groups")
208+
.set_json(RequestName {
209+
name: "example".to_string(),
210+
})
211+
.to_request();
212+
let resp = test::call_service(&app, req).await;
213+
let resp_status = resp.status();
214+
let body = test::read_body(resp).await;
215+
216+
if resp_status.is_success() {
217+
match serde_json::from_slice::<SnowbirdGroup>(&body) {
218+
Ok(group) => {
219+
group_opt = Some(group);
220+
break;
221+
}
222+
Err(e) => {
223+
last_create_group_error = format!("invalid success payload: {e}; body={body:?}");
224+
}
225+
}
226+
} else {
227+
let body_text = String::from_utf8_lossy(&body).to_string();
228+
last_create_group_error = format!("status={resp_status}, body={body_text}");
229+
if !body_text.contains("couldn't look up relay") {
230+
break;
231+
}
232+
}
233+
234+
if attempt < 6 {
235+
tokio::time::sleep(Duration::from_secs(3)).await;
236+
}
237+
}
238+
let group = group_opt.expect(&format!(
239+
"Creating group failed after retries: {last_create_group_error}"
240+
));
208241

209242
assert_eq!(group.name, Some("example".to_string()));
210243

@@ -920,7 +953,7 @@ mod tests {
920953
// Retry refresh until P2P replication converges: the refresh must succeed AND
921954
// the response must contain the creator's repo (by name) with the uploaded file.
922955
// Repo names and file lists propagate via DHT and may lag behind the initial join.
923-
let mut refresh_retries = 30;
956+
let mut refresh_retries = 12;
924957
let refresh_data: serde_json::Value = loop {
925958
let refresh_req = test::TestRequest::post()
926959
.uri(&format!("/api/groups/{}/refresh", group.id()))
@@ -940,20 +973,20 @@ mod tests {
940973
break data;
941974
}
942975
log::warn!("Refresh succeeded but repo data not yet propagated (attempt {})",
943-
30 - refresh_retries + 1);
976+
12 - refresh_retries + 1);
944977
} else {
945978
log::warn!("Refresh failed (attempt {}): status={}, body={:?}",
946-
30 - refresh_retries + 1,
979+
12 - refresh_retries + 1,
947980
resp.status(),
948981
test::read_body(resp).await
949982
);
950983
}
951984

952985
refresh_retries -= 1;
953986
if refresh_retries == 0 {
954-
panic!("Refresh did not converge after 30 attempts.");
987+
panic!("Refresh did not converge after 12 attempts.");
955988
}
956-
tokio::time::sleep(Duration::from_secs(5)).await;
989+
tokio::time::sleep(Duration::from_secs(4)).await;
957990
};
958991

959992
assert_eq!(refresh_data["status"], "success", "First refresh status should be success");

0 commit comments

Comments
 (0)