@@ -186,10 +186,15 @@ class SlurmScriptTemplate(TypedDict):
186186 ],
187187 "write_to_json" : [
188188 '\n json_path="{log_dir}/{model_name}.$SLURM_JOB_ID/{model_name}.$SLURM_JOB_ID.json"' ,
189- 'jq --arg server_addr "$server_address" \\ ' ,
190- " '. + {{\" server_address\" : $server_addr}}' \\ " ,
191- ' "$json_path" > temp.json \\ ' ,
192- ' && mv temp.json "$json_path"' ,
189+ 'tmp_json="${{json_path}}.tmp.$$"' ,
190+ "for _attempt in 1 2 3 4 5; do" ,
191+ ' jq --arg server_addr "$server_address" \\ ' ,
192+ " '. + {{\" server_address\" : $server_addr}}' \\ " ,
193+ ' "$json_path" > "$tmp_json" \\ ' ,
194+ ' && mv "$tmp_json" "$json_path" \\ ' ,
195+ " && break" ,
196+ " sleep 2" ,
197+ "done" ,
193198 ],
194199 "launch_cmd" : {
195200 "vllm" : [
@@ -303,10 +308,15 @@ class BatchModelLaunchScriptTemplate(TypedDict):
303308 "write_to_json" : [
304309 "het_job_id=$(($SLURM_JOB_ID+{het_group_id}))" ,
305310 'json_path="{log_dir}/{slurm_job_name}.$het_job_id/{model_name}.$het_job_id.json"' ,
306- 'jq --arg server_addr "$server_address" \\ ' ,
307- " '. + {{\" server_address\" : $server_addr}}' \\ " ,
308- ' "$json_path" > temp_{model_name}.json \\ ' ,
309- ' && mv temp_{model_name}.json "$json_path"\n ' ,
311+ 'tmp_json="${{json_path}}.tmp.$$"' ,
312+ "for _attempt in 1 2 3 4 5; do" ,
313+ ' jq --arg server_addr "$server_address" \\ ' ,
314+ " '. + {{\" server_address\" : $server_addr}}' \\ " ,
315+ ' "$json_path" > "$tmp_json" \\ ' ,
316+ ' && mv "$tmp_json" "$json_path" \\ ' ,
317+ " && break" ,
318+ " sleep 2" ,
319+ "done\n " ,
310320 ],
311321 "container_command" : f"{ CONTAINER_MODULE_NAME } exec --nv --containall {{image_path}} \\ " ,
312322 "launch_cmd" : {
0 commit comments