Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
6b1574e
Initial commit for adding NERSC IRI-API support alongside SFAPI for j…
davramov Mar 16, 2026
323b295
Adding an abstraction for _submit_job() and _wait_for_job() that use …
davramov Mar 16, 2026
1a0cbd8
moving NERSCLoginMethod(Enum) to the job_controller.py module
davramov Mar 17, 2026
0dce5c0
Removed NERSCLoginMethod(Enum) from nersc.py. Created a temporary tes…
davramov Mar 17, 2026
df621b0
Updating pytests
davramov Mar 17, 2026
0a585bd
Updating multires() method to use the generic _submit_job() and _wait…
davramov Mar 17, 2026
1e984dc
successfully ran reconstruction using the IRI-API
davramov Mar 30, 2026
6b986f2
removing token.py and moving the logic to get_globus_token.py
davramov Apr 1, 2026
c9a376d
moving get_globus_token.py to orchestration/globus/ to be used as a m…
davramov Apr 1, 2026
4e36253
Cleaning up nersc.py
davramov Apr 1, 2026
3a9ab2f
cleaning up old commented code
davramov Apr 1, 2026
d36af2f
Updating unit tests
davramov Apr 1, 2026
033da55
updating login script
davramov Apr 7, 2026
f6b5042
Rebasing and including segmentation flows as part of iri/sfapi abstra…
davramov Apr 7, 2026
c1d1b2c
commenting out petiole segmentation prune block for now, while testing
davramov Apr 13, 2026
701ff6e
Making reconstruction run as a task
davramov Apr 13, 2026
bb48f24
Making IRIAPI the default login method for now
davramov Apr 13, 2026
5960b13
adjusting queue name and account
davramov Apr 14, 2026
f3315eb
Making the IRI job submission read sbatch settings
davramov Apr 14, 2026
7b98595
Switching to debug queue/2 nodes for the IRI demo
davramov Apr 14, 2026
256c707
check globus token expiration before minting a new one. avoids race c…
davramov Apr 14, 2026
2ccd188
Fixing IRIAPI bugs, also commenting out Globus transfers for now
davramov Apr 14, 2026
99efa7a
removing IRIAPI client ID from nersc.py, since it is only used in glo…
davramov Apr 15, 2026
e77706c
Updating logger comments
davramov Apr 23, 2026
149f63d
connecting to AmSC MLflow service
davramov Apr 24, 2026
792c7f6
removing old commented code
davramov Apr 24, 2026
b5b31cf
updating pytest
davramov Apr 24, 2026
4de1c3e
linting
davramov Apr 24, 2026
79fc854
adjusting import in pytest to avoid error on github that did not occu…
davramov Apr 24, 2026
1eb2b9e
Getting NERSC reservations working with IRI API
davramov May 7, 2026
12c6853
Updating pytests
davramov May 7, 2026
bdeedd5
launch jobs with IRI API and a reservation
davramov May 7, 2026
fa1f942
fixing dino extra_flags bug
davramov May 7, 2026
439840e
fixing globus token race condition when jobs are launch simultaneously
davramov May 7, 2026
c5d9338
Adding frontend/prefect_runner.html
davramov May 10, 2026
2a0935b
updating html page with a timer and collapsible logs
davramov May 12, 2026
7bb7dae
updating config with confab reservation
davramov May 12, 2026
e7bd36b
Separated out general MLflow tests (non-specific to beamlines)
davramov May 20, 2026
69b8936
removing quotes around enum
davramov May 20, 2026
f759202
moving nersc iri/sf-api resource definitions to config (no longer glo…
davramov May 20, 2026
5678f72
Updating nersc.py to pull iri/sf-api parameters from the config, rath…
davramov May 20, 2026
1033161
removing redundant logging setLevel
davramov May 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,8 @@ PREFECT_API_KEY=<prefect_client_secret>
PUSHGATEWAY_URL=<url_of_pushgateway_server>
JOB_NAME=<jobname_for_pushgateway>
INSTANCE_LABEL=<label_for_pushgateway>
TILED_URI=<url_of_tiled_server>
TILED_URI=<url_of_tiled_server>
PATH_NERSC_CLIENT_ID=<path_to_nersc_client_id>
PATH_NERSC_PRI_KEY=<path_to_nersc_priv_key>
NERSC_USERNAME=<nersc_username>
AMSC_API_KEY=<amsc_api_key> # found here: https://profile.american-science-cloud.org/
59 changes: 42 additions & 17 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,37 +185,62 @@ mlflow:
staging:
tracking_uri: https://mlflow-staging.computing.als.lbl.gov
registry_uri: https://mlflow-staging.computing.als.lbl.gov
amsc:
tracking_uri: https://mlflow.american-science-cloud.org/
registry_uri: https://mlflow.american-science-cloud.org/
experiment_name: als-bl832-models

nersc_resources:
iri:
api_base_url: https://api.iri.nersc.gov
compute_resource: "compute"
# Perlmutter compute
perlmutter_compute: "94351904-6dba-4c16-b5cd-fbd280d8615b"
perlmutter_login: "e525a224-61c1-419f-9642-91168c792e39"
perlmutter_realtime: "3776417d-747c-4753-895a-6323c17b9c98"
perlmutter_job_submit: "3cf3c048-855e-4dd8-a189-065a483954bb"
# Storage
scratch: "43d8f6c0-f900-48ce-b267-73714103f4ac"
homes: "65b28619-c3b6-4942-8da1-044a3b3a2a9e"
common: "7e07a611-f927-4a39-a44d-b1d6e307accd"
cfs: "59e80c79-4dfd-4c53-9c07-7405685fcd37"
archive: "f4916c65-9001-49c2-b0bf-6fe4276b564c"
# Services
globus: "0a207df3-4bec-45b8-9060-13505d269da9"
dtns: "a762cbdc-af7a-4b2b-9463-67f0189dd2ae"
sfapi:
api_base_url: https://api.nersc.gov/api/v1.2

hpc_submission_settings832:
# ── RECON + MULTIRES SETTINGS ───────────────────────────────────────────────
nersc_reconstruction:
# ── SLURM resource allocation ─────────────────────────────────────────────
qos: realtime
account: als
reservation: "_CAP_TOMO_MOON_CPU"
qos: regular
account: amsc006
reservation: "_CAP_SYNAPS_LIVEDEMO_CPU2"
num_nodes: 16
cpus-per-task: 128
walltime: "0:30:00"
nersc_multiresolution:
# ── SLURM resource allocation ─────────────────────────────────────────────
qos: realtime
qos: debug
account: als
reservation: ""
reservation: "_CAP_SYNAPS_LIVEDEMO_CPU2"
cpus-per-task: 128
walltime: "0:15:00"

# ── PETIOLE SEGMENTATION SETTINGS ───────────────────────────────────────────
nersc_segmentation_sam3:
# ── SLURM resource allocation ─────────────────────────────────────────────
qos: regular
account: als
account: amsc006
constraint: gpu
reservation: ""
num_nodes: 4
reservation: "_CAP_SYNAPS_LIVEDEMO_GPU2"
num_nodes: 32
ntasks-per-node: 1
gpus-per-node: 4
cpus-per-task: 128
walltime: "00:59:00"
walltime: "00:30:00"
# ── Inference parameters ──────────────────────────────────────────────────
script_name: "src/inference_v6.py"
batch_size: 1
Expand All @@ -239,15 +264,15 @@ hpc_submission_settings832:
nersc_segmentation_dinov3:
# ── SLURM resource allocation ─────────────────────────────────────────────
qos: regular
account: als
account: amsc006
constraint: gpu
reservation: ""
num_nodes: 4
reservation: "_CAP_SYNAPS_LIVEDEMO_GPU2"
num_nodes: 8
ntasks-per-node: 1
nproc_per_node: 4
gpus-per-node: 4
cpus-per-task: 128
walltime: "00:59:00"
walltime: "00:30:00"
# ── Inference parameters ──────────────────────────────────────────────────
script_name: "src.inference_dino_v1"
batch_size: 4
Expand All @@ -259,13 +284,13 @@ hpc_submission_settings832:
nersc_combine_segmentations:
# ── SLURM resource allocation ─────────────────────────────────────────────
qos: regular
account: als
account: amsc006
constraint: cpu
reservation: ""
reservation: "_CAP_SYNAPS_LIVEDEMO_CPU2"
num_nodes: 4
ntasks: 128
cpus-per-task: 1
walltime: "01:00:00"
walltime: "00:30:00"
# ── Combination parameters ────────────────────────────────────────────────
script_name: "src.combine_sam_dino_v3"
dilate_px: 5
Expand All @@ -280,7 +305,7 @@ hpc_submission_settings832:
qos: regular
account: als
constraint: gpu
reservation: "_CAP_TOMO_MOON_GPU"
reservation: "_CAP_TOMO_MOON_GPU2"
num_nodes: 4
ntasks-per-node: 1
nproc_per_node: 4
Expand Down
Loading
Loading