Skip to content

Commit 0d833d7

Browse files
Openbind processing update (#352)

* Move output to processing directory
* Take forward most recent processing batch
* Integrate back with XCE via datasource
* Add option for user specified params
* Improve logging
1 parent 728613e commit 0d833d7

4 files changed

Lines changed: 114 additions & 76 deletions

File tree

src/dlstbx/services/trigger_xchem.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ class PanDDAParameters(pydantic.BaseModel):
5757
automatic: Optional[bool] = False
5858
comment: Optional[str] = None
5959
scaling_id: list[int]
60-
timeout: float = pydantic.Field(default=60, alias="timeout-minutes")
60+
timeout: float = pydantic.Field(default=120, alias="timeout-minutes")
6161
backoff_delay: float = pydantic.Field(default=45, alias="backoff-delay")
6262
backoff_max_try: int = pydantic.Field(default=30, alias="backoff-max-try")
6363
backoff_multiplier: float = pydantic.Field(default=1.1, alias="backoff-multiplier")
@@ -69,7 +69,7 @@ class PanDDA_PostParameters(pydantic.BaseModel):
6969
automatic: Optional[bool] = False
7070
comment: Optional[str] = None
7171
scaling_id: list[int]
72-
processed_directory: str
72+
processing_directory: str
7373
timeout: float = pydantic.Field(default=60, alias="timeout-minutes")
7474

7575

@@ -242,7 +242,7 @@ def trigger_pandda_xchem(
242242
proposal_number = proposal.proposalNumber
243243
proposal_string = proposal_code + proposal_number
244244

245-
# TEMPORARY FILTER
245+
# TEMPORARY PROPOSAL FILTER
246246
allowed_proposals = ["lb42888", "sw44107", "lb36049"]
247247

248248
# 0. Check that this is an XChem expt & locate .SQLite database
@@ -265,7 +265,7 @@ def trigger_pandda_xchem(
265265
location = int(query.one()[1])
266266
container_code = query.one()[2]
267267

268-
# Get user defined spacegroup
268+
# Get the user defined spacegroup
269269
query = (
270270
session.query(Crystal.spaceGroup)
271271
.join(BLSample, BLSample.crystalId == Crystal.crystalId)
@@ -296,7 +296,7 @@ def trigger_pandda_xchem(
296296
# match_yaml = expt_yaml
297297
self.log.info(f"Found user yaml for dtag {dtag} at {yaml_file}")
298298

299-
# account for potentially multiple labxchem visits for a single target
299+
# account for potentially multiple labxchem visits for a single target in proposal
300300
if len(match_dirs) == 1:
301301
match_dir = match_dirs[0]
302302
elif len(match_dirs) > 1:
@@ -365,6 +365,7 @@ def trigger_pandda_xchem(
365365
return {"success": True}
366366
else:
367367
xchem_visit_dir = match_dir
368+
processing_dir = xchem_visit_dir / "processing"
368369
# user_settings = match_yaml["autoprocessing"]
369370

370371
if xchem_visit_dir:
@@ -377,12 +378,7 @@ def trigger_pandda_xchem(
377378
)
378379
return {"success": True}
379380

380-
processing_dir = xchem_visit_dir / "processing"
381-
db = processing_dir / "database" / "soakDBDataFile.sqlite"
382-
processed_dir = xchem_visit_dir / "processed"
383-
384381
# 1. Trigger when all upstream pipelines & related dimple jobs have finished
385-
386382
program_list = [
387383
"xia2 dials",
388384
"xia2 3dii",
@@ -560,6 +556,11 @@ def trigger_pandda_xchem(
560556

561557
if not df_filteredbysg.empty:
562558
df = df_filteredbysg
559+
n_success_upstream = len(df)
560+
self.log.info(
561+
f"There are {n_success_upstream} successful upstream jobs (excl fast-dp) in the user defined spacegroup {user_sg} \
562+
selecting the best one based on I/sigI*completeness * #unique reflections, from the most recent processing batch"
563+
)
563564

564565
# rank datasets by I/sigI*completeness*# unique reflections
565566
df["heuristic"] = (
@@ -571,7 +572,7 @@ def trigger_pandda_xchem(
571572
df = df[["autoProcScalingId", "heuristic"]].copy()
572573
scaling_ids = df["autoProcScalingId"].tolist()
573574

574-
# find associated dimple jobs from scaling_ids
575+
# find associated dimple jobs from scaling_ids, take most recent batch if reprocessing
575576
query = (
576577
(
577578
session.query(
@@ -610,17 +611,21 @@ def trigger_pandda_xchem(
610611
)
611612
return {"success": True}
612613

613-
n_success_upstream = len(df)
614-
n_success_dimple = len(df2)
614+
# mark a new batch whenever the gap is >= 12 hours
615+
df2 = df2.sort_values("processingStartTime").reset_index(drop=True)
616+
df2["time_diff"] = df2["processingStartTime"].diff()
617+
df2["batch"] = (df2["time_diff"] >= pd.Timedelta(hours=12)).cumsum() + 1
618+
recent_batch = df2[df2["batch"] == df2["batch"].max()].copy()
615619

616-
self.log.info(
617-
f"There are {n_success_upstream} successful upstream jobs (excl fast-dp) & {n_success_dimple} successful dimple jobs, \
618-
selecting the best one based on heuristic: I/sigI*completeness * #unique reflections"
619-
)
620-
621-
df2["parameterValue"] = pd.to_numeric(df2["parameterValue"]).astype("Int64")
620+
recent_batch["parameterValue"] = pd.to_numeric(
621+
recent_batch["parameterValue"]
622+
).astype("Int64")
622623
df3 = pd.merge(
623-
df2, df, left_on="parameterValue", right_on="autoProcScalingId", how="inner"
624+
recent_batch,
625+
df,
626+
left_on="parameterValue",
627+
right_on="autoProcScalingId",
628+
how="inner",
624629
).sort_values("heuristic", ascending=False)
625630

626631
if df3.empty:
@@ -653,16 +658,18 @@ def trigger_pandda_xchem(
653658

654659
# 2. Read XChem SQLite database for ligand info
655660

661+
db_master = processing_dir / "database" / "soakDBDataFile.sqlite"
662+
656663
try:
657-
conn = sqlite3.connect(f"file:{db}?mode=ro", uri=True, timeout=10)
664+
conn = sqlite3.connect(f"file:{db_master}?mode=ro", uri=True, timeout=10)
658665
df = pd.read_sql_query(
659666
f"SELECT * from mainTable WHERE Puck = '{container_code}' AND PuckPosition = {location} AND CrystalName = '{dtag}'",
660667
conn,
661668
)
662669

663670
except Exception as e:
664671
self.log.info(
665-
f"Exiting PanDDA2/Pipedream trigger: Exception whilst reading ligand information from {db} for dtag {dtag}, dcid {dcid}: {e}"
672+
f"Exiting PanDDA2/Pipedream trigger: Exception whilst reading ligand information from {db_master} for dtag {dtag}, dcid {dcid}: {e}"
666673
)
667674
return {"success": True}
668675

@@ -672,7 +679,7 @@ def trigger_pandda_xchem(
672679

673680
if len(df) != 1:
674681
self.log.info(
675-
f"Exiting PanDDA2/Pipedream trigger: Unique row in .sqlite for dtag {dtag}, puck {container_code}, puck position {location} cannot be found in {db}, skipping dcid {dcid}"
682+
f"Exiting PanDDA2/Pipedream trigger: Unique row in .sqlite for dtag {dtag}, puck {container_code}, puck position {location} cannot be found in {db_master}, skipping dcid {dcid}"
676683
)
677684
return {"success": True}
678685

@@ -697,7 +704,8 @@ def trigger_pandda_xchem(
697704
return {"success": True}
698705

699706
# 3. Create dataset directory structure
700-
analysis_dir = processed_dir / "analysis"
707+
auto_dir = processing_dir / "auto"
708+
analysis_dir = auto_dir / "analysis"
701709
pandda_dir = analysis_dir / "pandda2"
702710
model_dir = pandda_dir / "model_building"
703711
dataset_dir = model_dir / dtag
@@ -736,16 +744,16 @@ def trigger_pandda_xchem(
736744
smi_file.write(CompoundSMILES)
737745

738746
# 4. Job launch logic
739-
740747
recipe_parameters = {
741748
"dcid": dcid,
742-
"processed_directory": str(processed_dir),
749+
"xchem_visit_dir": str(xchem_visit_dir),
750+
"processing_directory": str(processing_dir),
743751
"model_directory": str(model_dir),
744752
"dtag": dtag,
745753
"n_datasets": 1,
746754
"scaling_id": scaling_id,
747755
"comparator_threshold": comparator_threshold,
748-
"database_path": str(db),
756+
"database_path": str(db_master),
749757
"upstream_mtz": pathlib.Path(upstream_mtz).parts[-1],
750758
"smiles": str(CompoundSMILES),
751759
}
@@ -769,6 +777,7 @@ def trigger_pandda_xchem(
769777

770778
with open(model_dir / ".batch.json", "w") as f:
771779
json.dump(dataset_list, f)
780+
# cannot pass as ispyb_parameter
772781

773782
self.log.info(
774783
f"{dataset_count} = comparator dataset threshold of {comparator_threshold}, launching PanDDA2 array job"
@@ -821,7 +830,7 @@ def trigger_pandda_xchem_post(
821830

822831
dcid = parameters.dcid
823832
scaling_id = parameters.scaling_id[0]
824-
processed_directory = pathlib.Path(parameters.processed_directory)
833+
processing_directory = pathlib.Path(parameters.processing_directory)
825834

826835
_, ispyb_info = dlstbx.ispybtbx.ispyb_filter({}, {"ispyb_dcid": dcid}, session)
827836
visit = ispyb_info.get("ispyb_visit", "")
@@ -868,7 +877,7 @@ def trigger_pandda_xchem_post(
868877

869878
recipe_parameters = {
870879
"dcid": dcid, #
871-
"processed_directory": str(processed_directory),
880+
"processing_directory": str(processing_directory),
872881
"scaling_id": scaling_id,
873882
}
874883

src/dlstbx/wrapper/pandda_post.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,16 @@ def run(self):
1616
)
1717

1818
params = self.recwrap.recipe_step["job_parameters"]
19-
processed_dir = Path(params.get("processed_directory"))
20-
analysis_dir = processed_dir / "analysis"
19+
processing_dir = Path(params.get("processing_directory"))
20+
auto_dir = processing_dir / "auto"
21+
analysis_dir = auto_dir / "analysis"
2122
pandda_dir = analysis_dir / "pandda2"
2223
model_dir = pandda_dir / "model_building"
23-
auto_panddas_dir = Path(pandda_dir / "panddas")
24+
panddas_dir = Path(pandda_dir / "panddas")
2425

2526
# -------------------------------------------------------
2627
pandda2_command = f"source /dls_sw/i04-1/software/PanDDA2/venv/bin/activate; \
27-
python -u /dls_sw/i04-1/software/PanDDA2/scripts/postrun.py --data_dirs={model_dir} --out_dir={auto_panddas_dir} --use_ligand_data=False --debug=True --local_cpus=1 > {auto_panddas_dir / 'pandda2_postrun.log'}"
28+
python -u /dls_sw/i04-1/software/PanDDA2/scripts/postrun.py --data_dirs={model_dir} --out_dir={panddas_dir} --use_ligand_data=False --debug=True --local_cpus=1 > {panddas_dir / 'pandda2_postrun.log'}"
2829

2930
self.log.info("Running PanDDA2 command: {pandda2_command}")
3031

@@ -34,7 +35,7 @@ def run(self):
3435
shell=True,
3536
capture_output=True,
3637
text=True,
37-
cwd=auto_panddas_dir,
38+
cwd=panddas_dir,
3839
check=True,
3940
timeout=params.get("timeout-minutes") * 60,
4041
)

src/dlstbx/wrapper/pandda_xchem.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import numpy as np
1111
import yaml
1212

13-
import dlstbx.util.symlink
1413
from dlstbx.util.mvs.helpers import (
1514
find_residue_by_name,
1615
save_cropped_map,
@@ -33,8 +32,11 @@ def run(self):
3332
params = self.recwrap.recipe_step["job_parameters"]
3433

3534
# database_path = Path(params.get("database_path"))
36-
processed_dir = Path(params.get("processed_directory"))
37-
analysis_dir = Path(processed_dir / "analysis")
35+
xchem_visit_dir = Path(params.get("xchem_visit_dir"))
36+
user_yaml = xchem_visit_dir / ".user.yaml"
37+
processing_dir = Path(params.get("processing_directory"))
38+
auto_dir = processing_dir / "auto"
39+
analysis_dir = Path(auto_dir / "analysis")
3840
pandda_dir = analysis_dir / "pandda2"
3941
model_dir = pandda_dir / "model_building"
4042
panddas_dir = Path(pandda_dir / "panddas")
@@ -53,14 +55,6 @@ def run(self):
5355
dataset_dir = model_dir / dtag
5456
compound_dir = dataset_dir / "compound"
5557

56-
if pipeline_final_params := params.get("pipeline-final", []):
57-
final_directory = Path(pipeline_final_params["path"])
58-
final_directory.mkdir(parents=True, exist_ok=True)
59-
if params.get("create_symlink"):
60-
dlstbx.util.symlink.create_parent_symlink(
61-
final_directory, params.get("create_symlink")
62-
)
63-
6458
self.log.info(f"Processing dtag: {dtag}")
6559

6660
smiles = params.get("smiles")
@@ -127,8 +121,11 @@ def run(self):
127121
pandda2_log = dataset_pdir / "pandda2.log"
128122
attachments.extend([pandda2_log, ligand_cif])
129123

124+
args_string = self.get_pandda_settings(
125+
user_yaml
126+
) # user specified pandda parameters
130127
pandda2_command = f"source {PANDDA_2_DIR}/venv/bin/activate; \
131-
python -u /dls_sw/i04-1/software/PanDDA2/scripts/process_dataset.py --data_dirs={model_dir} --out_dir={panddas_dir} --dtag={dtag} --use_ligand_data=False --local_cpus=1"
128+
python -u /dls_sw/i04-1/software/PanDDA2/scripts/process_dataset.py --data_dirs={model_dir} --out_dir={panddas_dir} --dtag={dtag} --use_ligand_data=False --local_cpus=4 {args_string}"
132129

133130
try:
134131
result = subprocess.run(
@@ -257,6 +254,7 @@ def run(self):
257254
return False
258255

259256
scores = {}
257+
self.log.info(f"Running Ligand Score routine for {build_dir}")
260258

261259
for build_path in builds:
262260
ligand_score = build_dir / f"{build_path.stem}.txt"
@@ -268,8 +266,6 @@ def run(self):
268266
score_command = f"source {PANDDA_2_DIR}/venv/bin/activate; \
269267
python {PANDDA_2_DIR}/scripts/ligand_score.py --mtz_path={mtz_file} --zmap_path={z_map} --ligand_id={ligand_id} --structure_path={build_path} --out_path={ligand_score}"
270268

271-
self.log.info(f"Running Ligand Score command: {score_command}")
272-
273269
try:
274270
os.system(score_command)
275271

@@ -287,9 +283,8 @@ def run(self):
287283
self.log.info(f"Best ligand score for {dtag} = {score}")
288284

289285
# -------------------------------------------------------
290-
# Best build merging
286+
# Merge the protein structure with best fitted ligand -> pandda model
291287

292-
# Merge the protein structure with ligand -> pandda model
293288
protein_st_file = dataset_pdir / f"{dtag}-pandda-input.pdb"
294289
ligand_st_file = best_build_path
295290
pandda_model = modelled_dir / f"{dtag}-pandda-model.pdb"
@@ -501,13 +496,23 @@ def get_contact_chain(self, protein_st, ligand_st):
501496

502497
return min(chain_counts, key=lambda _x: chain_counts[_x])
503498

499+
def get_pandda_settings(self, yaml_file):
500+
with open(yaml_file, "r") as file:
501+
expt_yaml = yaml.load(file, Loader=yaml.SafeLoader)
502+
settings = expt_yaml["autoprocessing"]["pandda"]
503+
if settings:
504+
args_string = " ".join(f"--{k}={v}" for k, v in settings.items())
505+
else:
506+
args_string = ""
507+
return args_string
508+
504509
def send_attachments_to_ispyb(self, attachments, batch):
505510
if batch: # synchweb attachments not supported for array job processing
506511
return
507512
for f in attachments:
508513
if f.exists():
509514
if f.suffix == ".html":
510-
file_type = "Result"
515+
file_type = "Result" # 'Graph', 'Debug'
511516
importance_rank = 1
512517
elif f.suffix == ".ccp4":
513518
file_type = "Result"

0 commit comments

Comments (0)