Skip to content

Commit 0d833d7

Browse files
Openbind processing update (#352)

* Move output to processing directory
* Take forward most recent processing batch
* Integrate back with XCE via datasource
* Add option for user specified params
* Improve logging
1 parent 728613e commit 0d833d7

4 files changed

Lines changed: 114 additions & 76 deletions

File tree

src/dlstbx/services/trigger_xchem.py

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ class PanDDAParameters(pydantic.BaseModel):
5757
automatic: Optional[bool] = False
5858
comment: Optional[str] = None
5959
scaling_id: list[int]
60-
timeout: float = pydantic.Field(default=60, alias="timeout-minutes")
60+
timeout: float = pydantic.Field(default=120, alias="timeout-minutes")
6161
backoff_delay: float = pydantic.Field(default=45, alias="backoff-delay")
6262
backoff_max_try: int = pydantic.Field(default=30, alias="backoff-max-try")
6363
backoff_multiplier: float = pydantic.Field(default=1.1, alias="backoff-multiplier")
@@ -69,7 +69,7 @@ class PanDDA_PostParameters(pydantic.BaseModel):
6969
automatic: Optional[bool] = False
7070
comment: Optional[str] = None
7171
scaling_id: list[int]
72-
processed_directory: str
72+
processing_directory: str
7373
timeout: float = pydantic.Field(default=60, alias="timeout-minutes")
7474

7575

@@ -242,7 +242,7 @@ def trigger_pandda_xchem(
242242
proposal_number = proposal.proposalNumber
243243
proposal_string = proposal_code + proposal_number
244244

245-
# TEMPORARY FILTER
245+
# TEMPORARY PROPOSAL FILTER
246246
allowed_proposals = ["lb42888", "sw44107", "lb36049"]
247247

248248
# 0. Check that this is an XChem expt & locate .SQLite database
@@ -265,7 +265,7 @@ def trigger_pandda_xchem(
265265
location = int(query.one()[1])
266266
container_code = query.one()[2]
267267

268-
# Get user defined spacegroup
268+
# Get the user defined spacegroup
269269
query = (
270270
session.query(Crystal.spaceGroup)
271271
.join(BLSample, BLSample.crystalId == Crystal.crystalId)
@@ -296,7 +296,7 @@ def trigger_pandda_xchem(
296296
# match_yaml = expt_yaml
297297
self.log.info(f"Found user yaml for dtag {dtag} at {yaml_file}")
298298

299-
# account for potentially multiple labxchem visits for a single target
299+
# account for potentially multiple labxchem visits for a single target in proposal
300300
if len(match_dirs) == 1:
301301
match_dir = match_dirs[0]
302302
elif len(match_dirs) > 1:
@@ -365,6 +365,7 @@ def trigger_pandda_xchem(
365365
return {"success": True}
366366
else:
367367
xchem_visit_dir = match_dir
368+
processing_dir = xchem_visit_dir / "processing"
368369
# user_settings = match_yaml["autoprocessing"]
369370

370371
if xchem_visit_dir:
@@ -377,12 +378,7 @@ def trigger_pandda_xchem(
377378
)
378379
return {"success": True}
379380

380-
processing_dir = xchem_visit_dir / "processing"
381-
db = processing_dir / "database" / "soakDBDataFile.sqlite"
382-
processed_dir = xchem_visit_dir / "processed"
383-
384381
# 1. Trigger when all upstream pipelines & related dimple jobs have finished
385-
386382
program_list = [
387383
"xia2 dials",
388384
"xia2 3dii",
@@ -560,6 +556,11 @@ def trigger_pandda_xchem(
560556

561557
if not df_filteredbysg.empty:
562558
df = df_filteredbysg
559+
n_success_upstream = len(df)
560+
self.log.info(
561+
f"There are {n_success_upstream} successful upstream jobs (excl fast-dp) in the user defined spacegroup {user_sg} \
562+
selecting the best one based on I/sigI*completeness * #unique reflections, from the most recent processing batch"
563+
)
563564

564565
# rank datasets by I/sigI*completeness*# unique reflections
565566
df["heuristic"] = (
@@ -571,7 +572,7 @@ def trigger_pandda_xchem(
571572
df = df[["autoProcScalingId", "heuristic"]].copy()
572573
scaling_ids = df["autoProcScalingId"].tolist()
573574

574-
# find associated dimple jobs from scaling_ids
575+
# find associated dimple jobs from scaling_ids, take most recent batch if reprocessing
575576
query = (
576577
(
577578
session.query(
@@ -610,17 +611,21 @@ def trigger_pandda_xchem(
610611
)
611612
return {"success": True}
612613

613-
n_success_upstream = len(df)
614-
n_success_dimple = len(df2)
614+
# mark a new batch whenever the gap is >= 12 hours
615+
df2 = df2.sort_values("processingStartTime").reset_index(drop=True)
616+
df2["time_diff"] = df2["processingStartTime"].diff()
617+
df2["batch"] = (df2["time_diff"] >= pd.Timedelta(hours=12)).cumsum() + 1
618+
recent_batch = df2[df2["batch"] == df2["batch"].max()].copy()
615619

616-
self.log.info(
617-
f"There are {n_success_upstream} successful upstream jobs (excl fast-dp) & {n_success_dimple} successful dimple jobs, \
618-
selecting the best one based on heuristic: I/sigI*completeness * #unique reflections"
619-
)
620-
621-
df2["parameterValue"] = pd.to_numeric(df2["parameterValue"]).astype("Int64")
620+
recent_batch["parameterValue"] = pd.to_numeric(
621+
recent_batch["parameterValue"]
622+
).astype("Int64")
622623
df3 = pd.merge(
623-
df2, df, left_on="parameterValue", right_on="autoProcScalingId", how="inner"
624+
recent_batch,
625+
df,
626+
left_on="parameterValue",
627+
right_on="autoProcScalingId",
628+
how="inner",
624629
).sort_values("heuristic", ascending=False)
625630

626631
if df3.empty:
@@ -653,16 +658,18 @@ def trigger_pandda_xchem(
653658

654659
# 2. Read XChem SQLite database for ligand info
655660

661+
db_master = processing_dir / "database" / "soakDBDataFile.sqlite"
662+
656663
try:
657-
conn = sqlite3.connect(f"file:{db}?mode=ro", uri=True, timeout=10)
664+
conn = sqlite3.connect(f"file:{db_master}?mode=ro", uri=True, timeout=10)
658665
df = pd.read_sql_query(
659666
f"SELECT * from mainTable WHERE Puck = '{container_code}' AND PuckPosition = {location} AND CrystalName = '{dtag}'",
660667
conn,
661668
)
662669

663670
except Exception as e:
664671
self.log.info(
665-
f"Exiting PanDDA2/Pipedream trigger: Exception whilst reading ligand information from {db} for dtag {dtag}, dcid {dcid}: {e}"
672+
f"Exiting PanDDA2/Pipedream trigger: Exception whilst reading ligand information from {db_master} for dtag {dtag}, dcid {dcid}: {e}"
666673
)
667674
return {"success": True}
668675

@@ -672,7 +679,7 @@ def trigger_pandda_xchem(
672679

673680
if len(df) != 1:
674681
self.log.info(
675-
f"Exiting PanDDA2/Pipedream trigger: Unique row in .sqlite for dtag {dtag}, puck {container_code}, puck position {location} cannot be found in {db}, skipping dcid {dcid}"
682+
f"Exiting PanDDA2/Pipedream trigger: Unique row in .sqlite for dtag {dtag}, puck {container_code}, puck position {location} cannot be found in {db_master}, skipping dcid {dcid}"
676683
)
677684
return {"success": True}
678685

@@ -697,7 +704,8 @@ def trigger_pandda_xchem(
697704
return {"success": True}
698705

699706
# 3. Create dataset directory structure
700-
analysis_dir = processed_dir / "analysis"
707+
auto_dir = processing_dir / "auto"
708+
analysis_dir = auto_dir / "analysis"
701709
pandda_dir = analysis_dir / "pandda2"
702710
model_dir = pandda_dir / "model_building"
703711
dataset_dir = model_dir / dtag
@@ -736,16 +744,16 @@ def trigger_pandda_xchem(
736744
smi_file.write(CompoundSMILES)
737745

738746
# 4. Job launch logic
739-
740747
recipe_parameters = {
741748
"dcid": dcid,
742-
"processed_directory": str(processed_dir),
749+
"xchem_visit_dir": str(xchem_visit_dir),
750+
"processing_directory": str(processing_dir),
743751
"model_directory": str(model_dir),
744752
"dtag": dtag,
745753
"n_datasets": 1,
746754
"scaling_id": scaling_id,
747755
"comparator_threshold": comparator_threshold,
748-
"database_path": str(db),
756+
"database_path": str(db_master),
749757
"upstream_mtz": pathlib.Path(upstream_mtz).parts[-1],
750758
"smiles": str(CompoundSMILES),
751759
}
@@ -769,6 +777,7 @@ def trigger_pandda_xchem(
769777

770778
with open(model_dir / ".batch.json", "w") as f:
771779
json.dump(dataset_list, f)
780+
# cannot pass as ispyb_parameter
772781

773782
self.log.info(
774783
f"{dataset_count} = comparator dataset threshold of {comparator_threshold}, launching PanDDA2 array job"
@@ -821,7 +830,7 @@ def trigger_pandda_xchem_post(
821830

822831
dcid = parameters.dcid
823832
scaling_id = parameters.scaling_id[0]
824-
processed_directory = pathlib.Path(parameters.processed_directory)
833+
processing_directory = pathlib.Path(parameters.processing_directory)
825834

826835
_, ispyb_info = dlstbx.ispybtbx.ispyb_filter({}, {"ispyb_dcid": dcid}, session)
827836
visit = ispyb_info.get("ispyb_visit", "")
@@ -868,7 +877,7 @@ def trigger_pandda_xchem_post(
868877

869878
recipe_parameters = {
870879
"dcid": dcid, #
871-
"processed_directory": str(processed_directory),
880+
"processing_directory": str(processing_directory),
872881
"scaling_id": scaling_id,
873882
}
874883

src/dlstbx/wrapper/pandda_post.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,16 @@ def run(self):
1616
)
1717

1818
params = self.recwrap.recipe_step["job_parameters"]
19-
processed_dir = Path(params.get("processed_directory"))
20-
analysis_dir = processed_dir / "analysis"
19+
processing_dir = Path(params.get("processing_directory"))
20+
auto_dir = processing_dir / "auto"
21+
analysis_dir = auto_dir / "analysis"
2122
pandda_dir = analysis_dir / "pandda2"
2223
model_dir = pandda_dir / "model_building"
23-
auto_panddas_dir = Path(pandda_dir / "panddas")
24+
panddas_dir = Path(pandda_dir / "panddas")
2425

2526
# -------------------------------------------------------
2627
pandda2_command = f"source /dls_sw/i04-1/software/PanDDA2/venv/bin/activate; \
27-
python -u /dls_sw/i04-1/software/PanDDA2/scripts/postrun.py --data_dirs={model_dir} --out_dir={auto_panddas_dir} --use_ligand_data=False --debug=True --local_cpus=1 > {auto_panddas_dir / 'pandda2_postrun.log'}"
28+
python -u /dls_sw/i04-1/software/PanDDA2/scripts/postrun.py --data_dirs={model_dir} --out_dir={panddas_dir} --use_ligand_data=False --debug=True --local_cpus=1 > {panddas_dir / 'pandda2_postrun.log'}"
2829

2930
self.log.info("Running PanDDA2 command: {pandda2_command}")
3031

@@ -34,7 +35,7 @@ def run(self):
3435
shell=True,
3536
capture_output=True,
3637
text=True,
37-
cwd=auto_panddas_dir,
38+
cwd=panddas_dir,
3839
check=True,
3940
timeout=params.get("timeout-minutes") * 60,
4041
)

src/dlstbx/wrapper/pandda_xchem.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import numpy as np
1111
import yaml
1212

13-
import dlstbx.util.symlink
1413
from dlstbx.util.mvs.helpers import (
1514
find_residue_by_name,
1615
save_cropped_map,
@@ -33,8 +32,11 @@ def run(self):
3332
params = self.recwrap.recipe_step["job_parameters"]
3433

3534
# database_path = Path(params.get("database_path"))
36-
processed_dir = Path(params.get("processed_directory"))
37-
analysis_dir = Path(processed_dir / "analysis")
35+
xchem_visit_dir = Path(params.get("xchem_visit_dir"))
36+
user_yaml = xchem_visit_dir / ".user.yaml"
37+
processing_dir = Path(params.get("processing_directory"))
38+
auto_dir = processing_dir / "auto"
39+
analysis_dir = Path(auto_dir / "analysis")
3840
pandda_dir = analysis_dir / "pandda2"
3941
model_dir = pandda_dir / "model_building"
4042
panddas_dir = Path(pandda_dir / "panddas")
@@ -53,14 +55,6 @@ def run(self):
5355
dataset_dir = model_dir / dtag
5456
compound_dir = dataset_dir / "compound"
5557

56-
if pipeline_final_params := params.get("pipeline-final", []):
57-
final_directory = Path(pipeline_final_params["path"])
58-
final_directory.mkdir(parents=True, exist_ok=True)
59-
if params.get("create_symlink"):
60-
dlstbx.util.symlink.create_parent_symlink(
61-
final_directory, params.get("create_symlink")
62-
)
63-
6458
self.log.info(f"Processing dtag: {dtag}")
6559

6660
smiles = params.get("smiles")
@@ -127,8 +121,11 @@ def run(self):
127121
pandda2_log = dataset_pdir / "pandda2.log"
128122
attachments.extend([pandda2_log, ligand_cif])
129123

124+
args_string = self.get_pandda_settings(
125+
user_yaml
126+
) # user specified pandda parameters
130127
pandda2_command = f"source {PANDDA_2_DIR}/venv/bin/activate; \
131-
python -u /dls_sw/i04-1/software/PanDDA2/scripts/process_dataset.py --data_dirs={model_dir} --out_dir={panddas_dir} --dtag={dtag} --use_ligand_data=False --local_cpus=1"
128+
python -u /dls_sw/i04-1/software/PanDDA2/scripts/process_dataset.py --data_dirs={model_dir} --out_dir={panddas_dir} --dtag={dtag} --use_ligand_data=False --local_cpus=4 {args_string}"
132129

133130
try:
134131
result = subprocess.run(
@@ -257,6 +254,7 @@ def run(self):
257254
return False
258255

259256
scores = {}
257+
self.log.info(f"Running Ligand Score routine for {build_dir}")
260258

261259
for build_path in builds:
262260
ligand_score = build_dir / f"{build_path.stem}.txt"
@@ -268,8 +266,6 @@ def run(self):
268266
score_command = f"source {PANDDA_2_DIR}/venv/bin/activate; \
269267
python {PANDDA_2_DIR}/scripts/ligand_score.py --mtz_path={mtz_file} --zmap_path={z_map} --ligand_id={ligand_id} --structure_path={build_path} --out_path={ligand_score}"
270268

271-
self.log.info(f"Running Ligand Score command: {score_command}")
272-
273269
try:
274270
os.system(score_command)
275271

@@ -287,9 +283,8 @@ def run(self):
287283
self.log.info(f"Best ligand score for {dtag} = {score}")
288284

289285
# -------------------------------------------------------
290-
# Best build merging
286+
# Merge the protein structure with best fitted ligand -> pandda model
291287

292-
# Merge the protein structure with ligand -> pandda model
293288
protein_st_file = dataset_pdir / f"{dtag}-pandda-input.pdb"
294289
ligand_st_file = best_build_path
295290
pandda_model = modelled_dir / f"{dtag}-pandda-model.pdb"
@@ -501,13 +496,23 @@ def get_contact_chain(self, protein_st, ligand_st):
501496

502497
return min(chain_counts, key=lambda _x: chain_counts[_x])
503498

499+
def get_pandda_settings(self, yaml_file):
500+
with open(yaml_file, "r") as file:
501+
expt_yaml = yaml.load(file, Loader=yaml.SafeLoader)
502+
settings = expt_yaml["autoprocessing"]["pandda"]
503+
if settings:
504+
args_string = " ".join(f"--{k}={v}" for k, v in settings.items())
505+
else:
506+
args_string = ""
507+
return args_string
508+
504509
def send_attachments_to_ispyb(self, attachments, batch):
505510
if batch: # synchweb attachments not supported for array job processing
506511
return
507512
for f in attachments:
508513
if f.exists():
509514
if f.suffix == ".html":
510-
file_type = "Result"
515+
file_type = "Result" # 'Graph', 'Debug'
511516
importance_rank = 1
512517
elif f.suffix == ".ccp4":
513518
file_type = "Result"

0 commit comments

Comments (0)