Skip to content

Commit fbe663f

Browse files
authored
Metal ID fixes and improvements (#324)
Adds the following fixes and improvements to the metal_id pipeline: * Fix bug where peaks with negative coordinates were not being parsed * Set symlink from the trigger function * Handle the case where no peaks are found by metal_id * Optionally use the pipeline_final directory (if specified in the recipe) * Specify the primary result and log file in the recipe
1 parent dc4e626 commit fbe663f

2 files changed

Lines changed: 83 additions & 48 deletions

File tree

src/dlstbx/services/trigger.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class MetalIdParameters(pydantic.BaseModel):
9292
backoff_multiplier: float = pydantic.Field(default=2, alias="backoff-multiplier")
9393
automatic: Optional[bool] = False
9494
comment: Optional[str] = None
95+
symlink: str = pydantic.Field(default="")
9596

9697

9798
class ProteinInfo(pydantic.BaseModel):
@@ -858,6 +859,7 @@ def trigger_metal_id(
858859
"data": [mtz_file_below.as_posix(), mtz_file_above.as_posix()],
859860
"scaling_id": [parameters.scaling_id],
860861
"pdb": pdb_files,
862+
"symlink": [parameters.symlink] if parameters.symlink else [],
861863
}
862864

863865
self.log.debug("Metal_id trigger: Starting")

src/dlstbx/wrapper/metal_id.py

Lines changed: 81 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -7,59 +7,53 @@
77
import re
88
import shutil
99
import subprocess
10+
from dataclasses import dataclass
1011
from datetime import datetime
12+
from typing import Any
1113

1214
import dlstbx.util.symlink
1315
from dlstbx import schemas
1416
from dlstbx.wrapper import Wrapper
1517

1618

19+
@dataclass
20+
class PeakData:
21+
density: float
22+
rmsd: float
23+
xyz: tuple[float, float, float]
24+
25+
1726
class MetalIdWrapper(Wrapper):
1827
_logger_name = "dlstbx.wrap.metal_id"
1928

20-
def parse_peak_data(self, peak_data_file):
21-
peak_data = []
29+
def parse_peak_data(self, peak_data_file: pathlib.Path) -> list[PeakData]:
30+
peak_data: list[PeakData] = []
2231
with open(peak_data_file, "r") as file:
2332
for line in file:
2433
match = re.match(
25-
r"Peak \d+: Electron Density = ([\d.]+) e/Å\^3, RMSD = ([\d.]+), XYZ = \(([\d.]+), ([\d.]+), ([\d.]+)\)",
34+
r"Peak \d+: Electron Density = ([\d.]+) e/Å\^3, RMSD = ([\d.]+), XYZ = \((-?[\d.]+), (-?[\d.]+), (-?[\d.]+)\)",
2635
line,
2736
)
2837
if match:
29-
electron_density = float(match.group(1))
38+
density = float(match.group(1))
3039
rmsd = float(match.group(2))
3140
xyz = (
3241
float(match.group(3)),
3342
float(match.group(4)),
3443
float(match.group(5)),
3544
)
36-
peak_data.append(
37-
{"electron_density": electron_density, "rmsd": rmsd, "xyz": xyz}
38-
)
45+
peak_data.append(PeakData(density=density, rmsd=rmsd, xyz=xyz))
3946
return peak_data
4047

4148
def send_results_to_ispyb(
4249
self,
43-
peak_data,
44-
metal_id_command,
45-
dimple_log_file,
46-
results_directory,
47-
start_time,
48-
):
49-
scaling_id = self.params.get("scaling_id", [])
50-
if len(scaling_id) != 1:
51-
self.log.info(f"Scaling ID {scaling_id} provided")
52-
self.log.error(
53-
"Exactly one scaling_id must be provided - cannot insert metal_id results to ISPyB"
54-
)
55-
return False
56-
scaling_id = scaling_id[0]
57-
58-
if not dimple_log_file.is_file():
59-
self.log.error(
60-
f"dimple log file '{dimple_log_file}' not found - cannot insert metal_id results to ISPyB"
61-
)
62-
return False
50+
peak_data: list[PeakData],
51+
metal_id_command: str,
52+
dimple_log_file: pathlib.Path,
53+
results_directory: pathlib.Path,
54+
start_time: datetime,
55+
scaling_id: int,
56+
) -> dict[str, Any]:
6357
self.log.info(
6458
f"Autoproc_prog_id: '{self.recwrap.environment.get('ispyb_autoprocprogram_id')}'"
6559
)
@@ -82,12 +76,12 @@ def send_results_to_ispyb(
8276
blobs = []
8377
for n_peak, peak in enumerate(peak_data, start=1):
8478
self.log.info(
85-
f"Adding blob {n_peak} to ispyb results - Density: {peak['electron_density']}, rmsd: {peak['rmsd']}, xyz: {peak['xyz']}"
79+
f"Adding blob {n_peak} to ispyb results - Density: {peak.density}, rmsd: {peak.rmsd}, xyz: {peak.xyz}"
8680
)
8781
blobs.append(
8882
schemas.Blob(
89-
xyz=peak["xyz"],
90-
height=peak["electron_density"],
83+
xyz=peak.xyz,
84+
height=peak.density,
9185
# nearest_atom=nearest_atom,
9286
# nearest_atom_distance=distance,
9387
map_type="difference", # TODO change this to anomalous_difference once enum exists.
@@ -106,20 +100,18 @@ def send_results_to_ispyb(
106100
)
107101

108102
attachments = []
103+
104+
primary_result_files = self.params.get("primary_result_files", {})
109105
self.log.info("Adding attachments for upload to ispyb")
110106
for f in results_directory.iterdir():
111-
if f.suffix not in [".map", ".log", ".py", ".pha", ".pdb", ".dat"]:
112-
self.log.info(f"Skipping file {f.name}")
113-
continue
114-
elif f.suffix in [".map", ".pdb", ".dat"]:
115-
file_type = "result"
116-
importance_rank = 1
117-
elif f.suffix in [".pha", ".mtz"]:
107+
if f.name in primary_result_files:
108+
file_type = primary_result_files[f.name]["type"]
109+
importance_rank = primary_result_files[f.name]["rank"]
110+
elif f.suffix in [".map", ".pdb", ".dat", ".pha", ".mtz"]:
118111
file_type = "result"
119112
importance_rank = 2
120113
else:
121-
file_type = "log"
122-
importance_rank = 3
114+
continue
123115

124116
attachments.append(
125117
schemas.Attachment(
@@ -132,6 +124,16 @@ def send_results_to_ispyb(
132124
)
133125
self.log.info(f"Added {f.name} as an attachment")
134126

127+
if getattr(self, "final_directory", None):
128+
for att in attachments:
129+
if att.file_name in primary_result_files:
130+
shutil.copy(att.file_path / att.file_name, self.final_directory)
131+
att.file_path = self.final_directory
132+
for blob in blobs:
133+
if blob.filepath and blob.view1:
134+
shutil.copy(blob.filepath / blob.view1, self.final_directory)
135+
blob.filepath = self.final_directory
136+
135137
ispyb_results = {
136138
"mxmrrun": json.loads(mxmrrun.model_dump_json()),
137139
"blobs": [json.loads(blob.model_dump_json()) for blob in blobs],
@@ -152,6 +154,15 @@ def run(self):
152154
# Get parameters from the recipe file
153155
self.params = self.recwrap.recipe_step["job_parameters"]
154156

157+
scaling_id = self.params.get("scaling_id", [])
158+
if len(scaling_id) != 1:
159+
self.log.info(f"Scaling ID {scaling_id} provided")
160+
self.log.error(
161+
"Exactly one scaling_id must be provided - cannot run metal_id"
162+
)
163+
return False
164+
scaling_id = scaling_id[0]
165+
155166
src_mtz_files = self.params.get("data", [])
156167
if not src_mtz_files:
157168
self.log.error("Could not identify on what data to run")
@@ -224,33 +235,55 @@ def run(self):
224235
)
225236

226237
self.log.debug("Reading in peak data")
227-
peak_data = self.parse_peak_data(output_directory / "found_peaks.dat")
238+
peak_file = output_directory / "found_peaks.dat"
239+
if not peak_file.is_file():
240+
self.log.info("Metal_ID: No peaks found")
241+
peak_data = []
242+
else:
243+
peak_data = self.parse_peak_data(peak_file)
228244

229245
for f in output_directory.iterdir():
230-
self.log.info(f"Searching for files to copy. Current file is : {f}")
246+
self.log.debug(f"Searching for files to copy. Current file is : {f}")
231247
if f.is_dir():
232248
continue
233249
if f.name.startswith("."):
234250
continue
235-
if any(f.suffix == skipext for skipext in [".r3d"]):
236-
continue
237-
self.log.info("Copying file")
251+
self.log.debug("Copying file")
238252
shutil.copy(f, results_directory)
239253

240-
if self.params.get("create_symlink"):
254+
symlink = self.params.get("create_symlink")
255+
if isinstance(symlink, list):
256+
symlink = symlink[0]
257+
if symlink:
241258
dlstbx.util.symlink.create_parent_symlink(
242-
os.fspath(output_directory), self.params["create_symlink"]
259+
os.fspath(output_directory), symlink
243260
)
244261
dlstbx.util.symlink.create_parent_symlink(
245-
os.fspath(results_directory), self.params["create_symlink"]
262+
os.fspath(results_directory), symlink
246263
)
247264

248265
self.log.info("Sending results to ISPyB")
249266

250267
dimple_log = working_directory / "metal_id" / "dimple_below" / "dimple.log"
268+
if not dimple_log.is_file():
269+
self.log.error(
270+
f"dimple log file '{dimple_log}' not found - cannot insert metal_id results to ISPyB"
271+
)
272+
return False
273+
274+
if pipeine_final_params := self.params.get("pipeline-final", []):
275+
self.final_directory = pathlib.Path(pipeine_final_params["path"])
276+
self.final_directory.mkdir(parents=True, exist_ok=True)
277+
if self.params.get("create_symlink"):
278+
dlstbx.util.symlink.create_parent_symlink(self.final_directory, symlink)
251279

252280
ispyb_results = self.send_results_to_ispyb(
253-
peak_data, metal_id_command, dimple_log, results_directory, start_time
281+
peak_data,
282+
metal_id_command,
283+
dimple_log,
284+
results_directory,
285+
start_time,
286+
scaling_id,
254287
)
255288

256289
self.log.info(f"Sending {str(ispyb_results)} to ispyb service")

0 commit comments

Comments (0)