Skip to content

Commit 73f6d3e

Browse files
committed
store main_isoform info in pickle and orthoxml
1 parent 6ca4dce commit 73f6d3e

5 files changed

Lines changed: 63 additions & 34 deletions

File tree

FastOMA/_utils_roothog.py

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
mmseqs_executable_path ="mmseqs"
2121

2222
HOGMapData = collections.namedtuple("HOGMapData", ("hogid", "score", "seqlen", "subfamily_medianseqlen"))
23-
23+
Gene = collections.namedtuple("Gene", ("numeric_id", "prot_id", "main_isoform"), defaults=(None,) )
24+
2425

2526
"""UnionFind.py
2627
@@ -231,24 +232,21 @@ def handle_splice_variants(species_names, hogmaps, splice_folder):
231232
}
232233

233234

234-
def add_species_name_prot_id( prot_recs_lists):
235+
def add_species_name_prot_id(prot_recs_lists):
235236
"""
236237
adding the name of species to each protein record
237238
- based on file name
238239
adding protein idx number, integer needed by xml format
239240
output: prot_recs_all = {'MYCGE': {'sp|P47500|RF1_MYCGE||MYCGE||1000000001': SeqRecord(seq=
240241
"""
241-
prot_idx_name_pickle_file = "./gene_id_dic_xml.pickle"
242242
start_num_prot = int(1e9)
243243
start_num_prot_per_sp = int(1e6) #
244244
prot_recs_all = {} # {'MYCGE': {'sp|P47500|RF1_MYCGE||MYCGE||1000000001': SeqRecord(seq=
245-
prot_idx_name = {} # {'MYCGE': [(1000000001, 'sp|P47500|RF1_MYCGE'),(1000000002, 'sp|P13927|EFTU_MYCGE'),
246245
species_idx = -1
247246
for species_name, prot_recs_list in prot_recs_lists.items():
248247
species_idx += 1
249248
prot_idx = start_num_prot + species_idx * start_num_prot_per_sp
250249
prot_recs_all[species_name]={}
251-
prot_idx_name[species_name] = []
252250
for prot_rec in prot_recs_list:
253251
prot_idx+=1
254252
prot_name= prot_rec.id
@@ -257,16 +255,9 @@ def add_species_name_prot_id( prot_recs_lists):
257255
logger.info("We are truncating the prot name as it may be problematic for mafft, " + str(prot_name))
258256
prot_name = prot_name[:230]
259257

260-
# todo, this could be a dic
261-
prot_idx_name[species_name].append((prot_idx, prot_name))
262-
263258
prot_name_new = prot_name+ "||"+species_name+"||"+str(prot_idx) # orthoxml file needs an integer as
264259
prot_rec.id = prot_name_new
265260
prot_recs_all[species_name][prot_name] = prot_rec
266-
267-
with open(prot_idx_name_pickle_file, 'wb') as handle:
268-
pickle.dump(prot_idx_name, handle, protocol=pickle.HIGHEST_PROTOCOL)
269-
270261
return prot_recs_all
271262

272263

@@ -401,6 +392,36 @@ def resolve_singletons(rhogs_prots, hogmaps, conf):
401392
logger.info(f"Now, we have {counter_not_singleton} rootHOGs with >1 proteins and {counter_rhog_singleton} singleton rootHOGs")
402393
return rhogs_prots
403394

395+
def save_gene_id_mapping(prot_recs_all, isoform_data=None, out_path="gene_id_dic_xml.pickle"):
396+
"""Save comprehensive gene ID mapping including isoform information"""
397+
398+
gene_mapping = {}
399+
for species_name, prot_dict in prot_recs_all.items():
400+
species_data = []
401+
402+
isoform_to_main = {}
403+
if isoform_data and species_name in isoform_data['selected_isoforms']:
404+
isoform_to_main = {gene: main for main, genes in zip(
405+
isoform_data['selected_isoforms'][species_name], isoform_data['isoform_by_gene'][species_name]
406+
) for gene in genes}
407+
408+
id2numeric = {}
409+
for prot_name, prot_rec in prot_dict.items():
410+
parts = prot_rec.id.split('||')
411+
original_id = parts[0]
412+
numeric_id = int(parts[2])
413+
id2numeric[original_id] = numeric_id
414+
415+
species_data = [Gene(num, orig_id, id2numeric.get(isoform_to_main.get(orig_id))) for orig_id, num in id2numeric.items()]
416+
gene_mapping[species_name] = species_data
417+
418+
# Save to pickle file
419+
with open(out_path, 'wb') as handle:
420+
pickle.dump(gene_mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
421+
422+
logger.info(f"Saved comprehensive gene ID mapping to {out_path}")
423+
return gene_mapping
424+
404425

405426
def filter_big_roothogs(hogmaps, rhogs_prots, conf_infer_roothogs):
406427

FastOMA/collect_subhogs.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,9 +178,11 @@ def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file, id_t
178178
species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "taxonId": str(name2taxid[query_species_name]), "NCBITaxId": "0"})
179179
database_xml = ET.SubElement(species_xml, "database", attrib={"name": "database", "version": "2023"})
180180
genes_xml = ET.SubElement(database_xml, "genes")
181-
for (gene_idx_integer, query_prot_name) in list_prots:
182-
prot_id = id_transformer.transform(query_prot_name)
183-
gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
181+
for gene in list_prots:
182+
attribs = {"id": str(gene.numeric_id), "protId": id_transformer.transform(gene.prot_id)}
183+
if gene.main_isoform is not None:
184+
attribs["main_isoform"] = str(gene.main_isoform)
185+
gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
184186
logger.debug("gene_xml is created.")
185187
orthoxml_file.append(taxonomy)
186188

FastOMA/infer_roothogs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,8 @@ def fastoma_infer_roothogs():
7070

7171

7272
# Step 4: Save results
73-
# logger.info("Saving results...")
74-
# _utils_roothog.save_gene_id_mapping(prot_recs_all, isoform_data)
73+
logger.info("Saving results...")
74+
_utils_roothog.save_gene_id_mapping(prot_recs_all, isoform_data)
7575
# _utils_roothog.write_root_hogs(rhogs_prots, prot_recs_all, conf.out_rhog_folder)
7676

7777
logger.info(f"Successfully created {len(rhogs_prots)} root HOGs")

utils/pickle2orthoxml.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@
1212
from FastOMA.collect_subhogs import update_hogids
1313
from pathlib import Path
1414

15-
```
15+
"""
1616
python pickle2orthoxml.py "no_header" "file_D0680685.pickle"
1717
1818
python pickle2orthoxml.py "selected_genes" pickle_folder gene_id_dic_xml.pickle "species_tree_checked.nwk" # this will be slow. gene_id_dic_xml.pickle is in the output of infer_roothogs
19-
```
19+
"""
2020

2121
mode = sys.argv[1] #"selected_genes" #"no_header" # "selected_genes" "all_genes"
2222

@@ -71,19 +71,24 @@
7171
# #### create the header of orthoxml ####
7272
for query_species_name, list_prots in gene_id_name.items():
7373
first=True
74-
for (gene_idx_integer, query_prot_name) in list_prots:
75-
if gene_idx_integer in gene_int_set:
74+
for gene in list_prots:
75+
if gene.numeric_id in gene_int_set:
7676
if first:
7777
species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "taxonId": str(name2taxid[query_species_name]), "NCBITaxId": "0"})
7878
database_xml = ET.SubElement(species_xml, "database", attrib={"name": "database", "version": "2023"})
7979
genes_xml = ET.SubElement(database_xml, "genes")
80-
prot_id = id_transformer.transform(query_prot_name)
81-
gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
80+
prot_id = id_transformer.transform(gene.prot_id)
81+
attribs = {"id": str(gene.numeric_id), "protId": prot_id}
82+
if gene.main_isoform is not None:
83+
attribs["main_isoform"]= str(gene.main_isoform)
84+
gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
8285
first=False
8386
else:
84-
prot_id = id_transformer.transform(query_prot_name)
85-
gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
86-
87+
prot_id = id_transformer.transform(gene.prot_id)
88+
attribs = {"id": str(gene.numeric_id), "protId": prot_id}
89+
if gene.main_isoform is not None:
90+
attribs["main_isoform"]= str(gene.main_isoform)
91+
gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
8792

8893

8994
print("gene_xml is created.")

utils/write_orthoxml_per_rHOG.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,10 @@
5050

5151
query_species_name_list = []
5252
for query_species_name, list_prots in gene_id_name.items():
53-
54-
for (gene_idx_integer, query_prot_name) in list_prots:
55-
if gene_idx_integer in list_geneid:
53+
for gene in list_prots:
54+
if gene.numeric_id in list_geneid:
5655
query_species_name_list.append(query_species_name)
56+
break # early abort as we found one protein for this species
5757

5858
query_species_name_set = list(set(query_species_name_list))
5959

@@ -67,11 +67,12 @@
6767
database_xml = ET.SubElement(species_xml, "database", attrib={"name": " database ", "version": "2020"})
6868
genes_xml = ET.SubElement(database_xml, "genes")
6969

70-
for (gene_idx_integer, query_prot_name) in list_prots:
71-
if gene_idx_integer in list_geneid: # +[1007003758]
72-
query_prot_name_pure = query_prot_name
73-
gene_xml = ET.SubElement(genes_xml, "gene",
74-
attrib={"id": str(gene_idx_integer), "protId": query_prot_name_pure})
70+
for gene in list_prots:
71+
if gene.numeric_id in list_geneid: # +[1007003758]
72+
attribs = {"id": str(gene.numeric_id), "protId": gene.prot_id}
73+
if gene.main_isoform is not None:
74+
attribs["main_isoform"] = str(gene.main_isoform)
75+
gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
7576

7677
groups_xml = ET.SubElement(orthoxml_file, "groups")
7778

0 commit comments

Comments
 (0)