2020mmseqs_executable_path = "mmseqs"
2121
2222HOGMapData = collections .namedtuple ("HOGMapData" , ("hogid" , "score" , "seqlen" , "subfamily_medianseqlen" ))
23-
23+ Gene = collections .namedtuple ("Gene" , ("numeric_id" , "prot_id" , "main_isoform" ), defaults = (None ,) )
24+
2425
2526"""UnionFind.py
2627
@@ -231,24 +232,21 @@ def handle_splice_variants(species_names, hogmaps, splice_folder):
231232 }
232233
233234
234- def add_species_name_prot_id ( prot_recs_lists ):
235+ def add_species_name_prot_id (prot_recs_lists ):
235236 """
236237 adding the name of species to each protein record
237238 - based on file name
238239 adding protein idx number, integer needed by xml format
239240 output: prot_recs_all = {'MYCGE': {'sp|P47500|RF1_MYCGE||MYCGE||1000000001': SeqRecord(seq=
240241 """
241- prot_idx_name_pickle_file = "./gene_id_dic_xml.pickle"
242242 start_num_prot = int (1e9 )
243243 start_num_prot_per_sp = int (1e6 ) #
244244 prot_recs_all = {} # {'MYCGE': {'sp|P47500|RF1_MYCGE||MYCGE||1000000001': SeqRecord(seq=
245- prot_idx_name = {} # {'MYCGE': [(1000000001, 'sp|P47500|RF1_MYCGE'),(1000000002, 'sp|P13927|EFTU_MYCGE'),
246245 species_idx = - 1
247246 for species_name , prot_recs_list in prot_recs_lists .items ():
248247 species_idx += 1
249248 prot_idx = start_num_prot + species_idx * start_num_prot_per_sp
250249 prot_recs_all [species_name ]= {}
251- prot_idx_name [species_name ] = []
252250 for prot_rec in prot_recs_list :
253251 prot_idx += 1
254252 prot_name = prot_rec .id
@@ -257,16 +255,9 @@ def add_species_name_prot_id( prot_recs_lists):
257255 logger .info ("We are truncating the prot name as it may be problematic for mafft, " + str (prot_name ))
258256 prot_name = prot_name [:230 ]
259257
260- # todo, this could be a dic
261- prot_idx_name [species_name ].append ((prot_idx , prot_name ))
262-
263258 prot_name_new = prot_name + "||" + species_name + "||" + str (prot_idx ) # orthoxml file needs an integer as
264259 prot_rec .id = prot_name_new
265260 prot_recs_all [species_name ][prot_name ] = prot_rec
266-
267- with open (prot_idx_name_pickle_file , 'wb' ) as handle :
268- pickle .dump (prot_idx_name , handle , protocol = pickle .HIGHEST_PROTOCOL )
269-
270261 return prot_recs_all
271262
272263
@@ -401,6 +392,36 @@ def resolve_singletons(rhogs_prots, hogmaps, conf):
401392 logger .info (f"Now, we have { counter_not_singleton } rootHOGs with >1 proteins and { counter_rhog_singleton } singleton rootHOGs" )
402393 return rhogs_prots
403394
def save_gene_id_mapping(prot_recs_all, isoform_data=None, out_path="gene_id_dic_xml.pickle"):
    """Build a per-species list of ``Gene`` records and pickle it to ``out_path``.

    Each record id in ``prot_recs_all`` is expected to have the form
    ``<original_id>||<species_name>||<numeric_id>``. The original id and the
    integer id are recovered from it; the species field is ignored.

    Args:
        prot_recs_all: mapping species name -> mapping of protein name ->
            record object exposing an ``id`` attribute in the format above.
        isoform_data: optional mapping with the keys ``'selected_isoforms'``
            and ``'isoform_by_gene'``, each indexed by species name. The two
            per-species sequences are zipped pairwise: every member of an
            ``isoform_by_gene`` group is assigned the ``selected_isoforms``
            entry at the same position as its main isoform.
            NOTE(review): assumes both sequences are aligned by position and
            that group members are original protein ids -- confirm with caller.
        out_path: path of the pickle file to write.

    Returns:
        dict mapping species name -> list of
        ``Gene(numeric_id, prot_id, main_isoform)``. ``main_isoform`` is the
        numeric id of the main isoform, or ``None`` when no isoform
        information is available for that protein.

    Raises:
        IndexError: if a record id has fewer than three ``||``-separated fields.
        ValueError: if the last field of a record id is not an integer.
    """
    gene_mapping = {}
    for species_name, prot_dict in prot_recs_all.items():
        # Map every isoform id to the id of the isoform selected for its gene.
        isoform_to_main = {}
        if isoform_data and species_name in isoform_data['selected_isoforms']:
            selected = isoform_data['selected_isoforms'][species_name]
            groups = isoform_data['isoform_by_gene'][species_name]
            isoform_to_main = {
                gene: main
                for main, genes in zip(selected, groups)
                for gene in genes
            }

        id2numeric = {}
        for prot_rec in prot_dict.values():
            # Split from the right: the species name and numeric id are the
            # last two fields, so a protein name that itself contains "||"
            # is still recovered intact.
            parts = prot_rec.id.rsplit('||', 2)
            original_id = parts[0]
            id2numeric[original_id] = int(parts[2])

        # Proteins without a known main isoform resolve to None via the
        # chained .get() calls.
        gene_mapping[species_name] = [
            Gene(num, orig_id, id2numeric.get(isoform_to_main.get(orig_id)))
            for orig_id, num in id2numeric.items()
        ]

    # Save to pickle file
    with open(out_path, 'wb') as handle:
        pickle.dump(gene_mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)

    logger.info(f"Saved comprehensive gene ID mapping to {out_path}")
    return gene_mapping
424+
404425
405426def filter_big_roothogs (hogmaps , rhogs_prots , conf_infer_roothogs ):
406427
0 commit comments