store main_isoform info in pickle and orthoxml

alpae · alpae · commit 73f6d3e5a929 · 2025-12-01T16:09:55.000+01:00
diff --git a/FastOMA/_utils_roothog.py b/FastOMA/_utils_roothog.py
@@ -20,7 +20,8 @@
 mmseqs_executable_path ="mmseqs"
 
 HOGMapData = collections.namedtuple("HOGMapData", ("hogid", "score", "seqlen", "subfamily_medianseqlen"))
-
+# One entry of the per-species lists pickled by save_gene_id_mapping; main_isoform defaults to None.
+Gene = collections.namedtuple("Gene", ("numeric_id", "prot_id", "main_isoform"), defaults=(None,) )
 
 """UnionFind.py
 
@@ -231,24 +232,21 @@ def handle_splice_variants(species_names, hogmaps, splice_folder):
     }
 
 
-def add_species_name_prot_id( prot_recs_lists):
+def add_species_name_prot_id(prot_recs_lists):
     """
     adding the name of species to each protein record
         - based on file name
     adding protein idx number, integer needed by xml format
     output:  prot_recs_all =  {'MYCGE': {'sp|P47500|RF1_MYCGE||MYCGE||1000000001': SeqRecord(seq=
     """
-    prot_idx_name_pickle_file = "./gene_id_dic_xml.pickle"
     start_num_prot = int(1e9)
     start_num_prot_per_sp = int(1e6) #
     prot_recs_all = {} # {'MYCGE': {'sp|P47500|RF1_MYCGE||MYCGE||1000000001': SeqRecord(seq=
-    prot_idx_name = {} # {'MYCGE': [(1000000001, 'sp|P47500|RF1_MYCGE'),(1000000002, 'sp|P13927|EFTU_MYCGE'),
     species_idx = -1
     for species_name, prot_recs_list in prot_recs_lists.items():
         species_idx += 1
         prot_idx = start_num_prot + species_idx * start_num_prot_per_sp
         prot_recs_all[species_name]={}
-        prot_idx_name[species_name] = []
         for prot_rec in prot_recs_list:
             prot_idx+=1
             prot_name= prot_rec.id
@@ -257,16 +255,9 @@ def add_species_name_prot_id( prot_recs_lists):
                 logger.info("We are truncating the prot name as it may be problematic for mafft, " + str(prot_name))
                 prot_name = prot_name[:230]
 
-            # todo, this could be a dic
-            prot_idx_name[species_name].append((prot_idx, prot_name))
-
             prot_name_new = prot_name+ "||"+species_name+"||"+str(prot_idx) # orthoxml file needs an integer as
             prot_rec.id = prot_name_new
             prot_recs_all[species_name][prot_name] = prot_rec
-
-    with open(prot_idx_name_pickle_file, 'wb') as handle:
-        pickle.dump(prot_idx_name, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
     return prot_recs_all
 
 
@@ -401,6 +392,36 @@ def resolve_singletons(rhogs_prots, hogmaps, conf):
     logger.info(f"Now, we have {counter_not_singleton} rootHOGs with >1 proteins and {counter_rhog_singleton} singleton rootHOGs")
     return rhogs_prots
 
+def save_gene_id_mapping(prot_recs_all, isoform_data=None, out_path="gene_id_dic_xml.pickle"):
+    """Pickle a {species_name: [Gene, ...]} mapping to out_path and return it.
+
+    prot_recs_all: {species_name: {prot_name: record}}; every record.id must be
+        "<prot_name>||<species_name>||<numeric_id>".
+    isoform_data: optional dict with 'selected_isoforms' and 'isoform_by_gene',
+        both keyed by species name and paired per gene by position.
+        NOTE(review): entries are assumed to be protein ids as found in record.id - confirm.
+    """
+    gene_mapping = {}
+    for species_name, prot_dict in prot_recs_all.items():
+        # map every isoform listed for a gene to the isoform selected for that gene
+        isoform_to_main = {}
+        if isoform_data and species_name in isoform_data['selected_isoforms']:
+            isoform_to_main = {gene: main for main, genes in zip(
+                    isoform_data['selected_isoforms'][species_name], isoform_data['isoform_by_gene'][species_name]
+                ) for gene in genes}
+        id2numeric = {}
+        for prot_rec in prot_dict.values():
+            # split from the right so that a "||" inside the protein name is preserved
+            parts = prot_rec.id.rsplit('||', 2)
+            id2numeric[parts[0]] = int(parts[2])
+        # main_isoform stays None when no main isoform is known for a protein
+        gene_mapping[species_name] = [Gene(num, orig_id, id2numeric.get(isoform_to_main.get(orig_id)))
+                                      for orig_id, num in id2numeric.items()]
+    with open(out_path, 'wb') as handle:
+        pickle.dump(gene_mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
+    logger.info(f"Saved comprehensive gene ID mapping to {out_path}")
+    return gene_mapping
+
 
 def filter_big_roothogs(hogmaps, rhogs_prots, conf_infer_roothogs):
 
diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py
@@ -178,9 +178,11 @@ def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file, id_t
         species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "taxonId": str(name2taxid[query_species_name]), "NCBITaxId": "0"})
         database_xml = ET.SubElement(species_xml, "database", attrib={"name": "database", "version": "2023"})
         genes_xml = ET.SubElement(database_xml, "genes")
-        for (gene_idx_integer, query_prot_name) in list_prots:
-            prot_id = id_transformer.transform(query_prot_name)
-            gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
+        for gene in list_prots:
+            attribs = {"id": str(gene.numeric_id), "protId": id_transformer.transform(gene.prot_id)}
+            if gene.main_isoform is not None:
+                attribs["main_isoform"] = str(gene.main_isoform)
+            gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
     logger.debug("gene_xml is created.")
     orthoxml_file.append(taxonomy)
 
diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py
@@ -70,8 +70,8 @@ def fastoma_infer_roothogs():
 
 
     # Step 4: Save results
-    # logger.info("Saving results...")
-    # _utils_roothog.save_gene_id_mapping(prot_recs_all, isoform_data)
+    logger.info("Saving results...")
+    _utils_roothog.save_gene_id_mapping(prot_recs_all, isoform_data)
     # _utils_roothog.write_root_hogs(rhogs_prots, prot_recs_all, conf.out_rhog_folder)
     
     logger.info(f"Successfully created {len(rhogs_prots)} root HOGs")
diff --git a/utils/pickle2orthoxml.py b/utils/pickle2orthoxml.py
@@ -12,11 +12,11 @@
 from FastOMA.collect_subhogs import update_hogids
 from pathlib import Path
 
-```
+"""
 python pickle2orthoxml.py  "no_header" "file_D0680685.pickle"
 
 python pickle2orthoxml.py "selected_genes"  pickle_folder gene_id_dic_xml.pickle  "species_tree_checked.nwk"     # this will be slow.  gene_id_dic_xml.pickle is in the output of infer_roothogs
-```
+"""
 
 mode = sys.argv[1]  #"selected_genes" #"no_header" # "selected_genes"  "all_genes"
 
@@ -71,19 +71,24 @@
     #  #### create the header of orthoxml ####
     for query_species_name, list_prots in gene_id_name.items():
         first=True
-        for (gene_idx_integer, query_prot_name) in list_prots:
-            if gene_idx_integer in gene_int_set:
+        for gene in list_prots:
+            if gene.numeric_id in gene_int_set:
                 if first:
                     species_xml = ET.SubElement(orthoxml_file, "species", attrib={"name": query_species_name, "taxonId": str(name2taxid[query_species_name]), "NCBITaxId": "0"})
                     database_xml = ET.SubElement(species_xml, "database", attrib={"name": "database", "version": "2023"})
                     genes_xml = ET.SubElement(database_xml, "genes")
-                    prot_id = id_transformer.transform(query_prot_name)
-                    gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
+                    prot_id = id_transformer.transform(gene.prot_id)
+                    attribs = {"id": str(gene.numeric_id), "protId": prot_id}
+                    if gene.main_isoform is not None:
+                        attribs["main_isoform"]= str(gene.main_isoform)
+                    gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
                     first=False
                 else:
-                    prot_id = id_transformer.transform(query_prot_name)
-                    gene_xml = ET.SubElement(genes_xml, "gene", attrib={"id": str(gene_idx_integer), "protId": prot_id})
-
+                    prot_id = id_transformer.transform(gene.prot_id)
+                    attribs = {"id": str(gene.numeric_id), "protId": prot_id}
+                    if gene.main_isoform is not None:
+                        attribs["main_isoform"]= str(gene.main_isoform)
+                    gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
 
 
     print("gene_xml is created.")
diff --git a/utils/write_orthoxml_per_rHOG.py b/utils/write_orthoxml_per_rHOG.py
@@ -50,10 +50,10 @@
 
     query_species_name_list = []
     for query_species_name, list_prots in gene_id_name.items():
-
-        for (gene_idx_integer, query_prot_name) in list_prots:
-            if gene_idx_integer in list_geneid:
+        for gene in list_prots:
+            if gene.numeric_id in list_geneid:
                 query_species_name_list.append(query_species_name)
+                break  # early abort as we found one protein for this species
 
     query_species_name_set = list(set(query_species_name_list))
 
@@ -67,11 +67,12 @@
             database_xml = ET.SubElement(species_xml, "database", attrib={"name": " database ", "version": "2020"})
             genes_xml = ET.SubElement(database_xml, "genes")
 
-            for (gene_idx_integer, query_prot_name) in list_prots:
-                if gene_idx_integer in list_geneid:  # +[1007003758]
-                    query_prot_name_pure = query_prot_name
-                    gene_xml = ET.SubElement(genes_xml, "gene",
-                                             attrib={"id": str(gene_idx_integer), "protId": query_prot_name_pure})
+            for gene in list_prots:
+                if gene.numeric_id in list_geneid:  # +[1007003758]
+                    attribs = {"id": str(gene.numeric_id), "protId": gene.prot_id}
+                    if gene.main_isoform is not None:
+                        attribs["main_isoform"] = str(gene.main_isoform)
+                    gene_xml = ET.SubElement(genes_xml, "gene", attrib=attribs)
 
     groups_xml = ET.SubElement(orthoxml_file, "groups")