66import logging
77logger = logging .getLogger (__name__ )
88
9- Gene = collections .namedtuple ("Gene" , "xref species internal_id" )
9+ Gene = collections .namedtuple ("Gene" , "xref species internal_id is_main_isoform " )
1010
1111
1212class SpeciesAnalyser :
1313 def __init__ (self , gene_attr = "protId" ):
1414 self .gene_attr = gene_attr
1515 self .genes = {}
1616 self .nr_genes_per_species = collections .defaultdict (int )
17+ self .nr_proteins_per_species = collections .defaultdict (int )
1718
1819 def add_genome_genes (self , genome_node ):
1920 genome_name = genome_node .get ('name' , None )
@@ -24,8 +25,11 @@ def add_genome_genes(self, genome_node):
2425 for gene in genome_node .findall ('.//{http://orthoXML.org/2011/}gene' ):
2526 gene_id = gene .get ('id' )
2627 gene_prot_id = gene .get (self .gene_attr )
27- generef_2_xref [gene_id ] = Gene (gene_prot_id , genome_name , gene_id )
28- self .nr_genes_per_species [genome_name ] += 1
28+ is_main = gene .get ('main_isoform' , gene_id ) == gene_id
29+ generef_2_xref [gene_id ] = Gene (gene_prot_id , genome_name , gene_id , is_main )
30+ if is_main :
31+ self .nr_genes_per_species [genome_name ] += 1
32+ self .nr_proteins_per_species [genome_name ] += 1
2933 self .genes .update (generef_2_xref )
3034
3135 def gene_in_group (self , gene_id ):
@@ -37,7 +41,7 @@ def get_singletons(self):
3741 def summary (self ):
3842 single = collections .defaultdict (int )
3943 for g in self .genes .values ():
40- single [g .species ] += 1
44+ single [g .species ] += 1 if g . is_main_isoform else 0
4145 return [{'species' : g , 'genes' : self .nr_genes_per_species [g ], 'not_in_group' : single [g ]}
4246 for g in self .nr_genes_per_species ]
4347
0 commit comments