Skip to content

Commit 61a29d7

Browse files
committed
filter non-main isoforms for stats
1 parent 73f6d3e commit 61a29d7

1 file changed

Lines changed: 8 additions & 4 deletions

File tree

FastOMA/zoo/hog/extract_hog_info.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,14 +6,15 @@
66
import logging
77
logger = logging.getLogger(__name__)
88

9-
Gene = collections.namedtuple("Gene", "xref species internal_id")
9+
Gene = collections.namedtuple("Gene", "xref species internal_id is_main_isoform")
1010

1111

1212
class SpeciesAnalyser:
1313
def __init__(self, gene_attr="protId"):
1414
self.gene_attr = gene_attr
1515
self.genes = {}
1616
self.nr_genes_per_species = collections.defaultdict(int)
17+
self.nr_proteins_per_species = collections.defaultdict(int)
1718

1819
def add_genome_genes(self, genome_node):
1920
genome_name = genome_node.get('name', None)
@@ -24,8 +25,11 @@ def add_genome_genes(self, genome_node):
2425
for gene in genome_node.findall('.//{http://orthoXML.org/2011/}gene'):
2526
gene_id = gene.get('id')
2627
gene_prot_id = gene.get(self.gene_attr)
27-
generef_2_xref[gene_id] = Gene(gene_prot_id, genome_name, gene_id)
28-
self.nr_genes_per_species[genome_name] += 1
28+
is_main = gene.get('main_isoform', gene_id) == gene_id
29+
generef_2_xref[gene_id] = Gene(gene_prot_id, genome_name, gene_id, is_main)
30+
if is_main:
31+
self.nr_genes_per_species[genome_name] += 1
32+
self.nr_proteins_per_species[genome_name] += 1
2933
self.genes.update(generef_2_xref)
3034

3135
def gene_in_group(self, gene_id):
@@ -37,7 +41,7 @@ def get_singletons(self):
3741
def summary(self):
3842
single = collections.defaultdict(int)
3943
for g in self.genes.values():
40-
single[g.species] += 1
44+
single[g.species] += 1 if g.is_main_isoform else 0
4145
return [{'species': g, 'genes': self.nr_genes_per_species[g], 'not_in_group': single[g]}
4246
for g in self.nr_genes_per_species]
4347

0 commit comments

Comments
 (0)