Skip to content

Commit 4743c94

Browse files
committed
Merge branch 'rf-roothog-inference' into dev
2 parents 264abdb + f34e526 commit 4743c94

2 files changed

Lines changed: 37 additions & 23 deletions

File tree

FastOMA/fastoma_notebook_stat.ipynb

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
},
2828
"source": [
2929
"import os\n",
30+
"import sys\n",
3031
"import logging\n",
3132
"\n",
3233
"# Scientific libraries\n",
@@ -259,12 +260,15 @@
259260
" hog_df = pd.DataFrame.from_records(extract_hog_info.parse_orthoxml(xml, genome_coverage_stats))\n",
260261
"hog_summary_df = pd.DataFrame.from_records(genome_coverage_stats.summary())\n",
261262
"df_seq = pd.merge(hog_summary_df, protein_df.groupby(\"species\", as_index=False).count(), on='species')\n",
262-
"df_seq['minor_splice'] = df_seq['prot_len']-df_seq['genes']-df_seq['not_in_group']\n",
263-
"df_seq['in_group'] = df_seq['genes']\n",
264-
"df_seq = df_seq[['species', 'in_group', 'not_in_group','minor_splice']]\n",
263+
"if (df_seq[\"prot_len\"] != df_seq[\"proteins\"]).any():\n",
264+
" problem = df_seq[df_seq[\"prot_len\"] != df_seq[\"proteins\"]]\n",
265+
" print(\"ERROR: The number of proteins in the input proteome file and in the FastOMA_HOGs.orthoxml file do not match for the following species:\", file=sys.stderr)\n",
266+
" print(problem, file=sys.stderr)\n",
267+
"df_seq[\"in_group\"] = df_seq[\"genes\"] - df_seq[\"not_in_group\"]\n",
268+
"df_seq[\"fraction_in_group\"] = df_seq[\"in_group\"] / df_seq[\"genes\"]\n",
269+
"df_seq = df_seq[['species', 'proteins', 'in_group', 'not_in_group','minor_splice', 'fraction_in_group']]\n",
265270
"order = species_tree.get_leaf_names()\n",
266271
"df_seq = df_seq.sort_values(by=['species'], key=lambda s: s.apply(order.index)).set_index('species')\n",
267-
"placed = df_seq[\"in_group\"] / (df_seq[\"in_group\"] + df_seq[\"not_in_group\"])\n",
268272
"df_seq"
269273
],
270274
"outputs": [],
@@ -285,7 +289,7 @@
285289
"id": "96b4619e-529b-48f4-8795-89f48f78fe1b",
286290
"metadata": {},
287291
"source": [
288-
"df_seq.plot(kind='bar', stacked=True)\n",
292+
"df_seq[[\"in_group\",\"not_in_group\",\"minor_splice\"]].plot(kind='bar', stacked=True)\n",
289293
"plt.title('Number of proteins in HOGs / singletons / minor splice variants', fontsize=16)\n",
290294
"plt.xlabel('Species')\n",
291295
"plt.ylabel('Counts')\n",
@@ -295,42 +299,47 @@
295299
"execution_count": null
296300
},
297301
{
298-
"metadata": {},
299302
"cell_type": "markdown",
300-
"source": "Next, we check the fraction of genes that are placed in any HOG per species. For this, we ignore the minor splice variants, as they are not used for orthology inference:",
301-
"id": "4c3e6614634a9cd2"
303+
"id": "4c3e6614634a9cd2",
304+
"metadata": {},
305+
"source": [
306+
"Next, we check the fraction of genes that are placed in any HOG per species. For this, we ignore the minor splice variants, as they are not used for orthology inference:"
307+
]
302308
},
303309
{
304-
"metadata": {},
305310
"cell_type": "code",
311+
"id": "3af6b633d31c14a8",
312+
"metadata": {},
306313
"source": [
307-
"placed.plot(kind=\"bar\")\n",
314+
"df_seq[\"fraction_in_group\"].plot(kind=\"bar\")\n",
308315
"plt.xticks(rotation=90)\n",
309316
"plt.ylabel(\"Fraction\")\n",
310317
"plt.xlabel(\"Species\")\n",
311318
"plt.title(\"Fraction of genes placed in HOGs per species\");"
312319
],
313-
"id": "3af6b633d31c14a8",
314320
"outputs": [],
315321
"execution_count": null
316322
},
317323
{
318-
"metadata": {},
319324
"cell_type": "markdown",
320-
"source": "We summarize the statistics of the fraction of genes placed in a HOG per species in a violine plot:",
321-
"id": "3d90db1d820e1c39"
325+
"id": "3d90db1d820e1c39",
326+
"metadata": {},
327+
"source": [
328+
"We summarize the statistics of the fraction of genes placed in a HOG per species in a violin plot:"
329+
]
322330
},
323331
{
324-
"metadata": {},
325332
"cell_type": "code",
326-
"outputs": [],
327-
"execution_count": null,
333+
"id": "f48758e47095a348",
334+
"metadata": {},
328335
"source": [
329336
"plt.figure(figsize=(12, 3))\n",
330-
"sns.violinplot(data=placed, orient=\"h\")\n",
331-
"plt.title(\"Violin Plot of the Fraction of genes placed in HOGs\");"
337+
"sns.violinplot(data=df_seq[\"fraction_in_group\"], orient=\"h\")\n",
338+
"plt.xlabel(\"Fraction of genes placed in HOGs\")\n",
339+
"plt.title(\"Violin Plot of the Fraction of genes placed in HOGs per species\");"
332340
],
333-
"id": "f48758e47095a348"
341+
"outputs": [],
342+
"execution_count": null
334343
},
335344
{
336345
"cell_type": "markdown",
@@ -563,7 +572,8 @@
563572
" treeprofile = ham_analysis.create_tree_profile(outfile=phylostratigraphy)\n",
564573
"\n",
565574
" from IPython.display import IFrame\n",
566-
" IFrame(os.path.basename(phylostratigraphy), width=800, height=600)"
575+
" frame = IFrame(os.path.basename(phylostratigraphy), width=800, height=600)\n",
576+
" display(frame)"
567577
],
568578
"outputs": [],
569579
"execution_count": null

FastOMA/zoo/hog/extract_hog_info.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,11 @@ def summary(self):
4242
single = collections.defaultdict(int)
4343
for g in self.genes.values():
4444
single[g.species] += 1 if g.is_main_isoform else 0
45-
return [{'species': g, 'genes': self.nr_genes_per_species[g], 'not_in_group': single[g]}
45+
return [{'species': g,
46+
'genes': self.nr_genes_per_species[g],
47+
'not_in_group': single[g],
48+
'proteins': self.nr_proteins_per_species[g],
49+
'minor_splice': self.nr_proteins_per_species[g] - self.nr_genes_per_species[g]}
4650
for g in self.nr_genes_per_species]
4751

4852

@@ -100,4 +104,4 @@ def collect_genes(elem):
100104
genome_coverage_stats = SpeciesAnalyser()
101105
with open(conf.orthoxml, 'rt') as xml:
102106
for group in parse_orthoxml(xml, genome_coverage_stats):
103-
print(group)
107+
print(group)

0 commit comments

Comments
 (0)