Merge branch 'rf-roothog-inference' into dev

alpae · alpae · commit 4743c9492d27 · 2025-12-04T11:36:05.000+01:00
diff --git a/FastOMA/fastoma_notebook_stat.ipynb b/FastOMA/fastoma_notebook_stat.ipynb
@@ -27,6 +27,7 @@
    },
    "source": [
     "import os\n",
+    "import sys\n",
     "import logging\n",
     "\n",
     "# Scientific libraries\n",
@@ -259,12 +260,15 @@
     "    hog_df = pd.DataFrame.from_records(extract_hog_info.parse_orthoxml(xml, genome_coverage_stats))\n",
     "hog_summary_df = pd.DataFrame.from_records(genome_coverage_stats.summary())\n",
     "df_seq = pd.merge(hog_summary_df, protein_df.groupby(\"species\", as_index=False).count(), on='species')\n",
-    "df_seq['minor_splice'] = df_seq['prot_len']-df_seq['genes']-df_seq['not_in_group']\n",
-    "df_seq['in_group'] = df_seq['genes']\n",
-    "df_seq = df_seq[['species', 'in_group', 'not_in_group','minor_splice']]\n",
+    "if (df_seq[\"prot_len\"] != df_seq[\"proteins\"]).any():\n",
+    "    problem = df_seq[df_seq[\"prot_len\"] != df_seq[\"proteins\"]]\n",
+    "    print(\"ERROR: The number of proteins in the input proteome file and in the FastOMA_HOGs.orthoxml file do not match for the following species:\", file=sys.stderr)\n",
+    "    print(problem, file=sys.stderr)\n",
+    "df_seq[\"in_group\"] = df_seq[\"genes\"] - df_seq[\"not_in_group\"]\n",
+    "df_seq[\"fraction_in_group\"] = df_seq[\"in_group\"] / df_seq[\"genes\"]\n",
+    "df_seq = df_seq[['species', 'proteins', 'in_group', 'not_in_group','minor_splice', 'fraction_in_group']]\n",
     "order = species_tree.get_leaf_names()\n",
     "df_seq = df_seq.sort_values(by=['species'], key=lambda s: s.apply(order.index)).set_index('species')\n",
-    "placed = df_seq[\"in_group\"] / (df_seq[\"in_group\"] + df_seq[\"not_in_group\"])\n",
     "df_seq"
    ],
    "outputs": [],
@@ -285,7 +289,7 @@
    "id": "96b4619e-529b-48f4-8795-89f48f78fe1b",
    "metadata": {},
    "source": [
-    "df_seq.plot(kind='bar', stacked=True)\n",
+    "df_seq[[\"in_group\",\"not_in_group\",\"minor_splice\"]].plot(kind='bar', stacked=True)\n",
     "plt.title('Number of proteins in HOGs / singletons / minor splice variants', fontsize=16)\n",
     "plt.xlabel('Species')\n",
     "plt.ylabel('Counts')\n",
@@ -295,42 +299,47 @@
    "execution_count": null
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "Next, we check the fraction of genes that are placed in any HOG per species. For this, we ignore the minor splice variants, as they are not used for orthology inference:",
-   "id": "4c3e6614634a9cd2"
+   "id": "4c3e6614634a9cd2",
+   "metadata": {},
+   "source": [
+    "Next, we check the fraction of genes that are placed in any HOG per species. For this, we ignore the minor splice variants, as they are not used for orthology inference:"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "code",
+   "id": "3af6b633d31c14a8",
+   "metadata": {},
    "source": [
-    "placed.plot(kind=\"bar\")\n",
+    "df_seq[\"fraction_in_group\"].plot(kind=\"bar\")\n",
     "plt.xticks(rotation=90)\n",
     "plt.ylabel(\"Fraction\")\n",
     "plt.xlabel(\"Species\")\n",
     "plt.title(\"Fraction of genes placed in HOGs per species\");"
    ],
-   "id": "3af6b633d31c14a8",
    "outputs": [],
    "execution_count": null
   },
   {
-   "metadata": {},
    "cell_type": "markdown",
-   "source": "We summarize the statistics of the fraction of genes placed in a HOG per species in a violine plot:",
-   "id": "3d90db1d820e1c39"
+   "id": "3d90db1d820e1c39",
+   "metadata": {},
+   "source": [
+    "We summarize the statistics of the fraction of genes placed in a HOG per species in a violin plot:"
+   ]
   },
   {
-   "metadata": {},
    "cell_type": "code",
-   "outputs": [],
-   "execution_count": null,
+   "id": "f48758e47095a348",
+   "metadata": {},
    "source": [
     "plt.figure(figsize=(12, 3))\n",
-    "sns.violinplot(data=placed, orient=\"h\")\n",
-    "plt.title(\"Violin Plot of the Fraction of genes placed in HOGs\");"
+    "sns.violinplot(data=df_seq[\"fraction_in_group\"], orient=\"h\")\n",
+    "plt.xlabel(\"Fraction of genes placed in HOGs\")\n",
+    "plt.title(\"Violin Plot of the Fraction of genes placed in HOGs per species\");"
    ],
-   "id": "f48758e47095a348"
+   "outputs": [],
+   "execution_count": null
   },
   {
    "cell_type": "markdown",
@@ -563,7 +572,8 @@
     "    treeprofile = ham_analysis.create_tree_profile(outfile=phylostratigraphy)\n",
     "\n",
     "    from IPython.display import IFrame\n",
-    "    IFrame(os.path.basename(phylostratigraphy), width=800, height=600)"
+    "    frame = IFrame(os.path.basename(phylostratigraphy), width=800, height=600)\n",
+    "    display(frame)"
    ],
    "outputs": [],
    "execution_count": null
diff --git a/FastOMA/zoo/hog/extract_hog_info.py b/FastOMA/zoo/hog/extract_hog_info.py
@@ -42,7 +42,11 @@ def summary(self):
         single = collections.defaultdict(int)
         for g in self.genes.values():
             single[g.species] += 1 if g.is_main_isoform else 0
-        return [{'species': g, 'genes': self.nr_genes_per_species[g], 'not_in_group': single[g]}
+        return [{'species': g, 
+                 'genes': self.nr_genes_per_species[g], 
+                 'not_in_group': single[g], 
+                 'proteins': self.nr_proteins_per_species[g], 
+                 'minor_splice': self.nr_proteins_per_species[g] - self.nr_genes_per_species[g]}
                 for g in self.nr_genes_per_species]
 
 
@@ -100,4 +104,4 @@ def collect_genes(elem):
     genome_coverage_stats = SpeciesAnalyser()
     with open(conf.orthoxml, 'rt') as xml:
         for group in parse_orthoxml(xml, genome_coverage_stats):
-            print(group)
+            print(group)