|
27 | 27 | }, |
28 | 28 | "source": [ |
29 | 29 | "import os\n", |
| 30 | + "import sys\n", |
30 | 31 | "import logging\n", |
31 | 32 | "\n", |
32 | 33 | "# Scientific libraries\n", |
|
259 | 260 | " hog_df = pd.DataFrame.from_records(extract_hog_info.parse_orthoxml(xml, genome_coverage_stats))\n", |
260 | 261 | "hog_summary_df = pd.DataFrame.from_records(genome_coverage_stats.summary())\n", |
261 | 262 | "df_seq = pd.merge(hog_summary_df, protein_df.groupby(\"species\", as_index=False).count(), on='species')\n", |
262 | | - "df_seq['minor_splice'] = df_seq['prot_len']-df_seq['genes']-df_seq['not_in_group']\n", |
263 | | - "df_seq['in_group'] = df_seq['genes']\n", |
264 | | - "df_seq = df_seq[['species', 'in_group', 'not_in_group','minor_splice']]\n", |
| 263 | + "if (df_seq[\"prot_len\"] != df_seq[\"proteins\"]).any():\n", |
| 264 | + " problem = df_seq[df_seq[\"prot_len\"] != df_seq[\"proteins\"]]\n", |
| 265 | + " print(\"ERROR: The number of proteins in the input proteome file and in the FastOMA_HOGs.orthoxml file do not match for the following species:\", file=sys.stderr)\n", |
| 266 | + " print(problem, file=sys.stderr)\n", |
| 267 | + "df_seq[\"in_group\"] = df_seq[\"genes\"] - df_seq[\"not_in_group\"]\n", |
| 268 | + "df_seq[\"fraction_in_group\"] = df_seq[\"in_group\"] / df_seq[\"genes\"]\n", |
| 269 | + "df_seq = df_seq[['species', 'proteins', 'in_group', 'not_in_group','minor_splice', 'fraction_in_group']]\n", |
265 | 270 | "order = species_tree.get_leaf_names()\n", |
266 | 271 | "df_seq = df_seq.sort_values(by=['species'], key=lambda s: s.apply(order.index)).set_index('species')\n", |
267 | | - "placed = df_seq[\"in_group\"] / (df_seq[\"in_group\"] + df_seq[\"not_in_group\"])\n", |
268 | 272 | "df_seq" |
269 | 273 | ], |
270 | 274 | "outputs": [], |
|
285 | 289 | "id": "96b4619e-529b-48f4-8795-89f48f78fe1b", |
286 | 290 | "metadata": {}, |
287 | 291 | "source": [ |
288 | | - "df_seq.plot(kind='bar', stacked=True)\n", |
| 292 | + "df_seq[[\"in_group\",\"not_in_group\",\"minor_splice\"]].plot(kind='bar', stacked=True)\n", |
289 | 293 | "plt.title('Number of proteins in HOGs / singletons / minor splice variants', fontsize=16)\n", |
290 | 294 | "plt.xlabel('Species')\n", |
291 | 295 | "plt.ylabel('Counts')\n", |
|
295 | 299 | "execution_count": null |
296 | 300 | }, |
297 | 301 | { |
298 | | - "metadata": {}, |
299 | 302 | "cell_type": "markdown", |
300 | | - "source": "Next, we check the fraction of genes that are placed in any HOG per species. For this, we ignore the minor splice variants, as they are not used for orthology inference:", |
301 | | - "id": "4c3e6614634a9cd2" |
| 303 | + "id": "4c3e6614634a9cd2", |
| 304 | + "metadata": {}, |
| 305 | + "source": [ |
| 306 | + "Next, we check the fraction of genes that are placed in any HOG per species. For this, we ignore the minor splice variants, as they are not used for orthology inference:" |
| 307 | + ] |
302 | 308 | }, |
303 | 309 | { |
304 | | - "metadata": {}, |
305 | 310 | "cell_type": "code", |
| 311 | + "id": "3af6b633d31c14a8", |
| 312 | + "metadata": {}, |
306 | 313 | "source": [ |
307 | | - "placed.plot(kind=\"bar\")\n", |
| 314 | + "df_seq[\"fraction_in_group\"].plot(kind=\"bar\")\n", |
308 | 315 | "plt.xticks(rotation=90)\n", |
309 | 316 | "plt.ylabel(\"Fraction\")\n", |
310 | 317 | "plt.xlabel(\"Species\")\n", |
311 | 318 | "plt.title(\"Fraction of genes placed in HOGs per species\");" |
312 | 319 | ], |
313 | | - "id": "3af6b633d31c14a8", |
314 | 320 | "outputs": [], |
315 | 321 | "execution_count": null |
316 | 322 | }, |
317 | 323 | { |
318 | | - "metadata": {}, |
319 | 324 | "cell_type": "markdown", |
320 | | - "source": "We summarize the statistics of the fraction of genes placed in a HOG per species in a violine plot:", |
321 | | - "id": "3d90db1d820e1c39" |
| 325 | + "id": "3d90db1d820e1c39", |
| 326 | + "metadata": {}, |
| 327 | + "source": [ |
| 328 | + "We summarize the statistics of the fraction of genes placed in a HOG per species in a violin plot:" |
| 329 | + ] |
322 | 330 | }, |
323 | 331 | { |
324 | | - "metadata": {}, |
325 | 332 | "cell_type": "code", |
326 | | - "outputs": [], |
327 | | - "execution_count": null, |
| 333 | + "id": "f48758e47095a348", |
| 334 | + "metadata": {}, |
328 | 335 | "source": [ |
329 | 336 | "plt.figure(figsize=(12, 3))\n", |
330 | | - "sns.violinplot(data=placed, orient=\"h\")\n", |
331 | | - "plt.title(\"Violin Plot of the Fraction of genes placed in HOGs\");" |
| 337 | + "sns.violinplot(data=df_seq[\"fraction_in_group\"], orient=\"h\")\n", |
| 338 | + "plt.xlabel(\"Fraction of genes placed in HOGs\")\n", |
| 339 | + "plt.title(\"Violin Plot of the Fraction of genes placed in HOGs per species\");" |
332 | 340 | ], |
333 | | - "id": "f48758e47095a348" |
| 341 | + "outputs": [], |
| 342 | + "execution_count": null |
334 | 343 | }, |
335 | 344 | { |
336 | 345 | "cell_type": "markdown", |
|
563 | 572 | " treeprofile = ham_analysis.create_tree_profile(outfile=phylostratigraphy)\n", |
564 | 573 | "\n", |
565 | 574 | " from IPython.display import IFrame\n", |
566 | | - " IFrame(os.path.basename(phylostratigraphy), width=800, height=600)" |
| 575 | + " frame = IFrame(os.path.basename(phylostratigraphy), width=800, height=600)\n", |
| 576 | + " display(frame)" |
567 | 577 | ], |
568 | 578 | "outputs": [], |
569 | 579 | "execution_count": null |
|
0 commit comments