@@ -9,157 +9,8 @@ params.hogmap_in = params.hogmap_in ?: "${params.input_folder}/hogmap_in"
 params.splice_folder = params.splice_folder ?: "${params.input_folder}/splice"
 params.species_tree = params.species_tree ?: "${params.input_folder}/species_tree.nwk"
 
-/*
-
-if (params.help) {
-    log.info """
-    ===========================================
-      FastOMA -- PIPELINE
-    ===========================================
-    Usage:
-    Run the pipeline with default parameters:
-    nextflow run FastOMA.nf
-
-    Run with user parameters:
-    nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
-
-    Mandatory arguments:
-      --input_folder          Input data folder. Defaults to ${params.input_folder}. This folder
-                              must contain the proteomes (in a subfolder named 'proteome') and
-                              a species tree file. Optionally the folder might contain
-                                - a sub-folder 'splice' containing splicing variant mappings
-                                - a sub-folder 'hogmap_in' containing precomputed OMAmer
-                                  placement results for all proteomes
-
-                              All sub-folders and sub-files can also be placed in other
-                              locations if you provide alternative values for them (see the
-                              optional arguments section below).
-
-      --output_folder         Path where all the output should be stored. Defaults to
-                              ${params.output_folder}
-
-
-    Profile selection:
-      -profile                FastOMA can be run using several execution profiles. The default
-                              set of available profiles is
-                                - docker            Run pipeline using docker containers. Docker needs
-                                                    to be installed on your system. Containers will be
-                                                    fetched automatically from dockerhub. See also
-                                                    additional options '--container_version' and
-                                                    '--container_name'.
-
-                                - singularity       Run pipeline using singularity. Singularity needs
-                                                    to be installed on your system. On HPC clusters,
-                                                    it often needs to be loaded as a separate module.
-                                                    Containers will be fetched automatically from
-                                                    dockerhub. See also additional options
-                                                    '--container_version' and '--container_name'.
-
-                                - conda             Run pipeline in a conda environment. Conda needs
-                                                    to be installed on your system. The environment
-                                                    will be created automatically.
-
-                                - standard          Run pipeline on your local system. Mainly intended
-                                                    for development purposes. All dependencies must be
-                                                    installed in the calling environment.
-
-                                - slurm_singularity
-                                                    Run pipeline using the SLURM job scheduler and
-                                                    singularity containers. This profile can also be a
-                                                    template for other HPC clusters that use different
-                                                    schedulers.
-
-                                - slurm_conda       Run pipeline using the SLURM job scheduler and a
-                                                    conda environment.
-
-                              Profiles are defined in nextflow.config and can be extended or
-                              adjusted according to your needs.
-
-
-    Additional options:
-      --proteome_folder       Overwrite location of proteomes (default ${params.proteome_folder})
-      --species_tree          Overwrite location of species tree file (newick format).
-                              Defaults to ${params.species_tree}
-      --splice_folder         Overwrite location of splice file folder. The splice files must be
-                              named <proteome_file>.splice.
-                              Defaults to ${params.splice_folder}
-      --omamer_db             Path or URL to download the OMAmer database from.
-                              Defaults to ${params.omamer_db}
-      --hogmap_in             Optional path where precomputed omamer mapping files are located.
-                              Defaults to ${params.hogmap_in}
-      --fasta_header_id_transformer
-                              Choice of transformer for input proteome fasta headers
-                              to reported IDs in output files (e.g. orthoxml files).
-                              Defaults to '${params.fasta_header_id_transformer}', and can be set to
-                                - noop    : no transformation (input header == output header)
-                                - UniProt : extract accession from uniprot header,
-                                            e.g. '>sp|P68250|1433B_BOVIN' --> 'P68250'
-
-    Algorithmic parameters:
-      --nr_repr_per_hog       The maximum number of representatives per subhog to keep during the
-                              inference. Higher values lead to slightly higher runtime.
-                              Defaults to ${params.nr_repr_per_hog}.
-      --filter_method         The filtering method applied to the MSAs before tree building.
-                              Must be one of "col-row-threshold", "col-elbow-row-threshold", "trimal".
-                              Defaults to ${params.filter_method}.
-      --min_sequence_length   Minimum length of a sequence to be considered for orthology
-                              inference. Too short sequences tend to be problematic.
-                              Defaults to ${params.min_sequence_length}.
-
-
-    Flags:
-      --help                  Display this message
-      --debug_enabled         Store additional information that might be helpful to debug in case
-                              of a problem with FastOMA.
-      --write_msas            MSAs used during inference of subhogs will be stored at
-                              every taxonomic level.
-      --write_genetrees       Inferred gene trees will be stored at every taxonomic level.
-      --force_pairwise_ortholog_generation
-                              Force producing the pairwise orthologs.tsv.gz file even if the
-                              dataset contains many proteomes. By default, FastOMA produces the
-                              pairwise ortholog file only if there are at most 25 proteomes in
-                              the dataset.
-      --report                Produce a nextflow report and timeline and store them in
-                              ${params.statdir}
-
-    """.stripIndent()
-
-    exit 1
-}
-
-
-log.info """
-===========================================
-  FastOMA -- PIPELINE
-===========================================
-
-Project : ${workflow.projectDir}
-Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
-Cmd line: ${workflow.commandLine}
-Manifest's pipeline version: ${workflow.manifest.version}
-
-Parameters:
-  input_folder                        ${params.input_folder}
-  proteome_folder                     ${params.proteome_folder}
-  species_tree                        ${params.species_tree}
-  splice_folder                       ${params.splice_folder}
-  omamer_db                           ${params.omamer_db}
-  hogmap_in                           ${params.hogmap_in}
-  fasta_header_id_transformer         ${params.fasta_header_id_transformer}
-
-  filter_method                       ${params.filter_method}
-  filter_gap_ratio_row                ${params.filter_gap_ratio_row}
-  filter_gap_ratio_col                ${params.filter_gap_ratio_col}
-  nr_repr_per_hog                     ${params.nr_repr_per_hog}
-  min_sequence_length                 ${params.min_sequence_length}
-
-  debug_enabled                       ${params.debug_enabled}
-  report                              ${params.report}
-  force_pairwise_ortholog_generation  ${params.force_pairwise_ortholog_generation}
-""".stripIndent()
-*/
-
-process fetchTestData {
+// Utility process to fetch remote datasets
+process fetchRemoteData {
     // Cache in a dedicated cache directory
     storeDir "${params.test_data_cache ?: "${launchDir}/.test-datasets"}"
     tag "fetch data from ${url}"
@@ -171,7 +22,7 @@ process fetchTestData {
         path "${dataset_name}", emit: testDataDir
 
     script:
-    dataset_name = url.tokenize('/').last().replaceAll(/\.(tar\.gz|tgz|zip)\?.*$/, '')
+    dataset_name = url.toString().tokenize('/').last().replaceAll(/\.(tar\.gz|tgz)(\?.*)?$/, '')
     """
     python3 - <<'EOF'
 import os
@@ -203,6 +54,35 @@
 }
 
 
+// Utility process to extract a local archive file as input data
+process extractLocalArchive {
+    storeDir "${params.test_data_cache ?: "${launchDir}/.test-datasets"}"
+    tag "extract ${archive.name}"
+
+    input:
+        path archive
+
+    output:
+        path "${archive.baseName}", emit: extractedDir
+
+    script:
+    """
+    echo "Extracting local archive: ${archive}"
+
+    mkdir -p ${archive.baseName}
+
+    if [[ "${archive}" == *.zip ]]; then
+        unzip -q ${archive} -d ${archive.baseName}
+    else
+        tar -xzf ${archive} -C ${archive.baseName}
+    fi
+
+    echo "Extraction completed."
+    ls -la ${archive.baseName}/
+    """
+}
+
+
 process check_input{
     label "process_single"
     publishDir params.output_folder, mode: 'copy'
@@ -464,6 +344,27 @@ process fastoma_report {
     """
 }
 
+// Helper function to detect input type
+def detectInputType(input) {
+    if (input.startsWith('http://') || input.startsWith('https://') || input.startsWith('ftp://')) {
+        return 'url'
+    } else if (input.endsWith('.tar.gz') || input.endsWith('.tgz') || input.endsWith('.zip')) {
+        def archive = file(input)
+        if (!archive.exists() || !archive.isFile()) {
+            log.error "Input archive file does not exist: ${input}"
+            exit 1
+        }
+        return 'archive'
+    } else {
+        def dir = file(input)
+        if (!dir.exists() || !dir.isDirectory()) {
+            log.error "Input directory does not exist: ${input}"
+            exit 1
+        }
+        return 'directory'
+    }
+}
+
 workflow {
     // Print help message if requested
     if (params.help) {
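For reference, the classification performed by detectInputType in the hunk above reduces to plain string checks on the parameter value. A minimal standalone Groovy sketch of just those rules (the classify closure and the example values are illustrative placeholders, and the file-existence checks of the real function are omitted):

    // Sketch only: mirrors the string-based classification of detectInputType,
    // without the file()/exists() validation the pipeline performs.
    def classify = { String input ->
        if (input.startsWith('http://') || input.startsWith('https://') || input.startsWith('ftp://')) {
            return 'url'
        }
        if (input.endsWith('.tar.gz') || input.endsWith('.tgz') || input.endsWith('.zip')) {
            return 'archive'
        }
        return 'directory'
    }

    assert classify('https://example.org/testdata.tar.gz') == 'url'   // placeholder URL
    assert classify('testdata.tgz') == 'archive'                      // placeholder file name
    assert classify('my_input_folder') == 'directory'                 // placeholder directory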
@@ -477,12 +378,29 @@ workflow {
     // Print parameter summary
     log.info paramsSummaryLog(workflow)
 
-    // Handle data source
-    if (params.test_data_url) {
+    // Detect input type
+    def inputType = detectInputType(params.input_folder)
+    log.info "Detected input type '${inputType}' for: ${params.input_folder}"
+    if (inputType == "directory") {
+        log.info "Using local input folder: ${params.input_folder}"
+        // Local/custom dataset - allow parameter overrides
+        proteome_folder = Channel.value(params.proteome_folder)
+        proteomes = Channel.fromPath("${params.proteome_folder}/*.{fa,fasta}", checkIfExists: true)
+        species_tree = Channel.value(params.species_tree)
+        splice_folder = Channel.value(params.splice_folder)
+        hogmap_in = Channel.value(params.hogmap_in)
+    } else {
+        // Input is either a URL or an archive file - fetch and extract
         // Fetch test dataset from remote URL
-        log.info "Fetching test dataset from: ${params.test_data_url}"
-        input_folder_path = fetchTestData(params.test_data_url)
+        if (inputType == "url") {
+            log.info "Fetching test dataset from URL: ${params.input_folder}"
+            input_folder_path = fetchRemoteData(Channel.value(params.input_folder))
+        } else if (inputType == "archive") {
+            log.info "Extracting test dataset from local archive: ${params.input_folder}"
 
+            input_folder_path = extractLocalArchive(Channel.fromPath(params.input_folder))
+        }
+
         // Set up all channels based on the downloaded folder structure
         proteome_folder = input_folder_path.map { "${it}/proteome" }
         proteomes = input_folder_path.flatMap { dir ->
@@ -493,14 +411,7 @@ workflow {
         species_tree = input_folder_path.map { "${it}/species_tree.nwk" }
         splice_folder = input_folder_path.map { "${it}/splice" }
         hogmap_in = input_folder_path.map { "${it}/hogmap_in" }
-    } else {
-        // Local/custom dataset - allow parameter overrides
-        proteome_folder = Channel.value(params.proteome_folder)
-        proteomes = Channel.fromPath("${params.proteome_folder}/*.{fa,fasta}", checkIfExists: true)
-        species_tree = Channel.value(params.species_tree)
-        splice_folder = Channel.value(params.splice_folder)
-        hogmap_in = Channel.value(params.hogmap_in)
-    }
+    }
 
     // Static channels
     omamerdb = Channel.fromPath(params.omamer_db)
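With this change, the value of --input_folder itself determines how the input data is obtained; the former --test_data_url parameter is no longer consulted in the workflow block. A hedged usage sketch (the URL and paths below are placeholders, not taken from the commit):

    # Remote archive: downloaded by fetchRemoteData and cached under params.test_data_cache
    # (or ./.test-datasets by default)
    nextflow run FastOMA.nf --input_folder https://example.org/testdata.tar.gz

    # Local archive (.tar.gz, .tgz or .zip): unpacked by extractLocalArchive
    nextflow run FastOMA.nf --input_folder ./testdata.tar.gz

    # Plain directory: used as before, with overrides such as --species_tree still honoured
    nextflow run FastOMA.nf --input_folder ./my_dataset --output_folder ./results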