Commit b0ca702

rewritten to use input_folder as either dataset folder, or remote or local archive
1 parent 29b0536 commit b0ca702

2 files changed

Lines changed: 82 additions & 165 deletions

FastOMA.nf

Lines changed: 75 additions & 164 deletions
@@ -9,157 +9,8 @@ params.hogmap_in = params.hogmap_in ?: "${params.input_folder}/hogmap_in"
 params.splice_folder = params.splice_folder ?: "${params.input_folder}/splice"
 params.species_tree = params.species_tree ?: "${params.input_folder}/species_tree.nwk"
 
-/*
-
-if (params.help) {
-    log.info """
-    ===========================================
-      FastOMA -- PIPELINE
-    ===========================================
-    Usage:
-    Run the pipeline with default parameters:
-    nexflow run FastOMA.nf
-
-    Run with user parameters:
-    nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
-
-    Mandatory arguments:
-        --input_folder        Input data folder. Defaults to ${params.input_folder}. This folder
-                              must contain the proteomes (in a subfolder named 'proteome') and
-                              a species tree file. Optionally the folder might contain
-                               - a sub-folder 'splice' containing splicing variant mappings
-                               - a sub-folder 'hogmap_in' containing precomputed OMAmer
-                                 placement results for all proteomes
-
-                              All sub-folders and sub-files can also be placed in orther
-                              locations if you provide alternative values for them (see below on
-                              optional arguments section).
-
-        --output_folder       Path where all the output should be stored. Defaults to
-                              ${params.output_folder}
-
-
-    Profile selection:
-        -profile              FastOMA can be run using several execution profiles. The default
-                              set of available profiles is
-                               - docker         Run pipeline using docker containers. Docker needs
-                                                to be installed on your system. Containers will be
-                                                fetched automatically from dockerhub. See also
-                                                additional options '--container_version' and
-                                                '--container_name'.
-
-                               - singlularity   Run pipeline using singularity. Singularity needs
-                                                to be installed on your system. On HPC clusters,
-                                                it often needs to be loaded as a seperate module.
-                                                Containers will be fetched automatically from
-                                                dockerhub. See also additional options
-                                                '--container_version' and '--container_name'.
-
-                               - conda          Run pipeline in a conda environment. Conda needs
-                                                to be installed on your system. The environment
-                                                will be created automatically.
-
-                               - standard       Run pipeline on your local system. Mainly intended
-                                                for development purpose. All dependencies must be
-                                                installed in the calling environment.
-
-                               - slurm_singularity
-                                                Run pipeline using SLURM job scheduler and
-                                                singularity containers. This profile can also be a
-                                                template for other HPC clusters that use different
-                                                schedulers.
-
-                               - slurm_conda    Run pipeline using SLURM job scheduler and conda
-                                                environment.
-
-                              Profiles are defined in nextflow.config and can be extended or
-                              adjusted according to your needs.
-
-
-    Additional options:
-        --proteome_folder     Overwrite location of proteomes (default ${params.proteome_folder})
-        --species_tree        Overwrite location of species tree file (newick format).
-                              Defaults to ${params.species_tree}
-        --splice_folder       Overwrite location of splice file folder. The splice files must be
-                              named <proteome_file>.splice.
-                              Defaults to ${params.splice_folder}
-        --omamer_db           Path or URL to download the OMAmer database from.
-                              Defaults to ${params.omamer_db}
-        --hogmap_in           Optional path where precomputed omamer mapping files are located.
-                              Defaults to ${params.hogmap_in}
-        --fasta_header_id_transformer
-                              choice of transformers of input proteome fasta header
-                              to reported IDs in output files (e.g. orthoxml files)
-                              Defaults to '${params.fasta_header_id_transformer}', and can be set to
-                               - noop    : no transformation (input header == output header)
-                               - UniProt : extract accession from uniprot header
-                                           e.g. '>sp|P68250|1433B_BOVIN' --> 'P68250'
-
-    Algorithmic parameters:
-        --nr_repr_per_hog     The maximum number of representatives per subhog to keep during the
-                              inference. Higher values lead to slighlty higher runtime.
-                              Default to ${params.nr_repr_per_hog}.
-        --filter_method       The applied filtering method on the MSAs before tree building.
-                              must be one of "col-row-threshold", "col-elbow-row-threshold", "trimal".
-                              Defaults to ${params.filter_method}.
-        --min_sequence_length Minimum length of a sequence to be considered for orthology
-                              inference. Too short sequences tend to be problematic.
-                              Defaults to ${params.min_sequence_length}.
-
-
-    Flags:
-        --help                Display this message
-        --debug_enabled       Store addtional information that might be helpful to debug in case
-                              of a problem with FastOMA.
-        --write_msas          MSAs used during inference of subhogs will be stored at
-                              every taxonomic level.
-        --write_genetrees     Inferred gene trees will be stored at every taxonomic level.
-        --force_pairwise_ortholog_generation
-                              Force producing the pairwise orthologs.tsv.gz file even if the
-                              dataset contains many proteomes. By default, FastOMA produces the
-                              pairwise ortholog file only if there are at most 25 proteomes in
-                              the dataset.
-        --report              Produce nextflow report and timeline and store in in
-                              $params.statdir
-
-    """.stripIndent()
-
-    exit 1
-}
-
-
-log.info """
-    ===========================================
-      FastOMA -- PIPELINE
-    ===========================================
-
-    Project : ${workflow.projectDir}
-    Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
-    Cmd line: ${workflow.commandLine}
-    Manifest's pipeline version: ${workflow.manifest.version}
-
-    Parameters:
-        input_folder                        ${params.input_folder}
-        proteome folder                     ${params.proteome_folder}
-        species_tree                        ${params.species_tree}
-        splice_folder                       ${params.splice_folder}
-        omamer_db                           ${params.omamer_db}
-        hogmap_in                           ${params.hogmap_in}
-        fasta_header_id_transformer         ${params.fasta_header_id_transformer}
-
-        filter_method                       ${params.filter_method}
-        filter_gap_ratio_row                ${params.filter_gap_ratio_row}
-        filter_gap_ratio_col                ${params.filter_gap_ratio_col}
-        nr_repr_per_hog                     ${params.nr_repr_per_hog}
-        min_sequence_length                 ${params.min_sequence_length}
-
-        debug_enabled                       ${params.debug_enabled}
-        report                              ${params.report}
-        force_pairwise_ortholog_generation  ${params.force_pairwise_ortholog_generation}
-    """.stripIndent()
-*/
-
-process fetchTestData {
+// Utility process to fetch remote datasets
+process fetchRemoteData {
     // Cache in a dedicated cache directory
     storeDir "${params.test_data_cache ?: "${launchDir}/.test-datasets"}"
     tag "fetch data from ${url}"
@@ -171,7 +22,7 @@ process fetchTestData {
         path "${dataset_name}", emit: testDataDir
 
     script:
-    dataset_name=url.tokenize('/').last().replaceAll(/\.(tar\.gz|tgz|zip)\?.*$/, '')
+    dataset_name=url.toString().tokenize('/').last().replaceAll(/\.(tar\.gz|tgz)(\?.*)?$/, '')
     """
     python3 - <<'EOF'
     import os
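The changed line is the substance of this hunk: the old pattern `/\.(tar\.gz|tgz|zip)\?.*$/` only matched when a query string followed the extension, so a plain `testdata.tar.gz` URL kept its suffix, while the new pattern makes the query part optional (and drops `.zip`, which is now handled by the local-archive path). A rough Python equivalent of the new Groovy expression, for illustration only (the `dataset_name` helper name is mine):

```python
import re

def dataset_name(url: str) -> str:
    """Last URL path segment with .tar.gz/.tgz and any ?query suffix stripped."""
    last = url.split("/")[-1]
    # Same pattern as the new Groovy replaceAll: extension required, query optional.
    return re.sub(r"\.(tar\.gz|tgz)(\?.*)?$", "", last)
```

So both `testdata.tar.gz` and `testdata.tar.gz?token=abc` now resolve to the dataset name `testdata`.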
@@ -203,6 +54,35 @@ EOF
 }
 
 
+// Utility Process to extract a local archive file as input data
+process extractLocalArchive {
+    storeDir "${params.test_data_cache ?: "${launchDir}/.test-datasets"}"
+    tag "extract ${archive.name}"
+
+    input:
+    path archive
+
+    output:
+    path "${archive.baseName}", emit: extractedDir
+
+    script:
+    """
+    echo "Extracting local archive: ${archive}"
+
+    mkdir -p ${archive.baseName}
+
+    if [[ "${archive}" == *.zip ]]; then
+        unzip -q ${archive} -d ${archive.baseName}
+    else
+        tar -xzf ${archive} -C ${archive.baseName}
+    fi
+
+    echo "Extraction completed."
+    ls -la ${archive.baseName}/
+    """
+}
+
+
 process check_input{
     label "process_single"
     publishDir params.output_folder, mode: 'copy'
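The new `extractLocalArchive` process shells out to `unzip` for `.zip` files and to `tar -xzf` for everything else, and its `storeDir` directive means the extracted copy is cached, so re-runs with the same archive can skip extraction. A minimal Python sketch of the same suffix-based dispatch (illustrative only; the pipeline does this in bash inside the process script, and `extract_archive` is an assumed name):

```python
import pathlib
import tarfile
import zipfile

def extract_archive(archive: str, dest: str) -> None:
    """Extract a .zip or .tar.gz/.tgz archive into dest, creating dest if needed."""
    pathlib.Path(dest).mkdir(parents=True, exist_ok=True)
    if archive.endswith(".zip"):
        # Mirrors the `unzip -q ... -d ...` branch
        with zipfile.ZipFile(archive) as zf:
            zf.extractall(dest)
    else:
        # Mirrors the `tar -xzf ... -C ...` fallback branch
        with tarfile.open(archive, "r:gz") as tf:
            tf.extractall(dest)
```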
@@ -464,6 +344,27 @@ process fastoma_report {
     """
 }
 
+// Helper function to detect input type
+def detectInputType(input) {
+    if (input.startsWith('http://') || input.startsWith('https://') || input.startsWith('ftp://')) {
+        return 'url'
+    } else if (input.endsWith('.tar.gz') || input.endsWith('.tgz') || input.endsWith('.zip')) {
+        def archive=file(input)
+        if (!archive.exists() || !archive.isFile()) {
+            log.error "Input archive file does not exist: ${input}"
+            exit 1
+        }
+        return 'archive'
+    } else {
+        def dir=file(input)
+        if (!dir.exists() || !dir.isDirectory()) {
+            log.error "Input directory does not exist: ${input}"
+            exit 1
+        }
+        return 'directory'
+    }
+}
+
 workflow {
     // Print help message if requested
     if (params.help) {
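Note the precedence in `detectInputType`: the URL check runs first, so a remote `https://host/data.tar.gz` is classified as `url` rather than `archive`, and only non-URL, non-archive values are treated as directories. A condensed Python sketch of that classification, with the local existence checks omitted for brevity (the snake_case function name is mine):

```python
def detect_input_type(value: str) -> str:
    """Classify an input as 'url', 'archive', or 'directory', in that order."""
    # URL schemes win, so a remote .tar.gz is fetched, not treated as local.
    if value.startswith(("http://", "https://", "ftp://")):
        return "url"
    # Local archives are recognized by suffix.
    if value.endswith((".tar.gz", ".tgz", ".zip")):
        return "archive"
    # Everything else is assumed to be a dataset directory.
    return "directory"
```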
@@ -477,12 +378,29 @@ workflow {
     // Print parameter summary
     log.info paramsSummaryLog(workflow)
 
-    // Handle data source
-    if (params.test_data_url) {
+    // Detect input type
+    def inputType = detectInputType(params.input_folder)
+    log.info "Detected input type '${inputType}' for: ${params.input_folder}"
+    if (inputType == "directory") {
+        log.info "Using local input folder: ${params.input_folder}"
+        // Local/custom dataset - allow parameter overrides
+        proteome_folder = Channel.value(params.proteome_folder)
+        proteomes = Channel.fromPath("${params.proteome_folder}/*.{fa,fasta}", checkIfExists: true)
+        species_tree = Channel.value(params.species_tree)
+        splice_folder = Channel.value(params.splice_folder)
+        hogmap_in = Channel.value(params.hogmap_in)
+    } else {
+        // Input is either a URL or an archive file - fetch and extract
         // Fetch test dataset from remote URL
-        log.info "Fetching test dataset from: ${params.test_data_url}"
-        input_folder_path = fetchTestData(params.test_data_url)
+        if (inputType == "url") {
+            log.info "Fetching test dataset from URL: ${params.input_folder}"
+            input_folder_path = fetchRemoteData(Channel.value(params.input_folder))
+        } else if (inputType == "archive") {
+            log.info "Extracting test dataset from local archive: ${params.input_folder}"
 
+            input_folder_path = extractLocalArchive(Channel.fromPath(params.input_folder))
+        }
+
         // Set up all channels based on the downloaded folder structure
         proteome_folder = input_folder_path.map { "${it}/proteome" }
         proteomes = input_folder_path.flatMap { dir ->
@@ -493,14 +411,7 @@ workflow {
         species_tree = input_folder_path.map { "${it}/species_tree.nwk" }
         splice_folder = input_folder_path.map { "${it}/splice" }
         hogmap_in = input_folder_path.map { "${it}/hogmap_in" }
-    } else {
-        // Local/custom dataset - allow parameter overrides
-        proteome_folder = Channel.value(params.proteome_folder)
-        proteomes = Channel.fromPath("${params.proteome_folder}/*.{fa,fasta}", checkIfExists: true)
-        species_tree = Channel.value(params.species_tree)
-        splice_folder = Channel.value(params.splice_folder)
-        hogmap_in = Channel.value(params.hogmap_in)
-    }
+    }
 
     // Static channels
     omamerdb = Channel.fromPath(params.omamer_db)
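In the directory branch of the workflow, the proteome channel is built from the Nextflow glob `*.{fa,fasta}` under the proteome folder. A Python sketch of the equivalent file discovery (the `list_proteomes` helper name is mine, for illustration):

```python
import pathlib

def list_proteomes(input_folder: str) -> list:
    """Names of *.fa and *.fasta files in input_folder/proteome, sorted."""
    # Equivalent of the Nextflow glob "*.{fa,fasta}" on the proteome subfolder;
    # other files (e.g. README or .txt files) are ignored.
    root = pathlib.Path(input_folder) / "proteome"
    return sorted(p.name for ext in ("*.fa", "*.fasta") for p in root.glob(ext))
```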

nextflow_schema.json

Lines changed: 7 additions & 1 deletion
@@ -13,7 +13,6 @@
     "properties": {
         "input_folder": {
             "type": "string",
-            "format": "directory-path",
             "description": "Path to input directory containing proteomes and species tree",
             "fa_icon": "fas fa-folder-open"
         },
@@ -53,6 +52,13 @@
             "fa_icon": "fas fa-database",
             "help": "If not provided, the default OMAmer database (LUCA) will be used.",
             "default": "https://omabrowser.org/All/LUCA.h5"
+        },
+        "test_data_cache": {
+            "type": "string",
+            "format": "directory-path",
+            "description": "Path where (remote) input archives will be stored and permanently cached",
+            "fa_icon": "fas fa-folder-open",
+            "default": "./.test-datasets"
         }
     }
 },
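The new `test_data_cache` schema entry documents the cache location consumed by the `storeDir` directives in FastOMA.nf, which use the Groovy elvis expression `params.test_data_cache ?: "${launchDir}/.test-datasets"`. Elvis falls back to the right-hand side when the left is null or empty; a Python analogue of that lookup (the `cache_dir` function name is mine, for illustration):

```python
def cache_dir(params: dict, launch_dir: str) -> str:
    """Resolve the dataset cache directory with the same fallback as `?:`."""
    # Like Groovy's elvis operator, `or` falls back when the parameter
    # is missing, None, or an empty string.
    return params.get("test_data_cache") or f"{launch_dir}/.test-datasets"
```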
