From 7beb2f1ffd8a1112780047430b77e9f346f2996a Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Sun, 5 Apr 2026 14:28:02 -0400 Subject: [PATCH 1/7] Integrate dnaseq-nextflow into ReFlow via new dnaseqExperiment template Replaces dnaSeqExperimentFromAccession and dnaSeqExperimentFromLocal with a single dnaseqExperiment datasetTemplate modeled after longReadRnaSeqExperiment. Adds snpAndCnvDNASeq.xml workflow (processSingleExperiment + loadSingleExperiment via dnaseq-nextflow), MakeDnaSeqNextflowConfig.pm, and MakeDnaSeqLoadNextflowConfig.pm. Co-Authored-By: Claude Sonnet 4.6 --- .../MakeDnaSeqLoadNextflowConfig.pm | 39 +++++ .../WorkflowSteps/MakeDnaSeqNextflowConfig.pm | 113 +++++++++++++ Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 151 ++++++++++++++++++ Main/lib/xml/workflowTemplates/dnaseq.xml | 79 ++++++--- 4 files changed, 363 insertions(+), 19 deletions(-) create mode 100644 Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm create mode 100644 Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm create mode 100644 Main/lib/xml/workflow/snpAndCnvDNASeq.xml diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm new file mode 100644 index 000000000..33b1569eb --- /dev/null +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm @@ -0,0 +1,39 @@ +package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqLoadNextflowConfig; + +@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep); + +use strict; +use warnings; +use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; + +sub run { + my ($self, $test, $undo) = @_; + + my $indelDir = $self->getParamValue("indelDir"); + my $extDbRlsSpec = $self->getParamValue("extDbRlsSpec"); + my $genomeExtDbRlsSpec = $self->getParamValue("genomeExtDbRlsSpec"); + + my $configPath = $self->getWorkflowDataDir() . "/" . 
$self->getParamValue("nextflowConfigFile"); + + if ($undo) { + $self->runCmd(0, "rm -rf $configPath"); + } else { + open(F, ">", $configPath) or die "$! :Can't open config file '$configPath' for writing"; + print F +" +params { + indelDir = \"$indelDir\" + extDbRlsSpec = '\"$extDbRlsSpec\"' + genomeExtDbRlsSpec = '\"$genomeExtDbRlsSpec\"' +} + +singularity { + enabled = true + autoMounts = true +} +"; + close(F); + } +} + +1; diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm new file mode 100644 index 000000000..c6ed9ec18 --- /dev/null +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -0,0 +1,113 @@ +package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqNextflowConfig; + +@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep); + +use strict; +use warnings; +use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; +use GUS::ObjRelP::DbiDatabase; +use GUS::Supported::GusConfig; +use CBIL::Util::PropertySet; + +sub run { + my ($self, $test, $undo) = @_; + + my $workingDirRelativePath = $self->getParamValue("workingDirRelativePath"); + + my $sampleSheetFile = $self->getParamValue("sampleSheetFile"); + my $genomeFile = $self->getParamValue("genomeFile"); + my $gtfFile = $self->getParamValue("gtfFile"); + my $footprintFile = $self->getParamValue("footprintFile"); + my $ploidy = $self->getParamValue("ploidy"); + my $resultsDirectory = $self->getParamValue("resultsDirectory"); + my $organismAbbrev = $self->getParamValue("organismAbbrev"); + my $geneSourceIdOrthologFile = $self->getParamValue("geneSourceIdOrthologFile"); + my $chrsForCalcFile = $self->getParamValue("chrsForCalcFile"); + + my $nextflowConfigFile = $self->getWorkflowDataDir() . "/" . 
$self->getParamValue("nextflowConfigFile"); + + # Translate local paths to cluster-side paths + my $digestedSampleSheet = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $sampleSheetFile); + my $digestedGenomeFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $genomeFile); + my $digestedGtfFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $gtfFile); + my $digestedFootprintFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $footprintFile); + my $digestedResultsDir = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $resultsDirectory); + my $digestedOrthologFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $geneSourceIdOrthologFile); + my $digestedChrsForCalcFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $chrsForCalcFile); + + # Workflow config values + my $minCoverage = $self->getConfig("minCoverage"); + my $winLen = $self->getConfig("winLen"); + my $maxNumberOfReads = $self->getConfig("maxNumberOfReads"); + my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile"); + my $freebayesMinAltFraction = $self->getConfig("freebayesMinAltFraction"); + my $bwaThreads = $self->getConfig("bwaThreads"); + + my $executor = $self->getClusterExecutor(); + my $queue = $self->getClusterQueue(); + + # Look up taxon_id from the database for this organism + my $gusConfigFile = $ENV{GUS_HOME} . 
"/config/gus.config"; + die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; + + my @properties = (); + my $gusConfig = CBIL::Util::PropertySet->new($gusConfigFile, \@properties, 1); + + my $taxonSql = "select taxon_id from apidb.organism where abbrev = '$organismAbbrev'"; + + my $db = GUS::ObjRelP::DbiDatabase->new( + $gusConfig->{props}->{dbiDsn}, + $gusConfig->{props}->{databaseLogin}, + $gusConfig->{props}->{databasePassword}, + 0, 0, 1, + $gusConfig->{props}->{coreSchemaName} + ); + + my $dbh = $db->getQueryHandle(); + my $stmt = $dbh->prepare($taxonSql); + $stmt->execute(); + + my $taxonId; + while (my @row = $stmt->fetchrow_array()) { + $taxonId = $row[0]; + } + + if ($undo) { + $self->runCmd(0, "rm -rf $nextflowConfigFile"); + } else { + open(F, ">", $nextflowConfigFile) or die "$! :Can't open config file '$nextflowConfigFile' for writing"; + print F +" +params { + samplesheet = \"$digestedSampleSheet\" + bwaThreads = $bwaThreads + minCoverage = $minCoverage + genomeFastaFile = \"$digestedGenomeFile\" + gtfFile = \"$digestedGtfFile\" + footprintFile = \"$digestedFootprintFile\" + winLen = $winLen + ploidy = $ploidy + outputDir = \"$digestedResultsDir\" + trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile + freebayesMinAltFraction = $freebayesMinAltFraction + maxNumberOfReads = $maxNumberOfReads + taxonId = \"$taxonId\" + geneSourceIdOrthologFile = \"$digestedOrthologFile\" + chrsForCalcFile = \"$digestedChrsForCalcFile\" +} + +process { + executor = '$executor' + queue = '$queue' +} + +singularity { + enabled = true + autoMounts = true +} +"; + close(F); + } +} + +1; diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml new file mode 100644 index 000000000..6b418d32c --- /dev/null +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -0,0 +1,151 @@ + + + + + + + + + + + + + + + + + + $$parentDataDir$$/dnaseq + $$dataDir$$/analysisDir + $$analysisDirectory$$/results + 
$$analysisDirectory$$/nextflow.config + $$analysisDirectory$$/ngs-samples-nextflow.config + $$parentDataDir$$/final + $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/$$organismAbbrev$$.fasta + $$dataDir$$/final + $$dataDir$$/loadSingleExperiment + $$loadSingleExperimentDir$$/nextflow.config + + + $$dataDir$$ + + + + $$analysisDirectory$$ + + + + + $$resultsDirectory$$ + + + + + $$loadSingleExperimentDir$$ + + + + + $$gtfFile$$ + $$gtfSymLink$$ + + + + + $$genomeFastaFile$$ + $$genomeSymLink$$ + + + + + $$finalDir$$ + $$finalSymLink$$ + + + + + $$organismAbbrev$$ + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv + + + + + $$gusConfigFile$$ + $$analysisDirectory$$ + $$finalSymLink$$ + $$analysisDirectory$$/ngs-samples-results + $$ngsSamplesNextflowConfigFile$$ + $$dataDir$$ + samplesheet.csv + $$fromSRA$$ + DNASeq + $$organismAbbrev$$ + + + + + $$gusConfigFile$$ + $$nextflowConfigFile$$ + $$dataDir$$ + $$analysisDirectory$$/ngs-samples-results/samplesheet.csv + $$genomeSymLink$$ + $$gtfSymLink$$ + $$footprintFile$$ + $$ploidy$$ + $$resultsDirectory$$ + $$organismAbbrev$$ + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv + + + + + + $$gusConfigFile$$ + $$projectName$$ + $$dataDir$$ + $$nextflowConfigFile$$ + $$ngsSamplesNextflowConfigFile$$ + $$organismAbbrev$$ + $$genomeExtDbRlsSpec$$ + false + $$experimentDatasetName$$|$$experimentDatasetVersion$$ + $$analysisDirectory$$ + VEuPathDB/dnaseq-nextflow + processSingleExperiment + + + + + + + + + $$loadNextflowConfigFile$$ + $$dataDir$$ + $$resultsDirectory$$ + $$experimentDatasetName$$|$$experimentDatasetVersion$$ + $$genomeExtDbRlsSpec$$ + + + + + + $$gusConfigFile$$ + $$projectName$$ + $$dataDir$$ + $$loadSingleExperimentDir$$ + $$loadNextflowConfigFile$$ + $$loadSingleExperimentDir$$/results + $$organismAbbrev$$ + $$genomeExtDbRlsSpec$$ + false + $$experimentDatasetName$$|$$experimentDatasetVersion$$ + VEuPathDB/dnaseq-nextflow + loadSingleExperiment + + + + 
diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 0fda986b5..65a2096f3 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -1,5 +1,6 @@ + @@ -30,16 +31,57 @@ $$geneFootprintFile$$ $$projectName$$ $$genomeExtDbRlsSpec$$ - $$gusConfigFile$$ + $$gusConfigFile$$ + + + + + + + + + + ${organismAbbrev}_${name}_dnaseqExperiment_RSRC + $$organismDatasetLoaderXmlFile$$ + $$dataDir$$ + $$gusConfigFile$$ + + + + + ${organismAbbrev} + ${projectName} + ${name} + ${organismAbbrev}_${name}_dnaseqExperiment_RSRC + ${version} + $$dataDir$$/${organismAbbrev}_${name}_dnaseqExperiment_RSRC + $$gtfFile$$ + $$genomeFastaFile$$ + $$geneFootprintFile$$ + ${ploidy} + ${fromSRA} + $$relativeWebServicesDir$$ + $$gusConfigFile$$ + $$genomeExtDbRlsSpec$$ + + + + + + + $$dataDir$$/$$referenceStrainOrganismAbbrev$$_mergeExperiments - + From 85a599f249580082d7e3e5493bb8677779d2cb13 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Tue, 14 Apr 2026 11:44:35 -0400 Subject: [PATCH 2/7] Simplify dnaseq workflow: remove unused params, move shared queries to organism level - Remove trimmomaticAdaptorsFile, freebayesMinAltFraction, maxNumberOfReads from MakeDnaSeqNextflowConfig and MakeDnaSeqSingleExperimentNextflowConfig - Remove taxonId param and DB lookup from MakeDnaSeqNextflowConfig (no longer needed by dnaseq-nextflow pipeline) - Move retrieveGeneCNVAndPloidyQueries from snpAndCnvDNASeq to dnaseq template (organism-level files created once; symlinked into each experiment dir) - snpAndCnvDNASeq now handles processSingleExperiment only; load steps removed Co-Authored-By: Claude Sonnet 4.6 --- .../WorkflowSteps/MakeDnaSeqNextflowConfig.pm | 37 -------- ...akeDnaSeqSingleExperimentNextflowConfig.pm | 4 - Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 63 +++++--------- Main/lib/xml/workflowTemplates/dnaseq.xml | 84 ++++--------------- Main/lib/xml/workflowTemplates/project.xml | 1 - 5 files 
changed, 34 insertions(+), 155 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm index c6ed9ec18..9fc9d4067 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -5,9 +5,6 @@ package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqNextflowConfig; use strict; use warnings; use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; -use GUS::ObjRelP::DbiDatabase; -use GUS::Supported::GusConfig; -use CBIL::Util::PropertySet; sub run { my ($self, $test, $undo) = @_; @@ -20,7 +17,6 @@ sub run { my $footprintFile = $self->getParamValue("footprintFile"); my $ploidy = $self->getParamValue("ploidy"); my $resultsDirectory = $self->getParamValue("resultsDirectory"); - my $organismAbbrev = $self->getParamValue("organismAbbrev"); my $geneSourceIdOrthologFile = $self->getParamValue("geneSourceIdOrthologFile"); my $chrsForCalcFile = $self->getParamValue("chrsForCalcFile"); @@ -38,40 +34,11 @@ sub run { # Workflow config values my $minCoverage = $self->getConfig("minCoverage"); my $winLen = $self->getConfig("winLen"); - my $maxNumberOfReads = $self->getConfig("maxNumberOfReads"); - my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile"); - my $freebayesMinAltFraction = $self->getConfig("freebayesMinAltFraction"); my $bwaThreads = $self->getConfig("bwaThreads"); my $executor = $self->getClusterExecutor(); my $queue = $self->getClusterQueue(); - # Look up taxon_id from the database for this organism - my $gusConfigFile = $ENV{GUS_HOME} . 
"/config/gus.config"; - die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; - - my @properties = (); - my $gusConfig = CBIL::Util::PropertySet->new($gusConfigFile, \@properties, 1); - - my $taxonSql = "select taxon_id from apidb.organism where abbrev = '$organismAbbrev'"; - - my $db = GUS::ObjRelP::DbiDatabase->new( - $gusConfig->{props}->{dbiDsn}, - $gusConfig->{props}->{databaseLogin}, - $gusConfig->{props}->{databasePassword}, - 0, 0, 1, - $gusConfig->{props}->{coreSchemaName} - ); - - my $dbh = $db->getQueryHandle(); - my $stmt = $dbh->prepare($taxonSql); - $stmt->execute(); - - my $taxonId; - while (my @row = $stmt->fetchrow_array()) { - $taxonId = $row[0]; - } - if ($undo) { $self->runCmd(0, "rm -rf $nextflowConfigFile"); } else { @@ -88,10 +55,6 @@ params { winLen = $winLen ploidy = $ploidy outputDir = \"$digestedResultsDir\" - trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile - freebayesMinAltFraction = $freebayesMinAltFraction - maxNumberOfReads = $maxNumberOfReads - taxonId = \"$taxonId\" geneSourceIdOrthologFile = \"$digestedOrthologFile\" chrsForCalcFile = \"$digestedChrsForCalcFile\" } diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm index 9ade04724..4579a431a 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm @@ -35,10 +35,8 @@ sub run { my $varscanPValue = $self->getConfig("varscanPValue"); my $varscanMinVarFreqSnp = $self->getConfig("varscanMinVarFreqSnp"); my $varscanMinVarFreqCons = $self->getConfig("varscanMinVarFreqCons"); - my $maxNumberOfReads = $self->getConfig("maxNumberOfReads"); my $hisat2Index = $self->getConfig("hisat2Index"); my $createIndex = $self->getConfig("createIndex"); - my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile"); my $ebiFtpUser = 
$self->getConfig("ebiFtpUser"); my $ebiFtpPassword = $self->getConfig("ebiFtpPassword"); @@ -93,11 +91,9 @@ params { hisat2Index = $hisat2Index createIndex = $createIndex outputDir = \"$clusterResultDir\" - trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile varscanPValue = $varscanPValue varscanMinVarFreqSnp = $varscanMinVarFreqSnp varscanMinVarFreqCons = $varscanMinVarFreqCons - maxNumberOfReads = $maxNumberOfReads taxonId = \"$taxonId\" geneSourceIdOrthologFile = \"$geneSourceIdOrthologFile\" chrsForCalcFile = \"$chrsForCalcFile\" diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index 6b418d32c..aca5a9003 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -1,4 +1,3 @@ - @@ -11,6 +10,8 @@ + + @@ -24,9 +25,8 @@ $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/$$organismAbbrev$$.fasta $$dataDir$$/final - $$dataDir$$/loadSingleExperiment - $$loadSingleExperimentDir$$/nextflow.config - + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv $$dataDir$$ @@ -41,11 +41,6 @@ - - $$loadSingleExperimentDir$$ - - - $$gtfFile$$ $$gtfSymLink$$ @@ -64,10 +59,15 @@ - - $$organismAbbrev$$ - $$dataDir$$/geneSourceIdOrthologFile.tsv - $$dataDir$$/chrsForCalcsFile.tsv + + $$geneSourceIdOrthologFile$$ + $$geneSourceIdOrthologSymLink$$ + + + + + $$chrsForCalcsFile$$ + $$chrsForCalcsSymLink$$ @@ -95,14 +95,14 @@ $$footprintFile$$ $$ploidy$$ $$resultsDirectory$$ - $$organismAbbrev$$ - $$dataDir$$/geneSourceIdOrthologFile.tsv - $$dataDir$$/chrsForCalcsFile.tsv + $$geneSourceIdOrthologSymLink$$ + $$chrsForCalcsSymLink$$ - + + - + $$gusConfigFile$$ $$projectName$$ $$dataDir$$ @@ -120,32 +120,9 @@ + + - - $$loadNextflowConfigFile$$ - $$dataDir$$ - $$resultsDirectory$$ - $$experimentDatasetName$$|$$experimentDatasetVersion$$ - $$genomeExtDbRlsSpec$$ - - - - - - $$gusConfigFile$$ - $$projectName$$ - $$dataDir$$ - $$loadSingleExperimentDir$$ - $$loadNextflowConfigFile$$ 
- $$loadSingleExperimentDir$$/results - $$organismAbbrev$$ - $$genomeExtDbRlsSpec$$ - false - $$experimentDatasetName$$|$$experimentDatasetVersion$$ - VEuPathDB/dnaseq-nextflow - loadSingleExperiment - - diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 65a2096f3..a2079e224 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -7,11 +7,12 @@ - $$parentDataDir$$/dnaseq $$dataDir$$/geneFootprintFile.txt $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv $$dataDir$$ @@ -35,6 +36,13 @@ + + $$organismAbbrev$$ + $$geneSourceIdOrthologFile$$ + $$chrsForCalcsFile$$ + + + @@ -66,82 +74,18 @@ $$relativeWebServicesDir$$ $$gusConfigFile$$ $$genomeExtDbRlsSpec$$ + $$geneSourceIdOrthologFile$$ + $$chrsForCalcsFile$$ + - - $$dataDir$$/$$referenceStrainOrganismAbbrev$$_mergeExperiments + $$dataDir$$/$$organismAbbrev$$_mergeExperiments @@ -151,7 +95,7 @@ $$dataDir$$/**/results $$genomeFastaFile$$ $$gtfFile$$ - $$referenceStrainOrganismAbbrev$$ + $$organismAbbrev$$ $$dataDir$$/mergeExperiments/results cache.txt undoneStrains.txt diff --git a/Main/lib/xml/workflowTemplates/project.xml b/Main/lib/xml/workflowTemplates/project.xml index 05a209202..fdb2a5197 100644 --- a/Main/lib/xml/workflowTemplates/project.xml +++ b/Main/lib/xml/workflowTemplates/project.xml @@ -345,7 +345,6 @@ ${projectName}/${organismAbbrev}.xml $$dataDir$$/${organismAbbrev} ${organismAbbrev}_primary_genome_RSRC|${genomeVersion} - ${referenceStrainOrganismAbbrev} $$relativeWebServicesDir$$ $$dataDir$$/${organismAbbrev}/loadGenome/genomicSeqs.fa From 95de88a360d695b47aec5c55e2f71a5610163116 Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Tue, 14 Apr 2026 11:59:47 -0400 Subject: [PATCH 3/7] getTaxonId instead of orgAbbrev for cnvandploidyqueries script --- .../WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm | 10 ++++++++-- 
Main/lib/xml/workflowTemplates/dnaseq.xml | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm index 627a8aff2..7e34fa2ff 100755 --- a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm +++ b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm @@ -10,7 +10,13 @@ sub run { my $organismAbbrev = $self->getParamValue('organismAbbrev'); my $geneSourceIdOrthologFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue("geneSourceIdOrthologFile")); my $chrsForCalcsFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue('chrsForCalcsFile')); - + my $gusConfigFile = $self->getParamValue('gusConfigFile'); + $gusConfigFile = $self->getWorkflowDataDir() . "/$gusConfigFile"; + + # TODO: get the groups file from globalConfig? + + my $taxonId = $self->getOrganismInfo($test, $organismAbbrev, $gusConfigFile)->getTaxonId(); + if ($undo) { $self->runCmd(0, "rm -f $geneSourceIdOrthologFile"); $self->runCmd(0, "rm -f $chrsForCalcsFile"); @@ -19,7 +25,7 @@ sub run { $self->runCmd($test,"echo test > $geneSourceIdOrthologFile"); $self->runCmd($test,"echo test > $chrsForCalcsFile"); } else { - $self->runCmd($test,"runGeneCNVAndPloidyQuery --organismAbbrev $organismAbbrev --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile"); + $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile"); } } } diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index a2079e224..2446dd05c 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -40,6 +40,7 @@ $$organismAbbrev$$ $$geneSourceIdOrthologFile$$ $$chrsForCalcsFile$$ + $$gusConfigFile$$ From d4a7ae455623906cca835c3717780b36b2f2735e Mon Sep 
17 00:00:00 2001 From: rdemko2332 Date: Tue, 14 Apr 2026 15:25:09 -0400 Subject: [PATCH 4/7] Adding ortho group file parameter to RetrieveGeneCNVAndPloidyQueries --- .../perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm index 7e34fa2ff..515450be2 100755 --- a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm +++ b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm @@ -12,8 +12,7 @@ sub run { my $chrsForCalcsFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue('chrsForCalcsFile')); my $gusConfigFile = $self->getParamValue('gusConfigFile'); $gusConfigFile = $self->getWorkflowDataDir() . "/$gusConfigFile"; - - # TODO: get the groups file from globalConfig? + my $fullOrthoGroupsFile = $self->getSharedConfig("fullOrthoGroupsFile"); my $taxonId = $self->getOrganismInfo($test, $organismAbbrev, $gusConfigFile)->getTaxonId(); @@ -25,7 +24,7 @@ sub run { $self->runCmd($test,"echo test > $geneSourceIdOrthologFile"); $self->runCmd($test,"echo test > $chrsForCalcsFile"); } else { - $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile"); + $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile --orthoGroupFile $fullOrthoGroupsFile"); } } } From d79e9f513f05a50884688bf4c8074311b49a287b Mon Sep 17 00:00:00 2001 From: rdemko2332 Date: Tue, 14 Apr 2026 15:26:16 -0400 Subject: [PATCH 5/7] Updating runGeneCNVAndPloidyQuery to function from ortho group flat file and with new database schema --- Main/bin/runGeneCNVAndPloidyQuery | 100 ++++++++++++++++++------------ 1 file changed, 59 insertions(+), 41 deletions(-) diff --git 
a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index 818dc8f29..d3d85e0a2 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -7,43 +7,30 @@ use GUS::ObjRelP::DbiDatabase; use GUS::Supported::GusConfig; use CBIL::Util::PropertySet; -my ($gusConfigFile,$organismAbbrev,$geneSourceIdOrthologFile,$chrsForCalcsFile); -&GetOptions("organismAbbrev=s" => \$organismAbbrev, +my ($gusConfigFile,$taxonId,$orthoGroupFile,$geneSourceIdOrthologFile,$chrsForCalcsFile); +&GetOptions("taxonId=s" => \$taxonId, + "orthoGroupFile=s" => \$orthoGroupFile, "geneSourceIdOrthologFile=s" => \$geneSourceIdOrthologFile, "chrsForCalcsFile=s" => \$chrsForCalcsFile); -my $ploidy = 2; - -my $geneSourceSql = "with sequence as ( - select gf.source_id as gene_source_id - , gf.na_feature_id - , ns.source_id as contig_source_id - , ns.source_id as sequence_source_id - , ns.TAXON_ID - from dots.genefeature gf - , DOTS.NASEQUENCE ns - , SRES.ONTOLOGYTERM ot - where gf.na_sequence_id = ns.na_sequence_id - and ot.name = 'chromosome' - and ns.SEQUENCE_ONTOLOGY_ID = ot.ONTOLOGY_TERM_ID - and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev') - ), orthologs as ( - select gf.na_feature_id, sg.name - from dots.genefeature gf - , dots.SequenceSequenceGroup ssg - , dots.SequenceGroup sg - , core.TableInfo ti - where gf.na_feature_id = ssg.sequence_id - and ssg.sequence_group_id = sg.sequence_group_id - and ssg.source_table_id = ti.table_id - and ti.name = 'GeneFeature' - ) - select s.gene_source_id - , o.name - from sequence s - , orthologs o - where s.na_feature_id = o.na_feature_id"; - -my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev')"; + +my $proteinToGeneSql = " +SELECT aas.source_id AS protein_source_id, + 
gf.source_id AS gene_source_id +FROM dots.AASequence aas +JOIN dots.TranslatedAASequence tas ON aas.aa_sequence_id = tas.aa_sequence_id +JOIN dots.TranslatedAAFeature taf ON taf.aa_sequence_id = tas.aa_sequence_id +JOIN dots.Transcript t ON taf.na_feature_id = t.na_feature_id +JOIN dots.GeneFeature gf ON t.parent_id = gf.na_feature_id +WHERE aas.subclass_view = 'TranslatedAASequence' + AND aas.taxon_id = $taxonId + AND aas.taxon_id IN ( + SELECT taxon_id + FROM apidb.organism + WHERE is_annotated_genome = 1 + ) +"; + +my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = $taxonId"; $gusConfigFile = $ENV{GUS_HOME}."/config/gus.config"; die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; @@ -59,19 +46,50 @@ my $db = GUS::ObjRelP::DbiDatabase-> new($gusConfig->{props}->{dbiDsn}, my $dbh = $db->getQueryHandle(); -my $orthoMclStmt = $dbh->prepare($geneSourceSql); -$orthoMclStmt->execute(); +my $proteinToGeneStmt = $dbh->prepare($proteinToGeneSql); +$proteinToGeneStmt->execute(); + +my %proteinToGene; +while (my @row = $proteinToGeneStmt->fetchrow_array()){ + $proteinToGene{$row[0]} = $row[1]; +} -open(GENE,">$geneSourceIdOrthologFile"); -while (my @row = $orthoMclStmt->fetchrow_array()){ - print GENE "$row[0]\t$row[1]\n"; +my %proteinToGroup; +open(GROUPS, "<$orthoGroupFile") or die "Cannot open $orthoGroupFile: $!"; +while (my $line = <GROUPS>) { + chomp $line; + my ($groupId, $proteinList) = split(/:\s*/, $line, 2); + next unless defined $proteinList; + foreach my $protein (split(/\s+/, $proteinList)) { + $proteinToGroup{$protein} = $groupId; + } +} +close GROUPS; + +my @proteinsWithNoGroup; +open(GENE, ">$geneSourceIdOrthologFile") or die "Cannot open $geneSourceIdOrthologFile: $!"; +while (my ($protein, $gene) = each %proteinToGene) { + my $group = $proteinToGroup{$protein}; + unless ($group) { + (my $altProtein = 
$protein) =~ s/:/\_/g; + $group = $proteinToGroup{$altProtein}; + } + if ($group) { + print GENE "$gene\t$group\n"; + } else { + push @proteinsWithNoGroup, $protein; + } } close GENE; +if (@proteinsWithNoGroup) { + print STDERR "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n"; +} + my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql); $chrsForCalcs->execute(); -open(CHRS,">$chrsForCalcsFile"); +open(CHRS, ">$chrsForCalcsFile") or die "Cannot open $chrsForCalcsFile: $!"; while (my @row = $chrsForCalcs->fetchrow_array()){ print CHRS "$row[0]\t\n"; } From 3e3692c77fe84baa0c2589621c9319998d13ddda Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Fri, 15 May 2026 09:33:06 -0400 Subject: [PATCH 6/7] add repeatmasker bed and fix dependencies --- Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 37 ++++++++++++++++++---- Main/lib/xml/workflowTemplates/dnaseq.xml | 12 ++++--- Main/lib/xml/workflowTemplates/project.xml | 1 + 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index aca5a9003..cbc6313c1 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -15,16 +15,25 @@ + - $$parentDataDir$$/dnaseq + $$parentDataDir$$/dnaseqNextflow $$dataDir$$/analysisDir $$analysisDirectory$$/results $$analysisDirectory$$/nextflow.config $$analysisDirectory$$/ngs-samples-nextflow.config $$parentDataDir$$/final + $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/$$organismAbbrev$$.fasta + $$dataDir$$/$$organismAbbrev$$.bed.gz + $$dataDir$$/geneFootprintFile.txt + $$dataDir$$/final + + + + $$dataDir$$/geneSourceIdOrthologFile.tsv $$dataDir$$/chrsForCalcsFile.tsv @@ -53,6 +62,13 @@ + + $$repeatMaskedBed$$ + $$repeatMaskedBedSymLink$$ + + + + $$finalDir$$ $$finalSymLink$$ @@ -71,6 +87,12 @@ + + $$footprintFile$$ + $$footprintFileSymLink$$ + + + $$gusConfigFile$$ $$analysisDirectory$$ @@ 
-91,15 +113,21 @@ $$dataDir$$ $$analysisDirectory$$/ngs-samples-results/samplesheet.csv $$genomeSymLink$$ + $$repeatMaskedBedSymLink$$ $$gtfSymLink$$ - $$footprintFile$$ + $$footprintFileSymLink$$ $$ploidy$$ $$resultsDirectory$$ $$geneSourceIdOrthologSymLink$$ $$chrsForCalcsSymLink$$ + + + + + @@ -117,11 +145,6 @@ processSingleExperiment - - - - - diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 2446dd05c..623274398 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -7,7 +7,8 @@ - + + $$parentDataDir$$/dnaseq $$dataDir$$/geneFootprintFile.txt $$dataDir$$/$$organismAbbrev$$.gtf @@ -33,9 +34,10 @@ $$projectName$$ $$genomeExtDbRlsSpec$$ $$gusConfigFile$$ - + + $$organismAbbrev$$ $$geneSourceIdOrthologFile$$ @@ -69,6 +71,7 @@ $$dataDir$$/${organismAbbrev}_${name}_dnaseqExperiment_RSRC $$gtfFile$$ $$genomeFastaFile$$ + $$repeatMaskedBed$$ $$geneFootprintFile$$ ${ploidy} ${fromSRA} @@ -78,6 +81,7 @@ $$geneSourceIdOrthologFile$$ $$chrsForCalcsFile$$ + @@ -89,7 +93,7 @@ $$dataDir$$/$$organismAbbrev$$_mergeExperiments - + diff --git a/Main/lib/xml/workflowTemplates/project.xml b/Main/lib/xml/workflowTemplates/project.xml index f8fa7206b..d9573e568 100644 --- a/Main/lib/xml/workflowTemplates/project.xml +++ b/Main/lib/xml/workflowTemplates/project.xml @@ -347,6 +347,7 @@ ${organismAbbrev}_primary_genome_RSRC|${genomeVersion} $$relativeWebServicesDir$$ $$dataDir$$/${organismAbbrev}/loadGenome/genomicSeqs.fa + $$dataDir$$/${organismAbbrev}/maskGenome/analysisDir/results/blocked.seq.bed.gz From f21e855c1f32d7bdab44a157a5ced61079df053f Mon Sep 17 00:00:00 2001 From: John Brestelli Date: Fri, 15 May 2026 09:36:49 -0400 Subject: [PATCH 7/7] minor --- Main/lib/xml/workflow/snpAndCnvDNASeq.xml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml index 
cbc6313c1..076ee3cc9 100644 --- a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -24,18 +24,16 @@ $$analysisDirectory$$/ngs-samples-nextflow.config $$parentDataDir$$/final - $$dataDir$$/$$organismAbbrev$$.gtf $$dataDir$$/$$organismAbbrev$$.fasta $$dataDir$$/$$organismAbbrev$$.bed.gz - $$dataDir$$/geneFootprintFile.txt - - $$dataDir$$/final - - + $$dataDir$$/final + $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/geneFootprintFile.txt $$dataDir$$/geneSourceIdOrthologFile.tsv $$dataDir$$/chrsForCalcsFile.tsv + $$dataDir$$