diff --git a/Main/bin/runGeneCNVAndPloidyQuery b/Main/bin/runGeneCNVAndPloidyQuery index 818dc8f29..d3d85e0a2 100755 --- a/Main/bin/runGeneCNVAndPloidyQuery +++ b/Main/bin/runGeneCNVAndPloidyQuery @@ -7,43 +7,30 @@ use GUS::ObjRelP::DbiDatabase; use GUS::Supported::GusConfig; use CBIL::Util::PropertySet; -my ($gusConfigFile,$organismAbbrev,$geneSourceIdOrthologFile,$chrsForCalcsFile); -&GetOptions("organismAbbrev=s" => \$organismAbbrev, +my ($gusConfigFile,$taxonId,$orthoGroupFile,$geneSourceIdOrthologFile,$chrsForCalcsFile); +&GetOptions("taxonId=s" => \$taxonId, + "orthoGroupFile=s" => \$orthoGroupFile, "geneSourceIdOrthologFile=s" => \$geneSourceIdOrthologFile, "chrsForCalcsFile=s" => \$chrsForCalcsFile); -my $ploidy = 2; - -my $geneSourceSql = "with sequence as ( - select gf.source_id as gene_source_id - , gf.na_feature_id - , ns.source_id as contig_source_id - , ns.source_id as sequence_source_id - , ns.TAXON_ID - from dots.genefeature gf - , DOTS.NASEQUENCE ns - , SRES.ONTOLOGYTERM ot - where gf.na_sequence_id = ns.na_sequence_id - and ot.name = 'chromosome' - and ns.SEQUENCE_ONTOLOGY_ID = ot.ONTOLOGY_TERM_ID - and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev') - ), orthologs as ( - select gf.na_feature_id, sg.name - from dots.genefeature gf - , dots.SequenceSequenceGroup ssg - , dots.SequenceGroup sg - , core.TableInfo ti - where gf.na_feature_id = ssg.sequence_id - and ssg.sequence_group_id = sg.sequence_group_id - and ssg.source_table_id = ti.table_id - and ti.name = 'GeneFeature' - ) - select s.gene_source_id - , o.name - from sequence s - , orthologs o - where s.na_feature_id = o.na_feature_id"; - -my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = (select taxon_id from apidb.organism where abbrev = '$organismAbbrev')"; + +my $proteinToGeneSql = " +SELECT aas.source_id AS protein_source_id, + gf.source_id AS gene_source_id +FROM dots.AASequence aas +JOIN dots.TranslatedAASequence tas ON aas.aa_sequence_id = tas.aa_sequence_id +JOIN dots.TranslatedAAFeature taf ON taf.aa_sequence_id = tas.aa_sequence_id +JOIN dots.Transcript t ON taf.na_feature_id = t.na_feature_id +JOIN dots.GeneFeature gf ON t.parent_id = gf.na_feature_id +WHERE aas.subclass_view = 'TranslatedAASequence' + AND aas.taxon_id = $taxonId + AND aas.taxon_id IN ( + SELECT taxon_id + FROM apidb.organism + WHERE is_annotated_genome = 1 + ) +"; + +my $chrsForCalcsSql = "select ns.source_id from dots.nasequence ns, sres.ontologyterm ot where ot.name = 'chromosome' and ot.ontology_term_id = ns.sequence_ontology_id and ns.taxon_id = $taxonId"; $gusConfigFile = $ENV{GUS_HOME}."/config/gus.config"; die "Config file $gusConfigFile does not exist" unless -e $gusConfigFile; @@ -59,19 +46,50 @@ my $db = GUS::ObjRelP::DbiDatabase-> new($gusConfig->{props}->{dbiDsn}, my $dbh = $db->getQueryHandle(); -my $orthoMclStmt = $dbh->prepare($geneSourceSql); -$orthoMclStmt->execute(); +my $proteinToGeneStmt = $dbh->prepare($proteinToGeneSql); +$proteinToGeneStmt->execute(); + +my %proteinToGene; +while (my @row = $proteinToGeneStmt->fetchrow_array()){ + $proteinToGene{$row[0]} = $row[1]; +} -open(GENE,">$geneSourceIdOrthologFile"); -while (my @row = $orthoMclStmt->fetchrow_array()){ - print GENE "$row[0]\t$row[1]\n"; +my %proteinToGroup; +open(GROUPS, "<$orthoGroupFile") or die "Cannot open $orthoGroupFile: $!"; +while (my $line = ) { + chomp $line; + my ($groupId, $proteinList) = split(/:\s*/, $line, 2); + next unless defined $proteinList; + foreach my $protein (split(/\s+/, $proteinList)) { + $proteinToGroup{$protein} = $groupId; + } +} +close GROUPS; + +my @proteinsWithNoGroup; +open(GENE, ">$geneSourceIdOrthologFile") or die "Cannot open $geneSourceIdOrthologFile: $!"; +while (my ($protein, $gene) = each %proteinToGene) { + my $group = $proteinToGroup{$protein}; + unless ($group) { + (my $altProtein = $protein) =~ s/:/\_/g; + $group = $proteinToGroup{$altProtein}; + } + if ($group) { + print GENE "$gene\t$group\n"; + } else { + push @proteinsWithNoGroup, $protein; + } } close GENE; +if (@proteinsWithNoGroup) { + print STDERR "The following proteins have no group assignment in $orthoGroupFile:\n" . join("\n", @proteinsWithNoGroup) . "\n"; +} + my $chrsForCalcs = $dbh->prepare($chrsForCalcsSql); $chrsForCalcs->execute(); -open(CHRS,">$chrsForCalcsFile"); +open(CHRS, ">$chrsForCalcsFile") or die "Cannot open $chrsForCalcsFile: $!"; while (my @row = $chrsForCalcs->fetchrow_array()){ print CHRS "$row[0]\t\n"; } diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm new file mode 100644 index 000000000..33b1569eb --- /dev/null +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqLoadNextflowConfig.pm @@ -0,0 +1,39 @@ +package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqLoadNextflowConfig; + +@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep); + +use strict; +use warnings; +use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; + +sub run { + my ($self, $test, $undo) = @_; + + my $indelDir = $self->getParamValue("indelDir"); + my $extDbRlsSpec = $self->getParamValue("extDbRlsSpec"); + my $genomeExtDbRlsSpec = $self->getParamValue("genomeExtDbRlsSpec"); + + my $configPath = $self->getWorkflowDataDir() . "/" . $self->getParamValue("nextflowConfigFile"); + + if ($undo) { + $self->runCmd(0, "rm -rf $configPath"); + } else { + open(F, ">", $configPath) or die "$! :Can't open config file '$configPath' for writing"; + print F +" +params { + indelDir = \"$indelDir\" + extDbRlsSpec = '\"$extDbRlsSpec\"' + genomeExtDbRlsSpec = '\"$genomeExtDbRlsSpec\"' +} + +singularity { + enabled = true + autoMounts = true +} +"; + close(F); + } +} + +1; diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm new file mode 100644 index 000000000..9fc9d4067 --- /dev/null +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqNextflowConfig.pm @@ -0,0 +1,76 @@ +package ApiCommonWorkflow::Main::WorkflowSteps::MakeDnaSeqNextflowConfig; + +@ISA = (ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep); + +use strict; +use warnings; +use ApiCommonWorkflow::Main::WorkflowSteps::WorkflowStep; + +sub run { + my ($self, $test, $undo) = @_; + + my $workingDirRelativePath = $self->getParamValue("workingDirRelativePath"); + + my $sampleSheetFile = $self->getParamValue("sampleSheetFile"); + my $genomeFile = $self->getParamValue("genomeFile"); + my $gtfFile = $self->getParamValue("gtfFile"); + my $footprintFile = $self->getParamValue("footprintFile"); + my $ploidy = $self->getParamValue("ploidy"); + my $resultsDirectory = $self->getParamValue("resultsDirectory"); + my $geneSourceIdOrthologFile = $self->getParamValue("geneSourceIdOrthologFile"); + my $chrsForCalcFile = $self->getParamValue("chrsForCalcFile"); + + my $nextflowConfigFile = $self->getWorkflowDataDir() . "/" . $self->getParamValue("nextflowConfigFile"); + + # Translate local paths to cluster-side paths + my $digestedSampleSheet = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $sampleSheetFile); + my $digestedGenomeFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $genomeFile); + my $digestedGtfFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $gtfFile); + my $digestedFootprintFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $footprintFile); + my $digestedResultsDir = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $resultsDirectory); + my $digestedOrthologFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $geneSourceIdOrthologFile); + my $digestedChrsForCalcFile = $self->relativePathToNextflowClusterPath($workingDirRelativePath, $chrsForCalcFile); + + # Workflow config values + my $minCoverage = $self->getConfig("minCoverage"); + my $winLen = $self->getConfig("winLen"); + my $bwaThreads = $self->getConfig("bwaThreads"); + + my $executor = $self->getClusterExecutor(); + my $queue = $self->getClusterQueue(); + + if ($undo) { + $self->runCmd(0, "rm -rf $nextflowConfigFile"); + } else { + open(F, ">", $nextflowConfigFile) or die "$! :Can't open config file '$nextflowConfigFile' for writing"; + print F +" +params { + samplesheet = \"$digestedSampleSheet\" + bwaThreads = $bwaThreads + minCoverage = $minCoverage + genomeFastaFile = \"$digestedGenomeFile\" + gtfFile = \"$digestedGtfFile\" + footprintFile = \"$digestedFootprintFile\" + winLen = $winLen + ploidy = $ploidy + outputDir = \"$digestedResultsDir\" + geneSourceIdOrthologFile = \"$digestedOrthologFile\" + chrsForCalcFile = \"$digestedChrsForCalcFile\" +} + +process { + executor = '$executor' + queue = '$queue' +} + +singularity { + enabled = true + autoMounts = true +} +"; + close(F); + } +} + +1; diff --git a/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm b/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm index 9ade04724..4579a431a 100644 --- a/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm +++ b/Main/lib/perl/WorkflowSteps/MakeDnaSeqSingleExperimentNextflowConfig.pm @@ -35,10 +35,8 @@ sub run { my $varscanPValue = $self->getConfig("varscanPValue"); my $varscanMinVarFreqSnp = $self->getConfig("varscanMinVarFreqSnp"); my $varscanMinVarFreqCons = $self->getConfig("varscanMinVarFreqCons"); - my $maxNumberOfReads = $self->getConfig("maxNumberOfReads"); my $hisat2Index = $self->getConfig("hisat2Index"); my $createIndex = $self->getConfig("createIndex"); - my $trimmomaticAdaptorsFile = $self->getConfig("trimmomaticAdaptorsFile"); my $ebiFtpUser = $self->getConfig("ebiFtpUser"); my $ebiFtpPassword = $self->getConfig("ebiFtpPassword"); @@ -93,11 +91,9 @@ params { hisat2Index = $hisat2Index createIndex = $createIndex outputDir = \"$clusterResultDir\" - trimmomaticAdaptorsFile = $trimmomaticAdaptorsFile varscanPValue = $varscanPValue varscanMinVarFreqSnp = $varscanMinVarFreqSnp varscanMinVarFreqCons = $varscanMinVarFreqCons - maxNumberOfReads = $maxNumberOfReads taxonId = \"$taxonId\" geneSourceIdOrthologFile = \"$geneSourceIdOrthologFile\" chrsForCalcFile = \"$chrsForCalcFile\" diff --git a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm index 627a8aff2..515450be2 100755 --- a/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm +++ b/Main/lib/perl/WorkflowSteps/RetrieveGeneCNVAndPloidyQueries.pm @@ -10,7 +10,12 @@ sub run { my $organismAbbrev = $self->getParamValue('organismAbbrev'); my $geneSourceIdOrthologFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue("geneSourceIdOrthologFile")); my $chrsForCalcsFile = join("/", $self->getWorkflowDataDir(), $self->getParamValue('chrsForCalcsFile')); - + my $gusConfigFile = $self->getParamValue('gusConfigFile'); + $gusConfigFile = $self->getWorkflowDataDir() . "/$gusConfigFile"; + my $fullOrthoGroupsFile = $self->getSharedConfig("fullOrthoGroupsFile"); + + my $taxonId = $self->getOrganismInfo($test, $organismAbbrev, $gusConfigFile)->getTaxonId(); + if ($undo) { $self->runCmd(0, "rm -f $geneSourceIdOrthologFile"); $self->runCmd(0, "rm -f $chrsForCalcsFile"); @@ -19,7 +24,7 @@ sub run { $self->runCmd($test,"echo test > $geneSourceIdOrthologFile"); $self->runCmd($test,"echo test > $chrsForCalcsFile"); } else { - $self->runCmd($test,"runGeneCNVAndPloidyQuery --organismAbbrev $organismAbbrev --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile"); + $self->runCmd($test,"runGeneCNVAndPloidyQuery --taxonId $taxonId --geneSourceIdOrthologFile $geneSourceIdOrthologFile --chrsForCalcsFile $chrsForCalcsFile --orthoGroupFile $fullGroupsFile"); } } } diff --git a/Main/lib/xml/workflow/snpAndCnvDNASeq.xml b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml new file mode 100644 index 000000000..076ee3cc9 --- /dev/null +++ b/Main/lib/xml/workflow/snpAndCnvDNASeq.xml @@ -0,0 +1,149 @@ + + + + + + + + + + + + + + + + + + + + $$parentDataDir$$/dnaseqNextflow + $$dataDir$$/analysisDir + $$analysisDirectory$$/results + $$analysisDirectory$$/nextflow.config + $$analysisDirectory$$/ngs-samples-nextflow.config + $$parentDataDir$$/final + + $$dataDir$$/$$organismAbbrev$$.fasta + $$dataDir$$/$$organismAbbrev$$.bed.gz + + $$dataDir$$/final + + $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/geneFootprintFile.txt + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv + + + $$dataDir$$ + + + + $$analysisDirectory$$ + + + + + $$resultsDirectory$$ + + + + + $$gtfFile$$ + $$gtfSymLink$$ + + + + + $$genomeFastaFile$$ + $$genomeSymLink$$ + + + + + $$repeatMaskedBed$$ + $$repeatMaskedBedSymLink$$ + + + + + + $$finalDir$$ + $$finalSymLink$$ + + + + + $$geneSourceIdOrthologFile$$ + $$geneSourceIdOrthologSymLink$$ + + + + + $$chrsForCalcsFile$$ + $$chrsForCalcsSymLink$$ + + + + + $$footprintFile$$ + $$footprintFileSymLink$$ + + + + + $$gusConfigFile$$ + $$analysisDirectory$$ + $$finalSymLink$$ + $$analysisDirectory$$/ngs-samples-results + $$ngsSamplesNextflowConfigFile$$ + $$dataDir$$ + samplesheet.csv + $$fromSRA$$ + DNASeq + $$organismAbbrev$$ + + + + + $$gusConfigFile$$ + $$nextflowConfigFile$$ + $$dataDir$$ + $$analysisDirectory$$/ngs-samples-results/samplesheet.csv + $$genomeSymLink$$ + $$repeatMaskedBedSymLink$$ + $$gtfSymLink$$ + $$footprintFileSymLink$$ + $$ploidy$$ + $$resultsDirectory$$ + $$geneSourceIdOrthologSymLink$$ + $$chrsForCalcsSymLink$$ + + + + + + + + + + + + $$gusConfigFile$$ + $$projectName$$ + $$dataDir$$ + $$nextflowConfigFile$$ + $$ngsSamplesNextflowConfigFile$$ + $$organismAbbrev$$ + $$genomeExtDbRlsSpec$$ + false + $$experimentDatasetName$$|$$experimentDatasetVersion$$ + $$analysisDirectory$$ + VEuPathDB/dnaseq-nextflow + processSingleExperiment + + + + + + diff --git a/Main/lib/xml/workflowTemplates/dnaseq.xml b/Main/lib/xml/workflowTemplates/dnaseq.xml index 0fda986b5..623274398 100644 --- a/Main/lib/xml/workflowTemplates/dnaseq.xml +++ b/Main/lib/xml/workflowTemplates/dnaseq.xml @@ -1,16 +1,19 @@ + - - + + $$parentDataDir$$/dnaseq $$dataDir$$/geneFootprintFile.txt $$dataDir$$/$$organismAbbrev$$.gtf + $$dataDir$$/geneSourceIdOrthologFile.tsv + $$dataDir$$/chrsForCalcsFile.tsv $$dataDir$$ @@ -30,87 +33,74 @@ $$geneFootprintFile$$ $$projectName$$ $$genomeExtDbRlsSpec$$ - $$gusConfigFile$$ - + $$gusConfigFile$$ + - - - - - - - - $$organismAbbrev$$_dnaseqExperiment_${name}_RSRC - $$organismDatasetLoaderXmlFile$$ - $$dataDir$$ - - - - - $$dataDir$$ - $$organismAbbrev$$ - $$dataDir$$/results - $$relativeWebServicesDir$$ - $$genomeFastaFile$$ - $$gtfFile$$ - $$geneFootprintFile$$ - ${isPaired} - ${ploidy} - ${fromBAM} - ${isLocal} - ${name} - ${version} - $$genomeExtDbRlsSpec$$ - - - + + $$organismAbbrev$$ + $$geneSourceIdOrthologFile$$ + $$chrsForCalcsFile$$ + $$gusConfigFile$$ + + - + + - - - - $$organismAbbrev$$_dnaseqExperiment_${name}_RSRC + + + + + ${organismAbbrev}_${name}_dnaseqExperiment_RSRC $$organismDatasetLoaderXmlFile$$ $$dataDir$$ + $$gusConfigFile$$ - + - - $$dataDir$$ - $$organismAbbrev$$ - $$dataDir$$/results - $$relativeWebServicesDir$$ - $$genomeFastaFile$$ + + ${organismAbbrev} + ${projectName} + ${name} + ${organismAbbrev}_${name}_dnaseqExperiment_RSRC + ${version} + $$dataDir$$/${organismAbbrev}_${name}_dnaseqExperiment_RSRC $$gtfFile$$ - $$geneFootprintFile$$ - ${isPaired} + $$genomeFastaFile$$ + $$repeatMaskedBed$$ + $$geneFootprintFile$$ ${ploidy} - ${fromBAM} - ${isLocal} - ${name} - ${version} - $$genomeExtDbRlsSpec$$ + ${fromSRA} + $$relativeWebServicesDir$$ + $$gusConfigFile$$ + $$genomeExtDbRlsSpec$$ + $$geneSourceIdOrthologFile$$ + $$chrsForCalcsFile$$ + + + + + - $$dataDir$$/$$referenceStrainOrganismAbbrev$$_mergeExperiments - + $$dataDir$$/$$organismAbbrev$$_mergeExperiments + - + diff --git a/Main/lib/xml/workflowTemplates/project.xml b/Main/lib/xml/workflowTemplates/project.xml index 3b62836da..d9573e568 100644 --- a/Main/lib/xml/workflowTemplates/project.xml +++ b/Main/lib/xml/workflowTemplates/project.xml @@ -345,9 +345,9 @@ ${projectName}/${organismAbbrev}.xml $$dataDir$$/${organismAbbrev} ${organismAbbrev}_primary_genome_RSRC|${genomeVersion} - ${referenceStrainOrganismAbbrev} $$relativeWebServicesDir$$ $$dataDir$$/${organismAbbrev}/loadGenome/genomicSeqs.fa + $$dataDir$$/${organismAbbrev}/maskGenome/analysisDir/results/blocked.seq.bed.gz