diff --git a/CHANGELOG.md b/CHANGELOG.md index fc8dd3cf..43814c4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - [#674](https://github.com/nf-core/mag/pull/674) - Changed to porechop-abi as default adapter trimming tool for long reads. User can still use porechop if preferred (added by @muabnezor) +- [#666](https://github.com/nf-core/mag/pull/666) - Update SPAdes to version 4.0.0, replace both METASPADES and MEGAHIT with official nf-core modules (requested by @elsherbini, fix by @jfy133) +- [#666](https://github.com/nf-core/mag/pull/666) - Update URLs to GTDB database downloads due to server move (reported by @Jokendo-collab, fix by @jfy133) - [#695](https://github.com/nf-core/mag/pull/695) - Updated to nf-core 3.0.2 `TEMPLATE` (by @jfy133) - [#695](https://github.com/nf-core/mag/pull/695) - Switch more stable Zenodo link for CheckM data (by @jfy133) @@ -29,6 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | ------------ | ---------------- | ----------- | | Porechop_ABI | | 0.5.0 | | Filtlong | 0.2.0 | 0.2.1 | +| SPAdes | 3.15.3 | 4.0.0 | ### `Deprecated` diff --git a/conf/base.config b/conf/base.config index 6ed5a366..21a8ac3e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,9 +10,8 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { 1 * task.attempt } - memory = { 6.GB * task.attempt } + memory = { 7.GB * task.attempt } time = { 4.h * task.attempt } errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
'retry' : 'finish' } diff --git a/conf/modules.config b/conf/modules.config index df53a47e..8f2c1042 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,20 +13,11 @@ process { //default: do not publish into the results folder - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: false - ] + publishDir = [path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: false] withName: FASTQC_RAW { ext.args = '--quiet' - publishDir = [ - path: { "${params.outdir}/QC_shortreads/fastqc" }, - mode: params.publish_dir_mode, - pattern: "*.html" - ] + publishDir = [path: { "${params.outdir}/QC_shortreads/fastqc" }, mode: params.publish_dir_mode, pattern: "*.html"] ext.prefix = { "${meta.id}_run${meta.run}_raw" } tag = { "${meta.id}_run${meta.run}_raw" } } @@ -204,19 +195,7 @@ process { } withName: NANOLYSE { - publishDir = [ - [ - path: { "${params.outdir}/QC_longreads/NanoLyse" }, - mode: params.publish_dir_mode, - pattern: "*.log" - ], - [ - path: { "${params.outdir}/QC_longreads/NanoLyse" }, - mode: params.publish_dir_mode, - pattern: "*_nanolyse.fastq.gz", - enabled: params.save_lambdaremoved_reads - ] - ] + publishDir = [[path: { "${params.outdir}/QC_longreads/NanoLyse" }, mode: params.publish_dir_mode, pattern: "*.log"], [path: { "${params.outdir}/QC_longreads/NanoLyse" }, mode: params.publish_dir_mode, pattern: "*_nanolyse.fastq.gz", enabled: params.save_lambdaremoved_reads]] ext.prefix = { "${meta.id}_run${meta.run}_lambdafiltered" } } @@ -252,20 +231,12 @@ process { } withName: CENTRIFUGE_CENTRIFUGE { - publishDir = [ - path: { "${params.outdir}/Taxonomy/centrifuge/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: 
"*.txt" - ] + publishDir = [path: { "${params.outdir}/Taxonomy/centrifuge/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.txt"] } withName: CENTRIFUGE_KREPORT { ext.prefix = { "${meta.id}_kreport" } - publishDir = [ - path: { "${params.outdir}/Taxonomy/centrifuge/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: "*.txt" - ] + publishDir = [path: { "${params.outdir}/Taxonomy/centrifuge/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.txt"] } withName: KRAKEN2 { @@ -278,62 +249,33 @@ process { } withName: KREPORT2KRONA_CENTRIFUGE { - publishDir = [ - path: { "${params.outdir}/Taxonomy/${meta.classifier}/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: "*.txt", - enabled: false - ] + publishDir = [path: { "${params.outdir}/Taxonomy/${meta.classifier}/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.txt", enabled: false] } withName: KRONA_KTIMPORTTAXONOMY { - publishDir = [ - path: { "${params.outdir}/Taxonomy/${meta.classifier}/${meta.id}" }, - mode: params.publish_dir_mode, - pattern: "*.html" - ] + publishDir = [path: { "${params.outdir}/Taxonomy/${meta.classifier}/${meta.id}" }, mode: params.publish_dir_mode, pattern: "*.html"] } - //pattern: "*.{fa.gz,log}" //'pattern' didnt work, probably because the output is in a folder, solved with 'saveAs' withName: MEGAHIT { - ext.args = params.megahit_options ?: '' - publishDir = [ - path: { "${params.outdir}/Assembly" }, - mode: params.publish_dir_mode, - saveAs: { filename -> - filename.equals('versions.yml') - ? null - : filename.indexOf('.contigs.fa.gz') > 0 - ? filename - : filename.indexOf('.log') > 0 ? filename : null - } - ] + ext.args = { params.megahit_options ? 
params.megahit_options + " -m ${task.memory.toBytes()}" : "-m ${task.memory.toBytes()}" } // leading space keeps -m from being fused onto the user-supplied options + ext.prefix = { "MEGAHIT-${meta.id}" } + publishDir = [path: { "${params.outdir}/Assembly/MEGAHIT" }, mode: params.publish_dir_mode, pattern: "*.{fa.gz,log}"] } - withName: SPADES { - ext.args = params.spades_options ?: '' - publishDir = [ - path: { "${params.outdir}/Assembly/SPAdes" }, - mode: params.publish_dir_mode, - pattern: "*.{fasta.gz,gfa.gz,log}" - ] + withName: METASPADES { + ext.args = params.spades_options ? "${params.spades_options} --meta" : '--meta' // always pass --meta, also when the user supplies extra options + ext.prefix = { "SPAdes-${meta.id}" } + publishDir = [path: { "${params.outdir}/Assembly/SPAdes" }, mode: params.publish_dir_mode, pattern: "*.{fasta.gz,gfa.gz,fa.gz,log}"] } - withName: SPADESHYBRID { - ext.args = params.spades_options ?: '' - publishDir = [ - path: { "${params.outdir}/Assembly/SPAdesHybrid" }, - mode: params.publish_dir_mode, - pattern: "*.{fasta.gz,gfa.gz,log}" - ] + withName: METASPADESHYBRID { + ext.args = params.spades_options ? "${params.spades_options} --meta" : '--meta' // always pass --meta, also when the user supplies extra options + ext.prefix = { "SPAdesHybrid-${meta.id}" } + publishDir = [path: { "${params.outdir}/Assembly/SPAdesHybrid" }, mode: params.publish_dir_mode, pattern: "*.{fasta.gz,gfa.gz,fa.gz,log}"] } withName: QUAST { - publishDir = [ - path: { "${params.outdir}/Assembly/${meta.assembler}/QC/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/Assembly/${meta.assembler}/QC/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }] } withName: GENOMAD_ENDTOEND { @@ -368,11 +310,7 @@ process { } withName: 'MAG_DEPTHS_PLOT|MAG_DEPTHS_SUMMARY' { - publishDir = [ - path: { "${params.outdir}/GenomeBinning/depths/bins" }, - mode: params.publish_dir_mode, - pattern: "*.{png,tsv}" - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/depths/bins" }, mode: params.publish_dir_mode, pattern: "*.{png,tsv}"] } withName: BIN_SUMMARY { @@ -384,11 +322,7 @@ process { } withName: BUSCO_DB_PREPARATION { - publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, - mode: params.publish_dir_mode, - pattern: "*.tar.gz" - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, mode: params.publish_dir_mode, pattern: "*.tar.gz"] } withName: BUSCO { @@ -403,40 +337,21 @@ process { } withName: BUSCO_SAVE_DOWNLOAD { - publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, - mode: params.publish_dir_mode, - overwrite: false, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/BUSCO" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: 'BUSCO_SUMMARY|QUAST_BINS|QUAST_BINS_SUMMARY' { - publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: ARIA2_UNTAR { - publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC/CheckM/checkm_downloads" }, - mode: params.publish_dir_mode, - overwrite: false, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }, - enabled: params.save_checkm_data - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM/checkm_downloads" }, mode: params.publish_dir_mode, overwrite: false, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_checkm_data] } withName: CHECKM_LINEAGEWF { tag = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}" } ext.prefix = { "${meta.assembler}-${meta.binner}-${meta.domain}-${meta.refinement}-${meta.id}_wf" } - publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/QC/CheckM" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: CHECKM_QA { @@ -451,11 +366,7 @@ process { withName: COMBINE_CHECKM_TSV { ext.prefix = { "checkm_summary" } - publishDir = [ - path: { "${params.outdir}/GenomeBinning/QC" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/QC" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: GUNC_DOWNLOADDB { @@ -486,27 +397,15 @@ process { } withName: CAT_DB_GENERATE { - publishDir = [ - path: { "${params.outdir}/Taxonomy/CAT" }, - mode: params.publish_dir_mode, - pattern: "*.tar.gz" - ] + publishDir = [path: { "${params.outdir}/Taxonomy/CAT" }, mode: params.publish_dir_mode, pattern: "*.tar.gz"] } withName: CAT { - publishDir = [ - path: { "${params.outdir}/Taxonomy/CAT/${meta.assembler}/${meta.binner}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] + publishDir = [path: { "${params.outdir}/Taxonomy/CAT/${meta.assembler}/${meta.binner}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: CAT_SUMMARY { ext.prefix = "cat_summary" - publishDir = [ - path: { "${params.outdir}/Taxonomy/CAT/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/Taxonomy/CAT/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: GTDBTK_CLASSIFYWF { @@ -526,49 +425,30 @@ process { withName: GTDBTK_SUMMARY { ext.args = "--extension fa" - publishDir = [ - path: { "${params.outdir}/Taxonomy/GTDB-Tk" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/Taxonomy/GTDB-Tk" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: PROKKA { ext.args = "--metagenome" - publishDir = [ - path: { "${params.outdir}/Annotation/Prokka/${meta.assembler}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/Annotation/Prokka/${meta.assembler}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: PRODIGAL { ext.args = "-p meta" - publishDir = [ - path: { "${params.outdir}/Annotation/Prodigal/${meta.assembler}/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] + ext.prefix = { "${meta.assembler}-${meta.id}_prodigal" } + publishDir = [path: { "${params.outdir}/Annotation/Prodigal/${meta.assembler}/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }] } withName: FREEBAYES { ext.prefix = { "${meta.assembler}-${meta.id}" } ext.args = "-p ${params.freebayes_ploidy} -q ${params.freebayes_min_basequality} -F ${params.freebayes_minallelefreq}" - publishDir = [ - path: { "${params.outdir}/Ancient_DNA/variant_calling/freebayes" }, - mode: params.publish_dir_mode, - pattern: "*.vcf.gz" - ] + publishDir = [path: { "${params.outdir}/Ancient_DNA/variant_calling/freebayes" }, mode: params.publish_dir_mode, pattern: "*.vcf.gz"] } withName: BCFTOOLS_VIEW { ext.prefix = { "${meta.assembler}-${meta.id}.filtered" } ext.args = "-v snps,mnps -i 'QUAL>=${params.bcftools_view_high_variant_quality} || (QUAL>=${params.bcftools_view_medium_variant_quality} && FORMAT/AO>=${params.bcftools_view_minimal_allelesupport})'" - publishDir = [ - path: { "${params.outdir}/Ancient_DNA/variant_calling/filtered" }, - mode: params.publish_dir_mode, - pattern: "*.vcf.gz" - ] + publishDir = [path: { "${params.outdir}/Ancient_DNA/variant_calling/filtered" }, mode: params.publish_dir_mode, pattern: "*.vcf.gz"] } withName: BCFTOOLS_CONSENSUS { @@ -617,32 +497,12 @@ process { } withName: METABAT2_JGISUMMARIZEBAMCONTIGDEPTHS { - publishDir = [ - path: { "${params.outdir}/GenomeBinning/depths/contigs" }, - mode: params.publish_dir_mode, - pattern: '*-depth.txt.gz' - ] + publishDir = [path: { "${params.outdir}/GenomeBinning/depths/contigs" }, mode: params.publish_dir_mode, pattern: '*-depth.txt.gz'] ext.prefix = { "${meta.assembler}-${meta.id}-depth" } } withName: METABAT2_METABAT2 { - publishDir = [ - [ - path: { "${params.outdir}/GenomeBinning/MetaBAT2/bins/" }, - mode: params.publish_dir_mode, - pattern: '*[!lowDepth|tooShort|unbinned].fa.gz' - ], - [ - path: { 
"${params.outdir}/GenomeBinning/MetaBAT2/discarded" }, - mode: params.publish_dir_mode, - pattern: '*tooShort.fa.gz' - ], - [ - path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" }, - mode: params.publish_dir_mode, - pattern: '*lowDepth.fa.gz' - ] - ] + publishDir = [[path: { "${params.outdir}/GenomeBinning/MetaBAT2/bins/" }, mode: params.publish_dir_mode, pattern: '*[!lowDepth|tooShort|unbinned].fa.gz'], [path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" }, mode: params.publish_dir_mode, pattern: '*tooShort.fa.gz'], [path: { "${params.outdir}/GenomeBinning/MetaBAT2/discarded" }, mode: params.publish_dir_mode, pattern: '*lowDepth.fa.gz']] ext.prefix = { "${meta.assembler}-MetaBAT2-${meta.id}" } ext.args = [ params.min_contig_size < 1500 ? "-m 1500" : "-m ${params.min_contig_size}", @@ -695,23 +555,7 @@ process { } withName: SPLIT_FASTA { - publishDir = [ - [ - path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned" }, - mode: params.publish_dir_mode, - pattern: '*.*[0-9].fa.gz' - ], - [ - path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" }, - mode: params.publish_dir_mode, - pattern: '*.pooled.fa.gz' - ], - [ - path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" }, - mode: params.publish_dir_mode, - pattern: '*.remaining.fa.gz' - ] - ] + publishDir = [[path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned" }, mode: params.publish_dir_mode, pattern: '*.*[0-9].fa.gz'], [path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" }, mode: params.publish_dir_mode, pattern: '*.pooled.fa.gz'], [path: { "${params.outdir}/GenomeBinning/${meta.binner}/unbinned/discarded" }, mode: params.publish_dir_mode, pattern: '*.remaining.fa.gz']] } withName: DASTOOL_FASTATOCONTIG2BIN_METABAT2 { @@ -773,32 +617,19 @@ process { } withName: TIARA_SUMMARY { - publishDir = [ - path: { "${params.outdir}/GenomeBinning/Tiara" }, - mode: params.publish_dir_mode, - pattern: "tiara_summary.tsv" - 
] + publishDir = [path: { "${params.outdir}/GenomeBinning/Tiara" }, mode: params.publish_dir_mode, pattern: "tiara_summary.tsv"] ext.prefix = "tiara_summary" } withName: MMSEQS_DATABASES { ext.prefix = { "${params.metaeuk_mmseqs_db.replaceAll("/", "-")}" } - publishDir = [ - path: { "${params.outdir}/Annotation/mmseqs_db/" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, - enabled: params.save_mmseqs_db - ] + publishDir = [path: { "${params.outdir}/Annotation/mmseqs_db/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, enabled: params.save_mmseqs_db] } withName: METAEUK_EASYPREDICT { ext.args = "" ext.prefix = { "${meta.id}" } - publishDir = [ - path: { "${params.outdir}/Annotation/MetaEuk/${meta.assembler}/${meta.id}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + publishDir = [path: { "${params.outdir}/Annotation/MetaEuk/${meta.assembler}/${meta.id}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename }] } withName: MULTIQC { diff --git a/conf/test_full.config b/conf/test_full.config index 9a01bc58..b09e6fe1 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -16,30 +16,30 @@ params { // Input data for full size test // hg19 reference with highly conserved and low-complexity regions masked by Brian Bushnell - host_fasta = "s3://ngi-igenomes/test-data/mag/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz" - input = "s3://ngi-igenomes/test-data/mag/samplesheets/samplesheet.full.csv" + host_fasta = "s3://ngi-igenomes/test-data/mag/hg19_main_mask_ribo_animal_allplant_allfungus.fa.gz" + input = "s3://ngi-igenomes/test-data/mag/samplesheets/samplesheet.full.csv" //centrifuge_db = "s3://ngi-igenomes/test-data/mag/p_compressed+h+v.tar.gz" - kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz" - cat_db = "s3://ngi-igenomes/test-data/mag/CAT_prepare_20210107.tar.gz" + kraken2_db = "s3://ngi-igenomes/test-data/mag/minikraken_8GB_202003.tgz" + cat_db = "s3://ngi-igenomes/test-data/mag/CAT_prepare_20210107.tar.gz" // gtdb_db = "s3://ngi-igenomes/test-data/mag/gtdbtk_r214_data.tar.gz" ## This should be updated to release 220, once we get GTDB-Tk working again - skip_gtdbtk = true + skip_gtdbtk = true - // TODO TEMPORARY: deactivate SPAdes due to incompatibility of container with fusion file system - skip_spades = true - skip_spadeshybrid = true + // SPAdes and SPAdesHybrid are enabled for the full-size test (previously skipped due to incompatibility of the container with the fusion file system) + skip_spades = false + skip_spadeshybrid = false // reproducibility options for assembly - spades_fix_cpus = 10 - spadeshybrid_fix_cpus = 10 - megahit_fix_cpu_1 = true + spades_fix_cpus = 10 + spadeshybrid_fix_cpus = 10 + megahit_fix_cpu_1 = true // available options to enable reproducibility for BUSCO (--busco_db) not used here // to allow detection of possible problems in automated lineage selection mode using public databases // test CAT with official taxonomic ranks only - cat_official_taxonomy = true + cat_official_taxonomy = true // Skip CONCOCT due to timeout issues - 
skip_concoct = true + skip_concoct = true } diff --git a/docs/output.md b/docs/output.md index 7ca0ae13..4e43ffb6 100644 --- a/docs/output.md +++ b/docs/output.md @@ -219,10 +219,10 @@ Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembl Output files - `Assembly/SPAdes/` - - `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format - - `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format - - `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format - - `[sample/group].log`: Log file + - `[sample/group].scaffolds.fa.gz`: Compressed assembled scaffolds in fasta format + - `[sample/group].assembly.gfa.gz`: Compressed assembly graph in gfa format + - `[sample/group].contigs.fa.gz`: Compressed assembled contigs in fasta format + - `[sample/group].spades.log`: Log file - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - `SPAdes-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - `SPAdes-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). 
@@ -238,10 +238,10 @@ SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) soft Output files - `Assembly/SPAdesHybrid/` - - `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format - - `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format - - `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format - - `[sample/group].log`: Log file + - `[sample/group].scaffolds.fa.gz`: Compressed assembled scaffolds in fasta format + - `[sample/group].assembly.gfa.gz`: Compressed assembly graph in gfa format + - `[sample/group].contigs.fa.gz`: Compressed assembled contigs in fasta format + - `[sample/group].spades.log`: Log file - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - `SPAdesHybrid-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - `SPAdesHybrid-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). 
diff --git a/modules.json b/modules.json index 3fba80df..3eea27cd 100644 --- a/modules.json +++ b/modules.json @@ -172,6 +172,11 @@ "git_sha": "283613159e079152f1336cef0db1c836086206e0", "installed_by": ["modules"] }, + "megahit": { + "branch": "master", + "git_sha": "7755db15e36b30da564cd67fffdfe18a255092aa", + "installed_by": ["modules"] + }, "metabat2/jgisummarizebamcontigdepths": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", @@ -247,6 +252,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "spades": { + "branch": "master", + "git_sha": "cfebb244d8c83ae533bf2db399f9af361927d504", + "installed_by": ["modules"] + }, "tiara/tiara": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/local/megahit.nf b/modules/local/megahit.nf deleted file mode 100644 index 6f31425c..00000000 --- a/modules/local/megahit.nf +++ /dev/null @@ -1,40 +0,0 @@ -process MEGAHIT { - tag "$meta.id" - - conda "bioconda::megahit=1.2.9" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/megahit:1.2.9--h2e03b76_1' : - 'biocontainers/megahit:1.2.9--h2e03b76_1' }" - - input: - tuple val(meta), path(reads1), path(reads2) - - output: - tuple val(meta), path("MEGAHIT/MEGAHIT-${meta.id}.contigs.fa"), emit: assembly - path "MEGAHIT/*.log" , emit: log - path "MEGAHIT/MEGAHIT-${meta.id}.contigs.fa.gz" , emit: assembly_gz - path "versions.yml" , emit: versions - - script: - def args = task.ext.args ?: '' - def input = meta.single_end ? 
"-r \"" + reads1.join(",") + "\"" : "-1 \"" + reads1.join(",") + "\" -2 \"" + reads2.join(",") + "\"" - mem = task.memory.toBytes() - if ( !params.megahit_fix_cpu_1 || task.cpus == 1 ) - """ - ## Check if we're in the same work directory as a previous failed MEGAHIT run - if [[ -d MEGAHIT ]]; then - rm -r MEGAHIT/ - fi - - megahit $args -t "${task.cpus}" -m $mem $input -o MEGAHIT --out-prefix "MEGAHIT-${meta.id}" - - gzip -c "MEGAHIT/MEGAHIT-${meta.id}.contigs.fa" > "MEGAHIT/MEGAHIT-${meta.id}.contigs.fa.gz" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') - END_VERSIONS - """ - else - error "ERROR: '--megahit_fix_cpu_1' was specified, but not succesfully applied. Likely this is caused by changed process properties in a custom config file." -} diff --git a/modules/local/spades.nf b/modules/local/spades.nf deleted file mode 100644 index 9ef7ec77..00000000 --- a/modules/local/spades.nf +++ /dev/null @@ -1,51 +0,0 @@ -process SPADES { - tag "$meta.id" - - conda "bioconda::spades=3.15.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/spades:3.15.3--h95f258a_0' : - 'biocontainers/spades:3.15.3--h95f258a_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("SPAdes-${meta.id}_scaffolds.fasta"), emit: assembly - path "SPAdes-${meta.id}.log" , emit: log - path "SPAdes-${meta.id}_contigs.fasta.gz" , emit: contigs_gz - path "SPAdes-${meta.id}_scaffolds.fasta.gz" , emit: assembly_gz - path "SPAdes-${meta.id}_graph.gfa.gz" , emit: graph - path "versions.yml" , emit: versions - - script: - def args = task.ext.args ?: '' - maxmem = task.memory.toGiga() - // The -s option is not supported for metaspades. Each time this is called with `meta.single_end` it's because - // read depth was normalized with BBNorm, which actually outputs pairs, but in an interleaved file. 
- def readstr = meta.single_end ? "--12 ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" - - if ( params.spades_fix_cpus == -1 || task.cpus == params.spades_fix_cpus ) - """ - metaspades.py \ - $args \ - --threads "${task.cpus}" \ - --memory $maxmem \ - ${readstr} \ - -o spades - mv spades/assembly_graph_with_scaffolds.gfa SPAdes-${meta.id}_graph.gfa - mv spades/scaffolds.fasta SPAdes-${meta.id}_scaffolds.fasta - mv spades/contigs.fasta SPAdes-${meta.id}_contigs.fasta - mv spades/spades.log SPAdes-${meta.id}.log - gzip "SPAdes-${meta.id}_contigs.fasta" - gzip "SPAdes-${meta.id}_graph.gfa" - gzip -c "SPAdes-${meta.id}_scaffolds.fasta" > "SPAdes-${meta.id}_scaffolds.fasta.gz" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version 2>&1 | sed 's/Python //g') - metaspades: \$(metaspades.py --version | sed "s/SPAdes genome assembler v//; s/ \\[.*//") - END_VERSIONS - """ - else - error "ERROR: '--spades_fix_cpus' was specified, but not succesfully applied. Likely this is caused by changed process properties in a custom config file." -} diff --git a/modules/local/spadeshybrid.nf b/modules/local/spadeshybrid.nf deleted file mode 100644 index 13578a69..00000000 --- a/modules/local/spadeshybrid.nf +++ /dev/null @@ -1,49 +0,0 @@ -process SPADESHYBRID { - tag "$meta.id" - - conda "bioconda::spades=3.15.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/spades:3.15.3--h95f258a_0' : - 'biocontainers/spades:3.15.3--h95f258a_0' }" - - input: - tuple val(meta), path(long_reads), path(short_reads) - - output: - tuple val(meta), path("SPAdesHybrid-${meta.id}_scaffolds.fasta"), emit: assembly - path "SPAdesHybrid-${meta.id}.log" , emit: log - path "SPAdesHybrid-${meta.id}_contigs.fasta.gz" , emit: contigs_gz - path "SPAdesHybrid-${meta.id}_scaffolds.fasta.gz" , emit: assembly_gz - path "SPAdesHybrid-${meta.id}_graph.gfa.gz" , emit: graph - path "versions.yml" , emit: versions - - script: - def args = task.ext.args ?: '' - maxmem = task.memory.toGiga() - if ( params.spadeshybrid_fix_cpus == -1 || task.cpus == params.spadeshybrid_fix_cpus ) - """ - metaspades.py \ - $args \ - --threads "${task.cpus}" \ - --memory $maxmem \ - --pe1-1 ${short_reads[0]} \ - --pe1-2 ${short_reads[1]} \ - --nanopore ${long_reads} \ - -o spades - mv spades/assembly_graph_with_scaffolds.gfa SPAdesHybrid-${meta.id}_graph.gfa - mv spades/scaffolds.fasta SPAdesHybrid-${meta.id}_scaffolds.fasta - mv spades/contigs.fasta SPAdesHybrid-${meta.id}_contigs.fasta - mv spades/spades.log SPAdesHybrid-${meta.id}.log - gzip "SPAdesHybrid-${meta.id}_contigs.fasta" - gzip "SPAdesHybrid-${meta.id}_graph.gfa" - gzip -c "SPAdesHybrid-${meta.id}_scaffolds.fasta" > "SPAdesHybrid-${meta.id}_scaffolds.fasta.gz" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version 2>&1 | sed 's/Python //g') - metaspades: \$(metaspades.py --version | sed "s/SPAdes genome assembler v//; s/ \\[.*//") - END_VERSIONS - """ - else - error "ERROR: '--spadeshybrid_fix_cpus' was specified, but not succesfully applied. Likely this is caused by changed process properties in a custom config file." 
-} diff --git a/modules/nf-core/megahit/environment.yml b/modules/nf-core/megahit/environment.yml new file mode 100644 index 00000000..eed8b725 --- /dev/null +++ b/modules/nf-core/megahit/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::megahit=1.2.9 + - conda-forge::pigz=2.8 diff --git a/modules/nf-core/megahit/main.nf b/modules/nf-core/megahit/main.nf new file mode 100644 index 00000000..f6e50f94 --- /dev/null +++ b/modules/nf-core/megahit/main.nf @@ -0,0 +1,70 @@ +process MEGAHIT { + tag "${meta.id}" + label 'process_high' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/f2/f2cb827988dca7067ff8096c37cb20bc841c878013da52ad47a50865d54efe83/data' : + 'community.wave.seqera.io/library/megahit_pigz:87a590163e594224' }" + + input: + tuple val(meta), path(reads1), path(reads2) + + output: + tuple val(meta), path("*.contigs.fa.gz") , emit: contigs + tuple val(meta), path("intermediate_contigs/k*.contigs.fa.gz") , emit: k_contigs + tuple val(meta), path("intermediate_contigs/k*.addi.fa.gz") , emit: addi_contigs + tuple val(meta), path("intermediate_contigs/k*.local.fa.gz") , emit: local_contigs + tuple val(meta), path("intermediate_contigs/k*.final.contigs.fa.gz"), emit: kfinal_contigs + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads_command = meta.single_end || !reads2 ? 
"-r ${reads1.join(',')}" : "-1 ${reads1.join(',')} -2 ${reads2.join(',')}" // comma-join in both branches so multi-file input stays a single -r/-1/-2 argument + """ + megahit \\ + ${reads_command} \\ + ${args} \\ + -t ${task.cpus} \\ + --out-prefix ${prefix} + + pigz \\ + --no-name \\ + -p ${task.cpus} \\ + ${args2} \\ + megahit_out/*.fa \\ + megahit_out/intermediate_contigs/*.fa + + mv megahit_out/* . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads_command = meta.single_end || !reads2 ? "-r ${reads1.join(',')}" : "-1 ${reads1.join(',')} -2 ${reads2.join(',')}" + """ + mkdir -p intermediate_contigs + echo "" | gzip > ${prefix}.contigs.fa.gz + echo "" | gzip > intermediate_contigs/k21.contigs.fa.gz + echo "" | gzip > intermediate_contigs/k21.addi.fa.gz + echo "" | gzip > intermediate_contigs/k21.local.fa.gz + echo "" | gzip > intermediate_contigs/k21.final.contigs.fa.gz + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megahit: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/megahit/meta.yml b/modules/nf-core/megahit/meta.yml new file mode 100644 index 00000000..04dab4c2 --- /dev/null +++ b/modules/nf-core/megahit/meta.yml @@ -0,0 +1,114 @@ +name: megahit +description: An ultra-fast metagenomic assembler for large and complex metagenomics +keywords: + - megahit + - denovo + - assembly + - debruijn + - metagenomics +tools: + - megahit: + description: "An ultra-fast single-node solution for large and complex metagenomics + assembly via succinct de Bruijn graph" + homepage: https://github.com/voutcn/megahit + documentation: https://github.com/voutcn/megahit + tool_dev_url: https://github.com/voutcn/megahit + doi: "10.1093/bioinformatics/btv033" + licence: ["GPL v3"] + args_id: "$args" + identifier: biotools:megahit + - pigz: + description: "Parallel 
implementation of the gzip algorithm." + homepage: "https://zlib.net/pigz/" + documentation: "https://zlib.net/pigz/pigz.pdf" + args_id: "$args2" + + identifier: biotools:megahit +input: + - - meta: + type: map + description: | + Groovy Map containing sample information and input single, or paired-end FASTA/FASTQ files (optionally decompressed) + e.g. [ id:'test', single_end:false ] + - reads1: + type: file + description: | + A single or list of input FastQ files for single-end or R1 of paired-end library(s), + respectively in gzipped or uncompressed FASTQ or FASTA format. + - reads2: + type: file + description: | + A single or list of input FastQ files for R2 of paired-end library(s), + respectively in gzipped or uncompressed FASTQ or FASTA format. +output: + - contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.contigs.fa.gz": + type: file + description: Final contigs result of the assembly in FASTA format. + pattern: "*.contigs.fa.gz" + - k_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.contigs.fa.gz: + type: file + description: Contigs assembled from the de Bruijn graph of order-K + pattern: "k*.contigs.fa.gz" + - addi_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.addi.fa.gz: + type: file + description: Contigs assembled after iteratively removing local low coverage + unitigs in the de Bruijn graph of order-K + pattern: "k*.addi.fa.gz" + - local_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - intermediate_contigs/k*.local.fa.gz: + type: file + description: Contigs of the locally assembled contigs for k=K + pattern: "k*.local.fa.gz" + - kfinal_contigs: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - intermediate_contigs/k*.final.contigs.fa.gz: + type: file + description: Stand-alone contigs for k=K; if local assembly is turned on, the + file will be empty + pattern: "k*.final.contigs.fa.gz" + - log: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.log": + type: file + description: Log file containing statistics of the assembly output + pattern: "*.log" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/megahit/tests/main.nf.test b/modules/nf-core/megahit/tests/main.nf.test new file mode 100644 index 00000000..b52765d4 --- /dev/null +++ b/modules/nf-core/megahit/tests/main.nf.test @@ -0,0 +1,126 @@ +nextflow_process { + + name "Test Process MEGAHIT" + script "../main.nf" + process "MEGAHIT" + + tag "modules" + tag "modules_nfcore" + tag "megahit" + + test("sarscov2 - fastq - se") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:true], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + []] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert 
process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - fastq - pe") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:false], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. 
Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - fastq - pe - coassembly") { + + when { + process { + """ + input[0] = [ [id:"test", single_end:false], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true)] , + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.contigs[0][1]).linesGzip.toString().contains(">k") }, + { assert process.out.k_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.addi_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.local_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert process.out.kfinal_contigs[0][1].each{path(it).linesGzip.toString().contains(">k")}}, + { assert snapshot( + path(process.out.log[0][1]).readLines().last().contains("ALL DONE. 
Time elapsed"), + process.out.versions + ).match() + } + ) + } + + } + + test("sarscov2 - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:"test", single_end:true], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + [] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/megahit/tests/main.nf.test.snap b/modules/nf-core/megahit/tests/main.nf.test.snap new file mode 100644 index 00000000..4677cc33 --- /dev/null +++ b/modules/nf-core/megahit/tests/main.nf.test.snap @@ -0,0 +1,172 @@ +{ + "sarscov2 - fastq - se": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:42.387947698" + }, + "sarscov2 - fastq - pe": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:48.679485983" + }, + "sarscov2 - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "k21.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.addi.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.local.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": true + }, + 
"test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ], + "addi_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.addi.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "k_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + [ + "k21.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + ], + "kfinal_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.final.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "local_contigs": [ + [ + { + "id": "test", + "single_end": true + }, + "k21.local.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:44:35.245399991" + }, + "sarscov2 - fastq - pe - coassembly": { + "content": [ + true, + [ + "versions.yml:md5,e3c0731297c9abe2f495ab6d541ac0e6" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.4" + }, + "timestamp": "2024-09-12T16:45:56.23363342" + } +} \ No newline at end of file diff --git a/modules/nf-core/megahit/tests/tags.yml b/modules/nf-core/megahit/tests/tags.yml new file mode 100644 index 00000000..9e865846 --- /dev/null +++ b/modules/nf-core/megahit/tests/tags.yml @@ -0,0 +1,2 @@ +megahit: + - "modules/nf-core/megahit/**" diff --git a/modules/nf-core/spades/environment.yml b/modules/nf-core/spades/environment.yml new file mode 100644 index 00000000..8cc5321f --- /dev/null +++ b/modules/nf-core/spades/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda 
+dependencies: + - bioconda::spades=4.0.0 diff --git a/modules/nf-core/spades/main.nf b/modules/nf-core/spades/main.nf new file mode 100644 index 00000000..36cdfe44 --- /dev/null +++ b/modules/nf-core/spades/main.nf @@ -0,0 +1,102 @@ +process SPADES { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/spades:4.0.0--h5fb382e_1' : + 'biocontainers/spades:4.0.0--h5fb382e_1' }" + + input: + tuple val(meta), path(illumina), path(pacbio), path(nanopore) + path yml + path hmm + + output: + tuple val(meta), path('*.scaffolds.fa.gz') , optional:true, emit: scaffolds + tuple val(meta), path('*.contigs.fa.gz') , optional:true, emit: contigs + tuple val(meta), path('*.transcripts.fa.gz') , optional:true, emit: transcripts + tuple val(meta), path('*.gene_clusters.fa.gz'), optional:true, emit: gene_clusters + tuple val(meta), path('*.assembly.gfa.gz') , optional:true, emit: gfa + tuple val(meta), path('*.warnings.log') , optional:true, emit: warnings + tuple val(meta), path('*.spades.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def maxmem = task.memory.toGiga() + def illumina_reads = illumina ? ( meta.single_end ? "-s $illumina" : "-1 ${illumina[0]} -2 ${illumina[1]}" ) : "" + def pacbio_reads = pacbio ? "--pacbio $pacbio" : "" + def nanopore_reads = nanopore ? "--nanopore $nanopore" : "" + def custom_hmms = hmm ? "--custom-hmms $hmm" : "" + def reads = yml ? 
"--dataset $yml" : "$illumina_reads $pacbio_reads $nanopore_reads" + """ + spades.py \\ + $args \\ + --threads $task.cpus \\ + --memory $maxmem \\ + $custom_hmms \\ + $reads \\ + -o ./ + mv spades.log ${prefix}.spades.log + + if [ -f scaffolds.fasta ]; then + mv scaffolds.fasta ${prefix}.scaffolds.fa + gzip -n ${prefix}.scaffolds.fa + fi + if [ -f contigs.fasta ]; then + mv contigs.fasta ${prefix}.contigs.fa + gzip -n ${prefix}.contigs.fa + fi + if [ -f transcripts.fasta ]; then + mv transcripts.fasta ${prefix}.transcripts.fa + gzip -n ${prefix}.transcripts.fa + fi + if [ -f assembly_graph_with_scaffolds.gfa ]; then + mv assembly_graph_with_scaffolds.gfa ${prefix}.assembly.gfa + gzip -n ${prefix}.assembly.gfa + fi + + if [ -f gene_clusters.fasta ]; then + mv gene_clusters.fasta ${prefix}.gene_clusters.fa + gzip -n ${prefix}.gene_clusters.fa + fi + + if [ -f warnings.log ]; then + mv warnings.log ${prefix}.warnings.log + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + spades: \$(spades.py --version 2>&1 | sed -n 's/^.*SPAdes genome assembler v//p') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def maxmem = task.memory.toGiga() + def illumina_reads = illumina ? ( meta.single_end ? "-s $illumina" : "-1 ${illumina[0]} -2 ${illumina[1]}" ) : "" + def pacbio_reads = pacbio ? "--pacbio $pacbio" : "" + def nanopore_reads = nanopore ? "--nanopore $nanopore" : "" + def custom_hmms = hmm ? "--custom-hmms $hmm" : "" + def reads = yml ? 
"--dataset $yml" : "$illumina_reads $pacbio_reads $nanopore_reads" + """ + echo "" | gzip > ${prefix}.scaffolds.fa.gz + echo "" | gzip > ${prefix}.contigs.fa.gz + echo "" | gzip > ${prefix}.transcripts.fa.gz + echo "" | gzip > ${prefix}.gene_clusters.fa.gz + echo "" | gzip > ${prefix}.assembly.gfa.gz + touch ${prefix}.spades.log + touch ${prefix}.warnings.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + spades: \$(spades.py --version 2>&1 | sed -n 's/^.*SPAdes genome assembler v//p') + END_VERSIONS + """ +} diff --git a/modules/nf-core/spades/meta.yml b/modules/nf-core/spades/meta.yml new file mode 100644 index 00000000..986871be --- /dev/null +++ b/modules/nf-core/spades/meta.yml @@ -0,0 +1,99 @@ +name: spades +description: Assembles a small genome (bacterial, fungal, viral) +keywords: + - genome + - assembly + - genome assembler + - small genome + - de novo assembler +tools: + - spades: + description: SPAdes (St. Petersburg genome assembler) is intended for both standard isolates and single-cell MDA bacteria assemblies. + homepage: http://cab.spbu.ru/files/release3.15.0/manual.html + documentation: http://cab.spbu.ru/files/release3.15.0/manual.html + tool_dev_url: https://github.com/ablab/spades + doi: 10.1089/cmb.2012.0021 + licence: ["GPL v2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - illumina: + type: file + description: | + List of input FastQ (Illumina or PacBio CCS reads) files + of size 1 and 2 for single-end and paired-end data, + respectively. This input data type is required. + - pacbio: + type: file + description: | + List of input PacBio CLR FastQ files of size 1. + - nanopore: + type: file + description: | + List of input FastQ files of size 1, originating from Oxford Nanopore technology. + - yml: + type: file + description: | + Path to yml file containing read information. 
+ The raw FASTQ files listed in this YAML file MUST be supplied to the respective illumina/pacbio/nanopore input channel(s) _in addition_ to this YML. + File entries in this yml must contain only the file name and no paths. + pattern: "*.{yml,yaml}" + - hmm: + type: file + description: File or directory with amino acid HMMs for Spades HMM-guided mode. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - scaffolds: + type: file + description: | + Fasta file containing scaffolds + pattern: "*.fa.gz" + - contigs: + type: file + description: | + Fasta file containing contigs + pattern: "*.fa.gz" + - transcripts: + type: file + description: | + Fasta file containing transcripts + pattern: "*.fa.gz" + - gene_clusters: + type: file + description: | + Fasta file containing gene_clusters + pattern: "*.fa.gz" + - gfa: + type: file + description: | + gfa file containing assembly + pattern: "*.gfa.gz" + - log: + type: file + description: | + Spades log file + pattern: "*.spades.log" + - warnings: + type: file + description: | + Spades warning log file + pattern: "*.warnings.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@JoseEspinosa" + - "@drpatelh" + - "@d4straub" +maintainers: + - "@JoseEspinosa" + - "@drpatelh" + - "@d4straub" diff --git a/modules/nf-core/spades/tests/main.nf.test b/modules/nf-core/spades/tests/main.nf.test new file mode 100644 index 00000000..3a93f486 --- /dev/null +++ b/modules/nf-core/spades/tests/main.nf.test @@ -0,0 +1,228 @@ +nextflow_process { + + name "Test Process SPADES" + script "../main.nf" + process "SPADES" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "spades" + + test("sarscov2 - se ") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true ], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz",
checkIfExists: true) ], + [], + [] + ] + input[1] = [] + input[2] = [] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } } + ) + } + } + + test("sarscov2 - pe ") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + + } + // isnt perfect, because CCS reads should rather be used with -s instead of --pacbio + test("sarscov2 - pe - pacbio ") { + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/nanopore/fastq/test.fastq.gz", checkIfExists: true) ] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert 
snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + } + + test("sarscov2 - pe - nanopore ") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/nanopore/fastq/test.fastq.gz", checkIfExists: true) ] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + } + + test("sarscov2 - pe - nanopore - yml ") { + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [ file(params.modules_testdata_base_path + "genomics/sarscov2/nanopore/fastq/test.fastq.gz", checkIfExists: true) ] + ] + input [1] = file(params.modules_testdata_base_path + "delete_me/spades/spades_input_yml.yml", checkIfExists: true) + input [2] = [] + """ + } + } + + then { 
+ assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } }, + { assert file(process.out.warnings[0][1]).find{ file(it).name == "warnings.log"} } + ) + } + } + + test("sarscov2 - pe - hmm ") { + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file("https://github.com/nf-core/test-datasets/raw/viralrecon/illumina/sispa/SRR11140744_R1.fastq.gz", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/viralrecon/illumina/sispa/SRR11140744_R2.fastq.gz", checkIfExists: true) ], + [], + [] + ] + input [1] = [] + input [2] = [file(params.modules_testdata_base_path + "/genomics/sarscov2/genome/proteome.hmm.gz", checkIfExists: true)] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.scaffolds, + process.out.contigs, + process.out.transcripts, + process.out.gene_clusters, + process.out.gfa, + process.out.versions + ).match() }, + { assert path(process.out.log[0][1]).readLines().any { it.contains("SPAdes pipeline finished") } } + ) + } + } + + test("sarscov2 - pe - stub ") { + options "-stub" + when { + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) ], + [], + [] + ] + input [1] = [] + input [2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + +} diff --git a/modules/nf-core/spades/tests/main.nf.test.snap b/modules/nf-core/spades/tests/main.nf.test.snap new file mode 
100644 index 00000000..e1b3b652 --- /dev/null +++ b/modules/nf-core/spades/tests/main.nf.test.snap @@ -0,0 +1,403 @@ +{ + "sarscov2 - pe - nanopore ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,19418df83534fc93543dec4ec9b2ae72" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:13:08.663068339" + }, + "sarscov2 - pe - hmm ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,ce077d5f3380690f8d9a5fe188f82128" + ] + ], + [ + + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,07136eab8e231f095dc5dd62f1b62a91" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T08:04:19.650636803" + }, + "sarscov2 - pe - pacbio ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,19418df83534fc93543dec4ec9b2ae72" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:12:49.305512756" + }, + "sarscov2 - pe ": { + "content": [ + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.contigs.fa.gz:md5,70e4a5485dd59566b212a199c31c343b" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,b773132d52be5090cdbdf5a643027093" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:12:36.161628498" + }, + "sarscov2 - pe - nanopore - yml ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,7ddaf03740df422a93fcaffbcd7e9679" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,19418df83534fc93543dec4ec9b2ae72" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:13:21.868805946" + }, + "sarscov2 - se ": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.scaffolds.fa.gz:md5,65ba6a517c152dbe219bf4b5b92bdad7" + ] + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.contigs.fa.gz:md5,65ba6a517c152dbe219bf4b5b92bdad7" + ] + ], + [ + + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": true + }, + "test.assembly.gfa.gz:md5,e4836fdf7104d79e314e3e50986b4bb2" + ] + ], + [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:12:16.562778962" + }, + "sarscov2 - pe - stub ": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "2": [ + [ + { + "id": "test", + 
"single_end": false + }, + "test.transcripts.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gene_clusters.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + [ + { + "id": "test", + "single_end": false + }, + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "6": [ + [ + { + "id": "test", + "single_end": false + }, + "test.spades.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "7": [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ], + "contigs": [ + [ + { + "id": "test", + "single_end": false + }, + "test.contigs.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "gene_clusters": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gene_clusters.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "gfa": [ + [ + { + "id": "test", + "single_end": false + }, + "test.assembly.gfa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + "test.spades.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "scaffolds": [ + [ + { + "id": "test", + "single_end": false + }, + "test.scaffolds.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "transcripts": [ + [ + { + "id": "test", + "single_end": false + }, + "test.transcripts.fa.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,990abcdf543421412170e5cf413ec56d" + ], + "warnings": [ + [ + { + "id": "test", + "single_end": false + }, + "test.warnings.log:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-07T07:20:07.195881734" + } +} diff --git a/modules/nf-core/spades/tests/nextflow.config b/modules/nf-core/spades/tests/nextflow.config new file mode 100644 index 00000000..adec1bde 
--- /dev/null +++ b/modules/nf-core/spades/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: SPADES { + ext.args = '--rnaviral' + } +} diff --git a/modules/nf-core/spades/tests/tags.yml b/modules/nf-core/spades/tests/tags.yml new file mode 100644 index 00000000..035861ff --- /dev/null +++ b/modules/nf-core/spades/tests/tags.yml @@ -0,0 +1,2 @@ +spades: + - "modules/nf-core/spades/**" diff --git a/nextflow.config b/nextflow.config index 02820eb1..12c6c6aa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -90,7 +90,7 @@ params { cat_official_taxonomy = false save_cat_db = false skip_gtdbtk = false - gtdb_db = "https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + gtdb_db = "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" gtdb_mash = null gtdbtk_min_completeness = 50.0 gtdbtk_max_contamination = 10.0 diff --git a/nextflow_schema.json b/nextflow_schema.json index 40e3553e..1063c4ac 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -485,7 +485,7 @@ "gtdb_db": { "type": "string", "description": "Specify the location of a GTDBTK database. Can be either an uncompressed directory or a `.tar.gz` archive. If not specified will be downloaded for you when GTDBTK or binning QC is not skipped.", - "default": "https://data.ace.uq.edu.au/public/gtdb/data/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" + "default": "https://data.gtdb.ecogenomic.org/releases/release220/220.0/auxillary_files/gtdbtk_package/full_package/gtdbtk_r220_data.tar.gz" }, "gtdb_mash": { "type": "string", @@ -551,7 +551,7 @@ }, "spades_options": { "type": "string", - "description": "Additional custom options for SPAdes.", + "description": "Additional custom options for SPAdes and SPAdesHybrid. 
You must also specify `--meta` to run SPAdes in metagenomic mode if customising these options!", "help_text": "An example is adjusting k-mers (\"-k 21,33,55,77\") or adding [advanced options](https://github.com/ablab/spades#advanced-options). But not -t, -m, -o or --out-prefix, because these are already in use. Must be used like this: --spades_options \"-k 21,33,55,77\")" }, "megahit_options": { diff --git a/workflows/mag.nf b/workflows/mag.nf index e8adec33..01519243 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -43,7 +43,11 @@ include { KRONA_KRONADB } from '../modul include { KRONA_KTIMPORTTAXONOMY } from '../modules/nf-core/krona/ktimporttaxonomy/main' include { KRAKENTOOLS_KREPORT2KRONA as KREPORT2KRONA_CENTRIFUGE } from '../modules/nf-core/krakentools/kreport2krona/main' include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' +include { MEGAHIT } from '../modules/nf-core/megahit/main' +include { SPADES as METASPADES } from '../modules/nf-core/spades/main' +include { SPADES as METASPADESHYBRID } from '../modules/nf-core/spades/main' include { GUNZIP as GUNZIP_ASSEMBLIES } from '../modules/nf-core/gunzip' +include { GUNZIP as GUNZIP_ASSEMBLYINPUT } from '../modules/nf-core/gunzip' include { PRODIGAL } from '../modules/nf-core/prodigal/main' include { PROKKA } from '../modules/nf-core/prokka/main' include { MMSEQS_DATABASES } from '../modules/nf-core/mmseqs/databases/main' @@ -61,9 +65,6 @@ include { KRAKEN2 } from '../modules include { POOL_SINGLE_READS as POOL_SHORT_SINGLE_READS } from '../modules/local/pool_single_reads' include { POOL_PAIRED_READS } from '../modules/local/pool_paired_reads' include { POOL_SINGLE_READS as POOL_LONG_READS } from '../modules/local/pool_single_reads' -include { MEGAHIT } from '../modules/local/megahit' -include { SPADES } from '../modules/local/spades' -include { SPADESHYBRID } from '../modules/local/spadeshybrid' include { QUAST } from '../modules/local/quast' include { QUAST_BINS } from 
'../modules/local/quast_bins' include { QUAST_BINS_SUMMARY } from '../modules/local/quast_bins_summary' @@ -74,112 +75,105 @@ include { CAT_SUMMARY } from "../modules include { BIN_SUMMARY } from '../modules/local/bin_summary' include { COMBINE_TSV as COMBINE_SUMMARY_TSV } from '../modules/local/combine_tsv' -//////////////////////////////////////////////////// -/* -- Create channel for reference databases -- */ -//////////////////////////////////////////////////// - -if ( params.host_genome ) { - host_fasta = params.genomes[params.host_genome].fasta ?: false - ch_host_fasta = Channel - .value(file( "${host_fasta}" )) - host_bowtie2index = params.genomes[params.host_genome].bowtie2 ?: false - ch_host_bowtie2index = Channel - .value(file( "${host_bowtie2index}/*" )) -} else if ( params.host_fasta ) { - ch_host_fasta = Channel - .value(file( "${params.host_fasta}" )) -} else { - ch_host_fasta = Channel.empty() -} - -if (params.busco_db) { - ch_busco_db = file(params.busco_db, checkIfExists: true) -} else { - ch_busco_db = [] -} +workflow MAG { -if(params.checkm_db) { - ch_checkm_db = file(params.checkm_db, checkIfExists: true) -} + take: + ch_raw_short_reads // channel: samplesheet read in from --input + ch_raw_long_reads + ch_input_assemblies -if (params.gunc_db) { - ch_gunc_db = file(params.gunc_db, checkIfExists: true) -} else { - ch_gunc_db = Channel.empty() -} + main: -if(params.kraken2_db){ - ch_kraken2_db_file = file(params.kraken2_db, checkIfExists: true) -} else { - ch_kraken2_db_file = [] -} + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() -if(params.cat_db){ - ch_cat_db_file = Channel - .value(file( "${params.cat_db}" )) -} else { - ch_cat_db_file = Channel.empty() -} + //////////////////////////////////////////////////// + /* -- Create channel for reference databases -- */ + //////////////////////////////////////////////////// + + if ( params.host_genome ) { + host_fasta = params.genomes[params.host_genome].fasta ?: false + 
ch_host_fasta = Channel + .value(file( "${host_fasta}" )) + host_bowtie2index = params.genomes[params.host_genome].bowtie2 ?: false + ch_host_bowtie2index = Channel + .value(file( "${host_bowtie2index}/*" )) + } else if ( params.host_fasta ) { + ch_host_fasta = Channel + .value(file( "${params.host_fasta}" )) + } else { + ch_host_fasta = Channel.empty() + } -if(params.krona_db){ - ch_krona_db_file = Channel - .value(file( "${params.krona_db}" )) -} else { - ch_krona_db_file = Channel.empty() -} + if (params.busco_db) { + ch_busco_db = file(params.busco_db, checkIfExists: true) + } else { + ch_busco_db = [] + } -if(!params.keep_phix) { - ch_phix_db_file = Channel - .value(file( "${params.phix_reference}" )) -} + if(params.checkm_db) { + ch_checkm_db = file(params.checkm_db, checkIfExists: true) + } -if (!params.keep_lambda) { - ch_nanolyse_db = Channel - .value(file( "${params.lambda_reference}" )) -} + if (params.gunc_db) { + ch_gunc_db = file(params.gunc_db, checkIfExists: true) + } else { + ch_gunc_db = Channel.empty() + } -if (params.genomad_db){ - ch_genomad_db = file(params.genomad_db, checkIfExists: true) -} else { - ch_genomad_db = Channel.empty() -} + if(params.kraken2_db){ + ch_kraken2_db_file = file(params.kraken2_db, checkIfExists: true) + } else { + ch_kraken2_db_file = [] + } -gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? false : params.gtdb_db + if(params.cat_db){ + ch_cat_db_file = Channel + .value(file( "${params.cat_db}" )) + } else { + ch_cat_db_file = Channel.empty() + } -if (gtdb) { - gtdb = file( "${gtdb}", checkIfExists: true) - gtdb_mash = params.gtdb_mash ? file("${params.gtdb_mash}", checkIfExists: true) : [] -} else { - gtdb = [] -} + if(params.krona_db){ + ch_krona_db_file = Channel + .value(file( "${params.krona_db}" )) + } else { + ch_krona_db_file = Channel.empty() + } -if(params.metaeuk_db && !params.skip_metaeuk) { - ch_metaeuk_db = Channel. 
- value(file("${params.metaeuk_db}", checkIfExists: true)) -} else { - ch_metaeuk_db = Channel.empty() -} + if(!params.keep_phix) { + ch_phix_db_file = Channel + .value(file( "${params.phix_reference}" )) + } -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ + if (!params.keep_lambda) { + ch_nanolyse_db = Channel + .value(file( "${params.lambda_reference}" )) + } -// Additional info for completion email and summary -def busco_failed_bins = [:] + if (params.genomad_db){ + ch_genomad_db = file(params.genomad_db, checkIfExists: true) + } else { + ch_genomad_db = Channel.empty() + } -workflow MAG { + gtdb = ( params.skip_binqc || params.skip_gtdbtk ) ? false : params.gtdb_db - take: - ch_raw_short_reads // channel: samplesheet read in from --input - ch_raw_long_reads - ch_input_assemblies + if (gtdb) { + gtdb = file( "${gtdb}", checkIfExists: true) + gtdb_mash = params.gtdb_mash ? 
file("${params.gtdb_mash}", checkIfExists: true) : [] + } else { + gtdb = [] + } - main: + if(params.metaeuk_db && !params.skip_metaeuk) { + ch_metaeuk_db = Channel.value(file("${params.metaeuk_db}", checkIfExists: true)) + } else { + ch_metaeuk_db = Channel.empty() + } - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + // Additional info for completion email and summary + def busco_failed_bins = [:] // Get checkM database if not supplied @@ -462,7 +456,8 @@ workflow MAG { */ if ( !params.assembly_input ) { - // Co-assembly: prepare grouping for MEGAHIT and for pooling for SPAdes + + // Co-assembly preparation: grouping for MEGAHIT and for pooling for SPAdes if (params.coassemble_group) { // short reads // group and set group as new id @@ -501,20 +496,6 @@ workflow MAG { ch_long_reads_grouped = ch_long_reads } - ch_assemblies = Channel.empty() - - if (!params.skip_megahit){ - MEGAHIT ( ch_short_reads_grouped ) - ch_megahit_assemblies = MEGAHIT.out.assembly - .map { meta, assembly -> - def meta_new = meta + [assembler: 'MEGAHIT'] - [ meta_new, assembly ] - } - ch_assemblies = ch_assemblies.mix(ch_megahit_assemblies) - ch_versions = ch_versions.mix(MEGAHIT.out.versions.first()) - } - - // Co-assembly: pool reads for SPAdes if ( ! params.skip_spades || ! 
params.skip_spadeshybrid ){ if ( params.coassemble_group ) { if ( params.bbnorm ) { @@ -546,15 +527,19 @@ workflow MAG { ch_long_reads_spades = Channel.empty() } + // Assembly + + ch_assembled_contigs = Channel.empty() + if (!params.single_end && !params.skip_spades){ - SPADES ( ch_short_reads_spades ) - ch_spades_assemblies = SPADES.out.assembly + METASPADES ( ch_short_reads_spades.map{ meta, reads -> [meta, reads, [], []]}, [], [] ) + ch_spades_assemblies = METASPADES.out.scaffolds .map { meta, assembly -> def meta_new = meta + [assembler: 'SPAdes'] [ meta_new, assembly ] } - ch_assemblies = ch_assemblies.mix(ch_spades_assemblies) - ch_versions = ch_versions.mix(SPADES.out.versions.first()) + ch_assembled_contigs = ch_assembled_contigs.mix(ch_spades_assemblies) + ch_versions = ch_versions.mix(METASPADES.out.versions.first()) } if (!params.single_end && !params.skip_spadeshybrid){ @@ -564,17 +549,36 @@ workflow MAG { ch_reads_spadeshybrid = ch_long_reads_spades .map { meta, reads -> [ meta.id, meta, reads ] } .combine(ch_short_reads_spades_tmp, by: 0) - .map { id, meta_long, long_reads, meta_short, short_reads -> [ meta_short, long_reads, short_reads ] } + .map { id, meta_long, long_reads, meta_short, short_reads -> [ meta_short, short_reads, [], long_reads ] } - SPADESHYBRID ( ch_reads_spadeshybrid ) - ch_spadeshybrid_assemblies = SPADESHYBRID.out.assembly + METASPADESHYBRID ( ch_reads_spadeshybrid, [], [] ) + ch_spadeshybrid_assemblies = METASPADESHYBRID.out.scaffolds .map { meta, assembly -> def meta_new = meta + [assembler: "SPAdesHybrid"] [ meta_new, assembly ] } - ch_assemblies = ch_assemblies.mix(ch_spadeshybrid_assemblies) - ch_versions = ch_versions.mix(SPADESHYBRID.out.versions.first()) + ch_assembled_contigs = ch_assembled_contigs.mix(ch_spadeshybrid_assemblies) + ch_versions = ch_versions.mix(METASPADESHYBRID.out.versions.first()) + } + + if (!params.skip_megahit){ + MEGAHIT ( ch_short_reads_grouped ) + ch_megahit_assemblies = MEGAHIT.out.contigs + 
.map { meta, assembly -> + def meta_new = meta + [assembler: 'MEGAHIT'] + [ meta_new, assembly ] + } + ch_assembled_contigs = ch_assembled_contigs.mix(ch_megahit_assemblies) + ch_versions = ch_versions.mix(MEGAHIT.out.versions.first()) } + + + + GUNZIP_ASSEMBLIES ( ch_assembled_contigs ) + ch_versions = ch_versions.mix(GUNZIP_ASSEMBLIES .out.versions) + + ch_assemblies = GUNZIP_ASSEMBLIES.out.gunzip + } else { ch_assemblies_split = ch_input_assemblies .branch { meta, assembly -> @@ -582,11 +586,11 @@ workflow MAG { ungzip: true } - GUNZIP_ASSEMBLIES(ch_assemblies_split.gzipped) - ch_versions = ch_versions.mix(GUNZIP_ASSEMBLIES.out.versions) + GUNZIP_ASSEMBLYINPUT(ch_assemblies_split.gzipped) + ch_versions = ch_versions.mix(GUNZIP_ASSEMBLYINPUT.out.versions) ch_assemblies = Channel.empty() - ch_assemblies = ch_assemblies.mix(ch_assemblies_split.ungzip, GUNZIP_ASSEMBLIES.out.gunzip) + ch_assemblies = ch_assemblies.mix(ch_assemblies_split.ungzip, GUNZIP_ASSEMBLYINPUT.out.gunzip) } ch_quast_multiqc = Channel.empty()