Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions modules/nf-core/samtools/multicommand/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::samtools=1.23.1"
169 changes: 169 additions & 0 deletions modules/nf-core/samtools/multicommand/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
// Chains one or more samtools subcommands into a single shell pipeline.
// The subcommands to run, in order, come from the `pipeline` input; per-step
// arguments come from task.ext.args, task.ext.args2, ... (see the script section).
process SAMTOOLS_MULTICOMMAND {
tag "$meta.id"
label 'process_medium'

conda "${moduleDir}/environment.yml"
// Singularity/Apptainer use the HTTPS image blob unless
// task.ext.singularity_pull_docker_container is set; every other case uses the
// community.wave.seqera.io image.
container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/8c/8c5d2818c8b9f58e1fba77ce219fdaf32087ae53e857c4a496402978af26e78c/data'
: 'community.wave.seqera.io/library/htslib_samtools:1.23.1--5b6bb4ede7e612e5'}"

input:
// meta:  sample metadata map (meta.id is used for the tag and the default prefix,
//        meta.single_end selects the fasta/fastq output flags)
// input: alignment file(s) handed to the first subcommand
// index: declared as an input but never referenced in the generated command
tuple val(meta), path(input), path(index)
// fasta: reference, passed as --reference when CRAM is involved
// fai:   declared as an input but never referenced in the generated command
tuple val(meta2), path(fasta), path(fai)
// Ordered list of samtools subcommand names, validated against valid_options
val(pipeline)

output:
// Alignment format outputs (view, sort, markdup, fixmate, merge, cat, collate)
// All are optional: which one is produced depends on the final subcommand and its args.
tuple val(meta), path("*.bam"), optional: true, emit: bam
tuple val(meta), path("*.cram"), optional: true, emit: cram
tuple val(meta), path("*.sam"), optional: true, emit: sam
tuple val(meta), path("*.{bai,csi,crai}"), optional: true, emit: index

// Sequence outputs (fasta, fastq)
tuple val(meta), path("*.fasta.gz"), optional: true, emit: fasta
tuple val(meta), path("*.fastq.gz"), optional: true, emit: fastq

// Version tuple: the sed expression keeps only the first line of `samtools version`
// and strips everything up to and including its last space.
tuple val("${task.process}"), val('samtools'), eval('samtools version | sed "1!d;s/.* //"'), emit: versions_samtools, topic: versions

when:
task.ext.when == null || task.ext.when

script:
// Per-step arguments (task.ext.args, task.ext.args2, ...) are looked up by key where
// each subcommand is assembled, so no standalone `args` local is declared here.
def prefix = task.ext.prefix ?: "${meta.id}"

// Subcommands this module knows how to chain together.
def valid_options = ['view', 'sort', 'markdup', 'fixmate', 'merge', 'cat', 'collate', 'fastq', 'fasta']

// Fail early with a clear message instead of an index error on an empty list below.
if (!pipeline) {
    error("Error: no pipeline commands supplied to SAMTOOLS_MULTICOMMAND! Valid options are: ${valid_options.join(", ")}")
}

// Reject any unsupported subcommand before a command line is built.
pipeline.each { tool ->
    if (!(tool in valid_options)) {
        error("Error: ${tool} not a valid pipeline argument for SAMTOOLS_MULTICOMMAND! Valid options are: ${valid_options.join(", ")}")
    }
}

// The final subcommand decides how output is written (alignment file vs fasta/fastq).
def n_commands = pipeline.size()
def final_command = pipeline[n_commands - 1]
Comment thread
prototaxites marked this conversation as resolved.

// Build output string based on final command
def output_string = ""

// `input` may be a single path or a list of paths (the command builder joins lists),
// so normalise it before inspecting file extensions.
def input_files = input instanceof List ? input : [input]
def input_is_cram = input_files.any { it.getExtension() == "cram" }

// Reference needed by the first command to decode CRAM input.
def input_reference = (fasta && input_is_cram) ? "--reference ${fasta}" : ""
// Reference needed by the last command to encode CRAM output.
def output_reference = ""

if (final_command in ['view', 'sort', 'merge', 'cat', 'markdup', 'fixmate', 'collate']) {
    // These produce alignment files; the format is taken from the last step's args.
    def argsKey = n_commands == 1 ? "args" : "args${n_commands}"
    def argsLast = task.ext[argsKey] ?: ""
    def extension = argsLast.contains("--output-fmt sam")
        ? "sam"
        : argsLast.contains("--output-fmt cram")
            ? "cram"
            : "bam"
    // Pass the reference when the *output* is CRAM. With a single command that already
    // receives the input reference, skip it so --reference is not emitted twice.
    def reference_already_given = n_commands == 1 && input_reference
    if (fasta && extension == "cram" && !reference_already_given) {
        output_reference = "--reference ${fasta}"
    }
    output_string = "-o ${prefix}.${extension}"
} else if (final_command in ['fasta', 'fastq']) {
    // fasta/fastq produce multiple files with special output flags; the two
    // subcommands differ only in the file suffix.
    def suffix = "${final_command}.gz"
    def read_outputs = meta.single_end
        ? "-1 ${prefix}_1.${suffix} -s ${prefix}_singleton.${suffix}"
        : "-1 ${prefix}_1.${suffix} -2 ${prefix}_2.${suffix} -s ${prefix}_singleton.${suffix}"
    output_string = "-0 ${prefix}_other.${suffix} ${read_outputs}"
}
Comment thread
prototaxites marked this conversation as resolved.

// Build the pipeline command
// Each subcommand becomes one "samtools <subcommand> ..." string; the strings are then
// joined with " |\" + newline so the shell pipes each step into the next.
def pipeline_command = pipeline.withIndex().collect { subcommand, idx ->
// Step arguments: the first step reads task.ext.args, step N (1-based) reads task.ext.argsN.
def argsKey = idx == 0 ? "args" : "args${idx + 1}"
def taskArgs = task.ext[argsKey] ?: ""

def cmd_parts = ["samtools", subcommand]
if (taskArgs) {
cmd_parts << taskArgs
Comment thread
prototaxites marked this conversation as resolved.
}
// Only the first step receives the input file(s), plus the input reference if one was set.
if (idx == 0) {
if (input_reference) {
cmd_parts << input_reference
}
cmd_parts << (input instanceof List ? input.join(" ") : input)
}
// Only the last step receives the output flags, plus the output reference if one was set.
// For a single-command pipeline both this branch and the one above apply.
if (idx == n_commands - 1) {
if (output_reference) {
cmd_parts << output_reference
}
cmd_parts << output_string
}
// NOTE(review): no explicit "-" stdin/stdout placeholder is added for piped steps;
// confirm every supported subcommand reads stdin and writes stdout by default.

Comment thread
prototaxites marked this conversation as resolved.
return cmd_parts.join(" ")
}.join(" |\\\n")
Comment on lines +92 to +118
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You just pipe the commands directly. Don't you need to add some - or -o - to force stdin/stdout ? I did that in my version https://github.com/nf-core/modules/pull/4571/changes#diff-34320bc32138ad0bc2d6a277f60183a5c4fc988b01314cab05ac6ff6245a9c94R38 , but perhaps that was unnecessary.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK it's not mandatory in samtools?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know :) I don't use samtools often enough


// EXAMPLE:
//
// This module will construct a samtools pipeline command from an input list of
// subtools, such as [view, sort, markdup]:
//
// samtools view ${args} input.bam |\
// samtools sort ${args2} |\
// samtools markdup ${args3} -o output.bam
//
// The args are numbered sequentially for each tool in the sequence and CRAM references
// are automatically applied if needed. FASTA and FASTQ outputs are also available.
"""
${pipeline_command}
"""

stub:
// Creates empty placeholder files with the same names the real script would write.
def prefix = task.ext.prefix ?: "${meta.id}"

// Subcommands this module knows how to chain together.
def valid_options = ['view', 'sort', 'markdup', 'fixmate', 'merge', 'cat', 'collate', 'fastq', 'fasta']

// Fail early with a clear message instead of an index error on an empty list below.
if (!pipeline) {
    error("Error: no pipeline commands supplied to SAMTOOLS_MULTICOMMAND! Valid options are: ${valid_options.join(", ")}")
}

// Reject any unsupported subcommand before any stub output is produced.
pipeline.each { tool ->
    if (!(tool in valid_options)) {
        error("Error: ${tool} not a valid pipeline argument for SAMTOOLS_MULTICOMMAND! Valid options are: ${valid_options.join(", ")}")
    }
}

// The final subcommand decides which placeholder files are created.
def n_commands = pipeline.size()
def final_command = pipeline[n_commands - 1]

def stub_outputs = []

if (final_command in ['view', 'sort', 'merge', 'cat', 'markdup', 'fixmate', 'collate']) {
    // Alignment output: the extension is derived from the last step's args.
    def argsKey = n_commands == 1 ? "args" : "args${n_commands}"
    def argsLast = task.ext[argsKey] ?: ""
    def extension = argsLast.contains("--output-fmt sam")
        ? "sam"
        : argsLast.contains("--output-fmt cram")
            ? "cram"
            : "bam"
    stub_outputs << "touch ${prefix}.${extension}"
} else if (final_command in ['fasta', 'fastq']) {
    // Sequence output: fasta and fastq differ only in the file suffix.
    // Paired-end data additionally gets a "_2" file; "_other" is always written last.
    def suffix = "${final_command}.gz"
    def read_labels = meta.single_end ? ["1", "singleton"] : ["1", "2", "singleton"]
    (read_labels + ["other"]).each { read_label ->
        stub_outputs << "echo | gzip > ${prefix}_${read_label}.${suffix}"
    }
}

"""
${stub_outputs.join("\n")}
"""
}
185 changes: 185 additions & 0 deletions modules/nf-core/samtools/multicommand/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
name: "samtools_multicommand"
description: |
Execute a series of samtools commands in a pipeline, allowing flexible composition of
samtools operations such as view, sort, markdup, merge, fastq conversion and more.

The exact order of operations is specified by the pipeline input, which takes a list such as
[view, sort, markdup]. This is then interpolated to a script where the output of each command
is streamed to the next before being written to disk. The script can handle references being passed,
as well as writing indexes, outputting in CRAM format, and conversion to FASTA and FASTQ.

samtools view ${args} input.bam |\
samtools sort ${args2} |\
samtools markdup ${args3} -o output.bam
keywords:
- view
- sort
- markdup
- fixmate
- merge
- cat
- collate
- fastq
- fasta
- bam
- sam
- cram
tools:
- samtools:
description: |
SAMtools is a set of utilities for interacting with and post-processing
short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
These files are generated as output by short read aligners like BWA.
homepage: http://www.htslib.org/
documentation: http://www.htslib.org/doc/samtools.html
doi: 10.1093/bioinformatics/btp352
licence:
- "MIT"
identifier: biotools:samtools
input:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- input:
type: file
description: BAM/CRAM/SAM file
pattern: "*.{bam,cram,sam}"
ontologies:
- edam: http://edamontology.org/data_0924 # sequence_trace
- edam: http://edamontology.org/format_2572 # BAM
- edam: http://edamontology.org/format_3462 # CRAM
- edam: http://edamontology.org/format_2573 # SAM
- index:
type: file
description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional)
pattern: "*.{bai,csi,crai}"
ontologies:
- edam: http://edamontology.org/format_3326 # Index
- - meta2:
type: map
description: |
Groovy Map containing reference information
e.g. [ id:'genome' ]
- fasta:
type: file
description: Fasta reference file
pattern: "*.{fasta,fa}"
ontologies:
- edam: http://edamontology.org/format_1929 # FASTA
- fai:
type: file
description: Fasta reference file index
pattern: "*.{fai}"
ontologies:
- edam: http://edamontology.org/format_3326 # Index
- pipeline:
type: list
description: |
List of samtools commands to execute in sequence.
Valid options: view, sort, markdup, fixmate, merge, cat, collate, fastq, fasta
output:
bam:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- "*.bam":
type: file
description: optional BAM file output
pattern: "*.{bam}"
ontologies:
- edam: http://edamontology.org/data_0924 # sequence_trace
- edam: http://edamontology.org/format_2572 # BAM
cram:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- "*.cram":
type: file
description: optional CRAM file output
pattern: "*.{cram}"
ontologies:
- edam: http://edamontology.org/data_0924 # sequence_trace
- edam: http://edamontology.org/format_3462 # CRAM
sam:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- "*.sam":
type: file
description: optional SAM file output
pattern: "*.{sam}"
ontologies:
- edam: http://edamontology.org/data_0924 # sequence_trace
- edam: http://edamontology.org/format_2573 # SAM
index:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- "*.{bai,csi,crai}":
type: file
description: optional index file for alignment output
pattern: "*.{bai,csi,crai}"
ontologies:
- edam: http://edamontology.org/format_3326 # Index
fasta:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- "*.fasta.gz":
type: file
description: FASTA files output when final command is "fasta".
pattern: "*.fasta.gz"
ontologies:
- edam: http://edamontology.org/format_1929 # FASTA
- edam: http://edamontology.org/format_3989 # GZIP
fastq:
- - meta:
type: map
description: |
Groovy Map containing sample information
e.g. [ id:'test', single_end:false ]
- "*.fastq.gz":
type: file
description: FASTQ files output when final command is "fastq".
pattern: "*.fastq.gz"
ontologies:
- edam: http://edamontology.org/format_1930 # FASTQ
- edam: http://edamontology.org/format_3989 # GZIP
versions_samtools:
- - ${task.process}:
type: string
description: Name of the process
- samtools:
type: string
description: Name of the tool
- samtools version | sed "1!d;s/.* //":
type: eval
description: The expression to obtain the version of the tool
topics:
versions:
- - ${task.process}:
type: string
description: Name of the process
- samtools:
type: string
description: Name of the tool
- samtools version | sed "1!d;s/.* //":
type: eval
description: The expression to obtain the version of the tool
authors:
- "@prototaxites"
maintainers:
- "@prototaxites"
Loading
Loading