Files
linguist/samples/Nextflow/callings.nf
Paolo Di Tommaso bee7e55618 Add Nextflow language support (#3870)
* Added nextflow language
* Added main.nf to list of filenames
* Fixed duplicate groovy scope
* Removed hello-world example
* Update grammar submodule
* Removed main.nf from filenames
* Added nextflow.config example
2018-01-09 12:47:59 +01:00

497 lines
13 KiB
Plaintext
Executable File

#!/usr/bin/env nextflow
/*
* This is free and unencumbered software released into the public domain.
*
* Anyone is free to copy, modify, publish, use, compile, sell, or
* distribute this software, either in source code form or as a compiled
* binary, for any purpose, commercial or non-commercial, and by any
* means.
*
* In jurisdictions that recognize copyright laws, the author or authors
* of this software dedicate any and all copyright interest in the
* software to the public domain. We make this dedication for the benefit
* of the public at large and to the detriment of our heirs and
* successors. We intend this dedication to be an overt act of
* relinquishment in perpetuity of all present and future rights to this
* software under copyright law.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* For more information, please refer to <http://unlicense.org/>
*/
/*
* 'CalliNGS-NF' - A Nextflow pipeline for variant calling with NGS data
*
* This pipeline that reproduces steps from the GATK best practics of SNP
* calling with RNAseq data procedure:
* https://software.broadinstitute.org/gatk/guide/article?id=3891
*
* Anna Vlasova
* Emilio Palumbo
* Paolo Di Tommaso
* Evan Floden
*/
/*
* Define the default parameters
*/
params.genome = "$baseDir/data/genome.fa"
params.variants = "$baseDir/data/known_variants.vcf.gz"
params.blacklist = "$baseDir/data/blacklist.bed"
params.reads = "$baseDir/data/reads/rep1_{1,2}.fq.gz"
params.results = "results"
params.gatk = '/usr/local/bin/GenomeAnalysisTK.jar'
params.gatk_launch = "java -jar $params.gatk"
log.info "C A L L I N G S - N F v 1.0"
log.info "================================"
log.info "genome : $params.genome"
log.info "reads : $params.reads"
log.info "variants : $params.variants"
log.info "blacklist: $params.blacklist"
log.info "results : $params.results"
log.info "gatk : $params.gatk"
log.info ""
/*
* Parse the input parameters
*/
GATK = params.gatk_launch
genome_file = file(params.genome)
variants_file = file(params.variants)
blacklist_file = file(params.blacklist)
reads_ch = Channel.fromFilePairs(params.reads)
/**********
* PART 1: Data preparation
*
* Process 1A: Create a FASTA genome index (.fai) with samtools for GATK
*/
process '1A_prepare_genome_samtools' {
tag "$genome.baseName"
input:
file genome from genome_file
output:
file "${genome}.fai" into genome_index_ch
script:
"""
samtools faidx ${genome}
"""
}
/*
* Process 1B: Create a FASTA genome sequence dictionary with Picard for GATK
*/
process '1B_prepare_genome_picard' {
tag "$genome.baseName"
input:
file genome from genome_file
output:
file "${genome.baseName}.dict" into genome_dict_ch
script:
"""
PICARD=`which picard.jar`
java -jar \$PICARD CreateSequenceDictionary R= $genome O= ${genome.baseName}.dict
"""
}
/*
* Process 1C: Create STAR genome index file.
*/
process '1C_prepare_star_genome_index' {
tag "$genome.baseName"
input:
file genome from genome_file
output:
file "genome_dir" into genome_dir_ch
script:
"""
mkdir genome_dir
STAR --runMode genomeGenerate \
--genomeDir genome_dir \
--genomeFastaFiles ${genome} \
--runThreadN ${task.cpus}
"""
}
/*
* Process 1D: Create a file containing the filtered and recoded set of variants
*/
process '1D_prepare_vcf_file' {
tag "$variantsFile.baseName"
input:
file variantsFile from variants_file
file blacklisted from blacklist_file
output:
set file("${variantsFile.baseName}.filtered.recode.vcf.gz"), file("${variantsFile.baseName}.filtered.recode.vcf.gz.tbi") into prepared_vcf_ch
script:
"""
vcftools --gzvcf $variantsFile -c \
--exclude-bed ${blacklisted} \
--recode | bgzip -c \
> ${variantsFile.baseName}.filtered.recode.vcf.gz
tabix ${variantsFile.baseName}.filtered.recode.vcf.gz
"""
}
/*
* END OF PART 1
*********/
/**********
* PART 2: STAR RNA-Seq Mapping
*
* Process 2: Align RNA-Seq reads to the genome with STAR
*/
process '2_rnaseq_mapping_star' {
tag "$replicateId"
input:
file genome from genome_file
file genomeDir from genome_dir_ch
set replicateId, file(reads) from reads_ch
output:
set replicateId, file('Aligned.sortedByCoord.out.bam'), file('Aligned.sortedByCoord.out.bam.bai') into aligned_bam_ch
script:
"""
# ngs-nf-dev Align reads to genome
STAR --genomeDir $genomeDir \
--readFilesIn $reads \
--runThreadN ${task.cpus} \
--readFilesCommand zcat \
--outFilterType BySJout \
--alignSJoverhangMin 8 \
--alignSJDBoverhangMin 1 \
--outFilterMismatchNmax 999
# 2nd pass (improve alignmets using table of splice junctions and create a new index)
mkdir genomeDir
STAR --runMode genomeGenerate \
--genomeDir genomeDir \
--genomeFastaFiles $genome \
--sjdbFileChrStartEnd SJ.out.tab \
--sjdbOverhang 75 \
--runThreadN ${task.cpus}
# Final read alignments
STAR --genomeDir genomeDir \
--readFilesIn $reads \
--runThreadN ${task.cpus} \
--readFilesCommand zcat \
--outFilterType BySJout \
--alignSJoverhangMin 8 \
--alignSJDBoverhangMin 1 \
--outFilterMismatchNmax 999 \
--outSAMtype BAM SortedByCoordinate \
--outSAMattrRGline ID:$replicateId LB:library PL:illumina PU:machine SM:GM12878
# Index the BAM file
samtools index Aligned.sortedByCoord.out.bam
"""
}
/*
* END OF PART 2
******/
/**********
* PART 3: GATK Prepare Mapped Reads
*
* Process 3: Split reads that contain Ns in their CIGAR string.
* Creates k+1 new reads (where k is the number of N cigar elements)
* that correspond to the segments of the original read beside/between
* the splicing events represented by the Ns in the original CIGAR.
*/
process '3_rnaseq_gatk_splitNcigar' {
tag "$replicateId"
input:
file genome from genome_file
file index from genome_index_ch
file genome_dict from genome_dict_ch
set replicateId, file(bam), file(index) from aligned_bam_ch
output:
set replicateId, file('split.bam'), file('split.bai') into splitted_bam_ch
script:
"""
# SplitNCigarReads and reassign mapping qualities
$GATK -T SplitNCigarReads \
-R $genome -I $bam \
-o split.bam \
-rf ReassignOneMappingQuality \
-RMQF 255 -RMQT 60 \
-U ALLOW_N_CIGAR_READS \
--fix_misencoded_quality_scores
"""
}
/*
* END OF PART 3
******/
/***********
* PART 4: GATK Base Quality Score Recalibration Workflow
*
* Process 4: Base recalibrate to detect systematic errors in base quality scores,
* select unique alignments and index
*
*/
process '4_rnaseq_gatk_recalibrate' {
tag "$replicateId"
input:
file genome from genome_file
file index from genome_index_ch
file dict from genome_dict_ch
set replicateId, file(bam), file(index) from splitted_bam_ch
set file(variants_file), file(variants_file_index) from prepared_vcf_ch
output:
set sampleId, file("${replicateId}.final.uniq.bam"), file("${replicateId}.final.uniq.bam.bai") into (final_output_ch, bam_for_ASE_ch)
script:
sampleId = replicateId.replaceAll(/[12]$/,'')
"""
# Indel Realignment and Base Recalibration
$GATK -T BaseRecalibrator \
--default_platform illumina \
-cov ReadGroupCovariate \
-cov QualityScoreCovariate \
-cov CycleCovariate \
-knownSites ${variants_file} \
-cov ContextCovariate \
-R ${genome} -I ${bam} \
--downsampling_type NONE \
-nct ${task.cpus} \
-o final.rnaseq.grp
$GATK -T PrintReads \
-R ${genome} -I ${bam} \
-BQSR final.rnaseq.grp \
-nct ${task.cpus} \
-o final.bam
# Select only unique alignments, no multimaps
(samtools view -H final.bam; samtools view final.bam| grep -w 'NH:i:1') \
|samtools view -Sb - > ${replicateId}.final.uniq.bam
# Index BAM files
samtools index ${replicateId}.final.uniq.bam
"""
}
/*
* END OF PART 4
******/
/***********
* PART 5: GATK Variant Calling
*
* Process 5: Call variants with GATK HaplotypeCaller.
* Calls SNPs and indels simultaneously via local de-novo assembly of
* haplotypes in an active region.
* Filter called variants with GATK VariantFiltration.
*/
process '5_rnaseq_call_variants' {
tag "$sampleId"
input:
file genome from genome_file
file index from genome_index_ch
file dict from genome_dict_ch
set sampleId, file(bam), file(bai) from final_output_ch.groupTuple()
output:
set sampleId, file('final.vcf') into vcf_files
script:
"""
# fix absolute path in dict file
sed -i 's@UR:file:.*${genome}@UR:file:${genome}@g' $dict
echo "${bam.join('\n')}" > bam.list
# Variant calling
$GATK -T HaplotypeCaller \
-R $genome -I bam.list \
-dontUseSoftClippedBases \
-stand_call_conf 20.0 \
-o output.gatk.vcf.gz
# Variant filtering
$GATK -T VariantFiltration \
-R $genome -V output.gatk.vcf.gz \
-window 35 -cluster 3 \
-filterName FS -filter "FS > 30.0" \
-filterName QD -filter "QD < 2.0" \
-o final.vcf
"""
}
/*
* END OF PART 5
******/
/***********
* PART 6: Post-process variants file and prepare for Allele-Specific Expression and RNA Editing Analysis
*
* Process 6A: Post-process the VCF result
*/
process '6A_post_process_vcf' {
tag "$sampleId"
publishDir "$params.results/$sampleId"
input:
set sampleId, file('final.vcf') from vcf_files
set file('filtered.recode.vcf.gz'), file('filtered.recode.vcf.gz.tbi') from prepared_vcf_ch
output:
set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') into vcf_and_snps_ch
script:
'''
grep -v '#' final.vcf | awk '$7~/PASS/' |perl -ne 'chomp($_); ($dp)=$_=~/DP\\=(\\d+)\\;/; if($dp>=8){print $_."\\n"};' > result.DP8.vcf
vcftools --vcf result.DP8.vcf --gzdiff filtered.recode.vcf.gz --diff-site --out commonSNPs
'''
}
/*
* Process 6B: Prepare variants file for allele specific expression (ASE) analysis
*/
process '6B_prepare_vcf_for_ase' {
tag "$sampleId"
publishDir "$params.results/$sampleId"
input:
set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') from vcf_and_snps_ch
output:
set sampleId, file('known_snps.vcf') into vcf_for_ASE
file('AF.histogram.pdf') into gghist_pdfs
script:
'''
awk 'BEGIN{OFS="\t"} $4~/B/{print $1,$2,$3}' commonSNPs.diff.sites_in_files > test.bed
vcftools --vcf final.vcf --bed test.bed --recode --keep-INFO-all --stdout > known_snps.vcf
grep -v '#' known_snps.vcf | awk -F '\\t' '{print $10}' \
|awk -F ':' '{print $2}'|perl -ne 'chomp($_); \
@v=split(/\\,/,$_); if($v[0]!=0 ||$v[1] !=0)\
{print $v[1]/($v[1]+$v[0])."\\n"; }' |awk '$1!=1' \
>AF.4R
gghist.R -i AF.4R -o AF.histogram.pdf
'''
}
/*
* Group data for allele-specific expression.
*
* The `bam_for_ASE_ch` emites tuples having the following structure, holding the final BAM/BAI files:
*
* ( sample_id, file_bam, file_bai )
*
* The `vcf_for_ASE` channel emits tuples having the following structure, holding the VCF file:
*
* ( sample_id, output.vcf )
*
* The BAMs are grouped together and merged with VCFs having the same sample id. Finally
* it creates a channel named `grouped_vcf_bam_bai_ch` emitting the following tuples:
*
* ( sample_id, file_vcf, List[file_bam], List[file_bai] )
*/
bam_for_ASE_ch
.groupTuple()
.phase(vcf_for_ASE)
.map{ left, right ->
def sampleId = left[0]
def bam = left[1]
def bai = left[2]
def vcf = right[1]
tuple(sampleId, vcf, bam, bai)
}
.set { grouped_vcf_bam_bai_ch }
/*
* Process 6C: Allele-Specific Expression analysis with GATK ASEReadCounter.
* Calculates allele counts at a set of positions after applying
* filters that are tuned for enabling allele-specific expression
* (ASE) analysis
*/
process '6C_ASE_knownSNPs' {
tag "$sampleId"
publishDir "$params.results/$sampleId"
input:
file genome from genome_file
file index from genome_index_ch
file dict from genome_dict_ch
set sampleId, file(vcf), file(bam), file(bai) from grouped_vcf_bam_bai_ch
output:
file "ASE.tsv"
script:
"""
echo "${bam.join('\n')}" > bam.list
$GATK -R ${genome} \
-T ASEReadCounter \
-o ASE.tsv \
-I bam.list \
-sites ${vcf}
"""
}