#!/usr/bin/env nextflow /* * This is free and unencumbered software released into the public domain. * * Anyone is free to copy, modify, publish, use, compile, sell, or * distribute this software, either in source code form or as a compiled * binary, for any purpose, commercial or non-commercial, and by any * means. * * In jurisdictions that recognize copyright laws, the author or authors * of this software dedicate any and all copyright interest in the * software to the public domain. We make this dedication for the benefit * of the public at large and to the detriment of our heirs and * successors. We intend this dedication to be an overt act of * relinquishment in perpetuity of all present and future rights to this * software under copyright law. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * For more information, please refer to */ /* * 'CalliNGS-NF' - A Nextflow pipeline for variant calling with NGS data * * This pipeline that reproduces steps from the GATK best practics of SNP * calling with RNAseq data procedure: * https://software.broadinstitute.org/gatk/guide/article?id=3891 * * Anna Vlasova * Emilio Palumbo * Paolo Di Tommaso * Evan Floden */ /* * Define the default parameters */ params.genome = "$baseDir/data/genome.fa" params.variants = "$baseDir/data/known_variants.vcf.gz" params.blacklist = "$baseDir/data/blacklist.bed" params.reads = "$baseDir/data/reads/rep1_{1,2}.fq.gz" params.results = "results" params.gatk = '/usr/local/bin/GenomeAnalysisTK.jar' params.gatk_launch = "java -jar $params.gatk" log.info "C A L L I N G S - N F v 1.0" log.info "================================" log.info "genome : $params.genome" log.info "reads : $params.reads" log.info "variants : $params.variants" log.info "blacklist: $params.blacklist" log.info "results : $params.results" log.info "gatk : $params.gatk" log.info "" /* * Parse the input parameters */ GATK = params.gatk_launch genome_file = file(params.genome) variants_file = file(params.variants) blacklist_file = file(params.blacklist) reads_ch = Channel.fromFilePairs(params.reads) /********** * PART 1: Data preparation * * Process 1A: Create a FASTA genome index (.fai) with samtools for GATK */ process '1A_prepare_genome_samtools' { tag "$genome.baseName" input: file genome from genome_file output: file "${genome}.fai" into genome_index_ch script: """ samtools faidx ${genome} """ } /* * Process 1B: Create a FASTA genome sequence dictionary with Picard for GATK */ process '1B_prepare_genome_picard' { tag "$genome.baseName" input: file genome from genome_file output: file "${genome.baseName}.dict" into genome_dict_ch script: """ PICARD=`which picard.jar` java -jar \$PICARD CreateSequenceDictionary R= $genome O= ${genome.baseName}.dict """ } /* * Process 1C: Create STAR genome index file. */ process '1C_prepare_star_genome_index' { tag "$genome.baseName" input: file genome from genome_file output: file "genome_dir" into genome_dir_ch script: """ mkdir genome_dir STAR --runMode genomeGenerate \ --genomeDir genome_dir \ --genomeFastaFiles ${genome} \ --runThreadN ${task.cpus} """ } /* * Process 1D: Create a file containing the filtered and recoded set of variants */ process '1D_prepare_vcf_file' { tag "$variantsFile.baseName" input: file variantsFile from variants_file file blacklisted from blacklist_file output: set file("${variantsFile.baseName}.filtered.recode.vcf.gz"), file("${variantsFile.baseName}.filtered.recode.vcf.gz.tbi") into prepared_vcf_ch script: """ vcftools --gzvcf $variantsFile -c \ --exclude-bed ${blacklisted} \ --recode | bgzip -c \ > ${variantsFile.baseName}.filtered.recode.vcf.gz tabix ${variantsFile.baseName}.filtered.recode.vcf.gz """ } /* * END OF PART 1 *********/ /********** * PART 2: STAR RNA-Seq Mapping * * Process 2: Align RNA-Seq reads to the genome with STAR */ process '2_rnaseq_mapping_star' { tag "$replicateId" input: file genome from genome_file file genomeDir from genome_dir_ch set replicateId, file(reads) from reads_ch output: set replicateId, file('Aligned.sortedByCoord.out.bam'), file('Aligned.sortedByCoord.out.bam.bai') into aligned_bam_ch script: """ # ngs-nf-dev Align reads to genome STAR --genomeDir $genomeDir \ --readFilesIn $reads \ --runThreadN ${task.cpus} \ --readFilesCommand zcat \ --outFilterType BySJout \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 # 2nd pass (improve alignmets using table of splice junctions and create a new index) mkdir genomeDir STAR --runMode genomeGenerate \ --genomeDir genomeDir \ --genomeFastaFiles $genome \ --sjdbFileChrStartEnd SJ.out.tab \ --sjdbOverhang 75 \ --runThreadN ${task.cpus} # Final read alignments STAR --genomeDir genomeDir \ --readFilesIn $reads \ --runThreadN ${task.cpus} \ --readFilesCommand zcat \ --outFilterType BySJout \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 \ --outSAMtype BAM SortedByCoordinate \ --outSAMattrRGline ID:$replicateId LB:library PL:illumina PU:machine SM:GM12878 # Index the BAM file samtools index Aligned.sortedByCoord.out.bam """ } /* * END OF PART 2 ******/ /********** * PART 3: GATK Prepare Mapped Reads * * Process 3: Split reads that contain Ns in their CIGAR string. * Creates k+1 new reads (where k is the number of N cigar elements) * that correspond to the segments of the original read beside/between * the splicing events represented by the Ns in the original CIGAR. */ process '3_rnaseq_gatk_splitNcigar' { tag "$replicateId" input: file genome from genome_file file index from genome_index_ch file genome_dict from genome_dict_ch set replicateId, file(bam), file(index) from aligned_bam_ch output: set replicateId, file('split.bam'), file('split.bai') into splitted_bam_ch script: """ # SplitNCigarReads and reassign mapping qualities $GATK -T SplitNCigarReads \ -R $genome -I $bam \ -o split.bam \ -rf ReassignOneMappingQuality \ -RMQF 255 -RMQT 60 \ -U ALLOW_N_CIGAR_READS \ --fix_misencoded_quality_scores """ } /* * END OF PART 3 ******/ /*********** * PART 4: GATK Base Quality Score Recalibration Workflow * * Process 4: Base recalibrate to detect systematic errors in base quality scores, * select unique alignments and index * */ process '4_rnaseq_gatk_recalibrate' { tag "$replicateId" input: file genome from genome_file file index from genome_index_ch file dict from genome_dict_ch set replicateId, file(bam), file(index) from splitted_bam_ch set file(variants_file), file(variants_file_index) from prepared_vcf_ch output: set sampleId, file("${replicateId}.final.uniq.bam"), file("${replicateId}.final.uniq.bam.bai") into (final_output_ch, bam_for_ASE_ch) script: sampleId = replicateId.replaceAll(/[12]$/,'') """ # Indel Realignment and Base Recalibration $GATK -T BaseRecalibrator \ --default_platform illumina \ -cov ReadGroupCovariate \ -cov QualityScoreCovariate \ -cov CycleCovariate \ -knownSites ${variants_file} \ -cov ContextCovariate \ -R ${genome} -I ${bam} \ --downsampling_type NONE \ -nct ${task.cpus} \ -o final.rnaseq.grp $GATK -T PrintReads \ -R ${genome} -I ${bam} \ -BQSR final.rnaseq.grp \ -nct ${task.cpus} \ -o final.bam # Select only unique alignments, no multimaps (samtools view -H final.bam; samtools view final.bam| grep -w 'NH:i:1') \ |samtools view -Sb - > ${replicateId}.final.uniq.bam # Index BAM files samtools index ${replicateId}.final.uniq.bam """ } /* * END OF PART 4 ******/ /*********** * PART 5: GATK Variant Calling * * Process 5: Call variants with GATK HaplotypeCaller. * Calls SNPs and indels simultaneously via local de-novo assembly of * haplotypes in an active region. * Filter called variants with GATK VariantFiltration. */ process '5_rnaseq_call_variants' { tag "$sampleId" input: file genome from genome_file file index from genome_index_ch file dict from genome_dict_ch set sampleId, file(bam), file(bai) from final_output_ch.groupTuple() output: set sampleId, file('final.vcf') into vcf_files script: """ # fix absolute path in dict file sed -i 's@UR:file:.*${genome}@UR:file:${genome}@g' $dict echo "${bam.join('\n')}" > bam.list # Variant calling $GATK -T HaplotypeCaller \ -R $genome -I bam.list \ -dontUseSoftClippedBases \ -stand_call_conf 20.0 \ -o output.gatk.vcf.gz # Variant filtering $GATK -T VariantFiltration \ -R $genome -V output.gatk.vcf.gz \ -window 35 -cluster 3 \ -filterName FS -filter "FS > 30.0" \ -filterName QD -filter "QD < 2.0" \ -o final.vcf """ } /* * END OF PART 5 ******/ /*********** * PART 6: Post-process variants file and prepare for Allele-Specific Expression and RNA Editing Analysis * * Process 6A: Post-process the VCF result */ process '6A_post_process_vcf' { tag "$sampleId" publishDir "$params.results/$sampleId" input: set sampleId, file('final.vcf') from vcf_files set file('filtered.recode.vcf.gz'), file('filtered.recode.vcf.gz.tbi') from prepared_vcf_ch output: set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') into vcf_and_snps_ch script: ''' grep -v '#' final.vcf | awk '$7~/PASS/' |perl -ne 'chomp($_); ($dp)=$_=~/DP\\=(\\d+)\\;/; if($dp>=8){print $_."\\n"};' > result.DP8.vcf vcftools --vcf result.DP8.vcf --gzdiff filtered.recode.vcf.gz --diff-site --out commonSNPs ''' } /* * Process 6B: Prepare variants file for allele specific expression (ASE) analysis */ process '6B_prepare_vcf_for_ase' { tag "$sampleId" publishDir "$params.results/$sampleId" input: set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') from vcf_and_snps_ch output: set sampleId, file('known_snps.vcf') into vcf_for_ASE file('AF.histogram.pdf') into gghist_pdfs script: ''' awk 'BEGIN{OFS="\t"} $4~/B/{print $1,$2,$3}' commonSNPs.diff.sites_in_files > test.bed vcftools --vcf final.vcf --bed test.bed --recode --keep-INFO-all --stdout > known_snps.vcf grep -v '#' known_snps.vcf | awk -F '\\t' '{print $10}' \ |awk -F ':' '{print $2}'|perl -ne 'chomp($_); \ @v=split(/\\,/,$_); if($v[0]!=0 ||$v[1] !=0)\ {print $v[1]/($v[1]+$v[0])."\\n"; }' |awk '$1!=1' \ >AF.4R gghist.R -i AF.4R -o AF.histogram.pdf ''' } /* * Group data for allele-specific expression. * * The `bam_for_ASE_ch` emites tuples having the following structure, holding the final BAM/BAI files: * * ( sample_id, file_bam, file_bai ) * * The `vcf_for_ASE` channel emits tuples having the following structure, holding the VCF file: * * ( sample_id, output.vcf ) * * The BAMs are grouped together and merged with VCFs having the same sample id. Finally * it creates a channel named `grouped_vcf_bam_bai_ch` emitting the following tuples: * * ( sample_id, file_vcf, List[file_bam], List[file_bai] ) */ bam_for_ASE_ch .groupTuple() .phase(vcf_for_ASE) .map{ left, right -> def sampleId = left[0] def bam = left[1] def bai = left[2] def vcf = right[1] tuple(sampleId, vcf, bam, bai) } .set { grouped_vcf_bam_bai_ch } /* * Process 6C: Allele-Specific Expression analysis with GATK ASEReadCounter. * Calculates allele counts at a set of positions after applying * filters that are tuned for enabling allele-specific expression * (ASE) analysis */ process '6C_ASE_knownSNPs' { tag "$sampleId" publishDir "$params.results/$sampleId" input: file genome from genome_file file index from genome_index_ch file dict from genome_dict_ch set sampleId, file(vcf), file(bam), file(bai) from grouped_vcf_bam_bai_ch output: file "ASE.tsv" script: """ echo "${bam.join('\n')}" > bam.list $GATK -R ${genome} \ -T ASEReadCounter \ -o ASE.tsv \ -I bam.list \ -sites ${vcf} """ }