#!/usr/bin/env nextflow
/*
 * This is free and unencumbered software released into the public domain.
 *
 * Anyone is free to copy, modify, publish, use, compile, sell, or
 * distribute this software, either in source code form or as a compiled
 * binary, for any purpose, commercial or non-commercial, and by any
 * means.
 *
 * In jurisdictions that recognize copyright laws, the author or authors
 * of this software dedicate any and all copyright interest in the
 * software to the public domain. We make this dedication for the benefit
 * of the public at large and to the detriment of our heirs and
 * successors. We intend this dedication to be an overt act of
 * relinquishment in perpetuity of all present and future rights to this
 * software under copyright law.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * For more information, please refer to <http://unlicense.org/>
 */


/*
 * 'CalliNGS-NF' - A Nextflow pipeline for variant calling with NGS data
 *
 * This pipeline reproduces steps from the GATK best practices procedure
 * for SNP calling with RNA-seq data:
 * https://software.broadinstitute.org/gatk/guide/article?id=3891
 *
 * Anna Vlasova
 * Emilio Palumbo
 * Paolo Di Tommaso
 * Evan Floden
 */


/*
 * Define the default parameters
 */

params.genome      = "$baseDir/data/genome.fa"
params.variants    = "$baseDir/data/known_variants.vcf.gz"
params.blacklist   = "$baseDir/data/blacklist.bed"
params.reads       = "$baseDir/data/reads/rep1_{1,2}.fq.gz"
params.results     = "results"
params.gatk        = '/usr/local/bin/GenomeAnalysisTK.jar'
params.gatk_launch = "java -jar $params.gatk"
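
/*
 * Any of the parameters above can be overridden at launch time from the
 * command line. An illustrative invocation (the paths are placeholders,
 * not files shipped with this pipeline):
 *
 *   nextflow run main.nf --reads '/data/reads/sampleA_{1,2}.fq.gz' --results my_results
 */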

log.info "C A L L I N G S  -  N F    v 1.0"
log.info "================================"
log.info "genome   : $params.genome"
log.info "reads    : $params.reads"
log.info "variants : $params.variants"
log.info "blacklist: $params.blacklist"
log.info "results  : $params.results"
log.info "gatk     : $params.gatk"
log.info ""

/*
 * Parse the input parameters
 */

GATK            = params.gatk_launch
genome_file     = file(params.genome)
variants_file   = file(params.variants)
blacklist_file  = file(params.blacklist)
reads_ch        = Channel.fromFilePairs(params.reads)
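
/*
 * `Channel.fromFilePairs` emits one tuple per matched read pair: the common
 * prefix used as the grouping key, followed by the list of files. For the
 * default glob above this is roughly (illustrative values):
 *
 *   [ 'rep1', [rep1_1.fq.gz, rep1_2.fq.gz] ]
 */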


/**********
 * PART 1: Data preparation
 *
 * Process 1A: Create a FASTA genome index (.fai) with samtools for GATK
 */

process '1A_prepare_genome_samtools' {
  tag "$genome.baseName"

  input:
      file genome from genome_file

  output:
      file "${genome}.fai" into genome_index_ch

  script:
  """
  samtools faidx ${genome}
  """
}

/*
 * Process 1B: Create a FASTA genome sequence dictionary with Picard for GATK
 */

process '1B_prepare_genome_picard' {
  tag "$genome.baseName"

  input:
      file genome from genome_file

  output:
      file "${genome.baseName}.dict" into genome_dict_ch

  script:
  """
  PICARD=`which picard.jar`
  java -jar \$PICARD CreateSequenceDictionary R= $genome O= ${genome.baseName}.dict
  """
}

/*
 * Process 1C: Create the STAR genome index directory.
 */

process '1C_prepare_star_genome_index' {
  tag "$genome.baseName"

  input:
      file genome from genome_file

  output:
      file "genome_dir" into genome_dir_ch

  script:
  """
  mkdir genome_dir

  STAR --runMode genomeGenerate \
       --genomeDir genome_dir \
       --genomeFastaFiles ${genome} \
       --runThreadN ${task.cpus}
  """
}

/*
 * Process 1D: Create a file containing the filtered and recoded set of variants
 */

process '1D_prepare_vcf_file' {
  tag "$variantsFile.baseName"

  input:
      file variantsFile from variants_file
      file blacklisted from blacklist_file

  output:
      set file("${variantsFile.baseName}.filtered.recode.vcf.gz"), file("${variantsFile.baseName}.filtered.recode.vcf.gz.tbi") into prepared_vcf_ch

  script:
  """
  vcftools --gzvcf $variantsFile -c \
           --exclude-bed ${blacklisted} \
           --recode | bgzip -c \
           > ${variantsFile.baseName}.filtered.recode.vcf.gz

  tabix ${variantsFile.baseName}.filtered.recode.vcf.gz
  """
}

/*
 *  END OF PART 1
 *********/


/**********
 * PART 2: STAR RNA-Seq Mapping
 *
 * Process 2: Align RNA-Seq reads to the genome with STAR
 */

process '2_rnaseq_mapping_star' {
  tag "$replicateId"

  input:
      file genome from genome_file
      file genomeDir from genome_dir_ch
      set replicateId, file(reads) from reads_ch

  output:
      set replicateId, file('Aligned.sortedByCoord.out.bam'), file('Aligned.sortedByCoord.out.bam.bai') into aligned_bam_ch

  script:
  """
  # ngs-nf-dev Align reads to genome
  STAR --genomeDir $genomeDir \
       --readFilesIn $reads \
       --runThreadN ${task.cpus} \
       --readFilesCommand zcat \
       --outFilterType BySJout \
       --alignSJoverhangMin 8 \
       --alignSJDBoverhangMin 1 \
       --outFilterMismatchNmax 999

  # 2nd pass (improve alignments using the table of splice junctions and create a new index)
  mkdir genomeDir
  STAR --runMode genomeGenerate \
       --genomeDir genomeDir \
       --genomeFastaFiles $genome \
       --sjdbFileChrStartEnd SJ.out.tab \
       --sjdbOverhang 75 \
       --runThreadN ${task.cpus}

  # Final read alignments
  STAR --genomeDir genomeDir \
       --readFilesIn $reads \
       --runThreadN ${task.cpus} \
       --readFilesCommand zcat \
       --outFilterType BySJout \
       --alignSJoverhangMin 8 \
       --alignSJDBoverhangMin 1 \
       --outFilterMismatchNmax 999 \
       --outSAMtype BAM SortedByCoordinate \
       --outSAMattrRGline ID:$replicateId LB:library PL:illumina PU:machine SM:GM12878

  # Index the BAM file
  samtools index Aligned.sortedByCoord.out.bam
  """
}

/*
 *  END OF PART 2
 ******/


/**********
 * PART 3: GATK Prepare Mapped Reads
 *
 * Process 3: Split reads that contain Ns in their CIGAR string.
 *            Creates k+1 new reads (where k is the number of N cigar elements)
 *            that correspond to the segments of the original read beside/between
 *            the splicing events represented by the Ns in the original CIGAR.
 */
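
/*
 * Illustrative example: a spliced read with CIGAR 50M2000N25M spans one
 * intron (k = 1), so SplitNCigarReads emits k+1 = 2 reads, one covering the
 * 50M exonic segment and one covering the 25M exonic segment.
 */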

process '3_rnaseq_gatk_splitNcigar' {
  tag "$replicateId"

  input:
      file genome from genome_file
      file index from genome_index_ch
      file genome_dict from genome_dict_ch
      set replicateId, file(bam), file(index) from aligned_bam_ch

  output:
      set replicateId, file('split.bam'), file('split.bai') into splitted_bam_ch

  script:
  """
  # SplitNCigarReads and reassign mapping qualities
  $GATK -T SplitNCigarReads \
          -R $genome -I $bam \
          -o split.bam \
          -rf ReassignOneMappingQuality \
          -RMQF 255 -RMQT 60 \
          -U ALLOW_N_CIGAR_READS \
          --fix_misencoded_quality_scores
  """
}

/*
 *  END OF PART 3
 ******/


/***********
 * PART 4: GATK Base Quality Score Recalibration Workflow
 *
 * Process 4: Base recalibrate to detect systematic errors in base quality scores,
 *            select unique alignments and index
 */

process '4_rnaseq_gatk_recalibrate' {
  tag "$replicateId"

  input:
      file genome from genome_file
      file index from genome_index_ch
      file dict from genome_dict_ch
      set replicateId, file(bam), file(index) from splitted_bam_ch
      set file(variants_file), file(variants_file_index) from prepared_vcf_ch

  output:
      set sampleId, file("${replicateId}.final.uniq.bam"), file("${replicateId}.final.uniq.bam.bai") into (final_output_ch, bam_for_ASE_ch)

  script:
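  // Derive the sample id from the replicate id by stripping the trailing
  // replicate number (e.g. 'rep1' and 'rep2' both become 'rep'), so that
  // replicates of the same sample share a key for groupTuple() downstream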
  sampleId = replicateId.replaceAll(/[12]$/,'')
  """
  # Indel Realignment and Base Recalibration
  $GATK -T BaseRecalibrator \
          --default_platform illumina \
          -cov ReadGroupCovariate \
          -cov QualityScoreCovariate \
          -cov CycleCovariate \
          -knownSites ${variants_file} \
          -cov ContextCovariate \
          -R ${genome} -I ${bam} \
          --downsampling_type NONE \
          -nct ${task.cpus} \
          -o final.rnaseq.grp

  $GATK -T PrintReads \
          -R ${genome} -I ${bam} \
          -BQSR final.rnaseq.grp \
          -nct ${task.cpus} \
          -o final.bam

  # Select only unique alignments, no multimaps
  (samtools view -H final.bam; samtools view final.bam | grep -w 'NH:i:1') \
  | samtools view -Sb - > ${replicateId}.final.uniq.bam

  # Index BAM files
  samtools index ${replicateId}.final.uniq.bam
  """
}

/*
 *  END OF PART 4
 ******/


/***********
 * PART 5: GATK Variant Calling
 *
 * Process 5: Call variants with GATK HaplotypeCaller.
 *            Calls SNPs and indels simultaneously via local de-novo assembly of
 *            haplotypes in an active region.
 *            Filter called variants with GATK VariantFiltration.
 */

process '5_rnaseq_call_variants' {
  tag "$sampleId"

  input:
      file genome from genome_file
      file index from genome_index_ch
      file dict from genome_dict_ch
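      // groupTuple() bundles the incoming tuples by their first element
      // (the sampleId), so all replicate BAM/BAI files of a sample arrive
      // together as lists and are genotyped jointly below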
      set sampleId, file(bam), file(bai) from final_output_ch.groupTuple()

  output:
      set sampleId, file('final.vcf') into vcf_files

  script:
  """
  # fix absolute path in dict file
  sed -i 's@UR:file:.*${genome}@UR:file:${genome}@g' $dict
  echo "${bam.join('\n')}" > bam.list

  # Variant calling
  $GATK -T HaplotypeCaller \
          -R $genome -I bam.list \
          -dontUseSoftClippedBases \
          -stand_call_conf 20.0 \
          -o output.gatk.vcf.gz

  # Variant filtering
  $GATK -T VariantFiltration \
          -R $genome -V output.gatk.vcf.gz \
          -window 35 -cluster 3 \
          -filterName FS -filter "FS > 30.0" \
          -filterName QD -filter "QD < 2.0" \
          -o final.vcf
  """
}

/*
 *  END OF PART 5
 ******/


/***********
 * PART 6: Post-process variants file and prepare for Allele-Specific Expression and RNA Editing Analysis
 *
 * Process 6A: Post-process the VCF result
 */

process '6A_post_process_vcf' {
  tag "$sampleId"
  publishDir "$params.results/$sampleId"

  input:
      set sampleId, file('final.vcf') from vcf_files
      set file('filtered.recode.vcf.gz'), file('filtered.recode.vcf.gz.tbi') from prepared_vcf_ch

  output:
      set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') into vcf_and_snps_ch

  script:
  '''
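  # Keep only variant lines that passed the filters (FILTER column == PASS)
  # and have a read depth of at least 8 (DP >= 8)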
  grep -v '#' final.vcf | awk '$7~/PASS/' |perl -ne 'chomp($_); ($dp)=$_=~/DP\\=(\\d+)\\;/; if($dp>=8){print $_."\\n"};' > result.DP8.vcf

  vcftools --vcf result.DP8.vcf --gzdiff filtered.recode.vcf.gz --diff-site --out commonSNPs
  '''
}

/*
 * Process 6B: Prepare variants file for allele specific expression (ASE) analysis
 */

process '6B_prepare_vcf_for_ase' {
  tag "$sampleId"
  publishDir "$params.results/$sampleId"

  input:
      set sampleId, file('final.vcf'), file('commonSNPs.diff.sites_in_files') from vcf_and_snps_ch

  output:
      set sampleId, file('known_snps.vcf') into vcf_for_ASE
      file('AF.histogram.pdf') into gghist_pdfs

  script:
  '''
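  # Select sites reported in both VCFs ("B" in the IN_FILE column of the
  # vcftools --diff-site output) and write their coordinates as a BED file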
  awk 'BEGIN{OFS="\t"} $4~/B/{print $1,$2,$3}' commonSNPs.diff.sites_in_files > test.bed

  vcftools --vcf final.vcf --bed test.bed --recode --keep-INFO-all --stdout > known_snps.vcf

  grep -v '#' known_snps.vcf | awk -F '\\t' '{print $10}' \
               |awk -F ':' '{print $2}'|perl -ne 'chomp($_); \
               @v=split(/\\,/,$_); if($v[0]!=0 ||$v[1] !=0)\
               {print  $v[1]/($v[1]+$v[0])."\\n"; }' |awk '$1!=1' \
               >AF.4R

  gghist.R -i AF.4R -o AF.histogram.pdf
  '''
}


/*
 * Group data for allele-specific expression.
 *
 * The `bam_for_ASE_ch` channel emits tuples having the following structure, holding the final BAM/BAI files:
 *
 *   ( sample_id, file_bam, file_bai )
 *
 * The `vcf_for_ASE` channel emits tuples having the following structure, holding the VCF file:
 *
 *   ( sample_id, output.vcf )
 *
 * The BAMs are grouped together and merged with the VCFs having the same sample id. Finally
 * it creates a channel named `grouped_vcf_bam_bai_ch` emitting the following tuples:
 *
 *   ( sample_id, file_vcf, List[file_bam], List[file_bai] )
 */

bam_for_ASE_ch
    .groupTuple()
    .phase(vcf_for_ASE)
    .map{ left, right ->
      def sampleId = left[0]
      def bam = left[1]
      def bai = left[2]
      def vcf = right[1]
      tuple(sampleId, vcf, bam, bai)
    }
    .set { grouped_vcf_bam_bai_ch }


/*
 * Process 6C: Allele-Specific Expression analysis with GATK ASEReadCounter.
 *             Calculates allele counts at a set of positions after applying
 *             filters that are tuned for enabling allele-specific expression
 *             (ASE) analysis
 */

process '6C_ASE_knownSNPs' {
  tag "$sampleId"
  publishDir "$params.results/$sampleId"

  input:
      file genome from genome_file
      file index from genome_index_ch
      file dict from genome_dict_ch
      set sampleId, file(vcf), file(bam), file(bai) from grouped_vcf_bam_bai_ch

  output:
      file "ASE.tsv"

  script:
  """
  echo "${bam.join('\n')}" > bam.list

  $GATK -R ${genome} \
          -T ASEReadCounter \
          -o ASE.tsv \
          -I bam.list \
          -sites ${vcf}
  """
}