mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			102 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			R
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			102 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			R
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env Rscript
 | |
| 
 | |
| # Copyright (c) 2013 Daniel S. Standage, released under MIT license
 | |
| #
 | |
| # expr-dist: plot distributions of expression values before and after
 | |
| #            normalization; visually confirm that normalization worked
 | |
| #            as expected
 | |
| #
 | |
| # Program input is a matrix of expression values, each row corresponding to a
 | |
| # molecule (gene, transcript, etc) and each row corresponding to that molecule's
 | |
| # expression level or abundance. The program expects the rows and columns to be
 | |
| # named, and was tested primarily on output produced by the
 | |
| # 'rsem-generate-data-matrix' script distributed with the RSEM package.
 | |
| #
 | |
| # The program plots the distributions of the logged expression values by sample
 | |
| # as provided, then normalizes the values, and finally plots the distribution of
 | |
| # the logged normalized expression values by sample. The expectation is that all
 | |
| # samples' distributions will have a similar shape but different medians prior
 | |
| # to normalization, and that post normalization they will all have an identical
 | |
| # median to facilitate cross-sample comparison.
 | |
| 
 | |
| 
 | |
| # MedianNorm function borrowed from the EBSeq library version 1.1.6
 | |
| # See http://www.bioconductor.org/packages/devel/bioc/html/EBSeq.html
 | |
| MedianNorm <- function(data)
 | |
| {
 | |
|   geomeans <- exp( rowMeans(log(data)) )
 | |
|   apply(data, 2, function(cnts) median((cnts/geomeans)[geomeans > 0]))
 | |
| }
 | |
| 
 | |
| library("getopt")
 | |
| print_usage <- function(file=stderr())
 | |
| {
 | |
|   cat("
 | |
| expr-dist: see source code for full description
 | |
| Usage: expr-dist [options] < expr-matrix.txt
 | |
|   Options:
 | |
|     -h|--help:          print this help message and exit
 | |
|     -o|--out: STRING    prefix for output files; default is 'expr-dist'
 | |
|     -r|--res: INT       resolution (dpi) of generated graphics; default is 150
 | |
|     -t|--height: INT    height (pixels) of generated graphics; default is 1200
 | |
|     -w|--width: INT     width (pixels) of generated graphics; default is 1200
 | |
|     -y|--ylim: REAL     the visible range of the Y axis depends on the first
 | |
|                         distribution plotted; if other distributions are getting
 | |
|                         cut off, use this setting to override the default\n\n")
 | |
| }
 | |
| 
 | |
| spec <- matrix( c("help",   'h', 0, "logical",
 | |
|                   "out",    'o', 1, "character",
 | |
|                   "res",    'r', 1, "integer",
 | |
|                   "height", 't', 1, "integer",
 | |
|                   "width",  'w', 1, "integer",
 | |
|                   "ylim",   'y', 1, "double"),
 | |
|                 byrow=TRUE, ncol=4)
 | |
| opt  <- getopt(spec)
 | |
| if(!is.null(opt$help))
 | |
| {
 | |
|   print_usage(file=stdout())
 | |
|   q(status=1)
 | |
| }
 | |
| if(is.null(opt$height)) { opt$height <- 1200           }
 | |
| if(is.null(opt$out))    { opt$out    <- "expr-dist"    }
 | |
| if(is.null(opt$res))    { opt$res    <- 150            }
 | |
| if(is.null(opt$width))  { opt$width  <- 1200           }
 | |
| if(!is.null(opt$ylim))  { opt$ylim   <- c(0, opt$ylim) }
 | |
| 
 | |
| # Load data, determine number of samples
 | |
| data  <- read.table(file("stdin"), header=TRUE, sep="\t", quote="")
 | |
| nsamp <- dim(data)[2] - 1
 | |
| data  <- data[,1:nsamp+1]
 | |
| 
 | |
| # Plot distribution of expression values before normalization
 | |
| outfile <- sprintf("%s-median.png", opt$out)
 | |
| png(outfile, height=opt$height, width=opt$width, res=opt$res)
 | |
| h <- hist(log(data[,1]), plot=FALSE)
 | |
| plot(h$mids, h$density, type="l", col=rainbow(nsamp)[1], main="",
 | |
|      xlab="Log expression value", ylab="Proportion of molecules", ylim=opt$ylim)
 | |
| for(i in 2:nsamp)
 | |
| {
 | |
|   h <- hist(log(data[,i]), plot=FALSE)
 | |
|   lines(h$mids, h$density, col=rainbow(nsamp)[i])
 | |
| }
 | |
| devnum <- dev.off()
 | |
| 
 | |
| # Normalize by median
 | |
| size.factors <- MedianNorm(data.matrix(data))
 | |
| data.norm <- t(apply(data, 1, function(x){ x / size.factors }))
 | |
| 
 | |
| # Plot distribution of normalized expression values
 | |
| outfile <- sprintf("%s-median-norm.png", opt$out)
 | |
| png(outfile, height=opt$height, width=opt$width, res=opt$res)
 | |
| h <- hist(log(data.norm[,1]), plot=FALSE)
 | |
| plot(h$mids, h$density, type="l", col=rainbow(nsamp)[1], main="",
 | |
|      xlab="Log normalized expression value", ylab="Proportion of molecules",
 | |
|      ylim=opt$ylim)
 | |
| for(i in 2:nsamp)
 | |
| {
 | |
|   h <- hist(log(data.norm[,i]), plot=FALSE)
 | |
|   lines(h$mids, h$density, col=rainbow(nsamp)[i])
 | |
| }
 | |
| devnum <- dev.off()
 |