From bcefa61fe0c4f73cee254340e8b9f28858daf47a Mon Sep 17 00:00:00 2001 From: Daniel Standage Date: Thu, 14 Nov 2013 22:29:41 -0500 Subject: [PATCH] Added new Rscript code --- samples/R/expr-dist | 101 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100755 samples/R/expr-dist diff --git a/samples/R/expr-dist b/samples/R/expr-dist new file mode 100755 index 00000000..1f7ab280 --- /dev/null +++ b/samples/R/expr-dist @@ -0,0 +1,101 @@ +#!/usr/bin/env Rscript + +# Copyright (c) 2013 Daniel S. Standage, released under MIT license +# +# expr-dist: plot distributions of expression values before and after +# normalization; visually confirm that normalization worked +# as expected +# +# Program input is a matrix of expression values, each row corresponding to a +# molecule (gene, transcript, etc) and each row corresponding to that molecule's +# expression level or abundance. The program expects the rows and columns to be +# named, and was tested primarily on output produced by the +# 'rsem-generate-data-matrix' script distributed with the RSEM package. +# +# The program plots the distributions of the logged expression values by sample +# as provided, then normalizes the values, and finally plots the distribution of +# the logged normalized expression values by sample. The expectation is that all +# samples' distributions will have a similar shape but different medians prior +# to normalization, and that post normalization they will all have an identical +# median to facilitate cross-sample comparison. + + +# MedianNorm function borrowed from the EBSeq library version 1.1.6 +# See http://www.bioconductor.org/packages/devel/bioc/html/EBSeq.html +MedianNorm <- function(data) +{ + geomeans <- exp( rowMeans(log(data)) ) + apply(data, 2, function(cnts) median((cnts/geomeans)[geomeans > 0])) +} + +library("getopt") +print_usage <- function(file=stderr()) +{ + cat(" +expr-dist: see source code for full description +Usage: expr-dist [options] < expr-matrix.txt + Options: + -h|--help: print this help message and exit + -o|--out: STRING prefix for output files; default is 'expr-dist' + -r|--res: INT resolution (dpi) of generated graphics; default is 150 + -t|--height: INT height (pixels) of generated graphics; default is 1200 + -w|--width: INT width (pixels) of generated graphics; default is 1200 + -y|--ylim: REAL the visible range of the Y axis depends on the first + distribution plotted; if other distributions are getting + cut off, use this setting to override the default\n\n") +} + +spec <- matrix( c("help", 'h', 0, "logical", + "out", 'o', 1, "character", + "res", 'r', 1, "integer", + "height", 't', 1, "integer", + "width", 'w', 1, "integer", + "ylim", 'y', 1, "double"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) +if(!is.null(opt$help)) +{ + print_usage(file=stdout()) + q(status=1) +} +if(is.null(opt$height)) { opt$height <- 1200 } +if(is.null(opt$out)) { opt$out <- "expr-dist" } +if(is.null(opt$res)) { opt$res <- 150 } +if(is.null(opt$width)) { opt$width <- 1200 } +if(!is.null(opt$ylim)) { opt$ylim <- c(0, opt$ylim) } + +# Load data, determine number of samples +data <- read.table(file("stdin"), header=TRUE, sep="\t", quote="") +nsamp <- dim(data)[2] - 1 +data <- data[,1:nsamp+1] + +# Plot distribution of expression values before normalization +outfile <- sprintf("%s-median.png", opt$out) +png(outfile, height=opt$height, width=opt$width, res=opt$res) +h <- hist(log(data[,1]), plot=FALSE) +plot(h$mids, h$density, type="l", col=rainbow(nsamp)[1], main="", + xlab="Log expression value", ylab="Proportion of molecules", ylim=opt$ylim) +for(i in 2:nsamp) +{ + h <- hist(log(data[,i]), plot=FALSE) + lines(h$mids, h$density, col=rainbow(nsamp)[i]) +} +devnum <- dev.off() + +# Normalize by median +size.factors <- MedianNorm(data.matrix(data)) +data.norm <- t(apply(data, 1, function(x){ x / size.factors })) + +# Plot distribution of normalized expression values +outfile <- sprintf("%s-median-norm.png", opt$out) +png(outfile, height=opt$height, width=opt$width, res=opt$res) +h <- hist(log(data.norm[,1]), plot=FALSE) +plot(h$mids, h$density, type="l", col=rainbow(nsamp)[1], main="", + xlab="Log normalized expression value", ylab="Proportion of molecules", + ylim=opt$ylim) +for(i in 2:nsamp) +{ + h <- hist(log(data.norm[,i]), plot=FALSE) + lines(h$mids, h$density, col=rainbow(nsamp)[i]) +} +devnum <- dev.off()