Merge pull request #1159 from github/780-update

780 update
This commit is contained in:
Arfon Smith
2014-05-03 18:31:48 -05:00
3 changed files with 192 additions and 37 deletions

View File

@@ -1584,6 +1584,8 @@ R:
aliases: aliases:
- R - R
primary_extension: .r primary_extension: .r
aliases:
- Rscript
extensions: extensions:
- .R - .R
- .rsx - .rsx

View File

@@ -649,6 +649,9 @@
"Perl": [ "Perl": [
"ack" "ack"
], ],
"R": [
"expr-dist"
],
"Ruby": [ "Ruby": [
"Appraisals", "Appraisals",
"Capfile", "Capfile",
@@ -688,8 +691,8 @@
".gemrc" ".gemrc"
] ]
}, },
"tokens_total": 591725, "tokens_total": 592149,
"languages_total": 719, "languages_total": 720,
"tokens": { "tokens": {
"ABAP": { "ABAP": {
"*/**": 1, "*/**": 1,
@@ -52298,39 +52301,39 @@
}, },
"R": { "R": {
"df.residual.mira": 1, "df.residual.mira": 1,
"<": 24, "<": 46,
"-": 28, "-": 51,
"function": 14, "function": 18,
"(": 163, "(": 219,
"object": 12, "object": 12,
"...": 4, "...": 4,
")": 162, ")": 220,
"{": 35, "{": 46,
"fit": 2, "fit": 2,
"analyses": 1, "analyses": 1,
"[": 13, "[": 23,
"]": 13, "]": 24,
"return": 8, "return": 8,
"df.residual": 2, "df.residual": 2,
"}": 35, "}": 46,
"df.residual.lme": 1, "df.residual.lme": 1,
"fixDF": 1, "fixDF": 1,
"df.residual.mer": 1, "df.residual.mer": 1,
"sum": 1, "sum": 1,
"object@dims": 1, "object@dims": 1,
"*": 2, "*": 2,
"c": 9, "c": 11,
"+": 3, "+": 4,
"df.residual.default": 1, "df.residual.default": 1,
"q": 2, "q": 3,
"df": 3, "df": 3,
"if": 13, "if": 19,
"is.null": 2, "is.null": 8,
"mk": 2, "mk": 2,
"try": 3, "try": 3,
"coef": 1, "coef": 1,
"silent": 3, "silent": 3,
"TRUE": 12, "TRUE": 14,
"mn": 2, "mn": 2,
"f": 9, "f": 9,
"fitted": 1, "fitted": 1,
@@ -52345,15 +52348,75 @@
"length": 3, "length": 3,
"k": 3, "k": 3,
"max": 1, "max": 1,
"SHEBANG#!Rscript": 1, "SHEBANG#!Rscript": 2,
"#": 45,
"MedianNorm": 2,
"data": 11,
"geomeans": 3,
"<->": 1,
"exp": 1,
"rowMeans": 1,
"log": 5,
"apply": 2,
"2": 1,
"cnts": 2,
"median": 1,
"library": 1,
"print_usage": 2,
"file": 4,
"stderr": 1,
"cat": 1,
"spec": 2,
"matrix": 3,
"byrow": 3,
"ncol": 3,
"opt": 23,
"getopt": 1,
"help": 1,
"stdout": 1,
"status": 1,
"height": 7,
"out": 4,
"res": 6,
"width": 7,
"ylim": 7,
"read.table": 1,
"header": 1,
"sep": 4,
"quote": 1,
"nsamp": 8,
"dim": 1,
"outfile": 4,
"sprintf": 2,
"png": 2,
"h": 12,
"hist": 4,
"plot": 7,
"FALSE": 9,
"mids": 4,
"density": 4,
"type": 3,
"col": 4,
"rainbow": 4,
"main": 2,
"xlab": 2,
"ylab": 2,
"for": 3,
"i": 6,
"in": 8,
"lines": 6,
"devnum": 2,
"dev.off": 2,
"size.factors": 2,
"data.matrix": 1,
"data.norm": 3,
"t": 1,
"x": 3,
"/": 1,
"ParseDates": 2, "ParseDates": 2,
"lines": 4,
"dates": 3, "dates": 3,
"matrix": 2,
"unlist": 2, "unlist": 2,
"strsplit": 3, "strsplit": 3,
"ncol": 2,
"byrow": 2,
"days": 2, "days": 2,
"times": 2, "times": 2,
"hours": 2, "hours": 2,
@@ -52374,7 +52437,6 @@
"ggplot": 1, "ggplot": 1,
"aes": 2, "aes": 2,
"y": 1, "y": 1,
"x": 1,
"geom_point": 1, "geom_point": 1,
"size": 1, "size": 1,
"Freq": 1, "Freq": 1,
@@ -52382,12 +52444,8 @@
"range": 1, "range": 1,
"ggsave": 1, "ggsave": 1,
"filename": 1, "filename": 1,
"plot": 1,
"width": 1,
"height": 1,
"hello": 2, "hello": 2,
"print": 1, "print": 1,
"#": 42,
"module": 25, "module": 25,
"code": 19, "code": 19,
"available": 1, "available": 1,
@@ -52409,7 +52467,6 @@
"even": 1, "even": 1,
"attach": 11, "attach": 11,
"is": 7, "is": 7,
"FALSE": 5,
"optionally": 1, "optionally": 1,
"attached": 2, "attached": 2,
"to": 8, "to": 8,
@@ -52419,7 +52476,6 @@
"defaults": 1, "defaults": 1,
".": 5, ".": 5,
"However": 1, "However": 1,
"in": 6,
"interactive": 2, "interactive": 2,
"invoked": 1, "invoked": 1,
"directly": 1, "directly": 1,
@@ -52467,7 +52523,6 @@
"first.": 1, "first.": 1,
"That": 1, "That": 1,
"local": 3, "local": 3,
"file": 1,
"./a.r": 1, "./a.r": 1,
"will": 2, "will": 2,
"loaded.": 1, "loaded.": 1,
@@ -52537,7 +52592,6 @@
"parent": 9, "parent": 9,
".BaseNamespaceEnv": 1, ".BaseNamespaceEnv": 1,
"paste": 3, "paste": 3,
"sep": 3,
"source": 2, "source": 2,
"chdir": 1, "chdir": 1,
"envir": 5, "envir": 5,
@@ -52590,7 +52644,6 @@
"Reloading": 1, "Reloading": 1,
"primarily": 1, "primarily": 1,
"useful": 1, "useful": 1,
"for": 1,
"testing": 1, "testing": 1,
"during": 1, "during": 1,
"module_ref": 3, "module_ref": 3,
@@ -52612,8 +52665,7 @@
"pts": 1, "pts": 1,
"spsample": 1, "spsample": 1,
"polyg": 1, "polyg": 1,
"numpoints": 1, "numpoints": 1
"type": 1
}, },
"Racket": { "Racket": {
";": 3, ";": 3,
@@ -63056,7 +63108,7 @@
"Protocol Buffer": 63, "Protocol Buffer": 63,
"PureScript": 1652, "PureScript": 1652,
"Python": 5715, "Python": 5715,
"R": 1243, "R": 1667,
"Racket": 331, "Racket": 331,
"Ragel in Ruby Host": 593, "Ragel in Ruby Host": 593,
"RDoc": 279, "RDoc": 279,
@@ -63236,7 +63288,7 @@
"Protocol Buffer": 1, "Protocol Buffer": 1,
"PureScript": 4, "PureScript": 4,
"Python": 7, "Python": 7,
"R": 5, "R": 6,
"Racket": 2, "Racket": 2,
"Ragel in Ruby Host": 3, "Ragel in Ruby Host": 3,
"RDoc": 1, "RDoc": 1,
@@ -63287,5 +63339,5 @@
"YAML": 2, "YAML": 2,
"Zephir": 2 "Zephir": 2
}, },
"md5": "58816c8da227d1157f624a68c2f3ab55" "md5": "fa38e2b617caaf230146a7adab264419"
} }

101
samples/R/filenames/expr-dist Executable file
View File

@@ -0,0 +1,101 @@
#!/usr/bin/env Rscript
# Copyright (c) 2013 Daniel S. Standage, released under MIT license
#
# expr-dist: plot distributions of expression values before and after
# normalization; visually confirm that normalization worked
# as expected
#
# Program input is a matrix of expression values, each row corresponding to a
# molecule (gene, transcript, etc) and each column corresponding to a sample;
# each value is that molecule's expression level or abundance in that sample.
# The program expects the rows and columns to be named, and was tested
# primarily on output produced by the 'rsem-generate-data-matrix' script
# distributed with the RSEM package.
#
# The program plots the distributions of the logged expression values by sample
# as provided, then normalizes the values, and finally plots the distribution of
# the logged normalized expression values by sample. The expectation is that all
# samples' distributions will have a similar shape but different medians prior
# to normalization, and that post normalization they will all have an identical
# median to facilitate cross-sample comparison.
# MedianNorm function borrowed from the EBSeq library version 1.1.6
# See http://www.bioconductor.org/packages/devel/bioc/html/EBSeq.html
# Compute one median-based size factor per column of an expression matrix.
#
# data: numeric matrix of expression values (rows are molecules, columns are
#       samples).
# Returns a vector with one entry per column: the median of that column's
# values divided by the row-wise geometric means, using only the rows whose
# geometric mean is greater than zero.
MedianNorm <- function(data) {
  row.geomeans <- exp(rowMeans(log(data)))
  # Rows containing a zero have a geometric mean of zero and are left out.
  usable <- row.geomeans > 0
  apply(data, 2, function(sample.values) {
    ratios <- sample.values / row.geomeans
    median(ratios[usable])
  })
}
library("getopt")
# Print the command-line usage message.
#
# file: connection the message is written to; defaults to stderr().
print_usage <- function(file = stderr()) {
  # `file` must be handed to cat(); otherwise the message always goes to
  # standard output, whatever connection the caller asked for.
  cat("
expr-dist: see source code for full description
Usage: expr-dist [options] < expr-matrix.txt
Options:
-h|--help: print this help message and exit
-o|--out: STRING prefix for output files; default is 'expr-dist'
-r|--res: INT resolution (dpi) of generated graphics; default is 150
-t|--height: INT height (pixels) of generated graphics; default is 1200
-w|--width: INT width (pixels) of generated graphics; default is 1200
-y|--ylim: REAL the visible range of the Y axis depends on the first
distribution plotted; if other distributions are getting
cut off, use this setting to override the default\n\n", file = file)
}
# Option table handed to getopt, one row per option: long name, short flag,
# argument flag, and value type.
spec <- rbind(
  c("help",   "h", 0, "logical"),
  c("out",    "o", 1, "character"),
  c("res",    "r", 1, "integer"),
  c("height", "t", 1, "integer"),
  c("width",  "w", 1, "integer"),
  c("ylim",   "y", 1, "double")
)
opt <- getopt(spec)

# Help requested: write the usage text to standard output and quit.
if (!is.null(opt$help)) {
  print_usage(file = stdout())
  q(status = 1)
}

# Fall back to the documented defaults for options that were not supplied.
if (is.null(opt$out)) {
  opt$out <- "expr-dist"
}
if (is.null(opt$res)) {
  opt$res <- 150
}
if (is.null(opt$height)) {
  opt$height <- 1200
}
if (is.null(opt$width)) {
  opt$width <- 1200
}

# A supplied ylim is the upper bound; the Y axis range then starts at zero.
# When it is not supplied, opt$ylim stays NULL.
if (!is.null(opt$ylim)) {
  opt$ylim <- c(0, opt$ylim)
}
# Load data, determine number of samples. Input is read from standard input as
# a tab-separated table with a header row; the first column is dropped
# (presumably it holds the molecule names) and every remaining column is
# counted as one sample.
data <- read.table(file("stdin"), header = TRUE, sep = "\t", quote = "")
nsamp <- ncol(data) - 1
if (nsamp < 1) {
  stop("input must contain at least one sample column after the first column",
       call. = FALSE)
}
# Select columns 2..(nsamp + 1). drop = FALSE keeps a data frame even when
# there is only one sample, so later column indexing (data[, i]) still works.
data <- data[, seq_len(nsamp) + 1, drop = FALSE]
# Plot distribution of expression values before normalization: one curve per
# sample, taken from the histogram density of the logged values.
outfile <- sprintf("%s-median.png", opt$out)
png(outfile, height = opt$height, width = opt$width, res = opt$res)
# One color per sample, computed once instead of on every iteration.
sample.colors <- rainbow(nsamp)
h <- hist(log(data[, 1]), plot = FALSE)
plot(h$mids, h$density, type = "l", col = sample.colors[1], main = "",
     xlab = "Log expression value", ylab = "Proportion of molecules",
     ylim = opt$ylim)
# seq_len(nsamp)[-1] is empty when there is a single sample, whereas 2:nsamp
# would count down (2, 1) and index a column that does not exist.
for (i in seq_len(nsamp)[-1]) {
  h <- hist(log(data[, i]), plot = FALSE)
  lines(h$mids, h$density, col = sample.colors[i])
}
devnum <- dev.off()
# Normalize by median: divide each sample (column) by its size factor.
expr.matrix <- data.matrix(data)
size.factors <- MedianNorm(expr.matrix)
# sweep() divides column-wise and always returns a matrix shaped like its
# input. t(apply(data, 1, ...)) instead yields a 1 x n matrix when there is a
# single sample column, because apply() then returns a plain vector.
data.norm <- sweep(expr.matrix, 2, size.factors, "/")
# Plot distribution of normalized expression values: one curve per sample,
# taken from the histogram density of the logged normalized values.
outfile <- sprintf("%s-median-norm.png", opt$out)
png(outfile, height = opt$height, width = opt$width, res = opt$res)
# One color per sample, computed once instead of on every iteration.
sample.colors <- rainbow(nsamp)
h <- hist(log(data.norm[, 1]), plot = FALSE)
plot(h$mids, h$density, type = "l", col = sample.colors[1], main = "",
     xlab = "Log normalized expression value",
     ylab = "Proportion of molecules", ylim = opt$ylim)
# seq_len(nsamp)[-1] is empty when there is a single sample, whereas 2:nsamp
# would count down (2, 1) and index a column that does not exist.
for (i in seq_len(nsamp)[-1]) {
  h <- hist(log(data.norm[, i]), plot = FALSE)
  lines(h$mids, h$density, col = sample.colors[i])
}
devnum <- dev.off()