diff --git a/lib/linguist/languages.yml b/lib/linguist/languages.yml index 5195c5fe..224daf81 100644 --- a/lib/linguist/languages.yml +++ b/lib/linguist/languages.yml @@ -1584,6 +1584,7 @@ R: aliases: - R + - Rscript primary_extension: .r extensions: - .R - .rsx diff --git a/lib/linguist/samples.json b/lib/linguist/samples.json index 6320bde3..43b79b3a 100644 --- a/lib/linguist/samples.json +++ b/lib/linguist/samples.json @@ -649,6 +649,9 @@ "Perl": [ "ack" ], + "R": [ + "expr-dist" + ], "Ruby": [ "Appraisals", "Capfile", @@ -688,8 +691,8 @@ ".gemrc" ] }, - "tokens_total": 591725, - "languages_total": 719, + "tokens_total": 592149, + "languages_total": 720, "tokens": { "ABAP": { "*/**": 1, @@ -52298,39 +52301,39 @@ }, "R": { "df.residual.mira": 1, - "<": 24, - "-": 28, - "function": 14, - "(": 163, + "<": 46, + "-": 51, + "function": 18, + "(": 219, "object": 12, "...": 4, - ")": 162, - "{": 35, + ")": 220, + "{": 46, "fit": 2, "analyses": 1, - "[": 13, - "]": 13, + "[": 23, + "]": 24, "return": 8, "df.residual": 2, - "}": 35, + "}": 46, "df.residual.lme": 1, "fixDF": 1, "df.residual.mer": 1, "sum": 1, "object@dims": 1, "*": 2, - "c": 9, - "+": 3, + "c": 11, + "+": 4, "df.residual.default": 1, - "q": 2, + "q": 3, "df": 3, - "if": 13, - "is.null": 2, + "if": 19, + "is.null": 8, "mk": 2, "try": 3, "coef": 1, "silent": 3, - "TRUE": 12, + "TRUE": 14, "mn": 2, "f": 9, "fitted": 1, @@ -52345,15 +52348,75 @@ "length": 3, "k": 3, "max": 1, - "SHEBANG#!Rscript": 1, + "SHEBANG#!Rscript": 2, + "#": 45, + "MedianNorm": 2, + "data": 11, + "geomeans": 3, + "<->": 1, + "exp": 1, + "rowMeans": 1, + "log": 5, + "apply": 2, + "2": 1, + "cnts": 2, + "median": 1, + "library": 1, + "print_usage": 2, + "file": 4, + "stderr": 1, + "cat": 1, + "spec": 2, + "matrix": 3, + "byrow": 3, + "ncol": 3, + "opt": 23, + "getopt": 1, + "help": 1, + "stdout": 1, + "status": 1, + "height": 7, + "out": 4, + "res": 6, + "width": 7, + "ylim": 7, + "read.table": 1, + "header": 1, + "sep": 4,
"quote": 1, + "nsamp": 8, + "dim": 1, + "outfile": 4, + "sprintf": 2, + "png": 2, + "h": 12, + "hist": 4, + "plot": 7, + "FALSE": 9, + "mids": 4, + "density": 4, + "type": 3, + "col": 4, + "rainbow": 4, + "main": 2, + "xlab": 2, + "ylab": 2, + "for": 3, + "i": 6, + "in": 8, + "lines": 6, + "devnum": 2, + "dev.off": 2, + "size.factors": 2, + "data.matrix": 1, + "data.norm": 3, + "t": 1, + "x": 3, + "/": 1, "ParseDates": 2, - "lines": 4, "dates": 3, - "matrix": 2, "unlist": 2, "strsplit": 3, - "ncol": 2, - "byrow": 2, "days": 2, "times": 2, "hours": 2, @@ -52374,7 +52437,6 @@ "ggplot": 1, "aes": 2, "y": 1, - "x": 1, "geom_point": 1, "size": 1, "Freq": 1, @@ -52382,12 +52444,8 @@ "range": 1, "ggsave": 1, "filename": 1, - "plot": 1, - "width": 1, - "height": 1, "hello": 2, "print": 1, - "#": 42, "module": 25, "code": 19, "available": 1, @@ -52409,7 +52467,6 @@ "even": 1, "attach": 11, "is": 7, - "FALSE": 5, "optionally": 1, "attached": 2, "to": 8, @@ -52419,7 +52476,6 @@ "defaults": 1, ".": 5, "However": 1, - "in": 6, "interactive": 2, "invoked": 1, "directly": 1, @@ -52467,7 +52523,6 @@ "first.": 1, "That": 1, "local": 3, - "file": 1, "./a.r": 1, "will": 2, "loaded.": 1, @@ -52537,7 +52592,6 @@ "parent": 9, ".BaseNamespaceEnv": 1, "paste": 3, - "sep": 3, "source": 2, "chdir": 1, "envir": 5, @@ -52590,7 +52644,6 @@ "Reloading": 1, "primarily": 1, "useful": 1, - "for": 1, "testing": 1, "during": 1, "module_ref": 3, @@ -52612,8 +52665,7 @@ "pts": 1, "spsample": 1, "polyg": 1, - "numpoints": 1, - "type": 1 + "numpoints": 1 }, "Racket": { ";": 3, @@ -63056,7 +63108,7 @@ "Protocol Buffer": 63, "PureScript": 1652, "Python": 5715, - "R": 1243, + "R": 1667, "Racket": 331, "Ragel in Ruby Host": 593, "RDoc": 279, @@ -63236,7 +63288,7 @@ "Protocol Buffer": 1, "PureScript": 4, "Python": 7, - "R": 5, + "R": 6, "Racket": 2, "Ragel in Ruby Host": 3, "RDoc": 1, @@ -63287,5 +63339,5 @@ "YAML": 2, "Zephir": 2 }, - "md5": "58816c8da227d1157f624a68c2f3ab55" + "md5": 
"fa38e2b617caaf230146a7adab264419" } \ No newline at end of file diff --git a/samples/R/filenames/expr-dist b/samples/R/filenames/expr-dist new file mode 100755 index 00000000..1f7ab280 --- /dev/null +++ b/samples/R/filenames/expr-dist @@ -0,0 +1,101 @@ +#!/usr/bin/env Rscript + +# Copyright (c) 2013 Daniel S. Standage, released under MIT license +# +# expr-dist: plot distributions of expression values before and after +# normalization; visually confirm that normalization worked +# as expected +# +# Program input is a matrix of expression values, each row corresponding to a +# molecule (gene, transcript, etc) and each column corresponding to a sample's +# expression level or abundance. The program expects the rows and columns to be +# named, and was tested primarily on output produced by the +# 'rsem-generate-data-matrix' script distributed with the RSEM package. +# +# The program plots the distributions of the logged expression values by sample +# as provided, then normalizes the values, and finally plots the distribution of +# the logged normalized expression values by sample. The expectation is that all +# samples' distributions will have a similar shape but different medians prior +# to normalization, and that post normalization they will all have an identical +# median to facilitate cross-sample comparison.
library("getopt")

# MedianNorm function borrowed from the EBSeq library version 1.1.6
# See http://www.bioconductor.org/packages/devel/bioc/html/EBSeq.html
#
# Returns one size factor per column of the numeric matrix `data`: the median,
# taken over rows, of each value divided by its row's geometric mean. Rows
# whose geometric mean is not greater than zero (for example, a row containing
# a zero) are excluded from the median.
MedianNorm <- function(data)
{
  geomeans <- exp( rowMeans(log(data)) )
  apply(data, 2, function(cnts) median((cnts/geomeans)[geomeans > 0]))
}

# Print the command-line help text to the connection `file` (stderr unless the
# caller passes another connection).
print_usage <- function(file=stderr())
{
  # `file` must be forwarded to cat(); without it the text always goes to
  # standard output regardless of what the caller asked for.
  cat("
expr-dist: see source code for full description
Usage: expr-dist [options] < expr-matrix.txt
  Options:
    -h|--help:    print this help message and exit
    -o|--out:     STRING  prefix for output files; default is 'expr-dist'
    -r|--res:     INT     resolution (dpi) of generated graphics; default is 150
    -t|--height:  INT     height (pixels) of generated graphics; default is 1200
    -w|--width:   INT     width (pixels) of generated graphics; default is 1200
    -y|--ylim:    REAL    the visible range of the Y axis depends on the first
                          distribution plotted; if other distributions are getting
                          cut off, use this setting to override the default\n\n",
      file=file)
}

# Write a PNG named `outfile` holding one line per column of `values`: each
# column is log-transformed and binned with hist(), and the bin densities are
# drawn against the bin midpoints. `xlab` labels the X axis; `opt` supplies
# the height, width, res and (possibly NULL) ylim settings.
plot_distributions <- function(values, outfile, xlab, opt)
{
  nsamp <- ncol(values)
  colors <- rainbow(nsamp)

  png(outfile, height=opt$height, width=opt$width, res=opt$res)
  # Close the device even if hist()/plot() fails part-way through.
  on.exit(dev.off(), add=TRUE)

  # The first sample creates the plot and therefore fixes the axis ranges.
  h <- hist(log(values[,1]), plot=FALSE)
  plot(h$mids, h$density, type="l", col=colors[1], main="", xlab=xlab,
       ylab="Proportion of molecules", ylim=opt$ylim)

  # seq_len(nsamp)[-1] is empty for a single sample, whereas 2:nsamp would
  # count down (2, 1) and index a column that does not exist.
  for(i in seq_len(nsamp)[-1])
  {
    h <- hist(log(values[,i]), plot=FALSE)
    lines(h$mids, h$density, col=colors[i])
  }
  invisible(outfile)
}

# Command-line options: long name, short flag, argument mode (0 = none,
# 1 = required), value type.
spec <- matrix( c("help",   "h", 0, "logical",
                  "out",    "o", 1, "character",
                  "res",    "r", 1, "integer",
                  "height", "t", 1, "integer",
                  "width",  "w", 1, "integer",
                  "ylim",   "y", 1, "double"),
                byrow=TRUE, ncol=4)
opt <- getopt(spec)
if(!is.null(opt$help))
{
  # Help was explicitly requested: print to stdout and exit successfully.
  print_usage(file=stdout())
  q(status=0)
}
if(is.null(opt$height)) { opt$height <- 1200 }
if(is.null(opt$out))    { opt$out    <- "expr-dist" }
if(is.null(opt$res))    { opt$res    <- 150 }
if(is.null(opt$width))  { opt$width  <- 1200 }
# A user-supplied ylim is the upper bound; the lower bound is always 0. When
# it is absent, opt$ylim stays NULL and plot() picks the range itself.
if(!is.null(opt$ylim))  { opt$ylim   <- c(0, opt$ylim) }

# Load data, determine number of samples (every column after the first)
data <- read.table(file("stdin"), header=TRUE, sep="\t", quote="")
nsamp <- ncol(data) - 1
if(nsamp < 1)
{
  stop("input must have a name column followed by at least one sample column",
       call.=FALSE)
}
# Drop the leading name column; drop=FALSE keeps a single sample as a
# one-column table instead of collapsing it to a vector.
data <- data[, -1, drop=FALSE]

# Plot distribution of expression values before normalization
plot_distributions(data, sprintf("%s-median.png", opt$out),
                   "Log expression value", opt)

# Normalize by median: divide every sample (column) by its size factor
counts <- data.matrix(data)
size.factors <- MedianNorm(counts)
data.norm <- sweep(counts, 2, size.factors, "/")

# Plot distribution of normalized expression values
plot_distributions(data.norm, sprintf("%s-median-norm.png", opt$out),
                   "Log normalized expression value", opt)