Merge pull request #1159 from github/780-update

780 update
2026-02-01 22:25:58 +00:00 · 2014-05-03 18:31:48 -05:00
parent 3023516796 44a0d19ac0
commit 8b878784a4
3 changed files with 192 additions and 37 deletions
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -1584,6 +1584,7 @@ R:
  aliases:
  - R
+  - Rscript
  primary_extension: .r
  extensions:
  - .R
  - .rsx
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
@@ -649,6 +649,9 @@
    "Perl": [
      "ack"
    ],
+    "R": [
+      "expr-dist"
+    ],
    "Ruby": [
      "Appraisals",
      "Capfile",
@@ -688,8 +691,8 @@
      ".gemrc"
    ]
  },
-  "tokens_total": 591725,
-  "languages_total": 719,
+  "tokens_total": 592149,
+  "languages_total": 720,
  "tokens": {
    "ABAP": {
      "*/**": 1,
@@ -52298,39 +52301,39 @@
    },
    "R": {
      "df.residual.mira": 1,
-      "<": 24,
-      "-": 28,
-      "function": 14,
-      "(": 163,
+      "<": 46,
+      "-": 51,
+      "function": 18,
+      "(": 219,
      "object": 12,
      "...": 4,
-      ")": 162,
-      "{": 35,
+      ")": 220,
+      "{": 46,
      "fit": 2,
      "analyses": 1,
-      "[": 13,
-      "]": 13,
+      "[": 23,
+      "]": 24,
      "return": 8,
      "df.residual": 2,
-      "}": 35,
+      "}": 46,
      "df.residual.lme": 1,
      "fixDF": 1,
      "df.residual.mer": 1,
      "sum": 1,
      "object@dims": 1,
      "*": 2,
-      "c": 9,
-      "+": 3,
+      "c": 11,
+      "+": 4,
      "df.residual.default": 1,
-      "q": 2,
+      "q": 3,
      "df": 3,
-      "if": 13,
-      "is.null": 2,
+      "if": 19,
+      "is.null": 8,
      "mk": 2,
      "try": 3,
      "coef": 1,
      "silent": 3,
-      "TRUE": 12,
+      "TRUE": 14,
      "mn": 2,
      "f": 9,
      "fitted": 1,
@@ -52345,15 +52348,75 @@
      "length": 3,
      "k": 3,
      "max": 1,
-      "SHEBANG#!Rscript": 1,
+      "SHEBANG#!Rscript": 2,
+      "#": 45,
+      "MedianNorm": 2,
+      "data": 11,
+      "geomeans": 3,
+      "<->": 1,
+      "exp": 1,
+      "rowMeans": 1,
+      "log": 5,
+      "apply": 2,
+      "2": 1,
+      "cnts": 2,
+      "median": 1,
+      "library": 1,
+      "print_usage": 2,
+      "file": 4,
+      "stderr": 1,
+      "cat": 1,
+      "spec": 2,
+      "matrix": 3,
+      "byrow": 3,
+      "ncol": 3,
+      "opt": 23,
+      "getopt": 1,
+      "help": 1,
+      "stdout": 1,
+      "status": 1,
+      "height": 7,
+      "out": 4,
+      "res": 6,
+      "width": 7,
+      "ylim": 7,
+      "read.table": 1,
+      "header": 1,
+      "sep": 4,
+      "quote": 1,
+      "nsamp": 8,
+      "dim": 1,
+      "outfile": 4,
+      "sprintf": 2,
+      "png": 2,
+      "h": 12,
+      "hist": 4,
+      "plot": 7,
+      "FALSE": 9,
+      "mids": 4,
+      "density": 4,
+      "type": 3,
+      "col": 4,
+      "rainbow": 4,
+      "main": 2,
+      "xlab": 2,
+      "ylab": 2,
+      "for": 3,
+      "i": 6,
+      "in": 8,
+      "lines": 6,
+      "devnum": 2,
+      "dev.off": 2,
+      "size.factors": 2,
+      "data.matrix": 1,
+      "data.norm": 3,
+      "t": 1,
+      "x": 3,
+      "/": 1,
      "ParseDates": 2,
-      "lines": 4,
      "dates": 3,
-      "matrix": 2,
      "unlist": 2,
      "strsplit": 3,
-      "ncol": 2,
-      "byrow": 2,
      "days": 2,
      "times": 2,
      "hours": 2,
@@ -52374,7 +52437,6 @@
      "ggplot": 1,
      "aes": 2,
      "y": 1,
-      "x": 1,
      "geom_point": 1,
      "size": 1,
      "Freq": 1,
@@ -52382,12 +52444,8 @@
      "range": 1,
      "ggsave": 1,
      "filename": 1,
-      "plot": 1,
-      "width": 1,
-      "height": 1,
      "hello": 2,
      "print": 1,
-      "#": 42,
      "module": 25,
      "code": 19,
      "available": 1,
@@ -52409,7 +52467,6 @@
      "even": 1,
      "attach": 11,
      "is": 7,
-      "FALSE": 5,
      "optionally": 1,
      "attached": 2,
      "to": 8,
@@ -52419,7 +52476,6 @@
      "defaults": 1,
      ".": 5,
      "However": 1,
-      "in": 6,
      "interactive": 2,
      "invoked": 1,
      "directly": 1,
@@ -52467,7 +52523,6 @@
      "first.": 1,
      "That": 1,
      "local": 3,
-      "file": 1,
      "./a.r": 1,
      "will": 2,
      "loaded.": 1,
@@ -52537,7 +52592,6 @@
      "parent": 9,
      ".BaseNamespaceEnv": 1,
      "paste": 3,
-      "sep": 3,
      "source": 2,
      "chdir": 1,
      "envir": 5,
@@ -52590,7 +52644,6 @@
      "Reloading": 1,
      "primarily": 1,
      "useful": 1,
-      "for": 1,
      "testing": 1,
      "during": 1,
      "module_ref": 3,
@@ -52612,8 +52665,7 @@
      "pts": 1,
      "spsample": 1,
      "polyg": 1,
-      "numpoints": 1,
-      "type": 1
+      "numpoints": 1
    },
    "Racket": {
      ";": 3,
@@ -63056,7 +63108,7 @@
    "Protocol Buffer": 63,
    "PureScript": 1652,
    "Python": 5715,
-    "R": 1243,
+    "R": 1667,
    "Racket": 331,
    "Ragel in Ruby Host": 593,
    "RDoc": 279,
@@ -63236,7 +63288,7 @@
    "Protocol Buffer": 1,
    "PureScript": 4,
    "Python": 7,
-    "R": 5,
+    "R": 6,
    "Racket": 2,
    "Ragel in Ruby Host": 3,
    "RDoc": 1,
@@ -63287,5 +63339,5 @@
    "YAML": 2,
    "Zephir": 2
  },
-  "md5": "58816c8da227d1157f624a68c2f3ab55"
+  "md5": "fa38e2b617caaf230146a7adab264419"
 }
--- a/samples/R/filenames/expr-dist
+++ b/samples/R/filenames/expr-dist
@@ -0,0 +1,101 @@
+#!/usr/bin/env Rscript
+
+# Copyright (c) 2013 Daniel S. Standage, released under MIT license
+#
+# expr-dist: plot distributions of expression values before and after
+#            normalization; visually confirm that normalization worked
+#            as expected
+#
+# Program input is a matrix of expression values, each row corresponding to a
+# molecule (gene, transcript, etc) and each column corresponding to that molecule's
+# expression level or abundance. The program expects the rows and columns to be
+# named, and was tested primarily on output produced by the
+# 'rsem-generate-data-matrix' script distributed with the RSEM package.
+#
+# The program plots the distributions of the logged expression values by sample
+# as provided, then normalizes the values, and finally plots the distribution of
+# the logged normalized expression values by sample. The expectation is that all
+# samples' distributions will have a similar shape but different medians prior
+# to normalization, and that post normalization they will all have an identical
+# median to facilitate cross-sample comparison.
+
+
+# MedianNorm: per-column size factors, adapted from the EBSeq library (1.1.6)
+# See http://www.bioconductor.org/packages/devel/bioc/html/EBSeq.html
+#
+# data: numeric matrix of expression values.
+# Returns one value per column: the median of that column's values divided by
+# the row-wise geometric means, using only rows whose geometric mean is > 0.
+MedianNorm <- function(data)
+{
+  row.geomeans <- exp(rowMeans(log(data)))
+  usable <- row.geomeans > 0
+  apply(data, 2, function(column.values)
+  {
+    ratios <- column.values / row.geomeans
+    median(ratios[usable])
+  })
+}
+
+library("getopt")
+# Print the command-line usage message.
+#
+# file: connection the message is written to; defaults to stderr().
+print_usage <- function(file=stderr())
+{
+  # cat() writes to stdout unless it is given 'file', so the argument has to
+  # be forwarded for the caller's choice of connection to take effect.
+  cat("
+expr-dist: see source code for full description
+Usage: expr-dist [options] < expr-matrix.txt
+  Options:
+    -h|--help:          print this help message and exit
+    -o|--out: STRING    prefix for output files; default is 'expr-dist'
+    -r|--res: INT       resolution (dpi) of generated graphics; default is 150
+    -t|--height: INT    height (pixels) of generated graphics; default is 1200
+    -w|--width: INT     width (pixels) of generated graphics; default is 1200
+    -y|--ylim: REAL     the visible range of the Y axis depends on the first
+                        distribution plotted; if other distributions are getting
+                        cut off, use this setting to override the default\n\n", file=file)
+}
+
+# Option table for getopt, one row per option: long name, short flag,
+# whether the option takes a value (0 or 1), and the type of that value
+spec <- rbind(c("help",   "h", 0, "logical"),
+              c("out",    "o", 1, "character"),
+              c("res",    "r", 1, "integer"),
+              c("height", "t", 1, "integer"),
+              c("width",  "w", 1, "integer"),
+              c("ylim",   "y", 1, "double"))
+opt <- getopt(spec)
+
+# Asking for help prints the usage message to stdout and quits with status 1
+if(!is.null(opt$help))
+{
+  print_usage(file=stdout())
+  q(status=1)
+}
+
+# Fill in defaults for options that were not supplied
+if(is.null(opt$out))
+{
+  opt$out <- "expr-dist"
+}
+if(is.null(opt$res))
+{
+  opt$res <- 150
+}
+if(is.null(opt$height))
+{
+  opt$height <- 1200
+}
+if(is.null(opt$width))
+{
+  opt$width <- 1200
+}
+
+# A supplied ylim is turned into the Y-axis range c(0, ylim)
+if(!is.null(opt$ylim))
+{
+  opt$ylim <- c(0, opt$ylim)
+}
+
+# Load data, determine number of samples
+data  <- read.table(file("stdin"), header=TRUE, sep="\t", quote="")
+nsamp <- ncol(data) - 1
+if(nsamp < 1)
+{
+  stop("expected a label column followed by at least one sample column")
+}
+# Drop the first column and keep the nsamp sample columns; drop=FALSE keeps a
+# data frame even when only one sample column remains
+data  <- data[, seq_len(nsamp) + 1, drop=FALSE]
+
+# Plot the per-column distributions of logged values as overlaid density
+# curves in a single PNG file.
+#
+# values:  data frame or matrix with one column per curve
+# outfile: path of the PNG file to create
+# xlab:    label for the X axis
+# opt:     option list supplying height, width, res and (optionally) ylim
+plot_log_distributions <- function(values, outfile, xlab, opt)
+{
+  ncurves <- ncol(values)
+  colors  <- rainbow(ncurves)  # computed once and shared by all curves
+  png(outfile, height=opt$height, width=opt$width, res=opt$res)
+  # Close the graphics device even if a hist() or plot() call fails
+  on.exit(dev.off(), add=TRUE)
+
+  h <- hist(log(values[,1]), plot=FALSE)
+  plot(h$mids, h$density, type="l", col=colors[1], main="",
+       xlab=xlab, ylab="Proportion of molecules", ylim=opt$ylim)
+  # seq_len(n)[-1] is empty when n is 1, whereas 2:n would count down to 1
+  for(i in seq_len(ncurves)[-1])
+  {
+    h <- hist(log(values[,i]), plot=FALSE)
+    lines(h$mids, h$density, col=colors[i])
+  }
+  invisible(NULL)
+}
+
+# Plot distribution of expression values before normalization
+plot_log_distributions(data, sprintf("%s-median.png", opt$out),
+                       "Log expression value", opt)
+
+# Normalize by median: divide every column by its size factor. sweep() keeps
+# the rows-by-samples shape even with one sample, where t(apply(...)) would
+# collapse the result to a single row.
+expr.values  <- data.matrix(data)
+size.factors <- MedianNorm(expr.values)
+data.norm    <- sweep(expr.values, 2, size.factors, FUN="/")
+
+# Plot distribution of normalized expression values
+plot_log_distributions(data.norm, sprintf("%s-median-norm.png", opt$out),
+                       "Log normalized expression value", opt)