Merge pull request #1159 from github/780-update

780 update
2026-02-11 18:59:35 +00:00 · 2014-05-03 18:31:48 -05:00
parent 3023516796 44a0d19ac0
commit 8b878784a4
3 changed files with 192 additions and 37 deletions
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -1584,6 +1584,8 @@ R:
  aliases:
  - R
  primary_extension: .r
  aliases:
  - Rscript
  extensions:
  - .R
  - .rsx
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
@@ -649,6 +649,9 @@
    "Perl": [
      "ack"
    ],
    "R": [
      "expr-dist"
    ],
    "Ruby": [
      "Appraisals",
      "Capfile",
@@ -688,8 +691,8 @@
      ".gemrc"
    ]
  },
-  "tokens_total": 591725,
+  "tokens_total": 592149,
-  "languages_total": 719,
+  "languages_total": 720,
  "tokens": {
    "ABAP": {
      "*/**": 1,
@@ -52298,39 +52301,39 @@
    },
    "R": {
      "df.residual.mira": 1,
-      "<": 24,
+      "<": 46,
-      "-": 28,
+      "-": 51,
-      "function": 14,
+      "function": 18,
-      "(": 163,
+      "(": 219,
      "object": 12,
      "...": 4,
-      ")": 162,
+      ")": 220,
-      "{": 35,
+      "{": 46,
      "fit": 2,
      "analyses": 1,
-      "[": 13,
+      "[": 23,
-      "]": 13,
+      "]": 24,
      "return": 8,
      "df.residual": 2,
-      "}": 35,
+      "}": 46,
      "df.residual.lme": 1,
      "fixDF": 1,
      "df.residual.mer": 1,
      "sum": 1,
      "object@dims": 1,
      "*": 2,
-      "c": 9,
+      "c": 11,
-      "+": 3,
+      "+": 4,
      "df.residual.default": 1,
-      "q": 2,
+      "q": 3,
      "df": 3,
-      "if": 13,
+      "if": 19,
-      "is.null": 2,
+      "is.null": 8,
      "mk": 2,
      "try": 3,
      "coef": 1,
      "silent": 3,
-      "TRUE": 12,
+      "TRUE": 14,
      "mn": 2,
      "f": 9,
      "fitted": 1,
@@ -52345,15 +52348,75 @@
      "length": 3,
      "k": 3,
      "max": 1,
-      "SHEBANG#!Rscript": 1,
+      "SHEBANG#!Rscript": 2,
      "#": 45,
      "MedianNorm": 2,
      "data": 11,
      "geomeans": 3,
      "<->": 1,
      "exp": 1,
      "rowMeans": 1,
      "log": 5,
      "apply": 2,
      "2": 1,
      "cnts": 2,
      "median": 1,
      "library": 1,
      "print_usage": 2,
      "file": 4,
      "stderr": 1,
      "cat": 1,
      "spec": 2,
      "matrix": 3,
      "byrow": 3,
      "ncol": 3,
      "opt": 23,
      "getopt": 1,
      "help": 1,
      "stdout": 1,
      "status": 1,
      "height": 7,
      "out": 4,
      "res": 6,
      "width": 7,
      "ylim": 7,
      "read.table": 1,
      "header": 1,
      "sep": 4,
      "quote": 1,
      "nsamp": 8,
      "dim": 1,
      "outfile": 4,
      "sprintf": 2,
      "png": 2,
      "h": 12,
      "hist": 4,
      "plot": 7,
      "FALSE": 9,
      "mids": 4,
      "density": 4,
      "type": 3,
      "col": 4,
      "rainbow": 4,
      "main": 2,
      "xlab": 2,
      "ylab": 2,
      "for": 3,
      "i": 6,
      "in": 8,
      "lines": 6,
      "devnum": 2,
      "dev.off": 2,
      "size.factors": 2,
      "data.matrix": 1,
      "data.norm": 3,
      "t": 1,
      "x": 3,
      "/": 1,
      "ParseDates": 2,
      "lines": 4,
      "dates": 3,
      "matrix": 2,
      "unlist": 2,
      "strsplit": 3,
      "ncol": 2,
      "byrow": 2,
      "days": 2,
      "times": 2,
      "hours": 2,
@@ -52374,7 +52437,6 @@
      "ggplot": 1,
      "aes": 2,
      "y": 1,
      "x": 1,
      "geom_point": 1,
      "size": 1,
      "Freq": 1,
@@ -52382,12 +52444,8 @@
      "range": 1,
      "ggsave": 1,
      "filename": 1,
      "plot": 1,
      "width": 1,
      "height": 1,
      "hello": 2,
      "print": 1,
      "#": 42,
      "module": 25,
      "code": 19,
      "available": 1,
@@ -52409,7 +52467,6 @@
      "even": 1,
      "attach": 11,
      "is": 7,
      "FALSE": 5,
      "optionally": 1,
      "attached": 2,
      "to": 8,
@@ -52419,7 +52476,6 @@
      "defaults": 1,
      ".": 5,
      "However": 1,
      "in": 6,
      "interactive": 2,
      "invoked": 1,
      "directly": 1,
@@ -52467,7 +52523,6 @@
      "first.": 1,
      "That": 1,
      "local": 3,
      "file": 1,
      "./a.r": 1,
      "will": 2,
      "loaded.": 1,
@@ -52537,7 +52592,6 @@
      "parent": 9,
      ".BaseNamespaceEnv": 1,
      "paste": 3,
      "sep": 3,
      "source": 2,
      "chdir": 1,
      "envir": 5,
@@ -52590,7 +52644,6 @@
      "Reloading": 1,
      "primarily": 1,
      "useful": 1,
      "for": 1,
      "testing": 1,
      "during": 1,
      "module_ref": 3,
@@ -52612,8 +52665,7 @@
      "pts": 1,
      "spsample": 1,
      "polyg": 1,
-      "numpoints": 1,
+      "numpoints": 1
      "type": 1
    },
    "Racket": {
      ";": 3,
@@ -63056,7 +63108,7 @@
    "Protocol Buffer": 63,
    "PureScript": 1652,
    "Python": 5715,
-    "R": 1243,
+    "R": 1667,
    "Racket": 331,
    "Ragel in Ruby Host": 593,
    "RDoc": 279,
@@ -63236,7 +63288,7 @@
    "Protocol Buffer": 1,
    "PureScript": 4,
    "Python": 7,
-    "R": 5,
+    "R": 6,
    "Racket": 2,
    "Ragel in Ruby Host": 3,
    "RDoc": 1,
@@ -63287,5 +63339,5 @@
    "YAML": 2,
    "Zephir": 2
  },
-  "md5": "58816c8da227d1157f624a68c2f3ab55"
+  "md5": "fa38e2b617caaf230146a7adab264419"
 }
--- a/samples/R/filenames/expr-dist
+++ b/samples/R/filenames/expr-dist
@@ -0,0 +1,101 @@
 #!/usr/bin/env Rscript
 # Copyright (c) 2013 Daniel S. Standage, released under MIT license
 #
 # expr-dist: plot distributions of expression values before and after
 #            normalization; visually confirm that normalization worked
 #            as expected
 #
 # Program input is a matrix of expression values, each row corresponding to a
 # molecule (gene, transcript, etc.) and each column to that molecule's
 # expression level or abundance in one sample. The program expects the rows and columns to be
 # named, and was tested primarily on output produced by the
 # 'rsem-generate-data-matrix' script distributed with the RSEM package.
 #
 # The program plots the distributions of the logged expression values by sample
 # as provided, then normalizes the values, and finally plots the distribution of
 # the logged normalized expression values by sample. The expectation is that all
 # samples' distributions will have a similar shape but different medians prior
 # to normalization, and that post normalization they will all have an identical
 # median to facilitate cross-sample comparison.
 # MedianNorm function borrowed from the EBSeq library version 1.1.6
 # See http://www.bioconductor.org/packages/devel/bioc/html/EBSeq.html
 #
 # Returns one value per column of 'data': the median, over rows, of the
 # column's entries divided by the row's geometric mean. Rows whose geometric
 # mean is not greater than zero (e.g. rows containing a zero) are left out of
 # the median.
 MedianNorm <- function(data) {
   # Geometric mean of each row, computed through the mean of the logs
   row_geomeans <- exp(rowMeans(log(data)))
   usable <- row_geomeans > 0
   apply(data, 2, function(column) {
     ratios <- column / row_geomeans
     median(ratios[usable])
   })
 }
 library("getopt")
 # Print the command-line usage message.
 #
 # file: connection the message is written to; defaults to standard error.
 #
 # The message is passed to cat() together with 'file', so it is written to the
 # requested connection rather than unconditionally to standard output.
 print_usage <- function(file=stderr())
 {
  cat("
 expr-dist: see source code for full description
 Usage: expr-dist [options] < expr-matrix.txt
  Options:
    -h|--help:          print this help message and exit
    -o|--out: STRING    prefix for output files; default is 'expr-dist'
    -r|--res: INT       resolution (dpi) of generated graphics; default is 150
    -t|--height: INT    height (pixels) of generated graphics; default is 1200
    -w|--width: INT     width (pixels) of generated graphics; default is 1200
    -y|--ylim: REAL     the visible range of the Y axis depends on the first
                        distribution plotted; if other distributions are getting
                        cut off, use this setting to override the default\n\n", file=file)
 }
 # Option table handed to getopt(), one option per row: long name, short flag,
 # argument flag (per the getopt documentation: 0 = takes no argument,
 # 1 = argument required) and the type the value is converted to.
 spec <- matrix(
   c(
     "help",   "h", 0, "logical",
     "out",    "o", 1, "character",
     "res",    "r", 1, "integer",
     "height", "t", 1, "integer",
     "width",  "w", 1, "integer",
     "ylim",   "y", 1, "double"
   ),
   ncol = 4,
   byrow = TRUE
 )
 opt <- getopt(spec)

 # --help: show the usage text on standard output and quit with status 1
 if (!is.null(opt$help)) {
   print_usage(file = stdout())
   q(status = 1)
 }

 # Fall back to the documented defaults for options that were not supplied
 if (is.null(opt$height)) opt$height <- 1200
 if (is.null(opt$out))    opt$out    <- "expr-dist"
 if (is.null(opt$res))    opt$res    <- 150
 if (is.null(opt$width))  opt$width  <- 1200

 # A user-supplied ylim is the upper bound; the Y axis then runs from 0 to it.
 # When it is absent opt$ylim stays NULL.
 if (!is.null(opt$ylim)) opt$ylim <- c(0, opt$ylim)
 # Load data, determine number of samples. The first column is dropped
 # (presumably the molecule identifiers -- confirm against the input format);
 # every remaining column is treated as one sample.
 data  <- read.table(file("stdin"), header=TRUE, sep="\t", quote="")
 nsamp <- ncol(data) - 1
 if (nsamp < 1) {
   stop("input must contain at least one sample column after the first column",
        call.=FALSE)
 }
 # drop=FALSE keeps 'data' two-dimensional even when there is a single sample
 data <- data[, -1, drop=FALSE]

 # Write a PNG to 'outfile' with one line per column of 'values', showing the
 # histogram density of the logged values of that column. 'xlab' labels the X
 # axis. Graphics size, resolution and Y range are read from the global 'opt'.
 plot_log_distributions <- function(values, outfile, xlab) {
   n <- ncol(values)
   colors <- rainbow(n)
   png(outfile, height=opt$height, width=opt$width, res=opt$res)
   # Close the device even if plotting stops with an error
   on.exit(dev.off(), add=TRUE)

   # The first sample sets up the axes; the remaining ones are overlaid
   h <- hist(log(values[, 1]), plot=FALSE)
   plot(h$mids, h$density, type="l", col=colors[1], main="",
        xlab=xlab, ylab="Proportion of molecules", ylim=opt$ylim)
   # seq_len(n)[-1] is empty for a single sample, whereas 2:n would count down
   for (i in seq_len(n)[-1]) {
     h <- hist(log(values[, i]), plot=FALSE)
     lines(h$mids, h$density, col=colors[i])
   }
   invisible(outfile)
 }

 # Plot distribution of expression values before normalization
 plot_log_distributions(data, sprintf("%s-median.png", opt$out),
                        "Log expression value")

 # Normalize by median: divide every column by its size factor
 expr.matrix  <- data.matrix(data)
 size.factors <- MedianNorm(expr.matrix)
 data.norm    <- sweep(expr.matrix, 2, size.factors, "/")

 # Plot distribution of normalized expression values
 plot_log_distributions(data.norm, sprintf("%s-median-norm.png", opt$out),
                        "Log normalized expression value")