Merge branch 'master' into 815

Conflicts: lib/linguist/samples.json
2026-06-19 10:49:30 +00:00 · 2014-04-21 11:37:49 -05:00
parent 89795ebd1f 0f5e2a1ea4
commit 26fbc45baf
142 changed files with 29547 additions and 1870 deletions
--- a/lib/linguist.rb
+++ b/lib/linguist.rb
@@ -1,5 +1,6 @@
 require 'linguist/blob_helper'
 require 'linguist/generated'
+require 'linguist/heuristics'
 require 'linguist/language'
 require 'linguist/repository'
 require 'linguist/samples'
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -190,9 +190,9 @@ module Linguist
    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
-    # can be too slow for very large blobs or for certain 
+    # can be too slow for very large blobs or for certain
    # corner-case blobs.
-    # 
+    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -78,18 +78,13 @@ module Linguist
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
-
      scores = {}
-      if verbosity >= 2
-        dump_all_tokens(tokens, languages)
-      end
+
+      debug_dump_all_tokens(tokens, languages) if verbosity >= 2
+
      languages.each do |language|
-        scores[language] = tokens_probability(tokens, language) +
-                                   language_probability(language)
-        if verbosity >= 1
-          printf "%10s = %10.3f + %7.3f = %10.3f\n",
-            language, tokens_probability(tokens, language), language_probability(language), scores[language]
-        end
+        scores[language] = tokens_probability(tokens, language) + language_probability(language)
+        debug_dump_probabilities(tokens, language, scores[language]) if verbosity >= 1
      end

      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
@@ -135,6 +130,11 @@ module Linguist
        @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
      end

+      def debug_dump_probabilities(tokens, language, score)
+        row = [language, tokens_probability(tokens, language), language_probability(language), score]
+        printf("%10s = %10.3f + %7.3f = %10.3f\n", *row) # one debug line per language
+      end
+
      # Internal: show a table of probabilities for each <token,language> pair.
      #
      # The number in each table entry is the number of "points" that each
@@ -145,22 +145,22 @@ module Linguist
      # how much more likely (log of probability ratio) that token is to
      # appear in one language vs. the least-likely language.  Dashes
      # indicate the least-likely language (and zero points) for each token.
-      def dump_all_tokens(tokens, languages)
+      def debug_dump_all_tokens(tokens, languages)
        maxlen = tokens.map { |tok| tok.size }.max
-        
+
        printf "%#{maxlen}s", ""
        puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join
-        
+
        token_map = Hash.new(0)
        tokens.each { |tok| token_map[tok] += 1 }
-        
+
        token_map.sort.each { |tok, count|
          arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
          min = arr.map { |a,b| b }.min
          minlog = Math.log(min)
          if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
            printf "%#{maxlen}s%5d", tok, count
-            
+
            puts arr.map { |ent|
              ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
            }.join
--- a/lib/linguist/generated.rb
+++ b/lib/linguist/generated.rb
@@ -58,9 +58,12 @@ module Linguist
        generated_parser? ||
        generated_net_docfile? ||
        generated_net_designer_file? ||
+        generated_postscript? ||
        generated_protocol_buffer? ||
        generated_jni_header? ||
-        node_modules?
+        composer_lock? ||
+        node_modules? ||
+        vcr_cassette?
    end

    # Internal: Is the blob an XCode project file?
@@ -175,6 +178,29 @@ module Linguist
      false
    end

+    # Internal: Is the blob of PostScript generated?
+    #
+    # PostScript files are often generated by other programs. If they tell us so,
+    # we can detect them.
+    #
+    # Returns true or false.
+    def generated_postscript?
+      return false unless ['.ps', '.eps'].include? extname
+
+      # We analyze the "%%Creator:" comment, which contains the author/generator
+      # of the file. If there is one, it should be in one of the first few lines.
+      creator = lines[0..9].find {|line| line =~ /^%%Creator: /}
+      return false if creator.nil?
+
+      # Most generators write their version number, while human authors' or
+      # companies' names don't contain digits. Also accept a few generators that
+      # omit a version. `!!` turns the `=~` index/nil into a strict true/false.
+      !!(creator =~ /[0-9]/ ||
+        creator.include?("mpage") ||
+        creator.include?("draw") ||
+        creator.include?("ImageMagick"))
+    end
+
    # Internal: Is the blob a C++, Java or Python source file generated by the
    # Protocol Buffer compiler?
    #
@@ -197,12 +223,28 @@ module Linguist
               lines[1].include?("#include <jni.h>")
    end

-    # node_modules/ can contain large amounts of files, in general not meant
-    # for humans in pull requests.
+    # Internal: Is the blob part of node_modules/, which is not meant for humans in pull requests?
    #
    # Returns true or false.
    def node_modules?
      !!name.match(/node_modules\//)
    end
+
+    # Internal: Is the blob a generated PHP Composer lock file?
+    #
+    # Returns true or false.
+    def composer_lock?
+      !!name.match(/composer\.lock/) # dot escaped so only a literal "." matches
+    end
+
+    # Internal: Is the blob a VCR Cassette file?
+    #
+    # Returns true or false
+    def vcr_cassette?
+      # Matches .yml blobs with more than two lines whose second-to-last
+      # line contains the "recorded_with: VCR" marker.
+      extname == '.yml' &&
+        lines.count > 2 && lines[-2].include?("recorded_with: VCR")
+    end
  end
 end
--- a/lib/linguist/heuristics.rb
+++ b/lib/linguist/heuristics.rb
@@ -0,0 +1,80 @@
+module Linguist
+  # A collection of simple heuristics that can be used to better analyze languages.
+  class Heuristics
+    ACTIVE = false
+
+    # Public: Given an array of String language names,
+    # apply heuristics against the given data and return an array
+    # of matching languages, or [] when heuristics are inactive or none apply.
+    #
+    # data      - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings to restrict to.
+    #
+    # Returns an array of Languages or []
+    def self.find_by_heuristics(data, languages)
+      return [] unless active?
+
+      # Return the matching disambiguator's result (first match wins); the
+      # `if`/`elsif` chain makes that result the value of the method.
+      if languages.all? { |l| ["Objective-C", "C++"].include?(l) }
+        disambiguate_c(data, languages)
+      elsif languages.all? { |l| ["Perl", "Prolog"].include?(l) }
+        disambiguate_pl(data, languages)
+      elsif languages.all? { |l| ["ECL", "Prolog"].include?(l) }
+        disambiguate_ecl(data, languages)
+      elsif languages.all? { |l| ["TypeScript", "XML"].include?(l) }
+        disambiguate_ts(data, languages)
+      elsif languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
+        disambiguate_cl(data, languages)
+      else
+        []
+      end
+    end
+
+    # .h extensions are ambiguous between C, C++, and Objective-C.
+    # We want to shortcut look for Objective-C _and_ now C++ too!
+    #
+    # Returns an array of Languages or []
+    def self.disambiguate_c(data, languages)
+      matches = []
+      matches << Language["Objective-C"] if data.include?("@interface")
+      matches << Language["C++"] if data.include?("#include <cstdint>")
+      matches
+    end
+
+    def self.disambiguate_pl(data, languages)
+      matches = []
+      matches << Language["Prolog"] if data.include?(":-")
+      matches << Language["Perl"] if data.include?("use strict")
+      matches
+    end
+
+    def self.disambiguate_ecl(data, languages)
+      matches = []
+      matches << Language["Prolog"] if data.include?(":-")
+      matches << Language["ECL"] if data.include?(":=")
+      matches
+    end
+
+    def self.disambiguate_ts(data, languages)
+      matches = []
+      if (data.include?("</translation>"))
+        matches << Language["XML"]
+      else
+        matches << Language["TypeScript"]
+      end
+      matches
+    end
+
+    def self.disambiguate_cl(data, languages)
+      matches = []
+      matches << Language["Common Lisp"] if data.include?("(defun ")
+      matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
+      matches
+    end
+
+    def self.active?
+      !!ACTIVE
+    end
+  end
+end
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -1,8 +1,13 @@
 require 'escape_utils'
 require 'pygments'
 require 'yaml'
+begin
+  require 'json'
+rescue LoadError
+end

 require 'linguist/classifier'
+require 'linguist/heuristics'
 require 'linguist/samples'

 module Linguist
@@ -17,17 +22,27 @@ module Linguist
    @alias_index     = {}

    @extension_index          = Hash.new { |h,k| h[k] = [] }
+    @interpreter_index        = Hash.new { |h,k| h[k] = [] }
    @filename_index           = Hash.new { |h,k| h[k] = [] }
    @primary_extension_index  = {}

    # Valid Languages types
-    TYPES = [:data, :markup, :programming]
+    TYPES = [:data, :markup, :programming, :prose]

    # Names of non-programming languages that we will still detect
    #
    # Returns an array
    def self.detectable_markup
-      ["CSS", "Less", "Sass", "TeX"]
+      ["CSS", "Less", "Sass", "SCSS", "Stylus", "TeX"]
+    end
+
+    # Detect languages by a specific type
+    #
+    # type - A symbol that exists within TYPES
+    #
+    # Returns an array
+    def self.by_type(type)
+      all.select { |h| h.type == type }
    end

    # Internal: Create a new Language object
@@ -71,6 +86,10 @@ module Linguist

      @primary_extension_index[language.primary_extension] = language

+      language.interpreters.each do |interpreter|
+        @interpreter_index[interpreter] << language
+      end
+
      language.filenames.each do |filename|
        @filename_index[filename] << language
      end
@@ -95,16 +114,32 @@ module Linguist
        name += ".script!"
      end

+      # First try to find languages that match based on filename.
      possible_languages = find_by_filename(name)

+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to continue
+      # our detection work
      if possible_languages.length > 1
        data = data.call() if data.respond_to?(:call)
+        possible_language_names = possible_languages.map(&:name)
+
+        # Don't bother with emptiness
        if data.nil? || data == ""
          nil
-        elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
-          Language[result[0]]
+        # Check if there's a shebang line and use that as authoritative
+        elsif (result = find_by_shebang(data)) && !result.empty?
+          result.first
+        # No shebang. Still more work to do. Try to find it with our heuristics.
+        elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
+          determined.first
+        # Lastly, fall back to the probabilistic classifier.
+        elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first
+          # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
+          Language[classified[0]]
        end
      else
+        # Simplest and most common case, we can just return the one match based on extension
        possible_languages.first
      end
    end
@@ -162,6 +197,20 @@ module Linguist
      langs.compact.uniq
    end

+    # Public: Look up Languages by shebang line.
+    #
+    # data - String contents of the file to analyze.
+    #
+    # Examples
+    #
+    #   Language.find_by_shebang("#!/bin/bash\ndate;")
+    #   # => [#<Language name="Bash">]
+    #
+    # Returns an Array of matching Languages, or [] (the index is not modified).
+    def self.find_by_shebang(data)
+      @interpreter_index.fetch(Linguist.interpreter_from_shebang(data), [])
+    end
+
    # Public: Look up Language by its name or lexer.
    #
    # name - The String name of the Language
@@ -247,6 +296,7 @@ module Linguist

      # Set extensions or default to [].
      @extensions = attributes[:extensions] || []
+      @interpreters = attributes[:interpreters]   || []
      @filenames  = attributes[:filenames]  || []

      unless @primary_extension = attributes[:primary_extension]
@@ -359,6 +409,15 @@ module Linguist
    # Returns the extension String.
    attr_reader :primary_extension

+    # Public: Get interpreters
+    #
+    # Examples
+    #
+    #   # => ['awk', 'gawk', 'mawk' ...]
+    #
+    # Returns the interpreters Array
+    attr_reader :interpreters
+
    # Public: Get filenames
    #
    # Examples
@@ -426,7 +485,7 @@ module Linguist
    #
    # Returns html String
    def colorize(text, options = {})
-      lexer.highlight(text, options = {})
+      lexer.highlight(text, options)
    end

    # Public: Return name as String representation
@@ -452,11 +511,22 @@ module Linguist
  end

  extensions = Samples::DATA['extnames']
+  interpreters = Samples::DATA['interpreters']
  filenames = Samples::DATA['filenames']
  popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))

-  YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
+  languages_yml = File.expand_path("../languages.yml", __FILE__)
+  languages_json = File.expand_path("../languages.json", __FILE__)
+
+  if File.exist?(languages_json) && defined?(JSON)
+    languages = JSON.load(File.read(languages_json))
+  else
+    languages = YAML.load_file(languages_yml)
+  end
+
+  languages.each do |name, options|
    options['extensions'] ||= []
+    options['interpreters'] ||= []
    options['filenames'] ||= []

    if extnames = extensions[name]
@@ -467,6 +537,18 @@ module Linguist
      end
    end

+    if interpreters == nil
+      interpreters = {}
+    end
+
+    if interpreter_names = interpreters[name]
+      interpreter_names.each do |interpreter|
+        if !options['interpreters'].include?(interpreter)
+          options['interpreters'] << interpreter
+        end
+      end
+    end
+
    if fns = filenames[name]
      fns.each do |filename|
        if !options['filenames'].include?(filename)
@@ -487,6 +569,7 @@ module Linguist
      :searchable        => options.key?('searchable') ? options['searchable'] : true,
      :search_term       => options['search_term'],
      :extensions        => options['extensions'].sort,
+      :interpreters      => options['interpreters'].sort,
      :primary_extension => options['primary_extension'],
      :filenames         => options['filenames'],
      :popular           => popular.include?(name)
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -10,6 +10,7 @@
 # ace_mode          - A String name of Ace Mode (if available)
 # wrap              - Boolean wrap to enable line wrapping (default: false)
 # extension         - An Array of associated extensions
+# interpreters      - An Array of associated interpreters
 # primary_extension - A String for the main extension associated with
 #                     the language. Must be unique. Used when a Language is picked
 #                     from a dropdown and we need to automatically choose an
@@ -22,7 +23,7 @@
 # Any additions or modifications (even trivial) should have corresponding
 # test change in `test/test_blob.rb`.
 #
-# Please keep this list alphabetized.
+# Please keep this list alphabetized. Capitalization comes before lower case.

 ABAP:
  type: programming
@@ -52,6 +53,18 @@ ASP:
  - .aspx
  - .axd

+ATS:
+  type: programming
+  color: "#1ac620"
+  primary_extension: .dats
+  lexer: OCaml
+  aliases:
+  - ats2
+  extensions:
+  - .atxt
+  - .hats
+  - .sats
+
 ActionScript:
  type: programming
  lexer: ActionScript 3
@@ -70,6 +83,7 @@ Ada:

 Agda:
  type: programming
+  color: "#467C91"
  primary_extension: .agda

 ApacheConf:
@@ -88,6 +102,10 @@ AppleScript:
  aliases:
  - osascript
  primary_extension: .applescript
+  extensions:
+  - .scpt
+  interpreters:
+  - osascript

 Arc:
  type: programming
@@ -101,6 +119,22 @@ Arduino:
  lexer: C++
  primary_extension: .ino

+AsciiDoc:
+  type: prose
+  lexer: Text only
+  ace_mode: asciidoc
+  wrap: true
+  primary_extension: .asciidoc
+  extensions:
+  - .adoc
+  - .asc
+
+AspectJ:
+  type: programming
+  lexer: AspectJ
+  color: "#1957b0"
+  primary_extension: .aj
+
 Assembly:
  type: programming
  lexer: NASM
@@ -140,6 +174,11 @@ Awk:
  - .gawk
  - .mawk
  - .nawk
+  interpreters:
+  - awk
+  - gawk
+  - mawk
+  - nawk

 Batchfile:
  type: programming
@@ -181,6 +220,11 @@ Brainfuck:
  extensions:
  - .bf

+Brightscript:
+  type: programming
+  lexer: Text only
+  primary_extension: .brs
+
 Bro:
  type: programming
  primary_extension: .bro
@@ -190,6 +234,7 @@ C:
  color: "#555"
  primary_extension: .c
  extensions:
+  - .cats
  - .w

 C#:
@@ -201,6 +246,7 @@ C#:
  - csharp
  primary_extension: .cs
  extensions:
+  - .cshtml
  - .csx

 C++:
@@ -214,6 +260,7 @@ C++:
  extensions:
  - .C
  - .c++
+  - .cc
  - .cxx
  - .H
  - .h++
@@ -259,7 +306,7 @@ COBOL:

 CSS:
  ace_mode: css
-  color: "#1f085e"
+  color: "#563d7c"
  primary_extension: .css

 Ceylon:
@@ -271,6 +318,16 @@ ChucK:
  lexer: Java
  primary_extension: .ck

+Cirru:
+  type: programming
+  color: "#aaaaff"
+  primary_extension: .cirru
+  # ace_mode: cirru
+  # lexer: Cirru
+  lexer: Text only
+  extensions:
+  - .cr
+
 Clean:
  type: programming
  color: "#3a81ad"
@@ -291,6 +348,7 @@ Clojure:
  - .cljscm
  - .cljx
  - .hic
+  - .cljs.hl
  filenames:
  - riemann.config

@@ -308,6 +366,8 @@ CoffeeScript:
  - .iced
  filenames:
  - Cakefile
+  interpreters:
+  - coffee

 ColdFusion:
  type: programming
@@ -333,6 +393,12 @@ Common Lisp:
  - .lsp
  - .ny
  - .podsl
+  interpreters:
+  - lisp
+  - sbcl
+  - ccl
+  - clisp
+  - ecl

 Coq:
  type: programming
@@ -346,6 +412,18 @@ Cpp-ObjDump:
  - .c++objdump
  - .cxx-objdump

+Creole:
+  type: prose
+  lexer: Text only
+  wrap: true
+  primary_extension: .creole
+
+Crystal:
+  type: programming
+  lexer: Ruby
+  primary_extension: .cr
+  ace_mode: ruby
+
 Cucumber:
  lexer: Gherkin
  primary_extension: .feature
@@ -379,7 +457,7 @@ D-ObjDump:
 DM:
  type: programming
  color: "#075ff1"
-  lexer: Text only
+  lexer: C++
  primary_extension: .dm
  aliases:
  - byond
@@ -416,10 +494,19 @@ DCPU-16 ASM:
 Diff:
  primary_extension: .diff

+Dogescript:
+  type: programming
+  lexer: Text only
+  color: "#cca760"
+  primary_extension: .djs
+
 Dylan:
  type: programming
  color: "#3ebc27"
  primary_extension: .dylan
+  extensions:
+  - .intr
+  - .lid

 Ecere Projects:
  type: data
@@ -485,6 +572,14 @@ F#:
  - .fsi
  - .fsx

+FLUX:
+  type: programming
+  color: "#33CCFF"
+  primary_extension: .fx
+  lexer: Text only
+  extensions:
+  - .flux
+
 FORTRAN:
  type: programming
  lexer: Fortran
@@ -537,6 +632,17 @@ Forth:
  extensions:
  - .4th

+Frege:
+  type: programming
+  color: "#00cafe"
+  lexer: Haskell
+  primary_extension: .fr
+
+Game Maker Language:
+  type: programming
+  lexer: JavaScript
+  primary_extension: .gml
+
 GAS:
  type: programming
  group: Assembly
@@ -584,6 +690,17 @@ Glyph:
  lexer: Tcl
  primary_extension: .glf

+Gnuplot:
+  type: programming
+  color: "#f0a9f0"
+  lexer: Gnuplot
+  primary_extension: .gp
+  extensions:
+  - .gnu
+  - .gnuplot
+  - .plot
+  - .plt
+
 Go:
  type: programming
  color: "#a89b4d"
@@ -610,6 +727,8 @@ Groovy:
  ace_mode: groovy
  color: "#e69f56"
  primary_extension: .groovy
+  interpreters:
+  - groovy

 Groovy Server Pages:
  group: Groovy
@@ -627,6 +746,7 @@ HTML:
  extensions:
  - .htm
  - .xhtml
+  - .html.hl

 HTML+Django:
  type: markup
@@ -675,6 +795,12 @@ Handlebars:
  - .html.handlebars
  - .html.hbs

+Harbour:
+  type: programming
+  lexer: Text only
+  color: "#0e60e3"
+  primary_extension: .hb
+
 Haskell:
  type: programming
  color: "#29b544"
@@ -690,6 +816,19 @@ Haxe:
  extensions:
  - .hxsl

+Hy:
+  type: programming
+  lexer: Clojure
+  ace_mode: clojure
+  color: "#7891b1"
+  primary_extension: .hy
+
+IDL:
+  type: programming
+  lexer: Text only
+  color: "#e3592c"
+  primary_extension: .pro
+
 INI:
  type: data
  extensions:
@@ -748,8 +887,21 @@ JSON:
  - .sublime-settings
  - .sublime-workspace
  filenames:
+  - .jshintrc
  - composer.lock

+JSON5:
+  type: data
+  lexer: JavaScript
+  primary_extension: .json5
+
+JSONLD:
+  type: data
+  group: JavaScript
+  ace_mode: json
+  lexer: JavaScript
+  primary_extension: .jsonld
+
 Jade:
  group: HTML
  type: markup
@@ -772,7 +924,7 @@ Java Server Pages:
 JavaScript:
  type: programming
  ace_mode: javascript
-  color: "#f15501"
+  color: "#f7df1e"
  aliases:
  - js
  - node
@@ -780,16 +932,20 @@ JavaScript:
  extensions:
  - ._js
  - .bones
+  - .es6
  - .jake
  - .jsfl
  - .jsm
  - .jss
  - .jsx
+  - .njs
  - .pac
  - .sjs
  - .ssjs
  filenames:
  - Jakefile
+  interpreters:
+  - node

 Julia:
  type: programming
@@ -878,10 +1034,6 @@ LiveScript:
 Logos:
  type: programming
  primary_extension: .xm
-  extensions:
-  - .x
-  - .xi
-  - .xmi

 Logtalk:
  type: programming
@@ -897,6 +1049,8 @@ Lua:
  extensions:
  - .nse
  - .rbxs
+  interpreters:
+  - lua

 M:
  type: programming
@@ -918,6 +1072,8 @@ Makefile:
  - makefile
  - Makefile
  - GNUmakefile
+  interpreters:
+  - make

 Mako:
  primary_extension: .mako
@@ -925,7 +1081,7 @@ Mako:
  - .mao

 Markdown:
-  type: markup
+  type: prose
  lexer: Text only
  ace_mode: markdown
  wrap: true
@@ -936,6 +1092,18 @@ Markdown:
  - .mkdown
  - .ron

+Mask:
+  type: markup
+  lexer: SCSS
+  color: "#f97732"
+  ace_mode: scss
+  primary_extension: .mask
+
+Mathematica:
+  type: programming
+  primary_extension: .mathematica
+  lexer: Text only
+
 Matlab:
  type: programming
  color: "#bb92ac"
@@ -956,6 +1124,12 @@ Max:
  - .mxt
  - .pat

+MediaWiki:
+  type: prose
+  lexer: Text only
+  wrap: true
+  primary_extension: .mediawiki
+
 MiniD: # Legacy
  searchable: false
  primary_extension: .minid # Dummy extension
@@ -1037,6 +1211,7 @@ OCaml:
  primary_extension: .ml
  extensions:
  - .eliomi
+  - .ml4
  - .mli
  - .mll
  - .mly
@@ -1091,12 +1266,24 @@ OpenEdge ABL:
  - abl
  primary_extension: .p

+Org:
+  type: prose
+  lexer: Text only
+  wrap: true
+  primary_extension: .org
+
 Oxygene:
  type: programming
  lexer: Text only
  color: "#5a63a3"
  primary_extension: .oxygene

+PAWN:
+  type: programming
+  lexer: C++
+  color: "#dbb284"
+  primary_extension: .pwn
+
 PHP:
  type: programming
  ace_mode: php
@@ -1150,13 +1337,27 @@ Perl:
  primary_extension: .pl
  extensions:
  - .PL
-  - .nqp
  - .perl
  - .ph
  - .plx
-  - .pm6
+  - .pm
  - .pod
  - .psgi
+  interpreters:
+  - perl
+
+Perl6:
+  type: programming
+  color: "#0298c3"
+  primary_extension: .p6
+  extensions:
+  - .6pl
+  - .6pm
+  - .nqp
+  - .p6l
+  - .p6m
+  - .pl6
+  - .pm6

 Pike:
  type: programming
@@ -1166,12 +1367,25 @@ Pike:
  extensions:
  - .pmod

+Pod:
+  type: prose
+  lexer: Text only
+  ace_mode: perl
+  wrap: true
+  primary_extension: .pod
+
 PogoScript:
  type: programming
  color: "#d80074"
  lexer: Text only
  primary_extension: .pogo

+PostScript:
+  type: markup
+  primary_extension: .ps
+  extensions:
+  - .eps
+
 PowerShell:
  type: programming
  ace_mode: powershell
@@ -1193,7 +1407,8 @@ Prolog:
  color: "#74283c"
  primary_extension: .prolog
  extensions:
-  - .pro
+  - .ecl
+  - .pl

 Protocol Buffer:
  type: markup
@@ -1224,12 +1439,17 @@ Python:
  primary_extension: .py
  extensions:
  - .gyp
+  - .lmi
  - .pyt
  - .pyw
  - .wsgi
  - .xpy
  filenames:
  - wscript
+  - SConstruct
+  - SConscript
+  interpreters:
+  - python

 Python traceback:
  type: data
@@ -1247,11 +1467,23 @@ R:
  type: programming
  color: "#198ce7"
  lexer: S
+  aliases:
+  - R
  primary_extension: .r
  extensions:
  - .R
+  - .rsx
  filenames:
  - .Rprofile
+  interpreters:
+  - Rscript
+
+RDoc:
+  type: prose
+  lexer: Text only
+  ace_mode: rdoc
+  wrap: true
+  primary_extension: .rdoc

 REALbasic:
  type: programming
@@ -1269,6 +1501,15 @@ RHTML:
  group: HTML
  primary_extension: .rhtml

+RMarkdown:
+  type: prose
+  lexer: Text only
+  wrap: true
+  ace_mode: markdown
+  primary_extension: .rmd
+  extensions:
+  - .Rmd
+
 Racket:
  type: programming
  lexer: Racket
@@ -1339,10 +1580,13 @@ Ruby:
  - .ru
  - .thor
  - .watchr
+  interpreters:
+  - ruby
  filenames:
  - Appraisals
  - Berksfile
  - Gemfile
+  - Gemfile.lock
  - Guardfile
  - Podfile
  - Thorfile
@@ -1386,6 +1630,8 @@ Scala:
  ace_mode: scala
  color: "#7dd3b0"
  primary_extension: .scala
+  extensions:
+  - .sc

 Scaml:
  group: HTML
@@ -1397,8 +1643,14 @@ Scheme:
  color: "#1e4aec"
  primary_extension: .scm
  extensions:
+  - .sld
  - .sls
  - .ss
+  interpreters:
+  - guile
+  - racket
+  - bigloo
+  - chicken

 Scilab:
  type: programming
@@ -1423,9 +1675,19 @@ Shell:
  extensions:
  - .bats
  - .tmux
+  interpreters:
+  - bash
+  - sh
+  - zsh
  filenames:
  - Dockerfile

+Shen:
+  type: programming
+  color: "#120F14"
+  lexer: Text only
+  primary_extension: .shen
+
 Slash:
  type: programming
  color: "#007eff"
@@ -1450,12 +1712,29 @@ Standard ML:
  aliases:
  - sml
  primary_extension: .sml
+  extensions:
+  - .fun
+
+Stylus:
+  type: markup
+  group: CSS
+  lexer: Text only
+  primary_extension: .styl

 SuperCollider:
  type: programming
  color: "#46390b"
  lexer: Text only
-  primary_extension: .sc
+  primary_extension: .scd
+
+SystemVerilog:
+  type: programming
+  color: "#343761"
+  lexer: systemverilog
+  primary_extension: .sv
+  extensions:
+  - .svh
+  - .vh

 TOML:
  type: data
@@ -1472,6 +1751,7 @@ Tcl:
  primary_extension: .tcl
  extensions:
  - .adp
+  - .tm

 Tcsh:
  type: programming
@@ -1482,13 +1762,16 @@ Tcsh:

 TeX:
  type: markup
+  color: "#3D6117"
  ace_mode: latex
+  wrap: true
  aliases:
  - latex
  primary_extension: .tex
  extensions:
  - .aux
  - .bib
+  - .cls
  - .dtx
  - .ins
  - .ltx
@@ -1503,7 +1786,7 @@ Tea:
  primary_extension: .tea

 Textile:
-  type: markup
+  type: prose
  lexer: Text only
  ace_mode: textile
  wrap: true
@@ -1549,6 +1832,14 @@ VHDL:
  lexer: vhdl
  color: "#543978"
  primary_extension: .vhdl
+  extensions:
+  - .vhd
+  - .vhf
+  - .vhi
+  - .vho
+  - .vhs
+  - .vht
+  - .vhw

 Vala:
  type: programming
@@ -1587,6 +1878,7 @@ Visual Basic:
  - .frm
  - .frx
  - .vba
+  - .vbhtml
  - .vbs

 Volt:
@@ -1622,6 +1914,7 @@ XML:
  - .kml
  - .launch
  - .mxml
+  - .osm
  - .plist
  - .pluginspec
  - .ps1xml
@@ -1738,7 +2031,7 @@ ooc:
  primary_extension: .ooc

 reStructuredText:
-  type: markup
+  type: prose
  wrap: true
  search_term: rst
  aliases:
--- a/lib/linguist/repository.rb
+++ b/lib/linguist/repository.rb
@@ -29,6 +29,7 @@ module Linguist
      @computed_stats = false
      @language = @size = nil
      @sizes = Hash.new { 0 }
+      @file_breakdown = Hash.new { |h,k| h[k] = Array.new }
    end

    # Public: Returns a breakdown of language stats.
@@ -60,6 +61,12 @@ module Linguist
      @size
    end

+    # Public: Return the per-file language breakdown (Hash of language group name => Array of blob names)
+    def breakdown_by_file
+      compute_stats
+      @file_breakdown
+    end
+
    # Internal: Compute language breakdown for each blob in the Repository.
    #
    # Returns nothing
@@ -75,6 +82,10 @@ module Linguist

        # Only include programming languages and acceptable markup languages
        if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name)
+
+          # Build up the per-file breakdown stats
+          @file_breakdown[blob.language.group.name] << blob.name
+
          @sizes[blob.language.group] += blob.size
        end
      end
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -57,6 +57,7 @@ module Linguist
            yield({
              :path     => File.join(dirname, filename),
              :language => category,
+              :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
              :extname  => File.extname(filename)
            })
          end
@@ -72,6 +73,7 @@ module Linguist
    def self.data
      db = {}
      db['extnames'] = {}
+      db['interpreters'] = {}
      db['filenames'] = {}

      each do |sample|
@@ -85,6 +87,14 @@ module Linguist
          end
        end

+        if sample[:interpreter]
+          db['interpreters'][language_name] ||= []
+          if !db['interpreters'][language_name].include?(sample[:interpreter])
+            db['interpreters'][language_name] << sample[:interpreter]
+            db['interpreters'][language_name].sort!
+          end
+        end
+
        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
@@ -100,4 +110,40 @@ module Linguist
      db
    end
  end
+
+  # Used to retrieve the interpreter from the shebang line of a file's
+  # data.
+  #
+  # data - String contents of the file.
+  #
+  # Returns the interpreter name String (version digits stripped), or nil.
+  def self.interpreter_from_shebang(data)
+    lines = data.lines.to_a
+    return nil unless lines.any? && lines[0].start_with?('#!')
+
+    # Tolerate any whitespace between "#!" and the interpreter path
+    # ("#! /bin/sh", "#!\t/bin/sh"); a bare "#!" names no interpreter.
+    tokens = lines[0].sub(/\A#!\s*/, '').split(' ')
+    return nil if tokens.empty?
+
+    # Basename of the command: "/usr/bin/env" -> "env", "python" -> "python"
+    script = tokens.first.split('/').last
+    # "env python" -> the real interpreter is env's first argument
+    script = tokens[1] if script == 'env'
+    return nil if script.nil? || script.empty?
+
+    # "python2.6" -> "python"
+    if script =~ /((?:\d+\.?)+)/
+      script = script.sub($1, '')
+    end
+
+    # Check for multiline shebang hacks that call `exec`
+    if script == 'sh' &&
+      lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
+      script = $1
+    end
+
+    script
+  end
+
 end
--- a/lib/linguist/vendor.yml
+++ b/lib/linguist/vendor.yml
@@ -10,7 +10,7 @@
 ## Vendor Conventions ##

 # Caches
- cache/
+- (^|/)cache/

 # Dependencies
 - ^[Dd]ependencies/
@@ -27,11 +27,18 @@
 # Node dependencies
 - node_modules/

+# Bower Components
+- bower_components/
+
 # Erlang bundles
 - ^rebar$

 # Bootstrap minified css and js
- (^|/)bootstrap([^.]*)(\.min)\.(js|css)$
+- (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$
+
+# Foundation css
+- foundation.min.css
+- foundation.css

 # Vendored dependencies
 - thirdparty/
@@ -40,6 +47,9 @@
 # Debian packaging
 - ^debian/

+# Haxelib projects often contain a neko bytecode file named run.n
+- run.n$
+
 ## Commonly Bundled JavaScript frameworks ##

 # jQuery
@@ -56,6 +66,9 @@
 - (^|/)controls\.js$
 - (^|/)dragdrop\.js$

+# Typescript definition files
+- (.*?)\.d\.ts$
+
 # MooTools
 - (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$

@@ -82,6 +95,12 @@
 - (^|/)shCore\.js$
 - (^|/)shLegacy\.js$

+# AngularJS
+- (^|/)angular([^.]*)(\.min)?\.js$
+
+# React
+- (^|/)react(-[^.]*)?(\.min)?\.js$
+
 ## Python ##

 # django
@@ -101,6 +120,13 @@
 # Sparkle
 - (^|/)Sparkle/

+## Groovy ##
+
+# Gradle
+- (^|/)gradlew$
+- (^|/)gradlew\.bat$
+- (^|/)gradle/wrapper/
+
 ## .NET ##

 # Visual Studio IntelliSense
@@ -140,6 +166,7 @@
 # LICENSE, README, git config files
 - ^COPYING$
 - LICENSE$
+- License$
 - gitattributes$
 - gitignore$
 - gitmodules$