merge master; regen samples data

2026-08-01 22:42:25 +00:00 · 2014-04-21 11:08:29 -05:00
parent bc923bb6b1 40f4c49ba9
commit 01c4fba092
123 changed files with 69457 additions and 46235 deletions
@@ -1,5 +1,6 @@
 require 'linguist/blob_helper'
 require 'linguist/generated'
+require 'linguist/heuristics'
 require 'linguist/language'
 require 'linguist/repository'
 require 'linguist/samples'
@@ -78,18 +78,13 @@ module Linguist
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
-
      scores = {}
-      if verbosity >= 2
-        dump_all_tokens(tokens, languages)
-      end
+
+      debug_dump_all_tokens(tokens, languages) if verbosity >= 2
+
      languages.each do |language|
-        scores[language] = tokens_probability(tokens, language) +
-                                   language_probability(language)
-        if verbosity >= 1
-          printf "%10s = %10.3f + %7.3f = %10.3f\n",
-            language, tokens_probability(tokens, language), language_probability(language), scores[language]
-        end
+        scores[language] = tokens_probability(tokens, language) + language_probability(language)
+        debug_dump_probabilities(tokens, language, scores[language]) if verbosity >= 1
      end

      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
@@ -135,6 +130,11 @@ module Linguist
        @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
      end

+      def debug_dump_probabilities(tokens, language, score)
+        printf("%10s = %10.3f + %7.3f = %10.3f\n",
+            language, tokens_probability(tokens, language), language_probability(language), score)
+      end
+
      # Internal: show a table of probabilities for each <token,language> pair.
      #
      # The number in each table entry is the number of "points" that each
@@ -145,22 +145,22 @@ module Linguist
      # how much more likely (log of probability ratio) that token is to
      # appear in one language vs. the least-likely language.  Dashes
      # indicate the least-likely language (and zero points) for each token.
-      def dump_all_tokens(tokens, languages)
+      def debug_dump_all_tokens(tokens, languages)
        maxlen = tokens.map { |tok| tok.size }.max
-        
+
        printf "%#{maxlen}s", ""
        puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join
-        
+
        token_map = Hash.new(0)
        tokens.each { |tok| token_map[tok] += 1 }
-        
+
        token_map.sort.each { |tok, count|
          arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
          min = arr.map { |a,b| b }.min
          minlog = Math.log(min)
          if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
            printf "%#{maxlen}s%5d", tok, count
-            
+
            puts arr.map { |ent|
              ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
            }.join
@@ -58,10 +58,12 @@ module Linguist
        generated_parser? ||
        generated_net_docfile? ||
        generated_net_designer_file? ||
+        generated_postscript? ||
        generated_protocol_buffer? ||
        generated_jni_header? ||
        composer_lock? ||
-        node_modules?
+        node_modules? ||
+        vcr_cassette?
    end

    # Internal: Is the blob an XCode project file?
@@ -176,6 +178,29 @@ module Linguist
      false
    end

+    # Internal: Is the blob a generated PostScript file?
+    #
+    # PostScript files are often generated by other programs. If they tell us so,
+    # we can detect them.
+    #
+    # Returns true or false.
+    def generated_postscript?
+      return false unless ['.ps', '.eps'].include? extname
+
+      # We analyze the "%%Creator:" comment, which contains the author/generator
+      # of the file. If there is one, it should be in one of the first few lines.
+      creator = lines[0..9].find {|line| line =~ /^%%Creator: /}
+      return false if creator.nil?
+
+      # Most generators write their version number, while human authors' or companies'
+      # names don't contain numbers. So look if the line contains digits. Also
+      # look for some special cases without version numbers.
+      return creator =~ /[0-9]/ ||
+        creator.include?("mpage") ||
+        creator.include?("draw") ||
+        creator.include?("ImageMagick")
+    end
+
    # Internal: Is the blob a C++, Java or Python source file generated by the
    # Protocol Buffer compiler?
    #
@@ -198,20 +223,28 @@ module Linguist
               lines[1].include?("#include <jni.h>")
    end

-    # node_modules/ can contain large amounts of files, in general not meant
-    # for humans in pull requests.
+    # Internal: Is the blob part of node_modules/, which is not meant for humans in pull requests?
    #
    # Returns true or false.
    def node_modules?
      !!name.match(/node_modules\//)
    end

-    # the php composer tool generates a lock file to represent a specific dependency state.
-    # In general not meant for humans in pull requests.
+    # Internal: Is the blob a generated PHP Composer lock file?
    #
    # Returns true or false.
    def composer_lock?
      !!name.match(/composer.lock/)
    end
+
+    # Internal: Is the blob a VCR Cassette file?
+    #
+    # Returns true or false.
+    def vcr_cassette?
+      return false unless extname == '.yml'
+      return false unless lines.count > 2
+      # VCR Cassettes have "recorded_with: VCR" in the second last line.
+      return lines[-2].include?("recorded_with: VCR")
+    end
  end
 end
@@ -0,0 +1,80 @@
+module Linguist
+  # A collection of simple heuristics that can be used to better analyze languages.
+  class Heuristics
+    ACTIVE = false
+
+    # Public: Given an array of String language names,
+    # apply heuristics against the given data and return an array
+    # of matching languages, or nil.
+    #
+    # data      - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings to restrict to.
+    #
+    # Returns an array of Languages or []
+    def self.find_by_heuristics(data, languages)
+      if active?
+        if languages.all? { |l| ["Objective-C", "C++"].include?(l) }
+          disambiguate_c(data, languages)
+        end
+        if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
+          disambiguate_pl(data, languages)
+        end
+        if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
+          disambiguate_ecl(data, languages)
+        end
+        if languages.all? { |l| ["TypeScript", "XML"].include?(l) }
+          disambiguate_ts(data, languages)
+        end
+        if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
+          disambiguate_cl(data, languages)
+        end
+      end
+    end
+
+    # .h extensions are ambigious between C, C++, and Objective-C.
+    # We want to shortcut look for Objective-C _and_ now C++ too!
+    #
+    # Returns an array of Languages or []
+    def self.disambiguate_c(data, languages)
+      matches = []
+      matches << Language["Objective-C"] if data.include?("@interface")
+      matches << Language["C++"] if data.include?("#include <cstdint>")
+      matches
+    end
+
+    def self.disambiguate_pl(data, languages)
+      matches = []
+      matches << Language["Prolog"] if data.include?(":-")
+      matches << Language["Perl"] if data.include?("use strict")
+      matches
+    end
+
+    def self.disambiguate_ecl(data, languages)
+      matches = []
+      matches << Language["Prolog"] if data.include?(":-")
+      matches << Language["ECL"] if data.include?(":=")
+      matches
+    end
+
+    def self.disambiguate_ts(data, languages)
+      matches = []
+      if (data.include?("</translation>"))
+        matches << Language["XML"]
+      else
+        matches << Language["TypeScript"]
+      end
+      matches
+    end
+
+    def self.disambiguate_cl(data, languages)
+      matches = []
+      matches << Language["Common Lisp"] if data.include?("(defun ")
+      matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
+      matches
+    end
+
+    def self.active?
+      !!ACTIVE
+    end
+  end
+end
@@ -7,6 +7,7 @@ rescue LoadError
 end

 require 'linguist/classifier'
+require 'linguist/heuristics'
 require 'linguist/samples'

 module Linguist
@@ -32,7 +33,7 @@ module Linguist
    #
    # Returns an array
    def self.detectable_markup
-      ["CSS", "Less", "Sass", "TeX"]
+      ["CSS", "Less", "Sass", "SCSS", "Stylus", "TeX"]
    end

    # Detect languages by a specific type
@@ -113,18 +114,32 @@ module Linguist
        name += ".script!"
      end

+      # First try to find languages that match based on filename.
      possible_languages = find_by_filename(name)

+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to continue
+      # our detection work
      if possible_languages.length > 1
        data = data.call() if data.respond_to?(:call)
+        possible_language_names = possible_languages.map(&:name)
+
+        # Don't bother with emptiness
        if data.nil? || data == ""
          nil
+        # Check if there's a shebang line and use that as authoritative
        elsif (result = find_by_shebang(data)) && !result.empty?
          result.first
-        elsif classified = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
+        # No shebang. Still more work to do. Try to find it with our heuristics.
+        elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
+          determined.first
+        # Lastly, fall back to the probabilistic classifier.
+        elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first
+          # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
          Language[classified[0]]
        end
      else
+        # Simplest and most common case, we can just return the one match based on extension
        possible_languages.first
      end
    end
@@ -470,7 +485,7 @@ module Linguist
    #
    # Returns html String
    def colorize(text, options = {})
-      lexer.highlight(text, options = {})
+      lexer.highlight(text, options)
    end

    # Public: Return name as String representation
@@ -53,6 +53,18 @@ ASP:
  - .aspx
  - .axd

+ATS:
+  type: programming
+  color: "#1ac620"
+  primary_extension: .dats
+  lexer: OCaml
+  aliases:
+  - ats2
+  extensions:
+  - .atxt
+  - .hats
+  - .sats
+
 ActionScript:
  type: programming
  lexer: ActionScript 3
@@ -90,6 +102,10 @@ AppleScript:
  aliases:
  - osascript
  primary_extension: .applescript
+  extensions:
+  - .scpt
+  interpreters:
+  - osascript

 Arc:
  type: programming
@@ -113,6 +129,12 @@ AsciiDoc:
  - .adoc
  - .asc

+AspectJ:
+  type: programming
+  lexer: AspectJ
+  color: "#1957b0"
+  primary_extension: .aj
+
 Assembly:
  type: programming
  lexer: NASM
@@ -212,6 +234,7 @@ C:
  color: "#555"
  primary_extension: .c
  extensions:
+  - .cats
  - .w

 C#:
@@ -223,6 +246,7 @@ C#:
  - csharp
  primary_extension: .cs
  extensions:
+  - .cshtml
  - .csx

 C++:
@@ -236,6 +260,7 @@ C++:
  extensions:
  - .C
  - .c++
+  - .cc
  - .cxx
  - .H
  - .h++
@@ -281,7 +306,7 @@ COBOL:

 CSS:
  ace_mode: css
-  color: "#1f085e"
+  color: "#563d7c"
  primary_extension: .css

 Ceylon:
@@ -293,6 +318,16 @@ ChucK:
  lexer: Java
  primary_extension: .ck

+Cirru:
+  type: programming
+  color: "#aaaaff"
+  primary_extension: .cirru
+  # ace_mode: cirru
+  # lexer: Cirru
+  lexer: Text only
+  extensions:
+  - .cr
+
 Clean:
  type: programming
  color: "#3a81ad"
@@ -313,6 +348,7 @@ Clojure:
  - .cljscm
  - .cljx
  - .hic
+  - .cljs.hl
  filenames:
  - riemann.config

@@ -330,6 +366,8 @@ CoffeeScript:
  - .iced
  filenames:
  - Cakefile
+  interpreters:
+  - coffee

 ColdFusion:
  type: programming
@@ -380,6 +418,12 @@ Creole:
  wrap: true
  primary_extension: .creole

+Crystal:
+  type: programming
+  lexer: Ruby
+  primary_extension: .cr
+  ace_mode: ruby
+
 Cucumber:
  lexer: Gherkin
  primary_extension: .feature
@@ -454,6 +498,9 @@ Dylan:
  type: programming
  color: "#3ebc27"
  primary_extension: .dylan
+  extensions:
+  - .intr
+  - .lid

 Ecere Projects:
  type: data
@@ -519,6 +566,14 @@ F#:
  - .fsi
  - .fsx

+FLUX:
+  type: programming
+  color: "#33CCFF"
+  primary_extension: .fx
+  lexer: Text only
+  extensions:
+  - .flux
+
 FORTRAN:
  type: programming
  lexer: Fortran
@@ -577,6 +632,11 @@ Frege:
  lexer: Haskell
  primary_extension: .fr

+Game Maker Language:
+  type: programming
+  lexer: JavaScript
+  primary_extension: .gml
+
 GAS:
  type: programming
  group: Assembly
@@ -624,6 +684,17 @@ Glyph:
  lexer: Tcl
  primary_extension: .glf

+Gnuplot:
+  type: programming
+  color: "#f0a9f0"
+  lexer: Gnuplot
+  primary_extension: .gp
+  extensions:
+  - .gnu
+  - .gnuplot
+  - .plot
+  - .plt
+
 Go:
  type: programming
  color: "#a89b4d"
@@ -650,6 +721,8 @@ Groovy:
  ace_mode: groovy
  color: "#e69f56"
  primary_extension: .groovy
+  interpreters:
+  - groovy

 Groovy Server Pages:
  group: Groovy
@@ -667,6 +740,7 @@ HTML:
  extensions:
  - .htm
  - .xhtml
+  - .html.hl

 HTML+Django:
  type: markup
@@ -715,6 +789,12 @@ Handlebars:
  - .html.handlebars
  - .html.hbs

+Harbour:
+  type: programming
+  lexer: Text only
+  color: "#0e60e3"
+  primary_extension: .hb
+
 Haskell:
  type: programming
  color: "#29b544"
@@ -730,6 +810,19 @@ Haxe:
  extensions:
  - .hxsl

+Hy:
+  type: programming
+  lexer: Clojure
+  ace_mode: clojure
+  color: "#7891b1"
+  primary_extension: .hy
+
+IDL:
+  type: programming
+  lexer: Text only
+  color: "#e3592c"
+  primary_extension: .pro
+
 INI:
  type: data
  extensions:
@@ -788,8 +881,21 @@ JSON:
  - .sublime-settings
  - .sublime-workspace
  filenames:
+  - .jshintrc
  - composer.lock

+JSON5:
+  type: data
+  lexer: JavaScript
+  primary_extension: .json5
+
+JSONLD:
+  type: data
+  group: JavaScript
+  ace_mode: json
+  lexer: JavaScript
+  primary_extension: .jsonld
+
 Jade:
  group: HTML
  type: markup
@@ -812,7 +918,7 @@ Java Server Pages:
 JavaScript:
  type: programming
  ace_mode: javascript
-  color: "#f15501"
+  color: "#f7df1e"
  aliases:
  - js
  - node
@@ -820,16 +926,20 @@ JavaScript:
  extensions:
  - ._js
  - .bones
+  - .es6
  - .jake
  - .jsfl
  - .jsm
  - .jss
  - .jsx
+  - .njs
  - .pac
  - .sjs
  - .ssjs
  filenames:
  - Jakefile
+  interpreters:
+  - node

 Julia:
  type: programming
@@ -918,10 +1028,6 @@ LiveScript:
 Logos:
  type: programming
  primary_extension: .xm
-  extensions:
-  - .x
-  - .xi
-  - .xmi

 Logtalk:
  type: programming
@@ -937,6 +1043,8 @@ Lua:
  extensions:
  - .nse
  - .rbxs
+  interpreters:
+  - lua

 M:
  type: programming
@@ -978,6 +1086,18 @@ Markdown:
  - .mkdown
  - .ron

+Mask:
+  type: markup
+  lexer: SCSS
+  color: "#f97732"
+  ace_mode: scss
+  primary_extension: .mask
+
+Mathematica:
+  type: programming
+  primary_extension: .mathematica
+  lexer: Text only
+
 Matlab:
  type: programming
  color: "#bb92ac"
@@ -1085,6 +1205,7 @@ OCaml:
  primary_extension: .ml
  extensions:
  - .eliomi
+  - .ml4
  - .mli
  - .mll
  - .mly
@@ -1151,6 +1272,12 @@ Oxygene:
  color: "#5a63a3"
  primary_extension: .oxygene

+PAWN:
+  type: programming
+  lexer: C++
+  color: "#dbb284"
+  primary_extension: .pwn
+
 PHP:
  type: programming
  ace_mode: php
@@ -1204,16 +1331,28 @@ Perl:
  primary_extension: .pl
  extensions:
  - .PL
-  - .nqp
  - .perl
  - .ph
  - .plx
-  - .pm6
+  - .pm
  - .pod
  - .psgi
  interpreters:
  - perl

+Perl6:
+  type: programming
+  color: "#0298c3"
+  primary_extension: .p6
+  extensions:
+  - .6pl
+  - .6pm
+  - .nqp
+  - .p6l
+  - .p6m
+  - .pl6
+  - .pm6
+
 Pike:
  type: programming
  color: "#066ab2"
@@ -1222,12 +1361,25 @@ Pike:
  extensions:
  - .pmod

+Pod:
+  type: prose
+  lexer: Text only
+  ace_mode: perl
+  wrap: true
+  primary_extension: .pod
+
 PogoScript:
  type: programming
  color: "#d80074"
  lexer: Text only
  primary_extension: .pogo

+PostScript:
+  type: markup
+  primary_extension: .ps
+  extensions:
+  - .eps
+
 PowerShell:
  type: programming
  ace_mode: powershell
@@ -1249,7 +1401,8 @@ Prolog:
  color: "#74283c"
  primary_extension: .prolog
  extensions:
-  - .pro
+  - .ecl
+  - .pl

 Protocol Buffer:
  type: markup
@@ -1287,6 +1440,8 @@ Python:
  - .xpy
  filenames:
  - wscript
+  - SConstruct
+  - SConscript
  interpreters:
  - python

@@ -1306,9 +1461,12 @@ R:
  type: programming
  color: "#198ce7"
  lexer: S
+  aliases:
+  - R
  primary_extension: .r
  extensions:
  - .R
+  - .rsx
  filenames:
  - .Rprofile
  interpreters:
@@ -1337,6 +1495,15 @@ RHTML:
  group: HTML
  primary_extension: .rhtml

+RMarkdown:
+  type: prose
+  lexer: Text only
+  wrap: true
+  ace_mode: markdown
+  primary_extension: .rmd
+  extensions:
+  - .Rmd
+
 Racket:
  type: programming
  lexer: Racket
@@ -1413,6 +1580,7 @@ Ruby:
  - Appraisals
  - Berksfile
  - Gemfile
+  - Gemfile.lock
  - Guardfile
  - Podfile
  - Thorfile
@@ -1451,6 +1619,8 @@ Scala:
  ace_mode: scala
  color: "#7dd3b0"
  primary_extension: .scala
+  extensions:
+  - .sc

 Scaml:
  group: HTML
@@ -1462,6 +1632,7 @@ Scheme:
  color: "#1e4aec"
  primary_extension: .scm
  extensions:
+  - .sld
  - .sls
  - .ss
  interpreters:
@@ -1500,6 +1671,12 @@ Shell:
  filenames:
  - Dockerfile

+Shen:
+  type: programming
+  color: "#120F14"
+  lexer: Text only
+  primary_extension: .shen
+
 Slash:
  type: programming
  color: "#007eff"
@@ -1524,12 +1701,29 @@ Standard ML:
  aliases:
  - sml
  primary_extension: .sml
+  extensions:
+  - .fun
+
+Stylus:
+  type: markup
+  group: CSS
+  lexer: Text only
+  primary_extension: .styl

 SuperCollider:
  type: programming
  color: "#46390b"
  lexer: Text only
-  primary_extension: .sc
+  primary_extension: .scd
+
+SystemVerilog:
+  type: programming
+  color: "#343761"
+  lexer: systemverilog
+  primary_extension: .sv
+  extensions:
+  - .svh
+  - .vh

 TOML:
  type: data
@@ -1558,12 +1752,14 @@ TeX:
  type: markup
  color: "#3D6117"
  ace_mode: latex
+  wrap: true
  aliases:
  - latex
  primary_extension: .tex
  extensions:
  - .aux
  - .bib
+  - .cls
  - .dtx
  - .ins
  - .ltx
@@ -1670,6 +1866,7 @@ Visual Basic:
  - .frm
  - .frx
  - .vba
+  - .vbhtml
  - .vbs

 Volt:
@@ -1705,6 +1902,7 @@ XML:
  - .kml
  - .launch
  - .mxml
+  - .osm
  - .plist
  - .pluginspec
  - .ps1xml
@@ -29,6 +29,7 @@ module Linguist
      @computed_stats = false
      @language = @size = nil
      @sizes = Hash.new { 0 }
+      @file_breakdown = Hash.new { |h,k| h[k] = Array.new }
    end

    # Public: Returns a breakdown of language stats.
@@ -60,6 +61,12 @@ module Linguist
      @size
    end

+    # Public: Return the language breakdown of this repository by file.
+    #
+    # Returns a Hash of language group name => Array of blob names.
+    def breakdown_by_file
+      compute_stats
+      @file_breakdown
+    end
+
    # Internal: Compute language breakdown for each blob in the Repository.
    #
    # Returns nothing
@@ -75,6 +82,10 @@ module Linguist

        # Only include programming languages and acceptable markup languages
        if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name)
+
+          # Build up the per-file breakdown stats
+          @file_breakdown[blob.language.group.name] << blob.name
+
          @sizes[blob.language.group] += blob.size
        end
      end
@@ -10,7 +10,7 @@
 ## Vendor Conventions ##

 # Caches
- cache/
+- (^|/)cache/

 # Dependencies
 - ^[Dd]ependencies/
@@ -27,11 +27,18 @@
 # Node dependencies
 - node_modules/

+# Bower Components
+- bower_components/
+
 # Erlang bundles
 - ^rebar$

 # Bootstrap minified css and js
- (^|/)bootstrap([^.]*)(\.min)\.(js|css)$
+- (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$
+
+# Foundation css
+- foundation.min.css
+- foundation.css

 # Vendored dependencies
 - thirdparty/
@@ -40,6 +47,9 @@
 # Debian packaging
 - ^debian/

+# Haxelib projects often contain a neko bytecode file named run.n
+- run.n$
+
 ## Commonly Bundled JavaScript frameworks ##

 # jQuery
@@ -56,6 +66,9 @@
 - (^|/)controls\.js$
 - (^|/)dragdrop\.js$

+# TypeScript definition files
+- (.*?)\.d\.ts$
+
 # MooTools
 - (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$

@@ -82,6 +95,12 @@
 - (^|/)shCore\.js$
 - (^|/)shLegacy\.js$

+# AngularJS
+- (^|/)angular([^.]*)(\.min)?\.js$
+
+# React
+- (^|/)react(-[^.]*)?(\.min)?\.js$
+
 ## Python ##

 # django
@@ -101,6 +120,13 @@
 # Sparkle
 - (^|/)Sparkle/

+## Groovy ##
+
+# Gradle
+- (^|/)gradlew$
+- (^|/)gradlew\.bat$
+- (^|/)gradle/wrapper/
+
 ## .NET ##

 # Visual Studio IntelliSense
@@ -140,6 +166,7 @@
 # LICENSE, README, git config files
 - ^COPYING$
 - LICENSE$
+- License$
 - gitattributes$
 - gitignore$
 - gitmodules$