Merge branch 'master' into GrammaticalFramework-master

2025-10-29 17:50:22 +00:00 · 2014-04-21 11:42:23 -05:00
parent a2d6b374da 5c5999fbf3
commit 2f94e46f1f
185 changed files with 40351 additions and 1807 deletions
--- a/lib/linguist.rb
+++ b/lib/linguist.rb
@@ -1,5 +1,6 @@
 require 'linguist/blob_helper'
 require 'linguist/generated'
+require 'linguist/heuristics'
 require 'linguist/language'
 require 'linguist/repository'
 require 'linguist/samples'
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -190,9 +190,9 @@ module Linguist
    # Public: Is the blob safe to colorize?
    #
    # We use Pygments for syntax highlighting blobs. Pygments
-    # can be too slow for very large blobs or for certain 
+    # can be too slow for very large blobs or for certain
    # corner-case blobs.
-    # 
+    #
    # Return true or false
    def safe_to_colorize?
      !large? && text? && !high_ratio_of_long_lines?
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -15,8 +15,8 @@ module Linguist
    #
    # Returns nothing.
    #
-    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token,
-    # per-language.  See also dump_all_tokens, below.
+    # Set LINGUIST_DEBUG=1 or =2 to see probabilities per-token or
+    # per-language.  See also #dump_all_tokens, below.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

@@ -78,18 +78,13 @@ module Linguist
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
-
      scores = {}
-      if verbosity >= 2
-        dump_all_tokens(tokens, languages)
-      end
+
+      debug_dump_all_tokens(tokens, languages) if verbosity >= 2
+
      languages.each do |language|
-        scores[language] = tokens_probability(tokens, language) +
-                                   language_probability(language)
-        if verbosity >= 1
-          printf "%10s = %10.3f + %7.3f = %10.3f\n",
-            language, tokens_probability(tokens, language), language_probability(language), scores[language]
-        end
+        scores[language] = tokens_probability(tokens, language) + language_probability(language)
+        debug_dump_probabilities(tokens, language, scores[language]) if verbosity >= 1
      end

      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
@@ -135,6 +130,11 @@ module Linguist
        @verbosity ||= (ENV['LINGUIST_DEBUG'] || 0).to_i
      end

+      # Internal: print `language`, its tokens and language probabilities, and `score`.
+      def debug_dump_probabilities(tokens, language, score)
+        printf("%10s = %10.3f + %7.3f = %10.3f\n", language, tokens_probability(tokens, language), language_probability(language), score)
+      end
+
      # Internal: show a table of probabilities for each <token,language> pair.
      #
      # The number in each table entry is the number of "points" that each
@@ -145,22 +145,22 @@ module Linguist
      # how much more likely (log of probability ratio) that token is to
      # appear in one language vs. the least-likely language.  Dashes
      # indicate the least-likely language (and zero points) for each token.
-      def dump_all_tokens(tokens, languages)
+      def debug_dump_all_tokens(tokens, languages)
        maxlen = tokens.map { |tok| tok.size }.max
-        
+
        printf "%#{maxlen}s", ""
        puts "    #" + languages.map { |lang| sprintf("%10s", lang) }.join
-        
-        tokmap = Hash.new(0)
-        tokens.each { |tok| tokmap[tok] += 1 }
-        
-        tokmap.sort.each { |tok, count|
+
+        token_map = Hash.new(0)
+        tokens.each { |tok| token_map[tok] += 1 }
+
+        token_map.sort.each { |tok, count|
          arr = languages.map { |lang| [lang, token_probability(tok, lang)] }
          min = arr.map { |a,b| b }.min
          minlog = Math.log(min)
          if !arr.inject(true) { |result, n| result && n[1] == arr[0][1] }
            printf "%#{maxlen}s%5d", tok, count
-            
+
            puts arr.map { |ent|
              ent[1] == min ? "         -" : sprintf("%10.3f", count * (Math.log(ent[1]) - minlog))
            }.join
--- a/lib/linguist/generated.rb
+++ b/lib/linguist/generated.rb
@@ -58,7 +58,12 @@ module Linguist
        generated_parser? ||
        generated_net_docfile? ||
        generated_net_designer_file? ||
-        generated_protocol_buffer?
+        generated_postscript? ||
+        generated_protocol_buffer? ||
+        generated_jni_header? ||
+        composer_lock? ||
+        node_modules? ||
+        vcr_cassette?
    end

    # Internal: Is the blob an XCode project file?
@@ -73,14 +78,16 @@ module Linguist

    # Internal: Is the blob minified files?
    #
-    # Consider a file minified if it contains more than 5% spaces.
+    # Consider a file minified if the average line length is
+    # greater than 110 characters.
+    #
    # Currently, only JS and CSS files are detected by this method.
    #
    # Returns true or false.
    def minified_files?
      return unless ['.js', '.css'].include? extname
-      if data && data.length > 200
-        (data.each_char.count{ |c| c <= ' ' } / data.length.to_f) < 0.05
+      if lines.any?
+        (lines.inject(0) { |n, l| n += l.length } / lines.length) > 110
      else
        false
      end
@@ -171,6 +178,29 @@ module Linguist
      false
    end

+    # Internal: Is the blob a generated PostScript file?
+    #
+    # PostScript files are often generated by other programs. If they tell us so,
+    # we can detect them.
+    #
+    # Returns true or false.
+    def generated_postscript?
+      return false unless ['.ps', '.eps'].include? extname
+
+      # We analyze the "%%Creator:" comment, which contains the author/generator
+      # of the file. If there is one, it should be in one of the first few lines.
+      creator = lines[0..9].find {|line| line =~ /^%%Creator: /}
+      return false if creator.nil?
+
+      # Most generators write their version number, while human authors' or companies'
+      # names don't contain numbers. So look if the line contains digits. Also
+      # look for some special cases without version numbers.
+      # `=~` yields a match index or nil, so it is coerced with `!!` to keep
+      # the documented strict true/false result.
+      !!(creator =~ /[0-9]/) ||
+        %w[mpage draw ImageMagick].any? { |tool| creator.include?(tool) }
+    end
+
    # Internal: Is the blob a C++, Java or Python source file generated by the
    # Protocol Buffer compiler?
    #
@@ -181,5 +211,40 @@ module Linguist

      return lines[0].include?("Generated by the protocol buffer compiler.  DO NOT EDIT!")
    end
+
+    # Internal: Is the blob a C/C++ header generated by the Java JNI tool javah?
+    #
+    # Returns true or false.
+    def generated_jni_header?
+      return false if extname != '.h' || lines.count <= 2
+
+      banner, jni_include = lines[0], lines[1]
+      banner.include?("/* DO NOT EDIT THIS FILE - it is machine generated */") &&
+        jni_include.include?("#include <jni.h>")
+    end
+
+    # Internal: Is the blob under node_modules/ (not meant for humans in pull requests)?
+    #
+    # Returns true or false.
+    def node_modules?
+      !name.match(%r{node_modules/}).nil?
+    end
+
+    # Internal: Is the blob a generated PHP Composer lock file (composer.lock)?
+    #
+    # Returns true or false.
+    def composer_lock?
+      !!name.match(/composer\.lock/)
+    end
+
+    # Internal: Is the blob a VCR Cassette file?
+    #
+    # Returns true or false.
+    def vcr_cassette?
+      return false if extname != '.yml' || lines.count <= 2
+
+      # VCR Cassettes have "recorded_with: VCR" in the second last line.
+      lines[-2].include?("recorded_with: VCR")
+    end
  end
 end
--- a/lib/linguist/heuristics.rb
+++ b/lib/linguist/heuristics.rb
@@ -0,0 +1,80 @@
+module Linguist
+  # A collection of simple heuristics that can be used to better analyze languages.
+  class Heuristics
+    ACTIVE = false
+
+    # Public: Given an array of String language names,
+    # apply heuristics against the given data and return an array
+    # of matching languages.
+    #
+    # data      - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings to restrict to.
+    #
+    # Returns an array of Languages, or [] if inactive or nothing applies.
+    def self.find_by_heuristics(data, languages)
+      return [] unless active?
+
+      # Return the first applicable result rather than discarding it.
+      if languages.all? { |l| ["Objective-C", "C++"].include?(l) }
+        disambiguate_c(data, languages)
+      elsif languages.all? { |l| ["Perl", "Prolog"].include?(l) }
+        disambiguate_pl(data, languages)
+      elsif languages.all? { |l| ["ECL", "Prolog"].include?(l) }
+        disambiguate_ecl(data, languages)
+      elsif languages.all? { |l| ["TypeScript", "XML"].include?(l) }
+        disambiguate_ts(data, languages)
+      elsif languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
+        disambiguate_cl(data, languages)
+      else
+        []
+      end
+    end
+
+    # .h extensions are ambiguous between C, C++, and Objective-C.
+    # We want to shortcut look for Objective-C _and_ now C++ too!
+    #
+    # Returns an array of Languages or []
+    def self.disambiguate_c(data, languages)
+      matches = []
+      matches << Language["Objective-C"] if data.include?("@interface")
+      matches << Language["C++"] if data.include?("#include <cstdint>")
+      matches
+    end
+
+    # Perl vs. Prolog: ":-" suggests Prolog, "use strict" suggests Perl.
+    def self.disambiguate_pl(data, languages)
+      matches = []
+      matches << Language["Prolog"] if data.include?(":-")
+      matches << Language["Perl"] if data.include?("use strict")
+      matches
+    end
+
+    # ECL vs. Prolog: ":-" suggests Prolog, ":=" suggests ECL.
+    def self.disambiguate_ecl(data, languages)
+      matches = []
+      matches << Language["Prolog"] if data.include?(":-")
+      matches << Language["ECL"] if data.include?(":=")
+      matches
+    end
+
+    # TypeScript vs. XML: "</translation>" means XML, otherwise TypeScript.
+    # Always returns a one-element array.
+    def self.disambiguate_ts(data, languages)
+      [Language[data.include?("</translation>") ? "XML" : "TypeScript"]]
+    end
+
+    # Common Lisp vs. OpenCL: "(defun " suggests Common Lisp; "/* ", "// "
+    # or a line starting with "}" suggests OpenCL.
+    def self.disambiguate_cl(data, languages)
+      matches = []
+      matches << Language["Common Lisp"] if data.include?("(defun ")
+      matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
+      matches
+    end
+
+    # Whether heuristics are enabled (currently hard-wired via ACTIVE).
+    def self.active?
+      !!ACTIVE
+    end
+  end
+end
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -1,8 +1,13 @@
 require 'escape_utils'
 require 'pygments'
 require 'yaml'
+begin
+  require 'json'
+rescue LoadError
+end

 require 'linguist/classifier'
+require 'linguist/heuristics'
 require 'linguist/samples'

 module Linguist
@@ -15,17 +20,29 @@ module Linguist
    @index           = {}
    @name_index      = {}
    @alias_index     = {}
-    @extension_index = Hash.new { |h,k| h[k] = [] }
-    @filename_index  = Hash.new { |h,k| h[k] = [] }
+
+    @extension_index          = Hash.new { |h,k| h[k] = [] }
+    @interpreter_index        = Hash.new { |h,k| h[k] = [] }
+    @filename_index           = Hash.new { |h,k| h[k] = [] }
+    @primary_extension_index  = {}

    # Valid Languages types
-    TYPES = [:data, :markup, :programming]
+    TYPES = [:data, :markup, :programming, :prose]

    # Names of non-programming languages that we will still detect
    #
    # Returns an array
    def self.detectable_markup
-      ["CSS", "Less", "Sass"]
+      ["CSS", "Less", "Sass", "SCSS", "Stylus", "TeX"]
+    end
+
+    # Public: Select the Languages of a specific type.
+    #
+    # type - A Symbol that exists within TYPES.
+    #
+    # Returns an Array of the Languages in `all` whose type equals `type`.
+    def self.by_type(type)
+      all.select { |language| language.type == type }
    end

    # Internal: Create a new Language object
@@ -63,6 +80,16 @@ module Linguist
        @extension_index[extension] << language
      end

+      if @primary_extension_index.key?(language.primary_extension)
+        raise ArgumentError, "Duplicate primary extension: #{language.primary_extension}"
+      end
+
+      @primary_extension_index[language.primary_extension] = language
+
+      language.interpreters.each do |interpreter|
+        @interpreter_index[interpreter] << language
+      end
+
      language.filenames.each do |filename|
        @filename_index[filename] << language
      end
@@ -87,16 +114,32 @@ module Linguist
        name += ".script!"
      end

+      # First try to find languages that match based on filename.
      possible_languages = find_by_filename(name)

+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to continue
+      # our detection work
      if possible_languages.length > 1
        data = data.call() if data.respond_to?(:call)
+        possible_language_names = possible_languages.map(&:name)
+
+        # Don't bother with emptiness
        if data.nil? || data == ""
          nil
-        elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
-          Language[result[0]]
+        # Check if there's a shebang line and use that as authoritative
+        elsif (result = find_by_shebang(data)) && !result.empty?
+          result.first
+        # No shebang. Still more work to do. Try to find it with our heuristics.
+        elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
+          determined.first
+        # Lastly, fall back to the probabilistic classifier.
+        elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names ).first
+          # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
+          Language[classified[0]]
        end
      else
+        # Simplest and most common case, we can just return the one match based on extension
        possible_languages.first
      end
    end
@@ -148,7 +191,24 @@ module Linguist
    # Returns all matching Languages or [] if none were found.
    def self.find_by_filename(filename)
      basename, extname = File.basename(filename), File.extname(filename)
-      @filename_index[basename] + @extension_index[extname]
+      langs = [@primary_extension_index[extname]] +
+              @filename_index[basename] +
+              @extension_index[extname]
+      langs.compact.uniq
+    end
+
+    # Public: Look up Languages by shebang line.
+    #
+    # data - String data to analyze; only its shebang line is inspected.
+    #
+    # Examples
+    #
+    #   Language.find_by_shebang("#!/bin/bash\ndate;")
+    #   # => [#<Language name="Bash">]
+    #
+    # Returns all matching Languages or [] if none were found.
+    def self.find_by_shebang(data)
+      @interpreter_index.fetch(Linguist.interpreter_from_shebang(data), [])
    end

    # Public: Look up Language by its name or lexer.
@@ -236,6 +296,7 @@ module Linguist

      # Set extensions or default to [].
      @extensions = attributes[:extensions] || []
+      @interpreters = attributes[:interpreters]   || []
      @filenames  = attributes[:filenames]  || []

      unless @primary_extension = attributes[:primary_extension]
@@ -348,6 +409,15 @@ module Linguist
    # Returns the extension String.
    attr_reader :primary_extension

+    # Public: Get interpreters
+    #
+    # Examples
+    #
+    #   # => ['awk', 'gawk', 'mawk' ...]
+    #
+    # Returns the interpreters Array
+    attr_reader :interpreters
+
    # Public: Get filenames
    #
    # Examples
@@ -415,7 +485,7 @@ module Linguist
    #
    # Returns html String
    def colorize(text, options = {})
-      lexer.highlight(text, options = {})
+      lexer.highlight(text, options)
    end

    # Public: Return name as String representation
@@ -441,11 +511,22 @@ module Linguist
  end

  extensions = Samples::DATA['extnames']
+  interpreters = Samples::DATA['interpreters']
  filenames = Samples::DATA['filenames']
  popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))

-  YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
+  languages_yml = File.expand_path("../languages.yml", __FILE__)
+  languages_json = File.expand_path("../languages.json", __FILE__)
+
+  if File.exist?(languages_json) && defined?(JSON)
+    languages = JSON.load(File.read(languages_json))
+  else
+    languages = YAML.load_file(languages_yml)
+  end
+
+  languages.each do |name, options|
    options['extensions'] ||= []
+    options['interpreters'] ||= []
    options['filenames'] ||= []

    if extnames = extensions[name]
@@ -456,6 +537,18 @@ module Linguist
      end
    end

+    if interpreters == nil
+      interpreters = {}
+    end
+
+    if interpreter_names = interpreters[name]
+      interpreter_names.each do |interpreter|
+        if !options['interpreters'].include?(interpreter)
+          options['interpreters'] << interpreter
+        end
+      end
+    end
+
    if fns = filenames[name]
      fns.each do |filename|
        if !options['filenames'].include?(filename)
@@ -476,6 +569,7 @@ module Linguist
      :searchable        => options.key?('searchable') ? options['searchable'] : true,
      :search_term       => options['search_term'],
      :extensions        => options['extensions'].sort,
+      :interpreters      => options['interpreters'].sort,
      :primary_extension => options['primary_extension'],
      :filenames         => options['filenames'],
      :popular           => popular.include?(name)
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
--- a/lib/linguist/repository.rb
+++ b/lib/linguist/repository.rb
@@ -29,6 +29,7 @@ module Linguist
      @computed_stats = false
      @language = @size = nil
      @sizes = Hash.new { 0 }
+      @file_breakdown = Hash.new { |h,k| h[k] = Array.new }
    end

    # Public: Returns a breakdown of language stats.
@@ -60,6 +61,12 @@ module Linguist
      @size
    end

+    # Public: Returns a Hash of language group name => Array of blob names.
+    def breakdown_by_file
+      compute_stats
+      @file_breakdown
+    end
+
    # Internal: Compute language breakdown for each blob in the Repository.
    #
    # Returns nothing
@@ -75,6 +82,10 @@ module Linguist

        # Only include programming languages and acceptable markup languages
        if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name)
+
+          # Build up the per-file breakdown stats
+          @file_breakdown[blob.language.group.name] << blob.name
+
          @sizes[blob.language.group] += blob.size
        end
      end
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -1,4 +1,8 @@
-require 'yaml'
+begin
+  require 'json'
+rescue LoadError
+  require 'yaml'
+end

 require 'linguist/md5'
 require 'linguist/classifier'
@@ -14,7 +18,8 @@ module Linguist

    # Hash of serialized samples object
    if File.exist?(PATH)
-      DATA = YAML.load_file(PATH)
+      serializer = defined?(JSON) ? JSON : YAML
+      DATA = serializer.load(File.read(PATH))
    end

    # Public: Iterate over each sample.
@@ -52,6 +57,7 @@ module Linguist
            yield({
              :path     => File.join(dirname, filename),
              :language => category,
+              :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
              :extname  => File.extname(filename)
            })
          end
@@ -67,6 +73,7 @@ module Linguist
    def self.data
      db = {}
      db['extnames'] = {}
+      db['interpreters'] = {}
      db['filenames'] = {}

      each do |sample|
@@ -80,6 +87,14 @@ module Linguist
          end
        end

+        if sample[:interpreter]
+          db['interpreters'][language_name] ||= []
+          if !db['interpreters'][language_name].include?(sample[:interpreter])
+            db['interpreters'][language_name] << sample[:interpreter]
+            db['interpreters'][language_name].sort!
+          end
+        end
+
        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
@@ -95,4 +110,40 @@ module Linguist
      db
    end
  end
+
+  # Used to retrieve the interpreter from the shebang line of a file's
+  # data.
+  #
+  # data - String data to analyze (must respond to #lines).
+  #
+  # Returns the interpreter name String (e.g. "python"), or nil when the
+  # first line is not a shebang or names no interpreter.
+  def self.interpreter_from_shebang(data)
+    lines = data.lines.to_a
+    bang = lines.first
+    return nil unless bang && bang.start_with?('#!')
+
+    # "#! /bin/sh -e" -> ["#!/bin/sh", "-e"]
+    tokens = bang.sub(/^#! /, '#!').split(' ')
+    pieces = tokens.first.split('/')
+    script = pieces.size > 1 ? pieces.last : pieces.first.sub('#!', '')
+
+    # "#!/usr/bin/env ruby" names the interpreter in the next token.
+    script = tokens[1] if script == 'env'
+    return nil if script.nil?
+
+    # "python2.6" -> "python"
+    script = script.sub(/(?:\d+\.?)+/, '')
+
+    # Check for multiline shebang hacks that call `exec`
+    if script == 'sh'
+      lines[0...5].each do |line|
+        exec_call = line.match(/exec (\w+).+\$0.+\$@/)
+        return exec_call[1] if exec_call
+      end
+    end
+
+    script.empty? ? nil : script
+  end
+
 end
--- a/lib/linguist/vendor.yml
+++ b/lib/linguist/vendor.yml
@@ -10,7 +10,10 @@
 ## Vendor Conventions ##

 # Caches
- cache/
+- (^|/)cache/
+
+# Dependencies
+- ^[Dd]ependencies/

 # C deps
 #  https://github.com/joyent/node
@@ -24,20 +27,34 @@
 # Node dependencies
 - node_modules/

+# Bower Components
+- bower_components/
+
 # Erlang bundles
 - ^rebar$

+# Bootstrap minified css and js
+- (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$
+
+# Foundation css
+- foundation.min.css
+- foundation.css
+
 # Vendored dependencies
- vendor/
+- thirdparty/
+- vendors?/

 # Debian packaging
 - ^debian/

+# Haxelib projects often contain a neko bytecode file named run.n
+- run.n$
+
 ## Commonly Bundled JavaScript frameworks ##

 # jQuery
 - (^|/)jquery([^.]*)(\.min)?\.js$
- (^|/)jquery\-\d\.\d(\.\d)?(\.min)?\.js$
+- (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$

 # jQuery UI
 - (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
@@ -49,6 +66,9 @@
 - (^|/)controls\.js$
 - (^|/)dragdrop\.js$

+# TypeScript definition files
+- (.*?)\.d\.ts$
+
 # MooTools
 - (^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$

@@ -75,6 +95,12 @@
 - (^|/)shCore\.js$
 - (^|/)shLegacy\.js$

+# AngularJS
+- (^|/)angular([^.]*)(\.min)?\.js$
+
+# React
+- (^|/)react(-[^.]*)?(\.min)?\.js$
+
 ## Python ##

 # django
@@ -86,12 +112,21 @@
 # WAF
 - ^waf$

+# .osx
+- ^.osx$

 ## Obj-C ##

 # Sparkle
 - (^|/)Sparkle/

+## Groovy ##
+
+# Gradle
+- (^|/)gradlew$
+- (^|/)gradlew\.bat$
+- (^|/)gradle/wrapper/
+
 ## .NET ##

 # Visual Studio IntelliSense
@@ -108,14 +143,30 @@
 - ^[Pp]ackages/

 # ExtJS
- (^|/)extjs/
+- (^|/)extjs/.*?\.js$
+- (^|/)extjs/.*?\.xml$
+- (^|/)extjs/.*?\.txt$
+- (^|/)extjs/.*?\.html$
+- (^|/)extjs/.*?\.properties$
+- (^|/)extjs/.sencha/
+- (^|/)extjs/docs/
+- (^|/)extjs/builds/
+- (^|/)extjs/cmd/
+- (^|/)extjs/examples/
+- (^|/)extjs/locale/
+- (^|/)extjs/packages/
+- (^|/)extjs/plugins/
+- (^|/)extjs/resources/
+- (^|/)extjs/src/
+- (^|/)extjs/welcome/

 # Samples folders
 - ^[Ss]amples/

 # LICENSE, README, git config files
 - ^COPYING$
- ^LICENSE$
+- LICENSE$
+- License$
 - gitattributes$
 - gitignore$
 - gitmodules$
@@ -125,5 +176,12 @@
 # Test fixtures
 - ^[Tt]est/fixtures/

+# PhoneGap/Cordova
+- (^|/)cordova([^.]*)(\.min)?\.js$
+- (^|/)cordova\-\d\.\d(\.\d)?(\.min)?\.js$
+
+# Vagrant
+- ^Vagrantfile$
+
 # .DS_Store's
 - .[Dd][Ss]_[Ss]tore$