Merge master

2026-05-12 16:36:08 +00:00 · 2014-11-28 11:04:53 -08:00
parent 4603f3b2e7 b16149d641
commit 26ab33754f
34 changed files with 2071 additions and 97 deletions
--- a/lib/linguist/file_blob.rb
+++ b/lib/linguist/file_blob.rb
@@ -57,14 +57,20 @@ module Linguist
    #
    # Returns a String.
    def extension
-      # File.extname returns nil if the filename is an extension.
-      extension = File.extname(name)
-      basename = File.basename(name)
-      # Checks if the filename is an extension.
-      if extension.empty? && basename[0] == "."
-        basename
-      else
-        extension
+      extensions.last || ""
+    end
+
+    # Public: Return an array of the file extensions
+    #
+    #     >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
+    #     => [".html.erb", ".erb"]
+    #
+    # Returns an Array
+    def extensions
+      basename, *segments = File.basename(name).split(".")
+
+      segments.map.with_index do |segment, index|
+        "." + segments[index..-1].join(".")
      end
    end
  end
--- a/lib/linguist/heuristics.rb
+++ b/lib/linguist/heuristics.rb
@@ -39,6 +39,9 @@ module Linguist
        if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
          result = disambiguate_f(data)
        end
+        if languages.all? { |l| ["F#", "Forth", "GLSL"].include?(l) }
+          result = disambiguate_fs(data)
+        end
        return result
      end
    end
@@ -151,6 +154,18 @@ module Linguist
      matches
    end

+    def self.disambiguate_fs(data)
+      matches = []
+      if /^(: |new-device)/.match(data)
+        matches << Language["Forth"]
+      elsif /^(#light|import|let|module|namespace|open|type)/.match(data)
+        matches << Language["F#"]
+      elsif /^(#include|#pragma|precision|uniform|varying|void)/.match(data)
+        matches << Language["GLSL"]
+      end
+      matches
+    end
+
    def self.active?
      !!ACTIVE
    end
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -106,40 +106,52 @@ module Linguist
      # A bit of an elegant hack. If the file is executable but extensionless,
      # append a "magic" extension so it can be classified with other
      # languages that have shebang scripts.
-      extension = FileBlob.new(name).extension
-      if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
+      extensions = FileBlob.new(name).extensions
+      if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
        name += ".script!"
      end

-      # First try to find languages that match based on filename.
+      # Find languages that match based on filename.
      possible_languages = find_by_filename(name)

-      # If there is more than one possible language with that extension (or no
-      # extension at all, in the case of extensionless scripts), we need to continue
-      # our detection work
-      if possible_languages.length > 1
-        data = blob.data
-        possible_language_names = possible_languages.map(&:name)
-        heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+      if possible_languages.length == 1
+        # Simplest and most common case, we can just return the one match based
+        # on extension
+        possible_languages.first

-        if heuristic_languages.size > 1
-          possible_language_names = heuristic_languages.map(&:name)
-        end
+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to
+      # continue our detection work
+      else
+        # Matches possible_languages.length == 0 || possible_languages.length > 1
+        data = blob.data

        # Check if there's a shebang line and use that as authoritative
        if (result = find_by_shebang(data)) && !result.empty?
-          result.first
-        # No shebang. Still more work to do. Try to find it with our heuristics.
-        elsif heuristic_languages.size == 1
-          heuristic_languages.first
-        # Lastly, fall back to the probabilistic classifier.
-        elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
-          # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
-          Language[classified[0]]
+          return result.first
+
+        # More than one language with that extension. We need to make a choice.
+        elsif possible_languages.length > 1
+
+          # First try heuristics
+
+          possible_language_names = possible_languages.map(&:name)
+          heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+
+          # If there are multiple possible languages returned from heuristics
+          # then reduce language candidates for Bayesian classifier here.
+          if heuristic_languages.size > 1
+            possible_language_names = heuristic_languages.map(&:name)
+          end
+
+          if heuristic_languages.size == 1
+            return heuristic_languages.first
+          # Lastly, fall back to the probabilistic classifier.
+          elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
+            # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
+            return Language[classified[0]]
+          end
        end
-      else
-        # Simplest and most common case, we can just return the one match based on extension
-        possible_languages.first
      end
    end

@@ -190,8 +202,13 @@ module Linguist
    # Returns all matching Languages or [] if none were found.
    def self.find_by_filename(filename)
      basename = File.basename(filename)
-      extname = FileBlob.new(filename).extension
-      (@filename_index[basename] + find_by_extension(extname)).compact.uniq
+
+      # find the first extension with language definitions
+      extname = FileBlob.new(filename).extensions.detect do |e|
+        !@extension_index[e].empty?
+      end
+
+      (@filename_index[basename] + @extension_index[extname]).compact.uniq
    end

    # Public: Look up Languages by file extension.
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -506,6 +506,7 @@ CoffeeScript:
  extensions:
  - .coffee
  - ._coffee
+  - .cjsx
  - .cson
  - .iced
  filenames:
@@ -607,6 +608,8 @@ Crystal:
  - .cr
  ace_mode: ruby
  tm_scope: source.ruby
+  interpreters:
+  - crystal

 Cucumber:
  extensions:
@@ -806,7 +809,6 @@ Erlang:
  - .es
  - .escript
  - .hrl
-  ace_mode: erlang

 F#:
  type: programming
@@ -892,7 +894,6 @@ Forth:
  - .for
  - .forth
  - .frt
-  ace_mode: forth

 Frege:
  type: programming
@@ -952,6 +953,7 @@ GLSL:
  - .fp
  - .frag
  - .frg
+  - .fs
  - .fshader
  - .geo
  - .geom
@@ -1022,6 +1024,8 @@ Gnuplot:
  - .gnuplot
  - .plot
  - .plt
+  interpreters:
+  - gnuplot
  ace_mode: none

 Go:
@@ -1313,6 +1317,8 @@ Ioke:
  color: "#078193"
  extensions:
  - .ik
+  interpreters:
+  - ioke
  ace_mode: none

 Isabelle:
@@ -1868,6 +1874,8 @@ Nu:
  - Nukefile
  tm_scope: source.scheme
  ace_mode: scheme
+  interpreters:
+  - nush

 NumPy:
  group: Python
@@ -2072,6 +2080,8 @@ Parrot Assembly:
  - pasm
  extensions:
  - .pasm
+  interpreters:
+  - parrot
  tm_scope: none
  ace_mode: none

@@ -2083,6 +2093,8 @@ Parrot Internal Representation:
  - pir
  extensions:
  - .pir
+  interpreters:
+  - parrot
  ace_mode: none

 Pascal:
@@ -2127,6 +2139,8 @@ Perl6:
  - .p6m
  - .pl6
  - .pm6
+  interpreters:
+  - perl6
  tm_scope: none
  ace_mode: perl

@@ -2197,6 +2211,8 @@ Prolog:
  - .ecl
  - .pro
  - .prolog
+  interpreters:
+  - swipl
  ace_mode: prolog

 Propeller Spin:
@@ -2266,6 +2282,8 @@ Python:
  - wscript
  interpreters:
  - python
+  - python2
+  - python3

 Python traceback:
  type: data
@@ -2288,6 +2306,8 @@ QMake:
  extensions:
  - .pro
  - .pri
+  interpreters:
+  - qmake
  ace_mode: none

 R:
@@ -2453,6 +2473,8 @@ Ruby:
  - .watchr
  interpreters:
  - ruby
+  - macruby
+  - rake
  filenames:
  - .pryrc
  - Appraisals
@@ -2545,6 +2567,8 @@ Scala:
  - .scala
  - .sbt
  - .sc
+  interpreters:
+  - scala

 Scaml:
  group: HTML
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -52,14 +52,16 @@ module Linguist
              })
            end
          else
+            path = File.join(dirname, filename)
+
            if File.extname(filename) == ""
-              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
+              raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            yield({
-              :path     => File.join(dirname, filename),
+              :path     => path,
              :language => category,
-              :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
+              :interpreter => Linguist.interpreter_from_shebang(File.read(path)),
              :extname  => File.extname(filename)
            })
          end
@@ -131,18 +133,19 @@ module Linguist

      script = script == 'env' ? tokens[1] : script

-      # "python2.6" -> "python"
-      if script =~ /((?:\d+\.?)+)/
-        script.sub! $1, ''
-      end
+      # An invalid shebang (e.g. `env` with no interpreter after it) can leave script nil
+      return unless script
+
+      # "python2.6" -> "python2"
+      script.sub! $1, '' if script =~ /(\.\d+)$/

      # Check for multiline shebang hacks that call `exec`
      if script == 'sh' &&
        lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
        script = $1
      end
-
-      script
+      
+      File.basename(script)
    else
      nil
    end