Merge branch 'master' of https://github.com/github/linguist

Conflicts: grammars.yml
2026-03-09 15:39:33 +00:00 · 2014-11-27 13:47:56 +01:00
parent 02db72515f 9f103abfb5
commit 43ee45d9b6
30 changed files with 1732 additions and 95 deletions
--- a/lib/linguist/file_blob.rb
+++ b/lib/linguist/file_blob.rb
@@ -57,14 +57,20 @@ module Linguist
    #
    # Returns a String.
    def extension
-      # File.extname returns nil if the filename is an extension.
-      extension = File.extname(name)
-      basename = File.basename(name)
-      # Checks if the filename is an extension.
-      if extension.empty? && basename[0] == "."
-        basename
-      else
-        extension
+      extensions.last || ""
+    end
+
+    # Public: Return an array of the file extensions
+    #
+    #     >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
+    #     => [".html.erb", ".erb"]
+    #
+    # Returns an Array
+    def extensions
+      basename, *segments = File.basename(name).split(".")
+
+      segments.map.with_index do |segment, index|
+        "." + segments[index..-1].join(".")
      end
    end
  end
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -106,40 +106,52 @@ module Linguist
      # A bit of an elegant hack. If the file is executable but extensionless,
      # append a "magic" extension so it can be classified with other
      # languages that have shebang scripts.
-      extension = FileBlob.new(name).extension
-      if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
+      extensions = FileBlob.new(name).extensions
+      if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
        name += ".script!"
      end

-      # First try to find languages that match based on filename.
+      # Find languages that match based on filename.
      possible_languages = find_by_filename(name)

-      # If there is more than one possible language with that extension (or no
-      # extension at all, in the case of extensionless scripts), we need to continue
-      # our detection work
-      if possible_languages.length > 1
-        data = blob.data
-        possible_language_names = possible_languages.map(&:name)
-        heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+      if possible_languages.length == 1
+        # Simplest and most common case, we can just return the one match based
+        # on extension
+        possible_languages.first

-        if heuristic_languages.size > 1
-          possible_language_names = heuristic_languages.map(&:name)
-        end
+      # If there is more than one possible language with that extension (or no
+      # extension at all, in the case of extensionless scripts), we need to
+      # continue our detection work
+      else
+        # Matches possible_languages.length == 0 || possible_languages.length > 1
+        data = blob.data

        # Check if there's a shebang line and use that as authoritative
        if (result = find_by_shebang(data)) && !result.empty?
-          result.first
-        # No shebang. Still more work to do. Try to find it with our heuristics.
-        elsif heuristic_languages.size == 1
-          heuristic_languages.first
-        # Lastly, fall back to the probabilistic classifier.
-        elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
-          # Return the actual Language object based of the string language name (i.e., first element of `#classify`)
-          Language[classified[0]]
+          return result.first
+
+        # More than one language with that extension. We need to make a choice.
+        elsif possible_languages.length > 1
+
+          # First try heuristics
+
+          possible_language_names = possible_languages.map(&:name)
+          heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
+
+          # If there are multiple possible languages returned from heuristics
+          # then reduce language candidates for Bayesian classifier here.
+          if heuristic_languages.size > 1
+            possible_language_names = heuristic_languages.map(&:name)
+          end
+
+          if heuristic_languages.size == 1
+            return heuristic_languages.first
+          # Lastly, fall back to the probabilistic classifier.
+          elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
+            # Return the actual Language object based on the string language name (i.e., first element of `#classify`)
+            return Language[classified[0]]
+          end
        end
-      else
-        # Simplest and most common case, we can just return the one match based on extension
-        possible_languages.first
      end
    end

@@ -190,8 +202,13 @@ module Linguist
    # Returns all matching Languages or [] if none were found.
    def self.find_by_filename(filename)
      basename = File.basename(filename)
-      extname = FileBlob.new(filename).extension
-      (@filename_index[basename] + find_by_extension(extname)).compact.uniq
+
+      # find the first extension with language definitions
+      extname = FileBlob.new(filename).extensions.detect do |e|
+        !@extension_index[e].empty?
+      end
+
+      (@filename_index[basename] + @extension_index[extname]).compact.uniq
    end

    # Public: Look up Languages by file extension.
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -470,6 +470,7 @@ CoffeeScript:
  extensions:
  - .coffee
  - ._coffee
+  - .cjsx
  - .cson
  - .iced
  filenames:
@@ -566,6 +567,8 @@ Crystal:
  - .cr
  ace_mode: ruby
  tm_scope: source.ruby
+  interpreters:
+  - crystal

 Cucumber:
  extensions:
@@ -743,6 +746,8 @@ Erlang:
  - .es
  - .escript
  - .hrl
+  interpreters:
+  - escript

 F#:
  type: programming
@@ -938,6 +943,8 @@ Gnuplot:
  - .gnuplot
  - .plot
  - .plt
+  interpreters:
+  - gnuplot

 Go:
  type: programming
@@ -1203,6 +1210,8 @@ Ioke:
  color: "#078193"
  extensions:
  - .ik
+  interpreters:
+  - ioke

 Isabelle:
  type: programming
@@ -1710,6 +1719,8 @@ Nu:
  filenames:
  - Nukefile
  tm_scope: source.scheme
+  interpreters:
+  - nush

 NumPy:
  group: Python
@@ -1896,6 +1907,8 @@ Parrot Assembly:
  - pasm
  extensions:
  - .pasm
+  interpreters:
+  - parrot
  tm_scope: none

 Parrot Internal Representation:
@@ -1906,6 +1919,8 @@ Parrot Internal Representation:
  - pir
  extensions:
  - .pir
+  interpreters:
+  - parrot

 Pascal:
  type: programming
@@ -1948,6 +1963,8 @@ Perl6:
  - .p6m
  - .pl6
  - .pm6
+  interpreters:
+  - perl6
  tm_scope: none

 PigLatin:
@@ -2012,6 +2029,8 @@ Prolog:
  - .ecl
  - .pro
  - .prolog
+  interpreters:
+  - swipl

 Propeller Spin:
  type: programming
@@ -2075,6 +2094,8 @@ Python:
  - wscript
  interpreters:
  - python
+  - python2
+  - python3

 Python traceback:
  type: data
@@ -2095,6 +2116,8 @@ QMake:
  extensions:
  - .pro
  - .pri
+  interpreters:
+  - qmake

 R:
  type: programming
@@ -2249,6 +2272,8 @@ Ruby:
  - .watchr
  interpreters:
  - ruby
+  - macruby
+  - rake
  filenames:
  - .pryrc
  - Appraisals
@@ -2335,6 +2360,8 @@ Scala:
  - .scala
  - .sbt
  - .sc
+  interpreters:
+  - scala

 Scaml:
  group: HTML
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -52,14 +52,16 @@ module Linguist
              })
            end
          else
+            path = File.join(dirname, filename)
+
            if File.extname(filename) == ""
-              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
+              raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            yield({
-              :path     => File.join(dirname, filename),
+              :path     => path,
              :language => category,
-              :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
+              :interpreter => Linguist.interpreter_from_shebang(File.read(path)),
              :extname  => File.extname(filename)
            })
          end
@@ -131,18 +133,19 @@ module Linguist

      script = script == 'env' ? tokens[1] : script

-      # "python2.6" -> "python"
-      if script =~ /((?:\d+\.?)+)/
-        script.sub! $1, ''
-      end
+      # If script has an invalid shebang, we might get here
+      return unless script
+
+      # "python2.6" -> "python2"
+      script.sub! $1, '' if script =~ /(\.\d+)$/

      # Check for multiline shebang hacks that call `exec`
      if script == 'sh' &&
        lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
        script = $1
      end
-
-      script
+      
+      File.basename(script)
    else
      nil
    end