language interpreters and shebang lines

Add an interpreter array to each language, and match interpreters found in the shebang lines of scripts to this array to identify the language of scripts. With suggestions from tnm. https://github.com/github/linguist/pull/687
2025-10-29 17:50:22 +00:00 · 2013-09-15 09:55:59 -06:00
parent eb5f1468d2
commit 7a6202a8c3
2 changed files with 84 additions and 0 deletions
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -52,6 +52,7 @@ module Linguist
            yield({
              :path     => File.join(dirname, filename),
              :language => category,
+              :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
              :extname  => File.extname(filename)
            })
          end
@@ -67,6 +68,7 @@ module Linguist
    def self.data
      db = {}
      db['extnames'] = {}
+      db['interpreters'] = {}
      db['filenames'] = {}

      each do |sample|
@@ -80,6 +82,14 @@ module Linguist
          end
        end

+        if sample[:interpreter]
+          db['interpreters'][language_name] ||= []
+          if !db['interpreters'][language_name].include?(sample[:interpreter])
+            db['interpreters'][language_name] << sample[:interpreter]
+            db['interpreters'][language_name].sort!
+          end
+        end
+
        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
@@ -95,4 +105,32 @@ module Linguist
      db
    end
  end
+
+  # Used to retrieve the interpreter name from the shebang line of a
+  # file's data.
+  #
+  # data - String contents of the file; only its first line is read.
+  #
+  # Returns the String interpreter name with any version digits removed
+  # ("#!/usr/bin/env python2.6" -> "python"), or nil when the first line
+  # is not a shebang or names no interpreter.
+  def self.interpreter_from_shebang(data)
+    bang = data.lines.first
+    return unless bang && bang.start_with?('#!')
+
+    # Drop "#!" and any padding after it ("#!  /bin/sh"), then split
+    # the command from its arguments.
+    tokens = bang.sub(/\A#!\s*/, '').split(' ')
+    return if tokens.empty?
+
+    script = tokens.first.split('/').last
+    # "#!/usr/bin/env ruby": the interpreter is env's first argument.
+    script = tokens[1] if script == 'env'
+    return if script.nil?
+
+    # "python2.6" -> "python"
+    script = script.sub(/(?:\d+\.?)+/, '')
+    script.empty? ? nil : script
+  end
+
 end