Move shebang detection into classifier

Fixes #203
2025-12-08 20:38:47 +00:00 · 2012-08-03 15:07:36 -05:00
parent fbbaff09cd
commit 16a67cb852
24 changed files with 178 additions and 275 deletions
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -1,3 +1,5 @@
+require 'strscan'
+
 module Linguist
  # Generic programming language tokenizer.
  #
@@ -50,8 +52,13 @@ module Linguist

      tokens = []
      until s.eos?
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
        # Single line comment
-        if token = s.scan(START_SINGLE_LINE_COMMENT)
+        elsif token = s.scan(START_SINGLE_LINE_COMMENT)
          tokens << token.strip
          s.skip_until(/\n|\Z/)

@@ -103,6 +110,33 @@ module Linguist
      tokens
    end

+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    # Returns String token or nil if it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0]
+        return script
+      end
+
+      nil
+    end
+
    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.