diff --git a/lib/linguist.rb b/lib/linguist.rb index 3714b5a0..ff9fc3a2 100644 --- a/lib/linguist.rb +++ b/lib/linguist.rb @@ -4,4 +4,5 @@ require 'linguist/heuristics' require 'linguist/language' require 'linguist/repository' require 'linguist/samples' +require 'linguist/shebang' require 'linguist/version' diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 2353edd1..78aee90b 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -11,7 +11,7 @@ require 'linguist/samples' require 'linguist/file_blob' require 'linguist/blob_helper' require 'linguist/strategy/filename' -require 'linguist/strategy/shebang' +require 'linguist/shebang' module Linguist # Language names that are recognizable by GitHub. Defined languages @@ -95,7 +95,7 @@ module Linguist STRATEGIES = [ Linguist::Strategy::Filename, - Linguist::Strategy::Shebang, + Linguist::Shebang, Linguist::Heuristics, Linguist::Classifier ] @@ -199,20 +199,26 @@ module Linguist @extension_index[extname] end - # Public: Look up Languages by shebang line. + # DEPRECATED + def self.find_by_shebang(data) + @interpreter_index[Shebang.interpreter(data)] + end + + # Public: Look up Languages by interpreter. # - # data - Array of tokens or String data to analyze. + # interpreter - String of interpreter name # # Examples # - # Language.find_by_shebang("#!/bin/bash\ndate;") + # Language.find_by_interpreter("bash") # # => [#] # # Returns the matching Language - def self.find_by_shebang(data) - @interpreter_index[Linguist.interpreter_from_shebang(data)] + def self.find_by_interpreter(interpreter) + @interpreter_index[interpreter] end + # Public: Look up Language by its name or lexer. # # name - The String name of the Language diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index 001204b5..2d389443 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -6,6 +6,7 @@ end require 'linguist/md5' require 'linguist/classifier' +require 'linguist/shebang' module Linguist # Model for accessing classifier training data. @@ -61,7 +62,7 @@ module Linguist yield({ :path => path, :language => category, - :interpreter => Linguist.interpreter_from_shebang(File.read(path)), + :interpreter => Shebang.interpreter(File.read(path)), :extname => File.extname(filename) }) end @@ -114,41 +115,4 @@ module Linguist db end end - - # Used to retrieve the interpreter from the shebang line of a file's - # data. - def self.interpreter_from_shebang(data) - lines = data.lines.to_a - - if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/ - bang.sub!(/^#! /, '#!') - tokens = bang.split(' ') - pieces = tokens.first.split('/') - - if pieces.size > 1 - script = pieces.last - else - script = pieces.first.sub('#!', '') - end - - script = script == 'env' ? tokens[1] : script - - # If script has an invalid shebang, we might get here - return unless script - - # "python2.6" -> "python2" - script.sub! $1, '' if script =~ /(\.\d+)$/ - - # Check for multiline shebang hacks that call `exec` - if script == 'sh' && - lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } - script = $1 - end - - File.basename(script) - else - nil - end - end - end diff --git a/lib/linguist/shebang.rb b/lib/linguist/shebang.rb new file mode 100644 index 00000000..4ed11412 --- /dev/null +++ b/lib/linguist/shebang.rb @@ -0,0 +1,44 @@ +module Linguist + class Shebang + # Public: Use shebang to detect language of the blob. + # + # blob - An object that quacks like a blob. + # + # Examples + # + # Shebang.call(FileBlob.new("path/to/file")) + # + # Returns an Array with one Language if the blob has a shebang with a valid + # interpreter, or empty if there is no shebang. + def self.call(blob, _ = nil) + Language.find_by_interpreter interpreter(blob.data) + end + + # Public: Get the interpreter from the shebang + # + # Returns a String or nil + def self.interpreter(data) + lines = data.lines + return unless match = /^#! ?(.*)$/.match(lines.first) + + tokens = match[1].split(' ') + script = tokens.first.split('/').last + + script = tokens[1] if script == 'env' + + # If script has an invalid shebang, we might get here + return unless script + + # "python2.6" -> "python2" + script.sub! $1, '' if script =~ /(\.\d+)$/ + + # Check for multiline shebang hacks that call `exec` + if script == 'sh' && + lines.first(5).any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } + script = $1 + end + + File.basename(script) + end + end +end diff --git a/lib/linguist/strategy/shebang.rb b/lib/linguist/strategy/shebang.rb deleted file mode 100644 index dd5bc38b..00000000 --- a/lib/linguist/strategy/shebang.rb +++ /dev/null @@ -1,10 +0,0 @@ -module Linguist - module Strategy - # Check if there's a shebang line and use that as authoritative - class Shebang - def self.call(blob, _) - Language.find_by_shebang(blob.data) - end - end - end -end diff --git a/test/test_language.rb b/test/test_language.rb index c5c5255f..9a7498d1 100644 --- a/test/test_language.rb +++ b/test/test_language.rb @@ -223,34 +223,21 @@ class TestLanguage < Test::Unit::TestCase assert_equal [Language['Chapel']], Language.find_by_filename('examples/hello.chpl') end - def test_find_by_shebang - assert_equal 'ruby', Linguist.interpreter_from_shebang("#!/usr/bin/ruby\n# baz") - { [] => ["", - "foo", - "#bar", - "#baz", - "///", - "\n\n\n\n\n", - " #!/usr/sbin/ruby", - "\n#!/usr/sbin/ruby"], - ['Ruby'] => ["#!/usr/bin/env ruby\n# baz", - "#!/usr/sbin/ruby\n# bar", - "#!/usr/bin/ruby\n# foo", - "#!/usr/sbin/ruby", - "#!/usr/sbin/ruby foo bar baz\n"], - ['R'] => ["#!/usr/bin/env Rscript\n# example R script\n#\n"], - ['Shell'] => ["#!/usr/bin/bash\n", "#!/bin/sh"], - ['Python'] => ["#!/bin/python\n# foo\n# bar\n# baz", - "#!/usr/bin/python2.7\n\n\n\n", - "#!/usr/bin/python3\n\n\n\n"], - ["Common Lisp"] => ["#!/usr/bin/sbcl --script\n\n"] - }.each do |languages, bodies| - bodies.each do |body| - assert_equal([body, languages.map{|l| Language[l]}], - [body, Language.find_by_shebang(body)]) - - end + def test_find_by_interpreter + { + "ruby" => "Ruby", + "Rscript" => "R", + "sh" => "Shell", + "bash" => "Shell", + "python" => "Python", + "python2" => "Python", + "python3" => "Python", + "sbcl" => "Common Lisp" + }.each do |interpreter, language| + assert_equal [Language[language]], Language.find_by_interpreter(interpreter) end + + assert_equal [], Language.find_by_interpreter(nil) end def test_find diff --git a/test/test_samples.rb b/test/test_samples.rb index 06ede379..f47244ed 100644 --- a/test/test_samples.rb +++ b/test/test_samples.rb @@ -82,9 +82,4 @@ class TestSamples < Test::Unit::TestCase end end end - - def test_shebang - assert_equal "crystal", Linguist.interpreter_from_shebang("#!/usr/bin/env bin/crystal") - assert_equal "python2", Linguist.interpreter_from_shebang("#!/usr/bin/python2.4") - end end diff --git a/test/test_shebang.rb b/test/test_shebang.rb new file mode 100644 index 00000000..b359a771 --- /dev/null +++ b/test/test_shebang.rb @@ -0,0 +1,38 @@ +require_relative "./helper" + +class TestShebang < Test::Unit::TestCase + include Linguist + + def assert_interpreter(interpreter, body) + assert_equal interpreter, Shebang.interpreter(body) + end + + def test_shebangs + assert_interpreter nil, "" + assert_interpreter nil, "foo" + assert_interpreter nil, "#bar" + assert_interpreter nil, "#baz" + assert_interpreter nil, "///" + assert_interpreter nil, "\n\n\n\n\n" + assert_interpreter nil, " #!/usr/sbin/ruby" + assert_interpreter nil, "\n#!/usr/sbin/ruby" + + assert_interpreter "ruby", "#!/usr/sbin/ruby\n# bar" + assert_interpreter "ruby", "#!/usr/bin/ruby\n# foo" + assert_interpreter "ruby", "#!/usr/sbin/ruby" + assert_interpreter "ruby", "#!/usr/sbin/ruby foo bar baz\n" + + assert_interpreter "Rscript", "#!/usr/bin/env Rscript\n# example R script\n#\n" + assert_interpreter "crystal", "#!/usr/bin/env bin/crystal" + assert_interpreter "ruby", "#!/usr/bin/env ruby\n# baz" + + assert_interpreter "bash", "#!/usr/bin/bash\n" + assert_interpreter "sh", "#!/bin/sh" + assert_interpreter "python", "#!/bin/python\n# foo\n# bar\n# baz" + assert_interpreter "python2", "#!/usr/bin/python2.7\n\n\n\n" + assert_interpreter "python3", "#!/usr/bin/python3\n\n\n\n" + assert_interpreter "sbcl", "#!/usr/bin/sbcl --script\n\n" + assert_interpreter "perl", "#! perl" + end + +end