From fd85f7f112bd231d338fbcb6cd834a3f889da370 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 12:18:23 -0500 Subject: [PATCH 01/10] consolidate shebang logic --- lib/linguist.rb | 1 + lib/linguist/language.rb | 19 +++++++++++-- lib/linguist/samples.rb | 35 ++--------------------- lib/linguist/shebang.rb | 49 ++++++++++++++++++++++++++++++++ lib/linguist/strategy/shebang.rb | 10 ------- 5 files changed, 69 insertions(+), 45 deletions(-) create mode 100644 lib/linguist/shebang.rb delete mode 100644 lib/linguist/strategy/shebang.rb diff --git a/lib/linguist.rb b/lib/linguist.rb index 3714b5a0..ff9fc3a2 100644 --- a/lib/linguist.rb +++ b/lib/linguist.rb @@ -4,4 +4,5 @@ require 'linguist/heuristics' require 'linguist/language' require 'linguist/repository' require 'linguist/samples' +require 'linguist/shebang' require 'linguist/version' diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 07972019..7c51ae9d 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -11,7 +11,7 @@ require 'linguist/samples' require 'linguist/file_blob' require 'linguist/blob_helper' require 'linguist/strategy/filename' -require 'linguist/strategy/shebang' +require 'linguist/shebang' module Linguist # Language names that are recognizable by GitHub. Defined languages @@ -95,7 +95,7 @@ module Linguist STRATEGIES = [ Linguist::Strategy::Filename, - Linguist::Strategy::Shebang, + Linguist::Shebang, Linguist::Heuristics, Linguist::Classifier ] @@ -213,6 +213,21 @@ module Linguist @interpreter_index[Linguist.interpreter_from_shebang(data)] end + # Public: Look up Languages by interpreter. + # + # interpreter - String of interpreter name + # + # Examples + # + # Language.find_by_interpreter("bash") + # # => [#] + # + # Returns the matching Language + def self.find_by_interpreter(interpreter) + @interpreter_index[interpreter] + end + + # Public: Look up Language by its name or lexer. # # name - The String name of the Language diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index 001204b5..1cacdf09 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -115,40 +115,9 @@ module Linguist end end - # Used to retrieve the interpreter from the shebang line of a file's - # data. + # Used to retrieve the interpreter from the shebang line of a file's data. def self.interpreter_from_shebang(data) - lines = data.lines.to_a - - if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/ - bang.sub!(/^#! /, '#!') - tokens = bang.split(' ') - pieces = tokens.first.split('/') - - if pieces.size > 1 - script = pieces.last - else - script = pieces.first.sub('#!', '') - end - - script = script == 'env' ? tokens[1] : script - - # If script has an invalid shebang, we might get here - return unless script - - # "python2.6" -> "python2" - script.sub! $1, '' if script =~ /(\.\d+)$/ - - # Check for multiline shebang hacks that call `exec` - if script == 'sh' && - lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } - script = $1 - end - - File.basename(script) - else - nil - end + Shebang.new(data).interpreter end end diff --git a/lib/linguist/shebang.rb b/lib/linguist/shebang.rb new file mode 100644 index 00000000..52933e2b --- /dev/null +++ b/lib/linguist/shebang.rb @@ -0,0 +1,49 @@ +module Linguist + # Check if there's a shebang line and use that as authoritative + class Shebang + def self.call(blob, _) + Language.find_by_interpreter(new(blob.data).interpreter) + end + + attr_reader :data + + def initialize(data) + @data = data + end + + def interpreter + lines = data.lines.to_a + + if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/ + bang.sub!(/^#! /, '#!') + tokens = bang.split(' ') + pieces = tokens.first.split('/') + + if pieces.size > 1 + script = pieces.last + else + script = pieces.first.sub('#!', '') + end + + script = script == 'env' ? tokens[1] : script + + # If script has an invalid shebang, we might get here + return unless script + + # "python2.6" -> "python2" + script.sub! $1, '' if script =~ /(\.\d+)$/ + + # Check for multiline shebang hacks that call `exec` + if script == 'sh' && + lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } + script = $1 + end + + File.basename(script) + else + nil + end + + end + end +end diff --git a/lib/linguist/strategy/shebang.rb b/lib/linguist/strategy/shebang.rb deleted file mode 100644 index dd5bc38b..00000000 --- a/lib/linguist/strategy/shebang.rb +++ /dev/null @@ -1,10 +0,0 @@ -module Linguist - module Strategy - # Check if there's a shebang line and use that as authoritative - class Shebang - def self.call(blob, _) - Language.find_by_shebang(blob.data) - end - end - end -end From cd3defda427f261ec7bb1924d0a77f799c48f755 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 12:44:55 -0500 Subject: [PATCH 02/10] Simplify shebang detection --- lib/linguist/shebang.rb | 42 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/lib/linguist/shebang.rb b/lib/linguist/shebang.rb index 52933e2b..b0152326 100644 --- a/lib/linguist/shebang.rb +++ b/lib/linguist/shebang.rb @@ -5,45 +5,31 @@ module Linguist Language.find_by_interpreter(new(blob.data).interpreter) end - attr_reader :data - def initialize(data) - @data = data + @lines = data.lines end def interpreter - lines = data.lines.to_a + return unless match = /^#! ?(.*)$/.match(@lines.first) - if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/ - bang.sub!(/^#! /, '#!') - tokens = bang.split(' ') - pieces = tokens.first.split('/') + tokens = match[0].split(' ') + script = tokens.first.split('/').last - if pieces.size > 1 - script = pieces.last - else - script = pieces.first.sub('#!', '') - end + script = tokens[1] if script == 'env' - script = script == 'env' ? tokens[1] : script + # If script has an invalid shebang, we might get here + return unless script - # If script has an invalid shebang, we might get here - return unless script + # "python2.6" -> "python2" + script.sub! $1, '' if script =~ /(\.\d+)$/ - # "python2.6" -> "python2" - script.sub! $1, '' if script =~ /(\.\d+)$/ - - # Check for multiline shebang hacks that call `exec` - if script == 'sh' && - lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } - script = $1 - end - - File.basename(script) - else - nil + # Check for multiline shebang hacks that call `exec` + if script == 'sh' && + @lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } + script = $1 end + File.basename(script) end end end From 434ab9f2c0f39ccb4b6b6b03477d84274e679d99 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 13:09:05 -0500 Subject: [PATCH 03/10] Add tests for shebangs --- test/test_samples.rb | 5 ----- test/test_shebang.rb | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 5 deletions(-) create mode 100644 test/test_shebang.rb diff --git a/test/test_samples.rb b/test/test_samples.rb index 06ede379..f47244ed 100644 --- a/test/test_samples.rb +++ b/test/test_samples.rb @@ -82,9 +82,4 @@ class TestSamples < Test::Unit::TestCase end end end - - def test_shebang - assert_equal "crystal", Linguist.interpreter_from_shebang("#!/usr/bin/env bin/crystal") - assert_equal "python2", Linguist.interpreter_from_shebang("#!/usr/bin/python2.4") - end end diff --git a/test/test_shebang.rb b/test/test_shebang.rb new file mode 100644 index 00000000..290dc14f --- /dev/null +++ b/test/test_shebang.rb @@ -0,0 +1,37 @@ +require_relative "./helper" + +class TestShebang < Test::Unit::TestCase + include Linguist + + def assert_interpreter(interpreter, body) + assert_equal interpreter, Shebang.new(body).interpreter + end + + def test_shebangs + assert_interpreter nil, "" + assert_interpreter nil, "foo" + assert_interpreter nil, "#bar" + assert_interpreter nil, "#baz" + assert_interpreter nil, "///" + assert_interpreter nil, "\n\n\n\n\n" + assert_interpreter nil, " #!/usr/sbin/ruby" + assert_interpreter nil, "\n#!/usr/sbin/ruby" + + assert_interpreter "ruby", "#!/usr/sbin/ruby\n# bar", + assert_interpreter "ruby", "#!/usr/bin/ruby\n# foo", + assert_interpreter "ruby", "#!/usr/sbin/ruby", + assert_interpreter "ruby", "#!/usr/sbin/ruby foo bar baz\n" + + assert_interpreter "Rscript", "#!/usr/bin/env Rscript\n# example R script\n#\n" + assert_interpreter "crystal", "#!/usr/bin/env bin/crystal" + assert_interpreter "ruby", "#!/usr/bin/env ruby\n# baz", + + assert_interpreter "bash", "#!/usr/bin/bash\n" + assert_interpreter "sh", "#!/bin/sh" + assert_interpreter "python", "#!/bin/python\n# foo\n# bar\n# baz" + assert_interpreter "python2", "#!/usr/bin/python2.7\n\n\n\n" + assert_interpreter "python3", "#!/usr/bin/python3\n\n\n\n" + assert_interpreter "sbcl", "#!/usr/bin/sbcl --script\n\n" + end + +end From ffe2ccf1f614d2114c11e3f9e5b4271cc3749996 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 13:17:28 -0500 Subject: [PATCH 04/10] Don't bother creating an instance --- lib/linguist/samples.rb | 2 +- lib/linguist/shebang.rb | 13 +++++-------- test/test_shebang.rb | 10 +++++----- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index 1cacdf09..fa3655ce 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -117,7 +117,7 @@ module Linguist # Used to retrieve the interpreter from the shebang line of a file's data. def self.interpreter_from_shebang(data) - Shebang.new(data).interpreter + Shebang.interpreter(data) end end diff --git a/lib/linguist/shebang.rb b/lib/linguist/shebang.rb index b0152326..6181d98f 100644 --- a/lib/linguist/shebang.rb +++ b/lib/linguist/shebang.rb @@ -2,15 +2,12 @@ module Linguist # Check if there's a shebang line and use that as authoritative class Shebang def self.call(blob, _) - Language.find_by_interpreter(new(blob.data).interpreter) + Language.find_by_interpreter interpreter(blob.data) end - def initialize(data) - @lines = data.lines - end - - def interpreter - return unless match = /^#! ?(.*)$/.match(@lines.first) + def self.interpreter(data) + lines = data.lines + return unless match = /^#! ?(.*)$/.match(lines.first) tokens = match[0].split(' ') script = tokens.first.split('/').last @@ -25,7 +22,7 @@ module Linguist # Check for multiline shebang hacks that call `exec` if script == 'sh' && - @lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } + lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } script = $1 end diff --git a/test/test_shebang.rb b/test/test_shebang.rb index 290dc14f..aa3218f3 100644 --- a/test/test_shebang.rb +++ b/test/test_shebang.rb @@ -4,7 +4,7 @@ class TestShebang < Test::Unit::TestCase include Linguist def assert_interpreter(interpreter, body) - assert_equal interpreter, Shebang.new(body).interpreter + assert_equal interpreter, Shebang.interpreter(body) end def test_shebangs @@ -17,14 +17,14 @@ class TestShebang < Test::Unit::TestCase assert_interpreter nil, " #!/usr/sbin/ruby" assert_interpreter nil, "\n#!/usr/sbin/ruby" - assert_interpreter "ruby", "#!/usr/sbin/ruby\n# bar", - assert_interpreter "ruby", "#!/usr/bin/ruby\n# foo", - assert_interpreter "ruby", "#!/usr/sbin/ruby", + assert_interpreter "ruby", "#!/usr/sbin/ruby\n# bar" + assert_interpreter "ruby", "#!/usr/bin/ruby\n# foo" + assert_interpreter "ruby", "#!/usr/sbin/ruby" assert_interpreter "ruby", "#!/usr/sbin/ruby foo bar baz\n" assert_interpreter "Rscript", "#!/usr/bin/env Rscript\n# example R script\n#\n" assert_interpreter "crystal", "#!/usr/bin/env bin/crystal" - assert_interpreter "ruby", "#!/usr/bin/env ruby\n# baz", + assert_interpreter "ruby", "#!/usr/bin/env ruby\n# baz" assert_interpreter "bash", "#!/usr/bin/bash\n" assert_interpreter "sh", "#!/bin/sh" From 9020d7c04465acfdf1057b0602483c967486e7ad Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Thu, 27 Nov 2014 13:18:51 -0500 Subject: [PATCH 05/10] Deprecate find_by_shebang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This class doesn’t need to know about shebangs. --- lib/linguist/language.rb | 13 ++----------- test/test_language.rb | 41 ++++++++++++++-------------------------- 2 files changed, 16 insertions(+), 38 deletions(-) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index 7c51ae9d..75e25606 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -199,18 +199,9 @@ module Linguist @extension_index[extname] end - # Public: Look up Languages by shebang line. - # - # data - Array of tokens or String data to analyze. - # - # Examples - # - # Language.find_by_shebang("#!/bin/bash\ndate;") - # # => [#] - # - # Returns the matching Language + # DEPRECATED def self.find_by_shebang(data) - @interpreter_index[Linguist.interpreter_from_shebang(data)] + @interpreter_index[Shebang.interpreter(data)] end # Public: Look up Languages by interpreter. diff --git a/test/test_language.rb b/test/test_language.rb index c5c5255f..9a7498d1 100644 --- a/test/test_language.rb +++ b/test/test_language.rb @@ -223,34 +223,21 @@ class TestLanguage < Test::Unit::TestCase assert_equal [Language['Chapel']], Language.find_by_filename('examples/hello.chpl') end - def test_find_by_shebang - assert_equal 'ruby', Linguist.interpreter_from_shebang("#!/usr/bin/ruby\n# baz") - { [] => ["", - "foo", - "#bar", - "#baz", - "///", - "\n\n\n\n\n", - " #!/usr/sbin/ruby", - "\n#!/usr/sbin/ruby"], - ['Ruby'] => ["#!/usr/bin/env ruby\n# baz", - "#!/usr/sbin/ruby\n# bar", - "#!/usr/bin/ruby\n# foo", - "#!/usr/sbin/ruby", - "#!/usr/sbin/ruby foo bar baz\n"], - ['R'] => ["#!/usr/bin/env Rscript\n# example R script\n#\n"], - ['Shell'] => ["#!/usr/bin/bash\n", "#!/bin/sh"], - ['Python'] => ["#!/bin/python\n# foo\n# bar\n# baz", - "#!/usr/bin/python2.7\n\n\n\n", - "#!/usr/bin/python3\n\n\n\n"], - ["Common Lisp"] => ["#!/usr/bin/sbcl --script\n\n"] - }.each do |languages, bodies| - bodies.each do |body| - assert_equal([body, languages.map{|l| Language[l]}], - [body, Language.find_by_shebang(body)]) - - end + def test_find_by_interpreter + { + "ruby" => "Ruby", + "Rscript" => "R", + "sh" => "Shell", + "bash" => "Shell", + "python" => "Python", + "python2" => "Python", + "python3" => "Python", + "sbcl" => "Common Lisp" + }.each do |interpreter, language| + assert_equal [Language[language]], Language.find_by_interpreter(interpreter) end + + assert_equal [], Language.find_by_interpreter(nil) end def test_find From c05717d15ce8fb7f9d8bfef5e1f101ef487bd0f3 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 12:27:48 -0600 Subject: [PATCH 06/10] docs --- lib/linguist/shebang.rb | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/lib/linguist/shebang.rb b/lib/linguist/shebang.rb index 6181d98f..7fa021cc 100644 --- a/lib/linguist/shebang.rb +++ b/lib/linguist/shebang.rb @@ -1,10 +1,22 @@ module Linguist - # Check if there's a shebang line and use that as authoritative class Shebang - def self.call(blob, _) + # Public: Use shebang to detect language of the blob. + # + # blob - An object that quacks like a blob. + # + # Examples + # + # Shebang.call(FileBlob.new("path/to/file")) + # + # Returns an Array with one Language if the blob has a shebang with a valid + # interpreter, or empty if there is no shebang. + def self.call(blob, _ = nil) Language.find_by_interpreter interpreter(blob.data) end + # Public: Get the interpreter from the shebang + # + # Returns a String or nil def self.interpreter(data) lines = data.lines return unless match = /^#! ?(.*)$/.match(lines.first) From 88f08803ee5704be77c230560c9f362a639e0b27 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 12:34:41 -0600 Subject: [PATCH 07/10] require shebang when building samples --- lib/linguist/samples.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index fa3655ce..3f4f7ec4 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -6,6 +6,7 @@ end require 'linguist/md5' require 'linguist/classifier' +require 'linguist/shebang' module Linguist # Model for accessing classifier training data. @@ -61,7 +62,7 @@ module Linguist yield({ :path => path, :language => category, - :interpreter => Linguist.interpreter_from_shebang(File.read(path)), + :interpreter => Shebang.interpreter(File.read(path)), :extname => File.extname(filename) }) end From 47b739527a1d5a69b833691e66870ffcf4f2acbc Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 13:55:55 -0600 Subject: [PATCH 08/10] Treat lines as enumerator and not array --- lib/linguist/shebang.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/linguist/shebang.rb b/lib/linguist/shebang.rb index 7fa021cc..55481f69 100644 --- a/lib/linguist/shebang.rb +++ b/lib/linguist/shebang.rb @@ -34,7 +34,7 @@ module Linguist # Check for multiline shebang hacks that call `exec` if script == 'sh' && - lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } + lines.first(5).any? { |l| l.match(/exec (\w+).+\$0.+\$@/) } script = $1 end From 2517650ecb7e210211257ae262ca3c32932c218b Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 14:14:10 -0600 Subject: [PATCH 09/10] Fix shebang without path --- lib/linguist/shebang.rb | 2 +- test/test_shebang.rb | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/linguist/shebang.rb b/lib/linguist/shebang.rb index 55481f69..4ed11412 100644 --- a/lib/linguist/shebang.rb +++ b/lib/linguist/shebang.rb @@ -21,7 +21,7 @@ module Linguist lines = data.lines return unless match = /^#! ?(.*)$/.match(lines.first) - tokens = match[0].split(' ') + tokens = match[1].split(' ') script = tokens.first.split('/').last script = tokens[1] if script == 'env' diff --git a/test/test_shebang.rb b/test/test_shebang.rb index aa3218f3..b359a771 100644 --- a/test/test_shebang.rb +++ b/test/test_shebang.rb @@ -32,6 +32,7 @@ class TestShebang < Test::Unit::TestCase assert_interpreter "python2", "#!/usr/bin/python2.7\n\n\n\n" assert_interpreter "python3", "#!/usr/bin/python3\n\n\n\n" assert_interpreter "sbcl", "#!/usr/bin/sbcl --script\n\n" + assert_interpreter "perl", "#! perl" end end From 10de952ed6ceddda513b25d35c46766d5e522f67 Mon Sep 17 00:00:00 2001 From: Brandon Keepers Date: Fri, 28 Nov 2014 14:14:40 -0600 Subject: [PATCH 10/10] Remove Linguist.interpreter_from_shebang --- lib/linguist/samples.rb | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index 3f4f7ec4..2d389443 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -115,10 +115,4 @@ module Linguist db end end - - # Used to retrieve the interpreter from the shebang line of a file's data. - def self.interpreter_from_shebang(data) - Shebang.interpreter(data) - end - end