From 7a6202a8c3d667f3476839687f00cfc47c56703a Mon Sep 17 00:00:00 2001 From: Eric Schulte Date: Sun, 15 Sep 2013 09:55:59 -0600 Subject: [PATCH] language interpreters and shebang lines Add an interpreter array to each language, and match interpreters found in the shebang lines of scripts to this array to identify the language of scripts. With suggestions from tnm. https://github.com/github/linguist/pull/687 --- lib/linguist/language.rb | 46 ++++++++++++++++++++++++++++++++++++++++ lib/linguist/samples.rb | 38 +++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/lib/linguist/language.rb b/lib/linguist/language.rb index a8dc8f9f..487f2724 100644 --- a/lib/linguist/language.rb +++ b/lib/linguist/language.rb @@ -17,6 +17,7 @@ module Linguist @alias_index = {} @extension_index = Hash.new { |h,k| h[k] = [] } + @interpreter_index = Hash.new { |h,k| h[k] = [] } @filename_index = Hash.new { |h,k| h[k] = [] } @primary_extension_index = {} @@ -71,6 +72,10 @@ module Linguist @primary_extension_index[language.primary_extension] = language + language.interpreters.each do |interpreter| + @interpreter_index[interpreter] << language + end + language.filenames.each do |filename| @filename_index[filename] << language end @@ -101,6 +106,8 @@ module Linguist data = data.call() if data.respond_to?(:call) if data.nil? || data == "" nil + elsif result = find_by_shebang(data) + result.first elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first Language[result[0]] end @@ -162,6 +169,20 @@ module Linguist langs.compact.uniq end + # Public: Look up Languages by shebang line. + # + # data - Array of tokens or String data to analyze. + # + # Examples + # + # Language.find_by_shebang("#!/bin/bash\ndate;") + # # => [#] + # + # Returns the matching Language + def self.find_by_shebang(data) + @interpreter_index[Linguist.interpreter_from_shebang(data)] + end + # Public: Look up Language by its name or lexer. # # name - The String name of the Language @@ -247,6 +268,7 @@ module Linguist # Set extensions or default to []. @extensions = attributes[:extensions] || [] + @interpreters = attributes[:interpreters] || [] @filenames = attributes[:filenames] || [] unless @primary_extension = attributes[:primary_extension] @@ -359,6 +381,15 @@ module Linguist # Returns the extension String. attr_reader :primary_extension + # Public: Get interpreters + # + # Examples + # + # # => ['awk', 'gawk', 'mawk' ...] + # + # Returns the interpreters Array + attr_reader :interpreters + # Public: Get filenames # # Examples @@ -452,11 +483,13 @@ module Linguist end extensions = Samples::DATA['extnames'] + interpreters = Samples::DATA['interpreters'] filenames = Samples::DATA['filenames'] popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__)) YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options| options['extensions'] ||= [] + options['interpreters'] ||= [] options['filenames'] ||= [] if extnames = extensions[name] @@ -467,6 +500,18 @@ module Linguist end end + if interpreters == nil + interpreters = {} + end + + if interpreter_names = interpreters[name] + interpreter_names.each do |interpreter| + if !options['interpreters'].include?(interpreter) + options['interpreters'] << interpreter + end + end + end + if fns = filenames[name] fns.each do |filename| if !options['filenames'].include?(filename) @@ -487,6 +532,7 @@ module Linguist :searchable => options.key?('searchable') ? options['searchable'] : true, :search_term => options['search_term'], :extensions => options['extensions'].sort, + :interpreters => options['interpreters'].sort, :primary_extension => options['primary_extension'], :filenames => options['filenames'], :popular => popular.include?(name) diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index d9099385..27ba218e 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -52,6 +52,7 @@ module Linguist yield({ :path => File.join(dirname, filename), :language => category, + :interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil, :extname => File.extname(filename) }) end @@ -67,6 +68,7 @@ module Linguist def self.data db = {} db['extnames'] = {} + db['interpreters'] = {} db['filenames'] = {} each do |sample| @@ -80,6 +82,14 @@ module Linguist end end + if sample[:interpreter] + db['interpreters'][language_name] ||= [] + if !db['interpreters'][language_name].include?(sample[:interpreter]) + db['interpreters'][language_name] << sample[:interpreter] + db['interpreters'][language_name].sort! + end + end + if sample[:filename] db['filenames'][language_name] ||= [] db['filenames'][language_name] << sample[:filename] @@ -95,4 +105,32 @@ module Linguist db end end + + # Used to retrieve the interpreter from the shebang line of a file's + # data. + def self.interpreter_from_shebang(data) + lines = data.lines + + if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/ + bang.sub!(/^#! /, '#!') + tokens = bang.split(' ') + pieces = tokens.first.split('/') + + if pieces.size > 1 + script = pieces.last + else + script = pieces.first.sub('#!', '') + end + + script = script == 'env' ? tokens[1] : script + + # "python2.6" -> "python" + if script =~ /((?:\d+\.?)+)/ + script.sub! $1, '' + end + + script + end + end + end