# Public: Look up Languages by shebang line.
#
# data - String data to analyze.
#
# Examples
#
#   Language.find_by_shebang("#!/bin/bash\ndate;")
#   # => Array of the Languages registered for the "bash" interpreter
#
#   Language.find_by_shebang("no shebang here")
#   # => []
#
# Returns an Array of matching Languages. The Array is empty when the data
# has no shebang or the interpreter is unknown. An empty Array is still
# truthy in Ruby, so callers must test it with `empty?` rather than using
# the result directly as a condition.
def self.find_by_shebang(data)
  interpreter = Linguist.interpreter_from_shebang(data)

  # fetch bypasses the Hash default block, so looking up an unknown (or nil)
  # interpreter does not add a new empty entry to the index.
  @interpreter_index.fetch(interpreter) { [] }
end
+ # + # Returns the interpreters Array + attr_reader :interpreters + # Public: Get filenames # # Examples @@ -452,11 +483,13 @@ module Linguist end extensions = Samples::DATA['extnames'] + interpreters = Samples::DATA['interpreters'] filenames = Samples::DATA['filenames'] popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__)) YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options| options['extensions'] ||= [] + options['interpreters'] ||= [] options['filenames'] ||= [] if extnames = extensions[name] @@ -467,6 +500,18 @@ module Linguist end end + if interpreters == nil + interpreters = {} + end + + if interpreter_names = interpreters[name] + interpreter_names.each do |interpreter| + if !options['interpreters'].include?(interpreter) + options['interpreters'] << interpreter + end + end + end + if fns = filenames[name] fns.each do |filename| if !options['filenames'].include?(filename) @@ -487,6 +532,7 @@ module Linguist :searchable => options.key?('searchable') ? options['searchable'] : true, :search_term => options['search_term'], :extensions => options['extensions'].sort, + :interpreters => options['interpreters'].sort, :primary_extension => options['primary_extension'], :filenames => options['filenames'], :popular => popular.include?(name) diff --git a/lib/linguist/samples.rb b/lib/linguist/samples.rb index d9099385..27ba218e 100644 --- a/lib/linguist/samples.rb +++ b/lib/linguist/samples.rb @@ -52,6 +52,7 @@ module Linguist yield({ :path => File.join(dirname, filename), :language => category, + :interpreter => File.exist?(filename) ? 
# Used to retrieve the interpreter from the shebang line of a file's
# data.
#
# data - String contents of a file. nil, non-String values and data that
#        does not start with "#!" all yield nil.
#
# Examples
#
#   Linguist.interpreter_from_shebang("#!/usr/bin/env python2.6\nprint 1")
#   # => "python"
#
#   Linguist.interpreter_from_shebang("puts 1")
#   # => nil
#
# Returns the interpreter name String with its version number removed, or
# nil when no interpreter can be determined.
def self.interpreter_from_shebang(data)
  return nil unless data.is_a?(String) && data.start_with?('#!')

  # Only the first line is relevant; each_line.first avoids splitting the
  # whole file into an Array of lines.
  shebang = data.each_line.first

  # Drop "#!" plus any run of whitespace after it, so "#!/bin/sh",
  # "#! /bin/sh" and "#!   /bin/sh" all tokenize the same way.
  tokens = shebang.sub(/\A#!\s*/, '').split(' ')
  return nil if tokens.empty?

  script = tokens.first.split('/').last

  if script == 'env'
    # "env" only launches the real interpreter: skip its options ("-S",
    # "-i") and VAR=value assignments and use the first remaining word.
    target = tokens.drop(1).find do |token|
      !token.start_with?('-') && !token.include?('=')
    end
    script = target && target.split('/').last
  end

  return nil if script.nil?

  # "python2.6" -> "python"
  script = script.sub(/(?:\d+\.?)+/, '')
  script.empty? ? nil : script
end