language interpreters and shebang lines

Add an interpreter array to each language, and match interpreters found
in the shebang lines of scripts to this array to identify the language
of scripts.

With suggestions from tnm. https://github.com/github/linguist/pull/687
This commit is contained in:
Eric Schulte
2013-09-15 09:55:59 -06:00
parent eb5f1468d2
commit 7a6202a8c3
2 changed files with 84 additions and 0 deletions

View File

@@ -17,6 +17,7 @@ module Linguist
@alias_index = {} @alias_index = {}
@extension_index = Hash.new { |h,k| h[k] = [] } @extension_index = Hash.new { |h,k| h[k] = [] }
@interpreter_index = Hash.new { |h,k| h[k] = [] }
@filename_index = Hash.new { |h,k| h[k] = [] } @filename_index = Hash.new { |h,k| h[k] = [] }
@primary_extension_index = {} @primary_extension_index = {}
@@ -71,6 +72,10 @@ module Linguist
@primary_extension_index[language.primary_extension] = language @primary_extension_index[language.primary_extension] = language
language.interpreters.each do |interpreter|
@interpreter_index[interpreter] << language
end
language.filenames.each do |filename| language.filenames.each do |filename|
@filename_index[filename] << language @filename_index[filename] << language
end end
@@ -101,6 +106,8 @@ module Linguist
data = data.call() if data.respond_to?(:call) data = data.call() if data.respond_to?(:call)
if data.nil? || data == "" if data.nil? || data == ""
nil nil
elsif result = find_by_shebang(data)
result.first
elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
Language[result[0]] Language[result[0]]
end end
@@ -162,6 +169,20 @@ module Linguist
langs.compact.uniq langs.compact.uniq
end end
# Public: Look up Languages by shebang line.
#
# data - String data to analyze. The interpreter name is extracted from it
#        by Linguist.interpreter_from_shebang.
#
# Examples
#
#   Language.find_by_shebang("#!/bin/bash\ndate;")
#   # => [#<Language name="Bash">]
#
# Returns an Array of matching Languages. The Array is empty when no
# interpreter is found or no Language is registered for it. An empty Array
# is still truthy in Ruby, so callers should test it with `empty?`/`first`
# rather than relying on truthiness.
def self.find_by_shebang(data)
  interpreter = Linguist.interpreter_from_shebang(data)
  # The index is built with an auto-vivifying default block, so `[]` would
  # insert a new empty entry for every unknown (or nil) interpreter that is
  # looked up. `fetch` bypasses the default block and leaves the index alone.
  @interpreter_index.fetch(interpreter) { [] }
end
# Public: Look up Language by its name or lexer. # Public: Look up Language by its name or lexer.
# #
# name - The String name of the Language # name - The String name of the Language
@@ -247,6 +268,7 @@ module Linguist
# Set extensions or default to []. # Set extensions or default to [].
@extensions = attributes[:extensions] || [] @extensions = attributes[:extensions] || []
@interpreters = attributes[:interpreters] || []
@filenames = attributes[:filenames] || [] @filenames = attributes[:filenames] || []
unless @primary_extension = attributes[:primary_extension] unless @primary_extension = attributes[:primary_extension]
@@ -359,6 +381,15 @@ module Linguist
# Returns the extension String. # Returns the extension String.
attr_reader :primary_extension attr_reader :primary_extension
# Public: Get interpreters
#
# Examples
#
# # => ['awk', 'gawk', 'mawk' ...]
#
# Returns the interpreters Array
attr_reader :interpreters
# Public: Get filenames # Public: Get filenames
# #
# Examples # Examples
@@ -452,11 +483,13 @@ module Linguist
end end
extensions = Samples::DATA['extnames'] extensions = Samples::DATA['extnames']
interpreters = Samples::DATA['interpreters']
filenames = Samples::DATA['filenames'] filenames = Samples::DATA['filenames']
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__)) popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options| YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
options['extensions'] ||= [] options['extensions'] ||= []
options['interpreters'] ||= []
options['filenames'] ||= [] options['filenames'] ||= []
if extnames = extensions[name] if extnames = extensions[name]
@@ -467,6 +500,18 @@ module Linguist
end end
end end
if interpreters == nil
interpreters = {}
end
if interpreter_names = interpreters[name]
interpreter_names.each do |interpreter|
if !options['interpreters'].include?(interpreter)
options['interpreters'] << interpreter
end
end
end
if fns = filenames[name] if fns = filenames[name]
fns.each do |filename| fns.each do |filename|
if !options['filenames'].include?(filename) if !options['filenames'].include?(filename)
@@ -487,6 +532,7 @@ module Linguist
:searchable => options.key?('searchable') ? options['searchable'] : true, :searchable => options.key?('searchable') ? options['searchable'] : true,
:search_term => options['search_term'], :search_term => options['search_term'],
:extensions => options['extensions'].sort, :extensions => options['extensions'].sort,
:interpreters => options['interpreters'].sort,
:primary_extension => options['primary_extension'], :primary_extension => options['primary_extension'],
:filenames => options['filenames'], :filenames => options['filenames'],
:popular => popular.include?(name) :popular => popular.include?(name)

View File

@@ -52,6 +52,7 @@ module Linguist
yield({ yield({
:path => File.join(dirname, filename), :path => File.join(dirname, filename),
:language => category, :language => category,
:interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
:extname => File.extname(filename) :extname => File.extname(filename)
}) })
end end
@@ -67,6 +68,7 @@ module Linguist
def self.data def self.data
db = {} db = {}
db['extnames'] = {} db['extnames'] = {}
db['interpreters'] = {}
db['filenames'] = {} db['filenames'] = {}
each do |sample| each do |sample|
@@ -80,6 +82,14 @@ module Linguist
end end
end end
if sample[:interpreter]
db['interpreters'][language_name] ||= []
if !db['interpreters'][language_name].include?(sample[:interpreter])
db['interpreters'][language_name] << sample[:interpreter]
db['interpreters'][language_name].sort!
end
end
if sample[:filename] if sample[:filename]
db['filenames'][language_name] ||= [] db['filenames'][language_name] ||= []
db['filenames'][language_name] << sample[:filename] db['filenames'][language_name] << sample[:filename]
@@ -95,4 +105,32 @@ module Linguist
db db
end end
end end
# Used to retrieve the interpreter from the shebang line of a file's
# data.
#
# data - String contents of a file; only its first line is inspected.
#
# Examples
#
#   Linguist.interpreter_from_shebang("#!/bin/bash\ndate;")
#   # => "bash"
#
#   Linguist.interpreter_from_shebang("#!/usr/bin/env python2.6\n")
#   # => "python"
#
# Returns the interpreter name String with version digits stripped, or nil
# when the first line is not a shebang or names no interpreter.
def self.interpreter_from_shebang(data)
  # A shebang can only be on the first line, so avoid splitting the whole file.
  first_line = data.each_line.first
  return nil unless first_line && first_line.start_with?('#!')

  # Drop the "#!" marker together with any whitespace that follows it, so
  # "#! /bin/sh", "#!   /bin/sh" and "#!\t/bin/sh" are all handled alike.
  tokens = first_line.sub(/\A#!\s*/, '').split(' ')
  return nil if tokens.empty?

  # Keep only the basename of the interpreter path ("/usr/bin/perl" -> "perl").
  script = tokens.first.split('/').last
  # With "#!/usr/bin/env ruby" the real interpreter is env's first argument.
  script = tokens[1] if script == 'env'
  return nil if script.nil?

  # "python2.6" -> "python"
  script = script.sub(/(?:\d+\.?)+/, '')
  # An empty name is reported as nil so callers can rely on truthiness.
  script.empty? ? nil : script
end
end end