language interpreters and shebang lines

Add an interpreter array to each language, and match interpreters found
in the shebang lines of scripts to this array to identify the language
of scripts.

With suggestions from tnm. https://github.com/github/linguist/pull/687
This commit is contained in:
Eric Schulte
2013-09-15 09:55:59 -06:00
parent eb5f1468d2
commit 7a6202a8c3
2 changed files with 84 additions and 0 deletions

View File

@@ -17,6 +17,7 @@ module Linguist
@alias_index = {}
@extension_index = Hash.new { |h,k| h[k] = [] }
@interpreter_index = Hash.new { |h,k| h[k] = [] }
@filename_index = Hash.new { |h,k| h[k] = [] }
@primary_extension_index = {}
@@ -71,6 +72,10 @@ module Linguist
@primary_extension_index[language.primary_extension] = language
language.interpreters.each do |interpreter|
@interpreter_index[interpreter] << language
end
language.filenames.each do |filename|
@filename_index[filename] << language
end
@@ -101,6 +106,8 @@ module Linguist
data = data.call() if data.respond_to?(:call)
if data.nil? || data == ""
nil
elsif result = find_by_shebang(data)
result.first
elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
Language[result[0]]
end
@@ -162,6 +169,20 @@ module Linguist
langs.compact.uniq
end
# Public: Look up Languages by shebang line.
#
# data - Array of tokens or String data to analyze.
#
# Examples
#
# Language.find_by_shebang("#!/bin/bash\ndate;")
# # => [#<Language name="Bash">]
#
# Returns the matching Language
def self.find_by_shebang(data)
@interpreter_index[Linguist.interpreter_from_shebang(data)]
end
# Public: Look up Language by its name or lexer.
#
# name - The String name of the Language
@@ -247,6 +268,7 @@ module Linguist
# Set extensions or default to [].
@extensions = attributes[:extensions] || []
@interpreters = attributes[:interpreters] || []
@filenames = attributes[:filenames] || []
unless @primary_extension = attributes[:primary_extension]
@@ -359,6 +381,15 @@ module Linguist
# Returns the extension String.
attr_reader :primary_extension
# Public: Get interpreters
#
# Examples
#
# # => ['awk', 'gawk', 'mawk' ...]
#
# Returns the interpreters Array
attr_reader :interpreters
# Public: Get filenames
#
# Examples
@@ -452,11 +483,13 @@ module Linguist
end
extensions = Samples::DATA['extnames']
interpreters = Samples::DATA['interpreters']
filenames = Samples::DATA['filenames']
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
options['extensions'] ||= []
options['interpreters'] ||= []
options['filenames'] ||= []
if extnames = extensions[name]
@@ -467,6 +500,18 @@ module Linguist
end
end
if interpreters == nil
interpreters = {}
end
if interpreter_names = interpreters[name]
interpreter_names.each do |interpreter|
if !options['interpreters'].include?(interpreter)
options['interpreters'] << interpreter
end
end
end
if fns = filenames[name]
fns.each do |filename|
if !options['filenames'].include?(filename)
@@ -487,6 +532,7 @@ module Linguist
:searchable => options.key?('searchable') ? options['searchable'] : true,
:search_term => options['search_term'],
:extensions => options['extensions'].sort,
:interpreters => options['interpreters'].sort,
:primary_extension => options['primary_extension'],
:filenames => options['filenames'],
:popular => popular.include?(name)

View File

@@ -52,6 +52,7 @@ module Linguist
yield({
:path => File.join(dirname, filename),
:language => category,
:interpreter => File.exist?(filename) ? Linguist.interpreter_from_shebang(File.read(filename)) : nil,
:extname => File.extname(filename)
})
end
@@ -67,6 +68,7 @@ module Linguist
def self.data
db = {}
db['extnames'] = {}
db['interpreters'] = {}
db['filenames'] = {}
each do |sample|
@@ -80,6 +82,14 @@ module Linguist
end
end
if sample[:interpreter]
db['interpreters'][language_name] ||= []
if !db['interpreters'][language_name].include?(sample[:interpreter])
db['interpreters'][language_name] << sample[:interpreter]
db['interpreters'][language_name].sort!
end
end
if sample[:filename]
db['filenames'][language_name] ||= []
db['filenames'][language_name] << sample[:filename]
@@ -95,4 +105,32 @@ module Linguist
db
end
end
# Used to retrieve the interpreter from the shebang line of a file's
# data.
def self.interpreter_from_shebang(data)
lines = data.lines
if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
bang.sub!(/^#! /, '#!')
tokens = bang.split(' ')
pieces = tokens.first.split('/')
if pieces.size > 1
script = pieces.last
else
script = pieces.first.sub('#!', '')
end
script = script == 'env' ? tokens[1] : script
# "python2.6" -> "python"
if script =~ /((?:\d+\.?)+)/
script.sub! $1, ''
end
script
end
end
end