mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 17:50:22 +00:00
Originally, only "programming" languages were included in repository
language statistics. In 33ebee0f6a we
started detecting a few selected "markup" languages as well. We didn't
include all "markup" languages because at the time formats like Markdown
and AsciiDoc were labeled as "markup" languages, and we thought that
including those prose (i.e., non-code) languages in repository
statistics on github.com was misleading for repositories that are
largely about code but also contain a lot of documentation (e.g.,
rails/rails).
This hand-picked set of whitelisted "markup" languages can cause strange
categorization for some repositories. For example, it includes CSS (and
some variants) but not HTML. This results in repositories that contain
the source code for a static website being classified as either a
JavaScript (programming) or CSS (markup) repository, with no mention of
HTML anywhere.
Fast-forward to today, and prose languages are no longer "markup"
languages; they're now "prose" languages. So now we can include all
"markup" languages in repository language statistics without worrying
about undesirable effects for documentation-heavy repositories.
576 lines
15 KiB
Ruby
576 lines
15 KiB
Ruby
require 'escape_utils'
|
|
require 'yaml'
|
|
begin
|
|
require 'yajl'
|
|
rescue LoadError
|
|
end
|
|
|
|
require 'linguist/classifier'
|
|
require 'linguist/heuristics'
|
|
require 'linguist/samples'
|
|
require 'linguist/file_blob'
|
|
require 'linguist/blob_helper'
|
|
require 'linguist/strategy/filename'
|
|
require 'linguist/strategy/modeline'
|
|
require 'linguist/shebang'
|
|
|
|
module Linguist
|
|
# Language names that are recognizable by GitHub. Defined languages
|
|
# can be highlighted, searched and listed under the Top Languages page.
|
|
#
|
|
# Languages are defined in `lib/linguist/languages.yml`.
|
|
class Language
|
|
@languages = []
|
|
@index = {}
|
|
@name_index = {}
|
|
@alias_index = {}
|
|
|
|
@extension_index = Hash.new { |h,k| h[k] = [] }
|
|
@interpreter_index = Hash.new { |h,k| h[k] = [] }
|
|
@filename_index = Hash.new { |h,k| h[k] = [] }
|
|
|
|
# Valid Languages types
|
|
TYPES = [:data, :markup, :programming, :prose]
|
|
|
|
# Detect languages by a specific type
|
|
#
|
|
# type - A symbol that exists within TYPES
|
|
#
|
|
# Returns an array
|
|
def self.by_type(type)
|
|
all.select { |h| h.type == type }
|
|
end
|
|
|
|
# Internal: Create a new Language object
|
|
#
|
|
# attributes - A hash of attributes
|
|
#
|
|
# Returns a Language object
|
|
def self.create(attributes = {})
|
|
language = new(attributes)
|
|
|
|
@languages << language
|
|
|
|
# All Language names should be unique. Raise if there is a duplicate.
|
|
if @name_index.key?(language.name)
|
|
raise ArgumentError, "Duplicate language name: #{language.name}"
|
|
end
|
|
|
|
# Language name index
|
|
@index[language.name.downcase] = @name_index[language.name.downcase] = language
|
|
|
|
language.aliases.each do |name|
|
|
# All Language aliases should be unique. Raise if there is a duplicate.
|
|
if @alias_index.key?(name)
|
|
raise ArgumentError, "Duplicate alias: #{name}"
|
|
end
|
|
|
|
@index[name.downcase] = @alias_index[name.downcase] = language
|
|
end
|
|
|
|
language.extensions.each do |extension|
|
|
if extension !~ /^\./
|
|
raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
|
|
end
|
|
|
|
@extension_index[extension] << language
|
|
end
|
|
|
|
language.interpreters.each do |interpreter|
|
|
@interpreter_index[interpreter] << language
|
|
end
|
|
|
|
language.filenames.each do |filename|
|
|
@filename_index[filename] << language
|
|
end
|
|
|
|
language
|
|
end
|
|
|
|
STRATEGIES = [
|
|
Linguist::Strategy::Modeline,
|
|
Linguist::Strategy::Filename,
|
|
Linguist::Shebang,
|
|
Linguist::Heuristics,
|
|
Linguist::Classifier
|
|
]
|
|
|
|
# Public: Detects the Language of the blob.
|
|
#
|
|
# blob - an object that includes the Linguist `BlobHelper` interface;
|
|
# see Linguist::LazyBlob and Linguist::FileBlob for examples
|
|
#
|
|
# Returns Language or nil.
|
|
def self.detect(blob)
|
|
# Bail early if the blob is binary or empty.
|
|
return nil if blob.likely_binary? || blob.binary? || blob.empty?
|
|
|
|
# Call each strategy until one candidate is returned.
|
|
STRATEGIES.reduce([]) do |languages, strategy|
|
|
candidates = strategy.call(blob, languages)
|
|
if candidates.size == 1
|
|
return candidates.first
|
|
elsif candidates.size > 1
|
|
# More than one candidate was found, pass them to the next strategy.
|
|
candidates
|
|
else
|
|
# No candiates were found, pass on languages from the previous strategy.
|
|
languages
|
|
end
|
|
end.first
|
|
end
|
|
|
|
# Public: Get all Languages
|
|
#
|
|
# Returns an Array of Languages
|
|
def self.all
|
|
@languages
|
|
end
|
|
|
|
# Public: Look up Language by its proper name.
|
|
#
|
|
# name - The String name of the Language
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_name('Ruby')
|
|
# # => #<Language name="Ruby">
|
|
#
|
|
# Returns the Language or nil if none was found.
|
|
def self.find_by_name(name)
|
|
name && @name_index[name.downcase]
|
|
end
|
|
|
|
# Public: Look up Language by one of its aliases.
|
|
#
|
|
# name - A String alias of the Language
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_alias('cpp')
|
|
# # => #<Language name="C++">
|
|
#
|
|
# Returns the Language or nil if none was found.
|
|
def self.find_by_alias(name)
|
|
name && @alias_index[name.downcase]
|
|
end
|
|
|
|
# Public: Look up Languages by filename.
|
|
#
|
|
# filename - The path String.
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_filename('foo.rb')
|
|
# # => [#<Language name="Ruby">]
|
|
#
|
|
# Returns all matching Languages or [] if none were found.
|
|
def self.find_by_filename(filename)
|
|
basename = File.basename(filename)
|
|
|
|
# find the first extension with language definitions
|
|
extname = FileBlob.new(filename).extensions.detect do |e|
|
|
!@extension_index[e].empty?
|
|
end
|
|
|
|
(@filename_index[basename] + @extension_index[extname]).compact.uniq
|
|
end
|
|
|
|
# Public: Look up Languages by file extension.
|
|
#
|
|
# extname - The extension String.
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_extension('.rb')
|
|
# # => [#<Language name="Ruby">]
|
|
#
|
|
# Language.find_by_extension('rb')
|
|
# # => [#<Language name="Ruby">]
|
|
#
|
|
# Returns all matching Languages or [] if none were found.
|
|
def self.find_by_extension(extname)
|
|
extname = ".#{extname}" unless extname.start_with?(".")
|
|
@extension_index[extname]
|
|
end
|
|
|
|
# DEPRECATED
|
|
def self.find_by_shebang(data)
|
|
@interpreter_index[Shebang.interpreter(data)]
|
|
end
|
|
|
|
# Public: Look up Languages by interpreter.
|
|
#
|
|
# interpreter - String of interpreter name
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_interpreter("bash")
|
|
# # => [#<Language name="Bash">]
|
|
#
|
|
# Returns the matching Language
|
|
def self.find_by_interpreter(interpreter)
|
|
@interpreter_index[interpreter]
|
|
end
|
|
|
|
|
|
# Public: Look up Language by its name.
|
|
#
|
|
# name - The String name of the Language
|
|
#
|
|
# Examples
|
|
#
|
|
# Language['Ruby']
|
|
# # => #<Language name="Ruby">
|
|
#
|
|
# Language['ruby']
|
|
# # => #<Language name="Ruby">
|
|
#
|
|
# Returns the Language or nil if none was found.
|
|
def self.[](name)
|
|
name && @index[name.downcase]
|
|
end
|
|
|
|
# Public: A List of popular languages
|
|
#
|
|
# Popular languages are sorted to the top of language chooser
|
|
# dropdowns.
|
|
#
|
|
# This list is configured in "popular.yml".
|
|
#
|
|
# Returns an Array of Languages.
|
|
def self.popular
|
|
@popular ||= all.select(&:popular?).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Public: A List of non-popular languages
|
|
#
|
|
# Unpopular languages appear below popular ones in language
|
|
# chooser dropdowns.
|
|
#
|
|
# This list is created from all the languages not listed in "popular.yml".
|
|
#
|
|
# Returns an Array of Languages.
|
|
def self.unpopular
|
|
@unpopular ||= all.select(&:unpopular?).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Public: A List of languages with assigned colors.
|
|
#
|
|
# Returns an Array of Languages.
|
|
def self.colors
|
|
@colors ||= all.select(&:color).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Public: A List of languages compatible with Ace.
|
|
#
|
|
# TODO: Remove this method in a 5.x release. Every language now needs an ace_mode
|
|
# key, so this function isn't doing anything unique anymore.
|
|
#
|
|
# Returns an Array of Languages.
|
|
def self.ace_modes
|
|
warn "This method will be deprecated in a future 5.x release. Every language now has an `ace_mode` set."
|
|
@ace_modes ||= all.select(&:ace_mode).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Internal: Initialize a new Language
|
|
#
|
|
# attributes - A hash of attributes
|
|
def initialize(attributes = {})
|
|
# @name is required
|
|
@name = attributes[:name] || raise(ArgumentError, "missing name")
|
|
|
|
# Set type
|
|
@type = attributes[:type] ? attributes[:type].to_sym : nil
|
|
if @type && !TYPES.include?(@type)
|
|
raise ArgumentError, "invalid type: #{@type}"
|
|
end
|
|
|
|
@color = attributes[:color]
|
|
|
|
# Set aliases
|
|
@aliases = [default_alias_name] + (attributes[:aliases] || [])
|
|
|
|
# Load the TextMate scope name or try to guess one
|
|
@tm_scope = attributes[:tm_scope] || begin
|
|
context = case @type
|
|
when :data, :markup, :prose
|
|
'text'
|
|
when :programming, nil
|
|
'source'
|
|
end
|
|
"#{context}.#{@name.downcase}"
|
|
end
|
|
|
|
@ace_mode = attributes[:ace_mode]
|
|
@wrap = attributes[:wrap] || false
|
|
|
|
# Set legacy search term
|
|
@search_term = attributes[:search_term] || default_alias_name
|
|
|
|
# Set extensions or default to [].
|
|
@extensions = attributes[:extensions] || []
|
|
@interpreters = attributes[:interpreters] || []
|
|
@filenames = attributes[:filenames] || []
|
|
|
|
# Set popular, and searchable flags
|
|
@popular = attributes.key?(:popular) ? attributes[:popular] : false
|
|
@searchable = attributes.key?(:searchable) ? attributes[:searchable] : true
|
|
|
|
# If group name is set, save the name so we can lazy load it later
|
|
if attributes[:group_name]
|
|
@group = nil
|
|
@group_name = attributes[:group_name]
|
|
|
|
# Otherwise we can set it to self now
|
|
else
|
|
@group = self
|
|
end
|
|
end
|
|
|
|
# Public: Get proper name
|
|
#
|
|
# Examples
|
|
#
|
|
# # => "Ruby"
|
|
# # => "Python"
|
|
# # => "Perl"
|
|
#
|
|
# Returns the name String
|
|
attr_reader :name
|
|
|
|
# Public: Get type.
|
|
#
|
|
# Returns a type Symbol or nil.
|
|
attr_reader :type
|
|
|
|
# Public: Get color.
|
|
#
|
|
# Returns a hex color String.
|
|
attr_reader :color
|
|
|
|
# Public: Get aliases
|
|
#
|
|
# Examples
|
|
#
|
|
# Language['C++'].aliases
|
|
# # => ["cpp"]
|
|
#
|
|
# Returns an Array of String names
|
|
attr_reader :aliases
|
|
|
|
# Deprecated: Get code search term
|
|
#
|
|
# Examples
|
|
#
|
|
# # => "ruby"
|
|
# # => "python"
|
|
# # => "perl"
|
|
#
|
|
# Returns the name String
|
|
attr_reader :search_term
|
|
|
|
# Public: Get the name of a TextMate-compatible scope
|
|
#
|
|
# Returns the scope
|
|
attr_reader :tm_scope
|
|
|
|
# Public: Get Ace mode
|
|
#
|
|
# Examples
|
|
#
|
|
# # => "text"
|
|
# # => "javascript"
|
|
# # => "c_cpp"
|
|
#
|
|
# Returns a String name or nil
|
|
attr_reader :ace_mode
|
|
|
|
# Public: Should language lines be wrapped
|
|
#
|
|
# Returns true or false
|
|
attr_reader :wrap
|
|
|
|
# Public: Get extensions
|
|
#
|
|
# Examples
|
|
#
|
|
# # => ['.rb', '.rake', ...]
|
|
#
|
|
# Returns the extensions Array
|
|
attr_reader :extensions
|
|
|
|
# Public: Get interpreters
|
|
#
|
|
# Examples
|
|
#
|
|
# # => ['awk', 'gawk', 'mawk' ...]
|
|
#
|
|
# Returns the interpreters Array
|
|
attr_reader :interpreters
|
|
|
|
# Public: Get filenames
|
|
#
|
|
# Examples
|
|
#
|
|
# # => ['Rakefile', ...]
|
|
#
|
|
# Returns the extensions Array
|
|
attr_reader :filenames
|
|
|
|
# Deprecated: Get primary extension
|
|
#
|
|
# Defaults to the first extension but can be overridden
|
|
# in the languages.yml.
|
|
#
|
|
# The primary extension can not be nil. Tests should verify this.
|
|
#
|
|
# This method is only used by app/helpers/gists_helper.rb for creating
|
|
# the language dropdown. It really should be using `name` instead.
|
|
# Would like to drop primary extension.
|
|
#
|
|
# Returns the extension String.
|
|
def primary_extension
|
|
extensions.first
|
|
end
|
|
|
|
# Public: Get URL escaped name.
|
|
#
|
|
# Examples
|
|
#
|
|
# "C%23"
|
|
# "C%2B%2B"
|
|
# "Common%20Lisp"
|
|
#
|
|
# Returns the escaped String.
|
|
def escaped_name
|
|
EscapeUtils.escape_url(name).gsub('+', '%20')
|
|
end
|
|
|
|
# Internal: Get default alias name
|
|
#
|
|
# Returns the alias name String
|
|
def default_alias_name
|
|
name.downcase.gsub(/\s/, '-')
|
|
end
|
|
|
|
# Public: Get Language group
|
|
#
|
|
# Returns a Language
|
|
def group
|
|
@group ||= Language.find_by_name(@group_name)
|
|
end
|
|
|
|
# Public: Is it popular?
|
|
#
|
|
# Returns true or false
|
|
def popular?
|
|
@popular
|
|
end
|
|
|
|
# Public: Is it not popular?
|
|
#
|
|
# Returns true or false
|
|
def unpopular?
|
|
!popular?
|
|
end
|
|
|
|
# Public: Is it searchable?
|
|
#
|
|
# Unsearchable languages won't by indexed by solr and won't show
|
|
# up in the code search dropdown.
|
|
#
|
|
# Returns true or false
|
|
def searchable?
|
|
@searchable
|
|
end
|
|
|
|
# Public: Return name as String representation
|
|
def to_s
|
|
name
|
|
end
|
|
|
|
def ==(other)
|
|
eql?(other)
|
|
end
|
|
|
|
def eql?(other)
|
|
equal?(other)
|
|
end
|
|
|
|
def hash
|
|
name.hash
|
|
end
|
|
|
|
def inspect
|
|
"#<#{self.class} name=#{name}>"
|
|
end
|
|
end
|
|
|
|
extensions = Samples.cache['extnames']
|
|
interpreters = Samples.cache['interpreters']
|
|
filenames = Samples.cache['filenames']
|
|
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
|
|
|
|
languages_yml = File.expand_path("../languages.yml", __FILE__)
|
|
languages_json = File.expand_path("../languages.json", __FILE__)
|
|
|
|
if File.exist?(languages_json) && defined?(Yajl)
|
|
languages = Yajl.load(File.read(languages_json))
|
|
else
|
|
languages = YAML.load_file(languages_yml)
|
|
end
|
|
|
|
languages.each do |name, options|
|
|
options['extensions'] ||= []
|
|
options['interpreters'] ||= []
|
|
options['filenames'] ||= []
|
|
|
|
if extnames = extensions[name]
|
|
extnames.each do |extname|
|
|
if !options['extensions'].index { |x| x.end_with? extname }
|
|
warn "#{name} has a sample with extension (#{extname}) that isn't explicitly defined in languages.yml" unless extname == '.script!'
|
|
options['extensions'] << extname
|
|
end
|
|
end
|
|
end
|
|
|
|
if interpreters == nil
|
|
interpreters = {}
|
|
end
|
|
|
|
if interpreter_names = interpreters[name]
|
|
interpreter_names.each do |interpreter|
|
|
if !options['interpreters'].include?(interpreter)
|
|
options['interpreters'] << interpreter
|
|
end
|
|
end
|
|
end
|
|
|
|
if fns = filenames[name]
|
|
fns.each do |filename|
|
|
if !options['filenames'].include?(filename)
|
|
options['filenames'] << filename
|
|
end
|
|
end
|
|
end
|
|
|
|
Language.create(
|
|
:name => name,
|
|
:color => options['color'],
|
|
:type => options['type'],
|
|
:aliases => options['aliases'],
|
|
:tm_scope => options['tm_scope'],
|
|
:ace_mode => options['ace_mode'],
|
|
:wrap => options['wrap'],
|
|
:group_name => options['group'],
|
|
:searchable => options.fetch('searchable', true),
|
|
:search_term => options['search_term'],
|
|
:extensions => Array(options['extensions']),
|
|
:interpreters => options['interpreters'].sort,
|
|
:filenames => options['filenames'],
|
|
:popular => popular.include?(name)
|
|
)
|
|
end
|
|
end
|