mirror of
https://github.com/KevinMidboe/linguist.git
synced 2025-10-29 09:40:21 +00:00
579 lines
15 KiB
Ruby
579 lines
15 KiB
Ruby
require 'escape_utils'
|
|
require 'pygments'
|
|
require 'yaml'
|
|
begin
|
|
require 'json'
|
|
rescue LoadError
|
|
end
|
|
|
|
require 'linguist/classifier'
|
|
require 'linguist/heuristics'
|
|
require 'linguist/samples'
|
|
require 'linguist/file_blob'
|
|
require 'linguist/blob_helper'
|
|
|
|
module Linguist
|
|
# Language names that are recognizable by GitHub. Defined languages
|
|
# can be highlighted, searched and listed under the Top Languages page.
|
|
#
|
|
# Languages are defined in `lib/linguist/languages.yml`.
|
|
class Language
|
|
@languages = []
|
|
@index = {}
|
|
@name_index = {}
|
|
@alias_index = {}
|
|
|
|
@extension_index = Hash.new { |h,k| h[k] = [] }
|
|
@interpreter_index = Hash.new { |h,k| h[k] = [] }
|
|
@filename_index = Hash.new { |h,k| h[k] = [] }
|
|
|
|
# Valid Languages types
|
|
TYPES = [:data, :markup, :programming, :prose]
|
|
|
|
# Names of non-programming languages that we will still detect
|
|
#
|
|
# Returns an array
|
|
def self.detectable_markup
|
|
["CSS", "Less", "Sass", "SCSS", "Stylus", "TeX"]
|
|
end
|
|
|
|
# Detect languages by a specific type
|
|
#
|
|
# type - A symbol that exists within TYPES
|
|
#
|
|
# Returns an array
|
|
def self.by_type(type)
|
|
all.select { |h| h.type == type }
|
|
end
|
|
|
|
# Internal: Create a new Language object
|
|
#
|
|
# attributes - A hash of attributes
|
|
#
|
|
# Returns a Language object
|
|
def self.create(attributes = {})
|
|
language = new(attributes)
|
|
|
|
@languages << language
|
|
|
|
# All Language names should be unique. Raise if there is a duplicate.
|
|
if @name_index.key?(language.name)
|
|
raise ArgumentError, "Duplicate language name: #{language.name}"
|
|
end
|
|
|
|
# Language name index
|
|
@index[language.name] = @name_index[language.name] = language
|
|
|
|
language.aliases.each do |name|
|
|
# All Language aliases should be unique. Raise if there is a duplicate.
|
|
if @alias_index.key?(name)
|
|
raise ArgumentError, "Duplicate alias: #{name}"
|
|
end
|
|
|
|
@index[name] = @alias_index[name] = language
|
|
end
|
|
|
|
language.extensions.each do |extension|
|
|
if extension !~ /^\./
|
|
raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
|
|
end
|
|
|
|
@extension_index[extension] << language
|
|
end
|
|
|
|
language.interpreters.each do |interpreter|
|
|
@interpreter_index[interpreter] << language
|
|
end
|
|
|
|
language.filenames.each do |filename|
|
|
@filename_index[filename] << language
|
|
end
|
|
|
|
language
|
|
end
|
|
|
|
# Public: Detects the Language of the blob.
|
|
#
|
|
# blob - an object that includes the Linguist `BlobHelper` interface;
|
|
# see Linguist::LazyBlob and Linguist::FileBlob for examples
|
|
#
|
|
# Returns Language or nil.
|
|
def self.detect(blob)
|
|
name = blob.name.to_s
|
|
|
|
# Check if the blob is possibly binary and bail early; this is a cheap
|
|
# test that uses the extension name to guess a binary binary mime type.
|
|
#
|
|
# We'll perform a more comprehensive test later which actually involves
|
|
# looking for binary characters in the blob
|
|
return nil if blob.likely_binary? || blob.binary?
|
|
|
|
# A bit of an elegant hack. If the file is executable but extensionless,
|
|
# append a "magic" extension so it can be classified with other
|
|
# languages that have shebang scripts.
|
|
extension = FileBlob.new(name).extension
|
|
if extension.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
|
|
name += ".script!"
|
|
end
|
|
|
|
# First try to find languages that match based on filename.
|
|
possible_languages = find_by_filename(name)
|
|
|
|
# If there is more than one possible language with that extension (or no
|
|
# extension at all, in the case of extensionless scripts), we need to continue
|
|
# our detection work
|
|
if possible_languages.length > 1
|
|
data = blob.data
|
|
possible_language_names = possible_languages.map(&:name)
|
|
|
|
# Don't bother with binary contents or an empty file
|
|
if data.nil? || data == ""
|
|
nil
|
|
# Check if there's a shebang line and use that as authoritative
|
|
elsif (result = find_by_shebang(data)) && !result.empty?
|
|
result.first
|
|
# No shebang. Still more work to do. Try to find it with our heuristics.
|
|
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
|
|
determined.first
|
|
# Lastly, fall back to the probablistic classifier.
|
|
elsif classified = Classifier.classify(Samples::DATA, data, possible_language_names).first
|
|
# Return the actual Language object based of the string language name (i.e., first element of `#classify`)
|
|
Language[classified[0]]
|
|
end
|
|
else
|
|
# Simplest and most common case, we can just return the one match based on extension
|
|
possible_languages.first
|
|
end
|
|
end
|
|
|
|
# Public: Get all Languages
|
|
#
|
|
# Returns an Array of Languages
|
|
def self.all
|
|
@languages
|
|
end
|
|
|
|
# Public: Look up Language by its proper name.
|
|
#
|
|
# name - The String name of the Language
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_name('Ruby')
|
|
# # => #<Language name="Ruby">
|
|
#
|
|
# Returns the Language or nil if none was found.
|
|
def self.find_by_name(name)
|
|
@name_index[name]
|
|
end
|
|
|
|
# Public: Look up Language by one of its aliases.
|
|
#
|
|
# name - A String alias of the Language
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_alias('cpp')
|
|
# # => #<Language name="C++">
|
|
#
|
|
# Returns the Lexer or nil if none was found.
|
|
def self.find_by_alias(name)
|
|
@alias_index[name]
|
|
end
|
|
|
|
# Public: Look up Languages by filename.
|
|
#
|
|
# filename - The path String.
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_filename('foo.rb')
|
|
# # => [#<Language name="Ruby">]
|
|
#
|
|
# Returns all matching Languages or [] if none were found.
|
|
def self.find_by_filename(filename)
|
|
basename = File.basename(filename)
|
|
extname = FileBlob.new(filename).extension
|
|
langs = @filename_index[basename] +
|
|
@extension_index[extname]
|
|
langs.compact.uniq
|
|
end
|
|
|
|
# Public: Look up Languages by shebang line.
|
|
#
|
|
# data - Array of tokens or String data to analyze.
|
|
#
|
|
# Examples
|
|
#
|
|
# Language.find_by_shebang("#!/bin/bash\ndate;")
|
|
# # => [#<Language name="Bash">]
|
|
#
|
|
# Returns the matching Language
|
|
def self.find_by_shebang(data)
|
|
@interpreter_index[Linguist.interpreter_from_shebang(data)]
|
|
end
|
|
|
|
# Public: Look up Language by its name or lexer.
|
|
#
|
|
# name - The String name of the Language
|
|
#
|
|
# Examples
|
|
#
|
|
# Language['Ruby']
|
|
# # => #<Language name="Ruby">
|
|
#
|
|
# Language['ruby']
|
|
# # => #<Language name="Ruby">
|
|
#
|
|
# Returns the Language or nil if none was found.
|
|
def self.[](name)
|
|
@index[name]
|
|
end
|
|
|
|
# Public: A List of popular languages
|
|
#
|
|
# Popular languages are sorted to the top of language chooser
|
|
# dropdowns.
|
|
#
|
|
# This list is configured in "popular.yml".
|
|
#
|
|
# Returns an Array of Lexers.
|
|
def self.popular
|
|
@popular ||= all.select(&:popular?).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Public: A List of non-popular languages
|
|
#
|
|
# Unpopular languages appear below popular ones in language
|
|
# chooser dropdowns.
|
|
#
|
|
# This list is created from all the languages not listed in "popular.yml".
|
|
#
|
|
# Returns an Array of Lexers.
|
|
def self.unpopular
|
|
@unpopular ||= all.select(&:unpopular?).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Public: A List of languages with assigned colors.
|
|
#
|
|
# Returns an Array of Languages.
|
|
def self.colors
|
|
@colors ||= all.select(&:color).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Public: A List of languages compatible with Ace.
|
|
#
|
|
# Returns an Array of Languages.
|
|
def self.ace_modes
|
|
@ace_modes ||= all.select(&:ace_mode).sort_by { |lang| lang.name.downcase }
|
|
end
|
|
|
|
# Internal: Initialize a new Language
|
|
#
|
|
# attributes - A hash of attributes
|
|
def initialize(attributes = {})
|
|
# @name is required
|
|
@name = attributes[:name] || raise(ArgumentError, "missing name")
|
|
|
|
# Set type
|
|
@type = attributes[:type] ? attributes[:type].to_sym : nil
|
|
if @type && !TYPES.include?(@type)
|
|
raise ArgumentError, "invalid type: #{@type}"
|
|
end
|
|
|
|
@color = attributes[:color]
|
|
|
|
# Set aliases
|
|
@aliases = [default_alias_name] + (attributes[:aliases] || [])
|
|
|
|
# Lookup Lexer object
|
|
@lexer = Pygments::Lexer.find_by_name(attributes[:lexer] || name) ||
|
|
raise(ArgumentError, "#{@name} is missing lexer")
|
|
|
|
@ace_mode = attributes[:ace_mode]
|
|
@wrap = attributes[:wrap] || false
|
|
|
|
# Set legacy search term
|
|
@search_term = attributes[:search_term] || default_alias_name
|
|
|
|
# Set extensions or default to [].
|
|
@extensions = attributes[:extensions] || []
|
|
@interpreters = attributes[:interpreters] || []
|
|
@filenames = attributes[:filenames] || []
|
|
|
|
# Set popular, and searchable flags
|
|
@popular = attributes.key?(:popular) ? attributes[:popular] : false
|
|
@searchable = attributes.key?(:searchable) ? attributes[:searchable] : true
|
|
|
|
# If group name is set, save the name so we can lazy load it later
|
|
if attributes[:group_name]
|
|
@group = nil
|
|
@group_name = attributes[:group_name]
|
|
|
|
# Otherwise we can set it to self now
|
|
else
|
|
@group = self
|
|
end
|
|
end
|
|
|
|
# Public: Get proper name
|
|
#
|
|
# Examples
|
|
#
|
|
# # => "Ruby"
|
|
# # => "Python"
|
|
# # => "Perl"
|
|
#
|
|
# Returns the name String
|
|
attr_reader :name
|
|
|
|
# Public: Get type.
|
|
#
|
|
# Returns a type Symbol or nil.
|
|
attr_reader :type
|
|
|
|
# Public: Get color.
|
|
#
|
|
# Returns a hex color String.
|
|
attr_reader :color
|
|
|
|
# Public: Get aliases
|
|
#
|
|
# Examples
|
|
#
|
|
# Language['C++'].aliases
|
|
# # => ["cpp"]
|
|
#
|
|
# Returns an Array of String names
|
|
attr_reader :aliases
|
|
|
|
# Deprecated: Get code search term
|
|
#
|
|
# Examples
|
|
#
|
|
# # => "ruby"
|
|
# # => "python"
|
|
# # => "perl"
|
|
#
|
|
# Returns the name String
|
|
attr_reader :search_term
|
|
|
|
# Public: Get Lexer
|
|
#
|
|
# Returns the Lexer
|
|
attr_reader :lexer
|
|
|
|
# Public: Get Ace mode
|
|
#
|
|
# Examples
|
|
#
|
|
# # => "text"
|
|
# # => "javascript"
|
|
# # => "c_cpp"
|
|
#
|
|
# Returns a String name or nil
|
|
attr_reader :ace_mode
|
|
|
|
# Public: Should language lines be wrapped
|
|
#
|
|
# Returns true or false
|
|
attr_reader :wrap
|
|
|
|
# Public: Get extensions
|
|
#
|
|
# Examples
|
|
#
|
|
# # => ['.rb', '.rake', ...]
|
|
#
|
|
# Returns the extensions Array
|
|
attr_reader :extensions
|
|
|
|
# Public: Get interpreters
|
|
#
|
|
# Examples
|
|
#
|
|
# # => ['awk', 'gawk', 'mawk' ...]
|
|
#
|
|
# Returns the interpreters Array
|
|
attr_reader :interpreters
|
|
|
|
# Public: Get filenames
|
|
#
|
|
# Examples
|
|
#
|
|
# # => ['Rakefile', ...]
|
|
#
|
|
# Returns the extensions Array
|
|
attr_reader :filenames
|
|
|
|
# Public: Return all possible extensions for language
|
|
def all_extensions
|
|
(extensions + [primary_extension]).uniq
|
|
end
|
|
|
|
# Deprecated: Get primary extension
|
|
#
|
|
# Defaults to the first extension but can be overridden
|
|
# in the languages.yml.
|
|
#
|
|
# The primary extension can not be nil. Tests should verify this.
|
|
#
|
|
# This method is only used by app/helpers/gists_helper.rb for creating
|
|
# the language dropdown. It really should be using `name` instead.
|
|
# Would like to drop primary extension.
|
|
#
|
|
# Returns the extension String.
|
|
def primary_extension
|
|
extensions.first
|
|
end
|
|
|
|
# Public: Get URL escaped name.
|
|
#
|
|
# Examples
|
|
#
|
|
# "C%23"
|
|
# "C%2B%2B"
|
|
# "Common%20Lisp"
|
|
#
|
|
# Returns the escaped String.
|
|
def escaped_name
|
|
EscapeUtils.escape_url(name).gsub('+', '%20')
|
|
end
|
|
|
|
# Internal: Get default alias name
|
|
#
|
|
# Returns the alias name String
|
|
def default_alias_name
|
|
name.downcase.gsub(/\s/, '-')
|
|
end
|
|
|
|
# Public: Get Language group
|
|
#
|
|
# Returns a Language
|
|
def group
|
|
@group ||= Language.find_by_name(@group_name)
|
|
end
|
|
|
|
# Public: Is it popular?
|
|
#
|
|
# Returns true or false
|
|
def popular?
|
|
@popular
|
|
end
|
|
|
|
# Public: Is it not popular?
|
|
#
|
|
# Returns true or false
|
|
def unpopular?
|
|
!popular?
|
|
end
|
|
|
|
# Public: Is it searchable?
|
|
#
|
|
# Unsearchable languages won't by indexed by solr and won't show
|
|
# up in the code search dropdown.
|
|
#
|
|
# Returns true or false
|
|
def searchable?
|
|
@searchable
|
|
end
|
|
|
|
# Public: Highlight syntax of text
|
|
#
|
|
# text - String of code to be highlighted
|
|
# options - A Hash of options (defaults to {})
|
|
#
|
|
# Returns html String
|
|
def colorize(text, options = {})
|
|
lexer.highlight(text, options)
|
|
end
|
|
|
|
# Public: Return name as String representation
|
|
def to_s
|
|
name
|
|
end
|
|
|
|
def ==(other)
|
|
eql?(other)
|
|
end
|
|
|
|
def eql?(other)
|
|
equal?(other)
|
|
end
|
|
|
|
def hash
|
|
name.hash
|
|
end
|
|
|
|
def inspect
|
|
"#<#{self.class} name=#{name}>"
|
|
end
|
|
end
|
|
|
|
extensions = Samples::DATA['extnames']
|
|
interpreters = Samples::DATA['interpreters']
|
|
filenames = Samples::DATA['filenames']
|
|
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
|
|
|
|
languages_yml = File.expand_path("../languages.yml", __FILE__)
|
|
languages_json = File.expand_path("../languages.json", __FILE__)
|
|
|
|
if File.exist?(languages_json) && defined?(JSON)
|
|
languages = JSON.load(File.read(languages_json))
|
|
else
|
|
languages = YAML.load_file(languages_yml)
|
|
end
|
|
|
|
languages.each do |name, options|
|
|
options['extensions'] ||= []
|
|
options['interpreters'] ||= []
|
|
options['filenames'] ||= []
|
|
|
|
if extnames = extensions[name]
|
|
extnames.each do |extname|
|
|
if !options['extensions'].include?(extname)
|
|
warn "#{name} has a sample with extension (#{extname}) that isn't explicitly defined in languages.yml" unless extname == '.script!'
|
|
options['extensions'] << extname
|
|
end
|
|
end
|
|
end
|
|
|
|
if interpreters == nil
|
|
interpreters = {}
|
|
end
|
|
|
|
if interpreter_names = interpreters[name]
|
|
interpreter_names.each do |interpreter|
|
|
if !options['interpreters'].include?(interpreter)
|
|
options['interpreters'] << interpreter
|
|
end
|
|
end
|
|
end
|
|
|
|
if fns = filenames[name]
|
|
fns.each do |filename|
|
|
if !options['filenames'].include?(filename)
|
|
options['filenames'] << filename
|
|
end
|
|
end
|
|
end
|
|
|
|
Language.create(
|
|
:name => name,
|
|
:color => options['color'],
|
|
:type => options['type'],
|
|
:aliases => options['aliases'],
|
|
:lexer => options['lexer'],
|
|
:ace_mode => options['ace_mode'],
|
|
:wrap => options['wrap'],
|
|
:group_name => options['group'],
|
|
:searchable => options.key?('searchable') ? options['searchable'] : true,
|
|
:search_term => options['search_term'],
|
|
:extensions => [options['extensions'].first] + options['extensions'][1..-1].sort,
|
|
:interpreters => options['interpreters'].sort,
|
|
:filenames => options['filenames'],
|
|
:popular => popular.include?(name)
|
|
)
|
|
end
|
|
end
|