Merge branch 'master' into 1036-local

Conflicts:
	lib/linguist/heuristics.rb
	lib/linguist/languages.yml
	test/test_heuristics.rb
This commit is contained in:
Arfon Smith
2014-11-25 13:06:11 -06:00
116 changed files with 6266 additions and 574 deletions

View File

@@ -2,7 +2,6 @@ require 'linguist/generated'
require 'charlock_holmes'
require 'escape_utils'
require 'mime/types'
require 'pygments'
require 'yaml'
module Linguist
@@ -147,6 +146,13 @@ module Linguist
end
end
# Public: Is the blob empty?
#
# Return true or false
def empty?
  # A blob without any data at all counts as empty, and so does one whose
  # data is the zero-length string.
  return true if data.nil?

  data == ""
end
# Public: Is the blob text?
#
# Return true or false
@@ -193,10 +199,6 @@ module Linguist
# Public: Is the blob safe to colorize?
#
# We use Pygments for syntax highlighting blobs. Pygments
# can be too slow for very large blobs or for certain
# corner-case blobs.
#
# Return true or false
def safe_to_colorize?
!large? && text? && !high_ratio_of_long_lines?
@@ -204,9 +206,6 @@ module Linguist
# Internal: Does the blob have a ratio of long lines?
#
# These types of files are usually going to make Pygments.rb
# angry if we try to colorize them.
#
# Return true or false
def high_ratio_of_long_lines?
return false if loc == 0
@@ -314,28 +313,9 @@ module Linguist
@language ||= Language.detect(self)
end
# Internal: Get the lexer of the blob.
#
# Returns a Lexer.
def lexer
  # With no detected language, fall back to the plain-text lexer.
  return Pygments::Lexer.find_by_name('Text only') unless language

  language.lexer
end
# Internal: Get the TextMate compatible scope for the blob
def tm_scope
  # A missing language is handed back unchanged (nil/false); otherwise the
  # language supplies its own scope.
  detected = language
  detected ? detected.tm_scope : detected
end
# Public: Highlight syntax of blob
#
# options - A Hash of options (defaults to {})
#
# Returns html String
def colorize(options = {})
  if safe_to_colorize?
    # The nested :options hash is created on the caller's hash when absent,
    # and the blob's encoding is only filled in when none was supplied.
    lexer_options = (options[:options] ||= {})
    lexer_options[:encoding] ||= encoding
    lexer.highlight(data, options)
  end
end
end
end

View File

@@ -51,26 +51,25 @@ module Linguist
#
# Return true or false
def generated?
name == 'Gemfile.lock' ||
minified_files? ||
compiled_coffeescript? ||
xcode_file? ||
generated_parser? ||
generated_net_docfile? ||
generated_net_designer_file? ||
generated_postscript? ||
generated_protocol_buffer? ||
generated_jni_header? ||
composer_lock? ||
node_modules? ||
godeps? ||
vcr_cassette? ||
generated_by_zephir?
minified_files? ||
compiled_coffeescript? ||
xcode_file? ||
generated_parser? ||
generated_net_docfile? ||
generated_net_designer_file? ||
generated_postscript? ||
generated_protocol_buffer? ||
generated_jni_header? ||
composer_lock? ||
node_modules? ||
godeps? ||
vcr_cassette? ||
generated_by_zephir?
end
# Internal: Is the blob an Xcode file?
#
# Generated if the file extension is an Xcode
# Generated if the file extension is an Xcode
# file extension.
#
# Returns true or false.
@@ -265,4 +264,3 @@ module Linguist
end
end
end

13
lib/linguist/grammars.rb Normal file
View File

@@ -0,0 +1,13 @@
# Note: This file is included in the github-linguist-grammars gem, not the
# github-linguist gem.
module Linguist
  module Grammars
    # Locate the directory holding the language grammar JSON files. The
    # location is resolved three levels up from this source file.
    #
    # Returns a String.
    def self.path
      relative = File.join("..", "..", "..", "grammars")
      File.expand_path(relative, __FILE__)
    end
  end
end

View File

@@ -13,20 +13,34 @@ module Linguist
# Returns an array of Languages or []
def self.find_by_heuristics(data, languages)
if active?
result = []
if languages.all? { |l| ["Objective-C", "C++", "C"].include?(l) }
result = disambiguate_c(data, languages)
result = disambiguate_c(data)
end
if languages.all? { |l| ["Perl", "Prolog"].include?(l) }
result = disambiguate_pl(data, languages)
result = disambiguate_pl(data)
end
if languages.all? { |l| ["ECL", "Prolog"].include?(l) }
result = disambiguate_ecl(data, languages)
result = disambiguate_ecl(data)
end
if languages.all? { |l| ["IDL", "Prolog"].include?(l) }
result = disambiguate_pro(data, languages)
result = disambiguate_pro(data)
end
if languages.all? { |l| ["Common Lisp", "OpenCL"].include?(l) }
result = disambiguate_cl(data, languages)
result = disambiguate_cl(data)
end
if languages.all? { |l| ["Hack", "PHP"].include?(l) }
result = disambiguate_hack(data)
end
if languages.all? { |l| ["Scala", "SuperCollider"].include?(l) }
result = disambiguate_sc(data)
end
if languages.all? { |l| ["AsciiDoc", "AGS Script"].include?(l) }
result = disambiguate_asc(data)
end
if languages.all? { |l| ["FORTRAN", "Forth"].include?(l) }
result = disambiguate_f(data)
end
return result
end
@@ -36,33 +50,38 @@ module Linguist
# We want to shortcut look for Objective-C _and_ now C++ too!
#
# Returns an array of Languages or []
def self.disambiguate_c(data, languages)
def self.disambiguate_c(data)
matches = []
if (/@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
matches << Language["Objective-C"]
end
if (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
/^\s*template\s*</.match(data) || /^[^@]class\s+\w+/.match(data) || /^[^@](private|public|protected):$/.match(data) || /std::.+$/.match(data))
matches << Language["C++"]
end
matches
end
def self.disambiguate_pl(data, languages)
def self.disambiguate_pl(data)
matches = []
matches << Language["Prolog"] if data.include?(":-")
matches << Language["Perl"] if data.include?("use strict")
if data.include?("use strict")
matches << Language["Perl"]
elsif data.include?(":-")
matches << Language["Prolog"]
end
matches
end
def self.disambiguate_ecl(data, languages)
def self.disambiguate_ecl(data)
matches = []
matches << Language["Prolog"] if data.include?(":-")
matches << Language["ECL"] if data.include?(":=")
if data.include?(":-")
matches << Language["Prolog"]
elsif data.include?(":=")
matches << Language["ECL"]
end
matches
end
def self.disambiguate_pro(data, languages)
def self.disambiguate_pro(data)
matches = []
if (data.include?(":-"))
matches << Language["Prolog"]
@@ -72,7 +91,7 @@ module Linguist
matches
end
def self.disambiguate_ts(data, languages)
def self.disambiguate_ts(data)
matches = []
if (data.include?("</translation>"))
matches << Language["XML"]
@@ -82,20 +101,60 @@ module Linguist
matches
end
def self.disambiguate_cl(data, languages)
def self.disambiguate_cl(data)
matches = []
matches << Language["Common Lisp"] if data.include?("(defun ")
matches << Language["OpenCL"] if /\/\* |\/\/ |^\}/.match(data)
if data.include?("(defun ")
matches << Language["Common Lisp"]
elsif /\/\* |\/\/ |^\}/.match(data)
matches << Language["OpenCL"]
end
matches
end
def self.disambiguate_r(data, languages)
# Internal: Pick between Rebol and R based on markers in the content.
#
# data - The String contents of the file.
#
# Returns an Array of Languages; both may be present, or [] when neither
# marker is found.
def self.disambiguate_r(data)
  candidates = []
  # The word "Rebol" in any letter case counts as a Rebol marker.
  if /\bRebol\b/i.match(data)
    candidates.push(Language["Rebol"])
  end
  # The "<-" arrow counts as an R marker.
  if data.include?("<-")
    candidates.push(Language["R"])
  end
  candidates
end
# Internal: Tell Hack and PHP apart by their opening tag.
#
# data - The String contents of the file.
#
# Returns an Array holding the matching Language, or [] when the content
# has neither a "<?hh" tag nor any other "<?" opening tag.
def self.disambiguate_hack(data)
  matches = []
  if data.include?("<?hh")
    matches << Language["Hack"]
  # The "?" has to be escaped: left bare it makes the "<" optional, so the
  # pattern matches any character other than "h" and PHP would be chosen
  # for virtually every input, tag or no tag.
  elsif /<\?[^h]/.match(data)
    matches << Language["PHP"]
  end
  matches
end
# Internal: Pick between SuperCollider and Scala based on syntax hints.
#
# data - The String contents of the file.
#
# Returns an Array of Languages; the two checks are independent, so both
# languages may be returned, or [] when no hint matches.
def self.disambiguate_sc(data)
  supercollider_hints = [/\^(this|super)\./, /^\s*(\+|\*)\s*\w+\s*{/, /^\s*~\w+\s*=\./]
  scala_hints = [/^\s*import (scala|java)\./, /^\s*val\s+\w+\s*=/, /^\s*class\b/]

  matches = []
  matches << Language["SuperCollider"] if supercollider_hints.any? { |hint| hint.match(data) }
  matches << Language["Scala"] if scala_hints.any? { |hint| hint.match(data) }
  matches
end
# Internal: Recognise AsciiDoc content among ".asc" candidates.
#
# data - The String contents of the file.
#
# Returns an Array holding AsciiDoc when some line starts with one or more
# "=" followed by whitespace, otherwise []. No other language is ever
# returned from here.
def self.disambiguate_asc(data)
  return [Language["AsciiDoc"]] if /^=+(\s|\n)/.match(data)

  []
end
# Internal: Pick between Forth and FORTRAN for ".f" candidates.
#
# data - The String contents of the file.
#
# Returns an Array holding one Language, or [] when neither pattern matches.
# The Forth check runs first and wins when both patterns would match.
def self.disambiguate_f(data)
  language_name =
    if /^: /.match(data)
      "Forth"
    elsif /^([c*][^a-z]| subroutine\s)/i.match(data)
      "FORTRAN"
    end

  language_name ? [Language[language_name]] : []
end
# Internal: Are the heuristics switched on?
#
# Returns true when the ACTIVE constant is truthy, false otherwise.
def self.active?
  ACTIVE ? true : false
end

View File

@@ -1,8 +1,7 @@
require 'escape_utils'
require 'pygments'
require 'yaml'
begin
require 'json'
require 'yajl'
rescue LoadError
end
@@ -62,7 +61,7 @@ module Linguist
end
# Language name index
@index[language.name] = @name_index[language.name] = language
@index[language.name.downcase] = @name_index[language.name.downcase] = language
language.aliases.each do |name|
# All Language aliases should be unique. Raise if there is a duplicate.
@@ -70,7 +69,7 @@ module Linguist
raise ArgumentError, "Duplicate alias: #{name}"
end
@index[name] = @alias_index[name] = language
@index[name.downcase] = @alias_index[name.downcase] = language
end
language.extensions.each do |extension|
@@ -101,12 +100,8 @@ module Linguist
def self.detect(blob)
name = blob.name.to_s
# Check if the blob is possibly binary and bail early; this is a cheap
# test that uses the extension name to guess a binary mime type.
#
# We'll perform a more comprehensive test later which actually involves
# looking for binary characters in the blob
return nil if blob.likely_binary? || blob.binary?
# Bail early if the blob is binary or empty.
return nil if blob.likely_binary? || blob.binary? || blob.empty?
# A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other
@@ -125,16 +120,18 @@ module Linguist
if possible_languages.length > 1
data = blob.data
possible_language_names = possible_languages.map(&:name)
heuristic_languages = Heuristics.find_by_heuristics(data, possible_language_names)
if heuristic_languages.size > 1
possible_language_names = heuristic_languages.map(&:name)
end
# Don't bother with binary contents or an empty file
if data.nil? || data == ""
nil
# Check if there's a shebang line and use that as authoritative
elsif (result = find_by_shebang(data)) && !result.empty?
if (result = find_by_shebang(data)) && !result.empty?
result.first
# No shebang. Still more work to do. Try to find it with our heuristics.
elsif (determined = Heuristics.find_by_heuristics(data, possible_language_names)) && !determined.empty?
determined.first
elsif heuristic_languages.size == 1
heuristic_languages.first
# Lastly, fall back to the probabilistic classifier.
elsif classified = Classifier.classify(Samples.cache, data, possible_language_names).first
# Return the actual Language object based on the string language name (i.e., first element of `#classify`)
@@ -164,7 +161,7 @@ module Linguist
#
# Returns the Language or nil if none was found.
def self.find_by_name(name)
@name_index[name]
name && @name_index[name.downcase]
end
# Public: Look up Language by one of its aliases.
@@ -178,7 +175,7 @@ module Linguist
#
# Returns the Lexer or nil if none was found.
def self.find_by_alias(name)
@alias_index[name]
name && @alias_index[name.downcase]
end
# Public: Look up Languages by filename.
@@ -194,9 +191,25 @@ module Linguist
def self.find_by_filename(filename)
basename = File.basename(filename)
extname = FileBlob.new(filename).extension
langs = @filename_index[basename] +
@extension_index[extname]
langs.compact.uniq
(@filename_index[basename] + find_by_extension(extname)).compact.uniq
end
# Public: Look up Languages by file extension.
#
# extname - The extension String.
#
# Examples
#
# Language.find_by_extension('.rb')
# # => [#<Language name="Ruby">]
#
# Language.find_by_extension('rb')
# # => [#<Language name="Ruby">]
#
# Returns all matching Languages or [] if none were found.
def self.find_by_extension(extname)
  # The index is looked up with a leading dot, so add one when the caller
  # passed the bare extension.
  key = extname.start_with?(".") ? extname : ".#{extname}"
  @extension_index[key]
end
# Public: Look up Languages by shebang line.
@@ -227,7 +240,7 @@ module Linguist
#
# Returns the Language or nil if none was found.
def self.[](name)
@index[name]
name && @index[name.downcase]
end
# Public: A List of popular languages
@@ -286,10 +299,7 @@ module Linguist
# Set aliases
@aliases = [default_alias_name] + (attributes[:aliases] || [])
# Lookup Lexer object
@lexer = Pygments::Lexer.find_by_name(attributes[:lexer] || name) ||
raise(ArgumentError, "#{@name} is missing lexer")
# Load the TextMate scope name or try to guess one
@tm_scope = attributes[:tm_scope] || begin
context = case @type
when :data, :markup, :prose
@@ -421,11 +431,6 @@ module Linguist
# Returns the extensions Array
attr_reader :filenames
# Public: Return all possible extensions for language
def all_extensions
  # Append the primary extension to the list, then drop any duplicate in
  # case the list already contained it.
  combined = extensions + [primary_extension]
  combined.uniq
end
# Deprecated: Get primary extension
#
# Defaults to the first extension but can be overridden
@@ -533,8 +538,8 @@ module Linguist
languages_yml = File.expand_path("../languages.yml", __FILE__)
languages_json = File.expand_path("../languages.json", __FILE__)
if File.exist?(languages_json) && defined?(JSON)
languages = JSON.load(File.read(languages_json))
if File.exist?(languages_json) && defined?(Yajl)
languages = Yajl.load(File.read(languages_json))
else
languages = YAML.load_file(languages_yml)
end
@@ -583,9 +588,9 @@ module Linguist
:ace_mode => options['ace_mode'],
:wrap => options['wrap'],
:group_name => options['group'],
:searchable => options.key?('searchable') ? options['searchable'] : true,
:searchable => options.fetch('searchable', true),
:search_term => options['search_term'],
:extensions => [options['extensions'].first] + options['extensions'][1..-1].sort,
:extensions => Array(options['extensions']),
:interpreters => options['interpreters'].sort,
:filenames => options['filenames'],
:popular => popular.include?(name)

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
begin
require 'json'
require 'yajl'
rescue LoadError
require 'yaml'
end
@@ -19,7 +19,7 @@ module Linguist
# Hash of serialized samples object
def self.cache
@cache ||= begin
serializer = defined?(JSON) ? JSON : YAML
serializer = defined?(Yajl) ? Yajl : YAML
serializer.load(File.read(PATH))
end
end

View File

@@ -36,15 +36,16 @@
# Go dependencies
- Godeps/_workspace/
# Bootstrap minified css and js
- (^|/)bootstrap([^.]*)(\.min)?\.(js|css)$
# Minified JavaScript and CSS
- (\.|-)min\.(js|css)$
# Bootstrap css and js
- (^|/)bootstrap([^.]*)\.(js|css)$
# Font Awesome
- font-awesome.min.css
- font-awesome.css
# Foundation css
- foundation.min.css
- foundation.css
# Normalize.css
@@ -56,7 +57,6 @@
# Animate.css
- animate.css
- animate.min.css
# Vendored dependencies
- third[-_]?party/
@@ -73,12 +73,12 @@
## Commonly Bundled JavaScript frameworks ##
# jQuery
- (^|/)jquery([^.]*)(\.min)?\.js$
- (^|/)jquery\-\d\.\d+(\.\d+)?(\.min)?\.js$
- (^|/)jquery([^.]*)\.js$
- (^|/)jquery\-\d\.\d+(\.\d+)?\.js$
# jQuery UI
- (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?(\.min)?\.(js|css)$
- (^|/)jquery\.(ui|effects)\.([^.]*)(\.min)?\.(js|css)$
- (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$
- (^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$
# Prototype
- (^|/)prototype(.*)\.js$
@@ -110,27 +110,32 @@
# MathJax
- (^|/)MathJax/
# Chart.js
- (^|/)Chart\.js$
# Codemirror
- (^|/)[Cc]ode[Mm]irror/(lib|mode|theme|addon|keymap)
# SyntaxHighlighter - http://alexgorbatchev.com/
- (^|/)shBrush([^.]*)\.js$
- (^|/)shCore\.js$
- (^|/)shLegacy\.js$
# AngularJS
- (^|/)angular([^.]*)(\.min)?\.js$
- (^|/)angular([^.]*)\.js$
# D3.js
- (^|\/)d3(\.v\d+)?([^.]*)(\.min)?\.js$
- (^|\/)d3(\.v\d+)?([^.]*)\.js$
# React
- (^|/)react(-[^.]*)?(\.min)?\.js$
- (^|/)react(-[^.]*)?\.js$
# Modernizr
- (^|/)modernizr\-\d\.\d+(\.\d+)?(\.min)?\.js$
- (^|/)modernizr\-\d\.\d+(\.\d+)?\.js$
- (^|/)modernizr\.custom\.\d+\.js$
# Knockout
- (^|/)knockout-(\d+\.){3}(debug\.)?js$
- knockout-min.js
## Python ##
@@ -168,8 +173,8 @@
- \.intellisense\.js$
# jQuery validation plugin (MS bundles this with asp.net mvc)
- (^|/)jquery([^.]*)\.validate(\.unobtrusive)?(\.min)?\.js$
- (^|/)jquery([^.]*)\.unobtrusive\-ajax(\.min)?\.js$
- (^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$
- (^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$
# Microsoft Ajax
- (^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$
@@ -196,7 +201,7 @@
- (^|/)extjs/welcome/
# Html5shiv
- (^|/)html5shiv(\.min)?\.js$
- (^|/)html5shiv\.js$
# Samples folders
- ^[Ss]amples/
@@ -215,8 +220,8 @@
- ^[Tt]est/fixtures/
# PhoneGap/Cordova
- (^|/)cordova([^.]*)(\.min)?\.js$
- (^|/)cordova\-\d\.\d(\.\d)?(\.min)?\.js$
- (^|/)cordova([^.]*)\.js$
- (^|/)cordova\-\d\.\d(\.\d)?\.js$
# Foundation js
- foundation(\..*)?\.js$
@@ -236,7 +241,6 @@
# Octicons
- octicons.css
- octicons.min.css
- sprockets-octicons.scss
# Typesafe Activator

View File

@@ -1,3 +1,3 @@
module Linguist
VERSION = "3.4.1"
VERSION = "4.0.3"
end