Merge branch 'master' into combine-gems

This commit is contained in:
Arfon Smith
2016-03-09 06:25:35 -06:00
744 changed files with 154027 additions and 57364 deletions

73
lib/linguist/blob.rb Normal file
View File

@@ -0,0 +1,73 @@
require 'linguist/blob_helper'
module Linguist
# A Blob is a wrapper around the content of a file to make it quack
# like a Grit::Blob. It provides the basic interface: `name`,
# `data`, `path` and `size`.
class Blob
include BlobHelper
# Public: Initialize a new Blob.
#
# path - A path String (does not necessarily exists on the file system).
# content - Content of the file.
#
# Returns a Blob.
def initialize(path, content)
@path = path
@content = content
end
# Public: Filename
#
# Examples
#
# Blob.new("/path/to/linguist/lib/linguist.rb", "").path
# # => "/path/to/linguist/lib/linguist.rb"
#
# Returns a String
attr_reader :path
# Public: File name
#
# Returns a String
def name
File.basename(@path)
end
# Public: File contents.
#
# Returns a String.
def data
@content
end
# Public: Get byte size
#
# Returns an Integer.
def size
@content.bytesize
end
# Public: Get file extension.
#
# Returns a String.
def extension
extensions.last || ""
end
# Public: Return an array of the file extensions
#
# >> Linguist::Blob.new("app/views/things/index.html.erb").extensions
# => [".html.erb", ".erb"]
#
# Returns an Array
def extensions
basename, *segments = name.downcase.split(".")
segments.map.with_index do |segment, index|
"." + segments[index..-1].join(".")
end
end
end
end

View File

@@ -99,7 +99,7 @@ module Linguist
elsif name.nil?
"attachment"
else
"attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
"attachment; filename=#{EscapeUtils.escape_url(name)}"
end
end
@@ -233,7 +233,22 @@ module Linguist
#
# Return true or false
def vendored?
name =~ VendoredRegexp ? true : false
path =~ VendoredRegexp ? true : false
end
documentation_paths = YAML.load_file(File.expand_path("../documentation.yml", __FILE__))
DocumentationRegexp = Regexp.new(documentation_paths.join('|'))
# Public: Is the blob in a documentation directory?
#
# Documentation files are ignored by language statistics.
#
# See "documentation.yml" for a list of documentation conventions that match
# this pattern.
#
# Return true or false
def documentation?
path =~ DocumentationRegexp ? true : false
end
# Public: Get each line of data
@@ -301,7 +316,7 @@ module Linguist
#
# Return true or false
def generated?
@_generated ||= Generated.generated?(name, lambda { data })
@_generated ||= Generated.generated?(path, lambda { data })
end
# Public: Detects the Language of the blob.
@@ -317,5 +332,15 @@ module Linguist
def tm_scope
language && language.tm_scope
end
DETECTABLE_TYPES = [:programming, :markup].freeze
# Internal: Should this blob be included in repository language statistics?
def include_in_language_stats?
!vendored? &&
!documentation? &&
!generated? &&
language && DETECTABLE_TYPES.include?(language.type)
end
end
end

View File

@@ -0,0 +1,30 @@
# Documentation files and directories are excluded from language
# statistics.
#
# Lines in this file are Regexps that are matched against the file
# pathname.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_documentation` if you make any changes.
## Documentation directories ##
- ^docs?/
- (^|/)[Dd]ocumentation/
- (^|/)javadoc/
- ^man/
- ^[Ee]xamples/
## Documentation files ##
- (^|/)CHANGE(S|LOG)?(\.|$)
- (^|/)CONTRIBUTING(\.|$)
- (^|/)COPYING(\.|$)
- (^|/)INSTALL(\.|$)
- (^|/)LICEN[CS]E(\.|$)
- (^|/)[Ll]icen[cs]e(\.|$)
- (^|/)README(\.|$)
- (^|/)[Rr]eadme(\.|$)
# Samples folders
- ^[Ss]amples/

View File

@@ -1,10 +1,11 @@
require 'linguist/blob_helper'
require 'linguist/blob'
module Linguist
# A FileBlob is a wrapper around a File object to make it quack
# like a Grit::Blob. It provides the basic interface: `name`,
# `data`, and `size`.
class FileBlob
# `data`, `path` and `size`.
class FileBlob < Blob
include BlobHelper
# Public: Initialize a new FileBlob from a path
@@ -14,64 +15,29 @@ module Linguist
#
# Returns a FileBlob.
def initialize(path, base_path = nil)
@path = path
@name = base_path ? path.sub("#{base_path}/", '') : path
@fullpath = path
@path = base_path ? path.sub("#{base_path}/", '') : path
end
# Public: Filename
#
# Examples
#
# FileBlob.new("/path/to/linguist/lib/linguist.rb").name
# # => "/path/to/linguist/lib/linguist.rb"
#
# FileBlob.new("/path/to/linguist/lib/linguist.rb",
# "/path/to/linguist").name
# # => "lib/linguist.rb"
#
# Returns a String
attr_reader :name
# Public: Read file permissions
#
# Returns a String like '100644'
def mode
File.stat(@path).mode.to_s(8)
File.stat(@fullpath).mode.to_s(8)
end
# Public: Read file contents.
#
# Returns a String.
def data
File.read(@path)
File.read(@fullpath)
end
# Public: Get byte size
#
# Returns an Integer.
def size
File.size(@path)
end
# Public: Get file extension.
#
# Returns a String.
def extension
extensions.last || ""
end
# Public: Return an array of the file extensions
#
# >> Linguist::FileBlob.new("app/views/things/index.html.erb").extensions
# => [".html.erb", ".erb"]
#
# Returns an Array
def extensions
basename, *segments = File.basename(name).split(".")
segments.map.with_index do |segment, index|
"." + segments[index..-1].join(".")
end
File.size(@fullpath)
end
end
end

View File

@@ -53,18 +53,28 @@ module Linguist
def generated?
xcode_file? ||
generated_net_designer_file? ||
generated_net_specflow_feature_file? ||
composer_lock? ||
node_modules? ||
godeps? ||
generated_by_zephir? ||
minified_files? ||
source_map? ||
compiled_coffeescript? ||
generated_parser? ||
generated_net_docfile? ||
generated_postscript? ||
compiled_cython_file? ||
generated_go? ||
generated_protocol_buffer? ||
generated_apache_thrift? ||
generated_jni_header? ||
vcr_cassette?
vcr_cassette? ||
generated_module? ||
generated_unity3d_meta? ||
generated_racc? ||
generated_jflex? ||
generated_grammarkit?
end
# Internal: Is the blob an Xcode file?
@@ -94,6 +104,20 @@ module Linguist
end
end
# Internal: Is the blob a generated source map?
#
# Source Maps usually have .css.map or .js.map extensions. In case they
# are not following the name convention, detect them based on the content.
#
# Returns true or false.
def source_map?
return false unless extname.downcase == '.map'
name =~ /(\.css|\.js)\.map$/i || # Name convention
lines[0] =~ /^{"version":\d+,/ || # Revision 2 and later begin with the version number
lines[0] =~ /^\/\*\* Begin line maps\. \*\*\/{/ # Revision 1 begins with a magic comment
end
# Internal: Is the blob of JS generated by CoffeeScript?
#
# CoffeeScript is meant to output JS that would be difficult to
@@ -162,6 +186,17 @@ module Linguist
name.downcase =~ /\.designer\.cs$/
end
# Internal: Is this a codegen file for Specflow feature file?
#
# Visual Studio's SpecFlow extension generates *.feature.cs files
# from *.feature files, they are not meant to be consumed by humans.
# Let's hide them.
#
# Returns true or false
def generated_net_specflow_feature_file?
name.downcase =~ /\.feature\.cs$/
end
# Internal: Is the blob of JS a parser generated by PEG.js?
#
# PEG.js-generated parsers are not meant to be consumed by humans.
@@ -202,17 +237,38 @@ module Linguist
creator.include?("ImageMagick")
end
def generated_go?
return false unless extname == '.go'
return false unless lines.count > 1
return lines[0].include?("Code generated by")
end
PROTOBUF_EXTENSIONS = ['.py', '.java', '.h', '.cc', '.cpp']
# Internal: Is the blob a C++, Java or Python source file generated by the
# Protocol Buffer compiler?
#
# Returns true of false.
def generated_protocol_buffer?
return false unless ['.py', '.java', '.h', '.cc', '.cpp'].include?(extname)
return false unless PROTOBUF_EXTENSIONS.include?(extname)
return false unless lines.count > 1
return lines[0].include?("Generated by the protocol buffer compiler. DO NOT EDIT!")
end
APACHE_THRIFT_EXTENSIONS = ['.rb', '.py', '.go', '.js', '.m', '.java', '.h', '.cc', '.cpp']
# Internal: Is the blob generated by Apache Thrift compiler?
#
# Returns true or false
def generated_apache_thrift?
return false unless APACHE_THRIFT_EXTENSIONS.include?(extname)
return false unless lines.count > 1
return lines[0].include?("Autogenerated by Thrift Compiler") || lines[1].include?("Autogenerated by Thrift Compiler")
end
# Internal: Is the blob a C/C++ header generated by the Java JNI tool javah?
#
# Returns true of false.
@@ -262,5 +318,89 @@ module Linguist
# VCR Cassettes have "recorded_with: VCR" in the second last line.
return lines[-2].include?("recorded_with: VCR")
end
# Internal: Is this a compiled C/C++ file from Cython?
#
# Cython-compiled C/C++ files typically contain:
# /* Generated by Cython x.x.x on ... */
# on the first line.
#
# Return true or false
def compiled_cython_file?
return false unless ['.c', '.cpp'].include? extname
return false unless lines.count > 1
return lines[0].include?("Generated by Cython")
end
# Internal: Is it a KiCAD or GFortran module file?
#
# KiCAD module files contain:
# PCBNEW-LibModule-V1 yyyy-mm-dd h:mm:ss XM
# on the first line.
#
# GFortran module files contain:
# GFORTRAN module version 'x' created from
# on the first line.
#
# Return true of false
def generated_module?
return false unless extname == '.mod'
return false unless lines.count > 1
return lines[0].include?("PCBNEW-LibModule-V") ||
lines[0].include?("GFORTRAN module version '")
end
# Internal: Is this a metadata file from Unity3D?
#
# Unity3D Meta files start with:
# fileFormatVersion: X
# guid: XXXXXXXXXXXXXXX
#
# Return true or false
def generated_unity3d_meta?
return false unless extname == '.meta'
return false unless lines.count > 1
return lines[0].include?("fileFormatVersion: ")
end
# Internal: Is this a Racc-generated file?
#
# A Racc-generated file contains:
# # This file is automatically generated by Racc x.y.z
# on the third line.
#
# Return true or false
def generated_racc?
return false unless extname == '.rb'
return false unless lines.count > 2
return lines[2].start_with?("# This file is automatically generated by Racc")
end
# Internal: Is this a JFlex-generated file?
#
# A JFlex-generated file contains:
# /* The following code was generated by JFlex x.y.z on d/at/e ti:me */
# on the first line.
#
# Return true or false
def generated_jflex?
return false unless extname == '.java'
return false unless lines.count > 1
return lines[0].start_with?("/* The following code was generated by JFlex ")
end
# Internal: Is this a GrammarKit-generated file?
#
# A GrammarKit-generated file typically contain:
# // This is a generated file. Not intended for manual editing.
# on the first line. This is not always the case, as it's possible to
# customize the class header.
#
# Return true or false
def generated_grammarkit?
return false unless extname == '.java'
return false unless lines.count > 1
return lines[0].start_with?("// This is a generated file. Not intended for manual editing.")
end
end
end

View File

@@ -13,11 +13,14 @@ module Linguist
# ])
#
# Returns an Array of languages, or empty if none matched or were inconclusive.
def self.call(blob, languages)
def self.call(blob, candidates)
data = blob.data
@heuristics.each do |heuristic|
return Array(heuristic.call(data)) if heuristic.matches?(languages)
if heuristic.matches?(blob.name)
languages = Array(heuristic.call(data))
return languages if languages.any? || languages.all? { |l| candidates.include?(l) }
end
end
[] # No heuristics matched
@@ -30,30 +33,31 @@ module Linguist
#
# Examples
#
# disambiguate "Perl", "Prolog" do |data|
# disambiguate ".pm" do |data|
# if data.include?("use strict")
# Language["Perl"]
# elsif data.include?(":-")
# elsif /^[^#]+:-/.match(data)
# Language["Prolog"]
# end
# end
#
def self.disambiguate(*languages, &heuristic)
@heuristics << new(languages, &heuristic)
def self.disambiguate(*extensions, &heuristic)
@heuristics << new(extensions, &heuristic)
end
# Internal: Array of defined heuristics
@heuristics = []
# Internal
def initialize(languages, &heuristic)
@languages = languages
def initialize(extensions, &heuristic)
@extensions = extensions
@heuristic = heuristic
end
# Internal: Check if this heuristic matches the candidate languages.
def matches?(candidates)
candidates.any? && candidates.all? { |l| @languages.include?(l.name) }
def matches?(filename)
filename = filename.downcase
@extensions.any? { |ext| filename.end_with?(ext) }
end
# Internal: Perform the heuristic
@@ -61,7 +65,20 @@ module Linguist
@heuristic.call(data)
end
disambiguate "BitBake", "BlitzBasic" do |data|
# Common heuristics
ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
disambiguate ".asc" do |data|
if /^(----[- ]BEGIN|ssh-(rsa|dss)) /.match(data)
Language["Public Key"]
elsif /^[=-]+(\s|\n)|{{[A-Za-z]/.match(data)
Language["AsciiDoc"]
elsif /^(\/\/.+|((import|export)\s+)?(function|int|float|char)\s+((room|repeatedly|on|game)_)?([A-Za-z]+[A-Za-z_0-9]+)\s*[;\(])/.match(data)
Language["AGS Script"]
end
end
disambiguate ".bb" do |data|
if /^\s*; /.match(data) || data.include?("End Function")
Language["BlitzBasic"]
elsif /^\s*(# |include|require)\b/.match(data)
@@ -69,43 +86,14 @@ module Linguist
end
end
disambiguate "Objective-C", "C++", "C" do |data|
if (/^[ \t]*@(interface|class|protocol|property|end|synchronised|selector|implementation)\b/.match(data))
Language["Objective-C"]
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
/^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
Language["C++"]
disambiguate ".ch" do |data|
if /^\s*#\s*(if|ifdef|ifndef|define|command|xcommand|translate|xtranslate|include|pragma|undef)\b/i.match(data)
Language["xBase"]
end
end
disambiguate "Perl", "Perl6", "Prolog" do |data|
if data.include?("use v6")
Language["Perl6"]
elsif data.include?("use strict")
Language["Perl"]
elsif data.include?(":-")
Language["Prolog"]
end
end
disambiguate "ECL", "Prolog" do |data|
if data.include?(":-")
Language["Prolog"]
elsif data.include?(":=")
Language["ECL"]
end
end
disambiguate "IDL", "Prolog" do |data|
if data.include?(":-")
Language["Prolog"]
else
Language["IDL"]
end
end
disambiguate "Common Lisp", "OpenCL", "Cool" do |data|
if data.include?("(defun ")
disambiguate ".cl" do |data|
if /^\s*\((defun|in-package|defpackage) /i.match(data)
Language["Common Lisp"]
elsif /^class/x.match(data)
Language["Cool"]
@@ -114,49 +102,88 @@ module Linguist
end
end
disambiguate "Hack", "PHP" do |data|
if data.include?("<?hh")
Language["Hack"]
elsif /<?[^h]/.match(data)
Language["PHP"]
disambiguate ".cs" do |data|
if /![\w\s]+methodsFor: /.match(data)
Language["Smalltalk"]
elsif /^\s*namespace\s*[\w\.]+\s*{/.match(data) || /^\s*\/\//.match(data)
Language["C#"]
end
end
disambiguate "Scala", "SuperCollider" do |data|
if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
Language["SuperCollider"]
elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
Language["Scala"]
disambiguate ".d" do |data|
if /^module /.match(data)
Language["D"]
elsif /^((dtrace:::)?BEGIN|provider |#pragma (D (option|attributes)|ident)\s)/.match(data)
Language["DTrace"]
elsif /(\/.*:( .* \\)$| : \\$|^ : |: \\$)/.match(data)
Language["Makefile"]
end
end
disambiguate "AsciiDoc", "AGS Script" do |data|
Language["AsciiDoc"] if /^=+(\s|\n)/.match(data)
disambiguate ".ecl" do |data|
if /^[^#]+:-/.match(data)
Language["ECLiPSe"]
elsif data.include?(":=")
Language["ECL"]
end
end
disambiguate "FORTRAN", "Forth" do |data|
disambiguate ".for", ".f" do |data|
if /^: /.match(data)
Language["Forth"]
elsif /^([c*][^a-z]| (subroutine|program)\s|!)/i.match(data)
elsif /^([c*][^abd-z]| (subroutine|program|end)\s|\s*!)/i.match(data)
Language["FORTRAN"]
end
end
disambiguate "F#", "Forth", "GLSL" do |data|
disambiguate ".fr" do |data|
if /^(: |also |new-device|previous )/.match(data)
Language["Forth"]
elsif /^\s*(import|module|package|data|type) /.match(data)
Language["Frege"]
else
Language["Text"]
end
end
disambiguate ".fs" do |data|
if /^(: |new-device)/.match(data)
Language["Forth"]
elsif /^\s*(#light|import|let|module|namespace|open|type)/.match(data)
Language["F#"]
elsif /^\s*(#include|#pragma|precision|uniform|varying|void)/.match(data)
elsif /^\s*(#version|precision|uniform|varying|vec[234])/.match(data)
Language["GLSL"]
elsif /#include|#pragma\s+(rs|version)|__attribute__/.match(data)
Language["Filterscript"]
end
end
disambiguate "Gosu", "JavaScript" do |data|
disambiguate ".gs" do |data|
Language["Gosu"] if /^uses java\./.match(data)
end
disambiguate "LoomScript", "LiveScript" do |data|
disambiguate ".h" do |data|
if ObjectiveCRegex.match(data)
Language["Objective-C"]
elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
/^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
Language["C++"]
end
end
disambiguate ".l" do |data|
if /\(def(un|macro)\s/.match(data)
Language["Common Lisp"]
elsif /^(%[%{}]xs|<.*>)/.match(data)
Language["Lex"]
elsif /^\.[a-z][a-z](\s|$)/i.match(data)
Language["Groff"]
elsif /^\((de|class|rel|code|data|must)\s/.match(data)
Language["PicoLisp"]
end
end
disambiguate ".ls" do |data|
if /^\s*package\s*[\w\.\/\*\s]*\s*{/.match(data)
Language["LoomScript"]
else
@@ -164,7 +191,177 @@ module Linguist
end
end
disambiguate "TypeScript", "XML" do |data|
disambiguate ".lsp", ".lisp" do |data|
if /^\s*\((defun|in-package|defpackage) /i.match(data)
Language["Common Lisp"]
elsif /^\s*\(define /.match(data)
Language["NewLisp"]
end
end
disambiguate ".m" do |data|
if ObjectiveCRegex.match(data)
Language["Objective-C"]
elsif data.include?(":- module")
Language["Mercury"]
elsif /^: /.match(data)
Language["MUF"]
elsif /^\s*;/.match(data)
Language["M"]
elsif /^\s*\(\*/.match(data)
Language["Mathematica"]
elsif /^\s*%/.match(data)
Language["Matlab"]
elsif /^\w+\s*:\s*module\s*{/.match(data)
Language["Limbo"]
end
end
disambiguate ".ml" do |data|
if /(^\s*module)|let rec |match\s+(\S+\s)+with/.match(data)
Language["OCaml"]
elsif /=> |case\s+(\S+\s)+of/.match(data)
Language["Standard ML"]
end
end
disambiguate ".mod" do |data|
if data.include?('<!ENTITY ')
Language["XML"]
elsif /MODULE\s\w+\s*;/i.match(data) || /^\s*END \w+;$/i.match(data)
Language["Modula-2"]
else
[Language["Linux Kernel Module"], Language["AMPL"]]
end
end
disambiguate ".ms" do |data|
if /^[.'][a-z][a-z](\s|$)/i.match(data)
Language["Groff"]
elsif /(?<!\S)\.(include|globa?l)\s/.match(data) || /(?<!\/\*)(\A|\n)\s*\.[A-Za-z]/.match(data.gsub(/"([^\\"]|\\.)*"|'([^\\']|\\.)*'|\\\s*(?:--.*)?\n/, ""))
Language["GAS"]
else
Language["MAXScript"]
end
end
disambiguate ".n" do |data|
if /^[.']/.match(data)
Language["Groff"]
elsif /^(module|namespace|using)\s/.match(data)
Language["Nemerle"]
end
end
disambiguate ".ncl" do |data|
if data.include?("THE_TITLE")
Language["Text"]
end
end
disambiguate ".nl" do |data|
if /^(b|g)[0-9]+ /.match(data)
Language["NL"]
else
Language["NewLisp"]
end
end
disambiguate ".php" do |data|
if data.include?("<?hh")
Language["Hack"]
elsif /<?[^h]/.match(data)
Language["PHP"]
end
end
disambiguate ".pl" do |data|
if /^[^#]+:-/.match(data)
Language["Prolog"]
elsif /use strict|use\s+v?5\./.match(data)
Language["Perl"]
elsif /^(use v6|(my )?class|module)/.match(data)
Language["Perl6"]
end
end
disambiguate ".pm", ".t" do |data|
if /use strict|use\s+v?5\./.match(data)
Language["Perl"]
elsif /^(use v6|(my )?class|module)/.match(data)
Language["Perl6"]
end
end
disambiguate ".pod" do |data|
if /^=\w+$/.match(data)
Language["Pod"]
else
Language["Perl"]
end
end
disambiguate ".pro" do |data|
if /^[^#]+:-/.match(data)
Language["Prolog"]
elsif data.include?("last_client=")
Language["INI"]
elsif data.include?("HEADERS") && data.include?("SOURCES")
Language["QMake"]
elsif /^\s*function[ \w,]+$/.match(data)
Language["IDL"]
end
end
disambiguate ".r" do |data|
if /\bRebol\b/i.match(data)
Language["Rebol"]
elsif data.include?("<-")
Language["R"]
end
end
disambiguate ".rpy" do |data|
if /(^(import|from|class|def)[\s\S])/m.match(data)
Language["Python"]
else
Language["Ren'Py"]
end
end
disambiguate ".rs" do |data|
if /^(use |fn |mod |pub |macro_rules|impl|#!?\[)/.match(data)
Language["Rust"]
elsif /#include|#pragma\s+(rs|version)|__attribute__/.match(data)
Language["RenderScript"]
end
end
disambiguate ".sc" do |data|
if /\^(this|super)\./.match(data) || /^\s*(\+|\*)\s*\w+\s*{/.match(data) || /^\s*~\w+\s*=\./.match(data)
Language["SuperCollider"]
elsif /^\s*import (scala|java)\./.match(data) || /^\s*val\s+\w+\s*=/.match(data) || /^\s*class\b/.match(data)
Language["Scala"]
end
end
disambiguate ".sql" do |data|
if /^\\i\b|AS \$\$|LANGUAGE '+plpgsql'+/i.match(data) || /SECURITY (DEFINER|INVOKER)/i.match(data) || /BEGIN( WORK| TRANSACTION)?;/i.match(data)
#Postgres
Language["PLpgSQL"]
elsif /(alter module)|(language sql)|(begin( NOT)+ atomic)/i.match(data) || /signal SQLSTATE '[0-9]+'/i.match(data)
#IBM db2
Language["SQLPL"]
elsif /pragma|\$\$PLSQL_|XMLTYPE|sysdate|systimestamp|\.nextval|connect by|AUTHID (DEFINER|CURRENT_USER)/i.match(data) || /constructor\W+function/i.match(data)
#Oracle
Language["PLSQL"]
elsif ! /begin|boolean|package|exception/i.match(data)
#Generic SQL
Language["SQL"]
end
end
disambiguate ".ts" do |data|
if data.include?("<TS ")
Language["XML"]
else
@@ -172,13 +369,12 @@ module Linguist
end
end
disambiguate "Frege", "Forth", "Text" do |data|
if /^(: |also |new-device|previous )/.match(data)
Language["Forth"]
elsif /\s*(import|module|package|data|type) /.match(data)
Language["Frege"]
disambiguate ".tst" do |data|
if (data.include?("gap> "))
Language["GAP"]
# Heads up - we don't usually write heuristics like this (with no regex match)
else
Language["Text"]
Language["Scilab"]
end
end
end

View File

@@ -11,6 +11,7 @@ require 'linguist/samples'
require 'linguist/file_blob'
require 'linguist/blob_helper'
require 'linguist/strategy/filename'
require 'linguist/strategy/modeline'
require 'linguist/shebang'
module Linguist
@@ -31,13 +32,6 @@ module Linguist
# Valid Languages types
TYPES = [:data, :markup, :programming, :prose]
# Names of non-programming languages that we will still detect
#
# Returns an array
def self.detectable_markup
["CSS", "Less", "Sass", "SCSS", "Stylus", "TeX"]
end
# Detect languages by a specific type
#
# type - A symbol that exists within TYPES
@@ -79,7 +73,7 @@ module Linguist
raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
end
@extension_index[extension] << language
@extension_index[extension.downcase] << language
end
language.interpreters.each do |interpreter|
@@ -94,8 +88,9 @@ module Linguist
end
STRATEGIES = [
Linguist::Strategy::Filename,
Linguist::Strategy::Modeline,
Linguist::Shebang,
Linguist::Strategy::Filename,
Linguist::Heuristics,
Linguist::Classifier
]
@@ -110,19 +105,31 @@ module Linguist
# Bail early if the blob is binary or empty.
return nil if blob.likely_binary? || blob.binary? || blob.empty?
# Call each strategy until one candidate is returned.
STRATEGIES.reduce([]) do |languages, strategy|
candidates = strategy.call(blob, languages)
if candidates.size == 1
return candidates.first
elsif candidates.size > 1
# More than one candidate was found, pass them to the next strategy.
candidates
else
# No candiates were found, pass on languages from the previous strategy.
languages
Linguist.instrument("linguist.detection", :blob => blob) do
# Call each strategy until one candidate is returned.
languages = []
returning_strategy = nil
STRATEGIES.each do |strategy|
returning_strategy = strategy
candidates = Linguist.instrument("linguist.strategy", :blob => blob, :strategy => strategy, :candidates => languages) do
strategy.call(blob, languages)
end
if candidates.size == 1
languages = candidates
break
elsif candidates.size > 1
# More than one candidate was found, pass them to the next strategy.
languages = candidates
else
# No candidates, try the next strategy
end
end
end.first
Linguist.instrument("linguist.detected", :blob => blob, :strategy => returning_strategy, :language => languages.first)
languages.first
end
end
# Public: Get all Languages
@@ -143,7 +150,8 @@ module Linguist
#
# Returns the Language or nil if none was found.
def self.find_by_name(name)
name && @name_index[name.downcase]
return nil if name.to_s.empty?
name && (@name_index[name.downcase] || @name_index[name.split(',').first.downcase])
end
# Public: Look up Language by one of its aliases.
@@ -155,9 +163,10 @@ module Linguist
# Language.find_by_alias('cpp')
# # => #<Language name="C++">
#
# Returns the Lexer or nil if none was found.
# Returns the Language or nil if none was found.
def self.find_by_alias(name)
name && @alias_index[name.downcase]
return nil if name.to_s.empty?
name && (@alias_index[name.downcase] || @alias_index[name.split(',').first.downcase])
end
# Public: Look up Languages by filename.
@@ -196,7 +205,7 @@ module Linguist
# Returns all matching Languages or [] if none were found.
def self.find_by_extension(extname)
extname = ".#{extname}" unless extname.start_with?(".")
@extension_index[extname]
@extension_index[extname.downcase]
end
# DEPRECATED
@@ -219,7 +228,7 @@ module Linguist
end
# Public: Look up Language by its name or lexer.
# Public: Look up Language by its name.
#
# name - The String name of the Language
#
@@ -233,7 +242,8 @@ module Linguist
#
# Returns the Language or nil if none was found.
def self.[](name)
name && @index[name.downcase]
return nil if name.to_s.empty?
name && (@index[name.downcase] || @index[name.split(',').first.downcase])
end
# Public: A List of popular languages
@@ -243,7 +253,7 @@ module Linguist
#
# This list is configured in "popular.yml".
#
# Returns an Array of Lexers.
# Returns an Array of Languages.
def self.popular
@popular ||= all.select(&:popular?).sort_by { |lang| lang.name.downcase }
end
@@ -255,7 +265,7 @@ module Linguist
#
# This list is created from all the languages not listed in "popular.yml".
#
# Returns an Array of Lexers.
# Returns an Array of Languages.
def self.unpopular
@unpopular ||= all.select(&:unpopular?).sort_by { |lang| lang.name.downcase }
end
@@ -375,11 +385,6 @@ module Linguist
# Returns the name String
attr_reader :search_term
# Public: Get Lexer
#
# Returns the Lexer
attr_reader :lexer
# Public: Get the name of a TextMate-compatible scope
#
# Returns the scope
@@ -495,16 +500,6 @@ module Linguist
@searchable
end
# Public: Highlight syntax of text
#
# text - String of code to be highlighted
# options - A Hash of options (defaults to {})
#
# Returns html String
def colorize(text, options = {})
lexer.highlight(text, options)
end
# Public: Return name as String representation
def to_s
name
@@ -548,8 +543,8 @@ module Linguist
if extnames = extensions[name]
extnames.each do |extname|
if !options['extensions'].index { |x| x.end_with? extname }
warn "#{name} has a sample with extension (#{extname}) that isn't explicitly defined in languages.yml" unless extname == '.script!'
if !options['extensions'].index { |x| x.downcase.end_with? extname.downcase }
warn "#{name} has a sample with extension (#{extname.downcase}) that isn't explicitly defined in languages.yml"
options['extensions'] << extname
end
end
@@ -580,7 +575,6 @@ module Linguist
:color => options['color'],
:type => options['type'],
:aliases => options['aliases'],
:lexer => options['lexer'],
:tm_scope => options['tm_scope'],
:ace_mode => options['ace_mode'],
:wrap => options['wrap'],

1116
lib/linguist/languages.yml Normal file → Executable file

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,11 @@ require 'rugged'
module Linguist
class LazyBlob
GIT_ATTR = ['linguist-language', 'linguist-vendored']
GIT_ATTR = ['linguist-documentation',
'linguist-language',
'linguist-vendored',
'linguist-generated']
GIT_ATTR_OPTS = { :priority => [:index], :skip_system => true }
GIT_ATTR_FLAGS = Rugged::Repository::Attributes.parse_opts(GIT_ATTR_OPTS)
@@ -14,13 +18,15 @@ module Linguist
attr_reader :repository
attr_reader :oid
attr_reader :name
attr_reader :path
attr_reader :mode
def initialize(repo, oid, name, mode = nil)
alias :name :path
def initialize(repo, oid, path, mode = nil)
@repository = repo
@oid = oid
@name = name
@path = path
@mode = mode
end
@@ -29,11 +35,27 @@ module Linguist
name, GIT_ATTR, GIT_ATTR_FLAGS)
end
def documentation?
if attr = git_attributes['linguist-documentation']
boolean_attribute(attr)
else
super
end
end
def generated?
if attr = git_attributes['linguist-generated']
boolean_attribute(attr)
else
super
end
end
def vendored?
if attr = git_attributes['linguist-vendored']
return boolean_attribute(attr)
else
return super
super
end
end
@@ -41,7 +63,7 @@ module Linguist
return @language if defined?(@language)
@language = if lang = git_attributes['linguist-language']
Language.find_by_name(lang)
Language.find_by_alias(lang)
else
super
end
@@ -57,11 +79,15 @@ module Linguist
@size
end
def cleanup!
@data.clear if @data
end
protected
# Returns true if the attribute is present and not the string "false".
def boolean_attribute(attr)
attr != "false"
def boolean_attribute(attribute)
attribute != "false"
end
def load_blob!

View File

@@ -9,21 +9,21 @@
- CSS
- Clojure
- CoffeeScript
- Common Lisp
- Diff
- Emacs Lisp
- Erlang
- Go
- HTML
- Haskell
- Java
- JavaScript
- Lua
- Matlab
- Objective-C
- PHP
- Perl
- Python
- R
- Ruby
- SQL
- Scala
- Scheme
- Shell
- Swift
- TeX
- VimL

View File

@@ -126,12 +126,13 @@ module Linguist
end
protected
MAX_TREE_SIZE = 100_000
def compute_stats(old_commit_oid, cache = nil)
return {} if current_tree.count_recursive(MAX_TREE_SIZE) >= MAX_TREE_SIZE
old_tree = old_commit_oid && Rugged::Commit.lookup(repository, old_commit_oid).tree
read_index
diff = Rugged::Tree.diff(repository, old_tree, current_tree)
# Clear file map and fetch full diff if any .gitattributes files are changed
@@ -150,19 +151,18 @@ module Linguist
next if delta.binary
if [:added, :modified].include? delta.status
# Skip submodules
# Skip submodules and symlinks
mode = delta.new_file[:mode]
next if (mode & 040000) != 0
mode_format = (mode & 0170000)
next if mode_format == 0120000 || mode_format == 040000 || mode_format == 0160000
blob = Linguist::LazyBlob.new(repository, delta.new_file[:oid], new, mode.to_s(8))
# Skip vendored or generated blobs
next if blob.vendored? || blob.generated? || blob.language.nil?
# Only include programming languages and acceptable markup languages
if blob.language.type == :programming || Language.detectable_markup.include?(blob.language.name)
if blob.include_in_language_stats?
file_map[new] = [blob.language.group.name, blob.size]
end
blob.cleanup!
end
end

View File

@@ -50,16 +50,13 @@ module Linguist
end
else
path = File.join(dirname, filename)
if File.extname(filename) == ""
raise "#{path} is missing an extension, maybe it belongs in filenames/ subdir"
end
extname = File.extname(filename)
yield({
:path => path,
:language => category,
:interpreter => Shebang.interpreter(File.read(path)),
:extname => File.extname(filename)
:extname => extname.empty? ? nil : extname
})
end
end

View File

@@ -23,17 +23,20 @@ module Linguist
# First line must start with #!
return unless shebang && shebang.start_with?("#!")
# Get the parts of the shebang without the #!
tokens = shebang.sub(/^#!\s*/, '').strip.split(' ')
s = StringScanner.new(shebang)
# There was nothing after the #!
return if tokens.empty?
return unless path = s.scan(/^#!\s*\S+/)
# Get the name of the interpreter
script = File.basename(tokens.first)
# Keep going
script = path.split('/').last
# Get next argument if interpreter was /usr/bin/env
script = tokens[1] if script == 'env'
# if /usr/bin/env type shebang then walk the string
if script == 'env'
s.scan(/\s+/)
s.scan(/.*=[^\s]+\s+/) # skip over variable arguments e.g. foo=bar
script = s.scan(/\S+/)
end
# Interpreter was /usr/bin/env with no arguments
return unless script
@@ -41,6 +44,9 @@ module Linguist
# "python2.6" -> "python2"
script.sub! /(\.\d+)$/, ''
# #! perl -> perl
script.sub! /^#!\s*/, ''
# Check for multiline shebang hacks that call `exec`
if script == 'sh' &&
data.lines.first(5).any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }

View File

@@ -3,17 +3,7 @@ module Linguist
# Detects language based on filename and/or extension
class Filename
def self.call(blob, _)
name = blob.name.to_s
# A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other
# languages that have shebang scripts.
extensions = FileBlob.new(name).extensions
if extensions.empty? && blob.mode && (blob.mode.to_i(8) & 05) == 05
name += ".script!"
end
Language.find_by_filename(name)
Language.find_by_filename(blob.name.to_s)
end
end
end

View File

@@ -0,0 +1,41 @@
module Linguist
module Strategy
class Modeline
EMACS_MODELINE = /-\*-\s*(?:(?!mode)[\w-]+\s*:\s*(?:[\w+-]+)\s*;?\s*)*(?:mode\s*:)?\s*([\w+-]+)\s*(?:;\s*(?!mode)[\w-]+\s*:\s*[\w+-]+\s*)*;?\s*-\*-/i
# First form vim modeline
# [text]{white}{vi:|vim:|ex:}[white]{options}
# ex: 'vim: syntax=ruby'
VIM_MODELINE_1 = /(?:vim|vi|ex):\s*(?:ft|filetype|syntax)=(\w+)\s?/i
# Second form vim modeline (compatible with some versions of Vi)
# [text]{white}{vi:|vim:|Vim:|ex:}[white]se[t] {options}:[text]
# ex: 'vim set syntax=ruby:'
VIM_MODELINE_2 = /(?:vim|vi|Vim|ex):\s*se(?:t)?.*\s(?:ft|filetype|syntax)=(\w+)\s?.*:/i
MODELINES = [EMACS_MODELINE, VIM_MODELINE_1, VIM_MODELINE_2]
# Public: Detects language based on Vim and Emacs modelines
#
# blob - An object that quacks like a blob.
#
# Examples
#
# Modeline.call(FileBlob.new("path/to/file"))
#
# Returns an Array with one Language if the blob has a Vim or Emacs modeline
# that matches a Language name or alias. Returns an empty array if no match.
def self.call(blob, _ = nil)
Array(Language.find_by_alias(modeline(blob.data)))
end
# Public: Get the modeline from the first n-lines of the file
#
# Returns a String or nil
def self.modeline(data)
match = MODELINES.map { |regex| data.match(regex) }.reject(&:nil?).first
match[1] if match
end
end
end
end

View File

@@ -22,8 +22,10 @@ module Linguist
# Start state on token, ignore anything till the next newline
SINGLE_LINE_COMMENTS = [
'//', # C
'--', # Ada, Haskell, AppleScript
'#', # Ruby
'%', # Tex
'"', # Vim
]
# Start state on opening token, ignore anything until the closing
@@ -33,7 +35,8 @@ module Linguist
['<!--', '-->'], # XML
['{-', '-}'], # Haskell
['(*', '*)'], # Coq
['"""', '"""'] # Python
['"""', '"""'], # Python
["'''", "'''"] # Python
]
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
@@ -83,17 +86,17 @@ module Linguist
if s.peek(1) == "\""
s.getch
else
s.skip_until(/[^\\]"/)
s.skip_until(/(?<!\\)"/)
end
elsif s.scan(/'/)
if s.peek(1) == "'"
s.getch
else
s.skip_until(/[^\\]'/)
s.skip_until(/(?<!\\)'/)
end
# Skip number literals
elsif s.scan(/(0x)?\d(\d|\.)*/)
elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
# SGML style brackets
elsif token = s.scan(/<[^\s<>][^<>]*>/)
@@ -129,6 +132,9 @@ module Linguist
# extract_shebang("#!/usr/bin/env node")
# # => "node"
#
# extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
# # => "awk"
#
# Returns String token or nil it couldn't be parsed.
def extract_shebang(data)
s = StringScanner.new(data)
@@ -137,6 +143,7 @@ module Linguist
script = path.split('/').last
if script == 'env'
s.scan(/\s+/)
s.scan(/.*=[^\s]+\s+/)
script = s.scan(/\S+/)
end
script = script[/[^\d]+/, 0] if script

View File

@@ -20,10 +20,20 @@
- ^deps/
- ^tools/
- (^|/)configure$
- (^|/)configure.ac$
- (^|/)config.guess$
- (^|/)config.sub$
# stuff autogenerated by autoconf - still C deps
- (^|/)aclocal.m4
- (^|/)libtool.m4
- (^|/)ltoptions.m4
- (^|/)ltsugar.m4
- (^|/)ltversion.m4
- (^|/)lt~obsolete.m4
# Linters
- cpplint.py
# Node dependencies
- node_modules/
@@ -40,30 +50,34 @@
# Minified JavaScript and CSS
- (\.|-)min\.(js|css)$
# Stylesheets imported from packages
- ([^\s]*)import\.(css|less|scss|styl)$
# Bootstrap css and js
- (^|/)bootstrap([^.]*)\.(js|css)$
- (^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$
- (^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$
# Font Awesome
- font-awesome.css
- (^|/)font-awesome\.(css|less|scss|styl)$
# Foundation css
- foundation.css
- (^|/)foundation\.(css|less|scss|styl)$
# Normalize.css
- normalize.css
- (^|/)normalize\.(css|less|scss|styl)$
# Bourbon SCSS
- (^|/)[Bb]ourbon/.*\.css$
- (^|/)[Bb]ourbon/.*\.scss$
# Bourbon css
- (^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$
# Animate.css
- animate.css
- (^|/)animate\.(css|less|scss|styl)$
# Vendored dependencies
- third[-_]?party/
- 3rd[-_]?party/
- vendors?/
- extern(al)?/
- (^|/)[Vv]+endor/
# Debian packaging
- ^debian/
@@ -71,6 +85,9 @@
# Haxelib projects often contain a neko bytecode file named run.n
- run.n$
# Bootstrap Datepicker
- bootstrap-datepicker/
## Commonly Bundled JavaScript frameworks ##
# jQuery
@@ -81,6 +98,34 @@
- (^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$
- (^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$
# jQuery Gantt
- jquery.fn.gantt.js
# jQuery fancyBox
- jquery.fancybox.(js|css)
# Fuel UX
- fuelux.js
# jQuery File Upload
- (^|/)jquery\.fileupload(-\w+)?\.js$
# Slick
- (^|/)slick\.\w+.js$
# Leaflet plugins
- (^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$
- leaflet.draw-src.js
- leaflet.draw.css
- Control.FullScreen.css
- Control.FullScreen.js
- leaflet.spin.js
- wicket-leaflet.js
# Sublime Text workspace files
- .sublime-project
- .sublime-workspace
# Prototype
- (^|/)prototype(.*)\.js$
- (^|/)effects\.js$
@@ -115,7 +160,7 @@
- (^|/)Chart\.js$
# Codemirror
- (^|/)[Cc]ode[Mm]irror/(lib|mode|theme|addon|keymap)
- (^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)
# SyntaxHighlighter - http://alexgorbatchev.com/
- (^|/)shBrush([^.]*)\.js$
@@ -140,6 +185,9 @@
## Python ##
# Sphinx
- (^|/)docs?/_?(build|themes?|templates?|static)/
# django
- (^|/)admin_media/
@@ -154,12 +202,31 @@
## Obj-C ##
# Xcode
- \.xctemplate/
- \.imageset/
# Carthage
- ^Carthage/
# Cocoapods
- ^Pods/
# Sparkle
- (^|/)Sparkle/
# Crashlytics
- Crashlytics.framework/
# Fabric
- Fabric.framework/
# git config files
- gitattributes$
- gitignore$
- gitmodules$
## Groovy ##
# Gradle
@@ -204,21 +271,9 @@
# Html5shiv
- (^|/)html5shiv\.js$
# Samples folders
- ^[Ss]amples/
# LICENSE, README, git config files
- ^COPYING$
- LICENSE$
- License$
- gitattributes$
- gitignore$
- gitmodules$
- ^README$
- ^readme$
# Test fixtures
- ^[Tt]est/fixtures/
- ^[Tt]ests?/fixtures/
- ^[Ss]pecs?/fixtures/
# PhoneGap/Cordova
- (^|/)cordova([^.]*)\.js$
@@ -230,7 +285,7 @@
# Vagrant
- ^Vagrantfile$
# .DS_Store's
# .DS_Stores
- .[Dd][Ss]_[Ss]tore$
# R packages
@@ -248,3 +303,12 @@
# ProGuard
- proguard.pro
- proguard-rules.pro
# PuPHPet
- ^puphpet/
# Android Google APIs
- (^|/)\.google_apis/
# Jenkins Pipeline
- ^Jenkinsfile$

View File

@@ -1,3 +1,3 @@
module Linguist
VERSION = "4.2.6"
VERSION = "4.7.6"
end