Merged with upstream. Updated M (aka MUMPS) detection to use the new bayesian / samples method.

This commit is contained in:
Laurent Parenteau
2013-03-14 11:33:09 -04:00
472 changed files with 179182 additions and 1762 deletions

View File

@@ -1,5 +1,5 @@
require 'linguist/blob_helper'
require 'linguist/generated'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
require 'linguist/repository'
require 'linguist/samples'

View File

@@ -1,9 +1,9 @@
require 'linguist/generated'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/pathname'
require 'charlock_holmes'
require 'escape_utils'
require 'mime/types'
require 'pygments'
require 'yaml'
@@ -11,13 +11,6 @@ module Linguist
# BlobHelper is a mixin for Blobish classes that respond to "name",
# "data" and "size" such as Grit::Blob.
module BlobHelper
# Internal: Get a Pathname wrapper for Blob#name
#
# Returns a Pathname.
def pathname
Pathname.new(name || "")
end
# Public: Get the extname of the path
#
# Examples
@@ -27,7 +20,23 @@ module Linguist
#
# Returns a String
def extname
pathname.extname
File.extname(name.to_s)
end
# Internal: Lookup mime type for extension.
#
# Returns a MIME::Type
def _mime_type
if defined? @_mime_type
@_mime_type
else
guesses = ::MIME::Types.type_for(extname.to_s)
# Prefer text mime types over binary
@_mime_type = guesses.detect { |type| type.ascii? } ||
# Otherwise use the first guess
guesses.first
end
end
# Public: Get the actual blob mime type
@@ -39,7 +48,23 @@ module Linguist
#
# Returns a mime type String.
def mime_type
@mime_type ||= pathname.mime_type
_mime_type ? _mime_type.to_s : 'text/plain'
end
# Internal: Is the blob binary according to its mime type
#
# Return true or false
def binary_mime_type?
_mime_type ? _mime_type.binary? : false
end
# Internal: Is the blob binary according to its mime type,
# overriding it if we have better data from the languages.yml
# database.
#
# Return true or false
def likely_binary?
binary_mime_type? and not Language.find_by_filename(name)
end
# Public: Get the Content-Type header value
@@ -71,7 +96,7 @@ module Linguist
elsif name.nil?
"attachment"
else
"attachment; filename=#{EscapeUtils.escape_url(pathname.basename)}"
"attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
end
end
@@ -90,15 +115,6 @@ module Linguist
@detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
end
# Public: Is the blob binary according to its mime type
#
# Return true or false
def binary_mime_type?
if mime_type = Mime.lookup_mime_type_for(pathname.extname)
mime_type.binary?
end
end
# Public: Is the blob binary?
#
# Return true or false
@@ -132,23 +148,14 @@ module Linguist
#
# Return true or false
def image?
['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
end
# Public: Is the blob a possible drupal php file?
# Public: Is the blob a supported 3D model format?
#
# Return true or false
def drupal_extname?
['.module', '.install', '.test', '.inc'].include?(extname)
end
# Public: Is the blob likely to have a shebang?
#
# Return true or false
def shebang_extname?
extname.empty? &&
mode &&
(mode.to_i(8) & 05) == 05
def solid?
extname.downcase == '.stl'
end
MEGABYTE = 1024 * 1024
@@ -169,7 +176,7 @@ module Linguist
#
# Return true or false
def safe_to_colorize?
text? && !large? && !high_ratio_of_long_lines?
!large? && text? && !high_ratio_of_long_lines?
end
# Internal: Does the blob have a ratio of long lines?
@@ -213,7 +220,31 @@ module Linguist
#
# Returns an Array of lines
def lines
@lines ||= (viewable? && data) ? data.split("\n", -1) : []
@lines ||=
if viewable? && data
data.split(line_split_character, -1)
else
[]
end
end
# Character used to split lines. This is almost always "\n" except when Mac
# Format is detected in which case it's "\r".
#
# Returns a split pattern string.
def line_split_character
@line_split_character ||= (mac_format?? "\r" : "\n")
end
# Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
# for line ends and does not include a \n (0x0a).
#
# Returns true when mac format is detected.
def mac_format?
return if !viewable?
if pos = data[0, 4096].index("\r")
data[pos + 1] != ?\n
end
end
# Public: Get number of lines of code
@@ -234,125 +265,16 @@ module Linguist
lines.grep(/\S/).size
end
# Internal: Compute average line length.
#
# Returns Integer.
def average_line_length
if lines.any?
lines.inject(0) { |n, l| n += l.length } / lines.length
else
0
end
end
# Public: Is the blob a generated file?
#
# Generated source code is supressed in diffs and is ignored by
# Generated source code is suppressed in diffs and is ignored by
# language statistics.
#
# Requires Blob#data
#
# Includes:
# - XCode project XML files
# - Minified JavaScript
#
# Please add additional test coverage to
# `test/test_blob.rb#test_generated` if you make any changes.
# May load Blob#data
#
# Return true or false
def generated?
if xcode_project_file? || generated_net_docfile?
true
elsif generated_coffeescript? || minified_javascript?
true
elsif name == 'Gemfile.lock'
true
else
false
end
end
# Internal: Is the blob an XCode project file?
#
# Generated if the file extension is an XCode project
# file extension.
#
# Returns true of false.
def xcode_project_file?
['.xib', '.nib', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
end
# Internal: Is the blob minified JS?
#
# Consider JS minified if the average line length is
# greater then 100c.
#
# Returns true or false.
def minified_javascript?
return unless extname == '.js'
average_line_length > 100
end
# Internal: Is the blob JS generated by CoffeeScript?
#
# Requires Blob#data
#
# CoffeScript is meant to output JS that would be difficult to
# tell if it was generated or not. Look for a number of patterns
# outputed by the CS compiler.
#
# Return true or false
def generated_coffeescript?
return unless extname == '.js'
# CoffeeScript generated by > 1.2 include a comment on the first line
if lines[0] =~ /^\/\/ Generated by /
return true
end
if lines[0] == '(function() {' && # First line is module closure opening
lines[-2] == '}).call(this);' && # Second to last line closes module closure
lines[-1] == '' # Last line is blank
score = 0
lines.each do |line|
if line =~ /var /
# Underscored temp vars are likely to be Coffee
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
# bind and extend functions are very Coffee specific
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
end
end
# Require a score of 3. This is fairly arbitrary. Consider
# tweaking later.
score >= 3
else
false
end
end
# Internal: Is this a generated documentation file for a .NET assembly?
#
# Requires Blob#data
#
# .NET developers often check in the XML Intellisense file along with an
# assembly - however, these don't have a special extension, so we have to
# dig into the contents to determine if it's a docfile. Luckily, these files
# are extremely structured, so recognizing them is easy.
#
# Returns true or false
def generated_net_docfile?
return false unless extname.downcase == ".xml"
return false unless lines.count > 3
# .NET Docfiles always open with <doc> and their first tag is an
# <assembly> tag
return lines[1].include?("<doc>") &&
lines[2].include?("<assembly>") &&
lines[-2].include?("</doc>")
@_generated ||= Generated.generated?(name, lambda { data })
end
# Public: Should the blob be indexed for searching?
@@ -360,7 +282,7 @@ module Linguist
# Excluded:
# - Files over 0.1MB
# - Non-text files
# - Langauges marked as not searchable
# - Languages marked as not searchable
# - Generated source files
#
# Please add additional test coverage to
@@ -368,16 +290,18 @@ module Linguist
#
# Return true or false
def indexable?
if binary?
if size > 100 * 1024
false
elsif binary?
false
elsif extname == '.txt'
true
elsif language.nil?
false
elsif !language.searchable?
false
elsif generated?
false
elsif size > 100 * 1024
false
else
true
end
@@ -389,33 +313,15 @@ module Linguist
#
# Returns a Language or nil if none is detected
def language
if defined? @language
@language
return @language if defined? @language
if defined?(@data) && @data.is_a?(String)
data = @data
else
@language = guess_language
data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
end
end
# Internal: Guess language
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you make any changes.
#
# Returns a Language or nil
def guess_language
return if binary_mime_type?
# Disambiguate between multiple language extensions
disambiguate_extension_language ||
# See if there is a Language for the extension
pathname.language ||
# Look for idioms in first line
first_line_language ||
# Try to detect Language from shebang line
shebang_language
@language = Language.detect(name.to_s, data, mode)
end
# Internal: Get the lexer of the blob.
@@ -425,247 +331,6 @@ module Linguist
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
end
# Internal: Disambiguates between multiple language extensions.
#
# Delegates to "guess_EXTENSION_language".
#
# Please add additional test coverage to
# `test/test_blob.rb#test_language` if you add another method.
#
# Returns a Language or nil.
def disambiguate_extension_language
if Language.ambiguous?(extname)
name = "guess_#{extname.sub(/^\./, '')}_language"
send(name) if respond_to?(name)
end
end
# Internal: Guess language of .cls files
#
# Returns a Language.
def guess_cls_language
if lines.grep(/^(%|\\)/).any?
Language['TeX']
elsif lines.grep(/^\s*(CLASS|METHOD|INTERFACE).*:\s*/i).any? || lines.grep(/^\s*(USING|DEFINE)/i).any?
Language['OpenEdge ABL']
elsif lines.grep(/\{$/).any? || lines.grep(/\}$/).any?
Language['Apex']
elsif lines.grep(/^(\'\*|Attribute|Option|Sub|Private|Protected|Public|Friend)/i).any?
Language['Visual Basic']
else
# The most common language should be the fallback
Language['TeX']
end
end
# Internal: Guess language of header files (.h).
#
# Returns a Language.
def guess_h_language
if lines.grep(/^@(interface|property|private|public|end)/).any?
Language['Objective-C']
elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
Language['C++']
else
Language['C']
end
end
# Internal: Guess language of .m files.
#
# Objective-C heuristics:
# * Keywords ("#import", "#include", "#ifdef", #define, "@end") or "//" and opening "\*" comments
#
# Matlab heuristics:
# * Leading "function " of "classdef " keyword
# * "%" comments
#
# M heuristics:
# * Look at first line. It is either a comment (1st regex) or label/code (2nd regex)
#
# Note: All "#" keywords, e.g., "#import", are guaranteed to be Objective-C. Because the ampersand
# is used to created function handles and anonymous functions in Matlab, most "@" keywords are not
# safe heuristics. However, "end" is a reserved term in Matlab and can't be used to create a valid
# function handle. Because @end is required to close any @implementation, @property, @interface,
# @synthesize, etc. directive in Objective-C, only @end needs to be checked for.
#
# Returns a Language.
def guess_m_language
# Objective-C keywords or comments
if lines.grep(/^#(import|include|ifdef|define)|@end/).any? || lines.grep(/^\s*\/\//).any? || lines.grep(/^\s*\/\*/).any?
Language['Objective-C']
# Matlab file function or class or comments
elsif lines.any? && lines.first.match(/^\s*(function |classdef )/) || lines.grep(/^\s*%/).any?
Language['Matlab']
# M (see M heuristics above)
elsif lines.first.to_s =~ /^[\t ]*;/ or lines.first.to_s =~ /^%?[A-Za-z0-9]+[\t ]*;*/
Language['M']
# Fallback to Objective-C, don't want any M or Matlab false positives
else
Language['Objective-C']
end
end
# Internal: Guess language of .pl files
#
# The rules for disambiguation are:
#
# 1. Many perl files begin with a shebang
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
# 3. Default to Perl, because it is more popular
#
# Returns a Language.
def guess_pl_language
if shebang_script == 'perl'
Language['Perl']
elsif lines.grep(/:-/).any?
Language['Prolog']
else
Language['Perl']
end
end
# Internal: Guess language of .r files.
#
# Returns a Language.
def guess_r_language
if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
Language['Rebol']
else
Language['R']
end
end
# Internal: Guess language of .t files.
#
# Returns a Language.
def guess_t_language
score = 0
score += 1 if lines.grep(/^% /).any?
score += data.gsub(/ := /).count
score += data.gsub(/proc |procedure |fcn |function /).count
score += data.gsub(/var \w+: \w+/).count
# Tell-tale signs its gotta be Perl
if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
score = 0
end
if score >= 3
Language['Turing']
else
Language['Perl']
end
end
# Internal: Guess language of .v files.
#
# Returns a Language
def guess_v_language
if lines.grep(/^(\/\*|\/\/|module|parameter|input|output|wire|reg|always|initial|begin|\`)/).any?
Language['Verilog']
else
Language['Coq']
end
end
# Internal: Guess language of .gsp files.
#
# Returns a Language.
def guess_gsp_language
if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
Language['Groovy Server Pages']
else
Language['Gosu']
end
end
# Internal: Guess language from the first line.
#
# Look for leading "<?php" in Drupal files
#
# Returns a Language.
def first_line_language
# Only check files with drupal php extensions
return unless drupal_extname?
# Fail fast if blob isn't viewable?
return unless viewable?
if lines.first.to_s =~ /^<\?php/
Language['PHP']
end
end
# Internal: Extract the script name from the shebang line
#
# Requires Blob#data
#
# Examples
#
# '#!/usr/bin/ruby'
# # => 'ruby'
#
# '#!/usr/bin/env ruby'
# # => 'ruby'
#
# '#!/usr/bash/python2.4'
# # => 'python'
#
# Please add additional test coverage to
# `test/test_blob.rb#test_shebang_script` if you make any changes.
#
# Returns a script name String or nil
def shebang_script
# Fail fast if blob isn't viewable?
return unless viewable?
if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
bang.sub!(/^#! /, '#!')
tokens = bang.split(' ')
pieces = tokens.first.split('/')
if pieces.size > 1
script = pieces.last
else
script = pieces.first.sub('#!', '')
end
script = script == 'env' ? tokens[1] : script
# python2.4 => python
if script =~ /((?:\d+\.?)+)/
script.sub! $1, ''
end
# Check for multiline shebang hacks that exec themselves
#
# #!/bin/sh
# exec foo "$0" "$@"
#
if script == 'sh' &&
lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
script = $1
end
script
end
end
# Internal: Get Language for shebang script
#
# Returns the Language or nil
def shebang_language
# Skip file extensions unlikely to have shebangs
return unless shebang_extname?
if script = shebang_script
Language[script]
end
end
# Public: Highlight syntax of blob
#
# options - A Hash of options (defaults to {})
@@ -691,12 +356,5 @@ module Linguist
''
end
end
Language.overridden_extensions.each do |extension|
name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
unless instance_methods.map(&:to_sym).include?(name)
warn "Language##{name} was not defined"
end
end
end
end

123
lib/linguist/classifier.rb Normal file
View File

@@ -0,0 +1,123 @@
require 'linguist/tokenizer'
module Linguist
# Language bayesian classifier.
class Classifier
# Public: Train classifier that data is a certain language.
#
# db - Hash classifier database object
# language - String language of data
# data - String contents of file
#
# Examples
#
# Classifier.train(db, 'Ruby', "def hello; end")
#
# Returns nothing.
def self.train!(db, language, data)
tokens = Tokenizer.tokenize(data)
db['tokens_total'] ||= 0
db['languages_total'] ||= 0
db['tokens'] ||= {}
db['language_tokens'] ||= {}
db['languages'] ||= {}
tokens.each do |token|
db['tokens'][language] ||= {}
db['tokens'][language][token] ||= 0
db['tokens'][language][token] += 1
db['language_tokens'][language] ||= 0
db['language_tokens'][language] += 1
db['tokens_total'] += 1
end
db['languages'][language] ||= 0
db['languages'][language] += 1
db['languages_total'] += 1
nil
end
# Public: Guess language of data.
#
# db - Hash of classifier tokens database.
# data - Array of tokens or String data to analyze.
# languages - Array of language name Strings to restrict to.
#
# Examples
#
# Classifier.classify(db, "def hello; end")
# # => [ 'Ruby', 0.90], ['Python', 0.2], ... ]
#
# Returns sorted Array of result pairs. Each pair contains the
# String language name and a Float score.
def self.classify(db, tokens, languages = nil)
languages ||= db['languages'].keys
new(db).classify(tokens, languages)
end
# Internal: Initialize a Classifier.
def initialize(db = {})
@tokens_total = db['tokens_total']
@languages_total = db['languages_total']
@tokens = db['tokens']
@language_tokens = db['language_tokens']
@languages = db['languages']
end
# Internal: Guess language of data
#
# data - Array of tokens or String data to analyze.
# languages - Array of language name Strings to restrict to.
#
# Returns sorted Array of result pairs. Each pair contains the
# String language name and a Float score.
def classify(tokens, languages)
return [] if tokens.nil?
tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
scores = {}
languages.each do |language|
scores[language] = tokens_probability(tokens, language) +
language_probability(language)
end
scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
end
# Internal: Probably of set of tokens in a language occurring - P(D | C)
#
# tokens - Array of String tokens.
# language - Language to check.
#
# Returns Float between 0.0 and 1.0.
def tokens_probability(tokens, language)
tokens.inject(0.0) do |sum, token|
sum += Math.log(token_probability(token, language))
end
end
# Internal: Probably of token in language occurring - P(F | C)
#
# token - String token.
# language - Language to check.
#
# Returns Float between 0.0 and 1.0.
def token_probability(token, language)
if @tokens[language][token].to_f == 0.0
1 / @tokens_total.to_f
else
@tokens[language][token].to_f / @language_tokens[language].to_f
end
end
# Internal: Probably of a language occurring - P(C)
#
# language - Language to check.
#
# Returns Float between 0.0 and 1.0.
def language_probability(language)
Math.log(@languages[language].to_f / @languages_total.to_f)
end
end
end

162
lib/linguist/generated.rb Normal file
View File

@@ -0,0 +1,162 @@
module Linguist
class Generated
# Public: Is the blob a generated file?
#
# name - String filename
# data - String blob data. A block also maybe passed in for lazy
# loading. This behavior is deprecated and you should always
# pass in a String.
#
# Return true or false
def self.generated?(name, data)
new(name, data).generated?
end
# Internal: Initialize Generated instance
#
# name - String filename
# data - String blob data
def initialize(name, data)
@name = name
@extname = File.extname(name)
@_data = data
end
attr_reader :name, :extname
# Lazy load blob data if block was passed in.
#
# Awful, awful stuff happening here.
#
# Returns String data.
def data
@data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
end
# Public: Get each line of data
#
# Returns an Array of lines
def lines
# TODO: data should be required to be a String, no nils
@lines ||= data ? data.split("\n", -1) : []
end
# Internal: Is the blob a generated file?
#
# Generated source code is suppressed in diffs and is ignored by
# language statistics.
#
# Please add additional test coverage to
# `test/test_blob.rb#test_generated` if you make any changes.
#
# Return true or false
def generated?
name == 'Gemfile.lock' ||
minified_javascript? ||
compiled_coffeescript? ||
xcode_project_file? ||
generated_net_docfile? ||
generated_parser?
end
# Internal: Is the blob an XCode project file?
#
# Generated if the file extension is an XCode project
# file extension.
#
# Returns true of false.
def xcode_project_file?
['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
end
# Internal: Is the blob minified JS?
#
# Consider JS minified if the average line length is
# greater then 100c.
#
# Returns true or false.
def minified_javascript?
return unless extname == '.js'
if lines.any?
(lines.inject(0) { |n, l| n += l.length } / lines.length) > 100
else
false
end
end
# Internal: Is the blob of JS generated by CoffeeScript?
#
# CoffeeScript is meant to output JS that would be difficult to
# tell if it was generated or not. Look for a number of patterns
# output by the CS compiler.
#
# Return true or false
def compiled_coffeescript?
return false unless extname == '.js'
# CoffeeScript generated by > 1.2 include a comment on the first line
if lines[0] =~ /^\/\/ Generated by /
return true
end
if lines[0] == '(function() {' && # First line is module closure opening
lines[-2] == '}).call(this);' && # Second to last line closes module closure
lines[-1] == '' # Last line is blank
score = 0
lines.each do |line|
if line =~ /var /
# Underscored temp vars are likely to be Coffee
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
# bind and extend functions are very Coffee specific
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
end
end
# Require a score of 3. This is fairly arbitrary. Consider
# tweaking later.
score >= 3
else
false
end
end
# Internal: Is this a generated documentation file for a .NET assembly?
#
# .NET developers often check in the XML Intellisense file along with an
# assembly - however, these don't have a special extension, so we have to
# dig into the contents to determine if it's a docfile. Luckily, these files
# are extremely structured, so recognizing them is easy.
#
# Returns true or false
def generated_net_docfile?
return false unless extname.downcase == ".xml"
return false unless lines.count > 3
# .NET Docfiles always open with <doc> and their first tag is an
# <assembly> tag
return lines[1].include?("<doc>") &&
lines[2].include?("<assembly>") &&
lines[-2].include?("</doc>")
end
# Internal: Is the blob of JS a parser generated by PEG.js?
#
# PEG.js-generated parsers are not meant to be consumed by humans.
#
# Return true or false
def generated_parser?
return false unless extname == '.js'
# PEG.js-generated parsers include a comment near the top of the file
# that marks them as such.
if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
return true
end
false
end
end
end

View File

@@ -2,6 +2,9 @@ require 'escape_utils'
require 'pygments'
require 'yaml'
require 'linguist/classifier'
require 'linguist/samples'
module Linguist
# Language names that are recognizable by GitHub. Defined languages
# can be highlighted, searched and listed under the Top Languages page.
@@ -9,30 +12,15 @@ module Linguist
# Languages are defined in `lib/linguist/languages.yml`.
class Language
@languages = []
@overrides = {}
@index = {}
@name_index = {}
@alias_index = {}
@extension_index = {}
@filename_index = {}
@extension_index = Hash.new { |h,k| h[k] = [] }
@filename_index = Hash.new { |h,k| h[k] = [] }
# Valid Languages types
TYPES = [:data, :markup, :programming]
# Internal: Test if extension maps to multiple Languages.
#
# Returns true or false.
def self.ambiguous?(extension)
@overrides.include?(extension)
end
# Include?: Return overridden extensions.
#
# Returns extensions Array.
def self.overridden_extensions
@overrides.keys
end
# Internal: Create a new Language object
#
# attributes - A hash of attributes
@@ -43,18 +31,18 @@ module Linguist
@languages << language
# All Language names should be unique. Warn if there is a duplicate.
# All Language names should be unique. Raise if there is a duplicate.
if @name_index.key?(language.name)
warn "Duplicate language name: #{language.name}"
raise ArgumentError, "Duplicate language name: #{language.name}"
end
# Language name index
@index[language.name] = @name_index[language.name] = language
language.aliases.each do |name|
# All Language aliases should be unique. Warn if there is a duplicate.
# All Language aliases should be unique. Raise if there is a duplicate.
if @alias_index.key?(name)
warn "Duplicate alias: #{name}"
raise ArgumentError, "Duplicate alias: #{name}"
end
@index[name] = @alias_index[name] = language
@@ -62,33 +50,50 @@ module Linguist
language.extensions.each do |extension|
if extension !~ /^\./
warn "Extension is missing a '.': #{extension.inspect}"
raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
end
unless ambiguous?(extension)
# Index the extension with a leading ".": ".rb"
@extension_index[extension] = language
# Index the extension without a leading ".": "rb"
@extension_index[extension.sub(/^\./, '')] = language
end
end
language.overrides.each do |extension|
if extension !~ /^\./
warn "Extension is missing a '.': #{extension.inspect}"
end
@overrides[extension] = language
@extension_index[extension] << language
end
language.filenames.each do |filename|
@filename_index[filename] = language
@filename_index[filename] << language
end
language
end
# Public: Detects the Language of the blob.
#
# name - String filename
# data - String blob data. A block also maybe passed in for lazy
# loading. This behavior is deprecated and you should always
# pass in a String.
# mode - Optional String mode (defaults to nil)
#
# Returns Language or nil.
def self.detect(name, data, mode = nil)
# A bit of an elegant hack. If the file is executable but extensionless,
# append a "magic" extension so it can be classified with other
# languages that have shebang scripts.
if File.extname(name).empty? && mode && (mode.to_i(8) & 05) == 05
name += ".script!"
end
possible_languages = find_by_filename(name)
if possible_languages.length > 1
data = data.call() if data.respond_to?(:call)
if data.nil? || data == ""
nil
elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
Language[result[0]]
end
else
possible_languages.first
end
end
# Public: Get all Languages
#
# Returns an Array of Languages
@@ -124,33 +129,19 @@ module Linguist
@alias_index[name]
end
# Public: Look up Language by extension.
#
# extension - The extension String. May include leading "."
#
# Examples
#
# Language.find_by_extension('.rb')
# # => #<Language name="Ruby">
#
# Returns the Language or nil if none was found.
def self.find_by_extension(extension)
@extension_index[extension]
end
# Public: Look up Language by filename.
# Public: Look up Languages by filename.
#
# filename - The path String.
#
# Examples
#
# Language.find_by_filename('foo.rb')
# # => #<Language name="Ruby">
# # => [#<Language name="Ruby">]
#
# Returns the Language or nil if none was found.
# Returns all matching Languages or [] if none were found.
def self.find_by_filename(filename)
basename, extname = File.basename(filename), File.extname(filename)
@filename_index[basename] || @extension_index[extname]
@filename_index[basename] + @extension_index[extname]
end
# Public: Look up Language by its name or lexer.
@@ -231,16 +222,18 @@ module Linguist
raise(ArgumentError, "#{@name} is missing lexer")
@ace_mode = attributes[:ace_mode]
@wrap = attributes[:wrap] || false
# Set legacy search term
@search_term = attributes[:search_term] || default_alias_name
# Set extensions or default to [].
@extensions = attributes[:extensions] || []
@overrides = attributes[:overrides] || []
@filenames = attributes[:filenames] || []
@primary_extension = attributes[:primary_extension] || default_primary_extension || extensions.first
unless @primary_extension = attributes[:primary_extension]
raise ArgumentError, "#{@name} is missing primary extension"
end
# Prepend primary extension unless its already included
if primary_extension && !extensions.include?(primary_extension)
@@ -320,6 +313,11 @@ module Linguist
# Returns a String name or nil
attr_reader :ace_mode
# Public: Should language lines be wrapped
#
# Returns true or false
attr_reader :wrap
# Public: Get extensions
#
# Examples
@@ -331,7 +329,7 @@ module Linguist
# Deprecated: Get primary extension
#
# Defaults to the first extension but can be overriden
# Defaults to the first extension but can be overridden
# in the languages.yml.
#
# The primary extension can not be nil. Tests should verify this.
@@ -343,11 +341,6 @@ module Linguist
# Returns the extension String.
attr_reader :primary_extension
# Internal: Get overridden extensions.
#
# Returns the extensions Array.
attr_reader :overrides
# Public: Get filenames
#
# Examples
@@ -377,13 +370,6 @@ module Linguist
name.downcase.gsub(/\s/, '-')
end
# Internal: Get default primary extension.
#
# Returns the extension String.
def default_primary_extension
extensions.first
end
# Public: Get Language group
#
# Returns a Language
@@ -441,11 +427,40 @@ module Linguist
def hash
name.hash
end
def inspect
"#<#{self.class} name=#{name}>"
end
end
extensions = Samples::DATA['extnames']
filenames = Samples::DATA['filenames']
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
options['extensions'] ||= []
options['filenames'] ||= []
if extnames = extensions[name]
extnames.each do |extname|
if !options['extensions'].include?(extname)
options['extensions'] << extname
else
warn "#{name} #{extname.inspect} is already defined in samples/. Remove from languages.yml."
end
end
end
if fns = filenames[name]
fns.each do |filename|
if !options['filenames'].include?(filename)
options['filenames'] << filename
else
warn "#{name} #{filename.inspect} is already defined in samples/. Remove from languages.yml."
end
end
end
Language.create(
:name => name,
:color => options['color'],
@@ -453,12 +468,12 @@ module Linguist
:aliases => options['aliases'],
:lexer => options['lexer'],
:ace_mode => options['ace_mode'],
:wrap => options['wrap'],
:group_name => options['group'],
:searchable => options.key?('searchable') ? options['searchable'] : true,
:search_term => options['search_term'],
:extensions => options['extensions'],
:extensions => options['extensions'].sort,
:primary_extension => options['primary_extension'],
:overrides => options['overrides'],
:filenames => options['filenames'],
:popular => popular.include?(name)
)

File diff suppressed because it is too large Load Diff

38
lib/linguist/md5.rb Normal file
View File

@@ -0,0 +1,38 @@
require 'digest/md5'
module Linguist
module MD5
# Public: Create deep nested digest of value object.
#
# Useful for object comparison.
#
# obj - Object to digest.
#
# Returns String hex digest
def self.hexdigest(obj)
digest = Digest::MD5.new
case obj
when String, Symbol, Integer
digest.update "#{obj.class}"
digest.update "#{obj}"
when TrueClass, FalseClass, NilClass
digest.update "#{obj.class}"
when Array
digest.update "#{obj.class}"
for e in obj
digest.update(hexdigest(e))
end
when Hash
digest.update "#{obj.class}"
for e in obj.map { |(k, v)| hexdigest([k, v]) }.sort
digest.update(e)
end
else
raise TypeError, "can't convert #{obj.inspect} into String"
end
digest.hexdigest
end
end
end

View File

@@ -1,91 +0,0 @@
require 'mime/types'
require 'yaml'
class MIME::Type
attr_accessor :override
end
# Register additional mime type extensions
#
# Follows same format as mime-types data file
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
File.read(File.expand_path("../mimes.yml", __FILE__)).lines.each do |line|
# Regexp was cargo culted from mime-types lib
next unless line =~ %r{^
#{MIME::Type::MEDIA_TYPE_RE}
(?:\s@([^\s]+))?
(?:\s:(#{MIME::Type::ENCODING_RE}))?
}x
mediatype = $1
subtype = $2
extensions = $3
encoding = $4
# Lookup existing mime type
mime_type = MIME::Types["#{mediatype}/#{subtype}"].first ||
# Or create a new instance
MIME::Type.new("#{mediatype}/#{subtype}")
if extensions
extensions.split(/,/).each do |extension|
mime_type.extensions << extension
end
end
if encoding
mime_type.encoding = encoding
end
mime_type.override = true
# Kind of hacky, but we need to reindex the mime type after making changes
MIME::Types.add_type_variant(mime_type)
MIME::Types.index_extensions(mime_type)
end
module Linguist
module Mime
# Internal: Look up mime type for extension.
#
# ext - The extension String. May include leading "."
#
# Examples
#
# Mime.mime_for('.html')
# # => 'text/html'
#
# Mime.mime_for('txt')
# # => 'text/plain'
#
# Return mime type String otherwise falls back to 'text/plain'.
def self.mime_for(ext)
mime_type = lookup_mime_type_for(ext)
mime_type ? mime_type.to_s : 'text/plain'
end
# Internal: Lookup mime type for extension or mime type
#
# ext_or_mime_type - A file extension ".txt" or mime type "text/plain".
#
# Returns a MIME::Type
def self.lookup_mime_type_for(ext_or_mime_type)
ext_or_mime_type ||= ''
if ext_or_mime_type =~ /\w+\/\w+/
guesses = ::MIME::Types[ext_or_mime_type]
else
guesses = ::MIME::Types.type_for(ext_or_mime_type)
end
# Use custom override first
guesses.detect { |type| type.override } ||
# Prefer text mime types over binary
guesses.detect { |type| type.ascii? } ||
# Otherwise use the first guess
guesses.first
end
end
end

View File

@@ -1,62 +0,0 @@
# Additional types to add to MIME::Types
#
# MIME types are used to set the Content-Type of raw binary blobs. All text
# blobs are served as text/plain regardless of their type to ensure they
# open in the browser rather than downloading.
#
# The encoding helps determine whether a file should be treated as plain
# text or binary. By default, a mime type's encoding is base64 (binary).
# These types will show a "View Raw" link. To force a type to render as
# plain text, set it to 8bit for UTF-8. text/* types will be treated as
# text by default.
#
# <type> @<extensions> :<encoding>
#
# type - mediatype/subtype
# extensions - comma seperated extension list
# encoding - base64 (binary), 7bit (ASCII), 8bit (UTF-8), or
# quoted-printable (Printable ASCII).
#
# Follows same format as mime-types data file
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
#
# Any additions or modifications (even trivial) should have corresponding
# test change in `test/test_mime.rb`.
# TODO: Lookup actual types
application/octet-stream @a,blend,gem,graffle,ipa,lib,mcz,nib,o,ogv,otf,pfx,pigx,plgx,psd,sib,spl,sqlite3,swc,ucode,xpi
# Please keep this list alphabetized
application/java-archive @ear,war
application/netcdf :8bit
application/ogg @ogg
application/postscript :base64
application/vnd.adobe.air-application-installer-package+zip @air
application/vnd.mozilla.xul+xml :8bit
application/vnd.oasis.opendocument.presentation @odp
application/vnd.oasis.opendocument.spreadsheet @ods
application/vnd.oasis.opendocument.text @odt
application/vnd.openofficeorg.extension @oxt
application/vnd.openxmlformats-officedocument.presentationml.presentation @pptx
application/x-chrome-extension @crx
application/x-iwork-keynote-sffkey @key
application/x-iwork-numbers-sffnumbers @numbers
application/x-iwork-pages-sffpages @pages
application/x-ms-xbap @xbap :8bit
application/x-parrot-bytecode @pbc
application/x-shockwave-flash @swf
application/x-silverlight-app @xap
application/x-supercollider @sc :8bit
application/x-troff-ms :8bit
application/x-wais-source :8bit
application/xaml+xml @xaml :8bit
application/xslt+xml @xslt :8bit
image/x-icns @icns
text/cache-manifest @manifest
text/plain @cu,cxx
text/x-logtalk @lgt
text/x-nemerle @n
text/x-nimrod @nim
text/x-ocaml @ml,mli,mll,mly,sig,sml
text/x-rust @rs,rc
text/x-scheme @rkt,scm,sls,sps,ss

View File

@@ -1,92 +0,0 @@
require 'linguist/language'
require 'linguist/mime'
require 'pygments'
module Linguist
# Similar to ::Pathname, Linguist::Pathname wraps a path string and
# provides helpful query methods. Its useful when you only have a
# filename but not a blob and need to figure out the language of the file.
class Pathname
# Public: Initialize a Pathname
#
# path - A filename String. The file may or maybe actually exist.
#
# Returns a Pathname.
def initialize(path)
@path = path
end
# Public: Get the basename of the path
#
# Examples
#
# Pathname.new('sub/dir/file.rb').basename
# # => 'file.rb'
#
# Returns a String.
def basename
File.basename(@path)
end
# Public: Get the extname of the path
#
# Examples
#
# Pathname.new('.rb').extname
# # => '.rb'
#
# Pathname.new('file.rb').extname
# # => '.rb'
#
# Returns a String.
def extname
File.extname(@path)
end
# Public: Get the language of the path
#
# The path extension name is the only heuristic used to detect the
# language name.
#
# Examples
#
# Pathname.new('file.rb').language
# # => Language['Ruby']
#
# Returns a Language or nil if none was found.
def language
@language ||= Language.find_by_filename(@path)
end
# Internal: Get the lexer of the path
#
# Returns a Lexer.
def lexer
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
end
# Public: Get the mime type
#
# Examples
#
# Pathname.new('index.html').mime_type
# # => 'text/html'
#
# Returns a mime type String.
def mime_type
@mime_type ||= Mime.mime_for(extname)
end
# Public: Return self as String
#
# Returns a String
def to_s
@path.dup
end
def eql?(other)
other.is_a?(self.class) && @path == other.to_s
end
alias_method :==, :eql?
end
end

View File

@@ -67,8 +67,8 @@ module Linguist
return if @computed_stats
@enum.each do |blob|
# Skip binary file extensions
next if blob.binary_mime_type?
# Skip files that are likely binary
next if blob.likely_binary?
# Skip vendored or generated blobs
next if blob.vendored? || blob.generated? || blob.language.nil?
@@ -80,7 +80,7 @@ module Linguist
end
# Compute total size
@size = @sizes.inject(0) { |s,(k,v)| s + v }
@size = @sizes.inject(0) { |s,(_,v)| s + v }
# Get primary language
if primary = @sizes.max_by { |(_, size)| size }

37758
lib/linguist/samples.json Normal file

File diff suppressed because it is too large Load Diff

98
lib/linguist/samples.rb Normal file
View File

@@ -0,0 +1,98 @@
require 'yaml'
require 'linguist/md5'
require 'linguist/classifier'
module Linguist
# Model for accessing classifier training data.
module Samples
# Path to samples root directory
ROOT = File.expand_path("../../../samples", __FILE__)
# Path for serialized samples db
PATH = File.expand_path('../samples.json', __FILE__)
# Hash of serialized samples object
if File.exist?(PATH)
DATA = YAML.load_file(PATH)
end
# Public: Iterate over each sample.
#
# &block - Yields Sample to block
#
# Returns nothing.
def self.each(&block)
Dir.entries(ROOT).each do |category|
next if category == '.' || category == '..'
# Skip text and binary for now
# Possibly reconsider this later
next if category == 'Text' || category == 'Binary'
dirname = File.join(ROOT, category)
Dir.entries(dirname).each do |filename|
next if filename == '.' || filename == '..'
if filename == 'filenames'
Dir.entries(File.join(dirname, filename)).each do |subfilename|
next if subfilename == '.' || subfilename == '..'
yield({
:path => File.join(dirname, filename, subfilename),
:language => category,
:filename => subfilename
})
end
else
if File.extname(filename) == ""
raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
end
yield({
:path => File.join(dirname, filename),
:language => category,
:extname => File.extname(filename)
})
end
end
end
nil
end
# Public: Build Classifier from all samples.
#
# Returns trained Classifier.
def self.data
db = {}
db['extnames'] = {}
db['filenames'] = {}
each do |sample|
language_name = sample[:language]
if sample[:extname]
db['extnames'][language_name] ||= []
if !db['extnames'][language_name].include?(sample[:extname])
db['extnames'][language_name] << sample[:extname]
db['extnames'][language_name].sort!
end
end
if sample[:filename]
db['filenames'][language_name] ||= []
db['filenames'][language_name] << sample[:filename]
db['filenames'][language_name].sort!
end
data = File.read(sample[:path])
Classifier.train!(db, language_name, data)
end
db['md5'] = Linguist::MD5.hexdigest(db)
db
end
end
end

197
lib/linguist/tokenizer.rb Normal file
View File

@@ -0,0 +1,197 @@
require 'strscan'
module Linguist
# Generic programming language tokenizer.
#
# Tokens are designed for use in the language bayes classifier.
# It strips any data strings or comments and preserves significant
# language symbols.
class Tokenizer
# Public: Extract tokens from data
#
# data - String to tokenize
#
# Returns Array of token Strings.
def self.tokenize(data)
new.extract_tokens(data)
end
# Read up to 100KB
BYTE_LIMIT = 100_000
# Start state on token, ignore anything till the next newline
SINGLE_LINE_COMMENTS = [
'//', # C
'#', # Ruby
'%', # Tex
]
# Start state on opening token, ignore anything until the closing
# token is reached.
MULTI_LINE_COMMENTS = [
['/*', '*/'], # C
['<!--', '-->'], # XML
['{-', '-}'], # Haskell
['(*', '*)'] # Coq
]
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
"\s*#{Regexp.escape(c)} "
}.join("|"))
START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
Regexp.escape(c[0])
}.join("|"))
# Internal: Extract generic tokens from data.
#
# data - String to scan.
#
# Examples
#
# extract_tokens("printf('Hello')")
# # => ['printf', '(', ')']
#
# Returns Array of token Strings.
def extract_tokens(data)
s = StringScanner.new(data)
tokens = []
until s.eos?
break if s.pos >= BYTE_LIMIT
if token = s.scan(/^#!.+$/)
if name = extract_shebang(token)
tokens << "SHEBANG#!#{name}"
end
# Single line comment
elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
# tokens << token.strip
s.skip_until(/\n|\Z/)
# Multiline comments
elsif token = s.scan(START_MULTI_LINE_COMMENT)
# tokens << token
close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
s.skip_until(Regexp.compile(Regexp.escape(close_token)))
# tokens << close_token
# Skip single or double quoted strings
elsif s.scan(/"/)
if s.peek(1) == "\""
s.getch
else
s.skip_until(/[^\\]"/)
end
elsif s.scan(/'/)
if s.peek(1) == "'"
s.getch
else
s.skip_until(/[^\\]'/)
end
# Skip number literals
elsif s.scan(/(0x)?\d(\d|\.)*/)
# SGML style brackets
elsif token = s.scan(/<[^\s<>][^<>]*>/)
extract_sgml_tokens(token).each { |t| tokens << t }
# Common programming punctuation
elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
tokens << token
# Regular token
elsif token = s.scan(/[\w\.@#\/\*]+/)
tokens << token
# Common operators
elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
tokens << token
else
s.getch
end
end
tokens
end
# Internal: Extract normalized shebang command token.
#
# Examples
#
# extract_shebang("#!/usr/bin/ruby")
# # => "ruby"
#
# extract_shebang("#!/usr/bin/env node")
# # => "node"
#
# Returns String token or nil it couldn't be parsed.
def extract_shebang(data)
s = StringScanner.new(data)
if path = s.scan(/^#!\s*\S+/)
script = path.split('/').last
if script == 'env'
s.scan(/\s+/)
script = s.scan(/\S+/)
end
script = script[/[^\d]+/, 0] if script
return script
end
nil
end
# Internal: Extract tokens from inside SGML tag.
#
# data - SGML tag String.
#
# Examples
#
# extract_sgml_tokens("<a href='' class=foo>")
# # => ["<a>", "href="]
#
# Returns Array of token Strings.
def extract_sgml_tokens(data)
s = StringScanner.new(data)
tokens = []
until s.eos?
# Emit start token
if token = s.scan(/<\/?[^\s>]+/)
tokens << "#{token}>"
# Emit attributes with trailing =
elsif token = s.scan(/\w+=/)
tokens << token
# Then skip over attribute value
if s.scan(/"/)
s.skip_until(/[^\\]"/)
elsif s.scan(/'/)
s.skip_until(/[^\\]'/)
else
s.skip_until(/\w+/)
end
# Emit lone attributes
elsif token = s.scan(/\w+/)
tokens << token
# Stop at the end of the tag
elsif s.scan(/>/)
s.terminate
else
s.getch
end
end
tokens
end
end
end

View File

@@ -16,13 +16,19 @@
# https://github.com/joyent/node
- ^deps/
- ^tools/
- (^|/)configure$
- (^|/)configure.ac$
- (^|/)config.guess$
- (^|/)config.sub$
# Node depedencies
# Node dependencies
- node_modules/
# Vendored depedencies
# Vendored dependencies
- vendor/
# Debian packaging
- ^debian/
## Commonly Bundled JavaScript frameworks ##
@@ -61,8 +67,16 @@
# MathJax
- (^|/)MathJax/
# SyntaxHighlighter - http://alexgorbatchev.com/
- (^|/)shBrush([^.]*)\.js$
- (^|/)shCore\.js$
- (^|/)shLegacy\.js$
## Python ##
# django
- (^|/)admin_media/
# Fabric
- ^fabfile\.py$
@@ -94,3 +108,6 @@
# Samples folders
- ^[Ss]amples/
# Test fixtures
- ^[Tt]est/fixtures/