mirror of
https://github.com/KevinMidboe/linguist.git
synced 2026-01-03 16:05:33 +00:00
Merged with upstream. Updated M (aka MUMPS) detection to use the new bayesian / samples method.
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
require 'linguist/blob_helper'
|
||||
require 'linguist/generated'
|
||||
require 'linguist/language'
|
||||
require 'linguist/mime'
|
||||
require 'linguist/pathname'
|
||||
require 'linguist/repository'
|
||||
require 'linguist/samples'
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
require 'linguist/generated'
|
||||
require 'linguist/language'
|
||||
require 'linguist/mime'
|
||||
require 'linguist/pathname'
|
||||
|
||||
require 'charlock_holmes'
|
||||
require 'escape_utils'
|
||||
require 'mime/types'
|
||||
require 'pygments'
|
||||
require 'yaml'
|
||||
|
||||
@@ -11,13 +11,6 @@ module Linguist
|
||||
# BlobHelper is a mixin for Blobish classes that respond to "name",
|
||||
# "data" and "size" such as Grit::Blob.
|
||||
module BlobHelper
|
||||
# Internal: Get a Pathname wrapper for Blob#name
|
||||
#
|
||||
# Returns a Pathname.
|
||||
def pathname
|
||||
Pathname.new(name || "")
|
||||
end
|
||||
|
||||
# Public: Get the extname of the path
|
||||
#
|
||||
# Examples
|
||||
@@ -27,7 +20,23 @@ module Linguist
|
||||
#
|
||||
# Returns a String
|
||||
def extname
|
||||
pathname.extname
|
||||
File.extname(name.to_s)
|
||||
end
|
||||
|
||||
# Internal: Lookup mime type for extension.
|
||||
#
|
||||
# Returns a MIME::Type
|
||||
def _mime_type
  # `defined?` (rather than `||=`) lets a nil lookup result be memoized too.
  return @_mime_type if defined? @_mime_type

  candidates = ::MIME::Types.type_for(extname.to_s)

  # A text (ascii) type wins over a binary one; failing that, take
  # whatever the registry listed first (nil when nothing matched).
  text_candidate = candidates.detect { |candidate| candidate.ascii? }
  @_mime_type = text_candidate || candidates.first
end
|
||||
|
||||
# Public: Get the actual blob mime type
|
||||
@@ -39,7 +48,23 @@ module Linguist
|
||||
#
|
||||
# Returns a mime type String.
|
||||
def mime_type
|
||||
@mime_type ||= pathname.mime_type
|
||||
_mime_type ? _mime_type.to_s : 'text/plain'
|
||||
end
|
||||
|
||||
# Internal: Is the blob binary according to its mime type
|
||||
#
|
||||
# Return true or false
|
||||
def binary_mime_type?
  type = _mime_type

  # No known mime type for the extension: treat as not binary.
  return false unless type

  type.binary?
end
|
||||
|
||||
# Internal: Is the blob binary according to its mime type,
|
||||
# overriding it if we have better data from the languages.yml
|
||||
# database.
|
||||
#
|
||||
# Return true or false
|
||||
def likely_binary?
  # Language.find_by_filename is consumed as a collection by
  # Language.detect (`.length` / `.first`). An empty Array is truthy in
  # Ruby, so negating it directly could never yield true. Test for "no
  # matching language" explicitly; Array() also tolerates a nil or a
  # single Language being returned.
  binary_mime_type? && Array(Language.find_by_filename(name)).empty?
end
|
||||
|
||||
# Public: Get the Content-Type header value
|
||||
@@ -71,7 +96,7 @@ module Linguist
|
||||
elsif name.nil?
|
||||
"attachment"
|
||||
else
|
||||
"attachment; filename=#{EscapeUtils.escape_url(pathname.basename)}"
|
||||
"attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
|
||||
end
|
||||
end
|
||||
|
||||
@@ -90,15 +115,6 @@ module Linguist
|
||||
@detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
|
||||
end
|
||||
|
||||
# Public: Is the blob binary according to its mime type
|
||||
#
|
||||
# Return true or false
|
||||
def binary_mime_type?
|
||||
if mime_type = Mime.lookup_mime_type_for(pathname.extname)
|
||||
mime_type.binary?
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Is the blob binary?
|
||||
#
|
||||
# Return true or false
|
||||
@@ -132,23 +148,14 @@ module Linguist
|
||||
#
|
||||
# Return true or false
|
||||
def image?
|
||||
['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
|
||||
['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
|
||||
end
|
||||
|
||||
# Public: Is the blob a possible drupal php file?
|
||||
# Public: Is the blob a supported 3D model format?
|
||||
#
|
||||
# Return true or false
|
||||
def drupal_extname?
|
||||
['.module', '.install', '.test', '.inc'].include?(extname)
|
||||
end
|
||||
|
||||
# Public: Is the blob likely to have a shebang?
|
||||
#
|
||||
# Return true or false
|
||||
def shebang_extname?
|
||||
extname.empty? &&
|
||||
mode &&
|
||||
(mode.to_i(8) & 05) == 05
|
||||
def solid?
|
||||
extname.downcase == '.stl'
|
||||
end
|
||||
|
||||
MEGABYTE = 1024 * 1024
|
||||
@@ -169,7 +176,7 @@ module Linguist
|
||||
#
|
||||
# Return true or false
|
||||
def safe_to_colorize?
|
||||
text? && !large? && !high_ratio_of_long_lines?
|
||||
!large? && text? && !high_ratio_of_long_lines?
|
||||
end
|
||||
|
||||
# Internal: Does the blob have a ratio of long lines?
|
||||
@@ -213,7 +220,31 @@ module Linguist
|
||||
#
|
||||
# Returns an Array of lines
|
||||
def lines
|
||||
@lines ||= (viewable? && data) ? data.split("\n", -1) : []
|
||||
@lines ||=
|
||||
if viewable? && data
|
||||
data.split(line_split_character, -1)
|
||||
else
|
||||
[]
|
||||
end
|
||||
end
|
||||
|
||||
# Character used to split lines. This is almost always "\n" except when Mac
|
||||
# Format is detected in which case it's "\r".
|
||||
#
|
||||
# Returns a split pattern string.
|
||||
def line_split_character
  # Both possible values are truthy, so a plain truthiness check is a
  # sufficient memoization guard.
  return @line_split_character if @line_split_character

  @line_split_character = mac_format? ? "\r" : "\n"
end
|
||||
|
||||
# Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
|
||||
# for line ends and does not include a \n (0x0a).
|
||||
#
|
||||
# Returns true when mac format is detected.
|
||||
def mac_format?
  return unless viewable?

  # Only the first 4096 characters are searched for a carriage return.
  cr_index = data[0, 4096].index("\r")
  return if cr_index.nil?

  # A "\r" that is not immediately followed by "\n" marks Mac line ends.
  data[cr_index + 1] != ?\n
end
|
||||
|
||||
# Public: Get number of lines of code
|
||||
@@ -234,125 +265,16 @@ module Linguist
|
||||
lines.grep(/\S/).size
|
||||
end
|
||||
|
||||
# Internal: Compute average line length.
|
||||
#
|
||||
# Returns Integer.
|
||||
def average_line_length
|
||||
if lines.any?
|
||||
lines.inject(0) { |n, l| n += l.length } / lines.length
|
||||
else
|
||||
0
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Is the blob a generated file?
|
||||
#
|
||||
# Generated source code is supressed in diffs and is ignored by
|
||||
# Generated source code is suppressed in diffs and is ignored by
|
||||
# language statistics.
|
||||
#
|
||||
# Requires Blob#data
|
||||
#
|
||||
# Includes:
|
||||
# - XCode project XML files
|
||||
# - Minified JavaScript
|
||||
#
|
||||
# Please add additional test coverage to
|
||||
# `test/test_blob.rb#test_generated` if you make any changes.
|
||||
# May load Blob#data
|
||||
#
|
||||
# Return true or false
|
||||
def generated?
|
||||
if xcode_project_file? || generated_net_docfile?
|
||||
true
|
||||
elsif generated_coffeescript? || minified_javascript?
|
||||
true
|
||||
elsif name == 'Gemfile.lock'
|
||||
true
|
||||
else
|
||||
false
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Is the blob an XCode project file?
|
||||
#
|
||||
# Generated if the file extension is an XCode project
|
||||
# file extension.
|
||||
#
|
||||
# Returns true or false.
|
||||
def xcode_project_file?
|
||||
['.xib', '.nib', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
|
||||
end
|
||||
|
||||
# Internal: Is the blob minified JS?
|
||||
#
|
||||
# Consider JS minified if the average line length is
|
||||
# greater then 100c.
|
||||
#
|
||||
# Returns true or false.
|
||||
def minified_javascript?
|
||||
return unless extname == '.js'
|
||||
average_line_length > 100
|
||||
end
|
||||
|
||||
# Internal: Is the blob JS generated by CoffeeScript?
|
||||
#
|
||||
# Requires Blob#data
|
||||
#
|
||||
# CoffeScript is meant to output JS that would be difficult to
|
||||
# tell if it was generated or not. Look for a number of patterns
|
||||
# outputed by the CS compiler.
|
||||
#
|
||||
# Return true or false
|
||||
def generated_coffeescript?
|
||||
return unless extname == '.js'
|
||||
|
||||
# CoffeeScript generated by > 1.2 include a comment on the first line
|
||||
if lines[0] =~ /^\/\/ Generated by /
|
||||
return true
|
||||
end
|
||||
|
||||
if lines[0] == '(function() {' && # First line is module closure opening
|
||||
lines[-2] == '}).call(this);' && # Second to last line closes module closure
|
||||
lines[-1] == '' # Last line is blank
|
||||
|
||||
score = 0
|
||||
|
||||
lines.each do |line|
|
||||
if line =~ /var /
|
||||
# Underscored temp vars are likely to be Coffee
|
||||
score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
|
||||
|
||||
# bind and extend functions are very Coffee specific
|
||||
score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
|
||||
end
|
||||
end
|
||||
|
||||
# Require a score of 3. This is fairly arbitrary. Consider
|
||||
# tweaking later.
|
||||
score >= 3
|
||||
else
|
||||
false
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Is this a generated documentation file for a .NET assembly?
|
||||
#
|
||||
# Requires Blob#data
|
||||
#
|
||||
# .NET developers often check in the XML Intellisense file along with an
|
||||
# assembly - however, these don't have a special extension, so we have to
|
||||
# dig into the contents to determine if it's a docfile. Luckily, these files
|
||||
# are extremely structured, so recognizing them is easy.
|
||||
#
|
||||
# Returns true or false
|
||||
def generated_net_docfile?
|
||||
return false unless extname.downcase == ".xml"
|
||||
return false unless lines.count > 3
|
||||
|
||||
# .NET Docfiles always open with <doc> and their first tag is an
|
||||
# <assembly> tag
|
||||
return lines[1].include?("<doc>") &&
|
||||
lines[2].include?("<assembly>") &&
|
||||
lines[-2].include?("</doc>")
|
||||
@_generated ||= Generated.generated?(name, lambda { data })
|
||||
end
|
||||
|
||||
# Public: Should the blob be indexed for searching?
|
||||
@@ -360,7 +282,7 @@ module Linguist
|
||||
# Excluded:
|
||||
# - Files over 0.1MB
|
||||
# - Non-text files
|
||||
# - Langauges marked as not searchable
|
||||
# - Languages marked as not searchable
|
||||
# - Generated source files
|
||||
#
|
||||
# Please add additional test coverage to
|
||||
@@ -368,16 +290,18 @@ module Linguist
|
||||
#
|
||||
# Return true or false
|
||||
def indexable?
|
||||
if binary?
|
||||
if size > 100 * 1024
|
||||
false
|
||||
elsif binary?
|
||||
false
|
||||
elsif extname == '.txt'
|
||||
true
|
||||
elsif language.nil?
|
||||
false
|
||||
elsif !language.searchable?
|
||||
false
|
||||
elsif generated?
|
||||
false
|
||||
elsif size > 100 * 1024
|
||||
false
|
||||
else
|
||||
true
|
||||
end
|
||||
@@ -389,33 +313,15 @@ module Linguist
|
||||
#
|
||||
# Returns a Language or nil if none is detected
|
||||
def language
|
||||
if defined? @language
|
||||
@language
|
||||
return @language if defined? @language
|
||||
|
||||
if defined?(@data) && @data.is_a?(String)
|
||||
data = @data
|
||||
else
|
||||
@language = guess_language
|
||||
data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language
|
||||
#
|
||||
# Please add additional test coverage to
|
||||
# `test/test_blob.rb#test_language` if you make any changes.
|
||||
#
|
||||
# Returns a Language or nil
|
||||
def guess_language
|
||||
return if binary_mime_type?
|
||||
|
||||
# Disambiguate between multiple language extensions
|
||||
disambiguate_extension_language ||
|
||||
|
||||
# See if there is a Language for the extension
|
||||
pathname.language ||
|
||||
|
||||
# Look for idioms in first line
|
||||
first_line_language ||
|
||||
|
||||
# Try to detect Language from shebang line
|
||||
shebang_language
|
||||
@language = Language.detect(name.to_s, data, mode)
|
||||
end
|
||||
|
||||
# Internal: Get the lexer of the blob.
|
||||
@@ -425,247 +331,6 @@ module Linguist
|
||||
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
|
||||
end
|
||||
|
||||
# Internal: Disambiguates between multiple language extensions.
|
||||
#
|
||||
# Delegates to "guess_EXTENSION_language".
|
||||
#
|
||||
# Please add additional test coverage to
|
||||
# `test/test_blob.rb#test_language` if you add another method.
|
||||
#
|
||||
# Returns a Language or nil.
|
||||
def disambiguate_extension_language
|
||||
if Language.ambiguous?(extname)
|
||||
name = "guess_#{extname.sub(/^\./, '')}_language"
|
||||
send(name) if respond_to?(name)
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of .cls files
|
||||
#
|
||||
# Returns a Language.
|
||||
def guess_cls_language
|
||||
if lines.grep(/^(%|\\)/).any?
|
||||
Language['TeX']
|
||||
elsif lines.grep(/^\s*(CLASS|METHOD|INTERFACE).*:\s*/i).any? || lines.grep(/^\s*(USING|DEFINE)/i).any?
|
||||
Language['OpenEdge ABL']
|
||||
elsif lines.grep(/\{$/).any? || lines.grep(/\}$/).any?
|
||||
Language['Apex']
|
||||
elsif lines.grep(/^(\'\*|Attribute|Option|Sub|Private|Protected|Public|Friend)/i).any?
|
||||
Language['Visual Basic']
|
||||
else
|
||||
# The most common language should be the fallback
|
||||
Language['TeX']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of header files (.h).
|
||||
#
|
||||
# Returns a Language.
|
||||
def guess_h_language
|
||||
if lines.grep(/^@(interface|property|private|public|end)/).any?
|
||||
Language['Objective-C']
|
||||
elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
|
||||
Language['C++']
|
||||
else
|
||||
Language['C']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of .m files.
|
||||
#
|
||||
# Objective-C heuristics:
|
||||
# * Keywords ("#import", "#include", "#ifdef", #define, "@end") or "//" and opening "\*" comments
|
||||
#
|
||||
# Matlab heuristics:
|
||||
# * Leading "function " or "classdef " keyword
|
||||
# * "%" comments
|
||||
#
|
||||
# M heuristics:
|
||||
# * Look at first line. It is either a comment (1st regex) or label/code (2nd regex)
|
||||
#
|
||||
# Note: All "#" keywords, e.g., "#import", are guaranteed to be Objective-C. Because the at sign ("@")
|
||||
# is used to create function handles and anonymous functions in Matlab, most "@" keywords are not
|
||||
# safe heuristics. However, "end" is a reserved term in Matlab and can't be used to create a valid
|
||||
# function handle. Because @end is required to close any @implementation, @property, @interface,
|
||||
# @synthesize, etc. directive in Objective-C, only @end needs to be checked for.
|
||||
#
|
||||
# Returns a Language.
|
||||
def guess_m_language
|
||||
# Objective-C keywords or comments
|
||||
if lines.grep(/^#(import|include|ifdef|define)|@end/).any? || lines.grep(/^\s*\/\//).any? || lines.grep(/^\s*\/\*/).any?
|
||||
Language['Objective-C']
|
||||
|
||||
# Matlab file function or class or comments
|
||||
elsif lines.any? && lines.first.match(/^\s*(function |classdef )/) || lines.grep(/^\s*%/).any?
|
||||
Language['Matlab']
|
||||
|
||||
# M (see M heuristics above)
|
||||
elsif lines.first.to_s =~ /^[\t ]*;/ or lines.first.to_s =~ /^%?[A-Za-z0-9]+[\t ]*;*/
|
||||
Language['M']
|
||||
|
||||
# Fallback to Objective-C, don't want any M or Matlab false positives
|
||||
else
|
||||
Language['Objective-C']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of .pl files
|
||||
#
|
||||
# The rules for disambiguation are:
|
||||
#
|
||||
# 1. Many perl files begin with a shebang
|
||||
# 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
|
||||
# 3. Default to Perl, because it is more popular
|
||||
#
|
||||
# Returns a Language.
|
||||
def guess_pl_language
|
||||
if shebang_script == 'perl'
|
||||
Language['Perl']
|
||||
elsif lines.grep(/:-/).any?
|
||||
Language['Prolog']
|
||||
else
|
||||
Language['Perl']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of .r files.
|
||||
#
|
||||
# Returns a Language.
|
||||
def guess_r_language
|
||||
if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
|
||||
Language['Rebol']
|
||||
else
|
||||
Language['R']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of .t files.
|
||||
#
|
||||
# Returns a Language.
|
||||
def guess_t_language
|
||||
score = 0
|
||||
score += 1 if lines.grep(/^% /).any?
|
||||
score += data.gsub(/ := /).count
|
||||
score += data.gsub(/proc |procedure |fcn |function /).count
|
||||
score += data.gsub(/var \w+: \w+/).count
|
||||
|
||||
# Tell-tale signs its gotta be Perl
|
||||
if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
|
||||
score = 0
|
||||
end
|
||||
|
||||
if score >= 3
|
||||
Language['Turing']
|
||||
else
|
||||
Language['Perl']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of .v files.
|
||||
#
|
||||
# Returns a Language
|
||||
def guess_v_language
|
||||
if lines.grep(/^(\/\*|\/\/|module|parameter|input|output|wire|reg|always|initial|begin|\`)/).any?
|
||||
Language['Verilog']
|
||||
else
|
||||
Language['Coq']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language of .gsp files.
|
||||
#
|
||||
# Returns a Language.
|
||||
def guess_gsp_language
|
||||
if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
|
||||
Language['Groovy Server Pages']
|
||||
else
|
||||
Language['Gosu']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Guess language from the first line.
|
||||
#
|
||||
# Look for leading "<?php" in Drupal files
|
||||
#
|
||||
# Returns a Language.
|
||||
def first_line_language
|
||||
# Only check files with drupal php extensions
|
||||
return unless drupal_extname?
|
||||
|
||||
# Fail fast if blob isn't viewable?
|
||||
return unless viewable?
|
||||
|
||||
if lines.first.to_s =~ /^<\?php/
|
||||
Language['PHP']
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Extract the script name from the shebang line
|
||||
#
|
||||
# Requires Blob#data
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# '#!/usr/bin/ruby'
|
||||
# # => 'ruby'
|
||||
#
|
||||
# '#!/usr/bin/env ruby'
|
||||
# # => 'ruby'
|
||||
#
|
||||
# '#!/usr/bash/python2.4'
|
||||
# # => 'python'
|
||||
#
|
||||
# Please add additional test coverage to
|
||||
# `test/test_blob.rb#test_shebang_script` if you make any changes.
|
||||
#
|
||||
# Returns a script name String or nil
|
||||
def shebang_script
|
||||
# Fail fast if blob isn't viewable?
|
||||
return unless viewable?
|
||||
|
||||
if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
|
||||
bang.sub!(/^#! /, '#!')
|
||||
tokens = bang.split(' ')
|
||||
pieces = tokens.first.split('/')
|
||||
if pieces.size > 1
|
||||
script = pieces.last
|
||||
else
|
||||
script = pieces.first.sub('#!', '')
|
||||
end
|
||||
|
||||
script = script == 'env' ? tokens[1] : script
|
||||
|
||||
# python2.4 => python
|
||||
if script =~ /((?:\d+\.?)+)/
|
||||
script.sub! $1, ''
|
||||
end
|
||||
|
||||
# Check for multiline shebang hacks that exec themselves
|
||||
#
|
||||
# #!/bin/sh
|
||||
# exec foo "$0" "$@"
|
||||
#
|
||||
if script == 'sh' &&
|
||||
lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
|
||||
script = $1
|
||||
end
|
||||
|
||||
script
|
||||
end
|
||||
end
|
||||
|
||||
# Internal: Get Language for shebang script
|
||||
#
|
||||
# Returns the Language or nil
|
||||
def shebang_language
|
||||
# Skip file extensions unlikely to have shebangs
|
||||
return unless shebang_extname?
|
||||
|
||||
if script = shebang_script
|
||||
Language[script]
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Highlight syntax of blob
|
||||
#
|
||||
# options - A Hash of options (defaults to {})
|
||||
@@ -691,12 +356,5 @@ module Linguist
|
||||
''
|
||||
end
|
||||
end
|
||||
|
||||
Language.overridden_extensions.each do |extension|
|
||||
name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
|
||||
unless instance_methods.map(&:to_sym).include?(name)
|
||||
warn "Language##{name} was not defined"
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
123
lib/linguist/classifier.rb
Normal file
123
lib/linguist/classifier.rb
Normal file
@@ -0,0 +1,123 @@
|
||||
require 'linguist/tokenizer'
|
||||
|
||||
module Linguist
  # Language bayesian classifier.
  #
  # Probabilities are combined as sums of natural logarithms, so the
  # scores returned by `classify` are log values rather than values in
  # the 0.0..1.0 range. A higher score is a better match.
  class Classifier
    # Public: Train classifier that data is a certain language.
    #
    # db       - Hash classifier database object. Mutated in place.
    # language - String language of data
    # data     - String contents of file
    #
    # Examples
    #
    #   Classifier.train!(db, 'Ruby', "def hello; end")
    #
    # Returns nothing.
    def self.train!(db, language, data)
      tokens = Tokenizer.tokenize(data)

      db['tokens_total'] ||= 0
      db['languages_total'] ||= 0
      db['tokens'] ||= {}
      db['language_tokens'] ||= {}
      db['languages'] ||= {}

      # Register the per-language buckets up front so a sample that yields
      # no tokens still leaves the language with a usable (empty) entry.
      db['tokens'][language] ||= {}
      db['language_tokens'][language] ||= 0

      tokens.each do |token|
        db['tokens'][language][token] ||= 0
        db['tokens'][language][token] += 1
        db['language_tokens'][language] += 1
        db['tokens_total'] += 1
      end

      db['languages'][language] ||= 0
      db['languages'][language] += 1
      db['languages_total'] += 1

      nil
    end

    # Public: Guess language of data.
    #
    # db        - Hash of classifier tokens database.
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to
    #             (defaults to every language in db).
    #
    # Examples
    #
    #   Classifier.classify(db, "def hello; end")
    #   # => [['Ruby', -4.2], ['Python', -5.1], ...]
    #
    # Returns sorted Array of result pairs, best match first. Each pair
    # contains the String language name and a Float log score.
    def self.classify(db, tokens, languages = nil)
      languages ||= db['languages'].keys
      new(db).classify(tokens, languages)
    end

    # Internal: Initialize a Classifier.
    #
    # db - Hash classifier database as built by `train!`.
    def initialize(db = {})
      @tokens_total    = db['tokens_total']
      @languages_total = db['languages_total']
      @tokens          = db['tokens']
      @language_tokens = db['language_tokens']
      @languages       = db['languages']
    end

    # Internal: Guess language of data
    #
    # tokens    - Array of tokens or String data to analyze.
    # languages - Array of language name Strings to restrict to.
    #
    # Returns sorted Array of result pairs, best match first. Each pair
    # contains the String language name and a Float log score. Returns an
    # empty Array when tokens is nil.
    def classify(tokens, languages)
      return [] if tokens.nil?
      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)

      scores = {}
      languages.each do |language|
        scores[language] = tokens_probability(tokens, language) +
                           language_probability(language)
      end

      # Hash#sort already yields [name, score] pairs; order descending.
      scores.sort { |a, b| b[1] <=> a[1] }
    end

    # Internal: Log probability of set of tokens in a language
    # occurring - log P(D | C)
    #
    # tokens   - Array of String tokens.
    # language - Language to check.
    #
    # Returns the Float sum of the natural logs of each token's probability.
    def tokens_probability(tokens, language)
      tokens.inject(0.0) do |sum, token|
        sum + Math.log(token_probability(token, language))
      end
    end

    # Internal: Probability of token in language occurring - P(F | C)
    #
    # token    - String token.
    # language - Language to check.
    #
    # A token never seen for the language (including a language that has
    # no recorded tokens at all) is given the floor value 1 / tokens_total.
    #
    # Returns Float.
    def token_probability(token, language)
      # The language may have no token table (trained only on empty
      # samples); treat that the same as an unseen token.
      seen = (@tokens[language] || {})[token].to_f

      if seen == 0.0
        1 / @tokens_total.to_f
      else
        seen / @language_tokens[language].to_f
      end
    end

    # Internal: Log probability of a language occurring - log P(C)
    #
    # language - Language to check.
    #
    # Returns the Float natural log of the language's share of all
    # trained samples.
    def language_probability(language)
      Math.log(@languages[language].to_f / @languages_total.to_f)
    end
  end
end
|
||||
162
lib/linguist/generated.rb
Normal file
162
lib/linguist/generated.rb
Normal file
@@ -0,0 +1,162 @@
|
||||
module Linguist
  # Decides whether a file looks machine-generated, based on its name and
  # (when needed) its contents.
  class Generated
    # Public: Is the blob a generated file?
    #
    # name - String filename
    # data - String blob data. A block may also be passed in for lazy
    #        loading. This behavior is deprecated and you should always
    #        pass in a String.
    #
    # Return true or false
    def self.generated?(name, data)
      new(name, data).generated?
    end

    # Internal: Initialize Generated instance
    #
    # name - String filename (nil is tolerated and has no extension)
    # data - String blob data, or a callable returning it
    def initialize(name, data)
      @name = name
      # name.to_s keeps a nil name from raising in File.extname.
      @extname = File.extname(name.to_s)
      @_data = data
    end

    attr_reader :name, :extname

    # Lazy load blob data if block was passed in.
    #
    # Awful, awful stuff happening here.
    #
    # Returns String data.
    def data
      @data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
    end

    # Public: Get each line of data
    #
    # Returns an Array of lines (empty when there is no data)
    def lines
      # TODO: data should be required to be a String, no nils
      @lines ||= data ? data.split("\n", -1) : []
    end

    # Internal: Is the blob a generated file?
    #
    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
    # Please add additional test coverage to
    # `test/test_blob.rb#test_generated` if you make any changes.
    #
    # Return true or false
    def generated?
      name == 'Gemfile.lock' ||
        minified_javascript? ||
        compiled_coffeescript? ||
        xcode_project_file? ||
        generated_net_docfile? ||
        generated_parser?
    end

    # Internal: Is the blob an XCode project file?
    #
    # Generated if the file extension is an XCode project
    # file extension.
    #
    # Returns true or false.
    def xcode_project_file?
      ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
    end

    # Internal: Is the blob minified JS?
    #
    # Consider JS minified if the average line length is
    # greater than 100 characters.
    #
    # Returns true or false.
    def minified_javascript?
      return false unless extname == '.js'
      return false if lines.empty?

      total_length = lines.inject(0) { |sum, line| sum + line.length }

      # Integer division: the truncated average must exceed 100.
      (total_length / lines.length) > 100
    end

    # Internal: Is the blob of JS generated by CoffeeScript?
    #
    # CoffeeScript is meant to output JS that would be difficult to
    # tell if it was generated or not. Look for a number of patterns
    # output by the CS compiler.
    #
    # Return true or false
    def compiled_coffeescript?
      return false unless extname == '.js'

      # CoffeeScript generated by > 1.2 include a comment on the first line
      return true if lines[0] =~ /^\/\/ Generated by /

      wrapped_in_closure =
        lines[0] == '(function() {' &&   # First line is module closure opening
        lines[-2] == '}).call(this);' && # Second to last line closes module closure
        lines[-1] == ''                  # Last line is blank
      return false unless wrapped_in_closure

      score = 0

      lines.each do |line|
        next unless line =~ /var /

        # Underscored temp vars are likely to be Coffee
        score += 1 * line.scan(/(_fn|_i|_len|_ref|_results)/).size

        # bind and extend functions are very Coffee specific
        score += 3 * line.scan(/(__bind|__extends|__hasProp|__indexOf|__slice)/).size
      end

      # Require a score of 3. This is fairly arbitrary. Consider
      # tweaking later.
      score >= 3
    end

    # Internal: Is this a generated documentation file for a .NET assembly?
    #
    # .NET developers often check in the XML Intellisense file along with an
    # assembly - however, these don't have a special extension, so we have to
    # dig into the contents to determine if it's a docfile. Luckily, these files
    # are extremely structured, so recognizing them is easy.
    #
    # Returns true or false
    def generated_net_docfile?
      return false unless extname.downcase == ".xml"
      return false unless lines.count > 3

      # .NET Docfiles always open with <doc> and their first tag is an
      # <assembly> tag
      lines[1].include?("<doc>") &&
        lines[2].include?("<assembly>") &&
        lines[-2].include?("</doc>")
    end

    # Internal: Is the blob of JS a parser generated by PEG.js?
    #
    # PEG.js-generated parsers are not meant to be consumed by humans.
    #
    # Return true or false
    def generated_parser?
      return false unless extname == '.js'

      # PEG.js-generated parsers include a comment near the top of the file
      # that marks them as such. Only the first five lines are inspected,
      # and the marker must sit inside the first block comment.
      header = lines[0..4].join('')
      header =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG\.js/ ? true : false
    end
  end
end
|
||||
@@ -2,6 +2,9 @@ require 'escape_utils'
|
||||
require 'pygments'
|
||||
require 'yaml'
|
||||
|
||||
require 'linguist/classifier'
|
||||
require 'linguist/samples'
|
||||
|
||||
module Linguist
|
||||
# Language names that are recognizable by GitHub. Defined languages
|
||||
# can be highlighted, searched and listed under the Top Languages page.
|
||||
@@ -9,30 +12,15 @@ module Linguist
|
||||
# Languages are defined in `lib/linguist/languages.yml`.
|
||||
class Language
|
||||
@languages = []
|
||||
@overrides = {}
|
||||
@index = {}
|
||||
@name_index = {}
|
||||
@alias_index = {}
|
||||
@extension_index = {}
|
||||
@filename_index = {}
|
||||
@extension_index = Hash.new { |h,k| h[k] = [] }
|
||||
@filename_index = Hash.new { |h,k| h[k] = [] }
|
||||
|
||||
# Valid Languages types
|
||||
TYPES = [:data, :markup, :programming]
|
||||
|
||||
# Internal: Test if extension maps to multiple Languages.
|
||||
#
|
||||
# Returns true or false.
|
||||
def self.ambiguous?(extension)
|
||||
@overrides.include?(extension)
|
||||
end
|
||||
|
||||
# Internal: Return overridden extensions.
|
||||
#
|
||||
# Returns extensions Array.
|
||||
def self.overridden_extensions
|
||||
@overrides.keys
|
||||
end
|
||||
|
||||
# Internal: Create a new Language object
|
||||
#
|
||||
# attributes - A hash of attributes
|
||||
@@ -43,18 +31,18 @@ module Linguist
|
||||
|
||||
@languages << language
|
||||
|
||||
# All Language names should be unique. Warn if there is a duplicate.
|
||||
# All Language names should be unique. Raise if there is a duplicate.
|
||||
if @name_index.key?(language.name)
|
||||
warn "Duplicate language name: #{language.name}"
|
||||
raise ArgumentError, "Duplicate language name: #{language.name}"
|
||||
end
|
||||
|
||||
# Language name index
|
||||
@index[language.name] = @name_index[language.name] = language
|
||||
|
||||
language.aliases.each do |name|
|
||||
# All Language aliases should be unique. Warn if there is a duplicate.
|
||||
# All Language aliases should be unique. Raise if there is a duplicate.
|
||||
if @alias_index.key?(name)
|
||||
warn "Duplicate alias: #{name}"
|
||||
raise ArgumentError, "Duplicate alias: #{name}"
|
||||
end
|
||||
|
||||
@index[name] = @alias_index[name] = language
|
||||
@@ -62,33 +50,50 @@ module Linguist
|
||||
|
||||
language.extensions.each do |extension|
|
||||
if extension !~ /^\./
|
||||
warn "Extension is missing a '.': #{extension.inspect}"
|
||||
raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
|
||||
end
|
||||
|
||||
unless ambiguous?(extension)
|
||||
# Index the extension with a leading ".": ".rb"
|
||||
@extension_index[extension] = language
|
||||
|
||||
# Index the extension without a leading ".": "rb"
|
||||
@extension_index[extension.sub(/^\./, '')] = language
|
||||
end
|
||||
end
|
||||
|
||||
language.overrides.each do |extension|
|
||||
if extension !~ /^\./
|
||||
warn "Extension is missing a '.': #{extension.inspect}"
|
||||
end
|
||||
|
||||
@overrides[extension] = language
|
||||
@extension_index[extension] << language
|
||||
end
|
||||
|
||||
language.filenames.each do |filename|
|
||||
@filename_index[filename] = language
|
||||
@filename_index[filename] << language
|
||||
end
|
||||
|
||||
language
|
||||
end
|
||||
|
||||
# Public: Detects the Language of the blob.
|
||||
#
|
||||
# name - String filename
|
||||
# data - String blob data. A block may also be passed in for lazy
|
||||
# loading. This behavior is deprecated and you should always
|
||||
# pass in a String.
|
||||
# mode - Optional String mode (defaults to nil)
|
||||
#
|
||||
# Returns Language or nil.
|
||||
def self.detect(name, data, mode = nil)
  # A bit of an elegant hack. If the file is executable but extensionless,
  # append a "magic" extension so it can be classified with other
  # languages that have shebang scripts.
  #
  # mode is parsed as an octal string; the mask 05 requires both the
  # read (4) and execute (1) bits of the lowest permission triad.
  if File.extname(name).empty? && mode && (mode.to_i(8) & 05) == 05
    # += builds a new String, so the caller's name is not mutated.
    name += ".script!"
  end

  possible_languages = find_by_filename(name)

  # Zero or one candidate: nothing to disambiguate (nil when empty).
  return possible_languages.first unless possible_languages.length > 1

  # Deprecated lazy form: data may be a callable that yields the blob.
  data = data.call() if data.respond_to?(:call)

  # Without content there is nothing to classify.
  return nil if data.nil? || data == ""

  # Let the classifier rank only the candidate languages and take the
  # top entry; its first element is the language name.
  result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
  Language[result[0]] if result
end
|
||||
|
||||
# Public: Get all Languages
|
||||
#
|
||||
# Returns an Array of Languages
|
||||
@@ -124,33 +129,19 @@ module Linguist
|
||||
@alias_index[name]
|
||||
end
|
||||
|
||||
# Public: Look up Language by extension.
|
||||
#
|
||||
# extension - The extension String. May include leading "."
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Language.find_by_extension('.rb')
|
||||
# # => #<Language name="Ruby">
|
||||
#
|
||||
# Returns the Language or nil if none was found.
|
||||
def self.find_by_extension(extension)
|
||||
@extension_index[extension]
|
||||
end
|
||||
|
||||
# Public: Look up Language by filename.
|
||||
# Public: Look up Languages by filename.
|
||||
#
|
||||
# filename - The path String.
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Language.find_by_filename('foo.rb')
|
||||
# # => #<Language name="Ruby">
|
||||
# # => [#<Language name="Ruby">]
|
||||
#
|
||||
# Returns the Language or nil if none was found.
|
||||
# Returns all matching Languages or [] if none were found.
|
||||
def self.find_by_filename(filename)
|
||||
basename, extname = File.basename(filename), File.extname(filename)
|
||||
@filename_index[basename] || @extension_index[extname]
|
||||
@filename_index[basename] + @extension_index[extname]
|
||||
end
|
||||
|
||||
# Public: Look up Language by its name or lexer.
|
||||
@@ -231,16 +222,18 @@ module Linguist
|
||||
raise(ArgumentError, "#{@name} is missing lexer")
|
||||
|
||||
@ace_mode = attributes[:ace_mode]
|
||||
@wrap = attributes[:wrap] || false
|
||||
|
||||
# Set legacy search term
|
||||
@search_term = attributes[:search_term] || default_alias_name
|
||||
|
||||
# Set extensions or default to [].
|
||||
@extensions = attributes[:extensions] || []
|
||||
@overrides = attributes[:overrides] || []
|
||||
@filenames = attributes[:filenames] || []
|
||||
|
||||
@primary_extension = attributes[:primary_extension] || default_primary_extension || extensions.first
|
||||
unless @primary_extension = attributes[:primary_extension]
|
||||
raise ArgumentError, "#{@name} is missing primary extension"
|
||||
end
|
||||
|
||||
# Prepend primary extension unless it's already included
|
||||
if primary_extension && !extensions.include?(primary_extension)
|
||||
@@ -320,6 +313,11 @@ module Linguist
|
||||
# Returns a String name or nil
|
||||
attr_reader :ace_mode
|
||||
|
||||
# Public: Should language lines be wrapped
|
||||
#
|
||||
# Returns true or false
|
||||
attr_reader :wrap
|
||||
|
||||
# Public: Get extensions
|
||||
#
|
||||
# Examples
|
||||
@@ -331,7 +329,7 @@ module Linguist
|
||||
|
||||
# Deprecated: Get primary extension
|
||||
#
|
||||
# Defaults to the first extension but can be overriden
|
||||
# Defaults to the first extension but can be overridden
|
||||
# in the languages.yml.
|
||||
#
|
||||
# The primary extension can not be nil. Tests should verify this.
|
||||
@@ -343,11 +341,6 @@ module Linguist
|
||||
# Returns the extension String.
|
||||
attr_reader :primary_extension
|
||||
|
||||
# Internal: Get overridden extensions.
|
||||
#
|
||||
# Returns the extensions Array.
|
||||
attr_reader :overrides
|
||||
|
||||
# Public: Get filenames
|
||||
#
|
||||
# Examples
|
||||
@@ -377,13 +370,6 @@ module Linguist
|
||||
name.downcase.gsub(/\s/, '-')
|
||||
end
|
||||
|
||||
# Internal: Get default primary extension.
|
||||
#
|
||||
# Returns the extension String.
|
||||
def default_primary_extension
|
||||
extensions.first
|
||||
end
|
||||
|
||||
# Public: Get Language group
|
||||
#
|
||||
# Returns a Language
|
||||
@@ -441,11 +427,40 @@ module Linguist
|
||||
def hash
|
||||
name.hash
|
||||
end
|
||||
|
||||
def inspect
|
||||
"#<#{self.class} name=#{name}>"
|
||||
end
|
||||
end
|
||||
|
||||
extensions = Samples::DATA['extnames']
|
||||
filenames = Samples::DATA['filenames']
|
||||
popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))
|
||||
|
||||
YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
|
||||
options['extensions'] ||= []
|
||||
options['filenames'] ||= []
|
||||
|
||||
if extnames = extensions[name]
|
||||
extnames.each do |extname|
|
||||
if !options['extensions'].include?(extname)
|
||||
options['extensions'] << extname
|
||||
else
|
||||
warn "#{name} #{extname.inspect} is already defined in samples/. Remove from languages.yml."
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if fns = filenames[name]
|
||||
fns.each do |filename|
|
||||
if !options['filenames'].include?(filename)
|
||||
options['filenames'] << filename
|
||||
else
|
||||
warn "#{name} #{filename.inspect} is already defined in samples/. Remove from languages.yml."
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
Language.create(
|
||||
:name => name,
|
||||
:color => options['color'],
|
||||
@@ -453,12 +468,12 @@ module Linguist
|
||||
:aliases => options['aliases'],
|
||||
:lexer => options['lexer'],
|
||||
:ace_mode => options['ace_mode'],
|
||||
:wrap => options['wrap'],
|
||||
:group_name => options['group'],
|
||||
:searchable => options.key?('searchable') ? options['searchable'] : true,
|
||||
:search_term => options['search_term'],
|
||||
:extensions => options['extensions'],
|
||||
:extensions => options['extensions'].sort,
|
||||
:primary_extension => options['primary_extension'],
|
||||
:overrides => options['overrides'],
|
||||
:filenames => options['filenames'],
|
||||
:popular => popular.include?(name)
|
||||
)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
38
lib/linguist/md5.rb
Normal file
38
lib/linguist/md5.rb
Normal file
@@ -0,0 +1,38 @@
|
||||
require 'digest/md5'
|
||||
|
||||
module Linguist
  module MD5
    # Public: Create deep nested digest of value object.
    #
    # Useful for object comparison. The class name of every value is mixed
    # into the digest, so 1 and "1" produce different results. Array element
    # order is significant; Hash entry order is not, because the per-entry
    # digests are sorted before being combined.
    #
    # obj - Object to digest. Must be a String, Symbol, Integer, true, false,
    #       nil, or an Array/Hash whose nested values are of those types.
    #
    # Returns String hex digest.
    # Raises TypeError if obj, or any nested value, is of another type.
    def self.hexdigest(obj)
      digest = Digest::MD5.new

      case obj
      when String, Symbol, Integer
        digest.update obj.class.to_s
        digest.update obj.to_s
      when TrueClass, FalseClass, NilClass
        # The class alone identifies these singletons.
        digest.update obj.class.to_s
      when Array
        digest.update obj.class.to_s
        obj.each { |element| digest.update(hexdigest(element)) }
      when Hash
        digest.update obj.class.to_s
        # Digest each [key, value] pair, then sort the digests so that
        # insertion order does not influence the result.
        obj.map { |key, value| hexdigest([key, value]) }.sort.each do |pair_digest|
          digest.update(pair_digest)
        end
      else
        raise TypeError, "can't convert #{obj.inspect} into String"
      end

      digest.hexdigest
    end
  end
end
|
||||
@@ -1,91 +0,0 @@
|
||||
require 'mime/types'
|
||||
require 'yaml'
|
||||
|
||||
class MIME::Type
|
||||
attr_accessor :override
|
||||
end
|
||||
|
||||
# Register additional mime type extensions
|
||||
#
|
||||
# Follows same format as mime-types data file
|
||||
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
|
||||
File.read(File.expand_path("../mimes.yml", __FILE__)).lines.each do |line|
|
||||
# Regexp was cargo culted from mime-types lib
|
||||
next unless line =~ %r{^
|
||||
#{MIME::Type::MEDIA_TYPE_RE}
|
||||
(?:\s@([^\s]+))?
|
||||
(?:\s:(#{MIME::Type::ENCODING_RE}))?
|
||||
}x
|
||||
|
||||
mediatype = $1
|
||||
subtype = $2
|
||||
extensions = $3
|
||||
encoding = $4
|
||||
|
||||
# Lookup existing mime type
|
||||
mime_type = MIME::Types["#{mediatype}/#{subtype}"].first ||
|
||||
# Or create a new instance
|
||||
MIME::Type.new("#{mediatype}/#{subtype}")
|
||||
|
||||
if extensions
|
||||
extensions.split(/,/).each do |extension|
|
||||
mime_type.extensions << extension
|
||||
end
|
||||
end
|
||||
|
||||
if encoding
|
||||
mime_type.encoding = encoding
|
||||
end
|
||||
|
||||
mime_type.override = true
|
||||
|
||||
# Kind of hacky, but we need to reindex the mime type after making changes
|
||||
MIME::Types.add_type_variant(mime_type)
|
||||
MIME::Types.index_extensions(mime_type)
|
||||
end
|
||||
|
||||
module Linguist
|
||||
module Mime
|
||||
# Internal: Look up mime type for extension.
|
||||
#
|
||||
# ext - The extension String. May include leading "."
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Mime.mime_for('.html')
|
||||
# # => 'text/html'
|
||||
#
|
||||
# Mime.mime_for('txt')
|
||||
# # => 'text/plain'
|
||||
#
|
||||
# Return mime type String otherwise falls back to 'text/plain'.
|
||||
def self.mime_for(ext)
|
||||
mime_type = lookup_mime_type_for(ext)
|
||||
mime_type ? mime_type.to_s : 'text/plain'
|
||||
end
|
||||
|
||||
# Internal: Lookup mime type for extension or mime type
|
||||
#
|
||||
# ext_or_mime_type - A file extension ".txt" or mime type "text/plain".
|
||||
#
|
||||
# Returns a MIME::Type
|
||||
def self.lookup_mime_type_for(ext_or_mime_type)
|
||||
ext_or_mime_type ||= ''
|
||||
|
||||
if ext_or_mime_type =~ /\w+\/\w+/
|
||||
guesses = ::MIME::Types[ext_or_mime_type]
|
||||
else
|
||||
guesses = ::MIME::Types.type_for(ext_or_mime_type)
|
||||
end
|
||||
|
||||
# Use custom override first
|
||||
guesses.detect { |type| type.override } ||
|
||||
|
||||
# Prefer text mime types over binary
|
||||
guesses.detect { |type| type.ascii? } ||
|
||||
|
||||
# Otherwise use the first guess
|
||||
guesses.first
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -1,62 +0,0 @@
|
||||
# Additional types to add to MIME::Types
|
||||
#
|
||||
# MIME types are used to set the Content-Type of raw binary blobs. All text
|
||||
# blobs are served as text/plain regardless of their type to ensure they
|
||||
# open in the browser rather than downloading.
|
||||
#
|
||||
# The encoding helps determine whether a file should be treated as plain
|
||||
# text or binary. By default, a mime type's encoding is base64 (binary).
|
||||
# These types will show a "View Raw" link. To force a type to render as
|
||||
# plain text, set it to 8bit for UTF-8. text/* types will be treated as
|
||||
# text by default.
|
||||
#
|
||||
# <type> @<extensions> :<encoding>
|
||||
#
|
||||
# type - mediatype/subtype
|
||||
# extensions - comma separated extension list
|
||||
# encoding - base64 (binary), 7bit (ASCII), 8bit (UTF-8), or
|
||||
# quoted-printable (Printable ASCII).
|
||||
#
|
||||
# Follows same format as mime-types data file
|
||||
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
|
||||
#
|
||||
# Any additions or modifications (even trivial) should have corresponding
|
||||
# test change in `test/test_mime.rb`.
|
||||
|
||||
# TODO: Lookup actual types
|
||||
application/octet-stream @a,blend,gem,graffle,ipa,lib,mcz,nib,o,ogv,otf,pfx,pigx,plgx,psd,sib,spl,sqlite3,swc,ucode,xpi
|
||||
|
||||
# Please keep this list alphabetized
|
||||
application/java-archive @ear,war
|
||||
application/netcdf :8bit
|
||||
application/ogg @ogg
|
||||
application/postscript :base64
|
||||
application/vnd.adobe.air-application-installer-package+zip @air
|
||||
application/vnd.mozilla.xul+xml :8bit
|
||||
application/vnd.oasis.opendocument.presentation @odp
|
||||
application/vnd.oasis.opendocument.spreadsheet @ods
|
||||
application/vnd.oasis.opendocument.text @odt
|
||||
application/vnd.openofficeorg.extension @oxt
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation @pptx
|
||||
application/x-chrome-extension @crx
|
||||
application/x-iwork-keynote-sffkey @key
|
||||
application/x-iwork-numbers-sffnumbers @numbers
|
||||
application/x-iwork-pages-sffpages @pages
|
||||
application/x-ms-xbap @xbap :8bit
|
||||
application/x-parrot-bytecode @pbc
|
||||
application/x-shockwave-flash @swf
|
||||
application/x-silverlight-app @xap
|
||||
application/x-supercollider @sc :8bit
|
||||
application/x-troff-ms :8bit
|
||||
application/x-wais-source :8bit
|
||||
application/xaml+xml @xaml :8bit
|
||||
application/xslt+xml @xslt :8bit
|
||||
image/x-icns @icns
|
||||
text/cache-manifest @manifest
|
||||
text/plain @cu,cxx
|
||||
text/x-logtalk @lgt
|
||||
text/x-nemerle @n
|
||||
text/x-nimrod @nim
|
||||
text/x-ocaml @ml,mli,mll,mly,sig,sml
|
||||
text/x-rust @rs,rc
|
||||
text/x-scheme @rkt,scm,sls,sps,ss
|
||||
@@ -1,92 +0,0 @@
|
||||
require 'linguist/language'
|
||||
require 'linguist/mime'
|
||||
require 'pygments'
|
||||
|
||||
module Linguist
|
||||
# Similar to ::Pathname, Linguist::Pathname wraps a path string and
|
||||
# provides helpful query methods. It's useful when you only have a
|
||||
# filename but not a blob and need to figure out the language of the file.
|
||||
class Pathname
|
||||
# Public: Initialize a Pathname
|
||||
#
|
||||
# path - A filename String. The file may or may not actually exist.
|
||||
#
|
||||
# Returns a Pathname.
|
||||
def initialize(path)
|
||||
@path = path
|
||||
end
|
||||
|
||||
# Public: Get the basename of the path
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Pathname.new('sub/dir/file.rb').basename
|
||||
# # => 'file.rb'
|
||||
#
|
||||
# Returns a String.
|
||||
def basename
|
||||
File.basename(@path)
|
||||
end
|
||||
|
||||
# Public: Get the extname of the path
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Pathname.new('.rb').extname
|
||||
# # => '.rb'
|
||||
#
|
||||
# Pathname.new('file.rb').extname
|
||||
# # => '.rb'
|
||||
#
|
||||
# Returns a String.
|
||||
def extname
|
||||
File.extname(@path)
|
||||
end
|
||||
|
||||
# Public: Get the language of the path
|
||||
#
|
||||
# The path extension name is the only heuristic used to detect the
|
||||
# language name.
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Pathname.new('file.rb').language
|
||||
# # => Language['Ruby']
|
||||
#
|
||||
# Returns a Language or nil if none was found.
|
||||
def language
|
||||
@language ||= Language.find_by_filename(@path)
|
||||
end
|
||||
|
||||
# Internal: Get the lexer of the path
|
||||
#
|
||||
# Returns a Lexer.
|
||||
def lexer
|
||||
language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
|
||||
end
|
||||
|
||||
# Public: Get the mime type
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Pathname.new('index.html').mime_type
|
||||
# # => 'text/html'
|
||||
#
|
||||
# Returns a mime type String.
|
||||
def mime_type
|
||||
@mime_type ||= Mime.mime_for(extname)
|
||||
end
|
||||
|
||||
# Public: Return self as String
|
||||
#
|
||||
# Returns a String
|
||||
def to_s
|
||||
@path.dup
|
||||
end
|
||||
|
||||
def eql?(other)
|
||||
other.is_a?(self.class) && @path == other.to_s
|
||||
end
|
||||
alias_method :==, :eql?
|
||||
end
|
||||
end
|
||||
@@ -67,8 +67,8 @@ module Linguist
|
||||
return if @computed_stats
|
||||
|
||||
@enum.each do |blob|
|
||||
# Skip binary file extensions
|
||||
next if blob.binary_mime_type?
|
||||
# Skip files that are likely binary
|
||||
next if blob.likely_binary?
|
||||
|
||||
# Skip vendored or generated blobs
|
||||
next if blob.vendored? || blob.generated? || blob.language.nil?
|
||||
@@ -80,7 +80,7 @@ module Linguist
|
||||
end
|
||||
|
||||
# Compute total size
|
||||
@size = @sizes.inject(0) { |s,(k,v)| s + v }
|
||||
@size = @sizes.inject(0) { |s,(_,v)| s + v }
|
||||
|
||||
# Get primary language
|
||||
if primary = @sizes.max_by { |(_, size)| size }
|
||||
|
||||
37758
lib/linguist/samples.json
Normal file
37758
lib/linguist/samples.json
Normal file
File diff suppressed because it is too large
Load Diff
98
lib/linguist/samples.rb
Normal file
98
lib/linguist/samples.rb
Normal file
@@ -0,0 +1,98 @@
|
||||
require 'yaml'
|
||||
|
||||
require 'linguist/md5'
|
||||
require 'linguist/classifier'
|
||||
|
||||
module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object.
    #
    # Only defined when the serialized db exists on disk. The .json file is
    # read with the YAML loader.
    if File.exist?(PATH)
      DATA = YAML.load_file(PATH)
    end

    # Internal: List the entries of a directory, without "." and "..",
    # in sorted order so iteration does not depend on the filesystem.
    #
    # dir - String directory path.
    #
    # Returns Array of entry name Strings.
    def self.sorted_entries(dir)
      Dir.entries(dir).reject { |entry| entry == '.' || entry == '..' }.sort
    end
    private_class_method :sorted_entries

    # Public: Iterate over each sample.
    #
    # Every directory directly under ROOT is a language category. Files in a
    # category's "filenames" subdirectory are yielded with a :filename key;
    # all other files are yielded with an :extname key.
    #
    # &block - Yields a sample Hash with :path and :language, plus either
    #          :extname or :filename.
    #
    # Returns nothing (an Enumerator when no block is given).
    # Raises RuntimeError if a sample outside "filenames" has no extension.
    def self.each(&block)
      return enum_for(:each) unless block_given?

      sorted_entries(ROOT).each do |category|
        # Skip text and binary for now
        # Possibly reconsider this later
        next if category == 'Text' || category == 'Binary'

        dirname = File.join(ROOT, category)

        sorted_entries(dirname).each do |filename|
          if filename == 'filenames'
            sorted_entries(File.join(dirname, filename)).each do |subfilename|
              yield({
                :path     => File.join(dirname, filename, subfilename),
                :language => category,
                :filename => subfilename
              })
            end
          else
            if File.extname(filename) == ""
              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
            end

            yield({
              :path     => File.join(dirname, filename),
              :language => category,
              :extname  => File.extname(filename)
            })
          end
        end
      end

      nil
    end

    # Public: Build the samples db from all samples.
    #
    # Collects the extensions and filenames seen per language, passes every
    # sample's contents to Classifier.train!, and finally stores a digest of
    # the whole db under the 'md5' key.
    #
    # Returns the db Hash.
    def self.data
      db = {}
      db['extnames'] = {}
      db['filenames'] = {}

      each do |sample|
        language_name = sample[:language]

        if sample[:extname]
          extnames = (db['extnames'][language_name] ||= [])
          # Several samples may share an extension; record it once.
          unless extnames.include?(sample[:extname])
            extnames << sample[:extname]
            extnames.sort!
          end
        end

        if sample[:filename]
          filenames = (db['filenames'][language_name] ||= [])
          filenames << sample[:filename]
          filenames.sort!
        end

        data = File.read(sample[:path])
        Classifier.train!(db, language_name, data)
      end

      db['md5'] = Linguist::MD5.hexdigest(db)

      db
    end
  end
end
|
||||
197
lib/linguist/tokenizer.rb
Normal file
197
lib/linguist/tokenizer.rb
Normal file
@@ -0,0 +1,197 @@
|
||||
require 'strscan'
|
||||
|
||||
module Linguist
  # Generic programming language tokenizer.
  #
  # Tokens are designed for use in the language bayes classifier.
  # It strips any data strings or comments and preserves significant
  # language symbols.
  class Tokenizer
    # Public: Extract tokens from data
    #
    # data - String to tokenize
    #
    # Returns Array of token Strings.
    def self.tokenize(data)
      new.extract_tokens(data)
    end

    # Read up to 100KB
    BYTE_LIMIT = 100_000

    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ]

    # Start state on opening token, ignore anything until the closing
    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)']     # Coq
    ]

    # NOTE(review): inside a double-quoted Ruby string "\s" is a literal
    # space, so this pattern accepts leading spaces only (not tabs) and
    # requires a space after the comment marker. Left unchanged on purpose:
    # altering it would change the token stream the classifier is fed.
    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
      "\s*#{Regexp.escape(c)} "
    }.join("|"))

    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
      Regexp.escape(c[0])
    }.join("|"))

    # Closing-delimiter Regexp keyed by opening delimiter, built once so
    # the scanner does not recompile it for every comment it meets.
    END_MULTI_LINE_COMMENT = MULTI_LINE_COMMENTS.inject({}) { |table, (opener, closer)|
      table[opener] = Regexp.compile(Regexp.escape(closer))
      table
    }

    # Internal: Extract generic tokens from data.
    #
    # The branches below are tried in order at every scanner position, so
    # their sequence matters (e.g. number literals are discarded before the
    # regular-token rule can pick them up).
    #
    # data - String to scan.
    #
    # Examples
    #
    #   extract_tokens("printf('Hello')")
    #   # => ['printf', '(', ')']
    #
    # Returns Array of token Strings.
    def extract_tokens(data)
      s = StringScanner.new(data)

      tokens = []
      until s.eos?
        break if s.pos >= BYTE_LIMIT

        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
          # tokens << token.strip
          s.skip_until(/\n|\Z/)

        # Multiline comments
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
          # tokens << token
          # If the closing delimiter never appears, skip_until matches
          # nothing and scanning simply continues after the opener.
          s.skip_until(END_MULTI_LINE_COMMENT[token])
          # tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)
          if s.peek(1) == "\""
            # Empty string literal: consume the closing quote.
            s.getch
          else
            s.skip_until(/[^\\]"/)
          end
        elsif s.scan(/'/)
          if s.peek(1) == "'"
            s.getch
          else
            s.skip_until(/[^\\]'/)
          end

        # Skip number literals
        elsif s.scan(/(0x)?\d(\d|\.)*/)
          # Matched text is intentionally discarded.

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
        elsif token = s.scan(/[\w\.@#\/\*]+/)
          tokens << token

        # Common operators
        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
          tokens << token

        else
          # Nothing recognised here: drop one character and move on.
          s.getch
        end
      end

      tokens
    end

    # Internal: Extract normalized shebang command token.
    #
    # The command is the last path segment of the interpreter, or the
    # first argument when the interpreter is "env". Only the leading run
    # of non-digit characters is kept, which drops version suffixes.
    #
    # Examples
    #
    #   extract_shebang("#!/usr/bin/ruby")
    #   # => "ruby"
    #
    #   extract_shebang("#!/usr/bin/env node")
    #   # => "node"
    #
    #   extract_shebang("#!/usr/bin/python2.7")
    #   # => "python"
    #
    # Returns String token or nil if it couldn't be parsed.
    def extract_shebang(data)
      s = StringScanner.new(data)

      if path = s.scan(/^#!\s*\S+/)
        script = path.split('/').last
        if script == 'env'
          s.scan(/\s+/)
          script = s.scan(/\S+/)
        end
        script = script[/[^\d]+/, 0] if script
        return script
      end

      nil
    end

    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
    #
    # Examples
    #
    #   extract_sgml_tokens("<a href='' class=foo>")
    #   # => ["<a>", "href=", "class="]
    #
    # Returns Array of token Strings.
    def extract_sgml_tokens(data)
      s = StringScanner.new(data)

      tokens = []

      until s.eos?
        # Emit start token
        if token = s.scan(/<\/?[^\s>]+/)
          tokens << "#{token}>"

        # Emit attributes with trailing =
        elsif token = s.scan(/\w+=/)
          tokens << token

          # Then skip over attribute value
          if s.scan(/"/)
            s.skip_until(/[^\\]"/)
          elsif s.scan(/'/)
            s.skip_until(/[^\\]'/)
          else
            s.skip_until(/\w+/)
          end

        # Emit lone attributes
        elsif token = s.scan(/\w+/)
          tokens << token

        # Stop at the end of the tag
        elsif s.scan(/>/)
          s.terminate

        else
          s.getch
        end
      end

      tokens
    end
  end
end
|
||||
@@ -16,13 +16,19 @@
|
||||
# https://github.com/joyent/node
|
||||
- ^deps/
|
||||
- ^tools/
|
||||
- (^|/)configure$
|
||||
- (^|/)configure.ac$
|
||||
- (^|/)config.guess$
|
||||
- (^|/)config.sub$
|
||||
|
||||
# Node depedencies
|
||||
# Node dependencies
|
||||
- node_modules/
|
||||
|
||||
# Vendored depedencies
|
||||
# Vendored dependencies
|
||||
- vendor/
|
||||
|
||||
# Debian packaging
|
||||
- ^debian/
|
||||
|
||||
## Commonly Bundled JavaScript frameworks ##
|
||||
|
||||
@@ -61,8 +67,16 @@
|
||||
# MathJax
|
||||
- (^|/)MathJax/
|
||||
|
||||
# SyntaxHighlighter - http://alexgorbatchev.com/
|
||||
- (^|/)shBrush([^.]*)\.js$
|
||||
- (^|/)shCore\.js$
|
||||
- (^|/)shLegacy\.js$
|
||||
|
||||
## Python ##
|
||||
|
||||
# django
|
||||
- (^|/)admin_media/
|
||||
|
||||
# Fabric
|
||||
- ^fabfile\.py$
|
||||
|
||||
@@ -94,3 +108,6 @@
|
||||
|
||||
# Samples folders
|
||||
- ^[Ss]amples/
|
||||
|
||||
# Test fixtures
|
||||
- ^[Tt]est/fixtures/
|
||||
|
||||
Reference in New Issue
Block a user