mirror of
https://github.com/KevinMidboe/linguist.git
synced 2026-05-02 19:48:32 +00:00
Merge remote branch 'upstream/master' into lasso
Conflicts: lib/linguist/languages.yml
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
require 'linguist/blob_helper'
|
||||
require 'linguist/generated'
|
||||
require 'linguist/language'
|
||||
require 'linguist/mime'
|
||||
require 'linguist/repository'
|
||||
require 'linguist/samples'
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
require 'linguist/generated'
|
||||
require 'linguist/language'
|
||||
require 'linguist/mime'
|
||||
|
||||
require 'charlock_holmes'
|
||||
require 'escape_utils'
|
||||
require 'mime/types'
|
||||
require 'pygments'
|
||||
require 'yaml'
|
||||
|
||||
@@ -23,6 +23,22 @@ module Linguist
|
||||
File.extname(name.to_s)
|
||||
end
|
||||
|
||||
# Internal: Lookup mime type for extension.
|
||||
#
|
||||
# Returns a MIME::Type
|
||||
def _mime_type
|
||||
if defined? @_mime_type
|
||||
@_mime_type
|
||||
else
|
||||
guesses = ::MIME::Types.type_for(extname.to_s)
|
||||
|
||||
# Prefer text mime types over binary
|
||||
@_mime_type = guesses.detect { |type| type.ascii? } ||
|
||||
# Otherwise use the first guess
|
||||
guesses.first
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Get the actual blob mime type
|
||||
#
|
||||
# Examples
|
||||
@@ -32,7 +48,14 @@ module Linguist
|
||||
#
|
||||
# Returns a mime type String.
|
||||
def mime_type
|
||||
@mime_type ||= Mime.mime_for(extname.to_s)
|
||||
_mime_type ? _mime_type.to_s : 'text/plain'
|
||||
end
|
||||
|
||||
# Internal: Is the blob binary according to its mime type
|
||||
#
|
||||
# Return true or false
|
||||
def binary_mime_type?
|
||||
_mime_type ? _mime_type.binary? : false
|
||||
end
|
||||
|
||||
# Public: Get the Content-Type header value
|
||||
@@ -83,15 +106,6 @@ module Linguist
|
||||
@detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
|
||||
end
|
||||
|
||||
# Public: Is the blob binary according to its mime type
|
||||
#
|
||||
# Return true or false
|
||||
def binary_mime_type?
|
||||
if mime_type = Mime.lookup_mime_type_for(extname)
|
||||
mime_type.binary?
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Is the blob binary?
|
||||
#
|
||||
# Return true or false
|
||||
@@ -146,7 +160,7 @@ module Linguist
|
||||
#
|
||||
# Return true or false
|
||||
def safe_to_colorize?
|
||||
text? && !large? && !high_ratio_of_long_lines?
|
||||
!large? && text? && !high_ratio_of_long_lines?
|
||||
end
|
||||
|
||||
# Internal: Does the blob have a high ratio of long lines?
|
||||
@@ -190,7 +204,31 @@ module Linguist
|
||||
#
|
||||
# Returns an Array of lines
|
||||
def lines
|
||||
@lines ||= (viewable? && data) ? data.split("\n", -1) : []
|
||||
@lines ||=
|
||||
if viewable? && data
|
||||
data.split(line_split_character, -1)
|
||||
else
|
||||
[]
|
||||
end
|
||||
end
|
||||
|
||||
# Character used to split lines. This is almost always "\n" except when Mac
|
||||
# Format is detected in which case it's "\r".
|
||||
#
|
||||
# Returns a split pattern string.
|
||||
def line_split_character
|
||||
@line_split_character ||= (mac_format?? "\r" : "\n")
|
||||
end
|
||||
|
||||
# Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
|
||||
# for line ends and does not include a \n (0x0a).
|
||||
#
|
||||
# Returns true when mac format is detected.
|
||||
def mac_format?
|
||||
return if !viewable?
|
||||
if pos = data[0, 4096].index("\r")
|
||||
data[pos + 1] != ?\n
|
||||
end
|
||||
end
|
||||
|
||||
# Public: Get number of lines of code
|
||||
@@ -236,7 +274,9 @@ module Linguist
|
||||
#
|
||||
# Return true or false
|
||||
def indexable?
|
||||
if binary?
|
||||
if size > 100 * 1024
|
||||
false
|
||||
elsif binary?
|
||||
false
|
||||
elsif extname == '.txt'
|
||||
true
|
||||
@@ -246,8 +286,6 @@ module Linguist
|
||||
false
|
||||
elsif generated?
|
||||
false
|
||||
elsif size > 100 * 1024
|
||||
false
|
||||
else
|
||||
true
|
||||
end
|
||||
@@ -259,11 +297,15 @@ module Linguist
|
||||
#
|
||||
# Returns a Language or nil if none is detected
|
||||
def language
|
||||
if defined? @language
|
||||
@language
|
||||
elsif !binary_mime_type?
|
||||
@language = Language.detect(name.to_s, lambda { data }, mode)
|
||||
return @language if defined? @language
|
||||
|
||||
if defined?(@data) && @data.is_a?(String)
|
||||
data = @data
|
||||
else
|
||||
data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
|
||||
end
|
||||
|
||||
@language = Language.detect(name.to_s, data, mode)
|
||||
end
|
||||
|
||||
# Internal: Get the lexer of the blob.
|
||||
|
||||
@@ -84,7 +84,9 @@ module Linguist
|
||||
|
||||
if possible_languages.length > 1
|
||||
data = data.call() if data.respond_to?(:call)
|
||||
if result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
|
||||
if data.nil? || data == ""
|
||||
nil
|
||||
elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
|
||||
Language[result[0]]
|
||||
end
|
||||
else
|
||||
@@ -220,6 +222,7 @@ module Linguist
|
||||
raise(ArgumentError, "#{@name} is missing lexer")
|
||||
|
||||
@ace_mode = attributes[:ace_mode]
|
||||
@wrap = attributes[:wrap] || false
|
||||
|
||||
# Set legacy search term
|
||||
@search_term = attributes[:search_term] || default_alias_name
|
||||
@@ -310,6 +313,11 @@ module Linguist
|
||||
# Returns a String name or nil
|
||||
attr_reader :ace_mode
|
||||
|
||||
# Public: Should language lines be wrapped
|
||||
#
|
||||
# Returns true or false
|
||||
attr_reader :wrap
|
||||
|
||||
# Public: Get extensions
|
||||
#
|
||||
# Examples
|
||||
@@ -460,6 +468,7 @@ module Linguist
|
||||
:aliases => options['aliases'],
|
||||
:lexer => options['lexer'],
|
||||
:ace_mode => options['ace_mode'],
|
||||
:wrap => options['wrap'],
|
||||
:group_name => options['group'],
|
||||
:searchable => options.key?('searchable') ? options['searchable'] : true,
|
||||
:search_term => options['search_term'],
|
||||
|
||||
@@ -2,21 +2,20 @@
|
||||
#
|
||||
# All languages have an associated lexer for syntax highlighting. It
|
||||
# defaults to name.downcase, which covers most cases. Make sure the
|
||||
# lexer exists in lexers.yml. This is a list of available lexers in
|
||||
# our version of pygments.
|
||||
# lexer exists in lexers.yml. This is a list of lexers available in our
|
||||
# version of pygments.
|
||||
#
|
||||
# type - Either data, programming, markup, or nil
|
||||
# lexer - An explicit lexer String (defaults to name.downcase)
|
||||
# aliases - An Array of additional aliases (implicitly
|
||||
# includes name.downcase)
|
||||
# ace_mode - A String name of Ace Mode (if available)
|
||||
# extension - An Array of associated extensions. If file samples
|
||||
# are included in 'samples/<Language Name>/', then
|
||||
# its extension does not need to be listed.
|
||||
# wrap - Boolean wrap to enable line wrapping (default: false)
|
||||
# extension - An Array of associated extensions
|
||||
# primary_extension - A String for the main extension associated with
|
||||
# the language. Must be unique. Used when a Language
|
||||
# is picked from a dropdown and we need to
|
||||
# automatically choose an extension.
|
||||
# the language. Must be unique. Used when a Language is picked
|
||||
# from a dropdown and we need to automatically choose an
|
||||
# extension.
|
||||
# searchable - Boolean flag to enable searching (defaults to true)
|
||||
# search_term - Deprecated: Some languages may be indexed under a
|
||||
# different alias. Avoid defining new exceptions.
|
||||
@@ -742,6 +741,7 @@ Markdown:
|
||||
type: markup
|
||||
lexer: Text only
|
||||
ace_mode: markdown
|
||||
wrap: true
|
||||
primary_extension: .md
|
||||
extensions:
|
||||
- .markdown
|
||||
@@ -1189,6 +1189,7 @@ Textile:
|
||||
type: markup
|
||||
lexer: Text only
|
||||
ace_mode: textile
|
||||
wrap: true
|
||||
primary_extension: .textile
|
||||
extensions:
|
||||
- .textile
|
||||
@@ -1333,6 +1334,7 @@ ooc:
|
||||
|
||||
reStructuredText:
|
||||
type: markup
|
||||
wrap: true
|
||||
search_term: rst
|
||||
aliases:
|
||||
- rst
|
||||
|
||||
@@ -1,91 +0,0 @@
|
||||
require 'mime/types'
|
||||
require 'yaml'
|
||||
|
||||
class MIME::Type
|
||||
attr_accessor :override
|
||||
end
|
||||
|
||||
# Register additional mime type extensions
|
||||
#
|
||||
# Follows same format as mime-types data file
|
||||
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
|
||||
File.read(File.expand_path("../mimes.yml", __FILE__)).lines.each do |line|
|
||||
# Regexp was cargo culted from mime-types lib
|
||||
next unless line =~ %r{^
|
||||
#{MIME::Type::MEDIA_TYPE_RE}
|
||||
(?:\s@([^\s]+))?
|
||||
(?:\s:(#{MIME::Type::ENCODING_RE}))?
|
||||
}x
|
||||
|
||||
mediatype = $1
|
||||
subtype = $2
|
||||
extensions = $3
|
||||
encoding = $4
|
||||
|
||||
# Lookup existing mime type
|
||||
mime_type = MIME::Types["#{mediatype}/#{subtype}"].first ||
|
||||
# Or create a new instance
|
||||
MIME::Type.new("#{mediatype}/#{subtype}")
|
||||
|
||||
if extensions
|
||||
extensions.split(/,/).each do |extension|
|
||||
mime_type.extensions << extension
|
||||
end
|
||||
end
|
||||
|
||||
if encoding
|
||||
mime_type.encoding = encoding
|
||||
end
|
||||
|
||||
mime_type.override = true
|
||||
|
||||
# Kind of hacky, but we need to reindex the mime type after making changes
|
||||
MIME::Types.add_type_variant(mime_type)
|
||||
MIME::Types.index_extensions(mime_type)
|
||||
end
|
||||
|
||||
module Linguist
|
||||
module Mime
|
||||
# Internal: Look up mime type for extension.
|
||||
#
|
||||
# ext - The extension String. May include leading "."
|
||||
#
|
||||
# Examples
|
||||
#
|
||||
# Mime.mime_for('.html')
|
||||
# # => 'text/html'
|
||||
#
|
||||
# Mime.mime_for('txt')
|
||||
# # => 'text/plain'
|
||||
#
|
||||
# Return mime type String otherwise falls back to 'text/plain'.
|
||||
def self.mime_for(ext)
|
||||
mime_type = lookup_mime_type_for(ext)
|
||||
mime_type ? mime_type.to_s : 'text/plain'
|
||||
end
|
||||
|
||||
# Internal: Lookup mime type for extension or mime type
|
||||
#
|
||||
# ext_or_mime_type - A file extension ".txt" or mime type "text/plain".
|
||||
#
|
||||
# Returns a MIME::Type
|
||||
def self.lookup_mime_type_for(ext_or_mime_type)
|
||||
ext_or_mime_type ||= ''
|
||||
|
||||
if ext_or_mime_type =~ /\w+\/\w+/
|
||||
guesses = ::MIME::Types[ext_or_mime_type]
|
||||
else
|
||||
guesses = ::MIME::Types.type_for(ext_or_mime_type)
|
||||
end
|
||||
|
||||
# Use custom override first
|
||||
guesses.detect { |type| type.override } ||
|
||||
|
||||
# Prefer text mime types over binary
|
||||
guesses.detect { |type| type.ascii? } ||
|
||||
|
||||
# Otherwise use the first guess
|
||||
guesses.first
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -1,62 +0,0 @@
|
||||
# Additional types to add to MIME::Types
|
||||
#
|
||||
# MIME types are used to set the Content-Type of raw binary blobs. All text
|
||||
# blobs are served as text/plain regardless of their type to ensure they
|
||||
# open in the browser rather than downloading.
|
||||
#
|
||||
# The encoding helps determine whether a file should be treated as plain
|
||||
# text or binary. By default, a mime type's encoding is base64 (binary).
|
||||
# These types will show a "View Raw" link. To force a type to render as
|
||||
# plain text, set it to 8bit for UTF-8. text/* types will be treated as
|
||||
# text by default.
|
||||
#
|
||||
# <type> @<extensions> :<encoding>
|
||||
#
|
||||
# type - mediatype/subtype
|
||||
# extensions - comma separated extension list
|
||||
# encoding - base64 (binary), 7bit (ASCII), 8bit (UTF-8), or
|
||||
# quoted-printable (Printable ASCII).
|
||||
#
|
||||
# Follows same format as mime-types data file
|
||||
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
|
||||
#
|
||||
# Any additions or modifications (even trivial) should have corresponding
|
||||
# test change in `test/test_mime.rb`.
|
||||
|
||||
# TODO: Lookup actual types
|
||||
application/octet-stream @a,blend,gem,graffle,ipa,lib,mcz,nib,o,ogv,otf,pfx,pigx,plgx,psd,sib,spl,sqlite3,swc,ucode,xpi
|
||||
|
||||
# Please keep this list alphabetized
|
||||
application/java-archive @ear,war
|
||||
application/netcdf :8bit
|
||||
application/ogg @ogg
|
||||
application/postscript :base64
|
||||
application/vnd.adobe.air-application-installer-package+zip @air
|
||||
application/vnd.mozilla.xul+xml :8bit
|
||||
application/vnd.oasis.opendocument.presentation @odp
|
||||
application/vnd.oasis.opendocument.spreadsheet @ods
|
||||
application/vnd.oasis.opendocument.text @odt
|
||||
application/vnd.openofficeorg.extension @oxt
|
||||
application/vnd.openxmlformats-officedocument.presentationml.presentation @pptx
|
||||
application/x-chrome-extension @crx
|
||||
application/x-iwork-keynote-sffkey @key
|
||||
application/x-iwork-numbers-sffnumbers @numbers
|
||||
application/x-iwork-pages-sffpages @pages
|
||||
application/x-ms-xbap @xbap :8bit
|
||||
application/x-parrot-bytecode @pbc
|
||||
application/x-shockwave-flash @swf
|
||||
application/x-silverlight-app @xap
|
||||
application/x-supercollider @sc :8bit
|
||||
application/x-troff-ms :8bit
|
||||
application/x-wais-source :8bit
|
||||
application/xaml+xml @xaml :8bit
|
||||
application/xslt+xml @xslt :8bit
|
||||
image/x-icns @icns
|
||||
text/cache-manifest @manifest
|
||||
text/plain @cu,cxx
|
||||
text/x-logtalk @lgt
|
||||
text/x-nemerle @n
|
||||
text/x-nimrod @nim
|
||||
text/x-ocaml @ml,mli,mll,mly,sig,sml
|
||||
text/x-rust @rs,rc
|
||||
text/x-scheme @rkt,scm,sls,sps,ss
|
||||
File diff suppressed because it is too large
Load Diff
@@ -76,12 +76,14 @@ module Linguist
|
||||
db['extnames'][language_name] ||= []
|
||||
if !db['extnames'][language_name].include?(sample[:extname])
|
||||
db['extnames'][language_name] << sample[:extname]
|
||||
db['extnames'][language_name].sort!
|
||||
end
|
||||
end
|
||||
|
||||
if sample[:filename]
|
||||
db['filenames'][language_name] ||= []
|
||||
db['filenames'][language_name] << sample[:filename]
|
||||
db['filenames'][language_name].sort!
|
||||
end
|
||||
|
||||
data = File.read(sample[:path])
|
||||
|
||||
@@ -16,12 +16,18 @@ module Linguist
|
||||
new.extract_tokens(data)
|
||||
end
|
||||
|
||||
# Read up to 100KB
|
||||
BYTE_LIMIT = 100_000
|
||||
|
||||
# Start state on token, ignore anything till the next newline
|
||||
SINGLE_LINE_COMMENTS = [
|
||||
'//', # C
|
||||
'#', # Ruby
|
||||
'%', # Tex
|
||||
]
|
||||
|
||||
# Start state on opening token, ignore anything until the closing
|
||||
# token is reached.
|
||||
MULTI_LINE_COMMENTS = [
|
||||
['/*', '*/'], # C
|
||||
['<!--', '-->'], # XML
|
||||
@@ -30,7 +36,7 @@ module Linguist
|
||||
]
|
||||
|
||||
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
|
||||
"^\s*#{Regexp.escape(c)} "
|
||||
"\s*#{Regexp.escape(c)} "
|
||||
}.join("|"))
|
||||
|
||||
START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
|
||||
@@ -52,22 +58,24 @@ module Linguist
|
||||
|
||||
tokens = []
|
||||
until s.eos?
|
||||
break if s.pos >= BYTE_LIMIT
|
||||
|
||||
if token = s.scan(/^#!.+$/)
|
||||
if name = extract_shebang(token)
|
||||
tokens << "SHEBANG#!#{name}"
|
||||
end
|
||||
|
||||
# Single line comment
|
||||
elsif token = s.scan(START_SINGLE_LINE_COMMENT)
|
||||
tokens << token.strip
|
||||
elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
|
||||
# tokens << token.strip
|
||||
s.skip_until(/\n|\Z/)
|
||||
|
||||
# Multiline comments
|
||||
elsif token = s.scan(START_MULTI_LINE_COMMENT)
|
||||
tokens << token
|
||||
# tokens << token
|
||||
close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
|
||||
s.skip_until(Regexp.compile(Regexp.escape(close_token)))
|
||||
tokens << close_token
|
||||
# tokens << close_token
|
||||
|
||||
# Skip single or double quoted strings
|
||||
elsif s.scan(/"/)
|
||||
|
||||
Reference in New Issue
Block a user