Merge remote branch 'upstream/master' into lasso

Conflicts:
	lib/linguist/languages.yml
This commit is contained in:
Steve Piercy
2012-12-05 12:55:30 -08:00
39 changed files with 36699 additions and 27166 deletions

View File

@@ -1,6 +1,5 @@
require 'linguist/blob_helper'
require 'linguist/generated'
require 'linguist/language'
require 'linguist/mime'
require 'linguist/repository'
require 'linguist/samples'

View File

@@ -1,9 +1,9 @@
require 'linguist/generated'
require 'linguist/language'
require 'linguist/mime'
require 'charlock_holmes'
require 'escape_utils'
require 'mime/types'
require 'pygments'
require 'yaml'
@@ -23,6 +23,22 @@ module Linguist
File.extname(name.to_s)
end
# Internal: Lookup mime type for extension.
#
# Returns a MIME::Type
def _mime_type
  # Memoized with `defined?` instead of `||=` so that a nil result (no
  # candidate type for the extension) is cached too.
  return @_mime_type if defined? @_mime_type

  candidates = ::MIME::Types.type_for(extname.to_s)
  # A text (ascii) candidate wins over a binary one; when there is no text
  # candidate, fall back to whichever candidate was listed first.
  @_mime_type = candidates.detect(&:ascii?) || candidates.first
end
# Public: Get the actual blob mime type
#
# Examples
@@ -32,7 +48,14 @@ module Linguist
#
# Returns a mime type String.
def mime_type
@mime_type ||= Mime.mime_for(extname.to_s)
_mime_type ? _mime_type.to_s : 'text/plain'
end
# Internal: Is the blob binary according to its mime type
#
# Return true or false
def binary_mime_type?
  # With no known mime type the blob is not treated as binary.
  return false unless _mime_type

  _mime_type.binary?
end
# Public: Get the Content-Type header value
@@ -83,15 +106,6 @@ module Linguist
@detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
end
# Public: Is the blob binary according to its mime type
#
# Return true or false
def binary_mime_type?
if mime_type = Mime.lookup_mime_type_for(extname)
mime_type.binary?
end
end
# Public: Is the blob binary?
#
# Return true or false
@@ -146,7 +160,7 @@ module Linguist
#
# Return true or false
def safe_to_colorize?
text? && !large? && !high_ratio_of_long_lines?
!large? && text? && !high_ratio_of_long_lines?
end
# Internal: Does the blob have a ratio of long lines?
@@ -190,7 +204,31 @@ module Linguist
#
# Returns an Array of lines
def lines
@lines ||= (viewable? && data) ? data.split("\n", -1) : []
@lines ||=
if viewable? && data
data.split(line_split_character, -1)
else
[]
end
end
# Character used to split lines. This is almost always "\n" except when Mac
# Format is detected in which case it's "\r".
#
# Returns a split pattern string.
def line_split_character
  # Computed once per blob and then reused.
  return @line_split_character if @line_split_character

  @line_split_character =
    if mac_format?
      "\r"
    else
      "\n"
    end
end
# Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
# for line ends and does not include a \n (0x0a).
#
# Returns true when mac format is detected.
def mac_format?
  return unless viewable?

  # Only the first 4096 characters are searched for a carriage return.
  cr_index = data[0, 4096].index("\r")
  # A "\r" that is not immediately followed by "\n" marks Mac line endings;
  # with no "\r" in the window the result is nil.
  data[cr_index + 1] != ?\n if cr_index
end
# Public: Get number of lines of code
@@ -236,7 +274,9 @@ module Linguist
#
# Return true or false
def indexable?
if binary?
if size > 100 * 1024
false
elsif binary?
false
elsif extname == '.txt'
true
@@ -246,8 +286,6 @@ module Linguist
false
elsif generated?
false
elsif size > 100 * 1024
false
else
true
end
@@ -259,11 +297,15 @@ module Linguist
#
# Returns a Language or nil if none is detected
def language
if defined? @language
@language
elsif !binary_mime_type?
@language = Language.detect(name.to_s, lambda { data }, mode)
return @language if defined? @language
if defined?(@data) && @data.is_a?(String)
data = @data
else
data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
end
@language = Language.detect(name.to_s, data, mode)
end
# Internal: Get the lexer of the blob.

View File

@@ -84,7 +84,9 @@ module Linguist
if possible_languages.length > 1
data = data.call() if data.respond_to?(:call)
if result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
if data.nil? || data == ""
nil
elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
Language[result[0]]
end
else
@@ -220,6 +222,7 @@ module Linguist
raise(ArgumentError, "#{@name} is missing lexer")
@ace_mode = attributes[:ace_mode]
@wrap = attributes[:wrap] || false
# Set legacy search term
@search_term = attributes[:search_term] || default_alias_name
@@ -310,6 +313,11 @@ module Linguist
# Returns a String name or nil
attr_reader :ace_mode
# Public: Should language lines be wrapped
#
# Returns true or false
attr_reader :wrap
# Public: Get extensions
#
# Examples
@@ -460,6 +468,7 @@ module Linguist
:aliases => options['aliases'],
:lexer => options['lexer'],
:ace_mode => options['ace_mode'],
:wrap => options['wrap'],
:group_name => options['group'],
:searchable => options.key?('searchable') ? options['searchable'] : true,
:search_term => options['search_term'],

View File

@@ -2,21 +2,20 @@
#
# All languages have an associated lexer for syntax highlighting. It
# defaults to name.downcase, which covers most cases. Make sure the
# lexer exists in lexers.yml. This is a list of available lexers in
# our version of pygments.
# lexer exists in lexers.yml. This is a list of available lexers in our
# version of pygments.
#
# type - Either data, programming, markup, or nil
# lexer - An explicit lexer String (defaults to name.downcase)
# aliases - An Array of additional aliases (implicitly
# includes name.downcase)
# ace_mode - A String name of Ace Mode (if available)
# extension - An Array of associated extensions. If file samples
# are included in 'samples/<Language Name>/', then
# its extension does not need to be listed.
# wrap - Boolean wrap to enable line wrapping (default: false)
# extension - An Array of associated extensions
# primary_extension - A String for the main extension associated with
# the language. Must be unique. Used when a Language
# is picked from a dropdown and we need to
# automatically choose an extension.
# the language. Must be unique. Used when a Language is picked
# from a dropdown and we need to automatically choose an
# extension.
# searchable - Boolean flag to enable searching (defaults to true)
# search_term - Deprecated: Some languages may be indexed under a
# different alias. Avoid defining new exceptions.
@@ -742,6 +741,7 @@ Markdown:
type: markup
lexer: Text only
ace_mode: markdown
wrap: true
primary_extension: .md
extensions:
- .markdown
@@ -1189,6 +1189,7 @@ Textile:
type: markup
lexer: Text only
ace_mode: textile
wrap: true
primary_extension: .textile
extensions:
- .textile
@@ -1333,6 +1334,7 @@ ooc:
reStructuredText:
type: markup
wrap: true
search_term: rst
aliases:
- rst

View File

@@ -1,91 +0,0 @@
require 'mime/types'
require 'yaml'
# Reopen MIME::Type to add an `override` flag. The loader in this file sets
# it to true on every type it registers, and Linguist::Mime's lookup prefers
# types that have the flag set.
class MIME::Type
attr_accessor :override
end
# Register additional mime type extensions
#
# Follows same format as mime-types data file
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
# Read mimes.yml (located next to this file) line by line and register each
# entry with MIME::Types. Lines that do not match the pattern below are
# skipped.
File.read(File.expand_path("../mimes.yml", __FILE__)).lines.each do |line|
# Regexp was cargo culted from mime-types lib
next unless line =~ %r{^
#{MIME::Type::MEDIA_TYPE_RE}
(?:\s@([^\s]+))?
(?:\s:(#{MIME::Type::ENCODING_RE}))?
}x
# NOTE(review): the numbering below assumes MEDIA_TYPE_RE contributes exactly
# two capture groups (media type, subtype) and ENCODING_RE none of its own, so
# that $3 is the optional "@ext,ext" list and $4 the optional ":encoding"
# suffix. Confirm against the installed mime-types version.
mediatype = $1
subtype = $2
extensions = $3
encoding = $4
# Lookup existing mime type
mime_type = MIME::Types["#{mediatype}/#{subtype}"].first ||
# Or create a new instance
MIME::Type.new("#{mediatype}/#{subtype}")
# Append each comma-separated extension to the type's extension list.
if extensions
extensions.split(/,/).each do |extension|
mime_type.extensions << extension
end
end
# An explicit encoding in the data file replaces the type's current one.
if encoding
mime_type.encoding = encoding
end
# Flag the type so lookup_mime_type_for picks it ahead of other guesses.
mime_type.override = true
# Kind of hacky, but we need to reindex the mime type after making changes
MIME::Types.add_type_variant(mime_type)
MIME::Types.index_extensions(mime_type)
end
module Linguist
  module Mime
    # Internal: Resolve an extension to a mime type name.
    #
    # ext - The extension String, with or without the leading ".".
    #
    # Examples
    #
    #   Mime.mime_for('.html')
    #   # => 'text/html'
    #
    #   Mime.mime_for('txt')
    #   # => 'text/plain'
    #
    # Returns the mime type String, or 'text/plain' when nothing is found.
    def self.mime_for(ext)
      found = lookup_mime_type_for(ext)
      return 'text/plain' unless found

      found.to_s
    end

    # Internal: Find the best MIME::Type for an extension or a mime type name.
    #
    # ext_or_mime_type - A file extension such as ".txt" or a mime type such
    #                    as "text/plain". nil is treated as an empty String.
    #
    # Returns a MIME::Type, or nil when there is no candidate at all.
    def self.lookup_mime_type_for(ext_or_mime_type)
      query = ext_or_mime_type || ''

      # Anything shaped like "word/word" is looked up as a mime type name;
      # everything else is looked up as a file extension.
      candidates =
        if query =~ /\w+\/\w+/
          ::MIME::Types[query]
        else
          ::MIME::Types.type_for(query)
        end

      # Preference order: a custom override, then a text (ascii) type, then
      # simply the first candidate.
      candidates.detect(&:override) ||
        candidates.detect(&:ascii?) ||
        candidates.first
    end
  end
end

View File

@@ -1,62 +0,0 @@
# Additional types to add to MIME::Types
#
# MIME types are used to set the Content-Type of raw binary blobs. All text
# blobs are served as text/plain regardless of their type to ensure they
# open in the browser rather than downloading.
#
# The encoding helps determine whether a file should be treated as plain
# text or binary. By default, a mime type's encoding is base64 (binary).
# These types will show a "View Raw" link. To force a type to render as
# plain text, set it to 8bit for UTF-8. text/* types will be treated as
# text by default.
#
# <type> @<extensions> :<encoding>
#
# type - mediatype/subtype
# extensions - comma separated extension list
# encoding - base64 (binary), 7bit (ASCII), 8bit (UTF-8), or
# quoted-printable (Printable ASCII).
#
# Follows same format as mime-types data file
# https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
#
# Any additions or modifications (even trivial) should have corresponding
# test change in `test/test_mime.rb`.
# TODO: Lookup actual types
application/octet-stream @a,blend,gem,graffle,ipa,lib,mcz,nib,o,ogv,otf,pfx,pigx,plgx,psd,sib,spl,sqlite3,swc,ucode,xpi
# Please keep this list alphabetized
application/java-archive @ear,war
application/netcdf :8bit
application/ogg @ogg
application/postscript :base64
application/vnd.adobe.air-application-installer-package+zip @air
application/vnd.mozilla.xul+xml :8bit
application/vnd.oasis.opendocument.presentation @odp
application/vnd.oasis.opendocument.spreadsheet @ods
application/vnd.oasis.opendocument.text @odt
application/vnd.openofficeorg.extension @oxt
application/vnd.openxmlformats-officedocument.presentationml.presentation @pptx
application/x-chrome-extension @crx
application/x-iwork-keynote-sffkey @key
application/x-iwork-numbers-sffnumbers @numbers
application/x-iwork-pages-sffpages @pages
application/x-ms-xbap @xbap :8bit
application/x-parrot-bytecode @pbc
application/x-shockwave-flash @swf
application/x-silverlight-app @xap
application/x-supercollider @sc :8bit
application/x-troff-ms :8bit
application/x-wais-source :8bit
application/xaml+xml @xaml :8bit
application/xslt+xml @xslt :8bit
image/x-icns @icns
text/cache-manifest @manifest
text/plain @cu,cxx
text/x-logtalk @lgt
text/x-nemerle @n
text/x-nimrod @nim
text/x-ocaml @ml,mli,mll,mly,sig,sml
text/x-rust @rs,rc
text/x-scheme @rkt,scm,sls,sps,ss

File diff suppressed because it is too large Load Diff

View File

@@ -76,12 +76,14 @@ module Linguist
db['extnames'][language_name] ||= []
if !db['extnames'][language_name].include?(sample[:extname])
db['extnames'][language_name] << sample[:extname]
db['extnames'][language_name].sort!
end
end
if sample[:filename]
db['filenames'][language_name] ||= []
db['filenames'][language_name] << sample[:filename]
db['filenames'][language_name].sort!
end
data = File.read(sample[:path])

View File

@@ -16,12 +16,18 @@ module Linguist
new.extract_tokens(data)
end
# Read up to 100KB
BYTE_LIMIT = 100_000
# Start state on token, ignore anything till the next newline
SINGLE_LINE_COMMENTS = [
'//', # C
'#', # Ruby
'%', # Tex
]
# Start state on opening token, ignore anything until the closing
# token is reached.
MULTI_LINE_COMMENTS = [
['/*', '*/'], # C
['<!--', '-->'], # XML
@@ -30,7 +36,7 @@ module Linguist
]
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
"^\s*#{Regexp.escape(c)} "
"\s*#{Regexp.escape(c)} "
}.join("|"))
START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
@@ -52,22 +58,24 @@ module Linguist
tokens = []
until s.eos?
break if s.pos >= BYTE_LIMIT
if token = s.scan(/^#!.+$/)
if name = extract_shebang(token)
tokens << "SHEBANG#!#{name}"
end
# Single line comment
elsif token = s.scan(START_SINGLE_LINE_COMMENT)
tokens << token.strip
elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
# tokens << token.strip
s.skip_until(/\n|\Z/)
# Multiline comments
elsif token = s.scan(START_MULTI_LINE_COMMENT)
tokens << token
# tokens << token
close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
s.skip_until(Regexp.compile(Regexp.escape(close_token)))
tokens << close_token
# tokens << close_token
# Skip single or double quoted strings
elsif s.scan(/"/)