From 4ba4257fdce84ca8f7a57b5710ef2934d76752dc Mon Sep 17 00:00:00 2001 From: Joshua Peek Date: Tue, 24 May 2011 21:41:16 -0500 Subject: [PATCH] Document blob helper --- lib/linguist/blob_helper.rb | 245 ++++++++++++++++++++++++++++-------- 1 file changed, 191 insertions(+), 54 deletions(-) diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb index 963d4263..369a46ce 100644 --- a/lib/linguist/blob_helper.rb +++ b/lib/linguist/blob_helper.rb @@ -6,19 +6,50 @@ require 'escape_utils' require 'yaml' module Linguist + # BlobHelper is a mixin for Blobish classes that respond to "name", + # "data" and "size" such as Grit::Blob. module BlobHelper + # Internal: Get a Pathname wrapper for Blob#name + # + # Returns a Pathname. def pathname Pathname.new(name || "") end + # Public: Get the actual blob mime type + # + # Examples + # + # # => 'text/plain' + # # => 'text/html' + # + # Returns a mime type String. def mime_type @mime_type ||= pathname.mime_type end + # Public: Get the Content-Type header value + # + # This value is used when serving raw blobs. + # + # Examples + # + # # => 'text/plain; charset=utf-8' + # # => 'application/octet-stream' + # + # Returns a content type String. def content_type pathname.content_type end + # Public: Get the Content-Disposition header value + # + # This value is used when serving raw blobs. + # + # # => "attachment; filename=file.tar" + # # => "inline" + # + # Returns a content disposition String. def disposition case content_type when 'application/octet-stream', 'application/java-archive' @@ -28,40 +59,102 @@ module Linguist end end - def lines - @lines ||= data ? data.split("\n", -1) : [] - end - - def loc - lines.size - end - - def sloc - lines.grep(/\S/).size - end - - def binary? - content_type.include?('octet') || !(text? || image?) - end - + # Public: Is the blob text? + # + # Return true or false def text? content_type[/(text|json)/] end + # Public: Is the blob a supported image format? 
+ # + # Return true or false def image? ['.png', '.jpg', '.jpeg', '.gif'].include?(pathname.extname) end + # Public: Is the blob binary? + # + # Return true or false + def binary? + content_type.include?('octet') || !(text? || image?) + end + MEGABYTE = 1024 * 1024 + # Public: Is the blob too big to load? + # + # Return true or false def large? size.to_i > MEGABYTE end + # Public: Is the blob viewable? + # + # Non-viewable blobs will just show a "View Raw" link + # + # Return true or false def viewable? !image? && !binary? && !large? end + vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__)) + VendoredRegexp = Regexp.new(vendored_paths.join('|')) + + # Public: Is the blob in a vendored directory? + # + # Vendored files are ignored by language statistics. + # + # See "vendor.yml" for a list of vendored conventions that match + # this pattern. + # + # Return true or false + def vendored? + name =~ VendoredRegexp + end + + ################################################################ + # Below here are methods that may access Blob#data. Consider the + # performance implications of loading it. + ################################################################ + + # Public: Get each line of data + # + # Requires Blob#data + # + # Returns an Array of lines + def lines + @lines ||= data ? data.split("\n", -1) : [] + end + + # Public: Get number of lines of code + # + # Requires Blob#data + # + # Returns Integer + def loc + lines.size + end + + # Public: Get number of source lines of code + # + # Requires Blob#data + # + # Returns Integer + def sloc + lines.grep(/\S/).size + end + + # Public: Is the blob a generated file? + # + # Generated source code is suppressed in diffs and is ignored by + # language statistics. + # + # Includes: + # - Xcode project XML files + # - Minified JavaScript + # + # Return true or false def generated? 
if ['.xib', '.nib', '.pbxproj'].include?(pathname.extname) true @@ -73,26 +166,14 @@ module Linguist end end - vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__)) - VendoredRegexp = Regexp.new(vendored_paths.join('|')) - - def vendored? - name =~ VendoredRegexp - end - - # Determine if the blob contains bad content that can be used for various - # cross site attacks. Right now this is limited to flash files -- the flash - # plugin ignores the response content type and treats any URL as flash - # when the tag is specified correctly regardless of file extension. + # Public: Should the blob be indexed for searching? # - # Returns true when the blob data should not be served with any content-type. - def forbidden? - if data = self.data - data.size >= 8 && # all flash has at least 8 bytes - %w(CWS FWS).include?(data[0,3]) # file type sigs - end - end - + # Excluded: + # - Non-text files + # - Generated source files + # - .po and .sql files + # + # Return true or false def indexable? if !text? false @@ -107,10 +188,41 @@ module Linguist end end + # Public: Determine if the blob contains bad content that can be + # used for various cross site attacks. + # + # Right now this is limited to flash files -- the flash plugin + # ignores the response content type and treats any URL as flash + # when the tag is specified correctly regardless of file + # extension. + # + # Requires Blob#data + # + # Returns true when the blob data should not be served with any + # content-type. + def forbidden? + if data = self.data + data.size >= 8 && # all flash has at least 8 bytes + %w(CWS FWS).include?(data[0,3]) # file type sigs + end + end + + # Public: Detects the Language of the blob. + # + # May load Blob#data + # + # Returns a Language object def language if text? 
- if !Language.find_by_extension(pathname.extname) - shebang_language || pathname.language + # First see if there is a Language for the extension + if Language.find_by_extension(pathname.extname) + pathname.language + + # Try to detect Language from shebang line + elsif language = shebang_language + language + + # Default to Pathname#language else pathname.language end @@ -119,12 +231,32 @@ module Linguist end end + # Deprecated: Get the lexer of the blob. + # + # Returns a Lexer. def lexer language.lexer end + # Internal: Extract the script name from the shebang line + # + # Requires Blob#data + # + # Examples + # + # '#!/usr/bin/ruby' + # # => 'ruby' + # + # '#!/usr/bin/env ruby' + # # => 'ruby' + # + # '#!/usr/bash/python2.4' + # # => 'python' + # + # Returns a script name String or nil def shebang_script - return if !text? || large? + # Fail fast if blob isn't viewable? + return unless viewable? if data && (match = data.match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/ bang.sub!(/^#! /, '#!') @@ -147,35 +279,40 @@ module Linguist end end + shebangs = YAML.load_file(File.expand_path("../shebangs.yml", __FILE__)) + Shebangs = shebangs.inject({}) { |h, (name, scripts)| + scripts.each { |script| h[script] = Language[name] } + h + } + + # Internal: Get Language for shebang script + # + # Matches script name with shebang script name mappings in "shebangs.yml" + # + # Returns the Language or nil def shebang_language if script = shebang_script - case script - when 'bash' - Language['Shell'] - when 'groovy' - Language['Java'] - when 'macruby' - Language['Ruby'] - when 'node' - Language['JavaScript'] - when 'rake' - Language['Ruby'] - when 'sh' - Language['Shell'] - when 'zsh' - Language['Shell'] + if lang = Shebangs[script] + lang else - lang = Language.find_by_lexer(shebang_script) + lang = Language.find_by_lexer(script) lang != Language['Text'] ? lang : nil end end end + # Public: Highlight syntax of blob + # + # Returns html String def colorize return if !text? 
|| large? lexer.colorize(data) end + # Public: Highlight syntax of blob without the outer highlight div + # wrapper. + # + # Returns html String def colorize_without_wrapper return if !text? || large? lexer.colorize_without_wrapper(data)