mirror of
				https://github.com/KevinMidboe/linguist.git
				synced 2025-10-29 17:50:22 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			545 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
			
		
		
	
	
			545 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			Ruby
		
	
	
	
	
	
| require 'linguist/classifier'
 | |
| require 'linguist/language'
 | |
| require 'linguist/mime'
 | |
| require 'linguist/pathname'
 | |
| require 'linguist/samples'
 | |
| 
 | |
| require 'charlock_holmes'
 | |
| require 'escape_utils'
 | |
| require 'pygments'
 | |
| require 'yaml'
 | |
| 
 | |
| module Linguist
 | |
|   # BlobHelper is a mixin for Blobish classes that respond to "name",
 | |
|   # "data" and "size" such as Grit::Blob.
 | |
|   module BlobHelper
 | |
|     # Internal: Get a Pathname wrapper for Blob#name
 | |
|     #
 | |
|     # Returns a Pathname.
 | |
|     def pathname
 | |
|       Pathname.new(name || "")
 | |
|     end
 | |
| 
 | |
|     # Public: Get the extname of the path
 | |
|     #
 | |
|     # Examples
 | |
|     #
 | |
|     #   blob(name='foo.rb').extname
 | |
|     #   # => '.rb'
 | |
|     #
 | |
|     # Returns a String
 | |
|     def extname
 | |
|       pathname.extname
 | |
|     end
 | |
| 
 | |
|     # Public: Get the actual blob mime type
 | |
|     #
 | |
|     # Examples
 | |
|     #
 | |
|     #   # => 'text/plain'
 | |
|     #   # => 'text/html'
 | |
|     #
 | |
|     # Returns a mime type String.
 | |
|     def mime_type
 | |
|       @mime_type ||= pathname.mime_type
 | |
|     end
 | |
| 
 | |
|     # Public: Get the Content-Type header value
 | |
|     #
 | |
|     # This value is used when serving raw blobs.
 | |
|     #
 | |
|     # Examples
 | |
|     #
 | |
|     #   # => 'text/plain; charset=utf-8'
 | |
|     #   # => 'application/octet-stream'
 | |
|     #
 | |
|     # Returns a content type String.
 | |
|     def content_type
 | |
|       @content_type ||= (binary_mime_type? || binary?) ? mime_type :
 | |
|         (encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")
 | |
|     end
 | |
| 
 | |
|     # Public: Get the Content-Disposition header value
 | |
|     #
 | |
|     # This value is used when serving raw blobs.
 | |
|     #
 | |
|     #   # => "attachment; filename=file.tar"
 | |
|     #   # => "inline"
 | |
|     #
 | |
|     # Returns a content disposition String.
 | |
|     def disposition
 | |
|       if text? || image?
 | |
|         'inline'
 | |
|       elsif name.nil?
 | |
|         "attachment"
 | |
|       else
 | |
|         "attachment; filename=#{EscapeUtils.escape_url(pathname.basename)}"
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     def encoding
 | |
|       if hash = detect_encoding
 | |
|         hash[:encoding]
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Try to guess the encoding
 | |
|     #
 | |
|     # Returns: a Hash, with :encoding, :confidence, :type
 | |
|     #          this will return nil if an error occurred during detection or
 | |
|     #          no valid encoding could be found
 | |
|     def detect_encoding
 | |
|       @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob binary according to its mime type
 | |
|     #
 | |
|     # Return true or false
 | |
|     def binary_mime_type?
 | |
|       if mime_type = Mime.lookup_mime_type_for(pathname.extname)
 | |
|         mime_type.binary?
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob binary?
 | |
|     #
 | |
|     # Return true or false
 | |
|     def binary?
 | |
|       # Large blobs aren't even loaded into memory
 | |
|       if data.nil?
 | |
|         true
 | |
| 
 | |
|       # Treat blank files as text
 | |
|       elsif data == ""
 | |
|         false
 | |
| 
 | |
|       # Charlock doesn't know what to think
 | |
|       elsif encoding.nil?
 | |
|         true
 | |
| 
 | |
|       # If Charlock says its binary
 | |
|       else
 | |
|         detect_encoding[:type] == :binary
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob text?
 | |
|     #
 | |
|     # Return true or false
 | |
|     def text?
 | |
|       !binary?
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob a supported image format?
 | |
|     #
 | |
|     # Return true or false
 | |
|     def image?
 | |
|       ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob likely to have a shebang?
 | |
|     #
 | |
|     # Return true or false
 | |
|     def shebang_extname?
 | |
|       extname.empty? &&
 | |
|         mode &&
 | |
|         (mode.to_i(8) & 05) == 05
 | |
|     end
 | |
| 
 | |
|     MEGABYTE = 1024 * 1024
 | |
| 
 | |
|     # Public: Is the blob too big to load?
 | |
|     #
 | |
|     # Return true or false
 | |
|     def large?
 | |
|       size.to_i > MEGABYTE
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob safe to colorize?
 | |
|     #
 | |
|     # We use Pygments.rb for syntax highlighting blobs, which
 | |
|     # has some quirks and also is essentially 'un-killable' via
 | |
|     # normal timeout.  To workaround this we try to
 | |
|     # carefully handling Pygments.rb anything it can't handle.
 | |
|     #
 | |
|     # Return true or false
 | |
|     def safe_to_colorize?
 | |
|       text? && !large? && !high_ratio_of_long_lines?
 | |
|     end
 | |
| 
 | |
|     # Internal: Does the blob have a ratio of long lines?
 | |
|     #
 | |
|     # These types of files are usually going to make Pygments.rb
 | |
|     # angry if we try to colorize them.
 | |
|     #
 | |
|     # Return true or false
 | |
|     def high_ratio_of_long_lines?
 | |
|       return false if loc == 0
 | |
|       size / loc > 5000
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob viewable?
 | |
|     #
 | |
|     # Non-viewable blobs will just show a "View Raw" link
 | |
|     #
 | |
|     # Return true or false
 | |
|     def viewable?
 | |
|       !large? && text?
 | |
|     end
 | |
| 
 | |
|     vendored_paths = YAML.load_file(File.expand_path("../vendor.yml", __FILE__))
 | |
|     VendoredRegexp = Regexp.new(vendored_paths.join('|'))
 | |
| 
 | |
|     # Public: Is the blob in a vendored directory?
 | |
|     #
 | |
|     # Vendored files are ignored by language statistics.
 | |
|     #
 | |
|     # See "vendor.yml" for a list of vendored conventions that match
 | |
|     # this pattern.
 | |
|     #
 | |
|     # Return true or false
 | |
|     def vendored?
 | |
|       name =~ VendoredRegexp ? true : false
 | |
|     end
 | |
| 
 | |
|     # Public: Get each line of data
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # Returns an Array of lines
 | |
|     def lines
 | |
|       @lines ||= (viewable? && data) ? data.split("\n", -1) : []
 | |
|     end
 | |
| 
 | |
|     # Public: Get number of lines of code
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # Returns Integer
 | |
|     def loc
 | |
|       lines.size
 | |
|     end
 | |
| 
 | |
|     # Public: Get number of source lines of code
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # Returns Integer
 | |
|     def sloc
 | |
|       lines.grep(/\S/).size
 | |
|     end
 | |
| 
 | |
|     # Internal: Compute average line length.
 | |
|     #
 | |
|     # Returns Integer.
 | |
|     def average_line_length
 | |
|       if lines.any?
 | |
|         lines.inject(0) { |n, l| n += l.length } / lines.length
 | |
|       else
 | |
|         0
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Public: Is the blob a generated file?
 | |
|     #
 | |
|     # Generated source code is supressed in diffs and is ignored by
 | |
|     # language statistics.
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # Includes:
 | |
|     # - XCode project XML files
 | |
|     # - Minified JavaScript
 | |
|     # - Compiled CoffeeScript
 | |
|     # - PEG.js-generated parsers
 | |
|     #
 | |
|     # Please add additional test coverage to
 | |
|     # `test/test_blob.rb#test_generated` if you make any changes.
 | |
|     #
 | |
|     # Return true or false
 | |
|     def generated?
 | |
|       if name == 'Gemfile.lock' || minified_javascript? || compiled_coffeescript? ||
 | |
|       xcode_project_file? || generated_net_docfile? || generated_parser?
 | |
|         true
 | |
|       else
 | |
|         false
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Internal: Is the blob an XCode project file?
 | |
|     #
 | |
|     # Generated if the file extension is an XCode project
 | |
|     # file extension.
 | |
|     #
 | |
|     # Returns true of false.
 | |
|     def xcode_project_file?
 | |
|       ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
 | |
|     end
 | |
| 
 | |
|     # Internal: Is the blob minified JS?
 | |
|     #
 | |
|     # Consider JS minified if the average line length is
 | |
|     # greater then 100c.
 | |
|     #
 | |
|     # Returns true or false.
 | |
|     def minified_javascript?
 | |
|       return unless extname == '.js'
 | |
|       average_line_length > 100
 | |
|     end
 | |
| 
 | |
|     # Internal: Is the blob of JS a parser generated by PEG.js?
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # PEG.js-generated parsers are not meant to be consumed by humans.
 | |
|     #
 | |
|     # Return true or false
 | |
|     def generated_parser?
 | |
|       return false unless extname == '.js'
 | |
| 
 | |
|       # PEG.js-generated parsers include a comment near the top  of the file
 | |
|       # that marks them as such.
 | |
|       if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
 | |
|         return true
 | |
|       end
 | |
| 
 | |
|       false
 | |
|     end
 | |
| 
 | |
|     # Internal: Is the blob of JS generated by CoffeeScript?
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # CoffeScript is meant to output JS that would be difficult to
 | |
|     # tell if it was generated or not. Look for a number of patterns
 | |
|     # output by the CS compiler.
 | |
|     #
 | |
|     # Return true or false
 | |
|     def compiled_coffeescript?
 | |
|       return false unless extname == '.js'
 | |
| 
 | |
|       # CoffeeScript generated by > 1.2 include a comment on the first line
 | |
|       if lines[0] =~ /^\/\/ Generated by /
 | |
|         return true
 | |
|       end
 | |
| 
 | |
|       if lines[0] == '(function() {' &&     # First line is module closure opening
 | |
|           lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
 | |
|           lines[-1] == ''                   # Last line is blank
 | |
| 
 | |
|         score = 0
 | |
| 
 | |
|         lines.each do |line|
 | |
|           if line =~ /var /
 | |
|             # Underscored temp vars are likely to be Coffee
 | |
|             score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
 | |
| 
 | |
|             # bind and extend functions are very Coffee specific
 | |
|             score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
 | |
|           end
 | |
|         end
 | |
| 
 | |
|         # Require a score of 3. This is fairly arbitrary. Consider
 | |
|         # tweaking later.
 | |
|         score >= 3
 | |
|       else
 | |
|         false
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Internal: Is this a generated documentation file for a .NET assembly?
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # .NET developers often check in the XML Intellisense file along with an
 | |
|     # assembly - however, these don't have a special extension, so we have to
 | |
|     # dig into the contents to determine if it's a docfile. Luckily, these files
 | |
|     # are extremely structured, so recognizing them is easy.
 | |
|     #
 | |
|     # Returns true or false
 | |
|     def generated_net_docfile?
 | |
|       return false unless extname.downcase == ".xml"
 | |
|       return false unless lines.count > 3
 | |
| 
 | |
|       # .NET Docfiles always open with <doc> and their first tag is an
 | |
|       # <assembly> tag
 | |
|       return lines[1].include?("<doc>") &&
 | |
|         lines[2].include?("<assembly>") &&
 | |
|         lines[-2].include?("</doc>")
 | |
|     end
 | |
| 
 | |
|     # Public: Should the blob be indexed for searching?
 | |
|     #
 | |
|     # Excluded:
 | |
|     # - Files over 0.1MB
 | |
|     # - Non-text files
 | |
|     # - Langauges marked as not searchable
 | |
|     # - Generated source files
 | |
|     #
 | |
|     # Please add additional test coverage to
 | |
|     # `test/test_blob.rb#test_indexable` if you make any changes.
 | |
|     #
 | |
|     # Return true or false
 | |
|     def indexable?
 | |
|       if binary?
 | |
|         false
 | |
|       elsif language.nil?
 | |
|         false
 | |
|       elsif !language.searchable?
 | |
|         false
 | |
|       elsif generated?
 | |
|         false
 | |
|       elsif size > 100 * 1024
 | |
|         false
 | |
|       else
 | |
|         true
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Public: Detects the Language of the blob.
 | |
|     #
 | |
|     # May load Blob#data
 | |
|     #
 | |
|     # Returns a Language or nil if none is detected
 | |
|     def language
 | |
|       if defined? @language
 | |
|         @language
 | |
|       else
 | |
|         @language = guess_language
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Internal: Guess language
 | |
|     #
 | |
|     # Please add additional test coverage to
 | |
|     # `test/test_blob.rb#test_language` if you make any changes.
 | |
|     #
 | |
|     # Returns a Language or nil
 | |
|     def guess_language
 | |
|       return if binary_mime_type?
 | |
| 
 | |
|       # Disambiguate between multiple language extensions
 | |
|       disambiguate_extension_language ||
 | |
| 
 | |
|         # See if there is a Language for the extension
 | |
|         pathname.language ||
 | |
| 
 | |
|         # Try to detect Language from shebang line
 | |
|         shebang_language
 | |
|     end
 | |
| 
 | |
|     # Internal: Get the lexer of the blob.
 | |
|     #
 | |
|     # Returns a Lexer.
 | |
|     def lexer
 | |
|       language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
 | |
|     end
 | |
| 
 | |
|     # Internal: Disambiguates between multiple language extensions.
 | |
|     #
 | |
|     # Returns a Language or nil.
 | |
|     def disambiguate_extension_language
 | |
|       if Language.ambiguous?(extname)
 | |
|         possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
 | |
|         if possible_languages.any?
 | |
|           if result = Classifier.new(Samples::DATA).classify(data, possible_languages).first
 | |
|             Language[result[0]]
 | |
|           end
 | |
|         end
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Internal: Extract the script name from the shebang line
 | |
|     #
 | |
|     # Requires Blob#data
 | |
|     #
 | |
|     # Examples
 | |
|     #
 | |
|     #   '#!/usr/bin/ruby'
 | |
|     #   # => 'ruby'
 | |
|     #
 | |
|     #   '#!/usr/bin/env ruby'
 | |
|     #   # => 'ruby'
 | |
|     #
 | |
|     #   '#!/usr/bash/python2.4'
 | |
|     #   # => 'python'
 | |
|     #
 | |
|     # Please add additional test coverage to
 | |
|     # `test/test_blob.rb#test_shebang_script` if you make any changes.
 | |
|     #
 | |
|     # Returns a script name String or nil
 | |
|     def shebang_script
 | |
|       # Fail fast if blob isn't viewable?
 | |
|       return unless viewable?
 | |
| 
 | |
|       if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
 | |
|         bang.sub!(/^#! /, '#!')
 | |
|         tokens = bang.split(' ')
 | |
|         pieces = tokens.first.split('/')
 | |
|         if pieces.size > 1
 | |
|           script = pieces.last
 | |
|         else
 | |
|           script = pieces.first.sub('#!', '')
 | |
|         end
 | |
| 
 | |
|         script = script == 'env' ? tokens[1] : script
 | |
| 
 | |
|         # python2.4 => python
 | |
|         if script =~ /((?:\d+\.?)+)/
 | |
|           script.sub! $1, ''
 | |
|         end
 | |
| 
 | |
|         # Check for multiline shebang hacks that exec themselves
 | |
|         #
 | |
|         #   #!/bin/sh
 | |
|         #   exec foo "$0" "$@"
 | |
|         #
 | |
|         if script == 'sh' &&
 | |
|             lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
 | |
|           script = $1
 | |
|         end
 | |
| 
 | |
|         script
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Internal: Get Language for shebang script
 | |
|     #
 | |
|     # Returns the Language or nil
 | |
|     def shebang_language
 | |
|       # Skip file extensions unlikely to have shebangs
 | |
|       return unless shebang_extname?
 | |
| 
 | |
|       if script = shebang_script
 | |
|         Language[script]
 | |
|       end
 | |
|     end
 | |
| 
 | |
|     # Public: Highlight syntax of blob
 | |
|     #
 | |
|     # options - A Hash of options (defaults to {})
 | |
|     #
 | |
|     # Returns html String
 | |
|     def colorize(options = {})
 | |
|       return unless safe_to_colorize?
 | |
|       options[:options] ||= {}
 | |
|       options[:options][:encoding] ||= encoding
 | |
|       lexer.highlight(data, options)
 | |
|     end
 | |
| 
 | |
|     # Public: Highlight syntax of blob without the outer highlight div
 | |
|     # wrapper.
 | |
|     #
 | |
|     # options - A Hash of options (defaults to {})
 | |
|     #
 | |
|     # Returns html String
 | |
|     def colorize_without_wrapper(options = {})
 | |
|       if text = colorize(options)
 | |
|         text[%r{<div class="highlight"><pre>(.*?)</pre>\s*</div>}m, 1]
 | |
|       else
 | |
|         ''
 | |
|       end
 | |
|     end
 | |
|   end
 | |
| end
 |