Merge remote branch 'upstream/master' into lasso

Conflicts: lib/linguist/languages.yml
2026-05-02 19:48:32 +00:00 · 2012-12-05 12:55:30 -08:00
parent 287e1b855d b036e8d3c2
commit 5a9ef5eac2
39 changed files with 36699 additions and 27166 deletions
--- a/lib/linguist.rb
+++ b/lib/linguist.rb
@@ -1,6 +1,5 @@
 require 'linguist/blob_helper'
 require 'linguist/generated'
 require 'linguist/language'
-require 'linguist/mime'
 require 'linguist/repository'
 require 'linguist/samples'
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -1,9 +1,9 @@
 require 'linguist/generated'
 require 'linguist/language'
-require 'linguist/mime'

 require 'charlock_holmes'
 require 'escape_utils'
+require 'mime/types'
 require 'pygments'
 require 'yaml'

@@ -23,6 +23,22 @@ module Linguist
      File.extname(name.to_s)
    end

+    # Internal: Lookup mime type for extension.
+    #
+    # Returns a MIME::Type
+    def _mime_type
+      if defined? @_mime_type
+        @_mime_type
+      else
+        guesses = ::MIME::Types.type_for(extname.to_s)
+
+        # Prefer text mime types over binary
+        @_mime_type = guesses.detect { |type| type.ascii? } ||
+          # Otherwise use the first guess
+          guesses.first
+      end
+    end
+
    # Public: Get the actual blob mime type
    #
    # Examples
@@ -32,7 +48,14 @@ module Linguist
    #
    # Returns a mime type String.
    def mime_type
-      @mime_type ||= Mime.mime_for(extname.to_s)
+      _mime_type ? _mime_type.to_s : 'text/plain'
+    end
+
+    # Internal: Is the blob binary according to its mime type
+    #
+    # Return true or false
+    def binary_mime_type?
+      _mime_type ? _mime_type.binary? : false
    end

    # Public: Get the Content-Type header value
@@ -83,15 +106,6 @@ module Linguist
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
    end

-    # Public: Is the blob binary according to its mime type
-    #
-    # Return true or false
-    def binary_mime_type?
-      if mime_type = Mime.lookup_mime_type_for(extname)
-        mime_type.binary?
-      end
-    end
-
    # Public: Is the blob binary?
    #
    # Return true or false
@@ -146,7 +160,7 @@ module Linguist
    #
    # Return true or false
    def safe_to_colorize?
-      text? && !large? && !high_ratio_of_long_lines?
+      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
@@ -190,7 +204,31 @@ module Linguist
    #
    # Returns an Array of lines
    def lines
-      @lines ||= (viewable? && data) ? data.split("\n", -1) : []
+      @lines ||=
+        if viewable? && data
+          data.split(line_split_character, -1)
+        else
+          []
+        end
+    end
+
+    # Character used to split lines. This is almost always "\n" except when Mac
+    # Format is detected in which case it's "\r".
+    #
+    # Returns a split pattern string.
+    def line_split_character
+      @line_split_character ||= (mac_format?? "\r" : "\n")
+    end
+
+    # Public: Is the data in ** Mac Format **? This format uses \r (0x0d) characters
+    # for line endings and does not include a \n (0x0a).
+    #
+    # Returns true when mac format is detected.
+    def mac_format?
+      return if !viewable?
+      if pos = data[0, 4096].index("\r")
+        data[pos + 1] != ?\n
+      end
    end

    # Public: Get number of lines of code
@@ -236,7 +274,9 @@ module Linguist
    #
    # Return true or false
    def indexable?
-      if binary?
+      if size > 100 * 1024
+        false
+      elsif binary?
        false
      elsif extname == '.txt'
        true
@@ -246,8 +286,6 @@ module Linguist
        false
      elsif generated?
        false
-      elsif size > 100 * 1024
-        false
      else
        true
      end
@@ -259,11 +297,15 @@ module Linguist
    #
    # Returns a Language or nil if none is detected
    def language
-      if defined? @language
-        @language
-      elsif !binary_mime_type?
-        @language = Language.detect(name.to_s, lambda { data }, mode)
+      return @language if defined? @language
+
+      if defined?(@data) && @data.is_a?(String)
+        data = @data
+      else
+        data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
      end
+
+      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -84,7 +84,9 @@ module Linguist

      if possible_languages.length > 1
        data = data.call() if data.respond_to?(:call)
-        if result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
+        if data.nil? || data == ""
+          nil
+        elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
          Language[result[0]]
        end
      else
@@ -220,6 +222,7 @@ module Linguist
        raise(ArgumentError, "#{@name} is missing lexer")

      @ace_mode = attributes[:ace_mode]
+      @wrap = attributes[:wrap] || false

      # Set legacy search term
      @search_term = attributes[:search_term] || default_alias_name
@@ -310,6 +313,11 @@ module Linguist
    # Returns a String name or nil
    attr_reader :ace_mode

+    # Public: Should language lines be wrapped
+    #
+    # Returns true or false
+    attr_reader :wrap
+
    # Public: Get extensions
    #
    # Examples
@@ -460,6 +468,7 @@ module Linguist
      :aliases           => options['aliases'],
      :lexer             => options['lexer'],
      :ace_mode          => options['ace_mode'],
+      :wrap              => options['wrap'],
      :group_name        => options['group'],
      :searchable        => options.key?('searchable') ? options['searchable'] : true,
      :search_term       => options['search_term'],
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -2,21 +2,20 @@
 #
 # All languages have an associated lexer for syntax highlighting. It
 # defaults to name.downcase, which covers most cases. Make sure the
-# lexer exists in lexers.yml. This is a list of available lexers in
-# our version of pygments.
+# lexer exists in lexers.yml. This is a list of the lexers available
+# in our version of pygments.
 #
 # type              - Either data, programming, markup, or nil
 # lexer             - An explicit lexer String (defaults to name.downcase)
 # aliases           - An Array of additional aliases (implicitly
 #                     includes name.downcase)
 # ace_mode          - A String name of Ace Mode (if available)
-# extension         - An Array of associated extensions. If file samples
-#                     are included in 'samples/<Language Name>/', then
-#                     its extension does not need to be listed.
+# wrap              - Boolean flag to enable line wrapping (defaults to false)
+# extension         - An Array of associated extensions
 # primary_extension - A String for the main extension associated with
-#                     the language. Must be unique. Used when a Language
-#                     is picked from a dropdown and we need to
-#                     automatically choose an extension.
+#                     the language. Must be unique. Used when a Language is picked
+#                     from a dropdown and we need to automatically choose an
+#                     extension.
 # searchable        - Boolean flag to enable searching (defaults to true)
 # search_term       - Deprecated: Some languages maybe indexed under a
 #                     different alias. Avoid defining new exceptions.
@@ -742,6 +741,7 @@ Markdown:
  type: markup
  lexer: Text only
  ace_mode: markdown
+  wrap: true
  primary_extension: .md
  extensions:
  - .markdown
@@ -1189,6 +1189,7 @@ Textile:
  type: markup
  lexer: Text only
  ace_mode: textile
+  wrap: true
  primary_extension: .textile
  extensions:
  - .textile
@@ -1333,6 +1334,7 @@ ooc:

 reStructuredText:
  type: markup
+  wrap: true
  search_term: rst
  aliases:
  - rst
--- a/lib/linguist/mime.rb
+++ b/lib/linguist/mime.rb
@@ -1,91 +0,0 @@
-require 'mime/types'
-require 'yaml'
-
-class MIME::Type
-  attr_accessor :override
-end
-
-# Register additional mime type extensions
-#
-# Follows same format as mime-types data file
-#   https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
-File.read(File.expand_path("../mimes.yml", __FILE__)).lines.each do |line|
-  # Regexp was cargo culted from mime-types lib
-  next unless line =~ %r{^
-    #{MIME::Type::MEDIA_TYPE_RE}
-    (?:\s@([^\s]+))?
-    (?:\s:(#{MIME::Type::ENCODING_RE}))?
-  }x
-
-  mediatype  = $1
-  subtype    = $2
-  extensions = $3
-  encoding   = $4
-
-  # Lookup existing mime type
-  mime_type = MIME::Types["#{mediatype}/#{subtype}"].first ||
-    # Or create a new instance
-    MIME::Type.new("#{mediatype}/#{subtype}")
-
-  if extensions
-    extensions.split(/,/).each do |extension|
-      mime_type.extensions << extension
-    end
-  end
-
-  if encoding
-    mime_type.encoding = encoding
-  end
-
-  mime_type.override = true
-
-  # Kind of hacky, but we need to reindex the mime type after making changes
-  MIME::Types.add_type_variant(mime_type)
-  MIME::Types.index_extensions(mime_type)
-end
-
-module Linguist
-  module Mime
-    # Internal: Look up mime type for extension.
-    #
-    # ext - The extension String. May include leading "."
-    #
-    # Examples
-    #
-    #   Mime.mime_for('.html')
-    #   # => 'text/html'
-    #
-    #   Mime.mime_for('txt')
-    #   # => 'text/plain'
-    #
-    # Return mime type String otherwise falls back to 'text/plain'.
-    def self.mime_for(ext)
-      mime_type = lookup_mime_type_for(ext)
-      mime_type ? mime_type.to_s : 'text/plain'
-    end
-
-    # Internal: Lookup mime type for extension or mime type
-    #
-    # ext_or_mime_type - A file extension ".txt" or mime type "text/plain".
-    #
-    # Returns a MIME::Type
-    def self.lookup_mime_type_for(ext_or_mime_type)
-      ext_or_mime_type ||= ''
-
-      if ext_or_mime_type =~ /\w+\/\w+/
-        guesses = ::MIME::Types[ext_or_mime_type]
-      else
-        guesses = ::MIME::Types.type_for(ext_or_mime_type)
-      end
-
-      # Use custom override first
-      guesses.detect { |type| type.override } ||
-
-        # Prefer text mime types over binary
-        guesses.detect { |type| type.ascii? } ||
-
-        # Otherwise use the first guess
-        guesses.first
-    end
-  end
-end
--- a/lib/linguist/mimes.yml
+++ b/lib/linguist/mimes.yml
@@ -1,62 +0,0 @@
-# Additional types to add to MIME::Types
-#
-# MIME types are used to set the Content-Type of raw binary blobs. All text
-# blobs are served as text/plain regardless of their type to ensure they
-# open in the browser rather than downloading.
-#
-# The encoding helps determine whether a file should be treated as plain
-# text or binary. By default, a mime type's encoding is base64 (binary).
-# These types will show a "View Raw" link. To force a type to render as
-# plain text, set it to 8bit for UTF-8. text/* types will be treated as
-# text by default.
-#
-#   <type> @<extensions> :<encoding>
-#
-# type       - mediatype/subtype
-# extensions - comma seperated extension list
-# encoding   - base64 (binary), 7bit (ASCII), 8bit (UTF-8), or
-#              quoted-printable (Printable ASCII).
-#
-# Follows same format as mime-types data file
-#   https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
-#
-# Any additions or modifications (even trivial) should have corresponding
-# test change in `test/test_mime.rb`.
-
-# TODO: Lookup actual types
-application/octet-stream @a,blend,gem,graffle,ipa,lib,mcz,nib,o,ogv,otf,pfx,pigx,plgx,psd,sib,spl,sqlite3,swc,ucode,xpi
-
-# Please keep this list alphabetized
-application/java-archive @ear,war
-application/netcdf :8bit
-application/ogg @ogg
-application/postscript :base64
-application/vnd.adobe.air-application-installer-package+zip @air
-application/vnd.mozilla.xul+xml :8bit
-application/vnd.oasis.opendocument.presentation @odp
-application/vnd.oasis.opendocument.spreadsheet @ods
-application/vnd.oasis.opendocument.text @odt
-application/vnd.openofficeorg.extension @oxt
-application/vnd.openxmlformats-officedocument.presentationml.presentation @pptx
-application/x-chrome-extension @crx
-application/x-iwork-keynote-sffkey @key
-application/x-iwork-numbers-sffnumbers @numbers
-application/x-iwork-pages-sffpages @pages
-application/x-ms-xbap @xbap :8bit
-application/x-parrot-bytecode @pbc
-application/x-shockwave-flash @swf
-application/x-silverlight-app @xap
-application/x-supercollider @sc :8bit
-application/x-troff-ms :8bit
-application/x-wais-source :8bit
-application/xaml+xml @xaml :8bit
-application/xslt+xml @xslt :8bit
-image/x-icns @icns
-text/cache-manifest @manifest
-text/plain @cu,cxx
-text/x-logtalk @lgt
-text/x-nemerle @n
-text/x-nimrod @nim
-text/x-ocaml @ml,mli,mll,mly,sig,sml
-text/x-rust @rs,rc
-text/x-scheme @rkt,scm,sls,sps,ss
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -76,12 +76,14 @@ module Linguist
          db['extnames'][language_name] ||= []
          if !db['extnames'][language_name].include?(sample[:extname])
            db['extnames'][language_name] << sample[:extname]
+            db['extnames'][language_name].sort!
          end
        end

        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
+          db['filenames'][language_name].sort!
        end

        data = File.read(sample[:path])
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -16,12 +16,18 @@ module Linguist
      new.extract_tokens(data)
    end

+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ]

+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
@@ -30,7 +36,7 @@ module Linguist
    ]

    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
-      "^\s*#{Regexp.escape(c)} "
+      "\s*#{Regexp.escape(c)} "
    }.join("|"))

    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
@@ -52,22 +58,24 @@ module Linguist

      tokens = []
      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
        if token = s.scan(/^#!.+$/)
          if name = extract_shebang(token)
            tokens << "SHEBANG#!#{name}"
          end

        # Single line comment
-        elsif token = s.scan(START_SINGLE_LINE_COMMENT)
-          tokens << token.strip
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
          s.skip_until(/\n|\Z/)

        # Multiline comments
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
-          tokens << token
+          # tokens << token
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
-          tokens << close_token
+          # tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)