Merged with upstream. Updated M (aka MUMPS) detection to use the new bayesian / samples method.

2026-01-03 16:05:33 +00:00 · 2013-03-14 11:33:09 -04:00
parent 46cde87c09 a20631af04
commit 58420f62d9
472 changed files with 179182 additions and 1762 deletions
--- a/lib/linguist.rb
+++ b/lib/linguist.rb
@@ -1,5 +1,5 @@
 require 'linguist/blob_helper'
+require 'linguist/generated'
 require 'linguist/language'
-require 'linguist/mime'
-require 'linguist/pathname'
 require 'linguist/repository'
+require 'linguist/samples'
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -1,9 +1,9 @@
+require 'linguist/generated'
 require 'linguist/language'
-require 'linguist/mime'
-require 'linguist/pathname'

 require 'charlock_holmes'
 require 'escape_utils'
+require 'mime/types'
 require 'pygments'
 require 'yaml'

@@ -11,13 +11,6 @@ module Linguist
  # BlobHelper is a mixin for Blobish classes that respond to "name",
  # "data" and "size" such as Grit::Blob.
  module BlobHelper
-    # Internal: Get a Pathname wrapper for Blob#name
-    #
-    # Returns a Pathname.
-    def pathname
-      Pathname.new(name || "")
-    end
-
    # Public: Get the extname of the path
    #
    # Examples
@@ -27,7 +20,23 @@ module Linguist
    #
    # Returns a String
    def extname
-      pathname.extname
+      File.extname(name.to_s)
+    end
+
+    # Internal: Lookup mime type for extension.
+    #
+    # Returns a MIME::Type
+    def _mime_type
+      if defined? @_mime_type
+        @_mime_type
+      else
+        guesses = ::MIME::Types.type_for(extname.to_s)
+
+        # Prefer text mime types over binary
+        @_mime_type = guesses.detect { |type| type.ascii? } ||
+          # Otherwise use the first guess
+          guesses.first
+      end
    end

    # Public: Get the actual blob mime type
@@ -39,7 +48,23 @@ module Linguist
    #
    # Returns a mime type String.
    def mime_type
-      @mime_type ||= pathname.mime_type
+      _mime_type ? _mime_type.to_s : 'text/plain'
+    end
+
+    # Internal: Is the blob binary according to its mime type
+    #
+    # Return true or false
+    def binary_mime_type?
+      _mime_type ? _mime_type.binary? : false
+    end
+
+    # Internal: Is the blob binary according to its mime type, unless
+    # the languages.yml database knows a language for this filename?
+    # Language.find_by_filename returns an Array; empty means no match.
+    #
+    # Return true or false
+    def likely_binary?
+      binary_mime_type? && Language.find_by_filename(name).empty?
    end

    # Public: Get the Content-Type header value
@@ -71,7 +96,7 @@ module Linguist
      elsif name.nil?
        "attachment"
      else
-        "attachment; filename=#{EscapeUtils.escape_url(pathname.basename)}"
+        "attachment; filename=#{EscapeUtils.escape_url(File.basename(name))}"
      end
    end

@@ -90,15 +115,6 @@ module Linguist
      @detect_encoding ||= CharlockHolmes::EncodingDetector.new.detect(data) if data
    end

-    # Public: Is the blob binary according to its mime type
-    #
-    # Return true or false
-    def binary_mime_type?
-      if mime_type = Mime.lookup_mime_type_for(pathname.extname)
-        mime_type.binary?
-      end
-    end
-
    # Public: Is the blob binary?
    #
    # Return true or false
@@ -132,23 +148,14 @@ module Linguist
    #
    # Return true or false
    def image?
-      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
+      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname.downcase)
    end

-    # Public: Is the blob a possible drupal php file?
+    # Public: Is the blob a supported 3D model format?
    #
    # Return true or false
-    def drupal_extname?
-      ['.module', '.install', '.test', '.inc'].include?(extname)
-    end
-
-    # Public: Is the blob likely to have a shebang?
-    #
-    # Return true or false
-    def shebang_extname?
-      extname.empty? &&
-        mode &&
-        (mode.to_i(8) & 05) == 05
+    def solid?
+      extname.downcase == '.stl'
    end

    MEGABYTE = 1024 * 1024
@@ -169,7 +176,7 @@ module Linguist
    #
    # Return true or false
    def safe_to_colorize?
-      text? && !large? && !high_ratio_of_long_lines?
+      !large? && text? && !high_ratio_of_long_lines?
    end

    # Internal: Does the blob have a ratio of long lines?
@@ -213,7 +220,31 @@ module Linguist
    #
    # Returns an Array of lines
    def lines
-      @lines ||= (viewable? && data) ? data.split("\n", -1) : []
+      @lines ||=
+        if viewable? && data
+          data.split(line_split_character, -1)
+        else
+          []
+        end
+    end
+
+    # Character used to split lines. This is almost always "\n" except when Mac
+    # Format is detected in which case it's "\r".
+    #
+    # Returns a split pattern string.
+    def line_split_character
+      @line_split_character ||= (mac_format?? "\r" : "\n")
+    end
+
+    # Public: Is the data in ** Mac Format **. This format uses \r (0x0d) characters
+    # for line ends and does not include a \n (0x0a).
+    #
+    # Returns true when mac format is detected.
+    def mac_format?
+      return if !viewable?
+      if pos = data[0, 4096].index("\r")
+        data[pos + 1] != ?\n
+      end
    end

    # Public: Get number of lines of code
@@ -234,125 +265,16 @@ module Linguist
      lines.grep(/\S/).size
    end

-    # Internal: Compute average line length.
-    #
-    # Returns Integer.
-    def average_line_length
-      if lines.any?
-        lines.inject(0) { |n, l| n += l.length } / lines.length
-      else
-        0
-      end
-    end
-
    # Public: Is the blob a generated file?
    #
-    # Generated source code is supressed in diffs and is ignored by
+    # Generated source code is suppressed in diffs and is ignored by
    # language statistics.
    #
-    # Requires Blob#data
-    #
-    # Includes:
-    # - XCode project XML files
-    # - Minified JavaScript
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_generated` if you make any changes.
+    # May load Blob#data
    #
    # Return true or false
    def generated?
-      if xcode_project_file? || generated_net_docfile?
-        true
-      elsif generated_coffeescript? || minified_javascript?
-        true
-      elsif name == 'Gemfile.lock'
-        true
-      else
-        false
-      end
-    end
-
-    # Internal: Is the blob an XCode project file?
-    #
-    # Generated if the file extension is an XCode project
-    # file extension.
-    #
-    # Returns true of false.
-    def xcode_project_file?
-      ['.xib', '.nib', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
-    end
-
-    # Internal: Is the blob minified JS?
-    #
-    # Consider JS minified if the average line length is
-    # greater then 100c.
-    #
-    # Returns true or false.
-    def minified_javascript?
-      return unless extname == '.js'
-      average_line_length > 100
-    end
-
-    # Internal: Is the blob JS generated by CoffeeScript?
-    #
-    # Requires Blob#data
-    #
-    # CoffeScript is meant to output JS that would be difficult to
-    # tell if it was generated or not. Look for a number of patterns
-    # outputed by the CS compiler.
-    #
-    # Return true or false
-    def generated_coffeescript?
-      return unless extname == '.js'
-
-      # CoffeeScript generated by > 1.2 include a comment on the first line
-      if lines[0] =~ /^\/\/ Generated by /
-        return true
-      end
-
-      if lines[0] == '(function() {' &&     # First line is module closure opening
-          lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
-          lines[-1] == ''                   # Last line is blank
-
-        score = 0
-
-        lines.each do |line|
-          if line =~ /var /
-            # Underscored temp vars are likely to be Coffee
-            score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
-
-            # bind and extend functions are very Coffee specific
-            score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
-          end
-        end
-
-        # Require a score of 3. This is fairly arbitrary. Consider
-        # tweaking later.
-        score >= 3
-      else
-        false
-      end
-    end
-
-    # Internal: Is this a generated documentation file for a .NET assembly?
-    #
-    # Requires Blob#data
-    #
-    # .NET developers often check in the XML Intellisense file along with an
-    # assembly - however, these don't have a special extension, so we have to
-    # dig into the contents to determine if it's a docfile. Luckily, these files
-    # are extremely structured, so recognizing them is easy.
-    #
-    # Returns true or false
-    def generated_net_docfile?
-      return false unless extname.downcase == ".xml"
-      return false unless lines.count > 3
-
-      # .NET Docfiles always open with <doc> and their first tag is an
-      # <assembly> tag
-      return lines[1].include?("<doc>") &&
-        lines[2].include?("<assembly>") &&
-        lines[-2].include?("</doc>")
+      @_generated ||= Generated.generated?(name, lambda { data })
    end

    # Public: Should the blob be indexed for searching?
@@ -360,7 +282,7 @@ module Linguist
    # Excluded:
    # - Files over 0.1MB
    # - Non-text files
-    # - Langauges marked as not searchable
+    # - Languages marked as not searchable
    # - Generated source files
    #
    # Please add additional test coverage to
@@ -368,16 +290,18 @@ module Linguist
    #
    # Return true or false
    def indexable?
-      if binary?
+      if size > 100 * 1024
        false
+      elsif binary?
+        false
+      elsif extname == '.txt'
+        true
      elsif language.nil?
        false
      elsif !language.searchable?
        false
      elsif generated?
        false
-      elsif size > 100 * 1024
-        false
      else
        true
      end
@@ -389,33 +313,15 @@ module Linguist
    #
    # Returns a Language or nil if none is detected
    def language
-      if defined? @language
-        @language
+      return @language if defined? @language
+
+      if defined?(@data) && @data.is_a?(String)
+        data = @data
      else
-        @language = guess_language
+        data = lambda { (binary_mime_type? || binary?) ? "" : self.data }
      end
-    end

-    # Internal: Guess language
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_language` if you make any changes.
-    #
-    # Returns a Language or nil
-    def guess_language
-      return if binary_mime_type?
-
-      # Disambiguate between multiple language extensions
-      disambiguate_extension_language ||
-
-        # See if there is a Language for the extension
-        pathname.language ||
-
-        # Look for idioms in first line
-        first_line_language ||
-
-        # Try to detect Language from shebang line
-        shebang_language
+      @language = Language.detect(name.to_s, data, mode)
    end

    # Internal: Get the lexer of the blob.
@@ -425,247 +331,6 @@ module Linguist
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

-    # Internal: Disambiguates between multiple language extensions.
-    #
-    # Delegates to "guess_EXTENSION_language".
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_language` if you add another method.
-    #
-    # Returns a Language or nil.
-    def disambiguate_extension_language
-      if Language.ambiguous?(extname)
-        name = "guess_#{extname.sub(/^\./, '')}_language"
-        send(name) if respond_to?(name)
-      end
-    end
-
-    # Internal: Guess language of .cls files
-    #
-    # Returns a Language.
-    def guess_cls_language
-      if lines.grep(/^(%|\\)/).any?
-        Language['TeX']
-      elsif lines.grep(/^\s*(CLASS|METHOD|INTERFACE).*:\s*/i).any? || lines.grep(/^\s*(USING|DEFINE)/i).any?
-        Language['OpenEdge ABL']
-      elsif lines.grep(/\{$/).any? || lines.grep(/\}$/).any?
-        Language['Apex']
-      elsif lines.grep(/^(\'\*|Attribute|Option|Sub|Private|Protected|Public|Friend)/i).any?
-        Language['Visual Basic']
-      else
-        # The most common language should be the fallback
-        Language['TeX']
-      end
-    end
-
-    # Internal: Guess language of header files (.h).
-    #
-    # Returns a Language.
-    def guess_h_language
-      if lines.grep(/^@(interface|property|private|public|end)/).any?
-        Language['Objective-C']
-      elsif lines.grep(/^class |^\s+(public|protected|private):/).any?
-        Language['C++']
-      else
-        Language['C']
-      end
-    end
-
-    # Internal: Guess language of .m files.
-    #
-    # Objective-C heuristics:
-    # * Keywords  ("#import", "#include", "#ifdef", #define, "@end") or "//" and opening "\*" comments
-    #
-    # Matlab heuristics:
-    # * Leading "function " of "classdef " keyword
-    # * "%" comments
-    #
-    # M heuristics:
-    # * Look at first line.  It is either a comment (1st regex) or label/code (2nd regex)
-    #
-    # Note: All "#" keywords, e.g., "#import", are guaranteed to be Objective-C. Because the ampersand
-    # is used to created function handles and anonymous functions in Matlab, most "@" keywords are not
-    # safe heuristics. However, "end" is a reserved term in Matlab and can't be used to create a valid
-    # function handle. Because @end is required to close any @implementation, @property, @interface,
-    # @synthesize, etc. directive in Objective-C, only @end needs to be checked for.
-    #
-    # Returns a Language.
-    def guess_m_language
-      # Objective-C keywords or comments
-      if lines.grep(/^#(import|include|ifdef|define)|@end/).any? || lines.grep(/^\s*\/\//).any? || lines.grep(/^\s*\/\*/).any?
-        Language['Objective-C']
-
-      # Matlab file function or class or comments
-      elsif lines.any? && lines.first.match(/^\s*(function |classdef )/) || lines.grep(/^\s*%/).any?
-        Language['Matlab']
-
-      # M (see M heuristics above)
-      elsif lines.first.to_s =~ /^[\t ]*;/ or lines.first.to_s =~ /^%?[A-Za-z0-9]+[\t ]*;*/
-        Language['M']
-
-      # Fallback to Objective-C, don't want any M or Matlab false positives
-      else
-        Language['Objective-C']
-      end
-    end
-
-    # Internal: Guess language of .pl files
-    #
-    # The rules for disambiguation are:
-    #
-    # 1. Many perl files begin with a shebang
-    # 2. Most Prolog source files have a rule somewhere (marked by the :- operator)
-    # 3. Default to Perl, because it is more popular
-    #
-    # Returns a Language.
-    def guess_pl_language
-      if shebang_script == 'perl'
-        Language['Perl']
-      elsif lines.grep(/:-/).any?
-        Language['Prolog']
-      else
-        Language['Perl']
-      end
-    end
-
-    # Internal: Guess language of .r files.
-    #
-    # Returns a Language.
-    def guess_r_language
-      if lines.grep(/(rebol|(:\s+func|make\s+object!|^\s*context)\s*\[)/i).any?
-        Language['Rebol']
-      else
-        Language['R']
-      end
-    end
-
-    # Internal: Guess language of .t files.
-    #
-    # Returns a Language.
-    def guess_t_language
-      score = 0
-      score += 1 if lines.grep(/^% /).any?
-      score += data.gsub(/ := /).count
-      score += data.gsub(/proc |procedure |fcn |function /).count
-      score += data.gsub(/var \w+: \w+/).count
-
-      # Tell-tale signs its gotta be Perl
-      if lines.grep(/^(my )?(sub |\$|@|%)\w+/).any?
-        score = 0
-      end
-
-      if score >= 3
-        Language['Turing']
-      else
-        Language['Perl']
-      end
-    end
-
-    # Internal: Guess language of .v files.
-    #
-    # Returns a Language
-    def guess_v_language
-      if lines.grep(/^(\/\*|\/\/|module|parameter|input|output|wire|reg|always|initial|begin|\`)/).any?
-        Language['Verilog']
-      else
-        Language['Coq']
-      end
-    end
-
-    # Internal: Guess language of .gsp files.
-    #
-    # Returns a Language.
-    def guess_gsp_language
-      if lines.grep(/<%|<%@|\$\{|<%|<g:|<meta name="layout"|<r:/).any?
-        Language['Groovy Server Pages']
-      else
-        Language['Gosu']
-      end
-    end
-
-    # Internal: Guess language from the first line.
-    #
-    # Look for leading "<?php" in Drupal files
-    #
-    # Returns a Language.
-    def first_line_language
-      # Only check files with drupal php extensions
-      return unless drupal_extname?
-
-      # Fail fast if blob isn't viewable?
-      return unless viewable?
-
-      if lines.first.to_s =~ /^<\?php/
-        Language['PHP']
-      end
-    end
-
-    # Internal: Extract the script name from the shebang line
-    #
-    # Requires Blob#data
-    #
-    # Examples
-    #
-    #   '#!/usr/bin/ruby'
-    #   # => 'ruby'
-    #
-    #   '#!/usr/bin/env ruby'
-    #   # => 'ruby'
-    #
-    #   '#!/usr/bash/python2.4'
-    #   # => 'python'
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_shebang_script` if you make any changes.
-    #
-    # Returns a script name String or nil
-    def shebang_script
-      # Fail fast if blob isn't viewable?
-      return unless viewable?
-
-      if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
-        bang.sub!(/^#! /, '#!')
-        tokens = bang.split(' ')
-        pieces = tokens.first.split('/')
-        if pieces.size > 1
-          script = pieces.last
-        else
-          script = pieces.first.sub('#!', '')
-        end
-
-        script = script == 'env' ? tokens[1] : script
-
-        # python2.4 => python
-        if script =~ /((?:\d+\.?)+)/
-          script.sub! $1, ''
-        end
-
-        # Check for multiline shebang hacks that exec themselves
-        #
-        #   #!/bin/sh
-        #   exec foo "$0" "$@"
-        #
-        if script == 'sh' &&
-            lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
-          script = $1
-        end
-
-        script
-      end
-    end
-
-    # Internal: Get Language for shebang script
-    #
-    # Returns the Language or nil
-    def shebang_language
-      # Skip file extensions unlikely to have shebangs
-      return unless shebang_extname?
-
-      if script = shebang_script
-        Language[script]
-      end
-    end
-
    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {})
@@ -691,12 +356,5 @@ module Linguist
        ''
      end
    end
-
-    Language.overridden_extensions.each do |extension|
-      name = "guess_#{extension.sub(/^\./, '')}_language".to_sym
-      unless instance_methods.map(&:to_sym).include?(name)
-        warn "Language##{name} was not defined"
-      end
-    end
  end
 end
--- a/lib/linguist/classifier.rb
+++ b/lib/linguist/classifier.rb
@@ -0,0 +1,123 @@
+require 'linguist/tokenizer'
+
+module Linguist
+  # Language bayesian classifier.
+  class Classifier
+    # Public: Train classifier that data is a certain language.
+    #
+    # db       - Hash classifier database object
+    # language - String language of data
+    # data     - String contents of file
+    #
+    # Examples
+    #
+    #   Classifier.train!(db, 'Ruby', "def hello; end")
+    #
+    # Returns nothing.
+    def self.train!(db, language, data)
+      tokens = Tokenizer.tokenize(data)
+
+      db['tokens_total'] ||= 0
+      db['languages_total'] ||= 0
+      db['tokens'] ||= {}
+      db['language_tokens'] ||= {}
+      db['languages'] ||= {}
+
+      tokens.each do |token|
+        db['tokens'][language] ||= {}
+        db['tokens'][language][token] ||= 0
+        db['tokens'][language][token] += 1
+        db['language_tokens'][language] ||= 0
+        db['language_tokens'][language] += 1
+        db['tokens_total'] += 1
+      end
+      db['languages'][language] ||= 0
+      db['languages'][language] += 1
+      db['languages_total'] += 1
+
+      nil
+    end
+
+    # Public: Guess language of data.
+    #
+    # db        - Hash of classifier tokens database.
+    # tokens    - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings to restrict to.
+    #
+    # Examples
+    #
+    #   Classifier.classify(db, "def hello; end")
+    #   # => [['Ruby', -12.5], ['Python', -18.3], ...]
+    #
+    # Returns sorted Array of result pairs, best match first. Each pair contains
+    # the String language name and a Float log-probability score.
+    def self.classify(db, tokens, languages = nil)
+      languages ||= db['languages'].keys
+      new(db).classify(tokens, languages)
+    end
+
+    # Internal: Initialize a Classifier.
+    def initialize(db = {})
+      @tokens_total    = db['tokens_total']
+      @languages_total = db['languages_total']
+      @tokens          = db['tokens']
+      @language_tokens = db['language_tokens']
+      @languages       = db['languages']
+    end
+
+    # Internal: Guess language of data
+    #
+    # tokens    - Array of tokens or String data to analyze.
+    # languages - Array of language name Strings to restrict to.
+    #
+    # Returns sorted Array of result pairs. Each pair contains the
+    # String language name and a Float score.
+    def classify(tokens, languages)
+      return [] if tokens.nil?
+      tokens = Tokenizer.tokenize(tokens) if tokens.is_a?(String)
+
+      scores = {}
+      languages.each do |language|
+        scores[language] = tokens_probability(tokens, language) +
+                                   language_probability(language)
+      end
+
+      scores.sort { |a, b| b[1] <=> a[1] }.map { |score| [score[0], score[1]] }
+    end
+
+    # Internal: Probability of a set of tokens occurring in a language - P(D | C)
+    #
+    # tokens   - Array of String tokens.
+    # language - Language to check.
+    #
+    # Returns Float sum of the tokens' log probabilities (always <= 0.0).
+    def tokens_probability(tokens, language)
+      tokens.inject(0.0) do |sum, token|
+        sum += Math.log(token_probability(token, language))
+      end
+    end
+
+    # Internal: Probability of a token occurring in a language - P(F | C)
+    #
+    # token    - String token.
+    # language - Language to check.
+    #
+    # Returns Float between 0.0 and 1.0.
+    def token_probability(token, language)
+      if @tokens[language][token].to_f == 0.0
+        1 / @tokens_total.to_f
+      else
+        @tokens[language][token].to_f / @language_tokens[language].to_f
+      end
+    end
+
+    # Internal: Probability of a language occurring - P(C)
+    #
+    # language - Language to check.
+    #
+    # Returns Float log probability (always <= 0.0).
+    def language_probability(language)
+      Math.log(@languages[language].to_f / @languages_total.to_f)
+    end
+  end
+end
--- a/lib/linguist/generated.rb
+++ b/lib/linguist/generated.rb
@@ -0,0 +1,162 @@
+module Linguist
+  class Generated
+    # Public: Is the blob a generated file?
+    #
+    # name - String filename
+    # data - String blob data. A callable may also be passed in for
+    #        lazy loading. This behavior is deprecated and you should
+    #        always pass in a String.
+    #
+    # Return true or false
+    def self.generated?(name, data)
+      new(name, data).generated?
+    end
+
+    # Internal: Initialize Generated instance
+    #
+    # name - String filename
+    # data - String blob data
+    def initialize(name, data)
+      @name = name
+      @extname = File.extname(name)
+      @_data = data
+    end
+
+    attr_reader :name, :extname
+
+    # Lazy load blob data if block was passed in.
+    #
+    # Awful, awful stuff happening here.
+    #
+    # Returns String data.
+    def data
+      @data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
+    end
+
+    # Public: Get each line of data
+    #
+    # Returns an Array of lines
+    def lines
+      # TODO: data should be required to be a String, no nils
+      @lines ||= data ? data.split("\n", -1) : []
+    end
+
+    # Internal: Is the blob a generated file?
+    #
+    # Generated source code is suppressed in diffs and is ignored by
+    # language statistics.
+    #
+    # Please add additional test coverage to
+    # `test/test_blob.rb#test_generated` if you make any changes.
+    #
+    # Return true or false
+    def generated?
+      name == 'Gemfile.lock' ||
+        minified_javascript? ||
+        compiled_coffeescript? ||
+        xcode_project_file? ||
+        generated_net_docfile? ||
+        generated_parser?
+    end
+
+    # Internal: Is the blob an XCode project file?
+    #
+    # Generated if the file extension is an XCode project
+    # file extension.
+    #
+    # Returns true or false.
+    def xcode_project_file?
+      ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
+    end
+
+    # Internal: Is the blob minified JS?
+    #
+    # Consider JS minified if the average line length is
+    # greater than 100 characters.
+    #
+    # Returns true or false.
+    def minified_javascript?
+      return unless extname == '.js'
+      if lines.any?
+        (lines.inject(0) { |n, l| n += l.length } / lines.length) > 100
+      else
+        false
+      end
+    end
+
+    # Internal: Is the blob of JS generated by CoffeeScript?
+    #
+    # CoffeeScript is meant to output JS that would be difficult to
+    # tell if it was generated or not. Look for a number of patterns
+    # output by the CS compiler.
+    #
+    # Return true or false
+    def compiled_coffeescript?
+      return false unless extname == '.js'
+
+      # JS generated by CoffeeScript > 1.2 includes a comment on the first line
+      if lines[0] =~ /^\/\/ Generated by /
+        return true
+      end
+
+      if lines[0] == '(function() {' &&     # First line is module closure opening
+          lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
+          lines[-1] == ''                   # Last line is blank
+
+        score = 0
+
+        lines.each do |line|
+          if line =~ /var /
+            # Underscored temp vars are likely to be Coffee
+            score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
+
+            # bind and extend functions are very Coffee specific
+            score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
+          end
+        end
+
+        # Require a score of 3. This is fairly arbitrary. Consider
+        # tweaking later.
+        score >= 3
+      else
+        false
+      end
+    end
+
+    # Internal: Is this a generated documentation file for a .NET assembly?
+    #
+    # .NET developers often check in the XML Intellisense file along with an
+    # assembly - however, these don't have a special extension, so we have to
+    # dig into the contents to determine if it's a docfile. Luckily, these files
+    # are extremely structured, so recognizing them is easy.
+    #
+    # Returns true or false
+    def generated_net_docfile?
+      return false unless extname.downcase == ".xml"
+      return false unless lines.count > 3
+
+      # .NET Docfiles always open with <doc> and their first tag is an
+      # <assembly> tag
+      return lines[1].include?("<doc>") &&
+        lines[2].include?("<assembly>") &&
+        lines[-2].include?("</doc>")
+    end
+
+    # Internal: Is the blob of JS a parser generated by PEG.js?
+    #
+    # PEG.js-generated parsers are not meant to be consumed by humans.
+    #
+    # Return true or false
+    def generated_parser?
+      return false unless extname == '.js'
+
+      # PEG.js-generated parsers include a comment near the top of the file
+      # that marks them as such.
+      if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
+        return true
+      end
+
+      false
+    end
+  end
+end
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -2,6 +2,9 @@ require 'escape_utils'
 require 'pygments'
 require 'yaml'

+require 'linguist/classifier'
+require 'linguist/samples'
+
 module Linguist
  # Language names that are recognizable by GitHub. Defined languages
  # can be highlighted, searched and listed under the Top Languages page.
@@ -9,30 +12,15 @@ module Linguist
  # Languages are defined in `lib/linguist/languages.yml`.
  class Language
    @languages       = []
-    @overrides       = {}
    @index           = {}
    @name_index      = {}
    @alias_index     = {}
-    @extension_index = {}
-    @filename_index  = {}
+    @extension_index = Hash.new { |h,k| h[k] = [] }
+    @filename_index  = Hash.new { |h,k| h[k] = [] }

    # Valid Languages types
    TYPES = [:data, :markup, :programming]

-    # Internal: Test if extension maps to multiple Languages.
-    #
-    # Returns true or false.
-    def self.ambiguous?(extension)
-      @overrides.include?(extension)
-    end
-
-    # Include?: Return overridden extensions.
-    #
-    # Returns extensions Array.
-    def self.overridden_extensions
-      @overrides.keys
-    end
-
    # Internal: Create a new Language object
    #
    # attributes - A hash of attributes
@@ -43,18 +31,18 @@ module Linguist

      @languages << language

-      # All Language names should be unique. Warn if there is a duplicate.
+      # All Language names should be unique. Raise if there is a duplicate.
      if @name_index.key?(language.name)
-        warn "Duplicate language name: #{language.name}"
+        raise ArgumentError, "Duplicate language name: #{language.name}"
      end

      # Language name index
      @index[language.name] = @name_index[language.name] = language

      language.aliases.each do |name|
-        # All Language aliases should be unique. Warn if there is a duplicate.
+        # All Language aliases should be unique. Raise if there is a duplicate.
        if @alias_index.key?(name)
-          warn "Duplicate alias: #{name}"
+          raise ArgumentError, "Duplicate alias: #{name}"
        end

        @index[name] = @alias_index[name] = language
@@ -62,33 +50,50 @@ module Linguist

      language.extensions.each do |extension|
        if extension !~ /^\./
-          warn "Extension is missing a '.': #{extension.inspect}"
+          raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
        end

-        unless ambiguous?(extension)
-          # Index the extension with a leading ".": ".rb"
-          @extension_index[extension] = language
-
-          # Index the extension without a leading ".": "rb"
-          @extension_index[extension.sub(/^\./, '')] = language
-        end
-      end
-
-      language.overrides.each do |extension|
-        if extension !~ /^\./
-          warn "Extension is missing a '.': #{extension.inspect}"
-        end
-
-        @overrides[extension] = language
+        @extension_index[extension] << language
      end

      language.filenames.each do |filename|
-        @filename_index[filename] = language
+        @filename_index[filename] << language
      end

      language
    end

+    # Public: Detects the Language of the blob.
+    #
+    # name - String filename
+    # data - String blob data. A callable may also be passed in for
+    #        lazy loading. This behavior is deprecated and you should
+    #        always pass in a String.
+    # mode - Optional String mode (defaults to nil)
+    #
+    # Returns Language or nil.
+    def self.detect(name, data, mode = nil)
+      # A bit of an elegant hack. If the file is executable but extensionless,
+      # append a "magic" extension so it can be classified with other
+      # languages that have shebang scripts.
+      if File.extname(name).empty? && mode && (mode.to_i(8) & 05) == 05
+        name += ".script!"
+      end
+
+      possible_languages = find_by_filename(name)
+
+      if possible_languages.length > 1
+        data = data.call() if data.respond_to?(:call)
+        if data.nil? || data == ""
+          nil
+        elsif result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
+          Language[result[0]]
+        end
+      else
+        possible_languages.first
+      end
+    end
+
    # Public: Get all Languages
    #
    # Returns an Array of Languages
@@ -124,33 +129,19 @@ module Linguist
      @alias_index[name]
    end

-    # Public: Look up Language by extension.
-    #
-    # extension - The extension String. May include leading "."
-    #
-    # Examples
-    #
-    #   Language.find_by_extension('.rb')
-    #   # => #<Language name="Ruby">
-    #
-    # Returns the Language or nil if none was found.
-    def self.find_by_extension(extension)
-      @extension_index[extension]
-    end
-
-    # Public: Look up Language by filename.
+    # Public: Look up Languages by filename.
    #
    # filename - The path String.
    #
    # Examples
    #
    #   Language.find_by_filename('foo.rb')
-    #   # => #<Language name="Ruby">
+    #   # => [#<Language name="Ruby">]
    #
-    # Returns the Language or nil if none was found.
+    # Returns all matching Languages or [] if none were found.
    def self.find_by_filename(filename)
      basename, extname = File.basename(filename), File.extname(filename)
-      @filename_index[basename] || @extension_index[extname]
+      @filename_index[basename] + @extension_index[extname]
    end

    # Public: Look up Language by its name or lexer.
@@ -231,16 +222,18 @@ module Linguist
        raise(ArgumentError, "#{@name} is missing lexer")

      @ace_mode = attributes[:ace_mode]
+      @wrap = attributes[:wrap] || false

      # Set legacy search term
      @search_term = attributes[:search_term] || default_alias_name

      # Set extensions or default to [].
      @extensions = attributes[:extensions] || []
-      @overrides  = attributes[:overrides]  || []
      @filenames  = attributes[:filenames]  || []

-      @primary_extension = attributes[:primary_extension] || default_primary_extension || extensions.first
+      unless @primary_extension = attributes[:primary_extension]
+        raise ArgumentError, "#{@name} is missing primary extension"
+      end

      # Prepend primary extension unless its already included
      if primary_extension && !extensions.include?(primary_extension)
@@ -320,6 +313,11 @@ module Linguist
    # Returns a String name or nil
    attr_reader :ace_mode

+    # Public: Should language lines be wrapped
+    #
+    # Returns true or false
+    attr_reader :wrap
+
    # Public: Get extensions
    #
    # Examples
@@ -331,7 +329,7 @@ module Linguist

    # Deprecated: Get primary extension
    #
-    # Defaults to the first extension but can be overriden
+    # Defaults to the first extension but can be overridden
    # in the languages.yml.
    #
    # The primary extension can not be nil. Tests should verify this.
@@ -343,11 +341,6 @@ module Linguist
    # Returns the extension String.
    attr_reader :primary_extension

-    # Internal: Get overridden extensions.
-    #
-    # Returns the extensions Array.
-    attr_reader :overrides
-
    # Public: Get filenames
    #
    # Examples
@@ -377,13 +370,6 @@ module Linguist
      name.downcase.gsub(/\s/, '-')
    end

-    # Internal: Get default primary extension.
-    #
-    # Returns the extension String.
-    def default_primary_extension
-      extensions.first
-    end
-
    # Public: Get Language group
    #
    # Returns a Language
@@ -441,11 +427,40 @@ module Linguist
    def hash
      name.hash
    end
+
+    def inspect
+      "#<#{self.class} name=#{name}>"
+    end
  end

+  extensions = Samples::DATA['extnames']
+  filenames = Samples::DATA['filenames']
  popular = YAML.load_file(File.expand_path("../popular.yml", __FILE__))

  YAML.load_file(File.expand_path("../languages.yml", __FILE__)).each do |name, options|
+    options['extensions'] ||= []
+    options['filenames'] ||= []
+
+    if extnames = extensions[name]
+      extnames.each do |extname|
+        if !options['extensions'].include?(extname)
+          options['extensions'] << extname
+        else
+          warn "#{name} #{extname.inspect} is already defined in samples/. Remove from languages.yml."
+        end
+      end
+    end
+
+    if fns = filenames[name]
+      fns.each do |filename|
+        if !options['filenames'].include?(filename)
+          options['filenames'] << filename
+        else
+          warn "#{name} #{filename.inspect} is already defined in samples/. Remove from languages.yml."
+        end
+      end
+    end
+
    Language.create(
      :name              => name,
      :color             => options['color'],
@@ -453,12 +468,12 @@ module Linguist
      :aliases           => options['aliases'],
      :lexer             => options['lexer'],
      :ace_mode          => options['ace_mode'],
+      :wrap              => options['wrap'],
      :group_name        => options['group'],
      :searchable        => options.key?('searchable') ? options['searchable'] : true,
      :search_term       => options['search_term'],
-      :extensions        => options['extensions'],
+      :extensions        => options['extensions'].sort,
      :primary_extension => options['primary_extension'],
-      :overrides         => options['overrides'],
      :filenames         => options['filenames'],
      :popular           => popular.include?(name)
    )
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
--- a/lib/linguist/md5.rb
+++ b/lib/linguist/md5.rb
@@ -0,0 +1,38 @@
+require 'digest/md5'
+
+module Linguist
+  module MD5
+    # Public: Create deep nested digest of value object.
+    #
+    # Useful for object comparison. Hash digests ignore key order.
+    #
+    # obj - Object to digest: String, Symbol, Integer, true, false, nil, or
+    #       an Array or Hash built from those.
+    #
+    # Returns String hex digest.
+    # Raises TypeError if obj or any nested value is of another type.
+    def self.hexdigest(obj)
+      digest = Digest::MD5.new
+
+      case obj
+      when String, Symbol, Integer
+        # Mix in the class name so that e.g. 1 and "1" digest differently.
+        digest.update "#{obj.class}"
+        digest.update "#{obj}"
+      when TrueClass, FalseClass, NilClass
+        digest.update "#{obj.class}"
+      when Array
+        digest.update "#{obj.class}"
+        obj.each { |e| digest.update(hexdigest(e)) }
+      when Hash
+        digest.update "#{obj.class}"
+        # Sort the per-pair digests so the result ignores insertion order.
+        obj.map { |(k, v)| hexdigest([k, v]) }.sort.each { |e| digest.update(e) }
+      else
+        raise TypeError, "can't convert #{obj.inspect} into String"
+      end
+
+      digest.hexdigest
+    end
+  end
+end
--- a/lib/linguist/mime.rb
+++ b/lib/linguist/mime.rb
@@ -1,91 +0,0 @@
-require 'mime/types'
-require 'yaml'
-
-class MIME::Type
-  attr_accessor :override
-end
-
-# Register additional mime type extensions
-#
-# Follows same format as mime-types data file
-#   https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
-File.read(File.expand_path("../mimes.yml", __FILE__)).lines.each do |line|
-  # Regexp was cargo culted from mime-types lib
-  next unless line =~ %r{^
-    #{MIME::Type::MEDIA_TYPE_RE}
-    (?:\s@([^\s]+))?
-    (?:\s:(#{MIME::Type::ENCODING_RE}))?
-  }x
-
-  mediatype  = $1
-  subtype    = $2
-  extensions = $3
-  encoding   = $4
-
-  # Lookup existing mime type
-  mime_type = MIME::Types["#{mediatype}/#{subtype}"].first ||
-    # Or create a new instance
-    MIME::Type.new("#{mediatype}/#{subtype}")
-
-  if extensions
-    extensions.split(/,/).each do |extension|
-      mime_type.extensions << extension
-    end
-  end
-
-  if encoding
-    mime_type.encoding = encoding
-  end
-
-  mime_type.override = true
-
-  # Kind of hacky, but we need to reindex the mime type after making changes
-  MIME::Types.add_type_variant(mime_type)
-  MIME::Types.index_extensions(mime_type)
-end
-
-module Linguist
-  module Mime
-    # Internal: Look up mime type for extension.
-    #
-    # ext - The extension String. May include leading "."
-    #
-    # Examples
-    #
-    #   Mime.mime_for('.html')
-    #   # => 'text/html'
-    #
-    #   Mime.mime_for('txt')
-    #   # => 'text/plain'
-    #
-    # Return mime type String otherwise falls back to 'text/plain'.
-    def self.mime_for(ext)
-      mime_type = lookup_mime_type_for(ext)
-      mime_type ? mime_type.to_s : 'text/plain'
-    end
-
-    # Internal: Lookup mime type for extension or mime type
-    #
-    # ext_or_mime_type - A file extension ".txt" or mime type "text/plain".
-    #
-    # Returns a MIME::Type
-    def self.lookup_mime_type_for(ext_or_mime_type)
-      ext_or_mime_type ||= ''
-
-      if ext_or_mime_type =~ /\w+\/\w+/
-        guesses = ::MIME::Types[ext_or_mime_type]
-      else
-        guesses = ::MIME::Types.type_for(ext_or_mime_type)
-      end
-
-      # Use custom override first
-      guesses.detect { |type| type.override } ||
-
-        # Prefer text mime types over binary
-        guesses.detect { |type| type.ascii? } ||
-
-        # Otherwise use the first guess
-        guesses.first
-    end
-  end
-end
--- a/lib/linguist/mimes.yml
+++ b/lib/linguist/mimes.yml
@@ -1,62 +0,0 @@
-# Additional types to add to MIME::Types
-#
-# MIME types are used to set the Content-Type of raw binary blobs. All text
-# blobs are served as text/plain regardless of their type to ensure they
-# open in the browser rather than downloading.
-#
-# The encoding helps determine whether a file should be treated as plain
-# text or binary. By default, a mime type's encoding is base64 (binary).
-# These types will show a "View Raw" link. To force a type to render as
-# plain text, set it to 8bit for UTF-8. text/* types will be treated as
-# text by default.
-#
-#   <type> @<extensions> :<encoding>
-#
-# type       - mediatype/subtype
-# extensions - comma seperated extension list
-# encoding   - base64 (binary), 7bit (ASCII), 8bit (UTF-8), or
-#              quoted-printable (Printable ASCII).
-#
-# Follows same format as mime-types data file
-#   https://github.com/halostatue/mime-types/blob/master/lib/mime/types.rb.data
-#
-# Any additions or modifications (even trivial) should have corresponding
-# test change in `test/test_mime.rb`.
-
-# TODO: Lookup actual types
-application/octet-stream @a,blend,gem,graffle,ipa,lib,mcz,nib,o,ogv,otf,pfx,pigx,plgx,psd,sib,spl,sqlite3,swc,ucode,xpi
-
-# Please keep this list alphabetized
-application/java-archive @ear,war
-application/netcdf :8bit
-application/ogg @ogg
-application/postscript :base64
-application/vnd.adobe.air-application-installer-package+zip @air
-application/vnd.mozilla.xul+xml :8bit
-application/vnd.oasis.opendocument.presentation @odp
-application/vnd.oasis.opendocument.spreadsheet @ods
-application/vnd.oasis.opendocument.text @odt
-application/vnd.openofficeorg.extension @oxt
-application/vnd.openxmlformats-officedocument.presentationml.presentation @pptx
-application/x-chrome-extension @crx
-application/x-iwork-keynote-sffkey @key
-application/x-iwork-numbers-sffnumbers @numbers
-application/x-iwork-pages-sffpages @pages
-application/x-ms-xbap @xbap :8bit
-application/x-parrot-bytecode @pbc
-application/x-shockwave-flash @swf
-application/x-silverlight-app @xap
-application/x-supercollider @sc :8bit
-application/x-troff-ms :8bit
-application/x-wais-source :8bit
-application/xaml+xml @xaml :8bit
-application/xslt+xml @xslt :8bit
-image/x-icns @icns
-text/cache-manifest @manifest
-text/plain @cu,cxx
-text/x-logtalk @lgt
-text/x-nemerle @n
-text/x-nimrod @nim
-text/x-ocaml @ml,mli,mll,mly,sig,sml
-text/x-rust @rs,rc
-text/x-scheme @rkt,scm,sls,sps,ss
--- a/lib/linguist/pathname.rb
+++ b/lib/linguist/pathname.rb
@@ -1,92 +0,0 @@
-require 'linguist/language'
-require 'linguist/mime'
-require 'pygments'
-
-module Linguist
-  # Similar to ::Pathname, Linguist::Pathname wraps a path string and
-  # provides helpful query methods. Its useful when you only have a
-  # filename but not a blob and need to figure out the language of the file.
-  class Pathname
-    # Public: Initialize a Pathname
-    #
-    # path - A filename String. The file may or maybe actually exist.
-    #
-    # Returns a Pathname.
-    def initialize(path)
-      @path = path
-    end
-
-    # Public: Get the basename of the path
-    #
-    # Examples
-    #
-    #   Pathname.new('sub/dir/file.rb').basename
-    #   # => 'file.rb'
-    #
-    # Returns a String.
-    def basename
-      File.basename(@path)
-    end
-
-    # Public: Get the extname of the path
-    #
-    # Examples
-    #
-    #   Pathname.new('.rb').extname
-    #   # => '.rb'
-    #
-    #   Pathname.new('file.rb').extname
-    #   # => '.rb'
-    #
-    # Returns a String.
-    def extname
-      File.extname(@path)
-    end
-
-    # Public: Get the language of the path
-    #
-    # The path extension name is the only heuristic used to detect the
-    # language name.
-    #
-    # Examples
-    #
-    #   Pathname.new('file.rb').language
-    #   # => Language['Ruby']
-    #
-    # Returns a Language or nil if none was found.
-    def language
-      @language ||= Language.find_by_filename(@path)
-    end
-
-    # Internal: Get the lexer of the path
-    #
-    # Returns a Lexer.
-    def lexer
-      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
-    end
-
-    # Public: Get the mime type
-    #
-    # Examples
-    #
-    #   Pathname.new('index.html').mime_type
-    #   # => 'text/html'
-    #
-    # Returns a mime type String.
-    def mime_type
-      @mime_type ||= Mime.mime_for(extname)
-    end
-
-    # Public: Return self as String
-    #
-    # Returns a String
-    def to_s
-      @path.dup
-    end
-
-    def eql?(other)
-      other.is_a?(self.class) && @path == other.to_s
-    end
-    alias_method :==, :eql?
-  end
-end
--- a/lib/linguist/repository.rb
+++ b/lib/linguist/repository.rb
@@ -67,8 +67,8 @@ module Linguist
      return if @computed_stats

      @enum.each do |blob|
-        # Skip binary file extensions
-        next if blob.binary_mime_type?
+        # Skip files that are likely binary
+        next if blob.likely_binary?

        # Skip vendored or generated blobs
        next if blob.vendored? || blob.generated? || blob.language.nil?
@@ -80,7 +80,7 @@ module Linguist
      end

      # Compute total size
-      @size = @sizes.inject(0) { |s,(k,v)| s + v }
+      @size = @sizes.inject(0) { |s,(_,v)| s + v }

      # Get primary language
      if primary = @sizes.max_by { |(_, size)| size }
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -0,0 +1,98 @@
+require 'yaml'
+
+require 'linguist/md5'
+require 'linguist/classifier'
+
+module Linguist
+  # Model for accessing classifier training data.
+  module Samples
+    # Path to samples root directory
+    ROOT = File.expand_path("../../../samples", __FILE__)
+
+    # Path for serialized samples db
+    PATH = File.expand_path('../samples.json', __FILE__)
+
+    # Hash of serialized samples object (only defined when samples.json exists)
+    if File.exist?(PATH)
+      DATA = YAML.load_file(PATH)
+    end
+
+    # Public: Iterate over each sample file under ROOT.
+    #
+    # &block - Yields a Hash with :path, :language and :extname or :filename.
+    #
+    # Returns nothing. Raises RuntimeError if a sample outside filenames/ has no extension.
+    def self.each(&block)
+      Dir.entries(ROOT).each do |category|
+        next if category == '.' || category == '..'
+
+        # Skip the Text and Binary categories for now
+        # Possibly reconsider this later
+        next if category == 'Text' || category == 'Binary'
+
+        dirname = File.join(ROOT, category)
+        Dir.entries(dirname).each do |filename|
+          next if filename == '.' || filename == '..'
+
+          if filename == 'filenames'
+            Dir.entries(File.join(dirname, filename)).each do |subfilename|
+              next if subfilename == '.' || subfilename == '..'
+
+              yield({
+                :path    => File.join(dirname, filename, subfilename),
+                :language => category,
+                :filename => subfilename
+              })
+            end
+          else
+            if File.extname(filename) == ""
+              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
+            end
+
+            yield({
+              :path     => File.join(dirname, filename),
+              :language => category,
+              :extname  => File.extname(filename)
+            })
+          end
+        end
+      end
+
+      nil
+    end
+
+    # Public: Build the classifier database from all samples.
+    #
+    # Returns the Hash given to Classifier.train!, plus extnames, filenames and md5 keys.
+    def self.data
+      db = {}
+      db['extnames'] = {}
+      db['filenames'] = {}
+
+      each do |sample|
+        language_name = sample[:language]
+
+        if sample[:extname]
+          db['extnames'][language_name] ||= []
+          if !db['extnames'][language_name].include?(sample[:extname])
+            db['extnames'][language_name] << sample[:extname]
+            db['extnames'][language_name].sort!
+          end
+        end
+
+        if sample[:filename]
+          db['filenames'][language_name] ||= []
+          db['filenames'][language_name] << sample[:filename]
+          db['filenames'][language_name].sort!
+        end
+
+        data = File.read(sample[:path])
+        Classifier.train!(db, language_name, data)
+      end
+
+      db['md5'] = Linguist::MD5.hexdigest(db)
+
+      db
+    end
+  end
+end
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -0,0 +1,197 @@
+require 'strscan'
+
+module Linguist
+  # Generic programming language tokenizer.
+  #
+  # Tokens are designed for use in the language bayes classifier.
+  # It strips any data strings or comments and preserves significant
+  # language symbols.
+  class Tokenizer
+    # Public: Extract tokens from data
+    #
+    # data - String to tokenize
+    #
+    # Returns Array of token Strings.
+    def self.tokenize(data)
+      new.extract_tokens(data)
+    end
+
+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '#',  # Ruby
+      '%',  # Tex
+    ]
+
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)']     # Coq
+    ]
+
+    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/[^\\]"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/[^\\]'/)
+          end
+
+        # Skip number literals
+        elsif s.scan(/(0x)?\d(\d|\.)*/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+
+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    # Returns String token or nil if it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+
+      nil
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href=", "class="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+  end
+end
--- a/lib/linguist/vendor.yml
+++ b/lib/linguist/vendor.yml
@@ -16,13 +16,19 @@
 #  https://github.com/joyent/node
 - ^deps/
 - ^tools/
+- (^|/)configure$
+- (^|/)configure\.ac$
+- (^|/)config\.guess$
+- (^|/)config\.sub$

-# Node depedencies
+# Node dependencies
 - node_modules/

-# Vendored depedencies
+# Vendored dependencies
 - vendor/

+# Debian packaging
+- ^debian/

 ## Commonly Bundled JavaScript frameworks ##

@@ -61,8 +67,16 @@
 # MathJax
 - (^|/)MathJax/

+# SyntaxHighlighter - http://alexgorbatchev.com/
+- (^|/)shBrush([^.]*)\.js$
+- (^|/)shCore\.js$
+- (^|/)shLegacy\.js$
+
 ## Python ##

+# django
+- (^|/)admin_media/
+
 # Fabric
 - ^fabfile\.py$

@@ -94,3 +108,6 @@

 # Samples folders
 - ^[Ss]amples/
+
+# Test fixtures
+- ^[Tt]est/fixtures/