Linguist 2.2.0

Still index .txt
Extract separate language detection method
2025-10-29 17:50:22 +00:00 · 2012-08-03 16:47:34 -05:00 · 2012-08-03 16:34:53 -05:00 · 2012-08-03 16:03:06 -05:00 · 2012-08-03 15:47:50 -05:00 · 2012-08-03 15:25:38 -05:00
46 changed files with 13183 additions and 6029 deletions
--- a/github-linguist.gemspec
+++ b/github-linguist.gemspec
@@ -1,6 +1,6 @@
 Gem::Specification.new do |s|
  s.name    = 'github-linguist'
-  s.version = '2.1.2'
+  s.version = '2.2.0'
  s.summary = "GitHub Language detection"

  s.authors = "GitHub"
--- a/lib/linguist.rb
+++ b/lib/linguist.rb
@@ -1,4 +1,5 @@
 require 'linguist/blob_helper'
+require 'linguist/generated'
 require 'linguist/language'
 require 'linguist/mime'
 require 'linguist/repository'
--- a/lib/linguist/blob_helper.rb
+++ b/lib/linguist/blob_helper.rb
@@ -1,7 +1,6 @@
-require 'linguist/classifier'
+require 'linguist/generated'
 require 'linguist/language'
 require 'linguist/mime'
-require 'linguist/samples'

 require 'charlock_holmes'
 require 'escape_utils'
@@ -129,15 +128,6 @@ module Linguist
      ['.png', '.jpg', '.jpeg', '.gif'].include?(extname)
    end

-    # Public: Is the blob likely to have a shebang?
-    #
-    # Return true or false
-    def shebang_extname?
-      extname.empty? &&
-        mode &&
-        (mode.to_i(8) & 05) == 05
-    end
-
    MEGABYTE = 1024 * 1024

    # Public: Is the blob too big to load?
@@ -221,143 +211,16 @@ module Linguist
      lines.grep(/\S/).size
    end

-    # Internal: Compute average line length.
-    #
-    # Returns Integer.
-    def average_line_length
-      if lines.any?
-        lines.inject(0) { |n, l| n += l.length } / lines.length
-      else
-        0
-      end
-    end
-
    # Public: Is the blob a generated file?
    #
    # Generated source code is supressed in diffs and is ignored by
    # language statistics.
    #
-    # Requires Blob#data
-    #
-    # Includes:
-    # - XCode project XML files
-    # - Minified JavaScript
-    # - Compiled CoffeeScript
-    # - PEG.js-generated parsers
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_generated` if you make any changes.
+    # May load Blob#data
    #
    # Return true or false
    def generated?
-      if name == 'Gemfile.lock' || minified_javascript? || compiled_coffeescript? ||
-      xcode_project_file? || generated_net_docfile? || generated_parser?
-        true
-      else
-        false
-      end
-    end
-
-    # Internal: Is the blob an XCode project file?
-    #
-    # Generated if the file extension is an XCode project
-    # file extension.
-    #
-    # Returns true of false.
-    def xcode_project_file?
-      ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
-    end
-
-    # Internal: Is the blob minified JS?
-    #
-    # Consider JS minified if the average line length is
-    # greater then 100c.
-    #
-    # Returns true or false.
-    def minified_javascript?
-      return unless extname == '.js'
-      average_line_length > 100
-    end
-
-    # Internal: Is the blob of JS a parser generated by PEG.js?
-    #
-    # Requires Blob#data
-    #
-    # PEG.js-generated parsers are not meant to be consumed by humans.
-    #
-    # Return true or false
-    def generated_parser?
-      return false unless extname == '.js'
-
-      # PEG.js-generated parsers include a comment near the top  of the file
-      # that marks them as such.
-      if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
-        return true
-      end
-
-      false
-    end
-
-    # Internal: Is the blob of JS generated by CoffeeScript?
-    #
-    # Requires Blob#data
-    #
-    # CoffeScript is meant to output JS that would be difficult to
-    # tell if it was generated or not. Look for a number of patterns
-    # output by the CS compiler.
-    #
-    # Return true or false
-    def compiled_coffeescript?
-      return false unless extname == '.js'
-
-      # CoffeeScript generated by > 1.2 include a comment on the first line
-      if lines[0] =~ /^\/\/ Generated by /
-        return true
-      end
-
-      if lines[0] == '(function() {' &&     # First line is module closure opening
-          lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
-          lines[-1] == ''                   # Last line is blank
-
-        score = 0
-
-        lines.each do |line|
-          if line =~ /var /
-            # Underscored temp vars are likely to be Coffee
-            score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
-
-            # bind and extend functions are very Coffee specific
-            score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
-          end
-        end
-
-        # Require a score of 3. This is fairly arbitrary. Consider
-        # tweaking later.
-        score >= 3
-      else
-        false
-      end
-    end
-
-    # Internal: Is this a generated documentation file for a .NET assembly?
-    #
-    # Requires Blob#data
-    #
-    # .NET developers often check in the XML Intellisense file along with an
-    # assembly - however, these don't have a special extension, so we have to
-    # dig into the contents to determine if it's a docfile. Luckily, these files
-    # are extremely structured, so recognizing them is easy.
-    #
-    # Returns true or false
-    def generated_net_docfile?
-      return false unless extname.downcase == ".xml"
-      return false unless lines.count > 3
-
-      # .NET Docfiles always open with <doc> and their first tag is an
-      # <assembly> tag
-      return lines[1].include?("<doc>") &&
-        lines[2].include?("<assembly>") &&
-        lines[-2].include?("</doc>")
+      @_generated ||= Generated.generated?(name, lambda { data })
    end

    # Public: Should the blob be indexed for searching?
@@ -375,6 +238,8 @@ module Linguist
    def indexable?
      if binary?
        false
+      elsif extname == '.txt'
+        true
      elsif language.nil?
        false
      elsif !language.searchable?
@@ -396,30 +261,11 @@ module Linguist
    def language
      if defined? @language
        @language
-      else
-        @language = guess_language
+      elsif !binary_mime_type?
+        @language = Language.detect(name.to_s, lambda { data }, mode)
      end
    end

-    # Internal: Guess language
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_language` if you make any changes.
-    #
-    # Returns a Language or nil
-    def guess_language
-      return if binary_mime_type?
-
-      # Disambiguate between multiple language extensions
-      disambiguate_extension_language ||
-
-        # See if there is a Language for the extension
-        Language.find_by_filename(name.to_s) ||
-
-        # Try to detect Language from shebang line
-        shebang_language
-    end
-
    # Internal: Get the lexer of the blob.
    #
    # Returns a Lexer.
@@ -427,86 +273,6 @@ module Linguist
      language ? language.lexer : Pygments::Lexer.find_by_name('Text only')
    end

-    # Internal: Disambiguates between multiple language extensions.
-    #
-    # Returns a Language or nil.
-    def disambiguate_extension_language
-      if Language.ambiguous?(extname)
-        possible_languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
-        if possible_languages.any?
-          if result = Classifier.classify(Samples::DATA, data, possible_languages).first
-            Language[result[0]]
-          end
-        end
-      end
-    end
-
-    # Internal: Extract the script name from the shebang line
-    #
-    # Requires Blob#data
-    #
-    # Examples
-    #
-    #   '#!/usr/bin/ruby'
-    #   # => 'ruby'
-    #
-    #   '#!/usr/bin/env ruby'
-    #   # => 'ruby'
-    #
-    #   '#!/usr/bash/python2.4'
-    #   # => 'python'
-    #
-    # Please add additional test coverage to
-    # `test/test_blob.rb#test_shebang_script` if you make any changes.
-    #
-    # Returns a script name String or nil
-    def shebang_script
-      # Fail fast if blob isn't viewable?
-      return unless viewable?
-
-      if lines.any? && (match = lines[0].match(/(.+)\n?/)) && (bang = match[0]) =~ /^#!/
-        bang.sub!(/^#! /, '#!')
-        tokens = bang.split(' ')
-        pieces = tokens.first.split('/')
-        if pieces.size > 1
-          script = pieces.last
-        else
-          script = pieces.first.sub('#!', '')
-        end
-
-        script = script == 'env' ? tokens[1] : script
-
-        # python2.4 => python
-        if script =~ /((?:\d+\.?)+)/
-          script.sub! $1, ''
-        end
-
-        # Check for multiline shebang hacks that exec themselves
-        #
-        #   #!/bin/sh
-        #   exec foo "$0" "$@"
-        #
-        if script == 'sh' &&
-            lines[0...5].any? { |l| l.match(/exec (\w+).+\$0.+\$@/) }
-          script = $1
-        end
-
-        script
-      end
-    end
-
-    # Internal: Get Language for shebang script
-    #
-    # Returns the Language or nil
-    def shebang_language
-      # Skip file extensions unlikely to have shebangs
-      return unless shebang_extname?
-
-      if script = shebang_script
-        Language[script]
-      end
-    end
-
    # Public: Highlight syntax of blob
    #
    # options - A Hash of options (defaults to {})
--- a/lib/linguist/generated.rb
+++ b/lib/linguist/generated.rb
@@ -0,0 +1,161 @@
+module Linguist
+  class Generated
+    # Public: Is the blob a generated file?
+    #
+    # name - String filename
+    # data - String blob data. A block may also be passed in for lazy
+    #        loading. This behavior is deprecated and you should always
+    #        pass in a String.
+    #
+    # Return true or false
+    def self.generated?(name, data)
+      new(name, data).generated?
+    end
+
+    # Internal: Initialize Generated instance
+    #
+    # name - String filename
+    # data - String blob data
+    def initialize(name, data)
+      @name = name
+      @extname = File.extname(name)
+      @_data = data
+    end
+
+    attr_reader :name, :extname
+
+    # Lazy load blob data if block was passed in.
+    #
+    # Awful, awful stuff happening here.
+    #
+    # Returns String data.
+    def data
+      @data ||= @_data.respond_to?(:call) ? @_data.call() : @_data
+    end
+
+    # Public: Get each line of data
+    #
+    # Returns an Array of lines
+    def lines
+      @lines ||= data.split("\n", -1)
+    end
+
+    # Internal: Is the blob a generated file?
+    #
+    # Generated source code is suppressed in diffs and is ignored by
+    # language statistics.
+    #
+    # Please add additional test coverage to
+    # `test/test_blob.rb#test_generated` if you make any changes.
+    #
+    # Return true or false
+    def generated?
+      name == 'Gemfile.lock' ||
+        minified_javascript? ||
+        compiled_coffeescript? ||
+        xcode_project_file? ||
+        generated_net_docfile? ||
+        generated_parser?
+    end
+
+    # Internal: Is the blob an XCode project file?
+    #
+    # Generated if the file extension is an XCode project
+    # file extension.
+    #
+    # Returns true or false.
+    def xcode_project_file?
+      ['.xib', '.nib', '.storyboard', '.pbxproj', '.xcworkspacedata', '.xcuserstate'].include?(extname)
+    end
+
+    # Internal: Is the blob minified JS?
+    #
+    # Consider JS minified if the average line length is
+    # greater than 100 characters.
+    #
+    # Returns true or false.
+    def minified_javascript?
+      return unless extname == '.js'
+      if lines.any?
+        (lines.inject(0) { |n, l| n += l.length } / lines.length) > 100
+      else
+        false
+      end
+    end
+
+    # Internal: Is the blob of JS generated by CoffeeScript?
+    #
+    # CoffeeScript is meant to output JS that would be difficult to
+    # tell if it was generated or not. Look for a number of patterns
+    # output by the CS compiler.
+    #
+    # Return true or false
+    def compiled_coffeescript?
+      return false unless extname == '.js'
+
+      # CoffeeScript generated by > 1.2 include a comment on the first line
+      if lines[0] =~ /^\/\/ Generated by /
+        return true
+      end
+
+      if lines[0] == '(function() {' &&     # First line is module closure opening
+          lines[-2] == '}).call(this);' &&  # Second to last line closes module closure
+          lines[-1] == ''                   # Last line is blank
+
+        score = 0
+
+        lines.each do |line|
+          if line =~ /var /
+            # Underscored temp vars are likely to be Coffee
+            score += 1 * line.gsub(/(_fn|_i|_len|_ref|_results)/).count
+
+            # bind and extend functions are very Coffee specific
+            score += 3 * line.gsub(/(__bind|__extends|__hasProp|__indexOf|__slice)/).count
+          end
+        end
+
+        # Require a score of 3. This is fairly arbitrary. Consider
+        # tweaking later.
+        score >= 3
+      else
+        false
+      end
+    end
+
+    # Internal: Is this a generated documentation file for a .NET assembly?
+    #
+    # .NET developers often check in the XML Intellisense file along with an
+    # assembly - however, these don't have a special extension, so we have to
+    # dig into the contents to determine if it's a docfile. Luckily, these files
+    # are extremely structured, so recognizing them is easy.
+    #
+    # Returns true or false
+    def generated_net_docfile?
+      return false unless extname.downcase == ".xml"
+      return false unless lines.count > 3
+
+      # .NET Docfiles always open with <doc> and their first tag is an
+      # <assembly> tag
+      return lines[1].include?("<doc>") &&
+        lines[2].include?("<assembly>") &&
+        lines[-2].include?("</doc>")
+    end
+
+    # Internal: Is the blob of JS a parser generated by PEG.js?
+    #
+    # PEG.js-generated parsers are not meant to be consumed by humans.
+    #
+    # Return true or false
+    def generated_parser?
+      return false unless extname == '.js'
+
+      # PEG.js-generated parsers include a comment near the top  of the file
+      # that marks them as such.
+      if lines[0..4].join('') =~ /^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js/
+        return true
+      end
+
+      false
+    end
+  end
+end
--- a/lib/linguist/language.rb
+++ b/lib/linguist/language.rb
@@ -2,6 +2,7 @@ require 'escape_utils'
 require 'pygments'
 require 'yaml'

+require 'linguist/classifier'
 require 'linguist/samples'

 module Linguist
@@ -11,23 +12,15 @@ module Linguist
  # Languages are defined in `lib/linguist/languages.yml`.
  class Language
    @languages       = []
-    @overrides       = {}
    @index           = {}
    @name_index      = {}
    @alias_index     = {}
-    @extension_index = {}
-    @filename_index  = {}
+    @extension_index = Hash.new { |h,k| h[k] = [] }
+    @filename_index  = Hash.new { |h,k| h[k] = [] }

    # Valid Languages types
    TYPES = [:data, :markup, :programming]

-    # Internal: Test if extension maps to multiple Languages.
-    #
-    # Returns true or false.
-    def self.ambiguous?(extension)
-      @overrides.include?(extension)
-    end
-
    # Internal: Create a new Language object
    #
    # attributes - A hash of attributes
@@ -60,34 +53,45 @@ module Linguist
          raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
        end

-        unless ambiguous?(extension)
-          # Index the extension with a leading ".": ".rb"
-          @extension_index[extension] = language
-
-          # Index the extension without a leading ".": "rb"
-          @extension_index[extension.sub(/^\./, '')] = language
-        end
-      end
-
-      language.overrides.each do |extension|
-        if extension !~ /^\./
-          raise ArgumentError, "Extension is missing a '.': #{extension.inspect}"
-        end
-
-        if l = @overrides[extension]
-          raise ArgumentError, "#{extension} is already overridden by #{l.name}"
-        end
-
-        @overrides[extension] = language
+        @extension_index[extension] << language
      end

      language.filenames.each do |filename|
-        @filename_index[filename] = language
+        @filename_index[filename] << language
      end

      language
    end

+    # Public: Detects the Language of the blob.
+    #
+    # name - String filename
+    # data - String blob data. A block may also be passed in for lazy
+    #        loading. This behavior is deprecated and you should always
+    #        pass in a String.
+    # mode - Optional String mode (defaults to nil)
+    #
+    # Returns Language or nil.
+    def self.detect(name, data, mode = nil)
+      # A bit of an elegant hack. If the file is executable but extensionless,
+      # append a "magic" extension so it can be classified with other
+      # languages that have shebang scripts.
+      if File.extname(name).empty? && mode && (mode.to_i(8) & 05) == 05
+        name += ".script!"
+      end
+
+      possible_languages = find_by_filename(name)
+
+      if possible_languages.length > 1
+        data = data.call() if data.respond_to?(:call)
+        if result = Classifier.classify(Samples::DATA, data, possible_languages.map(&:name)).first
+          Language[result[0]]
+        end
+      else
+        possible_languages.first
+      end
+    end
+
    # Public: Get all Languages
    #
    # Returns an Array of Languages
@@ -123,33 +127,19 @@ module Linguist
      @alias_index[name]
    end

-    # Public: Look up Language by extension.
-    #
-    # extension - The extension String. May include leading "."
-    #
-    # Examples
-    #
-    #   Language.find_by_extension('.rb')
-    #   # => #<Language name="Ruby">
-    #
-    # Returns the Language or nil if none was found.
-    def self.find_by_extension(extension)
-      @extension_index[extension]
-    end
-
-    # Public: Look up Language by filename.
+    # Public: Look up Languages by filename.
    #
    # filename - The path String.
    #
    # Examples
    #
    #   Language.find_by_filename('foo.rb')
-    #   # => #<Language name="Ruby">
+    #   # => [#<Language name="Ruby">]
    #
-    # Returns the Language or nil if none was found.
+    # Returns all matching Languages or [] if none were found.
    def self.find_by_filename(filename)
      basename, extname = File.basename(filename), File.extname(filename)
-      @filename_index[basename] || @extension_index[extname]
+      @filename_index[basename] + @extension_index[extname]
    end

    # Public: Look up Language by its name or lexer.
@@ -236,7 +226,6 @@ module Linguist

      # Set extensions or default to [].
      @extensions = attributes[:extensions] || []
-      @overrides  = attributes[:overrides]  || []
      @filenames  = attributes[:filenames]  || []

      unless @primary_extension = attributes[:primary_extension]
@@ -344,11 +333,6 @@ module Linguist
    # Returns the extension String.
    attr_reader :primary_extension

-    # Internal: Get overridden extensions.
-    #
-    # Returns the extensions Array.
-    attr_reader :overrides
-
    # Public: Get filenames
    #
    # Examples
@@ -481,7 +465,6 @@ module Linguist
      :search_term       => options['search_term'],
      :extensions        => options['extensions'].sort,
      :primary_extension => options['primary_extension'],
-      :overrides         => options['overrides'],
      :filenames         => options['filenames'],
      :popular           => popular.include?(name)
    )
--- a/lib/linguist/languages.yml
+++ b/lib/linguist/languages.yml
@@ -15,7 +15,6 @@
 #                     the language. Must be unique. Used when a Language is picked
 #                     from a dropdown and we need to automatically choose an
 #                     extension.
-# overrides         - An Array of extensions that takes precedence over conflicts
 # searchable        - Boolean flag to enable searching (defaults to true)
 # search_term       - Deprecated: Some languages maybe indexed under a
 #                     different alias. Avoid defining new exceptions.
@@ -67,8 +66,6 @@ Apex:
  type: programming
  lexer: Text only
  primary_extension: .cls
-  overrides:
-  - .cls

 AppleScript:
  aliases:
@@ -157,8 +154,6 @@ Bro:
 C:
  type: programming
  color: "#555"
-  overrides:
-  - .h
  primary_extension: .c
  extensions:
  - .w
@@ -533,8 +528,6 @@ Groovy:
 Groovy Server Pages:
  group: Groovy
  lexer: Java Server Page
-  overrides:
-  - .gsp
  aliases:
  - gsp
  primary_extension: .gsp
@@ -604,8 +597,6 @@ INI:
  - .prefs
  - .properties
  primary_extension: .ini
-  filenames:
-  - .gitconfig

 IRC log:
  lexer: IRC logs
@@ -634,8 +625,6 @@ JSON:
  ace_mode: json
  searchable: false
  primary_extension: .json
-  extensions:
-  - .json

 Java:
  type: programming
@@ -757,13 +746,15 @@ Matlab:
  extensions:
  - .matlab

-Max/MSP:
+Max:
  type: programming
  color: "#ce279c"
  lexer: Text only
+  aliases:
+  - max/msp
+  - maxmsp
+  search_term: max/msp
  primary_extension: .mxt
-  extensions:
-  - .mxt

 MiniD: # Legacy
  searchable: false
@@ -841,8 +832,6 @@ ObjDump:
 Objective-C:
  type: programming
  color: "#438eff"
-  overrides:
-  - .m
  primary_extension: .m
  extensions:
  - .mm
@@ -915,8 +904,6 @@ Perl:
  ace_mode: perl
  color: "#0298c3"
  primary_extension: .pl
-  overrides:
-  - .pl
  extensions:
  - .PL
  - .perl
@@ -983,8 +970,6 @@ R:
  type: programming
  color: "#198ce7"
  lexer: S
-  overrides:
-  - .r
  primary_extension: .r
  extensions:
  - .r
@@ -1129,12 +1114,7 @@ Shell:
  - zsh
  primary_extension: .sh
  filenames:
-  - .bash_profile
-  - .bashrc
-  - .profile
-  - .zlogin
  - .zsh
-  - .zshrc
  - bashrc
  - zshrc

@@ -1195,14 +1175,6 @@ Tea:
  type: markup
  primary_extension: .tea

-Text:
-  type: data
-  lexer: Text only
-  ace_mode: text
-  primary_extension: .txt
-  extensions:
-  - .txt
-
 Textile:
  type: markup
  lexer: Text only
@@ -1216,8 +1188,6 @@ Turing:
  color: "#45f715"
  lexer: Text only
  primary_extension: .t
-  overrides:
-  - .t
  extensions:
  - .tu

@@ -1249,8 +1219,6 @@ Verilog:
  type: programming
  lexer: verilog
  color: "#848bf3"
-  overrides:
-  - .v
  primary_extension: .v

 VimL:
@@ -1263,8 +1231,6 @@ VimL:
  extensions:
  - .vim
  filenames:
-  - .gvimrc
-  - .vimrc
  - vimrc
  - gvimrc

@@ -1332,8 +1298,6 @@ YAML:
  extensions:
  - .yaml
  - .yml
-  filenames:
-  - .gemrc

 eC:
  type: programming
--- a/lib/linguist/samples.json
+++ b/lib/linguist/samples.json
--- a/lib/linguist/samples.rb
+++ b/lib/linguist/samples.rb
@@ -45,6 +45,10 @@ module Linguist
              })
            end
          else
+            if File.extname(filename) == ""
+              raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
+            end
+
            yield({
              :path     => File.join(dirname, filename),
              :language => category,
@@ -68,18 +72,16 @@ module Linguist
      each do |sample|
        language_name = sample[:language]

-        # TODO: For now skip empty extnames
-        if sample[:extname] && sample[:extname] != ""
+        if sample[:extname]
          db['extnames'][language_name] ||= []
          if !db['extnames'][language_name].include?(sample[:extname])
            db['extnames'][language_name] << sample[:extname]
          end
        end

-        # TODO: For now skip empty extnames
-        if fn = sample[:filename]
+        if sample[:filename]
          db['filenames'][language_name] ||= []
-          db['filenames'][language_name] << fn
+          db['filenames'][language_name] << sample[:filename]
        end

        data = File.read(sample[:path])
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -1,3 +1,5 @@
+require 'strscan'
+
 module Linguist
  # Generic programming language tokenizer.
  #
@@ -50,8 +52,13 @@ module Linguist

      tokens = []
      until s.eos?
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
        # Single line comment
-        if token = s.scan(START_SINGLE_LINE_COMMENT)
+        elsif token = s.scan(START_SINGLE_LINE_COMMENT)
          tokens << token.strip
          s.skip_until(/\n|\Z/)

@@ -64,19 +71,27 @@ module Linguist

        # Skip single or double quoted strings
        elsif s.scan(/"/)
-          s.skip_until(/[^\\]"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/[^\\]"/)
+          end
        elsif s.scan(/'/)
-          s.skip_until(/[^\\]'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/[^\\]'/)
+          end

        # Skip number literals
-        elsif s.scan(/(0x)?\d+/)
+        elsif s.scan(/(0x)?\d(\d|\.)*/)

        # SGML style brackets
        elsif token = s.scan(/<[^\s<>][^<>]*>/)
          extract_sgml_tokens(token).each { |t| tokens << t }

        # Common programming punctuation
-        elsif token = s.scan(/;|\{|\}|\(|\)/)
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
          tokens << token

        # Regular token
@@ -95,6 +110,33 @@ module Linguist
      tokens
    end

+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    # Returns String token or nil if it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0]
+        return script
+      end
+
+      nil
+    end
+
    # Internal: Extract tokens from inside SGML tag.
    #
    # data - SGML tag String.
--- a/samples/Groovy/groovy.script!
+++ b/samples/Groovy/groovy.script!
--- a/samples/INI/filenames/.gitconfig
+++ b/samples/INI/filenames/.gitconfig
--- a/samples/JSON/Hello.maxhelp
+++ b/samples/JSON/Hello.maxhelp
@@ -0,0 +1,367 @@
+{
+	"patcher" : 	{
+		"fileversion" : 1,
+		"appversion" : 		{
+			"major" : 5,
+			"minor" : 1,
+			"revision" : 9
+		}
+,
+		"rect" : [ 198.0, 92.0, 365.0, 407.0 ],
+		"bglocked" : 0,
+		"defrect" : [ 198.0, 92.0, 365.0, 407.0 ],
+		"openrect" : [ 0.0, 0.0, 0.0, 0.0 ],
+		"openinpresentation" : 0,
+		"default_fontsize" : 14.0,
+		"default_fontface" : 0,
+		"default_fontname" : "Arial",
+		"gridonopen" : 0,
+		"gridsize" : [ 20.0, 20.0 ],
+		"gridsnaponopen" : 0,
+		"toolbarvisible" : 1,
+		"boxanimatetime" : 200,
+		"imprint" : 0,
+		"enablehscroll" : 1,
+		"enablevscroll" : 1,
+		"devicewidth" : 0.0,
+		"boxes" : [ 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 260.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-22"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 240.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-20"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 220.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-18"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 200.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-16"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "route 0 1 2 3",
+					"patching_rect" : [ 200.0, 220.0, 99.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 5,
+					"outlettype" : [ "", "", "", "", "" ],
+					"fontsize" : 14.0,
+					"id" : "obj-14"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "r jojo",
+					"patching_rect" : [ 200.0, 180.0, 41.0, 23.0 ],
+					"numinlets" : 0,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"color" : [ 0.827451, 0.737255, 0.835294, 1.0 ],
+					"id" : "obj-13"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "s jojo",
+					"patching_rect" : [ 20.0, 340.0, 43.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 0,
+					"fontsize" : 14.0,
+					"color" : [ 0.827451, 0.737255, 0.835294, 1.0 ],
+					"id" : "obj-12"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "append toto",
+					"patching_rect" : [ 20.0, 300.0, 84.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"id" : "obj-11"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "% 4",
+					"patching_rect" : [ 20.0, 260.0, 35.0, 23.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-10"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "counter",
+					"patching_rect" : [ 20.0, 220.0, 73.0, 23.0 ],
+					"numinlets" : 5,
+					"fontname" : "Arial",
+					"numoutlets" : 4,
+					"outlettype" : [ "int", "", "", "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-9"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "metro 250",
+					"patching_rect" : [ 20.0, 180.0, 74.0, 23.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"fontsize" : 14.0,
+					"id" : "obj-8"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "toggle",
+					"patching_rect" : [ 20.0, 140.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"id" : "obj-7"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "t 0",
+					"patching_rect" : [ 140.0, 80.0, 26.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-5"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "t 1",
+					"patching_rect" : [ 20.0, 80.0, 26.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-4"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "message",
+					"text" : "Goodbye World !",
+					"patching_rect" : [ 140.0, 40.0, 115.0, 21.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"id" : "obj-3"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "message",
+					"text" : "Hello World !",
+					"patching_rect" : [ 20.0, 40.0, 90.0, 21.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"id" : "obj-2"
+				}
+
+			}
+ ],
+		"lines" : [ 			{
+				"patchline" : 				{
+					"source" : [ "obj-2", 0 ],
+					"destination" : [ "obj-4", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-3", 0 ],
+					"destination" : [ "obj-5", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-4", 0 ],
+					"destination" : [ "obj-7", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-5", 0 ],
+					"destination" : [ "obj-7", 0 ],
+					"hidden" : 0,
+					"midpoints" : [ 149.5, 121.0, 29.5, 121.0 ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-7", 0 ],
+					"destination" : [ "obj-8", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-8", 0 ],
+					"destination" : [ "obj-9", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-9", 0 ],
+					"destination" : [ "obj-10", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-10", 0 ],
+					"destination" : [ "obj-11", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-11", 0 ],
+					"destination" : [ "obj-12", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-13", 0 ],
+					"destination" : [ "obj-14", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 0 ],
+					"destination" : [ "obj-16", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 1 ],
+					"destination" : [ "obj-18", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 2 ],
+					"destination" : [ "obj-20", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 3 ],
+					"destination" : [ "obj-22", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+ ]
+	}
+
+}
--- a/samples/JSON/Hello.maxpat
+++ b/samples/JSON/Hello.maxpat
@@ -0,0 +1,368 @@
+{
+	"patcher" : 	{
+		"fileversion" : 1,
+		"appversion" : 		{
+			"major" : 5,
+			"minor" : 1,
+			"revision" : 9
+		}
+,
+		"rect" : [ 198.0, 92.0, 365.0, 407.0 ],
+		"bglocked" : 0,
+		"defrect" : [ 198.0, 92.0, 365.0, 407.0 ],
+		"openrect" : [ 0.0, 0.0, 0.0, 0.0 ],
+		"openinpresentation" : 0,
+		"default_fontsize" : 14.0,
+		"default_fontface" : 0,
+		"default_fontname" : "Arial",
+		"gridonopen" : 0,
+		"gridsize" : [ 20.0, 20.0 ],
+		"gridsnaponopen" : 0,
+		"toolbarvisible" : 1,
+		"boxanimatetime" : 200,
+		"imprint" : 0,
+		"enablehscroll" : 1,
+		"enablevscroll" : 1,
+		"devicewidth" : 0.0,
+		"boxes" : [ 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 260.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-22"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 240.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-20"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 220.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-18"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "button",
+					"patching_rect" : [ 200.0, 260.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"id" : "obj-16"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "route 0 1 2 3",
+					"patching_rect" : [ 200.0, 220.0, 99.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 5,
+					"outlettype" : [ "", "", "", "", "" ],
+					"fontsize" : 14.0,
+					"id" : "obj-14"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "r jojo",
+					"patching_rect" : [ 200.0, 180.0, 41.0, 23.0 ],
+					"numinlets" : 0,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"color" : [ 0.827451, 0.737255, 0.835294, 1.0 ],
+					"id" : "obj-13"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "s jojo",
+					"patching_rect" : [ 20.0, 340.0, 43.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 0,
+					"fontsize" : 14.0,
+					"color" : [ 0.827451, 0.737255, 0.835294, 1.0 ],
+					"id" : "obj-12"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "append toto",
+					"patching_rect" : [ 20.0, 300.0, 84.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"id" : "obj-11"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "% 4",
+					"patching_rect" : [ 20.0, 260.0, 35.0, 23.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-10"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "counter",
+					"patching_rect" : [ 20.0, 220.0, 73.0, 23.0 ],
+					"numinlets" : 5,
+					"fontname" : "Arial",
+					"numoutlets" : 4,
+					"outlettype" : [ "int", "", "", "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-9"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "metro 250",
+					"patching_rect" : [ 20.0, 180.0, 74.0, 23.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "bang" ],
+					"fontsize" : 14.0,
+					"id" : "obj-8"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "toggle",
+					"patching_rect" : [ 20.0, 140.0, 20.0, 20.0 ],
+					"numinlets" : 1,
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"id" : "obj-7"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "t 0",
+					"patching_rect" : [ 140.0, 80.0, 26.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-5"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "newobj",
+					"text" : "t 1",
+					"patching_rect" : [ 20.0, 80.0, 26.0, 23.0 ],
+					"numinlets" : 1,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "int" ],
+					"fontsize" : 14.0,
+					"id" : "obj-4"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "message",
+					"text" : "Goodbye World !",
+					"patching_rect" : [ 140.0, 40.0, 115.0, 21.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"presentation_rect" : [ 137.0, 42.0, 0.0, 0.0 ],
+					"id" : "obj-3"
+				}
+
+			}
+, 			{
+				"box" : 				{
+					"maxclass" : "message",
+					"text" : "Hello World !",
+					"patching_rect" : [ 20.0, 40.0, 90.0, 21.0 ],
+					"numinlets" : 2,
+					"fontname" : "Arial",
+					"numoutlets" : 1,
+					"outlettype" : [ "" ],
+					"fontsize" : 14.0,
+					"id" : "obj-2"
+				}
+
+			}
+ ],
+		"lines" : [ 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 3 ],
+					"destination" : [ "obj-22", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 2 ],
+					"destination" : [ "obj-20", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 1 ],
+					"destination" : [ "obj-18", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-14", 0 ],
+					"destination" : [ "obj-16", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-13", 0 ],
+					"destination" : [ "obj-14", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-11", 0 ],
+					"destination" : [ "obj-12", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-10", 0 ],
+					"destination" : [ "obj-11", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-9", 0 ],
+					"destination" : [ "obj-10", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-8", 0 ],
+					"destination" : [ "obj-9", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-7", 0 ],
+					"destination" : [ "obj-8", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-5", 0 ],
+					"destination" : [ "obj-7", 0 ],
+					"hidden" : 0,
+					"midpoints" : [ 149.5, 121.0, 29.5, 121.0 ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-4", 0 ],
+					"destination" : [ "obj-7", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-3", 0 ],
+					"destination" : [ "obj-5", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+, 			{
+				"patchline" : 				{
+					"source" : [ "obj-2", 0 ],
+					"destination" : [ "obj-4", 0 ],
+					"hidden" : 0,
+					"midpoints" : [  ]
+				}
+
+			}
+ ]
+	}
+
+}
--- a/samples/JSON/person.json
+++ b/samples/JSON/person.json
@@ -0,0 +1,23 @@
+{
+     "firstName": "John",
+     "lastName" : "Smith",
+     "age"      : 25,
+     "address"  :
+     {
+         "streetAddress": "21 2nd Street",
+         "city"         : "New York",
+         "state"        : "NY",
+         "postalCode"   : "10021"
+     },
+     "phoneNumber":
+     [
+         {
+           "type"  : "home",
+           "number": "212 555-1234"
+         },
+         {
+           "type"  : "fax",
+           "number": "646 555-4567"
+         }
+     ]
+ }
--- a/samples/JSON/product.json
+++ b/samples/JSON/product.json
@@ -0,0 +1,7 @@
+{
+        "id": 1,
+        "name": "Foo",
+        "price": 123,
+        "tags": ["Bar","Eek"],
+        "stock": { "warehouse":300, "retail":20 }
+}
--- a/samples/JSON/schema.json
+++ b/samples/JSON/schema.json
@@ -0,0 +1,47 @@
+{
+        "name":"Product",
+        "properties":
+        {
+                "id":
+                {
+                        "type":"number",
+                        "description":"Product identifier",
+                        "required":true
+                },
+                "name":
+                {
+                        "type":"string",
+                        "description":"Name of the product",
+                        "required":true
+                },
+                "price":
+                {
+                        "type":"number",
+                        "minimum":0,
+                        "required":true
+                },
+                "tags":
+                {
+                        "type":"array",
+                        "items":
+                        {
+                                "type":"string"
+                        }
+                },
+                "stock":
+                {
+                        "type":"object",
+                        "properties":
+                        {
+                                "warehouse":
+                                {
+                                        "type":"number"
+                                },
+                                "retail":
+                                {
+                                        "type":"number"
+                                }
+                        }
+                }
+        }
+}
--- a/samples/JavaScript/js.script!
+++ b/samples/JavaScript/js.script!
--- a/samples/JavaScript/js2.script!
+++ b/samples/JavaScript/js2.script!
@@ -0,0 +1,7 @@
+#!/usr/bin/env node
+var http = require('http');
+http.createServer(function (req, res) {
+  res.writeHead(200, {'Content-Type': 'text/plain'});
+  res.end('Hello World\n');
+}).listen(1337, '127.0.0.1');
+console.log('Server running at http://127.0.0.1:1337/');
--- a/samples/Max/Hello.mxt
+++ b/samples/Max/Hello.mxt
@@ -0,0 +1 @@
+max v2;
--- a/samples/Nu/nu.script!
+++ b/samples/Nu/nu.script!
--- a/samples/Perl/filenames/ack
+++ b/samples/Perl/filenames/ack
--- a/samples/Perl/perl.script!
+++ b/samples/Perl/perl.script!
@@ -0,0 +1,2 @@
+#!/usr/local/bin/perl
+print "Perl\n"
--- a/samples/Python/python.script!
+++ b/samples/Python/python.script!
--- a/samples/Racket/rkt.script!
+++ b/samples/Racket/rkt.script!
--- a/samples/Ruby/macruby.script!
+++ b/samples/Ruby/macruby.script!
--- a/samples/Ruby/ruby.script!
+++ b/samples/Ruby/ruby.script!
--- a/samples/Ruby/ruby2.script!
+++ b/samples/Ruby/ruby2.script!
@@ -1,2 +1,2 @@
 #! /usr/bin/env ruby -w -Ilib:test
-echo "Ruby"
+puts "Ruby"
--- a/samples/Scala/scala.script!
+++ b/samples/Scala/scala.script!
--- a/samples/Shell/bash.script!
+++ b/samples/Shell/bash.script!
@@ -0,0 +1,2 @@
+#!/bin/bash
+echo "bash"
--- a/samples/Shell/filenames/.bash_profile
+++ b/samples/Shell/filenames/.bash_profile
--- a/samples/Shell/filenames/.bashrc
+++ b/samples/Shell/filenames/.bashrc
--- a/samples/Shell/filenames/.profile
+++ b/samples/Shell/filenames/.profile
--- a/samples/Shell/filenames/.zlogin
+++ b/samples/Shell/filenames/.zlogin
--- a/samples/Shell/filenames/.zshrc
+++ b/samples/Shell/filenames/.zshrc
--- a/samples/Shell/rbenv-sh-shell.sh
+++ b/samples/Shell/rbenv-sh-shell.sh
--- a/samples/Shell/rvm.bash
+++ b/samples/Shell/rvm.bash
--- a/samples/Shell/sh.script!
+++ b/samples/Shell/sh.script!
@@ -0,0 +1,2 @@
+#!/bin/sh
+echo "sh"
--- a/samples/Shell/zsh.script!
+++ b/samples/Shell/zsh.script!
@@ -0,0 +1,2 @@
+#!/bin/zsh
+echo "zsh"
--- a/samples/Text/script.foo
+++ b/samples/Text/script.foo
@@ -1,2 +0,0 @@
-#!/bin/foo
-???
--- a/samples/VimL/filenames/.gvimrc
+++ b/samples/VimL/filenames/.gvimrc
--- a/samples/VimL/filenames/.vimrc
+++ b/samples/VimL/filenames/.vimrc
--- a/samples/YAML/filenames/.gemrc
+++ b/samples/YAML/filenames/.gemrc
--- a/test/test_blob.rb
+++ b/test/test_blob.rb
@@ -45,10 +45,6 @@ class TestBlob < Test::Unit::TestCase
    assert_equal "application/pdf", blob("Binary/foo.pdf").content_type
    assert_equal "image/png", blob("Binary/foo.png").content_type
    assert_equal "text/plain; charset=iso-8859-2", blob("Text/README").content_type
-    assert_equal "text/plain; charset=iso-8859-1", blob("Perl/script.pl").content_type
-    assert_equal "text/plain; charset=iso-8859-1", blob("Python/script.py").content_type
-    assert_equal "text/plain; charset=iso-8859-1", blob("Ruby/script.rb").content_type
-    assert_equal "text/plain; charset=iso-8859-1", blob("Shell/script.sh").content_type
  end

  def test_disposition
@@ -262,7 +258,6 @@ class TestBlob < Test::Unit::TestCase
  end

  def test_indexable
-    assert blob("Text/file.txt").indexable?
    assert blob("Ruby/foo.rb").indexable?
    assert !blob("Text/defu.nkt").indexable?
    assert !blob("Text/dump.sql").indexable?
@@ -281,25 +276,6 @@ class TestBlob < Test::Unit::TestCase
    assert_equal Lexer['Ruby'], blob("Ruby/foo.rb").lexer
  end

-  def test_shebang_script
-    assert_equal 'sh', script_blob("Shell/script.sh").shebang_script
-    assert_equal 'bash', script_blob("Shell/script.bash").shebang_script
-    assert_equal 'zsh', script_blob("Shell/script.zsh").shebang_script
-    assert_equal 'perl', script_blob("Perl/script.pl").shebang_script
-    assert_equal 'ruby', script_blob("Ruby/script.rb").shebang_script
-    assert_equal 'ruby', script_blob("Ruby/script2.rb").shebang_script
-    assert_equal 'python', script_blob("Python/script.py").shebang_script
-    assert_equal 'node', script_blob("JavaScript/script.js").shebang_script
-    assert_equal 'groovy', script_blob("Groovy/script.groovy").shebang_script
-    assert_equal 'macruby', script_blob("Ruby/macruby-script").shebang_script
-    assert_equal 'rake', script_blob("Ruby/script.rake").shebang_script
-    assert_equal 'foo', script_blob("Text/script.foo").shebang_script
-    assert_equal 'nush', script_blob("Nu/script.nu").shebang_script
-    assert_equal 'scala', script_blob("Scala/script.scala").shebang_script
-    assert_equal 'racket', script_blob("Racket/script.rkt").shebang_script
-    assert_equal nil, script_blob("Ruby/foo.rb").shebang_script
-  end
-
  def test_colorize
    assert_equal <<-HTML, blob("Ruby/foo.rb").colorize
 <div class="highlight"><pre><span class="k">module</span> <span class="nn">Foo</span>
--- a/test/test_classifier.rb
+++ b/test/test_classifier.rb
@@ -54,11 +54,8 @@ class TestClassifier < Test::Unit::TestCase

  def test_classify_ambiguous_languages
    Samples.each do |sample|
-      language = Linguist::Language.find_by_name(sample[:language])
-      next unless language.overrides.any?
-
-      extname   = File.extname(sample[:path])
-      languages = Language.all.select { |l| l.extensions.include?(extname) }.map(&:name)
+      language  = Linguist::Language.find_by_name(sample[:language])
+      languages = Language.find_by_filename(sample[:path]).map(&:name)
      next unless languages.length > 1

      results = Classifier.classify(Samples::DATA, File.read(sample[:path]), languages)
--- a/test/test_language.rb
+++ b/test/test_language.rb
@@ -8,29 +8,6 @@ class TestLanguage < Test::Unit::TestCase

  Lexer = Pygments::Lexer

-  def test_ambiguous_extensions
-    assert Language.ambiguous?('.cls')
-    assert_equal Language['Apex'], Language.find_by_extension('cls')
-
-    assert Language.ambiguous?('.h')
-    assert_equal Language['C'], Language.find_by_extension('h')
-
-    assert Language.ambiguous?('.m')
-    assert_equal Language['Objective-C'], Language.find_by_extension('m')
-
-    assert Language.ambiguous?('.pl')
-    assert_equal Language['Perl'], Language.find_by_extension('pl')
-
-    assert Language.ambiguous?('.r')
-    assert_equal Language['R'], Language.find_by_extension('r')
-
-    assert Language.ambiguous?('.t')
-    assert_equal Language['Turing'], Language.find_by_extension('t')
-
-    assert Language.ambiguous?('.v')
-    assert_equal Language['Verilog'], Language.find_by_extension('v')
-  end
-
  def test_lexer
    assert_equal Lexer['ActionScript 3'], Language['ActionScript'].lexer
    assert_equal Lexer['Bash'], Language['Gentoo Ebuild'].lexer
@@ -71,7 +48,6 @@ class TestLanguage < Test::Unit::TestCase
    assert_equal Lexer['Scheme'], Language['Scheme'].lexer
    assert_equal Lexer['Standard ML'], Language['Standard ML'].lexer
    assert_equal Lexer['TeX'], Language['TeX'].lexer
-    assert_equal Lexer['Text only'], Language['Text'].lexer
    assert_equal Lexer['Verilog'], Language['Verilog'].lexer
    assert_equal Lexer['XSLT'], Language['XSLT'].lexer
    assert_equal Lexer['aspx-vb'], Language['ASP'].lexer
@@ -165,7 +141,7 @@ class TestLanguage < Test::Unit::TestCase
    assert_equal 'ruby',        Language['Ruby'].search_term
    assert_equal 'common-lisp', Language['Common Lisp'].search_term
    assert_equal 'html+erb',    Language['HTML+ERB'].search_term
-    assert_equal 'max/msp',     Language['Max/MSP'].search_term
+    assert_equal 'max/msp',     Language['Max'].search_term
    assert_equal 'puppet',      Language['Puppet'].search_term
    assert_equal 'pure-data',   Language['Pure Data'].search_term

@@ -242,48 +218,16 @@ class TestLanguage < Test::Unit::TestCase
    end
  end

-  def test_find_by_extension
-    assert_equal Language['Ruby'], Language.find_by_extension('.rb')
-    assert_equal Language['Ruby'], Language.find_by_extension('rb')
-    assert_equal Language['Dart'], Language.find_by_extension('dart')
-    assert_equal Language['Groff'], Language.find_by_extension('man')
-    assert_equal Language['Groff'], Language.find_by_extension('1')
-    assert_equal Language['Groff'], Language.find_by_extension('2')
-    assert_equal Language['Groff'], Language.find_by_extension('3')
-    assert_equal Language['PHP'], Language.find_by_extension('php')
-    assert_equal Language['PHP'], Language.find_by_extension('php3')
-    assert_equal Language['PHP'], Language.find_by_extension('php4')
-    assert_equal Language['PHP'], Language.find_by_extension('php5')
-    assert_equal Language['PowerShell'], Language.find_by_extension('psm1')
-    assert_equal Language['PowerShell'], Language.find_by_extension('ps1')
-
-    # Aliases for Streamline.js ( https://github.com/Sage/streamlinejs )
-    assert_equal Language['JavaScript'], Language.find_by_extension('_js')
-    assert_equal Language['CoffeeScript'], Language.find_by_extension('_coffee')
-
-    assert_nil Language.find_by_extension('.nkt')
-  end
-
-  def test_find_all_by_extension
-    Language.all.each do |language|
-      assert_equal language, Language.find_by_extension(language.primary_extension)
-
-      language.extensions.each do |extension|
-        unless Language.ambiguous?(extension)
-          assert_equal language, Language.find_by_extension(extension)
-        end
-      end
-    end
-  end
-
  def test_find_by_filename
-    assert_equal Language['Shell'], Language.find_by_filename('PKGBUILD')
-    assert_equal Language['Ruby'], Language.find_by_filename('foo.rb')
-    assert_equal Language['Ruby'], Language.find_by_filename('foo/bar.rb')
-    assert_equal Language['Ruby'], Language.find_by_filename('Rakefile')
-    assert_nil Language.find_by_filename('rb')
-    assert_nil Language.find_by_filename('.rb')
-    assert_nil Language.find_by_filename('.nkt')
+    assert_equal [Language['Shell']], Language.find_by_filename('PKGBUILD')
+    assert_equal [Language['Ruby']], Language.find_by_filename('foo.rb')
+    assert_equal [Language['Ruby']], Language.find_by_filename('foo/bar.rb')
+    assert_equal [Language['Ruby']], Language.find_by_filename('Rakefile')
+    assert_equal [Language['Ruby']], Language.find_by_filename('PKGBUILD.rb')
+    assert_equal ['C', 'C++', 'Objective-C'], Language.find_by_filename('foo.h').map(&:name).sort
+    assert_equal [], Language.find_by_filename('rb')
+    assert_equal [], Language.find_by_filename('.rb')
+    assert_equal [], Language.find_by_filename('.nkt')
  end

  def test_find
@@ -310,7 +254,6 @@ class TestLanguage < Test::Unit::TestCase
    assert_equal 'C%2B%2B', Language['C++'].escaped_name
    assert_equal 'Objective-C', Language['Objective-C'].escaped_name
    assert_equal 'Common%20Lisp', Language['Common Lisp'].escaped_name
-    assert_equal 'Max%2FMSP', Language['Max/MSP'].escaped_name
  end

  def test_error_without_name
@@ -336,11 +279,9 @@ class TestLanguage < Test::Unit::TestCase
    assert_equal 'csharp', Language['C#'].ace_mode
    assert_equal 'css', Language['CSS'].ace_mode
    assert_equal 'javascript', Language['JavaScript'].ace_mode
-    assert_equal 'text', Language['Text'].ace_mode
  end

  def test_ace_modes
-    assert Language.ace_modes.include?(Language['Text'])
    assert Language.ace_modes.include?(Language['Ruby'])
    assert !Language.ace_modes.include?(Language['FORTRAN'])
  end
@@ -373,12 +314,6 @@ class TestLanguage < Test::Unit::TestCase


  def test_colorize
-    assert_equal <<-HTML, Language['Text'].colorize("Hello")
-<div class="highlight"><pre>Hello
-</pre>
-</div>
-    HTML
-
    assert_equal <<-HTML, Language['Ruby'].colorize("def foo\n  'foo'\nend\n")
 <div class="highlight"><pre><span class="k">def</span> <span class="nf">foo</span>
  <span class="s1">&#39;foo&#39;</span>
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -20,12 +20,17 @@ class TestTokenizer < Test::Unit::TestCase
    assert_equal %w(print), tokenize("print 'Josh'")
    assert_equal %w(print), tokenize('print "Hello \"Josh\""')
    assert_equal %w(print), tokenize("print 'Hello \\'Josh\\''")
+    assert_equal %w(print), tokenize("print \"Hello\", \"Josh\"")
+    assert_equal %w(print), tokenize("print 'Hello', 'Josh'")
+    assert_equal %w(print), tokenize("print \"Hello\", \"\", \"Josh\"")
+    assert_equal %w(print), tokenize("print 'Hello', '', 'Josh'")
  end

  def test_skip_number_literals
    assert_equal %w(+), tokenize('1 + 1')
    assert_equal %w(add \( \)), tokenize('add(123, 456)')
    assert_equal %w(|), tokenize('0x01 | 0x10')
+    assert_equal %w(*), tokenize('500.42 * 1.0')
  end

  def test_skip_comments
@@ -77,16 +82,30 @@ class TestTokenizer < Test::Unit::TestCase
  def test_objective_c_tokens
    assert_equal %w(#import <Foundation/Foundation.h> @interface Foo NSObject { } @end), tokenize(:"Objective-C/Foo.h")
    assert_equal %w(#import @implementation Foo @end), tokenize(:"Objective-C/Foo.m")
-    assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv \) { NSLog \( @ \) ; return ; }), tokenize(:"Objective-C/hello.m")
+    assert_equal %w(#import <Cocoa/Cocoa.h> int main \( int argc char *argv [ ] \) { NSLog \( @ \) ; return ; }), tokenize(:"Objective-C/hello.m")
+  end
+
+  def test_shebang
+    assert_equal "SHEBANG#!sh", tokenize(:"Shell/sh.script!")[0]
+    assert_equal "SHEBANG#!bash", tokenize(:"Shell/bash.script!")[0]
+    assert_equal "SHEBANG#!zsh", tokenize(:"Shell/zsh.script!")[0]
+    assert_equal "SHEBANG#!perl", tokenize(:"Perl/perl.script!")[0]
+    assert_equal "SHEBANG#!python", tokenize(:"Python/python.script!")[0]
+    assert_equal "SHEBANG#!ruby", tokenize(:"Ruby/ruby.script!")[0]
+    assert_equal "SHEBANG#!ruby", tokenize(:"Ruby/ruby2.script!")[0]
+    assert_equal "SHEBANG#!node", tokenize(:"JavaScript/js.script!")[0]
  end

  def test_javascript_tokens
    assert_equal %w( \( function \( \) { console.log \( \) ; } \) .call \( this \) ;), tokenize(:"JavaScript/hello.js")
  end

+  def test_json_tokens
+    assert_equal %w( { [ ] { } } ), tokenize(:"JSON/product.json")
+  end
+
  def test_ruby_tokens
    assert_equal %w(module Foo end), tokenize(:"Ruby/foo.rb")
-    assert_equal %w(# /usr/bin/env ruby puts), tokenize(:"Ruby/script.rb")
    assert_equal %w(task default do puts end), tokenize(:"Ruby/filenames/Rakefile")
  end
 end
Author	SHA1	Message	Date
Joshua Peek	566eaefda9	Linguist 2.2.0	2012-08-03 16:47:34 -05:00
Joshua Peek	047d23862e	Still index .txt	2012-08-03 16:34:53 -05:00
Joshua Peek	804e23e995	Extract seperate language detection method	2012-08-03 16:03:06 -05:00
Joshua Peek	41b7d13aa7	Extract generated blob check into its own module	2012-08-03 15:47:50 -05:00
Joshua Peek	4531103033	Forgot to move hidden samples to the correct dir	2012-08-03 15:25:38 -05:00
Joshua Peek	96267e8696	Sort test assertion	2012-08-03 15:11:30 -05:00
Joshua Peek	16a67cb852	Move shebang detection into classifier Fixes #203	2012-08-03 15:07:36 -05:00
Joshua Peek	fbbaff09cd	Stop treating text as a language	2012-08-03 13:55:51 -05:00
Joshua Peek	6014bd015e	Change find_by_filename api to return all matching languages	2012-08-03 13:53:12 -05:00
Joshua Peek	4a06d2ea7e	Merge branch 'jeanSapristi-master'	2012-07-24 11:51:54 -05:00
Joshua Peek	22efcf7aff	Update samples db	2012-07-24 11:51:37 -05:00
Joshua Peek	e5d302459f	Fix tokenzing empty strings	2012-07-24 11:49:29 -05:00
Joshua Peek	7aac87681b	Add brackets to tokens	2012-07-24 11:28:46 -05:00
Joshua Peek	53300ca581	Add brackets to tokens	2012-07-24 11:28:27 -05:00
Joshua Peek	52833b58d5	Rebuild samples db	2012-07-24 11:23:42 -05:00
Joshua Peek	f5705eaf38	Parse float tokens	2012-07-24 11:23:06 -05:00
Joshua Peek	e2a91bba3e	json extension is provided by samples	2012-07-24 11:12:57 -05:00
Joshua Peek	be1340bafc	Add a few more json samples	2012-07-24 11:12:33 -05:00
Joshua Peek	9777798cf7	Move max json into json samples	2012-07-24 11:10:57 -05:00
Joshua Peek	b7c4d96e5f	Max extensions are already covered by samples	2012-07-24 11:05:08 -05:00
Joshua Peek	e816a0a1b1	Update samples db	2012-07-24 11:04:24 -05:00
Joshua Peek	1bc9f555e6	Fix max samples dir	2012-07-24 11:03:34 -05:00
Joshua Peek	059f661eb6	Rename Max/MSP to Max	2012-07-24 11:03:09 -05:00
jeanSapristi	efbcd51ff6	Add samples for MaxMSP	2012-07-24 17:40:04 +02:00
Nicolas Danet	9f782fc261	Update lib/linguist/languages.yml	2012-07-24 12:25:03 +03:00
Nicolas Danet	5c2bdfd733	Add extensions for Max/MSP	2012-07-24 09:03:06 +03:00