Revert "Replace the tokenizer with a flex-based scanner (#3846)"

This reverts commit 99eaf5faf9.
This commit is contained in:
Ashe Connor
2017-11-10 10:22:47 +11:00
committed by Ashe Connor
parent 0f4955e5d5
commit 0698b0f36e
15 changed files with 202 additions and 8914 deletions

diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb

@@ -275,8 +275,10 @@ module Linguist
           # also--importantly--without having to duplicate many (potentially
           # large) strings.
           begin
-            data.split(encoded_newlines_re, -1)
+            encoded_newlines = ["\r\n", "\r", "\n"].
+              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
+            data.split(Regexp.union(encoded_newlines), -1)
           rescue Encoding::ConverterNotFoundError
             # The data is not splittable in the detected encoding.  Assume it's
             # one big line.
@@ -287,51 +289,6 @@ module Linguist
       end
     end

-    def encoded_newlines_re
-      @encoded_newlines_re ||= Regexp.union(["\r\n", "\r", "\n"].
-        map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
-    end
-
-    def first_lines(n)
-      return lines[0...n] if defined? @lines
-      return [] unless viewable? && data
-
-      i, c = 0, 0
-      while c < n && j = data.index(encoded_newlines_re, i)
-        i = j + $&.length
-        c += 1
-      end
-      data[0...i].split(encoded_newlines_re, -1)
-    end
-
-    def last_lines(n)
-      if defined? @lines
-        if n >= @lines.length
-          @lines
-        else
-          lines[-n..-1]
-        end
-      end
-      return [] unless viewable? && data
-
-      no_eol = true
-      i, c = data.length, 0
-      k = i
-      while c < n && j = data.rindex(encoded_newlines_re, i - 1)
-        if c == 0 && j + $&.length == i
-          no_eol = false
-          n += 1
-        end
-        i = j
-        k = j + $&.length
-        c += 1
-      end
-      r = data[k..-1].split(encoded_newlines_re, -1)
-      r.pop if !no_eol
-      r
-    end
-
     # Public: Get number of lines of code
     #
     # Requires Blob#data
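
For context on the splitting logic both sides of this hunk rely on: blob data is usually carried as raw bytes (ASCII-8BIT) even when the content is detected as, say, UTF-16LE, so a regexp built from a plain UTF-8 "\r\n" would never match the two-byte newlines in the data. A minimal sketch, not part of the commit, of the transcode-then-force_encoding trick:

    # Sketch only: UTF-16LE content carried as raw bytes, as Linguist does.
    detected = Encoding::UTF_16LE
    data = "a\nb\r\nc".encode(detected).force_encoding("ASCII-8BIT")

    # Encode each newline into the detected encoding ("\n" becomes the two
    # bytes "\n\x00"), then force it back to the data's byte-level encoding
    # so the regexp matches bytewise. "\r\n" is listed first so the
    # alternation prefers it over a bare "\r".
    newlines = ["\r\n", "\r", "\n"].map do |nl|
      nl.encode(detected).force_encoding(data.encoding)
    end

    data.split(Regexp.union(newlines), -1).length  # => 3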

diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb

@@ -3,8 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    CLASSIFIER_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use the classifier to detect language of the blob.
     #
     # blob - An object that quacks like a blob.
@@ -19,7 +17,7 @@ module Linguist
     # Returns an Array of Language objects, most probable first.
     def self.call(blob, possible_languages)
       language_names = possible_languages.map(&:name)
-      classify(Samples.cache, blob.data[0...CLASSIFIER_CONSIDER_BYTES], language_names).map do |name, _|
+      classify(Samples.cache, blob.data, language_names).map do |name, _|
         Language[name] # Return the actual Language objects
       end
     end
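
The lines removed here had capped classifier input at 50 KB; after the revert the whole blob is passed through, and the tokenizer's own 100 KB BYTE_LIMIT (restored below) becomes the effective cap. A trivial sketch of the dropped truncation, reusing the removed constant's name for illustration:

    CLASSIFIER_CONSIDER_BYTES = 50 * 1024

    data = "token " * 20_000                    # ~120 KB of ASCII input
    data[0...CLASSIFIER_CONSIDER_BYTES].bytesize  # => 51200 (flex-era cap)
    data.bytesize                                 # => 120000 (post-revert: full blob)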

diff --git a/lib/linguist/file_blob.rb b/lib/linguist/file_blob.rb

@@ -23,21 +23,21 @@ module Linguist
     #
     # Returns a String like '100644'
     def mode
-      @mode ||= File.stat(@fullpath).mode.to_s(8)
+      File.stat(@fullpath).mode.to_s(8)
     end

     # Public: Read file contents.
     #
     # Returns a String.
     def data
-      @data ||= File.read(@fullpath)
+      File.read(@fullpath)
     end

     # Public: Get byte size
     #
     # Returns an Integer.
     def size
-      @size ||= File.size(@fullpath)
+      File.size(@fullpath)
     end
   end
 end
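
These hunks drop the `||=` memoization added alongside the flex scanner, returning to a stat or read on every call. A minimal illustration, with a hypothetical class name, of what the removed caching did:

    class CachedBlob
      def initialize(path)
        @path = path
      end

      # `||=` evaluates File.read only on the first call and caches the
      # result; later calls return the cached string even if the file has
      # changed on disk. Without it, every call re-reads the file.
      def data
        @data ||= File.read(@path)
      end
    end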

diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb

@@ -1,8 +1,6 @@
 module Linguist
   # A collection of simple heuristics that can be used to better analyze languages.
   class Heuristics
-    HEURISTICS_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use heuristics to detect language of the blob.
     #
     # blob - An object that quacks like a blob.
@@ -16,7 +14,7 @@ module Linguist
     #
     # Returns an Array of languages, or empty if none matched or were inconclusive.
     def self.call(blob, candidates)
-      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
+      data = blob.data

       @heuristics.each do |heuristic|
         if heuristic.matches?(blob.name, candidates)
@@ -74,14 +72,6 @@ module Linguist
     # Common heuristics
     ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
-    CPlusPlusRegex = Regexp.union(
-      /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
-      /^\s*template\s*</,
-      /^[ \t]*try/,
-      /^[ \t]*catch\s*\(/,
-      /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
-      /^[ \t]*(private|public|protected):$/,
-      /std::\w+/)

     disambiguate ".as" do |data|
       if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
@@ -229,7 +219,8 @@ module Linguist
     disambiguate ".h" do |data|
       if ObjectiveCRegex.match(data)
         Language["Objective-C"]
-      elsif CPlusPlusRegex.match(data)
+      elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
+             /^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
         Language["C++"]
       end
     end
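
The two sides of that last hunk are functionally equivalent: Regexp.union folds several patterns into one alternation, so precompiling CPlusPlusRegex only changed where the work happened, not what matched. A reduced, hypothetical version of the C++ check:

    CPP_HINTS = Regexp.union(/^\s*template\s*</, /std::\w+/)

    CPP_HINTS =~ "std::string name;"                  # => 0 (match at index 0)
    # ...same verdict as the chained form restored by this revert:
    !!(/^\s*template\s*</ =~ "std::string name;" ||
       /std::\w+/ =~ "std::string name;")             # => true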

diff --git a/lib/linguist/strategy/modeline.rb b/lib/linguist/strategy/modeline.rb

@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.first_lines(SEARCH_SCOPE).join("\n")
-        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
+        header = blob.lines.first(SEARCH_SCOPE).join("\n")
+        footer = blob.lines.last(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
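
What this hunk trades away: blob.first_lines(n), removed from blob_helper above, stopped scanning at the n-th newline, while the restored blob.lines.first(n) splits the entire blob before keeping n lines. A rough sketch of the difference, assuming plain "\n" newlines:

    data = "line\n" * 1_000_000   # ~5 MB blob

    data.split("\n").first(2)     # splits a million lines, keeps two

    # first_lines-style: stop scanning after the second newline
    idx = 0
    2.times { idx = data.index("\n", idx) + 1 }
    data[0...idx].split("\n")     # => ["line", "line"]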

diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb

@@ -1,5 +1,4 @@
 require 'strscan'
-require 'linguist/linguist'

 module Linguist
   # Generic programming language tokenizer.
@@ -16,5 +15,191 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
+
+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '--', # Ada, Haskell, AppleScript
+      '#',  # Ruby
+      '%',  # Tex
+      '"',  # Vim
+    ]
+
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)'],    # Coq
+      ['"""', '"""'],  # Python
+      ["'''", "'''"]   # Python
+    ]
+
+    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/(?<!\\)"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/(?<!\\)'/)
+          end
+
+        # Skip number literals
+        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+
+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
+    #   # => "awk"
+    #
+    # Returns String token or nil if it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          s.scan(/.*=[^\s]+\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+
+      nil
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
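
With the pure-Ruby StringScanner tokenizer restored, tokenization is driven entirely by the regexps above. A quick usage sketch; the expected output follows from tracing the branches (identifiers and punctuation are kept, while the comment and the number literal are dropped):

    require 'linguist/tokenizer'

    Linguist::Tokenizer.tokenize(%(int main() { return 0; /* done */ }))
    # => ["int", "main", "(", ")", "{", "return", ";", "}"]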